Support Megatron 0.11.0 and vLLM 0.8.2, update images to use latest vllm and Megatron #851

Status: Closed (wants to merge 32 commits)

Commits (all by ETOgaosion):
- b20a656  update images (Mar 31, 2025)
- a842759  try to fix triton and flash_attn version errors (Mar 31, 2025)
- 6313cb9  try to fix triton and flash_attn version errors (Mar 31, 2025)
- 6ea4647  training almost fix, vllm to fix (Apr 1, 2025)
- c302b74  fall back a test config (Apr 1, 2025)
- c12669d  fall back a test config (Apr 1, 2025)
- ee63638  seems able to run (Apr 5, 2025)
- b1c9be1  format (Apr 5, 2025)
- 0f260fe  test back in merlin (Apr 7, 2025)
- 9cb3249  format (Apr 7, 2025)
- cf97437  able to run (Apr 7, 2025)
- 6555aa7  able to run (Apr 7, 2025)
- a1a4493  format (Apr 7, 2025)
- 03143c8  not related file (Apr 7, 2025)
- 72f1d87  fix errors (Apr 8, 2025)
- 6e51799  fix torch load (Apr 8, 2025)
- d01c268  test loss megatron (Apr 8, 2025)
- 111975e  dataset error (Apr 8, 2025)
- 16191fa  per tensor (Apr 8, 2025)
- 27ee9a4  hot fix convert weight (Apr 8, 2025)
- b397590  fix final_layernorm (Apr 8, 2025)
- 1a92c07  fix vLLM (Apr 8, 2025)
- 605b868  deepseek ckpt error (Apr 9, 2025)
- 0953176  release ray test (Apr 9, 2025)
- e1e4401  release sandbox test (Apr 9, 2025)
- fa26943  requirements pyarrow too low (Apr 9, 2025)
- 5ec10b3  unrelated file (Apr 9, 2025)
- c448089  fix preprocess and postprocess logic (Apr 10, 2025)
- de932fd  fix numpy import (Apr 10, 2025)
- 944d4bf  not compatible with cp (Apr 11, 2025)
- f2380ae  format (Apr 11, 2025)
- 9b026ab  fix checkpoint rng_states confliction (Apr 12, 2025)

Changes from all commits

5 changes: 2 additions & 3 deletions .github/workflows/checkpoints.yml
@@ -22,7 +22,7 @@ permissions:
   contents: read
 
 jobs:
-  e2e_gsm8k_megatron:
+  checkpoints:
     runs-on: [self-hosted, l20-0]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
@@ -31,7 +31,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -47,7 +47,6 @@ jobs:
       - name: Running Checkpoint Integration Test (Qwen Megatron)
         run: |
           ray stop --force
-          export PYTHONPATH=$PYTHONPATH:/opt/nvidia/Megatron-LM
           bash tests/checkpoint/run_qwen_megatron_ckpt.sh
       - name: Running Checkpoint Integration Test (Deepseek Megatron)
         run: |

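The deleted PYTHONPATH export matches the new image layout: Megatron-LM is now installed as a package rather than vendored under /opt/nvidia/Megatron-LM. A rough sketch for reproducing this job locally, assuming the same container image and a GPU host; the mount path and flags are illustrative, not the exact CI setup:

    # Start the CI image with the repo mounted, then run the Qwen Megatron checkpoint test (sketch)
    docker run --runtime=nvidia --gpus all --shm-size=10g -v "$PWD":/workspace/verl -w /workspace/verl -it \
        whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0 bash
    # inside the container:
    pip3 install hf_transfer
    pip3 install -e .[test]
    ray stop --force
    bash tests/checkpoint/run_qwen_megatron_ckpt.sh
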
2 changes: 1 addition & 1 deletion .github/workflows/dataset.yml
@@ -32,7 +32,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

2 changes: 1 addition & 1 deletion .github/workflows/e2e_eval_aime24.yml
@@ -28,7 +28,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: hiyouga/verl:ngc-th2.6.0-cu120-vllm0.8.2
+      image: whatcanyousee/verl:ngc-th2.6.0-cu126-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

8 changes: 4 additions & 4 deletions .github/workflows/e2e_grpo.yml
@@ -24,7 +24,7 @@ permissions:
   contents: read
 
 jobs:
-  e2e_gsm8k_megatron-l20-0:
+  e2e_grpo-l20-0:
     runs-on: [self-hosted, l20-0]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
@@ -33,7 +33,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -54,7 +54,7 @@ jobs:
         run: |
           ray stop --force
           bash tests/e2e/run_qwen_grpo_megatron.sh
-  e2e_gsm8k_megatron-l20-1:
+  e2e_grpo-l20-1:
     runs-on: [self-hosted, l20-1]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
@@ -63,7 +63,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
      HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

2 changes: 1 addition & 1 deletion .github/workflows/e2e_gsm8k.yml
@@ -33,7 +33,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: hiyouga/verl:ngc-th2.6.0-cu120-vllm0.8.2
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

28 changes: 25 additions & 3 deletions .github/workflows/e2e_gsm8k_megatron.yml
@@ -1,5 +1,5 @@
 name: e2e_gsm8k_megatron
-# latest version: Megatron-LM core_r0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/core_r0.11.0
+# latest version: Megatron-LM v0.11.0 https://github.com/NVIDIA/Megatron-LM/tree/v0.11.0
 
 on:
   # Trigger the workflow on push or pull request,
@@ -26,7 +26,7 @@ permissions:
   contents: read
 
 jobs:
-  e2e_gsm8k_megatron:
+  e2e_gsm8k_megatron-l20-0:
     runs-on: [self-hosted, l20-0]
     timeout-minutes: 40 # Increase this timeout value as needed
     env:
@@ -35,7 +35,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
@@ -52,6 +52,28 @@ jobs:
         run: |
           ray stop --force
           bash tests/e2e/run_deepseek_megatron_parallelism.sh
+  e2e_gsm8k_megatron-l20-1:
+    runs-on: [self-hosted, l20-1]
+    timeout-minutes: 40 # Increase this timeout value as needed
+    env:
+      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
+      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
+      NO_PROXY: "localhost,127.0.0.1"
+      HF_HUB_ENABLE_HF_TRANSFER: 1
+    container:
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
+      options: --gpus all --shm-size=10g
+    steps:
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+      - name: Install the current repository
+        run: |
+          pip3 install hf_transfer
+          pip3 install -e .[test]
+      - name: Prepare gsm8k dataset
+        run: |
+          python3 examples/data_preprocess/gsm8k.py
+      - name: Running gsm8k e2e training tests with 3D parallelism on 8 L20 GPUs with Megatron (Qwen)
+        run: |
+          ray stop --force

2 changes: 1 addition & 1 deletion .github/workflows/e2e_gsm8k_prime.yml
@@ -30,7 +30,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: hiyouga/verl:ngc-th2.6.0-cu120-vllm0.8.2
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

2 changes: 1 addition & 1 deletion .github/workflows/e2e_lora.yml
@@ -33,7 +33,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

2 changes: 1 addition & 1 deletion .github/workflows/e2e_sft.yml
@@ -33,7 +33,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

2 changes: 1 addition & 1 deletion .github/workflows/e2e_vlm_geo3k.yml
@@ -27,7 +27,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: hiyouga/verl:ngc-th2.6.0-cu120-vllm0.8.2
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=40g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

2 changes: 1 addition & 1 deletion .github/workflows/model.yml
@@ -27,7 +27,7 @@ jobs:
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

4 changes: 2 additions & 2 deletions .github/workflows/ray_test.yml
@@ -31,14 +31,14 @@ permissions:
 jobs:
   ray:
     runs-on: [self-hosted, l20-0]
-    timeout-minutes: 5 # Increase this timeout value as needed
+    timeout-minutes: 10 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

4 changes: 2 additions & 2 deletions .github/workflows/sandbox.yml
@@ -23,14 +23,14 @@ permissions:
 jobs:
   sandbox:
     runs-on: [self-hosted, l20-0]
-    timeout-minutes: 3 # Increase this timeout value as needed
+    timeout-minutes: 10 # Increase this timeout value as needed
     env:
       HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
       HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
       NO_PROXY: "localhost,127.0.0.1"
       HF_HUB_ENABLE_HF_TRANSFER: 1
     container:
-      image: verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
+      image: whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

2 changes: 2 additions & 0 deletions .github/workflows/vllm.yml
@@ -43,13 +43,15 @@ jobs:
           pip3 install hf_transfer
           pip3 install -e .[test]
           pip3 install vllm==0.5.4
+          pip3 install flash_attn
       - name: Running vllm tests on 8 L20 GPUs
         run: |
           cd tests/rollout
           torchrun --standalone --nnodes=1 --nproc_per_node=8 $(which pytest) -s test_vllm_hf_loader.py
       - name: Test the latest vLLM
         run: |
           pip3 install --upgrade vllm==0.7.3
+          pip3 install flash_attn
           cd tests/rollout
           torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py
       - name: Run Qwen 0.5B generation test

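To debug the rollout tests outside CI, a minimal local sketch; it assumes the updated image (so vLLM 0.8.2 is already present) and a 4-GPU host, neither of which is spelled out by this workflow:

    # Run the SPMD rollout test from a verl checkout inside the container (sketch)
    pip3 install -e .[test]
    pip3 install flash_attn
    cd tests/rollout
    torchrun --standalone --nnodes=1 --nproc_per_node=4 $(which pytest) -s test_vllm_spmd.py
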
39 changes: 33 additions & 6 deletions docker/Dockerfile.megatron
@@ -1,9 +1,36 @@
-FROM verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
-
-RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
-
-RUN cd /opt/nvidia && git clone --single-branch --branch core_r0.11.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
-
-# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
-# unset for now
-RUN cd /opt/nvidia/Megatron-LM && pip3 install --no-deps -e .
+FROM hiyouga/verl:ngc-th2.6.0-cu120-vllm0.8.2
+
+# Define environments
+ENV MAX_JOBS=64
+
+RUN apt-get update && \
+    apt-get install -y aria2
+
+# 1. Reinstall CUDA 12.4
+RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
+    mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
+
+RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
+
+RUN dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
+
+RUN cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/
+
+RUN apt-get update
+
+RUN apt-get -y install cuda-toolkit-12-4
+
+RUN rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb
+
+RUN update-alternatives --set cuda /usr/local/cuda-12.4
+
+# 2. Install Apex
+RUN git clone https://github.com/NVIDIA/apex.git && \
+    cd apex && \
+    pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
+
+# 3. Install TransformerEngine
+RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps git+https://github.com/NVIDIA/[email protected]
+
+# 4. Install Megatron-LM
+RUN pip3 install git+https://github.com/NVIDIA/[email protected]

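A build sketch for this Dockerfile; the tag mirrors the image name used by the workflows above and is an assumption here, since the PR does not show how the CI image is actually built and pushed:

    # Build the Megatron + vLLM image from the repo root (tag is illustrative)
    docker build -f docker/Dockerfile.megatron -t whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0 .
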
9 changes: 5 additions & 4 deletions docs/advance/checkpoint.rst
@@ -84,10 +84,11 @@ So example use of Megatron model merger is:
 
 .. code:: bash
 
-    python3 scripts/model_merger.py --backend megatron \
-        --is-value-model \
-        --hf_model_path Qwen/Qwen2-7B \
-        --local_dir checkpoints/verl_megatron_gsm8k_examples/deepseek_megatron_checkpoint_saveload/global_step_1/actor/model
+    python scripts/model_merger.py \
+        --backend megatron \
+        --tie-word-embedding \
+        --hf_model_path Qwen/Qwen2.5-0.5B \
+        --local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor
 
 Megatron Merger details
 -----------------------

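As a follow-up sanity check, the merged output can be loaded like any Hugging Face checkpoint. A sketch, assuming the merger wrote HF-format weights into a directory named merged_hf_model (a placeholder; the real output location depends on the merger's arguments):

    # Confirm the merged checkpoint loads with transformers (sketch)
    python3 -c "
    from transformers import AutoModelForCausalLM, AutoTokenizer
    model = AutoModelForCausalLM.from_pretrained('merged_hf_model')   # placeholder path
    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B')    # tokenizer from the base HF model
    print(model.config)
    "
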
30 changes: 14 additions & 16 deletions docs/start/install.rst
@@ -19,7 +19,7 @@ Choices of Backend Engines
 
 We recommend using **FSDP** backend to investigate, research and prototype different models, datasets and RL algorithms. The guide for using FSDP backend can be found in :doc:`FSDP Workers<../workers/fsdp_workers>`.
 
-For users who pursue better scalability, we recommend using **Megatron-LM** backend. Currently, we support Megatron-LM v0.11 [1]_. The guide for using Megatron-LM backend can be found in :doc:`Megatron-LM Workers<../workers/megatron_workers>`.
+For users who pursue better scalability, we recommend using **Megatron-LM** backend. Currently, we support `Megatron-LM v0.11<https://github.com/NVIDIA/Megatron-LM/tree/v0.11.0>`_. The guide for using Megatron-LM backend can be found in :doc:`Megatron-LM Workers<../workers/megatron_workers>`.
 
 .. note::
 
@@ -40,17 +40,15 @@ Install from docker image
 
 We provide pre-built Docker images for quick setup. For SGLang usage, please follow the later sections in this doc.
 
-Image and tag: ``whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6`` if you need both FSDP and Megatron support.
+Image and tag: ``whatcanyousee/verl:ngc-th2.6.0-cu124-vllm0.8.2-mcore0.11.0``. Check files under ``docker/`` for NGC-based image or if you want to build your own.
 
-We highly recommend ``hiyouga/verl:ngc-th2.6.0-cu120-vllm0.8.2-verl0.3.0.post1`` with vllm v0.8.2 for fastest rollout performance with FSDP.
-
-See files under ``docker/`` for NGC-based image or if you want to build your own.
-
-1. Launch the desired Docker image:
+1. Launch the desired Docker image and attach into it:
 
 .. code:: bash
 
-    docker run --runtime=nvidia -it --rm --shm-size="10g" --cap-add=SYS_ADMIN -v <image:tag>
+    docker create --runtime=nvidia --gpus all --net=host --shm-size="10g" --cap-add=SYS_ADMIN -v .:/workspace/verl --name verl <image:tag>
+    docker start verl
+    docker exec -it verl bash
 
 2. Inside the container, install latest verl:
@@ -65,14 +63,14 @@
 
 The Docker image ``whatcanyousee/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te2.0-megatron0.11.0-v0.0.6`` is built with the following configurations:
 
-- **PyTorch**: 2.4.0+cu124
-- **CUDA**: 12.4
-- **Megatron-LM**: core_r0.11.0
-- **vLLM**: 0.6.3
-- **Ray**: 2.10.0
-- **TransformerEngine**: 2.0.0+754d2a0
+- **PyTorch**: 2.6.0+cu124
+- **CUDA**: 12.6
+- **Megatron-LM**: v0.11.0
+- **vLLM**: 0.8.2
+- **Ray**: 2.44.0
+- **TransformerEngine**: 2.1.0+8eb1712
 
-Now verl has been **compatible to Megatron-LM core_r0.11.0**, and there is **no need to apply patches** to Megatron-LM. Also, the image has integrated **Megatron-LM core_r0.11.0**, located at ``/opt/nvidia/Meagtron-LM``. One more thing, because verl only use ``megatron.core`` module for now, there is **no need to modify** ``PATH`` if you have installed Megatron-LM with this docker image.
+Now verl has been **compatible to Megatron-LM v0.11.0**, and there is **no need to apply patches** to Megatron-LM. Also, the image has integrated **Megatron-LM v0.11.0**, located at ``/opt/nvidia/Meagtron-LM``. One more thing, because verl only use ``megatron.core`` module for now, there is **no need to modify** ``PATH`` if you have installed Megatron-LM with this docker image.
 
@@ -127,7 +125,7 @@ own post-training jobs.
 
 .. code:: bash
 
     # install verl together with some lightweight dependencies in setup.py
-    pip3 install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu124
+    pip3 install torch==2.6.0 --index-url https://download.pytorch.org/whl/cu124
     pip3 install flash-attn --no-build-isolation
     git clone https://github.com/volcengine/verl.git
     cd verl

Expand Down
2 changes: 1 addition & 1 deletion examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
algorithm.use_kl_in_reward=False \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.project_name='try_fix_megatron_loss_calc' \
trainer.experiment_name='deepseek_llm_7b_function_rm_math_megatron' \
trainer.n_gpus_per_node=16 \
trainer.nnodes=1 \
Expand Down
2 changes: 1 addition & 1 deletion examples/grpo_trainer/run_qwen2-7b_math_megatron.sh
@@ -41,7 +41,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
     algorithm.use_kl_in_reward=False \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
-    trainer.project_name='verl_grpo_example_gsm8k' \
+    trainer.project_name='try_fix_megatron_loss_calc' \
     trainer.experiment_name='qwen2_7b_function_rm_megatron' \
     trainer.n_gpus_per_node=16 \
     trainer.nnodes=1 \

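To exercise one of these example scripts against the new stack, a launch sketch; it assumes the required dataset has already been preprocessed (see examples/data_preprocess/) and that the node provides the 16 GPUs requested by trainer.n_gpus_per_node=16:

    # Launch the Qwen2-7B GRPO Megatron example from the repo root (sketch)
    ray stop --force
    bash examples/grpo_trainer/run_qwen2-7b_math_megatron.sh
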