Skip to content

Commit 97e49d4

Browse files
committed
[Bug] Bump SGLang version to 0.4.6.post4; Fix AsyncSGLangRollout
Similar to sgl-project/sglang#5997: in the PP PR sgl-project/sglang#5724, the broadcast_pyobj function changed its condition from checking rank==0 (whether the rank is local rank 0 of the passed ProcessGroup) to rank==src (whether the rank is global rank src), which breaks VerlEngine's broadcast logic when dp>1 and tp>1. Signed-off-by: Hollow Man <[email protected]>
1 parent bc9062d commit 97e49d4

File tree

5 files changed

+10
-9
lines changed

5 files changed

+10
-9
lines changed

docker/Dockerfile.sglang

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \
3636
pip config set global.extra-index-url "${PIP_INDEX}" && \
3737
python -m pip install --upgrade pip
3838

39-
# Install sglang-0.4.6.post1 and torch-memory-saver
40-
RUN pip install "sglang[all]==0.4.6.post1" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
39+
# Install sglang-0.4.6.post4 and torch-memory-saver
40+
RUN pip install "sglang[all]==0.4.6.post4" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
4141

4242
# Install torch-2.6.0
4343
RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \

docker/Dockerfile.stage2.megatron

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ RUN apt-get update && \
77
apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g
88

99
# 4. Install Sglang
10-
RUN pip install --no-deps "sglang[all]>=0.4.5.post3"
10+
RUN pip install --no-deps "sglang[all]>=0.4.6.post3"
1111

1212
# 5. Install cudnn
1313
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \

requirements_sglang.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ torchdata
1717
torchvision
1818
transformers
1919
wandb
20-
sglang[all]==0.4.4.post4
20+
sglang[all]==0.4.6.post4
2121
torch-memory-saver>=0.0.5

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
VLLM_REQUIRES = ["tensordict<=0.6.2", "vllm<=0.8.3"]
5252
SGLANG_REQUIRES = [
5353
"tensordict<=0.6.2",
54-
"sglang[srt,openai]==0.4.6.post1",
54+
"sglang[srt,openai]==0.4.6.post4",
5555
"torch-memory-saver>=0.0.5",
5656
]
5757

verl/workers/rollout/sglang_rollout/async_sglang_rollout.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def initialize_tools(tools_config) -> list:
191191
dist.all_gather_object(visible_devices, os.environ["CUDA_VISIBLE_DEVICES"], device_mesh_cpu.get_group("tp"))
192192
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(visible_devices)
193193

194+
self._rank = device_mesh_cpu["tp"].get_rank()
194195
# initialize the inference engine
195196
monkey_patch_torch_reductions()
196197
nnodes = -(-tp_size // len(visible_devices))
@@ -199,7 +200,7 @@ def initialize_tools(tools_config) -> list:
199200
port = get_open_port() if port is None else port
200201
[ip, port] = broadcast_pyobj(
201202
[ip, port],
202-
rank=self._tp_rank,
203+
rank=self._rank,
203204
dist_group=device_mesh_cpu.get_group("tp"),
204205
src=device_mesh_cpu["tp"].mesh[0].item(),
205206
force_cpu_device=False,
@@ -423,7 +424,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
423424

424425
# free cache engine
425426
if self.config.free_cache_engine and self._engine is not None:
426-
self._engine.tokenizer_manager.flush_cache()
427+
self._engine.flush_cache()
427428

428429
return DataProto(batch=batch)
429430

@@ -591,7 +592,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro
591592

592593
[sorted_output_req_list] = broadcast_pyobj(
593594
data=[sorted_output_req_list],
594-
rank=self._tp_rank,
595+
rank=self._rank,
595596
dist_group=self._device_mesh_cpu["tp"].get_group(),
596597
src=self._device_mesh_cpu["tp"].mesh[0].item(),
597598
force_cpu_device=False,
@@ -681,7 +682,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro
681682

682683
# free cache engine
683684
if self.config.free_cache_engine and self._engine is not None and self._tp_rank == 0:
684-
self._engine.tokenizer_manager.flush_cache()
685+
self._engine.flush_cache()
685686

686687
return DataProto(batch=batch, non_tensor_batch={"messages": np.array(messages), "reward_scores": np.array(reward_scores)})
687688

0 commit comments

Comments (0)