vllm-project · vllm-bot · Apr 27, 2025 · Mar 31, 2025 · Apr 6, 2025 · Apr 7, 2025
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -294,6 +294,7 @@ steps:
   commands:
     - pytest -v -s compile/test_pass_manager.py
     - pytest -v -s compile/test_fusion.py
+    - pytest -v -s compile/test_sequence_parallelism.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
   source_file_dependencies:
@@ -547,6 +548,8 @@ steps:
   - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  # test sequence parallel
+  - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
   # TODO: investigate and fix
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py

diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py
@@ -10,7 +10,7 @@
                                      kFp8DynamicTokenSym, kFp8StaticTensorSym)
 from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.noop_elimination import NoOpEliminationPass
-from vllm.config import CompilationConfig
+from vllm.config import CompilationConfig, VllmConfig
 
 from .backend import TestBackend
 
@@ -49,13 +49,14 @@ def test_fix_functionalization(model: str, quant_key: QuantKey,
                                do_fusion: bool):
     torch.set_default_device("cuda")
 
-    config = CompilationConfig.PassConfig(enable_fusion=do_fusion,
-                                          enable_noop=True)
-    noop_pass = NoOpEliminationPass(config)
-    fusion_pass = FusionPass.instance(config)
+    vllm_config = VllmConfig(pass_config= \
+        CompilationConfig.PassConfig(enable_fusion=do_fusion,
+                                          enable_noop=True))
+    noop_pass = NoOpEliminationPass(vllm_config)
+    fusion_pass = FusionPass.instance(vllm_config)
 
     passes = [noop_pass, fusion_pass] if do_fusion else [noop_pass]
-    func_pass = FixFunctionalizationPass(config)
+    func_pass = FixFunctionalizationPass(vllm_config)
     backend_func = TestBackend(*passes, func_pass)
     backend_no_func = TestBackend(*passes)
 

diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
@@ -77,12 +77,13 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
 
     vllm_config = VllmConfig(compilation_config=CompilationConfig(
         level=CompilationLevel.PIECEWISE, custom_ops=["+rms_norm"]))
+    vllm_config.compilation_config.pass_config = \
+            CompilationConfig.PassConfig(enable_fusion=True,
+                                              enable_noop=True)
     with vllm.config.set_current_vllm_config(vllm_config):
         # Reshape pass is needed for the fusion pass to work
-        config = CompilationConfig.PassConfig(enable_fusion=True,
-                                              enable_noop=True)
-        noop_pass = NoOpEliminationPass(config)
-        fusion_pass = FusionPass.instance(config)
+        noop_pass = NoOpEliminationPass(vllm_config)
+        fusion_pass = FusionPass.instance(vllm_config)
 
         backend = TestBackend(noop_pass, fusion_pass)
         model = TestModel(hidden_size, eps, static, cutlass_fp8_enabled)

diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py
@@ -6,7 +6,7 @@
 
 from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
 from vllm.compilation.pass_manager import PostGradPassManager
-from vllm.config import CompilationConfig
+from vllm.config import VllmConfig
 
 
 # dummy custom pass that doesn't inherit
@@ -16,7 +16,7 @@ def simple_callable(graph: torch.fx.Graph):
 
 # Should fail to add directly to the pass manager
 def test_bad_callable():
-    config = CompilationConfig().pass_config
+    config = VllmConfig()
 
     pass_manager = PostGradPassManager()
     pass_manager.configure(config)
@@ -43,7 +43,7 @@ def __call__(self, graph: torch.fx.graph.Graph) -> None:
     ],
 )
 def test_pass_manager_uuid(callable):
-    config = CompilationConfig().pass_config
+    config = VllmConfig()
 
     pass_manager = PostGradPassManager()
     pass_manager.configure(config)
@@ -64,7 +64,8 @@ def test_pass_manager_uuid(callable):
 
     # UUID should be different due to config change
     config2 = copy.deepcopy(config)
-    config2.enable_fusion = not config2.enable_fusion
+    config2.compilation_config.pass_config.enable_fusion = not \
+        config2.compilation_config.pass_config.enable_fusion
     pass_manager3 = PostGradPassManager()
     pass_manager3.configure(config2)
     pass_manager3.add(callable)

diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import tempfile
+from pathlib import Path
+
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig
+
+ALL_REDUCE_OP = "torch.ops.vllm.all_reduce.default"
+ALL_GATHER_OP = "torch.ops.vllm.all_gather.default"
+REDUCE_SCATTER_OP = "torch.ops.vllm.reduce_scatter.default"
+
+
+def count_comm_ops(graph_path):
+    """Count lines of a dumped graph file that mention each collective op.
+
+    Args:
+        graph_path: Path of the dumped graph source file to scan.
+
+    Returns:
+        Tuple ``(all_reduce, all_gather, reduce_scatter)`` with the number
+        of lines containing each op name (a line counts once per op).
+
+    Raises:
+        OSError: If the dump cannot be opened; deliberately not swallowed,
+            so a missing dump is not mistaken for a graph with zero ops.
+    """
+    ops = (ALL_REDUCE_OP, ALL_GATHER_OP, REDUCE_SCATTER_OP)
+    with open(graph_path) as f:
+        lines = f.readlines()
+    return tuple(sum(op in line for line in lines) for op in ops)
+
+
+def test_sequence_parallelism_compilation():
+    """Compile a TP=2 model with the sequence parallelism pass enabled and
+    check both the generated text and the collectives in the dumped graphs:
+    all_reduce before the pass; all_gather + reduce_scatter after it."""
+    temp_dir = tempfile.mkdtemp()
+
+    config = CompilationConfig(
+        level=3,
+        custom_ops=["+rms_norm"],
+        compile_sizes=[4, 8],
+        splitting_ops=[],
+    )
+    # Dump the graph around the pass so the collectives can be counted below.
+    config.pass_config.enable_sequence_parallelism = True
+    config.pass_config.dump_graph_dir = Path(temp_dir)
+    config.pass_config.dump_graph_stages = \
+        ["before_sequence_parallelism_pass", "after_sequence_parallelism_pass"]
+
+    sampling_params = SamplingParams(temperature=0, )
+
+    llm = LLM(model="unsloth/Llama-3.2-1B-Instruct",
+              enforce_eager=False,
+              tensor_parallel_size=2,
+              dtype=torch.float16,
+              max_num_batched_tokens=2048,
+              compilation_config=config)
+
+    prompts = [
+        "Can you calculate 19 + 20?", "How to make a cake?",
+        "How old a baby can start to try solid food?",
+        "What's pros and cons of using a pacifier for baby?"
+    ]
+
+    answers = [
+        " I'll let you know if you're correct", " A step-by-step guide",
+        " Most pediatricians recommend ", " The American Academy of Pediatrics"
+    ]
+
+    outputs = llm.generate(prompts, sampling_params)
+    for output, answer in zip(outputs, answers):
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+        assert generated_text.startswith(answer)
+
+    # Before the pass the graph should contain all_reduce ops only.
+    before_graph = os.path.join(temp_dir,
+                                "before_sequence_parallelism_pass-0.py")
+    c1, c2, c3 = count_comm_ops(before_graph)
+    stage = "before apply sequence parallelism pass"
+    assert c1 > 0, f"Expected all_reduce ops, but found 0 {stage}"
+    assert c2 == 0, f"Expected 0 all_gather ops, but found {c2} {stage}"
+    assert c3 == 0, f"Expected 0 reduce_scatter ops, but found {c3} {stage}"
+
+    # After the pass: no all_reduce left, all_gather/reduce_scatter present.
+    after_graph = os.path.join(temp_dir,
+                               "after_sequence_parallelism_pass-0.py")
+    c1, c2, c3 = count_comm_ops(after_graph)
+    stage = "after apply sequence parallelism pass"
+    assert c1 == 0, f"Expected 0 all_reduce ops, but found {c1} {stage}"
+    assert c2 > 0, f"Expected all_gather ops, but found 0 {stage}"
+    assert c3 > 0, f"Expected reduce_scatter ops, but found 0 {stage}"
+    assert c2 == c3, ("Expected all_gather ops and reduce_scatter ops to be "
+                      f"equal, but found {c2} and {c3} {stage}")
diff --git a/tests/distributed/test_comm_ops.py b/tests/distributed/test_comm_ops.py
@@ -14,7 +14,8 @@
 
 from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
                               tensor_model_parallel_all_gather,
-                              tensor_model_parallel_all_reduce)
+                              tensor_model_parallel_all_reduce,
+                              tensor_model_parallel_reduce_scatter)
 
 from ..utils import init_test_distributed_environment, multi_process_parallel
 
@@ -47,6 +48,34 @@ def all_reduce_test_worker(
     torch.testing.assert_close(t, expected)
 
 
+@ray.remote(num_gpus=1, max_calls=1)
+def reduce_scatter_test_worker(monkeypatch: pytest.MonkeyPatch, tp_size: int,
+                               pp_size: int, rank: int,
+                               distributed_init_port: str):
+    """Check tensor_model_parallel_reduce_scatter against a local reference.
+
+    Every worker builds all ``tp_size`` inputs, sums them locally, and expects
+    the call (made with 0 as second argument) to return its slice of the sum.
+    """
+    # CUDA_VISIBLE_DEVICES must be removed so each worker sees every GPU
+    # and can then bind itself to the GPU that matches its rank.
+    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
+    torch.cuda.set_device(torch.device(f"cuda:{rank}"))
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+
+    num_elements = 8
+    base = torch.arange(num_elements, dtype=torch.float32, device="cuda")
+    inputs = [base * (r + 1) for r in range(tp_size)]
+
+    part = rank % tp_size
+    part_len = num_elements // tp_size
+    summed = torch.stack(inputs, dim=0).sum(dim=0)
+    expected = summed[part * part_len:(part + 1) * part_len]
+    result = tensor_model_parallel_reduce_scatter(inputs[part], 0)
+    torch.testing.assert_close(result, expected)
+
+
 @ray.remote(num_gpus=1, max_calls=1)
 def all_gather_test_worker(
     monkeypatch: pytest.MonkeyPatch,