
Commit 00ae250

[V1][eagle3] Support eagle3 proposer for v1 (#1032)
### What this PR does / why we need it?

This PR implements the Eagle Proposer feature for vLLM v1, which enables more efficient speculative decoding by using a draft model to predict potential future tokens.

- The implementation integrates the core Eagle algorithm with vLLM's existing architecture, allowing faster inference while maintaining output quality.
- This is needed to significantly improve the generation speed of large language models without compromising the quality of the generated text.

### Does this PR introduce any user-facing change?

Yes, this PR introduces a new speculative decoding mode that can be enabled via configuration.

- Users can now opt into the Eagle Proposer by setting the appropriate flags in the inference configuration.
- The API remains backward compatible, with the new functionality being opt-in.

### How was this patch tested?

CI passed with new unit tests added for the Eagle Proposer functionality.

- Benchmark tests compared generation speed and quality with and without the Eagle Proposer.
- Integration tests were performed with various model architectures to ensure compatibility.
- Manual testing was done with different prompt scenarios to verify that output quality remains consistent.
- We tested the acceptance rate on one Ascend 910B NPU; the results are basically consistent with those shown in vllm-project/vllm#16937.
- Currently, we support scenarios where num_spec_tokens <= 2. When num_spec_tokens > 2, issues such as insufficient GPU memory and operator computation errors may occur. We will address this in subsequent updates.
- We will add support for Eagle v1 in future updates.

### Acceptance Test Script

```bash
SCRIPT="/offline/eagle.py"
DATASET="ShareGpt"
MODEL=Meta-Llama-3.1-8B-Instruct
DRAFT=EAGLE3-LLaMA3.1-Instruct-8B

CUDA_VISIBLE_DEVICES="0" VLLM_USE_V1=1 $PYTHON $SCRIPT \
    --dataset $DATASET \
    --num_spec_tokens 2 \
    --max_num_seqs 1 \
    --model_dir $MODEL \
    --eagle_dir $DRAFT \
    --tp 1 \
    --num_prompts 80
```

### Acceptance Test Results

```bash
██████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [21:22<00:00, 16.03s/it, est. speed input: 4.72 toks/s, output: 13.56 toks/s]
-------------------------------------------------------------------------------------
mean acceptance length: 1.63
-------------------------------------------------------------------------------------
total_counts: 8062
acceptance at token 0: 1.00 (8062 times)
acceptance at token 1: 0.70 (5612 times)
acceptance at token 2: 0.47 (3765 times)
```

Closes: #1004

---------

Signed-off-by: yuancaoyaoHW <[email protected]>
1 parent 45be1aa commit 00ae250
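
As a usage note: below is a minimal sketch of enabling the new opt-in mode from Python. It mirrors the `speculative_config` fields exercised in the test diff further down; the target and draft checkpoints are the ones named in the acceptance test script, and the remaining arguments are illustrative assumptions rather than required settings.

```python
import os

# The eagle3 proposer in this PR targets the v1 engine
# (assumption: set the flag before importing vLLM).
os.environ["VLLM_USE_V1"] = "1"

from vllm import LLM, SamplingParams

# Target and Eagle3 draft checkpoints as used in the acceptance test script;
# substitute local paths or model IDs as appropriate.
llm = LLM(
    model="Meta-Llama-3.1-8B-Instruct",
    speculative_config={
        "method": "eagle3",               # opt in to the Eagle3 proposer
        "model": "EAGLE3-LLaMA3.1-Instruct-8B",
        "num_speculative_tokens": 2,      # num_spec_tokens <= 2 is supported for now
        "max_model_len": 2048,
    },
    max_model_len=2048,
    tensor_parallel_size=1,
    enforce_eager=True,
)

outputs = llm.generate(
    ["The future of speculative decoding is"],
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```

Since the target model verifies every proposed token, the draft model mainly affects speed; accepted outputs still follow the target model's distribution.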

File tree

5 files changed: +734 -25 lines changed


.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 1 addition & 1 deletion
```diff
@@ -100,7 +100,7 @@ jobs:
           # spec decode test
           VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
           # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
-          # VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py
           VLLM_USE_MODELSCOPE=True pytest -sv tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py # it needs a clean process
           pytest -sv tests/e2e/long_term/spec_decode --ignore=tests/e2e/long_term/spec_decode/e2e/test_mtp_correctness.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py --ignore=tests/e2e/long_term/spec_decode/e2e/test_v1_mtp_correctness.py
           pytest -sv tests/e2e/long_term/test_accuracy.py
```

tests/e2e/long_term/spec_decode/e2e/test_v1_spec_decode.py

Lines changed: 12 additions & 6 deletions
```diff
@@ -11,7 +11,7 @@
 @pytest.fixture
 def test_prompts():
     prompt_types = ["repeat", "sentence"]
-    num_prompts = 100
+    num_prompts = 10
     prompts = []
 
     random.seed(0)
@@ -69,6 +69,7 @@ def test_ngram_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using ngram speculative decoding.
     '''
+    pytest.skip("Not current support for the test.")
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
@@ -116,11 +117,12 @@ def test_eagle_correctness(
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using eagle speculative decoding.
     '''
-    pytest.skip("Not current support for the test.")
+    if not use_eagle3:
+        pytest.skip("Not current support for the test.")
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")
 
-        ref_llm = LLM(model=model_name, max_model_len=2048)
+        ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
 
@@ -129,13 +131,17 @@ def test_eagle_correctness(
         spec_llm = LLM(
             model=model_name,
             trust_remote_code=True,
+            enable_chunked_prefill=True,
+            max_num_seqs=1,
+            max_num_batched_tokens=2048,
+            gpu_memory_utilization=0.6,
             speculative_config={
                 "method": "eagle3" if use_eagle3 else "eagle",
                 "model": spec_model_name,
-                "num_speculative_tokens": 3,
-                "max_model_len": 2048,
+                "num_speculative_tokens": 2,
+                "max_model_len": 128,
             },
-            max_model_len=2048,
+            max_model_len=128,
             enforce_eager=True,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
```
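
The assertion that concludes `test_eagle_correctness` sits below this hunk and is unchanged, so it does not appear in the diff. For orientation only, here is a rough sketch of the kind of check such correctness tests make; the 66% threshold and exact variable handling are assumptions, not taken from this file.

```python
# Sketch only (not part of this diff): compare reference and speculative
# outputs prompt by prompt and require most of them to match exactly,
# since speculative decoding can still perturb a small fraction of outputs.
matches = sum(
    ref.outputs[0].text == spec.outputs[0].text
    for ref, spec in zip(ref_outputs, spec_outputs))
assert matches > int(0.66 * len(ref_outputs))  # threshold is an assumption
```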

tests/e2e/long_term/test_deepseek_v2_lite_tp2_accuracy.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -38,7 +38,7 @@
 
 
 def run_test(model_name, queue, more_args=None):
-    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4"
+    model_args = f"pretrained={model_name},max_model_len=4096,trust_remote_code=True,tensor_parallel_size=4,enforce_eager=True"
     if more_args is not None:
         model_args = f"{model_args},{more_args}"
     results = lm_eval.simple_evaluate(
```
