import json
import os
import unittest
import warnings
from datetime import datetime
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

# Minimum acceptable mgsm_en score per model; the run fails if any model
# scores below its threshold.
MODEL_SCORE_THRESHOLDS = {
    "meta-llama/Llama-3.1-8B-Instruct": 0.82,
    "mistralai/Mistral-7B-Instruct-v0.3": 0.56,
    "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": 0.85,
    "meta-llama/Llama-3.1-70B-Instruct": 0.95,
    "mistralai/Mixtral-8x7B-Instruct-v0.1": 0.64,
    "Qwen/Qwen2-57B-A14B-Instruct": 0.86,
    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8": 0.81,
    "neuralmagic/Mistral-7B-Instruct-v0.3-FP8": 0.54,
    "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8": 0.94,
    "neuralmagic/Qwen2-72B-Instruct-FP8": 0.94,
    "neuralmagic/Qwen2-57B-A14B-Instruct-FP8": 0.82,
}

# Models currently failing on AMD MI300x.
failing_models = {
    "google/gemma-2-27b-it",
    "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8",
    "neuralmagic/gemma-2-2b-it-FP8",
    "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8",
}


def remove_failing_models(model_str):
    """Drop any comma-separated entries that appear in `failing_models`."""
    models = model_str.split(",")
    filtered = [m for m in models if m not in failing_models]
    return ",".join(filtered)
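
# Example (hypothetical input list; "google/gemma-2-27b-it" is in `failing_models`):
#   remove_failing_models("meta-llama/Llama-3.1-8B-Instruct,google/gemma-2-27b-it")
#   -> "meta-llama/Llama-3.1-8B-Instruct"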


# Filter the known-failing models out of each nightly model list before
# the test collects them.
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = remove_failing_models(
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
)
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = remove_failing_models(
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
)
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = remove_failing_models(
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1
)
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = remove_failing_models(
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2
)


def parse_models(model_string):
    """Split a comma-separated model string into a list of clean model names."""
    return [model.strip() for model in model_string.split(",") if model.strip()]
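
# Example: parse_models(" a , ,b ") -> ["a", "b"]
# (whitespace is stripped and empty entries are discarded)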


def popen_launch_server_wrapper(base_url, model, is_tp2):
    """Launch an sglang server for `model`, optionally with tensor parallelism 2."""
    other_args = ["--log-level-http", "warning", "--trust-remote-code"]
    if is_tp2:
        other_args.extend(["--tp", "2"])

    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_args,
    )
    return process
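
# The returned value is a subprocess handle; callers are responsible for
# tearing the server down with `kill_process_tree(process.pid)` when done.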


def write_results_to_json(model, metrics, mode="a"):
    """Record one result entry in results.json (mode="w" starts the file fresh)."""
    result = {
        "timestamp": datetime.now().isoformat(),
        "model": model,
        "metrics": metrics,
        "score": metrics["score"],
    }

    existing_results = []
    if mode == "a" and os.path.exists("results.json"):
        try:
            with open("results.json", "r") as f:
                existing_results = json.load(f)
        except json.JSONDecodeError:
            # A corrupt or empty file is treated as having no prior results.
            existing_results = []

    if isinstance(existing_results, list):
        existing_results.append(result)
    else:
        existing_results = [result]

    with open("results.json", "w") as f:
        json.dump(existing_results, f, indent=2)
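
# results.json ends up holding a list of entries shaped like the following
# (illustrative values; `metrics` is whatever run_eval returns):
#   [{"timestamp": "2025-01-01T00:00:00", "model": "...", "metrics": {...}, "score": 0.84}]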


def check_model_scores(results):
    """Fail the test if any model scored below its configured threshold."""
    failed_models = []
    summary = "| model | score | threshold |\n"
    summary += "| ----- | ----- | --------- |\n"

    for model, score in results:
        threshold = MODEL_SCORE_THRESHOLDS.get(model)
        if threshold is None:
            print(f"Warning: No threshold defined for model {model}")
            continue

        if score < threshold:
            failed_models.append(
                f"\nScore Check Failed: {model}\n"
                f"Model {model} score ({score:.4f}) is below threshold ({threshold:.4f})"
            )

        summary += f"| {model} | {score} | {threshold} |\n"

    print(summary)

    if is_in_ci():
        write_github_step_summary(f"### TestNightlyGsm8KEval\n{summary}")

    if failed_models:
        raise AssertionError("\n".join(failed_models))
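
# Example step summary (illustrative scores):
#   | model | score | threshold |
#   | ----- | ----- | --------- |
#   | meta-llama/Llama-3.1-8B-Instruct | 0.84 | 0.82 |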


# Do not use `CustomTestCase` here: `test_mgsm_en_all_models` should not be retried.
class TestNightlyGsm8KEval(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Each group is (models, is_fp8, is_tp2).
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST

    def test_mgsm_en_all_models(self):
        warnings.filterwarnings(
            "ignore", category=ResourceWarning, message="unclosed.*socket"
        )
        is_first = True
        all_results = []

        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                with self.subTest(model=model):
                    process = popen_launch_server_wrapper(self.base_url, model, is_tp2)

                    try:
                        args = SimpleNamespace(
                            base_url=self.base_url,
                            model=model,
                            eval_name="mgsm_en",
                            num_examples=None,
                            num_threads=1024,
                        )

                        metrics = run_eval(args)
                        print(
                            f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                        )

                        write_results_to_json(model, metrics, "w" if is_first else "a")
                        is_first = False

                        all_results.append((model, metrics["score"]))
                    finally:
                        # Always tear the server down, even if the eval raised,
                        # so the next launch does not collide with a stale server.
                        kill_process_tree(process.pid)

        try:
            with open("results.json", "r") as f:
                print("\nFinal Results from results.json:")
                print(json.dumps(json.load(f), indent=2))
        except Exception as e:
            print(f"Error reading results.json: {e}")

        # Check all scores after collecting all results.
        check_model_scores(all_results)


if __name__ == "__main__":
    unittest.main()