Commit 72b7caa

* 'main' of https://github.com/domonic18/opencompass:
  [Refactor] Refactorize openicl eval task (open-compass#1990)
  [ci] update baseline for kernel change of vllm and lmdeploy (open-compass#2011)
  [Feature] Make dump-eval-details default behavior (open-compass#1999)
  [Fix] OpenICL Math Evaluator Config (open-compass#2007)
  [Feature] Add CascadeEvaluator (open-compass#1992)
  [Dataset] Add MedXpertQA (open-compass#2002)
  [Dataset] Update dingo 1.5.0 (open-compass#2008)
  [CI] fix baseline score (open-compass#2000)
  [Doc] Fix links between zh & en (open-compass#2001)
2 parents: 7b626ef + 1221320

59 files changed (+2280, -1421 lines)

.github/scripts/eval_regression_api.py

Lines changed: 3 additions & 3 deletions

@@ -24,9 +24,9 @@
         abbr='lmdeploy-api-test',
         type=OpenAISDK,
         key='EMPTY',
-        openai_api_base='http://0.0.0.0:23333/v1',
-        path='internlm2',
-        tokenizer_path='internlm/internlm2_5-7b-chat',
+        openai_api_base='http://localhost:23333/v1',
+        path='internlm3',
+        tokenizer_path='internlm/internlm3-8b-instruct',
         rpm_verbose=True,
         meta_template=api_meta_template,
         query_per_second=128,
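
The fields touched here belong to a single OpenAI-compatible model entry. Below is a minimal sketch of what the updated entry might look like after this change; the field values come from the diff, while the surrounding models = [dict(...)] wrapper, the import, and the api_meta_template definition are assumptions about the unchanged parts of the script:

from opencompass.models import OpenAISDK

# Common OpenCompass meta template for API chat models (assumed; not shown in the diff).
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
])

models = [
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',                                    # local endpoint, no real key needed
        openai_api_base='http://localhost:23333/v1',    # was http://0.0.0.0:23333/v1
        path='internlm3',                               # served model name, was internlm2
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
    )
]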

.github/scripts/eval_regression_base_models.py

Lines changed: 2 additions & 16 deletions

@@ -11,18 +11,10 @@
 from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
     winogrande_datasets  # noqa: F401, E501
 # read hf models - chat models
-from opencompass.configs.models.chatglm.hf_glm4_9b import \
-    models as hf_glm4_9b_model  # noqa: F401, E501
 from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
     models as lmdeploy_glm4_9b_model  # noqa: F401, E501
 from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
     models as hf_deepseek_7b_base_model  # noqa: F401, E501
-from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \
-    models as hf_deepseek_67b_base_model  # noqa: F401, E501
-from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \
-    models as hf_deepseek_moe_16b_base_model  # noqa: F401, E501
-from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \
-    models as hf_deepseek_v2_lite_model  # noqa: F401, E501
 from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
     models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
 from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \

@@ -49,12 +41,6 @@
     models as hf_internlm2_5_7b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
     models as hf_internlm2_7b_model  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \
-    models as hf_internlm2_20b_model  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \
-    models as hf_internlm2_base_7b_model  # noqa: F401, E501
-from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \
-    models as hf_internlm2_base_20b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
     models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \

@@ -65,14 +51,14 @@
     models as lmdeploy_internlm2_20b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
     models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
+from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
+    models as lmdeploy_internlm2_base_20b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_llama.hf_llama2_7b import \
     models as hf_llama2_7b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
     models as hf_llama3_1_8b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_llama.hf_llama3_8b import \
     models as hf_llama3_8b_model  # noqa: F401, E501
-from opencompass.configs.models.hf_llama.hf_llama3_70b import \
-    models as hf_llama3_70b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
     models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
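
Each of these imports exists only for its side effect of dropping a list named *_model into the module namespace; the script never references the names directly, hence the noqa: F401 markers. A short sketch of the aggregation idiom these regression configs rely on, assuming this file collects models the same way the chat-model script below collects datasets (the '_datasets' line is visible in that diff, the '_model' line here is an assumption):

# Every imported '*_model' / '*_datasets' list lands in the module namespace,
# so one comprehension gathers them all into flat lists for the runner.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

Dropping an import is therefore all it takes to remove a model from the regression run, which is why this change is almost entirely a deletion of import lines, plus the two-line lmdeploy_internlm2_base_20b addition.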

.github/scripts/eval_regression_chat_models.py

Lines changed: 29 additions & 12 deletions

@@ -15,14 +15,24 @@
     models as vllm_glm4_9b_chat_model  # noqa: F401, E501
 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
     models as hf_deepseek_7b_chat_model  # noqa: F401, E501
-from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \
-    models as hf_deepseek_67b_chat_model  # noqa: F401, E501
-from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \
-    models as hf_deepseek_moe_16b_chat_model  # noqa: F401, E501
-from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \
-    models as hf_deepseek_v2_lite_chat_model  # noqa: F401, E501
+from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
+    models as lmdeploy_deepseek_67b_chat_model  # noqa: F401, E501
+from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
+    models as \
+    lmdeploy_deepseek_r1_distill_llama_8b_model  # noqa: F401, E501
+from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
+    models as \
+    lmdeploy_deepseek_r1_distill_llama_70b_model  # noqa: F401, E501
+from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
+    models as \
+    lmdeploy_deepseek_r1_distill_qwen_1_5b_model  # noqa: F401, E501
+from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
+    models as \
+    lmdeploy_deepseek_r1_distill_qwen_32b_model  # noqa: F401, E501
 from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
     models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
+from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
+    models as lmdeploy_deepseek_v2_lite_model  # noqa: F401, E501
 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
     models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
 from opencompass.configs.models.gemma.hf_gemma2_2b_it import \

@@ -45,6 +55,8 @@
     models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
     models as hf_internlm2_5_20b_chat_model  # noqa: F401, E501
+from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
+    models as hf_internlm3_8b_instruct_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
     models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \

@@ -57,6 +69,8 @@
     models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
     models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
+from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
+    models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
 from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
     models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
 from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \

@@ -83,10 +97,6 @@
     models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
 from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
     models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
-from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \
-    models as hf_mixtral_8x7b_instruct_v0_1_model  # noqa: F401, E501
-from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \
-    models as hf_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
 from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
     models as \
     lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501

@@ -95,14 +105,19 @@
 from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
     models as \
     lmdeploy_mistral_small_instruct_2409_model  # noqa: F401, E501
+from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
+    models as \
+    lmdeploy_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
     models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
     models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
+from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
+    models as vllm_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
 from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
     models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
-from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \
-    models as hf_phi_3_mini_4k_instruct_model  # noqa: F401, E501
+from opencompass.configs.models.phi.hf_phi_4 import \
+    models as hf_phi_4_model  # noqa: F401, E501
 from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
     models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
 from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \

@@ -142,6 +157,8 @@

 from ...volc import infer as volc_infer  # noqa: F401, E501

+hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'
+
 race_datasets = [race_datasets[1]]
 datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
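
The last hunk shows why these config imports are easy to patch locally: each imported *_model object is an ordinary Python list of config dicts, so the script can repoint a single field after import. The new line retargets the glm-4-9b chat entry at the -hf checkpoint variant without editing the shared config file. A minimal sketch of the same pattern, assuming the import path follows the chatglm config naming used elsewhere in this script:

# Assumed import path, matching the other chatglm configs referenced in this file.
from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
    models as hf_glm4_9b_chat_model  # noqa: F401, E501

# The imported object is a plain list of dicts, so one field can be overridden in place.
hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'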

.github/scripts/oc_score_assert.py

Lines changed: 25 additions & 15 deletions

@@ -175,10 +175,11 @@ def test_api(self, baseline_scores, result_scores, model, dataset):
 class TestVolcFullbench:
     """Test cases for chat model."""

-    @pytest.mark.parametrize(
-        'model, dataset',
-        [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
-         for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')])
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
+        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
+        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
+    ] for p2 in dataset_list(p1, 'objective')])
     @pytest.mark.chat_objective
     def test_chat_objective(self, baseline_scores_fullbench, result_scores,
                             model, dataset):

@@ -245,10 +246,7 @@ class TestCmdCase:
     @pytest.mark.parametrize('model, dataset',
                              [('internlm2_5-7b-hf', 'race-middle_accuracy'),
                               ('internlm2_5-7b-hf', 'race-high_accuracy'),
-                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-middle_accuracy'),
-                              ('internlm2-1.8b-hf', 'race-high_accuracy'),
-                              ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')])
+                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
     def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)

@@ -260,9 +258,9 @@ def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
         [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
          ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
          ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'),
-         ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')])
+         ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)

@@ -280,13 +278,25 @@ def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):

     @pytest.mark.case4
     @pytest.mark.parametrize(
-        'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'race-high_accuracy'),
-                           ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')])
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
     def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(model, result_score, base_score, dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.case5
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
+    def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)


 def assert_score(model_type, score, baseline, dataset: str = ''):
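
Note that the new case4 and case5 tests pass model + '_batch' into assert_score, so the helper evidently keys its behavior on that suffix; its body is outside this diff. As a purely hypothetical illustration of the shape such a checker usually takes (not the repository's actual implementation), a relative-tolerance version could look like:

def assert_score(model_type, score, baseline, dataset: str = ''):
    # Hypothetical sketch only: compare a run's score against the recorded baseline,
    # allowing a wider margin for '_batch' (batched-inference) results.
    assert score is not None and baseline is not None, f'missing score for {dataset}'
    tolerance = 0.10 if model_type.endswith('_batch') else 0.05  # assumed margins
    assert abs(score - baseline) <= abs(baseline) * tolerance, (
        f'{model_type} on {dataset}: got {score}, baseline {baseline}')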

.github/scripts/oc_score_baseline.yaml

Lines changed: 17 additions & 12 deletions

@@ -8,27 +8,32 @@ internlm2_5-7b_hf:
   race-middle_accuracy: 91.78
   race-high_accuracy: 90.02

-internlm2-1.8b-hf:
-  demo_gsm8k_accuracy: 15.62
-  race-middle_accuracy: 71.66
-  race-high_accuracy: 66.38
-
 internlm2_5-7b-chat-lmdeploy:
-  demo_gsm8k_accuracy: 89.06
+  demo_gsm8k_accuracy: 87.50
   race-middle_accuracy: 92.76
   race-high_accuracy: 90.54

-internlm2-chat-1.8b-lmdeploy:
-  demo_gsm8k_accuracy: 31
-  race-middle_accuracy: 81.34
-  race-high_accuracy: 73.96
+internlm3-8b-instruct-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-vllm:
+  demo_gsm8k_accuracy: 81.25
+  race-middle_accuracy: 92.20
+  race-high_accuracy: 89.88

 internlm2_5-7b-chat_hf:
   demo_gsm8k_accuracy: 87.50
   race-middle_accuracy: 92.76
   race-high_accuracy: 90.48

 lmdeploy-api-test:
-  gsm8k_accuracy: 68.75
-  race-middle_accuracy: 87.50
+  gsm8k_accuracy: 56.25
+  race-middle_accuracy: 93.75
   race-high_accuracy: 93.75
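
The baseline file is a two-level map (model abbreviation -> metric name -> expected score), which matches the lookup the tests perform via baseline_scores.get(model).get(dataset). A minimal sketch of loading and querying it, assuming only that the file is read with a standard YAML parser (the real fixture wiring lives in oc_score_assert.py and is not part of this diff):

import yaml

def load_baseline(path='.github/scripts/oc_score_baseline.yaml'):
    # Parse the two-level mapping: model abbreviation -> metric -> expected score.
    with open(path, encoding='utf-8') as f:
        return yaml.safe_load(f)

baseline = load_baseline()
# Mirrors baseline_scores.get(model).get(dataset) in the tests.
expected = baseline['internlm3-8b-instruct-lmdeploy']['demo_gsm8k_accuracy']  # 73.44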
