Skip to content

Commit 6ac9b06

Browse files
[ci] update baseline for kernel change of vllm and lmdeploy (open-compass#2011)
* update * update * update * update * update * update * update
1 parent a05f9da commit 6ac9b06

File tree

5 files changed

+110
-53
lines changed

5 files changed

+110
-53
lines changed

.github/scripts/oc_score_baseline_fullbench.yaml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench:
99
drop_accuracy: 81.25
1010
GPQA_diamond_accuracy: 25
1111
hellaswag_accuracy: 87.5
12-
TheoremQA_score: 18.75
12+
TheoremQA_score: 12.50
1313
musr_average_naive_average: 39.58
1414
korbench_single_naive_average: 40
1515
gsm8k_accuracy: 62.50
@@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench:
162162
drop_accuracy: 62.5
163163
GPQA_diamond_accuracy: 62.5
164164
hellaswag_accuracy: 93.75
165-
TheoremQA_score: 25
165+
TheoremQA_score: 12.50
166166
winogrande_accuracy: 75
167167
gsm8k_accuracy: 37.5
168168
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
@@ -190,7 +190,7 @@ internlm2_5-7b-turbomind_fullbench:
190190
drop_accuracy: 62.5
191191
GPQA_diamond_accuracy: 62.5
192192
hellaswag_accuracy: 93.75
193-
TheoremQA_score: 31.25
193+
TheoremQA_score: 12.50
194194
winogrande_accuracy: 87.5
195195
gsm8k_accuracy: 56.25
196196
GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75
@@ -391,7 +391,7 @@ internlm2_5-7b-chat-turbomind:
391391
alpaca_eval_total: 25.96
392392
arenahard_score: 17.15
393393
Followbench_naive_average: 0.81
394-
CompassArena_naive_average: 34.61
394+
CompassArena_naive_average: 39.49
395395
FoFo_naive_average: 0.38
396396
mtbench101_avg: 8.01
397397
wildbench_average: -10.49
@@ -410,10 +410,10 @@ internlm2_5-7b-chat-turbomind:
410410
alpaca_eval_oasst: 23.4
411411
alpaca_eval_selfinstruct: 30.95
412412
alpaca_eval_vicuna: 33.75
413-
compassarena_language_naive_average: 52.5
413+
compassarena_language_naive_average: 58.50
414414
compassarena_knowledge_naive_average: 36
415415
compassarena_reason_v2_naive_average: 35
416-
compassarena_math_v2_naive_average: 19.91
416+
compassarena_math_v2_naive_average: 25.95
417417
compassarena_creationv2_zh_naive_average: 43.64
418418
fofo_test_prompts_overall: 0.35
419419
fofo_test_prompts_cn_overall: 0.41
@@ -493,7 +493,7 @@ qwen2.5-7b-instruct-turbomind:
493493
bigcodebench_hard_instruct_pass@1: 16.22
494494
bigcodebench_hard_complete_pass@1: 11.49
495495
teval_naive_average: 79.72
496-
SciCode_sub_accuracy: 100
496+
SciCode_sub_accuracy: 10.76
497497
qa_dingo_cn_score: 99.01
498498
mmlu_accuracy: 76.01
499499
mmlu-stem_accuracy: 77.59
@@ -600,7 +600,7 @@ internlm2_5-7b-chat-pytorch:
600600
bigcodebench_hard_instruct_pass@1: 6.08
601601
bigcodebench_hard_complete_pass@1: 6.76
602602
teval_naive_average: 79.73
603-
SciCode_sub_accuracy: 100
603+
SciCode_sub_accuracy: 3.47
604604
qa_dingo_cn_score: 100
605605
mmlu_accuracy: 70.2
606606
mmlu-stem_accuracy: 67.73
@@ -689,7 +689,7 @@ qwen2.5-7b-instruct-pytorch:
689689
GaokaoBench_weighted_average: 80.02
690690
math_accuracy: 73.74
691691
cmo_fib_accuracy: 26.44
692-
aime2024_accuracy: 10
692+
aime2024_accuracy: 13.33
693693
Mathbench_naive_average: 77.08
694694
wikibench-wiki-single_choice_cncircular_perf_4: 34
695695
cmmlu_naive_average: 75.9
@@ -705,7 +705,7 @@ qwen2.5-7b-instruct-pytorch:
705705
bigcodebench_hard_instruct_pass@1: 16.89
706706
bigcodebench_hard_complete_pass@1: 12.16
707707
teval_naive_average: 79.46
708-
SciCode_sub_accuracy: 100
708+
SciCode_sub_accuracy: 10.42
709709
qa_dingo_cn_score: 100
710710
mmlu_accuracy: 76.27
711711
mmlu-stem_accuracy: 77.75
@@ -810,7 +810,7 @@ internlm3-8b-instruct-turbomind:
810810
bigcodebench_hard_instruct_pass@1: 13.51
811811
bigcodebench_hard_complete_pass@1: 15.54
812812
teval_naive_average: 82.86
813-
SciCode_sub_accuracy: 100
813+
SciCode_sub_accuracy: 11.11
814814
qa_dingo_cn_score: 100
815815
mmlu_accuracy: 76.21
816816
mmlu-stem_accuracy: 77.7
@@ -889,7 +889,7 @@ internlm3-8b-instruct-pytorch:
889889
IFEval_Prompt-level-strict-accuracy: 79.11
890890
drop_accuracy: 83.32
891891
bbh_naive_average: 54.76
892-
GPQA_diamond_accuracy: 42.42
892+
GPQA_diamond_accuracy: 33.84
893893
hellaswag_accuracy: 91.31
894894
TheoremQA_score: 18
895895
musr_average_naive_average: 36.62
@@ -915,7 +915,7 @@ internlm3-8b-instruct-pytorch:
915915
bigcodebench_hard_instruct_pass@1: 12.84
916916
bigcodebench_hard_complete_pass@1: 15.54
917917
teval_naive_average: 82.86
918-
SciCode_sub_accuracy: 100
918+
SciCode_sub_accuracy: 9.38
919919
qa_dingo_cn_score: 100
920920
mmlu_accuracy: 76.23
921921
mmlu-stem_accuracy: 78.08

.github/scripts/oc_score_baseline_testrange.yaml

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ chat:
66
gsm8k_accuracy: 71.88
77
race-high_accuracy: 90.62
88
glm-4-9b-chat-vllm:
9-
gsm8k_accuracy: 68.75
9+
gsm8k_accuracy: 71.88
1010
race-high_accuracy: 90.62
1111
deepseek-7b-chat-hf:
1212
gsm8k_accuracy: 46.88
@@ -84,7 +84,7 @@ chat:
8484
gsm8k_accuracy: 81.25
8585
race-high_accuracy: 90.62
8686
llama-3_2-3b-instruct-turbomind:
87-
gsm8k_accuracy: 75.00
87+
gsm8k_accuracy: 68.75
8888
race-high_accuracy: 81.25
8989
llama-3-8b-instruct-turbomind:
9090
gsm8k_accuracy: 68.75
@@ -204,14 +204,14 @@ chat:
204204
gsm8k_accuracy: 90.62
205205
race-high_accuracy: 84.38
206206
mixtral-8x22b-instruct-v0.1-turbomind:
207-
gsm8k_accuracy: 75
207+
gsm8k_accuracy: 78.12
208208
race-high_accuracy: 78.12
209209
mixtral-8x22b-instruct-v0.1-vllm:
210210
gsm8k_accuracy: 78.12
211211
race-high_accuracy: 78.12
212212
base:
213213
glm-4-9b-turbomind:
214-
gsm8k_accuracy: 56.25
214+
gsm8k_accuracy: 59.38
215215
GPQA_diamond_accuracy: 28.12
216216
race-high_accuracy: 93.75
217217
winogrande_accuracy: 84.38
@@ -253,8 +253,8 @@ base:
253253
gemma-2-9b-turbomind:
254254
gsm8k_accuracy: 68.75
255255
GPQA_diamond_accuracy: 0
256-
race-high_accuracy: 78.12
257-
winogrande_accuracy: 50
256+
race-high_accuracy: 18.75
257+
winogrande_accuracy: 46.88
258258
gemma-2b-vllm:
259259
gsm8k_accuracy: 15.62
260260
GPQA_diamond_accuracy: 3.12
@@ -281,20 +281,20 @@ base:
281281
race-high_accuracy: 71.88
282282
winogrande_accuracy: 75
283283
internlm2_5-7b-turbomind:
284-
gsm8k_accuracy: 59.38
285-
GPQA_diamond_accuracy: 34.38
284+
gsm8k_accuracy: 62.5
285+
GPQA_diamond_accuracy: 31.25
286286
race-high_accuracy: 93.75
287-
winogrande_accuracy: 84.38
287+
winogrande_accuracy: 87.5
288288
internlm2-7b-turbomind:
289-
gsm8k_accuracy: 50
290-
GPQA_diamond_accuracy: 18.75
291-
race-high_accuracy: 71.88
292-
winogrande_accuracy: 84.38
289+
gsm8k_accuracy: 59.38
290+
GPQA_diamond_accuracy: 34.38
291+
race-high_accuracy: 78.12
292+
winogrande_accuracy: 71.88
293293
internlm2-base-7b-turbomind:
294-
gsm8k_accuracy: 37.50
295-
GPQA_diamond_accuracy: 21.88
296-
race-high_accuracy: 84.38
297-
winogrande_accuracy: 75
294+
gsm8k_accuracy: 28.12
295+
GPQA_diamond_accuracy: 31.25
296+
race-high_accuracy: 71.88
297+
winogrande_accuracy: 62.50
298298
llama-2-7b-hf:
299299
gsm8k_accuracy: 21.88
300300
GPQA_diamond_accuracy: 21.88
@@ -311,15 +311,15 @@ base:
311311
race-high_accuracy: 65.62
312312
winogrande_accuracy: 65.62
313313
llama-3.1-8b-turbomind:
314-
gsm8k_accuracy: 56.25
315-
GPQA_diamond_accuracy: 9.38
314+
gsm8k_accuracy: 59.38
315+
GPQA_diamond_accuracy: 15.62
316316
race-high_accuracy: 78.12
317317
winogrande_accuracy: 78.12
318318
llama-3-8b-turbomind:
319319
gsm8k_accuracy: 46.88
320320
GPQA_diamond_accuracy: 12.50
321321
race-high_accuracy: 65.62
322-
winogrande_accuracy: 78.12
322+
winogrande_accuracy: 81.25
323323
mistral-7b-v0.3-hf:
324324
gsm8k_accuracy: 31.25
325325
GPQA_diamond_accuracy: 6.25
@@ -331,8 +331,8 @@ base:
331331
race-high_accuracy: 87.5
332332
winogrande_accuracy: 71.88
333333
qwen2.5-1.5b-turbomind:
334-
gsm8k_accuracy: 62.50
335-
GPQA_diamond_accuracy: 15.62
334+
gsm8k_accuracy: 59.38
335+
GPQA_diamond_accuracy: 18.75
336336
race-high_accuracy: 75
337337
winogrande_accuracy: 71.88
338338
qwen2.5-7b-turbomind:
@@ -362,19 +362,19 @@ base:
362362
winogrande_accuracy: 68.75
363363
qwen2-1.5b-turbomind:
364364
gsm8k_accuracy: 59.38
365-
GPQA_diamond_accuracy: 12.50
365+
GPQA_diamond_accuracy: 6.25
366366
race-high_accuracy: 81.25
367367
winogrande_accuracy: 75
368368
qwen2-7b-turbomind:
369-
gsm8k_accuracy: 65.62
369+
gsm8k_accuracy: 62.5
370370
GPQA_diamond_accuracy: 12.5
371371
race-high_accuracy: 87.5
372-
winogrande_accuracy: 71.88
372+
winogrande_accuracy: 75
373373
qwen1.5-0.5b-vllm:
374-
gsm8k_accuracy: 6.25
374+
gsm8k_accuracy: 9.38
375375
GPQA_diamond_accuracy: 0
376376
race-high_accuracy: 56.25
377-
winogrande_accuracy: 62.5
377+
winogrande_accuracy: 59.38
378378
yi-1.5-6b-hf:
379379
gsm8k_accuracy: 62.5
380380
GPQA_diamond_accuracy: 3.12
@@ -387,11 +387,11 @@ base:
387387
winogrande_accuracy: 59.38
388388
yi-1.5-9b-turbomind:
389389
gsm8k_accuracy: 78.12
390-
GPQA_diamond_accuracy: 43.75
390+
GPQA_diamond_accuracy: 40.62
391391
race-high_accuracy: 87.5
392-
winogrande_accuracy: 71.88
392+
winogrande_accuracy: 65.62
393393
internlm2-20b-turbomind:
394-
gsm8k_accuracy: 75
394+
gsm8k_accuracy: 71.88
395395
GPQA_diamond_accuracy: 18.75
396396
race-high_accuracy: 68.75
397397
winogrande_accuracy: 81.25
@@ -406,18 +406,18 @@ base:
406406
race-high_accuracy: 93.75
407407
winogrande_accuracy: 78.12
408408
qwen2.5-32b-turbomind:
409-
gsm8k_accuracy: 87.5
410-
GPQA_diamond_accuracy: 18.75
409+
gsm8k_accuracy: 84.38
410+
GPQA_diamond_accuracy: 28.12
411411
race-high_accuracy: 93.75
412412
winogrande_accuracy: 81.25
413413
deepseek-67b-base-turbomind:
414-
gsm8k_accuracy: 53.12
415-
GPQA_diamond_accuracy: 28.12
416-
race-high_accuracy: 81.25
417-
winogrande_accuracy: 84.38
414+
gsm8k_accuracy: 59.38
415+
GPQA_diamond_accuracy: 34.38
416+
race-high_accuracy: 78.12
417+
winogrande_accuracy: 81.25
418418
llama-3-70b-turbomind:
419419
gsm8k_accuracy: 56.25
420-
GPQA_diamond_accuracy: 12.50
420+
GPQA_diamond_accuracy: 15.62
421421
race-high_accuracy: 93.75
422422
winogrande_accuracy: 84.38
423423
qwen2.5-72b-turbomind:
@@ -426,7 +426,7 @@ base:
426426
race-high_accuracy: 93.75
427427
winogrande_accuracy: 87.5
428428
deepseek-v2-turbomind:
429-
gsm8k_accuracy: 59.38
430-
GPQA_diamond_accuracy: 3.12
429+
gsm8k_accuracy: 65.62
430+
GPQA_diamond_accuracy: 9.38
431431
race-high_accuracy: 93.75
432432
winogrande_accuracy: 81.25

.github/workflows/daily-run-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ on:
4444
type: string
4545
default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
4646
schedule:
47-
- cron: '15 14 * * 0,2'
47+
- cron: '15 14 * * 0,3'
4848

4949
env:
5050
HF_DATASETS_OFFLINE: 1

opencompass/datasets/subjective/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from .arena_hard import ArenaHardDataset # noqa: F401, F403
88
from .arena_hard import arenahard_bradleyterry_postprocess # noqa: F401, F403
99
from .arena_hard import arenahard_postprocess # noqa: F401, F403
10+
from .commonbench import commonbench_postprocess
1011
from .compass_arena import CompassArenaDataset # noqa: F401, F403
1112
from .compass_arena import \
1213
compassarena_bradleyterry_postprocess # noqa: F401, F403
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
# flake8: noqa: E501
2+
import re
3+
from collections import defaultdict
4+
from typing import Optional
5+
6+
from opencompass.registry import DICT_POSTPROCESSORS
7+
8+
from .utils import get_judgeanswer_and_reference
9+
10+
11+
def post_process(judgement: dict) -> Optional[dict]:
    """Extract the judge score from a judgement record.

    Args:
        judgement: A dict with a ``'prediction'`` key holding the judge
            model's raw text output, e.g. ``xxx[[5]]xxx``.

    Returns:
        ``{'score': <float>}`` parsed from the first ``[[...]]`` marker,
        or ``None`` when no score marker is found.
    """
    # Bug fix: the parameter was annotated ``str`` but is subscripted as a
    # mapping on the next line — it is a prediction record (dict).
    prediction = judgement['prediction']
    # Judge models emit the score wrapped in double brackets, e.g. [[7.5]];
    # take the first occurrence.
    matched_result = re.findall(r'\[\[([\d.]+)\]\]', prediction)
    if not matched_result:
        return None
    return {'score': float(matched_result[0])}
24+
25+
26+
def get_capability_results(judged_answers, references):
    """Compute the mean judge score per capability.

    Args:
        judged_answers: Iterable of dicts each carrying a ``'score'``.
        references: Iterable of dicts each carrying a ``'capability'``,
            aligned pairwise with ``judged_answers``.

    Returns:
        ``defaultdict(float)`` mapping each capability name — plus an
        overall ``'total'`` bucket — to its average score, rounded to
        two decimal places.
    """
    score_sums = defaultdict(int)
    sample_counts = defaultdict(int)
    for answer, reference in zip(judged_answers, references):
        # Every sample contributes to the overall bucket and to its
        # own capability bucket.
        for bucket in ('total', reference['capability']):
            score_sums[bucket] += answer['score']
            sample_counts[bucket] += 1

    averages = defaultdict(float)
    for bucket, summed in score_sums.items():
        averages[bucket] = round(summed / sample_counts[bucket], 2)

    return averages
43+
44+
45+
# NOTE(review): the registry key 'commenbench' looks like a typo of
# 'commonbench' (the function name) — confirm against the configs that
# look this postprocessor up before renaming.
@DICT_POSTPROCESSORS.register_module('commenbench')
def commonbench_postprocess(
    output: dict,
    output_path: str,
    # Intentionally defaults to the module-level ``post_process`` above;
    # callers may pass an alternative extractor for other judge formats.
    post_process: Optional[callable] = post_process,
) -> dict:
    """Aggregate judge outputs into per-capability average scores.

    Args:
        output: Raw judge output dict for one evaluation run.
        output_path: Path used by ``get_judgeanswer_and_reference`` to
            locate/record the judged results.
        post_process: Callable that extracts a score dict from a single
            judgement record (``None``-returning entries are dropped by
            the helper — presumably; verify in ``get_judgeanswer_and_reference``).

    Returns:
        Dict of capability -> average score (plus 'total'), with the raw
        ``output`` attached under the ``'details'`` key.
    """
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process)

    results = get_capability_results(judged_answers, references)
    # Keep the raw judge output alongside the aggregates for inspection.
    results['details'] = output
    return results

0 commit comments

Comments
 (0)