diff --git a/dataset-index.yml b/dataset-index.yml
index 1241ea290..7f4331fe4 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -1124,4 +1124,10 @@
     category: Science /Physics
     paper: https://arxiv.org/abs/2504.16074
     configpath: opencompass/configs/datasets/PHYBench/phybench_gen.py
+    configpath_llmjudge: ''
+- aider:
+    name: Aider
+    category: Code
+    paper: ''
+    configpath: opencompass/configs/datasets/aider/aider.py
     configpath_llmjudge: ''
\ No newline at end of file
diff --git a/examples/eval_aider.py b/examples/eval_aider.py
new file mode 100644
index 000000000..2c907ff86
--- /dev/null
+++ b/examples/eval_aider.py
@@ -0,0 +1,46 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.aider.aider import get_aider_dataset
+
+from opencompass.models import TurboMindModelwithChatTemplate
+from opencompass.partitioners import NumWorkerPartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLInferTask
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+
+datasets = [*get_aider_dataset]
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='qwen-7b-hf',
+        path='Qwen/Qwen-7B',
+        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1,
+                        temperature=1e-6,
+                        top_p=0.9,
+                        max_new_tokens=2048),
+        max_seq_len=16384,
+        max_out_len=2048,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    ),
+]
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=72,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+work_dir = './outputs/aider/'
diff --git a/opencompass/configs/datasets/aider/aider.md b/opencompass/configs/datasets/aider/aider.md
new file mode 100644
index 000000000..4ceb3014e
--- /dev/null
+++ b/opencompass/configs/datasets/aider/aider.md
@@ -0,0 +1,25 @@
+
+# Aider polyglot benchmark
+
+## Prepare the dataset
+
+We support the [Aider polyglot benchmark](https://aider.chat/docs/leaderboards/).
+
+You need to download our preprocessed dataset. The directory should be organized as follows:
+
+```
+aider
+---Aider.json
+```
+
+`Aider.json` is the preprocessed dataset used for scoring.
+
+> **Note**: Currently, our Aider support is limited to **single-turn conversations**; multi-turn dialogues are not yet supported. In addition, only the `whole` edit format is supported, not incremental or diff-based formats.
+
+## Run
+
+We provide an example script for Aider in `examples/eval_aider.py`.
+
+## Acknowledgement
+
+We greatly appreciate the authors of the [Aider polyglot benchmark](https://github.com/Aider-AI/aider/tree/main). If you find it useful in your research, please consider citing them.
\ No newline at end of file
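The README above leaves the layout of `Aider.json` implicit. Judging from `AiderDataset.load` later in this diff, it appears to be a single JSON object mapping each test directory to an OpenAI-style message list. The sketch below writes one such entry; the key and messages are fabricated for illustration, not real benchmark data:

```python
# Sketch of the Aider.json layout inferred from AiderDataset.load below.
# The test-directory key and the messages are fabricated examples.
import json
import os

sample = {
    'python/exercises/two-sum': [  # key becomes the evaluator's test_dir
        {'role': 'system', 'content': 'You are a helpful coding assistant.'},
        {'role': 'user', 'content': 'Implement two_sum in two_sum.py.'},
    ]
}

os.makedirs('./data/aider', exist_ok=True)
with open('./data/aider/Aider.json', 'w', encoding='utf-8') as f:
    json.dump(sample, f, indent=2)
```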
diff --git a/opencompass/configs/datasets/aider/aider.py b/opencompass/configs/datasets/aider/aider.py
new file mode 100644
index 000000000..da8d01f93
--- /dev/null
+++ b/opencompass/configs/datasets/aider/aider.py
@@ -0,0 +1,53 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AiderEvaluator
+from opencompass.datasets import AiderDataset
+
+aider_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='judge',
+)
+
+data_path = './data/aider/'
+aider_all_sets = ['Aider.json']
+get_aider_dataset = []
+
+for _name in aider_all_sets:
+    aider_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin=[
+                    dict(role='SYSTEM',
+                         fallback_role='HUMAN',
+                         prompt='{system_prompt}')
+                ],
+                round=[
+                    dict(role='HUMAN', prompt='{prompt}'),
+                ],
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    aider_eval_cfg = dict(
+        evaluator=dict(
+            type=AiderEvaluator,
+            # Address of the evaluation service that runs the unit tests;
+            # point this at your own deployment if needed.
+            ip_address='https://sd17oge6kiaj519k4ofj0.apigateway-cn-beijing.volceapi.com'
+        ),
+    )
+
+    get_aider_dataset.append(
+        dict(
+            abbr=_name.split('.')[0],
+            type=AiderDataset,
+            path=data_path,
+            name=_name,
+            reader_cfg=aider_reader_cfg,
+            infer_cfg=aider_infer_cfg,
+            eval_cfg=aider_eval_cfg,
+        ))
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 78d692029..0d221734a 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -1,6 +1,7 @@
 from .advglue import *  # noqa: F401, F403
 from .afqmcd import *  # noqa: F401, F403
 from .agieval import *  # noqa: F401, F403
+from .aider import *  # noqa: F401, F403
 from .aime2024 import *  # noqa: F401, F403
 from .anli import AnliDataset  # noqa: F401, F403
 from .anthropics_evals import *  # noqa: F401, F403
diff --git a/opencompass/datasets/aider.py b/opencompass/datasets/aider.py
new file mode 100644
index 000000000..d4485e325
--- /dev/null
+++ b/opencompass/datasets/aider.py
@@ -0,0 +1,50 @@
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class AiderDataset(BaseDataset):
+
+    def load(self, path: str, name: str, *args, **kwargs):
+        path = get_data_path(path, local_mode=True)
+        filename = osp.join(path, name)
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            for key, item in data.items():
+                raw_data.append(self.process_item(key, item))
+        dataset = Dataset.from_list(raw_data)
+        return dataset
+
+    def process_item(self, key, item):
+        """Flatten one chat transcript into a single generation prompt."""
+        question = ''
+        system_prompt = ''
+        for line in item:
+            if line['role'] == 'system':
+                system_prompt = line['content']
+            elif line['role'] == 'user':
+                question += '\n\n ### User:' + line['content']
+            else:
+                question += '\n\n ### Assistant:' + line['content']
+        # Leave an open assistant turn for the model to complete.
+        question += '\n\n ### Assistant:'
+        raw_item = {
+            'system_prompt': system_prompt,
+            'prompt': question,
+            # The judge column carries everything AiderEvaluator needs,
+            # including the test_dir key that selects the unit tests.
+            'judge': {
+                'system_prompt': system_prompt,
+                'prompt': question,
+                'test_dir': key
+            }
+        }
+        return raw_item
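To make the flattening in `process_item` concrete: the loader splits out the system prompt and renders the remaining turns as `### User:` / `### Assistant:` sections, ending with an open assistant turn. A standalone sketch with fabricated content:

```python
# Fabricated transcript, flattened exactly as AiderDataset.process_item does.
transcript = [
    {'role': 'system', 'content': 'You are a coding assistant.'},
    {'role': 'user', 'content': 'Fix the failing test in bowling.py.'},
]

system_prompt = ''
prompt = ''
for turn in transcript:
    if turn['role'] == 'system':
        system_prompt = turn['content']
    elif turn['role'] == 'user':
        prompt += '\n\n ### User:' + turn['content']
    else:
        prompt += '\n\n ### Assistant:' + turn['content']
prompt += '\n\n ### Assistant:'  # open turn for the model to complete

print(repr(system_prompt))
# 'You are a coding assistant.'
print(repr(prompt))
# '\n\n ### User:Fix the failing test in bowling.py.\n\n ### Assistant:'
```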
diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py
index f043982b3..1010f4500 100644
--- a/opencompass/openicl/icl_evaluator/__init__.py
+++ b/opencompass/openicl/icl_evaluator/__init__.py
@@ -1,4 +1,5 @@
 from .icl_agent_evaluator import *  # noqa
+from .icl_aider_evaluator import AiderEvaluator  # noqa
 from .icl_aucroc_evaluator import AUCROCEvaluator  # noqa
 from .icl_base_evaluator import BaseEvaluator  # noqa
 from .icl_bpc_evaluator import BPCEvaluator  # noqa
diff --git a/opencompass/openicl/icl_evaluator/icl_aider_evaluator.py b/opencompass/openicl/icl_evaluator/icl_aider_evaluator.py
new file mode 100644
index 000000000..dc3c4b75a
--- /dev/null
+++ b/opencompass/openicl/icl_evaluator/icl_aider_evaluator.py
@@ -0,0 +1,98 @@
+import concurrent.futures
+from typing import Any, Dict
+
+import requests
+from tqdm import tqdm
+
+from .icl_base_evaluator import BaseEvaluator
+
+
+class AiderEvaluator(BaseEvaluator):
+    """Evaluate predictions via a remote Aider test-running service.
+
+    The service applies each prediction in the ``whole`` edit format and
+    runs the unit tests of the corresponding exercise.
+    """
+
+    def __init__(self,
+                 language: str = 'py',
+                 ip_address: str = 'http://localhost') -> None:
+        self.language = language
+        self.url = f'{ip_address}/run_test'
+        self.client = requests.Session()
+        super().__init__()
+
+    def _send_single_request(self, test_dir: str,
+                             prediction: str) -> Dict[str, Any]:
+        request = {
+            'data': {
+                test_dir: prediction
+            },
+            'model_name': 'default',
+            'edit_format': 'whole',
+            'no_unit_tests': False,
+            'verbose': True
+        }
+
+        try:
+            response = self.client.post(
+                self.url,
+                json=request,
+                headers={'Content-Type': 'application/json'})
+            return {test_dir: response.json()[test_dir]}
+        except Exception as e:
+            print(f'Error processing {test_dir}: {str(e)}')
+            # Treat any transport or server error as a failed test case.
+            return {test_dir: {'test_outcomes': [False]}}
+
+    def score(self, predictions, references):
+        batch_size = 3
+        total_correct = 0
+        total_count = 0
+        details = []
+
+        for i in tqdm(range(0, len(predictions), batch_size),
+                      desc='Evaluating batches'):
+            batch_predictions = predictions[i:i + batch_size]
+            batch_references = references[i:i + batch_size]
+
+            tasks = []
+            for prediction, reference in zip(batch_predictions,
+                                             batch_references):
+                test_dir = reference['test_dir']
+                tasks.append((test_dir, prediction))
+
+            # Send one batch of requests concurrently.
+            batch_results = {}
+            with concurrent.futures.ThreadPoolExecutor(
+                    max_workers=batch_size) as executor:
+                future_to_testdir = {
+                    executor.submit(self._send_single_request, test_dir,
+                                    prediction): test_dir
+                    for test_dir, prediction in tasks
+                }
+
+                for future in concurrent.futures.as_completed(
+                        future_to_testdir):
+                    batch_results.update(future.result())
+
+            print(f'Batch {i // batch_size + 1} results:',
+                  batch_results,
+                  flush=True)
+            for test_dir, outcome in batch_results.items():
+                # A sample is correct if its first recorded test run passed.
+                is_correct = outcome['test_outcomes'][0]
+                if is_correct:
+                    total_correct += 1
+                total_count += 1
+
+                details.append({
+                    'test_dir': test_dir,
+                    'correct': is_correct,
+                })
+
+        return {
+            'accuracy':
+            100 * total_correct / total_count if total_count > 0 else 0,
+            'details': details
+        }
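`AiderEvaluator` depends on an external service that applies each `whole`-format prediction and runs the exercise's unit tests; the server itself is not part of this diff. Inferring the contract from the client code above (POST to `/run_test`, response keyed by `test_dir` with a `test_outcomes` list), a minimal local stub for wiring tests might look like the following, assuming Flask is available; point the config's `ip_address` at it, e.g. `http://localhost:8000`:

```python
# Hypothetical stand-in for the remote Aider evaluation service, inferred
# from the request/response shape used by AiderEvaluator. A real service
# would apply the 'whole'-format edit and run the exercise's unit tests;
# this stub simply marks every submission as failed.
from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route('/run_test', methods=['POST'])
def run_test():
    payload = request.get_json()
    results = {
        test_dir: {'test_outcomes': [False]}  # replace with real test runs
        for test_dir in payload.get('data', {})
    }
    return jsonify(results)


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8000)
```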