[Feature] Add Aider benchmark #2155

Open · wants to merge 2 commits into main
6 changes: 6 additions & 0 deletions dataset-index.yml
@@ -1124,4 +1124,10 @@
    category: Science /Physics
    paper: https://arxiv.org/abs/2504.16074
    configpath: opencompass/configs/datasets/PHYBench/phybench_gen.py
    configpath_llmjudge: ''
- aider:
    name: Aider
    category: Code
    paper: ''
    configpath: opencompass/configs/datasets/aider/aider.py
    configpath_llmjudge: ''
53 changes: 53 additions & 0 deletions examples/eval_aider.py
@@ -0,0 +1,53 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.aider.aider import get_aider_dataset

from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

datasets = [*get_aider_dataset]

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1,
                        temperature=1e-6,
                        top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]

infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)

work_dir = './outputs/aider/'
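
Note: for API-served models, an entry along these lines should drop in for `models`. This is a sketch rather than part of the PR — the key and base URL are placeholders, and the parameter names follow OpenCompass's `OpenAISDK` wrapper:

```python
from opencompass.models import OpenAISDK

# Hypothetical API-backed model entry (placeholder credentials); it reuses
# the api_meta_template defined above to map HUMAN/BOT roles onto the API.
models = [
    dict(
        type=OpenAISDK,
        abbr='gpt-4o-mini',
        path='gpt-4o-mini',  # provider-side model name
        key='YOUR_API_KEY',  # placeholder
        openai_api_base='https://api.openai.com/v1',  # placeholder
        meta_template=api_meta_template,
        query_per_second=2,
        max_out_len=2048,
        max_seq_len=16384,
        batch_size=8,
    ),
]
```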
25 changes: 25 additions & 0 deletions opencompass/configs/datasets/aider/aider.md
@@ -0,0 +1,25 @@

# Aider polyglot benchmark

## Prepare the dataset

We support the [Aider polyglot benchmark](https://aider.chat/docs/leaderboards/).

You need to download our preprocessed dataset. The directory should be laid out as follows:

```
aider/
└── Aider.json
```

`Aider.json` is the preprocessed dataset used for scoring.
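
Judging from how `AiderDataset` parses it (see `opencompass/datasets/aider.py` in this PR), `Aider.json` maps each exercise's test directory to a chat transcript. A sketch of the expected shape, with a made-up key and message text:

```python
# Illustrative only: structure implied by AiderDataset.load/process_item;
# the key name and message contents below are invented.
example = {
    'exercises/practice/two-sum': [
        {'role': 'system', 'content': 'Act as an expert software developer.'},
        {'role': 'user', 'content': 'Write two_sum so the tests pass.'},
    ],
}
```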

> **Note**: The current integration supports **single-turn conversations** only; multi-turn dialogues are not yet supported. It also supports only the `whole` edit format, not incremental or diff-based formats.

## Run

We provide an example script in `examples/eval_aider.py`.
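
Assuming a standard OpenCompass checkout, the evaluation should then launch through the usual entry point:

```
python run.py examples/eval_aider.py
```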

## Acknowledgement

We greatly appreciate the authors of the [Aider polyglot benchmark](https://github.com/Aider-AI/aider/tree/main). If you find it useful in your research, please consider citing them.
48 changes: 48 additions & 0 deletions opencompass/configs/datasets/aider/aider.py
@@ -0,0 +1,48 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AiderEvaluator
from opencompass.datasets import AiderDataset

aider_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='judge',
)

data_path = './data/aider/'
aider_all_sets = ['Aider.json']
get_aider_dataset = []

for _name in aider_all_sets:
    aider_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(role='SYSTEM',
                         fallback_role='HUMAN',
                         prompt='{system_prompt}')
                ],
                round=[
                    dict(role='HUMAN', prompt='{prompt}'),
                ],
            ),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    aider_eval_cfg = dict(
        evaluator=dict(
            type=AiderEvaluator,
            ip_address=
            'https://sd17oge6kiaj519k4ofj0.apigateway-cn-beijing.volceapi.com',
        ),
    )

    get_aider_dataset.append(
        dict(
            abbr=_name.split('.')[0],
            type=AiderDataset,
            path=data_path,
            name=_name,
            reader_cfg=aider_reader_cfg,
            infer_cfg=aider_infer_cfg,
            eval_cfg=aider_eval_cfg,
        ))
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -1,6 +1,7 @@
from .advglue import * # noqa: F401, F403
from .afqmcd import * # noqa: F401, F403
from .agieval import * # noqa: F401, F403
from .aider import * # noqa: F401, F403
from .aime2024 import * # noqa: F401, F403
from .anli import AnliDataset # noqa: F401, F403
from .anthropics_evals import * # noqa: F401, F403
51 changes: 51 additions & 0 deletions opencompass/datasets/aider.py
@@ -0,0 +1,51 @@
import json
import os.path as osp

from datasets import Dataset

from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from .base import BaseDataset


@LOAD_DATASET.register_module()
class AiderDataset(BaseDataset):
    """Loads the preprocessed Aider polyglot benchmark from ``Aider.json``."""

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, name)
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Each key is a test directory; each value is a chat transcript.
        for key, item in data.items():
            raw_data.append(self.process_item(key, item))
        dataset = Dataset.from_list(raw_data)
        return dataset

    def process_item(self, key, item):
        """Flatten one chat transcript into a single-turn prompt."""
        question = ''
        system_prompt = ''
        for line in item:
            if line['role'] == 'system':
                system_prompt = line['content']
            elif line['role'] == 'user':
                question += '\n\n ### User:' + line['content']
            else:
                question += '\n\n ### Assistant:' + line['content']
        # Leave an open Assistant turn for the model to complete.
        question += '\n\n ### Assistant:'
        raw_item = {
            'system_prompt': system_prompt,
            'prompt': question,
            # The evaluator needs the test directory to run unit tests.
            'judge': {
                'system_prompt': system_prompt,
                'prompt': question,
                'test_dir': key,
            },
        }
        return raw_item
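
Note: to make the flattening above concrete, here is what `process_item` yields for a minimal two-message transcript (key and contents invented):

```python
item = [
    {'role': 'system', 'content': 'You are a careful coding assistant.'},
    {'role': 'user', 'content': 'Implement two_sum.'},
]
# process_item('exercises/two-sum', item) returns:
# {
#     'system_prompt': 'You are a careful coding assistant.',
#     'prompt': '\n\n ### User:Implement two_sum.\n\n ### Assistant:',
#     'judge': {
#         'system_prompt': 'You are a careful coding assistant.',
#         'prompt': '\n\n ### User:Implement two_sum.\n\n ### Assistant:',
#         'test_dir': 'exercises/two-sum',
#     },
# }
```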
1 change: 1 addition & 0 deletions opencompass/openicl/icl_evaluator/__init__.py
@@ -1,4 +1,5 @@
from .icl_agent_evaluator import * # noqa
from .icl_aider_evaluator import AiderEvaluator # noqa
from .icl_aucroc_evaluator import AUCROCEvaluator # noqa
from .icl_base_evaluator import BaseEvaluator # noqa
from .icl_bpc_evaluator import BPCEvaluator # noqa
97 changes: 97 additions & 0 deletions opencompass/openicl/icl_evaluator/icl_aider_evaluator.py
@@ -0,0 +1,97 @@
import concurrent.futures
from typing import Any, Dict

import requests
from tqdm import tqdm

from .icl_base_evaluator import BaseEvaluator


class AiderEvaluator(BaseEvaluator):
    """Scores predictions by posting them to a remote Aider test-runner
    service, which executes each exercise's unit tests and reports the
    outcomes."""

    def __init__(self,
                 language: str = 'py',
                 ip_address: str = 'localhost') -> None:
        self.language = language
        self.url = f'{ip_address}/run_test'
        self.client = requests.Session()
        super().__init__()

    def _send_single_request(self, test_dir: str,
                             prediction: str) -> Dict[str, Any]:
        request = {
            'data': {
                test_dir: prediction
            },
            'model_name': 'default',
            'edit_format': 'whole',
            'no_unit_tests': False,
            'verbose': True
        }

        try:
            response = self.client.post(
                self.url,
                json=request,
                headers={'Content-Type': 'application/json'})
            return {test_dir: response.json()[test_dir]}
        except Exception as e:
            print(f'Error processing {test_dir}: {str(e)}')
            # Treat request failures as a failed test run.
            return {test_dir: {'test_outcomes': [False]}}

    def score(self, predictions, references):
        batch_size = 3
        total_correct = 0
        total_count = 0
        details = []

        for i in tqdm(range(0, len(predictions), batch_size),
                      desc='Evaluating batches'):
            batch_predictions = predictions[i:i + batch_size]
            batch_references = references[i:i + batch_size]

            tasks = []
            for prediction, reference in zip(batch_predictions,
                                             batch_references):
                test_dir = reference['test_dir']
                tasks.append((test_dir, prediction))

            # Submit one request per test directory, batch_size at a time.
            batch_results = {}
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=batch_size) as executor:
                future_to_testdir = {
                    executor.submit(self._send_single_request, test_dir,
                                    prediction): test_dir
                    for test_dir, prediction in tasks
                }

                for future in concurrent.futures.as_completed(
                        future_to_testdir):
                    batch_results.update(future.result())

            print(f'Batch {i // batch_size + 1} results:',
                  batch_results,
                  flush=True)
            for test_dir, outcome in batch_results.items():
                # The first entry of test_outcomes marks pass/fail.
                is_correct = outcome['test_outcomes'][0]
                if is_correct:
                    total_correct += 1
                total_count += 1

                details.append({
                    'test_dir': test_dir,
                    'correct': is_correct,
                })

        return {
            'accuracy':
            100 * total_correct / total_count if total_count > 0 else 0,
            'details': details,
        }
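
Note: the evaluator assumes an HTTP service exposing `POST /run_test` that runs each exercise's tests and returns, per test directory, an object whose `test_outcomes` list begins with a pass/fail boolean. A minimal Flask mock of that contract (Flask is not used by this PR; it is just a convenient stub for wiring things up locally):

```python
# Minimal stand-in for the remote test-runner service, matching the request
# and response shapes used in AiderEvaluator._send_single_request above.
from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route('/run_test', methods=['POST'])
def run_test():
    payload = request.get_json()
    # Report every submitted test directory as failing; a real service
    # would apply each prediction and execute the exercise's unit tests.
    return jsonify({
        test_dir: {'test_outcomes': [False]}
        for test_dir in payload['data']
    })


if __name__ == '__main__':
    # Point the evaluator at this stub with ip_address='http://127.0.0.1:5000'.
    app.run(host='127.0.0.1', port=5000)
```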