Skip to content

[Fix] Update LCBench #2166

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
300 changes: 233 additions & 67 deletions opencompass/datasets/LCBench.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# flake8: noqa
import contextlib
import io
import itertools
Expand Down Expand Up @@ -117,25 +118,83 @@ class LCEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {'error': 'preds and refrs have different length'}

result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
details = {}

with ProcessPoolExecutor() as executor:
futures = []
for i, (refer, pred) in enumerate(zip(references, predictions)):
pred = self._process_answer(pred)
programs = self._process_test(refer, pred)
future = executor.submit(execution, programs, i, 3)
futures.append(future)

from tqdm import tqdm
for future in tqdm(as_completed(futures), total=len(futures)):
index, ret = future.result()
result[ret] += 1
details[str(index)] = {
'programs': predictions[index],
'result': ret,
'is_correct': ret == 'pass',
}
code_blocks = self._process_answer(pred)

# Try each code block until one passes
for code_idx, code_block in enumerate(code_blocks):
test_programs = self._process_test(refer, code_block)

# Submit each test program variant for execution
for prog_idx, program in enumerate(test_programs):
future = executor.submit(
execution,
program,
(
i,
code_idx,
prog_idx,
), # Pass indices for tracking
3,
)
futures.append(future)

from tqdm import tqdm

# Track which examples passed
passed_examples = set()
all_results = {}

for future in tqdm(as_completed(futures), total=len(futures)):
(example_idx, code_idx, prog_idx), ret, program = future.result()

# Store result
if example_idx not in all_results:
all_results[example_idx] = []

all_results[example_idx].append({
'code_idx': code_idx,
'prog_idx': prog_idx,
'result': ret,
'is_correct': ret == 'pass',
'program': program,
})

# If this example passed with any code block or test variant
if ret == 'pass':
passed_examples.add(example_idx)

# Process final results
for example_idx, results in all_results.items():
# Did any variant pass?
example_passed = example_idx in passed_examples

# Get the first passing result if any, otherwise get the first result
result_to_use = next((r for r in results if r['is_correct']),
results[0])

# Update counters
if example_passed:
result['pass'] += 1
else:
result[result_to_use['result']] += 1

# Store details
details[str(example_idx)] = {
'result':
('pass' if example_passed else result_to_use['result']),
'is_correct': example_passed,
'num_attempts': len(results),
'code_blocks_tried': len(set(r['code_idx'] for r in results)),
'program': result_to_use['program'],
}

result['score'] = result['pass'] / len(predictions) * 100
result['details'] = details
Expand All @@ -150,53 +209,104 @@ def _process_answer(self, text):
else:
if isinstance(eval_text, str):
text = eval_text
# deal with code block

code_blocks = []
# breakpoint()
# extract all code blocks with ```python or ``` markers
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
# Try to find ```python blocks first
python_blocks = re.findall(r'```python\s*(.*?)```', text,
re.DOTALL)

# If no ```python blocks, look for generic ``` blocks
if not python_blocks:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if not blocks:
# Fall back: split by ``` and take the content between markers
parts = text.split('```')
if len(parts) > 1:
code_blocks.append(parts[1])
else:
for block in blocks:
# Skip language identifier if present
if not block.startswith('\n') and '\n' in block:
block = block[block.find('\n') + 1:]
code_blocks.append(block.strip())
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```xxx
text = text[max(text.find('\n') + 1, 0):]
text = text.strip()
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
if match:
text = text[:match.start()]
match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
if match:
text = text[match.end():]
text = text.strip()
if text.startswith("'"):
text = text[1:]
if text.endswith("'"):
text = text[:-1]
text = text.replace('\\', '')
match = re.search(r'```python(.*)```', text, re.DOTALL)
if match:
text = match.group(1).strip().split('```')[0].strip()
return text

def _process_test(self, test_case, pred):
formatted = pred + '\n'
code_blocks.extend([block.strip() for block in python_blocks])

# If no code blocks found, use the entire text
if not code_blocks:
code_blocks = [text]

# Process each code block
processed_blocks = []
for code in code_blocks:
# Clean up the code block
code = code.strip()
# Remove [BEGIN]/[DONE] markers
match = re.search(r"('\s*|)(\[DONE\]|DONE)", code)
if match:
code = code[:match.start()]
match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", code)
if match:
code = code[match.end():]
code = code.strip()
if code.startswith("'"):
code = code[1:]
if code.endswith("'"):
code = code[:-1]
code = code.replace('\\', '')

processed_blocks.append(code)

return processed_blocks

def _process_test(self, test_case, code):
"""Process test with both direct function call and Solution class.

Args:
test_case (str): Test case code
code (str): User submitted code
"""

# Add wrapper to support Solution class if it exists in the code
if 'class Solution' in code:
# Extract the function name from assert statements
# Looking for patterns like: assert func_name(args)
func_calls = re.findall(r'assert\s+(\w+)\(', test_case)
if func_calls:
# Get unique function names from the test case
func_names = set(func_calls)

modified_test = test_case
for func_name in func_names:
# Replace all occurrences of function calls with Solution().func_name
modified_test = re.sub(
r'(\bassert\s+)' + func_name + r'(\()',
r'\1Solution().' + func_name + r'\2',
modified_test,
)

# Use the modified test
test_case = modified_test

formatted = code + '\n'
formatted += test_case
# breakpoint()
return formatted


def execution(programs, task_id, timeout):
def execution(programs, task_ids, timeout):
"""Execution function for running generation code.

Args:
programs(str): Python code to be executed.
task_id(int): Task id of the current example.
timeout(int): Time limit for execution, avoid unnecessary
blocking.

In pass@k scenario, a lot of programs should be executed.
Some internal error cannot be handled properly, such as
`RecursionError` might cause system break. It is better to
separate the execution in thread or multiprocess to better
control the process.
task_ids(tuple): Tuple containing (example_idx, code_block_idx, program_variant_idx).
timeout(int): Time limit for execution.

Returns:
tuple: (task_ids, result_status, program_code)
"""

def _execution(programs, timeout):
Expand Down Expand Up @@ -227,8 +337,8 @@ def _execution(programs, timeout):
if p.is_alive():
p.kill()
# key might not have value if killed
return task_id, 'timeout'
return task_id, key[0]
return task_ids, 'timeout', programs
return task_ids, key[0], programs


class LCPassKEvaluator(LCEvaluator):
Expand Down Expand Up @@ -279,43 +389,99 @@ def score(self, predictions, references):

result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
details = {}

with ProcessPoolExecutor() as executor:
futures = []
index, programs = 0, []
task_info = [] # Store info for each task

index = 0
for refer, preds in zip(references, predictions):
# suits for two case
# 1. use repeated dataset
# 2. use `num_return_sequences` to generate multiple responses
if not isinstance(preds, list):
preds = [preds]

test_case = refer['test_list_2']
task_id = refer['task_id']

# create empty task_pass in case all example failed
if task_id not in task_pass:
task_pass[task_id] = 0

for pred in preds:
pred = self._process_answer(pred)
program = self._process_test(test_case, pred)
future = executor.submit(execution, program,
(index, task_id), 3)
futures.append(future)
programs.append(program)
# Extract all code blocks from the prediction
code_blocks = self._process_answer(pred)

# Try each code block with various test program formats
for code_idx, code_block in enumerate(code_blocks):
# Process test with the current code block
test_program = self._process_test(
test_case, code_block)

# Submit this program for execution
future = executor.submit(
execution,
test_program,
(
index,
task_id,
code_idx,
0,
), # prog_idx always 0 since we only have one program per code block
30,
)
futures.append(future)
task_info.append({
'index': index,
'task_id': task_id,
'code_block': code_block,
'program': test_program,
})

index += 1

# Track which tasks have passed with any code block
passed_tasks = set()
task_results = defaultdict(list)

from tqdm import tqdm

for future in tqdm(as_completed(futures), total=len(futures)):
(index, task_id), ret = future.result()
result[ret] += 1
task_total[task_id] += 1
is_correct = ret == 'pass'
task_pass[task_id] += is_correct
details[str(index)] = {
'program': programs[index],
(index, task_id, code_idx,
prog_idx), ret, program = future.result()

# Store result
task_results[(index, task_id)].append({
'result': ret,
'is_correct': ret == 'pass',
'program': program
})

# If this is a pass, mark the task
if ret == 'pass':
passed_tasks.add((index, task_id))

# Store detailed result
details[f'{index}_{code_idx}_{prog_idx}'] = {
'program': program,
'task_id': task_id,
'result': ret,
'is_correct': is_correct,
'is_correct': ret == 'pass',
}

# Process all tasks
for (index, task_id), results in task_results.items():
task_total[task_id] += 1
# Task passes if any code block passes
if (index, task_id) in passed_tasks:
task_pass[task_id] += 1
result['pass'] += 1
else:
# Get the first result to classify the error
first_result = results[0]['result']
result[first_result] += 1

result['details'] = details

def get_number(tasks):
Expand Down
Loading