open-compass · Myhs-phz · Jun 25, 2025 · Jul 1, 2025
diff --git a/opencompass/datasets/LCBench.py b/opencompass/datasets/LCBench.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 import contextlib
 import io
 import itertools
@@ -117,25 +118,83 @@ class LCEvaluator(BaseEvaluator):
     def score(self, predictions, references):
         if len(predictions) != len(references):
             return {'error': 'preds and refrs have different length'}
+
         result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
         details = {}
+
         with ProcessPoolExecutor() as executor:
             futures = []
             for i, (refer, pred) in enumerate(zip(references, predictions)):
-                pred = self._process_answer(pred)
-                programs = self._process_test(refer, pred)
-                future = executor.submit(execution, programs, i, 3)
-                futures.append(future)
 
-            from tqdm import tqdm
-            for future in tqdm(as_completed(futures), total=len(futures)):
-                index, ret = future.result()
-                result[ret] += 1
-                details[str(index)] = {
-                    'programs': predictions[index],
-                    'result': ret,
-                    'is_correct': ret == 'pass',
-                }
+                code_blocks = self._process_answer(pred)
+
+                # Every code block is submitted; the example passes if any does
+                for code_idx, code_block in enumerate(code_blocks):
+                    # _process_test returns ONE program string; enumerating it
+                    # directly would submit every character as a "program"
+                    test_programs = [self._process_test(refer, code_block)]
+                    for prog_idx, program in enumerate(test_programs):
+                        future = executor.submit(
+                            execution,
+                            program,
+                            (
+                                i,
+                                code_idx,
+                                prog_idx,
+                            ),  # Echoed back by execution() for tracking
+                            3,
+                        )
+                        futures.append(future)
+
+        from tqdm import tqdm
+
+        # Track which examples passed
+        passed_examples = set()
+        all_results = {}
+
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            (example_idx, code_idx, prog_idx), ret, program = future.result()
+
+            # Store result
+            if example_idx not in all_results:
+                all_results[example_idx] = []
+
+            all_results[example_idx].append({
+                'code_idx': code_idx,
+                'prog_idx': prog_idx,
+                'result': ret,
+                'is_correct': ret == 'pass',
+                'program': program,
+            })
+
+            # If this example passed with any code block or test variant
+            if ret == 'pass':
+                passed_examples.add(example_idx)
+
+        # Process final results in example order (not completion order)
+        for example_idx, results in sorted(all_results.items()):
+            # Did any variant pass?
+            example_passed = example_idx in passed_examples
+
+            # Order attempts by submission so the chosen result is reproducible
+            results.sort(key=lambda r: (r['code_idx'], r['prog_idx']))
+            result_to_use = next((r for r in results if r['is_correct']), results[0])
+
+            # Update counters
+            if example_passed:
+                result['pass'] += 1
+            else:
+                result[result_to_use['result']] += 1
+
+            # Store details
+            details[str(example_idx)] = {
+                'result':
+                ('pass' if example_passed else result_to_use['result']),
+                'is_correct': example_passed,
+                'num_attempts': len(results),
+                'code_blocks_tried': len(set(r['code_idx'] for r in results)),
+                'program': result_to_use['program'],
+            }
 
         result['score'] = result['pass'] / len(predictions) * 100
         result['details'] = details
@@ -150,53 +209,104 @@ def _process_answer(self, text):
         else:
             if isinstance(eval_text, str):
                 text = eval_text
-        # deal with code block
+
+        code_blocks = []
+        # Extract every fenced code block: ```python fences are preferred and
+        # generic ``` fences are used only when no python-tagged block exists
         if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
+            # Try to find ```python blocks first
+            python_blocks = re.findall(r'```python\s*(.*?)```', text,
+                                       re.DOTALL)
+
+            # If no ```python blocks, look for generic ``` blocks
+            if not python_blocks:
+                blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+                if not blocks:
+                    # Lone (unclosed) fence: keep what follows it, minus a
+                    # leading language tag line such as "python"
+                    tail = text.split('```')[1]
+                    code_blocks.append(re.sub(r'^\w*[ \t]*\n', '', tail))
+                else:
+                    for block in blocks:
+                        # Skip language identifier if present
+                        if not block.startswith('\n') and '\n' in block:
+                            block = block[block.find('\n') + 1:]
+                        code_blocks.append(block.strip())
             else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```xxx
-                    text = text[max(text.find('\n') + 1, 0):]
-        text = text.strip()
-        match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
-        if match:
-            text = text[:match.start()]
-        match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
-        if match:
-            text = text[match.end():]
-        text = text.strip()
-        if text.startswith("'"):
-            text = text[1:]
-        if text.endswith("'"):
-            text = text[:-1]
-        text = text.replace('\\', '')
-        match = re.search(r'```python(.*)```', text, re.DOTALL)
-        if match:
-            text = match.group(1).strip().split('```')[0].strip()
-        return text
-
-    def _process_test(self, test_case, pred):
-        formatted = pred + '\n'
+                code_blocks.extend([block.strip() for block in python_blocks])
+
+        # If no code blocks found, use the entire text
+        if not code_blocks:
+            code_blocks = [text]
+
+        # Normalise every candidate independently; one output per input block
+        processed_blocks = []
+        for code in code_blocks:
+            # Clean up the code block
+            code = code.strip()
+            # Cut everything from the first DONE / [DONE] marker onwards
+            match = re.search(r"('\s*|)(\[DONE\]|DONE)", code)
+            if match:
+                code = code[:match.start()]
+            match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", code)  # drop text up to and including BEGIN
+            if match:
+                code = code[match.end():]
+            code = code.strip()
+            if code.startswith("'"):  # peel one wrapping quote from each end
+                code = code[1:]
+            if code.endswith("'"):
+                code = code[:-1]
+            code = code.replace('\\', '')  # NOTE(review): removes EVERY backslash, including escapes inside string literals - confirm intended
+
+            processed_blocks.append(code)
+
+        return processed_blocks
+
+    def _process_test(self, test_case, code):
+        """Concatenate ``code`` and ``test_case`` into one runnable program.
+
+        Args:
+            test_case (str): Test case code
+            code (str): User submitted code
+
+        Returns:
+            str: ``code``, a newline, then the (possibly rewritten) test.
+        """
+        # Add wrapper to support Solution class if it exists in the code
+        if 'class Solution' in code:
+            # Function names called directly in asserts: assert func_name(args)
+            func_calls = re.findall(r'assert\s+(\w+)\(', test_case)
+            if func_calls:
+                # Get unique function names from the test case
+                func_names = set(func_calls)
+
+                modified_test = test_case
+                for func_name in func_names:
+                    # Rewrite `assert f(` as `assert Solution().f(` (only calls right after assert)
+                    modified_test = re.sub(
+                        r'(\bassert\s+)' + func_name + r'(\()',
+                        r'\1Solution().' + func_name + r'\2',
+                        modified_test,
+                    )
+
+                # Use the modified test
+                test_case = modified_test
+
+        formatted = code + '\n'
         formatted += test_case
         return formatted
 
 
-def execution(programs, task_id, timeout):
+def execution(programs, task_ids, timeout):
     """Execution function for running generation code.
 
     Args:
         programs(str): Python code to be executed.
-        task_id(int): Task id of the current example.
-        timeout(int): Time limit for execution, avoid unnecessary
-            blocking.
-
-    In pass@k scenario, a lot of programs should be executed.
-    Some internal error cannot be handled properly, such as
-    `RecursionError` might cause system break. It is better to
-    separate the execution in thread or multiprocess to better
-    control the process.
+        task_ids(tuple): Caller-defined identifiers, returned unchanged, e.g. (example_idx, code_idx, prog_idx) or (index, task_id, code_idx, prog_idx).
+        timeout(int): Time limit for execution.
+
+    Returns:
+        tuple: (task_ids, result_status, program_code)
     """
 
     def _execution(programs, timeout):
@@ -227,8 +337,8 @@ def _execution(programs, timeout):
     if p.is_alive():
         p.kill()
         # key might not have value if killed
-        return task_id, 'timeout'
-    return task_id, key[0]
+        return task_ids, 'timeout', programs
+    return task_ids, key[0], programs
 
 
 class LCPassKEvaluator(LCEvaluator):
@@ -279,43 +389,99 @@ def score(self, predictions, references):
 
         result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
         details = {}
+
         with ProcessPoolExecutor() as executor:
             futures = []
-            index, programs = 0, []
+            task_info = []  # Store info for each task
+
+            index = 0
             for refer, preds in zip(references, predictions):
                 # suits for two case
                 # 1. use repeated dataset
                 # 2. use `num_return_sequences` to generate multiple responses
                 if not isinstance(preds, list):
                     preds = [preds]
+
                 test_case = refer['test_list_2']
                 task_id = refer['task_id']
+
                 # create empty task_pass in case all example failed
                 if task_id not in task_pass:
                     task_pass[task_id] = 0
+
                 for pred in preds:
-                    pred = self._process_answer(pred)
-                    program = self._process_test(test_case, pred)
-                    future = executor.submit(execution, program,
-                                             (index, task_id), 3)
-                    futures.append(future)
-                    programs.append(program)
+                    # Extract all code blocks from the prediction
+                    code_blocks = self._process_answer(pred)
+
+                    # Submit one program per extracted code block
+                    for code_idx, code_block in enumerate(code_blocks):
+                        # Process test with the current code block
+                        test_program = self._process_test(
+                            test_case, code_block)
+
+                        # Submit this program for execution
+                        future = executor.submit(
+                            execution,
+                            test_program,
+                            (
+                                index,
+                                task_id,
+                                code_idx,
+                                0,
+                            ),  # prog_idx always 0 since we only have one program per code block
+                            30,
+                        )
+                        futures.append(future)
+                        task_info.append({  # NOTE(review): only appended to in the visible code - confirm it is read somewhere
+                            'index': index,
+                            'task_id': task_id,
+                            'code_block': code_block,
+                            'program': test_program,
+                        })
+
                     index += 1
 
+            # Track which tasks have passed with any code block
+            passed_tasks = set()
+            task_results = defaultdict(list)
+
             from tqdm import tqdm
+
             for future in tqdm(as_completed(futures), total=len(futures)):
-                (index, task_id), ret = future.result()
-                result[ret] += 1
-                task_total[task_id] += 1
-                is_correct = ret == 'pass'
-                task_pass[task_id] += is_correct
-                details[str(index)] = {
-                    'program': programs[index],
+                (index, task_id, code_idx,
+                 prog_idx), ret, program = future.result()
+
+                # Store result
+                task_results[(index, task_id)].append({
+                    'code_idx': code_idx,  # lets attempts be re-ordered later
+                    'result': ret,
+                    'is_correct': ret == 'pass',
+                    'program': program
+                })
+                # If this is a pass, mark the task
+                if ret == 'pass':
+                    passed_tasks.add((index, task_id))
+
+                # Store detailed result
+                details[f'{index}_{code_idx}_{prog_idx}'] = {
+                    'program': program,
                     'task_id': task_id,
                     'result': ret,
-                    'is_correct': is_correct,
+                    'is_correct': ret == 'pass',
                 }
 
+            # Process all tasks
+            for (index, task_id), results in task_results.items():
+                task_total[task_id] += 1
+                # Task passes if any code block passes
+                if (index, task_id) in passed_tasks:
+                    task_pass[task_id] += 1
+                    result['pass'] += 1
+                else:
+                    # Classify by the first submitted block, not first finished
+                    first_result = min(results, key=lambda r: r['code_idx'])
+                    result[first_result['result']] += 1
+
         result['details'] = details
 
         def get_number(tasks):