option to use --reduced-test in dabstep evaluation (#16)

vinid · web-flow · commit 4633edd29fb8 · 2025-06-17T10:06:34.000-07:00
diff --git a/eval/README.md b/eval/README.md
@@ -1,4 +1,3 @@
-
 ## 📊 Evaluation
 
 The ReAct Data Science Agent includes two comprehensive evaluation frameworks to test its capabilities on real-world data science tasks:
@@ -22,12 +21,12 @@ The DABstep evaluation requires no manual data setup - everything is handled aut
 cd eval
 python dabstep.py
 
-# Test with just the first few examples
-python dabstep.py --test-first-only
-
 # Skip hard difficulty questions
 python dabstep.py --skip-hard
 
+# Sample 20 easy and 20 hard tasks (mutually exclusive with --skip-hard)
+python dabstep.py --reduced-test
+
 # Submit results (creates submission file)
 python dabstep.py --submit --which-split dev
 ```
@@ -37,6 +36,7 @@ python dabstep.py --submit --which-split dev
 2. **Specialized prompts**: The agent receives detailed instructions about financial domain concepts from `manual.md`
 3. **Precise file paths**: Uses absolute paths like `/app/downloaded_data/data/context/payments.csv`
 4. **Domain validation**: Emphasizes reading domain manuals before analysis to ensure correct interpretations
+5. **Task sampling**: Supports either skipping hard tasks or sampling a balanced subset of easy and hard tasks
 
 ### 🏆 Kaggle Competition Evaluation
 
diff --git a/eval/dabstep.py b/eval/dabstep.py
@@ -3,8 +3,9 @@
 import json
 from dataclasses import dataclass
 from pathlib import Path
+import random
 
-from datasets import load_dataset
+from datasets import load_dataset, concatenate_datasets
 from open_data_scientist.codeagent import ReActDataScienceAgent
 
 
@@ -150,31 +151,49 @@ def write_jsonl(data: list[dict], filepath: Path) -> None:
 
 
 def main(
-    test_first_only=False,
     submit=False,
     data_dir=None,
     which_split="dev",
     skip_hard=False,
+    reduced_test=False,
 ):
+    if skip_hard and reduced_test:
+        raise ValueError("Cannot use both --skip-hard and --reduced-test at the same time")
+
     # Load the dataset
     ds = load_dataset("adyen/DABstep", "tasks")
 
     dataset = ds[which_split]
 
     # Store hard tasks before filtering if we're skipping and submitting
     skipped_tasks = []
-    if skip_hard and submit:
-        skipped_tasks = [task for task in dataset if task.get("level") == "hard"]
-
     if skip_hard:
+        skipped_tasks = [task for task in dataset if task.get("level") == "hard"]
         dataset = dataset.filter(lambda example: example.get("level") != "hard")
-
-    if test_first_only:
-        dataset = dataset.select([0, 1, 2])
+    elif reduced_test:
+        dataset = dataset.shuffle(seed=42)
+        easy_tasks = dataset.filter(lambda x: x["level"] == "easy")
+        hard_tasks = dataset.filter(lambda x: x["level"] == "hard")
+
+        # Sample 20 tasks from each difficulty level
+        sampled_easy = easy_tasks.select(range(20))  
+        sampled_hard = hard_tasks.select(range(20))  
+
+        sampled_ids = set()
+        for task in sampled_easy:
+            sampled_ids.add(task["task_id"])
+        for task in sampled_hard:
+            sampled_ids.add(task["task_id"])
+
+        skipped_tasks = [task for task in dataset if task["task_id"] not in sampled_ids]
+        dataset = concatenate_datasets([sampled_easy, sampled_hard])
+        dataset = dataset.shuffle(seed=42)
+    else:
+        print("Running all tasks")
 
     number_of_examples = len(dataset)
     results = []
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
         future_to_task = {
             executor.submit(process_task, task, submit, data_dir): task
             for task in dataset
@@ -222,9 +241,6 @@ def main(
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Run DABstep evaluation")
-    parser.add_argument(
-        "--test-first-only", action="store_true", help="Test only the first example"
-    )
     parser.add_argument(
         "--submit", action="store_true", help="Submit the results to the leaderboard"
     )
@@ -237,6 +253,9 @@ def main(
     parser.add_argument(
         "--skip-hard", action="store_true", help="Skip examples with level=hard"
     )
+    parser.add_argument(
+        "--reduced-test", action="store_true", help="Sample 20 easy and 20 hard tasks"
+    )
     parser.add_argument(
         "--data-dir",
         default=None,
@@ -245,9 +264,9 @@ def main(
     args = parser.parse_args()
 
     main(
-        test_first_only=args.test_first_only,
         submit=args.submit,
         data_dir=args.data_dir,
         which_split=args.which_split,
         skip_hard=args.skip_hard,
+        reduced_test=args.reduced_test,
     )