-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Expand file tree
/
Copy pathrun_benchmark.py
More file actions
121 lines (98 loc) · 4.26 KB
/
run_benchmark.py
File metadata and controls
121 lines (98 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
"""
Run the Vending-Bench benchmark and generate results.
This script runs the benchmark with the heuristic agent and generates
a comprehensive report comparing results with the leaderboard.
"""
import asyncio
import logging
import sys
from decimal import Decimal
from pathlib import Path
# Add the package to path for direct execution
sys.path.insert(0, str(Path(__file__).parent))
from elizaos_vending_bench.runner import VendingBenchRunner
from elizaos_vending_bench.types import VendingBenchConfig
def setup_logging() -> None:
"""Configure logging."""
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
async def main() -> int:
"""Run the benchmark."""
setup_logging()
logger = logging.getLogger(__name__)
# Configure benchmark
config = VendingBenchConfig(
num_runs=10, # 10 simulation runs for statistical significance
max_days_per_run=30, # Full 30-day simulation
initial_cash=Decimal("500.00"),
random_seed=42, # Fixed seed for reproducibility
model_name="heuristic",
temperature=0.0,
output_dir="./benchmark_results/vending-bench",
save_detailed_logs=True,
save_trajectories=True,
generate_report=True,
compare_leaderboard=True,
)
# Create and run benchmark
runner = VendingBenchRunner(config)
logger.info("=" * 70)
logger.info("Starting Vending-Bench Evaluation for ElizaOS")
logger.info("=" * 70)
logger.info("Configuration:")
logger.info(f" - Runs: {config.num_runs}")
logger.info(f" - Days per run: {config.max_days_per_run}")
logger.info(f" - Initial cash: ${config.initial_cash}")
logger.info(f" - Model: {config.model_name}")
logger.info(f" - Seed: {config.random_seed}")
logger.info("=" * 70)
report = await runner.run_benchmark()
# Print summary
print("\n" + "=" * 70)
print("VENDING-BENCH RESULTS")
print("=" * 70)
print("\n📊 PERFORMANCE SUMMARY")
print(f" Best Net Worth: ${report.metrics.max_net_worth:.2f}")
print(f" Average Net Worth: ${report.metrics.avg_net_worth:.2f}")
print(f" Median Net Worth: ${report.metrics.median_net_worth:.2f}")
print(f" Min Net Worth: ${report.metrics.min_net_worth:.2f}")
print(f" Std Deviation: ${report.metrics.std_net_worth:.2f}")
print("\n📈 SUCCESS METRICS")
print(f" Success Rate: {report.metrics.success_rate:.1%}")
print(f" Profitability Rate: {report.metrics.profitability_rate:.1%}")
print(f" Avg Profit: ${report.metrics.avg_profit:.2f}")
print("\n🧠 COHERENCE ANALYSIS")
print(f" Coherence Score: {report.metrics.coherence_score:.1%}")
print(f" Avg Errors/Run: {report.metrics.avg_coherence_errors:.1f}")
print("\n📦 OPERATIONAL METRICS")
print(f" Avg Items Sold: {report.metrics.avg_items_sold:.1f}")
print(f" Avg Orders Placed: {report.metrics.avg_orders_placed:.1f}")
print(f" Avg Stockout Days: {report.metrics.avg_stockout_days:.1f}")
print(f" Avg Simulation Days:{report.metrics.avg_simulation_days:.1f}")
if report.leaderboard_comparison:
print("\n🏆 LEADERBOARD COMPARISON")
print(f" Our Best Score: ${report.leaderboard_comparison.our_score:.2f}")
print(
f" Rank: #{report.leaderboard_comparison.our_rank} of {report.leaderboard_comparison.total_entries}"
)
print(f" Percentile: Top {100 - report.leaderboard_comparison.percentile:.0f}%")
print("\n Comparison with top models:")
for model, score, comparison in report.leaderboard_comparison.comparisons[:5]:
print(f" - {model}: ${score:.2f} ({comparison})")
print("\n" + "=" * 70)
print(f"📁 Results saved to: {config.output_dir}")
print("=" * 70 + "\n")
# Print key findings
print("KEY FINDINGS:")
for finding in report.summary.get("key_findings", []):
print(f" • {finding}")
print("\nRECOMMENDATIONS:")
for rec in report.summary.get("recommendations", []):
print(f" • {rec}")
return 0
if __name__ == "__main__":
sys.exit(asyncio.run(main()))