import asyncio
import json
from datetime import datetime

from agent_diff import AgentDiff, PythonExecutorProxy, create_openai_tool
from agents import Agent, Runner

client = AgentDiff()


async def run_benchmark(suite_name: str, model: str = "gpt-4o"):
    """Run every test in a suite and return per-test results."""
    # Look up the suite by name, then fetch it with its tests expanded
    suites = client.list_test_suites(name=suite_name)
    suite = client.get_test_suite(suites.testSuites[0].id, expand=True)

    results = []
    for i, test in enumerate(suite.tests, start=1):
        print(f"[{i}/{len(suite.tests)}] Running: {test.name}")
        start_time = datetime.now()

        # Create an isolated environment and start a run for this test
        env = client.init_env(testId=test.id)
        run = client.start_run(envId=env.environmentId, testId=test.id)
        try:
            # Build an agent whose Python tool executes inside the environment
            executor = PythonExecutorProxy(env.environmentId, client.base_url)
            agent = Agent(
                model=model,
                name="Benchmark Agent",
                tools=[create_openai_tool(executor)],
                instructions="Use execute_python to interact with the API...",
            )

            # Let the agent attempt the task
            await Runner.run(agent, test.prompt)

            # Score the run against the test's expectations
            result = client.evaluate_run(runId=run.runId)
            elapsed = (datetime.now() - start_time).total_seconds()
            results.append({
                "test_id": test.id,
                "test_name": test.name,
                "passed": result.passed,
                "score": result.score,
                "failures": result.failures,
                "elapsed_seconds": elapsed,
            })
            status = "✓" if result.passed else "✗"
            print(f" {status} {result.score:.0%} ({elapsed:.1f}s)")
        except Exception as e:
            # Record the failure but keep the rest of the suite running
            results.append({
                "test_id": test.id,
                "test_name": test.name,
                "passed": False,
                "score": 0.0,
                "error": str(e),
                "elapsed_seconds": (datetime.now() - start_time).total_seconds(),
            })
            print(f" ✗ Error: {e}")
        finally:
            # Always tear the environment down, even when the run failed
            client.delete_env(envId=env.environmentId)

    return results

# Run the benchmark
results = asyncio.run(run_benchmark("Slack Bench", model="gpt-4o"))
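
# Persist the raw results so runs can be compared later (e.g. across models).
# A minimal sketch: the output path is illustrative, and default=str guards
# against failure objects that are not directly JSON-serializable.
with open("benchmark_results.json", "w") as f:
    json.dump(results, f, indent=2, default=str)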

# Summary
passed = sum(1 for r in results if r["passed"])
total_time = sum(r["elapsed_seconds"] for r in results)
print(f"\n{'='*50}")
print("BENCHMARK RESULTS: Slack Bench")
print(f"{'='*50}")
print(f"Passed: {passed}/{len(results)} ({passed/len(results):.0%})")
print(f"Total time: {total_time:.1f}s")
print(f"Avg time per test: {total_time/len(results):.1f}s")