
Overview

Benchmarks let you systematically evaluate your agent against a suite of tests, tracking pass rates and comparing models.
Try it in Colab: Pass Rates Annotated

Built-in Evaluations

Slack Bench

suites = client.list_test_suites(name="Slack Bench")
suite = client.get_test_suite(suites.testSuites[0].id, expand=True)
print(f"Slack Bench: {len(suite.tests)} tests")
Coverage:
  • Message sending (5 tests)
  • Channel operations (4 tests)
  • Reactions (3 tests)
  • Threading (4 tests)
  • User mentions (4 tests)

Linear Bench

suites = client.list_test_suites(name="Linear Bench")
suite = client.get_test_suite(suites.testSuites[0].id, expand=True)
print(f"Linear Bench: {len(suite.tests)} tests")
Coverage:
  • Issue CRUD (12 tests)
  • Labels (6 tests)
  • Comments (5 tests)
  • Workflow states (8 tests)
  • Team operations (5 tests)
  • Projects (4 tests)
View the evaluation suite files on GitHub: Slack Bench | Linear Bench
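
To see which tests a suite contains before running it, iterate the expanded suite. A minimal sketch that uses only fields already shown on this page (test.id, test.name, test.prompt):
suites = client.list_test_suites(name="Slack Bench")
suite = client.get_test_suite(suites.testSuites[0].id, expand=True)

# Print each test's id, name, and the prompt the agent will receive
for test in suite.tests:
    print(f"{test.id}  {test.name}")
    print(f"    prompt: {test.prompt}")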

Running a Full Benchmark

import asyncio
import json
from datetime import datetime
from agent_diff import AgentDiff, PythonExecutorProxy, create_openai_tool
from agents import Agent, Runner

client = AgentDiff()

async def run_benchmark(suite_name: str, model: str = "gpt-4o"):
    """Run a full benchmark suite and return results."""
    
    # Get suite
    suites = client.list_test_suites(name=suite_name)
    suite = client.get_test_suite(suites.testSuites[0].id, expand=True)
    
    results = []
    
    for i, test in enumerate(suite.tests):
        print(f"[{i+1}/{len(suite.tests)}] Running: {test.name}")
        
        start_time = datetime.now()
        
        # Create environment
        env = client.init_env(testId=test.id)
        run = client.start_run(envId=env.environmentId, testId=test.id)
        
        try:
            # Create agent
            executor = PythonExecutorProxy(env.environmentId, client.base_url)
            agent = Agent(
                model=model,
                name="Benchmark Agent",
                tools=[create_openai_tool(executor)],
                instructions="Use execute_python to interact with the API..."
            )
            
            # Run
            await Runner.run(agent, test.prompt)
            
            # Evaluate
            result = client.evaluate_run(runId=run.runId)
            
            elapsed = (datetime.now() - start_time).total_seconds()
            
            results.append({
                "test_id": test.id,
                "test_name": test.name,
                "passed": result.passed,
                "score": result.score,
                "failures": result.failures,
                "elapsed_seconds": elapsed
            })
            
            status = "✓" if result.passed else "✗"
            print(f"    {status} {result.score:.0%} ({elapsed:.1f}s)")
            
        except Exception as e:
            results.append({
                "test_id": test.id,
                "test_name": test.name,
                "passed": False,
                "score": 0,
                "error": str(e),
                "elapsed_seconds": (datetime.now() - start_time).total_seconds()
            })
            print(f"    ✗ Error: {e}")
        
        finally:
            client.delete_env(envId=env.environmentId)
    
    return results

# Run benchmark
results = asyncio.run(run_benchmark("Slack Bench", model="gpt-4o"))

# Summary
passed = sum(1 for r in results if r["passed"])
total_time = sum(r["elapsed_seconds"] for r in results)

print(f"\n{'='*50}")
print(f"BENCHMARK RESULTS: Slack Bench")
print(f"{'='*50}")
print(f"Passed: {passed}/{len(results)} ({passed/len(results):.0%})")
print(f"Total time: {total_time:.1f}s")
print(f"Avg time per test: {total_time/len(results):.1f}s")

HuggingFace Dataset

Linear Bench is available as a HuggingFace dataset for reproducible research and model comparisons:

Linear Bench Mini

40 agent evaluation tasks for the Linear GraphQL API
from datasets import load_dataset

# Load the benchmark
dataset = load_dataset("hubertmarek/linear-bench-mini")

for example in dataset["train"]:
    print(f"Prompt: {example['prompt']}")
    print(f"Expected: {example['expected_state']}")
The dataset includes:
  • 40 diverse agent tasks (issue CRUD, labels, comments, workflows)
  • Pre-defined assertions in JSON format
  • Seed template references
  • Tags for filtering (linear, graphql, agent-eval)
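
If you only need a subset of the 40 tasks, the standard datasets filtering API works on the prompt field shown above. A quick sketch (keyword filtering is just an illustration; a real workflow might key off the dataset's assertions instead):
from datasets import load_dataset

dataset = load_dataset("hubertmarek/linear-bench-mini")

# Keep only the tasks whose prompt mentions labels
label_tasks = dataset["train"].filter(lambda ex: "label" in ex["prompt"].lower())
print(f"{len(label_tasks)} label-related tasks")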

Next Steps