Documentation Index: Fetch the complete documentation index at https://agentdiff.mintlify.app/llms.txt
Use this file to discover all available pages before exploring further.
Overview
Benchmarks let you systematically evaluate your agent against a suite of tests, tracking pass rates and comparing models.
Built-in Evaluations
Slack Bench
# Find the built-in suite by name, then fetch it with expand=True to read its tests.
suites = client.list_test_suites(name="Slack Bench")
suite = client.get_test_suite(suites.testSuites[0].id, expand=True)
print(f"Slack Bench: {len(suite.tests)} tests")
Coverage:
Message sending (5 tests)
Channel operations (4 tests)
Reactions (3 tests)
Threading (4 tests)
User mentions (4 tests)
Linear Bench
# Find the built-in suite by name, then fetch it with expand=True to read its tests.
suites = client.list_test_suites(name="Linear Bench")
suite = client.get_test_suite(suites.testSuites[0].id, expand=True)
print(f"Linear Bench: {len(suite.tests)} tests")
Coverage:
Issue CRUD (12 tests)
Labels (6 tests)
Comments (5 tests)
Workflow states (8 tests)
Team operations (5 tests)
Projects (4 tests)
Running a Full Benchmark
import asyncio
import json
from datetime import datetime
from agent_diff import AgentDiff, PythonExecutorProxy, create_openai_tool
from agents import Agent, Runner
# Module-level AgentDiff client; run_benchmark() uses it for all suite,
# environment, run, and evaluation calls.
client = AgentDiff()
async def run_benchmark(suite_name: str, model: str = "gpt-4o"):
    """Run every test in a named suite against ``model`` and collect results.

    For each test this creates an environment, starts a run, lets an agent
    work on the test prompt, evaluates the run, and always deletes the
    environment afterwards. A failure in any step after the environment is
    created is recorded as an error result and the benchmark moves on to the
    next test.

    Args:
        suite_name: Name passed to ``client.list_test_suites``; the first
            returned suite is used.
        model: Model identifier handed to ``Agent``.

    Returns:
        A list with one dict per test, in suite order. Every dict has
        ``test_id``, ``test_name``, ``passed``, ``score`` and
        ``elapsed_seconds``. Evaluated tests also carry ``failures``; tests
        that raised carry ``error`` (the exception text) instead, with
        ``passed`` set to ``False`` and ``score`` set to ``0``.
    """
    # Get suite. Only the first match is used; an unknown suite name fails
    # here on the [0] lookup.
    suites = client.list_test_suites(name=suite_name)
    suite = client.get_test_suite(suites.testSuites[0].id, expand=True)
    total = len(suite.tests)

    results = []
    for index, test in enumerate(suite.tests, start=1):
        print(f"[{index}/{total}] Running: {test.name}")
        start_time = datetime.now()

        # Create environment
        env = client.init_env(testId=test.id)
        try:
            # Start the run inside the try block so the environment is still
            # deleted (and the failure recorded) if start_run itself raises.
            run = client.start_run(envId=env.environmentId, testId=test.id)

            # Create agent
            executor = PythonExecutorProxy(env.environmentId, client.base_url)
            agent = Agent(
                model=model,
                name="Benchmark Agent",
                tools=[create_openai_tool(executor)],
                instructions="Use execute_python to interact with the API...",
            )

            # Run
            await Runner.run(agent, test.prompt)

            # Evaluate
            result = client.evaluate_run(runId=run.runId)
            elapsed = (datetime.now() - start_time).total_seconds()
            results.append({
                "test_id": test.id,
                "test_name": test.name,
                "passed": result.passed,
                "score": result.score,
                "failures": result.failures,
                "elapsed_seconds": elapsed,
            })
            status = "✓" if result.passed else "✗"
            print(f"  {status} {result.score:.0%} ({elapsed:.1f}s)")
        except Exception as e:
            # Benchmark boundary: one failing test must not abort the suite,
            # so the error is recorded as a failed result and reported.
            elapsed = (datetime.now() - start_time).total_seconds()
            results.append({
                "test_id": test.id,
                "test_name": test.name,
                "passed": False,
                "score": 0,
                "error": str(e),
                "elapsed_seconds": elapsed,
            })
            print(f"  ✗ Error: {e}")
        finally:
            # Always release the per-test environment, pass or fail.
            client.delete_env(envId=env.environmentId)

    return results
# Run benchmark
suite_name = "Slack Bench"
results = asyncio.run(run_benchmark(suite_name, model="gpt-4o"))

# Summary
passed = sum(1 for r in results if r["passed"])
total = len(results)
total_time = sum(r["elapsed_seconds"] for r in results)

print(f"\n{'=' * 50}")
print(f"BENCHMARK RESULTS: {suite_name}")
print(f"{'=' * 50}")
if total:
    print(f"Passed: {passed}/{total} ({passed / total:.0%})")
    print(f"Total time: {total_time:.1f}s")
    print(f"Avg time per test: {total_time / total:.1f}s")
else:
    # An empty suite would otherwise divide by zero in the rates above.
    print("No tests were run.")
HuggingFace Dataset
Linear Bench is available as a HuggingFace dataset for reproducible research and model comparisons:
Linear Bench Mini: 40 agent evaluation tasks for the Linear GraphQL API
from datasets import load_dataset
# Load the benchmark
dataset = load_dataset("hubertmarek/linear-bench-mini")

# Print each task's prompt alongside its expected state
for example in dataset["train"]:
    print(f"Prompt: {example['prompt']}")
    print(f"Expected: {example['expected_state']}")
The dataset includes:
40 diverse agent tasks (issue CRUD, labels, comments, workflows)
Pre-defined assertions in JSON format
Seed template references
Tags for filtering (linear, graphql, agent-eval)
Next Steps
Creating Test Suites: Create your own test suites
Integrate with Agents: Connect your AI agent framework