Skip to main content

What Is a Test Suite?

A test suite is a collection of related tests that share:
  • A common service (Slack, Linear)
  • A template/seed configuration
  • Related functionality being tested

Test Suite Structure

{
  "id": "suite-123",
  "name": "Slack Bench",
  "description": "Core Slack agent capabilities",
  "visibility": "public",
  "tests": [
    {
      "id": "test-001",
      "name": "Post message to channel",
      "prompt": "Post 'Hello World!' to #general",
      "type": "actionEval",
      "environmentTemplate": "slack_bench_default",
      "impersonateUserId": "U01AGENBOT9",
      "expected_output": {
        "assertions": [...],
        "ignore_fields": {
          "global": ["created_at", "updated_at", "message_id"]
        }
      }
    }
  ]
}

Test Suite Parameters

| Parameter | Type | Required | Description |
|---|---|---|---|
| name | string | Yes | Name of the test suite |
| description | string | Yes | Description of what the suite tests |
| visibility | string | No | "public" or "private" (default) |

Test Parameters

| Parameter | Type | Required | Description |
|---|---|---|---|
| name | string | Yes | Name of the test |
| prompt | string | Yes | Task prompt for the agent |
| type | string | Yes | Test type: "actionEval", "retriEval", or "compositeEval" |
| environmentTemplate | string/UUID | Yes | Template name or ID to use |
| impersonateUserId | string | No | User ID the agent acts as |
| expected_output | object | Yes | Expected assertions and ignore_fields |

Expected Output Structure

| Field | Type | Description |
|---|---|---|
| assertions | array | List of assertions to evaluate |
| ignore_fields | object | Fields to ignore: {"global": [...], "entity_name": [...]} |
| strict | boolean | Fail if extra changes exist (default: false) |
Read more about how to create the expected output here: writing assertions

Listing Test Suites

from agent_diff import AgentDiff

client = AgentDiff()

# Fetch every suite visible to this client and print a short index line
suites = client.list_test_suites()
for s in suites.testSuites:
    print(f"- {s.name} ({s.id})")

# Narrow the listing by a name filter
slack_suites = client.list_test_suites(name="Slack")

Getting Suite Details

# Fetch the suite together with its tests (expand=True includes test details)
suite = client.get_test_suite(suite_id, expand=True)

print(f"Suite: {suite.name}")
print(f"Tests: {len(suite.tests)}")

# One line per test: name plus the prompt the agent will receive
for t in suite.tests:
    print(f"  - {t.name}: {t.prompt}")

Creating Test Suites

# Register a brand-new, private test suite under this account
suite = client.create_test_suite(
    name="My Agent Tests",
    description="Custom tests for my Slack agent",
    visibility="private",
)

print(f"Created suite: {suite.id}")

Adding Tests to a Suite

# Add a test with an assertion: exactly one new message containing
# "welcome" must appear in #general after the agent runs.
test = client.create_test(suite.id, {
    "name": "Post welcome message",
    "prompt": "Post a welcome message to #general",
    "type": "actionEval",
    "environmentTemplate": "slack_default",
    "impersonateUserId": "U01AGENBOT9",
    "expected_output": {
        "assertions": [{
            "diff_type": "added",
            "entity": "messages",
            "where": {
                "channel_id": {"eq": "C01GENERAL99"},
                "message_text": {"contains": "welcome"}
            },
            "expected_count": 1
        }],
        # Timestamps and generated IDs vary per run, so exclude them
        # from the diff everywhere.
        "ignore_fields": {
            "global": ["ts", "message_id", "created_at"]
        }
    }
})

# Add another test: the agent must add a threaded reply (parent_id set)
# whose text is exactly "Got it!".
test2 = client.create_test(suite.id, {
    "name": "Create thread reply",
    "prompt": "Reply 'Got it!' to the latest message in #general",
    "type": "actionEval",
    "environmentTemplate": "slack_default",
    "impersonateUserId": "U01AGENBOT9",
    "expected_output": {
        "assertions": [{
            "diff_type": "added",
            "entity": "messages",
            "where": {
                # Fixed: this is Python, not JSON — the literal is True,
                # not true (which would raise NameError).
                "parent_id": {"not_null": True},
                "message_text": {"eq": "Got it!"}
            }
        }],
        "ignore_fields": {
            "global": ["ts", "message_id", "created_at"]
        }
    }
})

Running All Tests in a Suite

Read more about running tests here: running evaluations
from agent_diff import AgentDiff, PythonExecutorProxy, create_langchain_tool
from langgraph.prebuilt import create_react_agent as create_agent
from langchain_openai import ChatOpenAI

client = AgentDiff()

# Get the suite with its tests expanded
suite = client.get_test_suite("suite-123", expand=True)

results = []
model = ChatOpenAI(model="gpt-4o")

for test in suite.tests:
    # Create an isolated environment for this test and start a run in it
    env = client.init_env(testId=test.id)
    run = client.start_run(envId=env.environmentId, testId=test.id)

    try:
        # Run the agent against the environment's API proxy
        executor = PythonExecutorProxy(env.environmentId)
        agent = create_agent(
            model=model,
            tools=[create_langchain_tool(executor)],
            system_prompt="Use execute_python to interact with the API..."
        )
        agent.invoke({"messages": [{"role": "user", "content": test.prompt}]})

        # Evaluate the state diff the agent produced
        result = client.evaluate_run(runId=run.runId)

        results.append({
            "test": test.name,
            "passed": result.passed,
            "score": result.score
        })
    finally:
        # Fixed: clean up even when the agent or evaluation raises,
        # so a failing test can't leak its environment.
        client.delete_env(envId=env.environmentId)

# Summary
passed = sum(1 for r in results if r["passed"])
print(f"\nResults: {passed}/{len(results)} tests passed")

for r in results:
    status = "✓" if r["passed"] else "✗"
    print(f"  {status} {r['test']}")

Test Visibility

| Visibility | Description |
|---|---|
| public | Visible to all users |
| private | Only visible to the creator |

Next Steps