from agent_diff import AgentDiff

# Client used by the snippets below to talk to the Agent Diff service.
client = AgentDiff()

# List all suites
suites = client.list_test_suites()
for suite in suites.testSuites:
    print(f"- {suite.name} ({suite.id})")

# Filter by name
slack_suites = client.list_test_suites(name="Slack")
# Get suite with test details
# NOTE(review): `suite_id` is not defined in this snippet -- presumably the
# `id` of one of the suites returned by list_test_suites(); confirm.
suite = client.get_test_suite(suite_id, expand=True)
print(f"Suite: {suite.name}")
print(f"Tests: {len(suite.tests)}")
for test in suite.tests:
    print(f" - {test.name}: {test.prompt}")
# Create a new test suite
suite = client.create_test_suite(
    name="My Agent Tests",
    description="Custom tests for my Slack agent",
    visibility="private",
)
print(f"Created suite: {suite.id}")
For more about running tests, see the "Running evaluations" guide.
from agent_diff import AgentDiff, PythonExecutorProxy, create_langchain_tool
from langgraph.prebuilt import create_react_agent as create_agent
from langchain_openai import ChatOpenAI

# Run every test in a suite against a fresh environment and report the results.
client = AgentDiff()

# Get suite
suite = client.get_test_suite("suite-123", expand=True)

results = []
model = ChatOpenAI(model="gpt-4o")

for test in suite.tests:
    # Create environment for this test
    env = client.init_env(testId=test.id)
    try:
        run = client.start_run(envId=env.environmentId, testId=test.id)

        # Run agent
        executor = PythonExecutorProxy(env.environmentId)
        agent = create_agent(
            model=model,
            tools=[create_langchain_tool(executor)],
            # LangGraph's create_react_agent takes the system prompt via
            # `prompt`; it has no `system_prompt` parameter.
            prompt="Use execute_python to interact with the API...",
        )
        agent.invoke({"messages": [{"role": "user", "content": test.prompt}]})

        # Evaluate
        result = client.evaluate_run(runId=run.runId)
        results.append({
            "test": test.name,
            "passed": result.passed,
            "score": result.score,
        })
    finally:
        # Cleanup: always delete the environment, even when the run, the
        # agent, or the evaluation raises, so failed tests do not leak it.
        client.delete_env(envId=env.environmentId)

# Summary
passed = sum(1 for r in results if r["passed"])
print(f"\nResults: {passed}/{len(results)} tests passed")
for r in results:
    status = "✓" if r["passed"] else "✗"
    print(f" {status} {r['test']}")