A benchmark for evaluating AI agents' performance and reliability in realistic settings with dynamic user and tool interaction. It tests whether agents can complete complex tasks while interacting with (LLM-simulated) users and calling tools to gather the required information.
import { Bench } from 'benchthing';

// Create a client for the tau-bench benchmark
const bench = new Bench('tau-bench');

// Run a single task against your agents; the users and tools are LLM-simulated
await bench.run({
  benchmark: 'tau-bench',
  taskId: '1',
  agents: yourAgents, // your agent implementations under test
});

// Retrieve the result for task 1 once the run completes
const result = await bench.getResult('1');
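The snippet above does not show what `yourAgents` looks like. As a minimal sketch only, assuming the library accepts named agent functions that map the conversation so far to the agent's next reply or tool call (the type names and fields below are illustrative, not the actual benchthing interface):

// Hypothetical agent shape for illustration; not the real benchthing types.
type Message = { role: 'user' | 'assistant' | 'tool'; content: string };

interface AgentResponse {
  content: string; // the agent's reply to the simulated user
  toolCall?: { name: string; args: Record<string, unknown> }; // optional tool invocation
}

const yourAgents = {
  // Each agent receives the conversation so far and returns its next step.
  baselineAgent: async (messages: Message[]): Promise<AgentResponse> => {
    // Trivial placeholder policy: ask the simulated user for more details.
    return { content: 'Could you share the order ID so I can look it up?' };
  },
};

Consult the benchthing documentation for the actual agent interface; this sketch only illustrates the kind of object the run call expects.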