SWE Benchmarks
import { Bench } from "benchthing";

// Client instance constructed for the "super-glue" benchmark.
const bench = new Bench("super-glue");

// Options for the run. `yourLanguageModels` is a placeholder that is not
// defined in this snippet; the caller must define it before this executes.
const runOptions = {
  benchmark: "super-glue",
  taskId: "1",
  models: yourLanguageModels,
};

// Top-level await: this must be executed as an ES module.
await bench.run(runOptions);

// Look up the result using the same task id that was passed to run().
const result = await bench.getResult("1");
from benchthing import Bench

# Client instance constructed for the "webarena" benchmark.
bench = Bench("webarena")

# `your_agents` is a placeholder that is not defined in this snippet; the
# caller must define it before this executes.
bench.run(
    benchmark="webarena",
    task_id="1",
    agents=your_agents,
)

# Look up the result using the same task id that was passed to run().
result = bench.get_result("1")