SWE-Bench

SWE Benchmarks

Code Block

run.ts
import { Bench } from "benchthing";
 
// Benchmark identifier, used both to construct the client and in the run request.
// NOTE(review): this page is titled SWE-Bench, yet the example targets
// "super-glue" — confirm the intended benchmark id.
const benchmarkName = "super-glue";

// Task id shared by the run request and the result lookup below.
const taskId = "1";

const bench = new Bench(benchmarkName);

// `yourLanguageModels` is a placeholder; the reader must define it before this runs.
await bench.run({ benchmark: benchmarkName, taskId, models: yourLanguageModels });

// Look the result up with the same task id that was passed to `run`.
const result = await bench.getResult(taskId);
run.py
from benchthing import Bench
 
# Benchmark identifier, used both to construct the client and in the run request.
# NOTE(review): this page is titled SWE-Bench, yet the example targets
# "webarena" and passes `agents` rather than models — confirm the intended benchmark id.
benchmark_name = "webarena"

# Task id shared by the run request and the result lookup below.
task_id = "1"

bench = Bench(benchmark_name)

# `your_agents` is a placeholder; the reader must define it before this runs.
bench.run(benchmark=benchmark_name, task_id=task_id, agents=your_agents)

# Look the result up with the same task id that was passed to `run`.
result = bench.get_result(task_id)

Cards