A benchmark of real-world tasks with contexts of up to millions of tokens, designed to evaluate long-context language models (LCLMs) on in-context retrieval and reasoning. It features 6 long-context task categories, spanning retrieval, multi-hop compositional reasoning, and more, for a total of 35 datasets across 4 modalities.
from benchthing import Bench

# Initialize a client for the LOFT benchmark
bench = Bench("loft")

# Launch a run of task 1 against the models to be compared
bench.run(
    benchmark="loft",
    task_id="1",
    models=["gemini-1.5-flash-002", "claude-3", "yourLongContextModel"],
)

# Retrieve the results for the run once it has completed
result = bench.get_result("1")
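The exact shape of the returned result depends on the benchthing client; a minimal sketch, assuming `result` behaves like a dict-like mapping of model name to task score (a hypothetical structure, not a documented API):

# Assumption: result maps each evaluated model to its score on the task.
for model, score in result.items():
    print(f"{model}: {score}")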