HELMET: a comprehensive benchmark designed to evaluate long-context language models (LCLMs) effectively and thoroughly. Features seven diverse, application-centric task categories with controllable input lengths up to 128k tokens. Includes model-based evaluation for reliable metrics and few-shot prompting for robustly evaluating base models.
from benchthing import Bench

# Create a client scoped to the HELMET benchmark
bench = Bench("helmet")

# Submit an evaluation run for the listed models, identified by task_id
bench.run(
    benchmark="helmet",
    task_id="1",
    models=["gpt-4", "claude-2", "yourLongContextModel"],
)

# Fetch the results for the run once it has completed
result = bench.get_result("1")
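The description above mentions few-shot prompting for robustly evaluating base models. As a minimal sketch of that idea (not the benchmark's actual prompt template, and independent of the benchthing API), a few-shot prompt for a long-context QA task can be assembled by prepending a handful of worked examples before the test document and question. The helper name `build_few_shot_prompt`, the field names, and the separator below are illustrative assumptions.

# Illustrative only: how a few-shot prompt for a long-context task might be
# assembled. The helper, field names, and separator are hypothetical and are
# not part of the benchthing API or the HELMET prompt format.

def build_few_shot_prompt(demos, document, question, separator="\n\n---\n\n"):
    """Prepend worked examples before the test document and question so a
    base model (without instruction tuning) can infer the task format."""
    demo_blocks = [
        f"Document: {d['document']}\nQuestion: {d['question']}\nAnswer: {d['answer']}"
        for d in demos
    ]
    test_block = f"Document: {document}\nQuestion: {question}\nAnswer:"
    return separator.join(demo_blocks + [test_block])

# Toy usage; real documents would run up to 128k tokens.
demos = [
    {"document": "Alice founded Acme in 1999.", "question": "Who founded Acme?", "answer": "Alice"},
    {"document": "The bridge opened in 1932.", "question": "When did the bridge open?", "answer": "1932"},
]
prompt = build_few_shot_prompt(demos, document="Bob joined Acme in 2005.", question="When did Bob join Acme?")
print(prompt)

The point of the worked examples is that base models, which are not instruction-tuned, tend to drift in output format on long inputs; a few demonstrations anchor the expected answer format, which is what the description refers to by "few-shot prompting for robustly evaluating base models."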