A comprehensive benchmark that evaluates models on multiple-choice questions across 57 subjects, including STEM, the humanities, the social sciences, and more, with difficulty levels ranging from elementary to advanced.
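Each item is a four-option multiple-choice question. As a rough sketch only (the field names below are illustrative for exposition, not a schema defined by benchthing or the MMLU release), a single item can be thought of as:

# Illustrative shape of one MMLU item; field names are assumptions, not benchthing's schema
item = {
    "subject": "college_physics",             # one of the 57 subjects
    "question": "...",                        # the question text
    "choices": ["...", "...", "...", "..."],  # the four answer options
    "answer": 2,                              # index of the correct option
}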
from benchthing import Bench

# Models to evaluate (placeholder identifiers; the exact format benchthing expects is an assumption)
your_language_models = ["model-a", "model-b"]

bench = Bench("mmlu")

# Launch an MMLU run for the given models under task id "1"
bench.run(
    benchmark="mmlu",
    task_id="1",
    models=your_language_models,
)

# Fetch the result for the run with the same task id
result = bench.get_result("1")
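Note that get_result is keyed by the same task_id passed to run, which ties a result to its run; if benchthing executes runs asynchronously, the result may only become available after the run has finished.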