A dataset of 7,787 grade-school-level, multiple-choice science questions designed to encourage research in advanced question answering. It includes a Challenge Set of questions that both retrieval-based and word co-occurrence algorithms answered incorrectly.
from benchthing import Bench
# Identifiers used for this run. The task id is named once so that the
# submit call and the result lookup below always refer to the same task.
benchmark_name = "arc-ai2"
task_id = "1"

# Create the benchmark client for the ARC benchmark.
bench = Bench(benchmark_name)

# Submit the run. `yourLanguageModels` is a placeholder that is not defined
# in this snippet; it must be bound by the caller before this code executes.
bench.run(
    benchmark=benchmark_name,
    task_id=task_id,
    models=yourLanguageModels,
)

# Look up the outcome of the task submitted above.
# NOTE(review): whether `run` blocks until the task has finished is not
# visible from this snippet -- confirm against the benchthing documentation.
result = bench.get_result(task_id)