TruthfulQA is a benchmark for evaluating model truthfulness. It comprises 817 questions spanning 38 categories, designed to test models' ability to avoid generating false answers learned from imitating human text.
"""Example: run the TruthfulQA benchmark against a set of models and fetch the result."""
from benchthing import Bench

# Models to evaluate — replace these placeholders with your own model identifiers.
# (The original snippet referenced an undefined `yourLanguageModels` name.)
your_language_models = ["your-model-id"]  # TODO: supply real model identifiers

# Single task id, defined once so the run and the result lookup cannot drift apart.
TASK_ID = "1"

bench = Bench("truthful-qa")
bench.run(
    benchmark="truthful-qa",
    task_id=TASK_ID,
    models=your_language_models,
)
# Retrieve the result of the task started above.
result = bench.get_result(TASK_ID)