Evaluates the functional correctness of code generated from docstrings. The benchmark consists of 164 hand-written programming problems that test a language model's ability to produce working Python code; each completion is judged by running it against the problem's unit tests.
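Each problem supplies a function signature and a docstring, and the model must generate a function body that passes the tests. For illustration, the first HumanEval problem looks roughly like this (a sketch based on the published dataset; the prompt text is lightly paraphrased):

from typing import List

def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """Check if any two numbers in the given list are closer to each
    other than the given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0], 0.3)
    True
    """

The generated body is then executed against the hidden tests. To run the benchmark through benchthing: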
from benchthing import Bench

bench = Bench("human-eval")

# Start a HumanEval run for task 1. `your_code_gen_model` is a
# placeholder: supply the model (or models) you want to evaluate.
bench.run(
    benchmark="human-eval",
    task_id="1",
    models=your_code_gen_model,
)

# Retrieve the result for the same task once the run has finished.
result = bench.get_result("1")
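HumanEval scores are conventionally reported as pass@k: the probability that at least one of k sampled completions passes the tests. How you extract sample counts from `result` depends on benchthing's result format, but as a minimal sketch, the unbiased estimator from the original HumanEval paper (Chen et al., 2021) can be computed like this (`pass_at_k` is an illustrative name, not part of benchthing):

import math

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimator: n total samples, c correct samples."""
    if n - c < k:
        # every size-k subset must contain at least one correct sample
        return 1.0
    return 1.0 - math.prod(1.0 - k / i for i in range(n - c + 1, n + 1))

# e.g. 3 correct completions out of 20 samples:
# pass_at_k(20, 3, 1) == 0.15, pass_at_k(20, 3, 10) ~= 0.89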