Part of Meta's Purple Llama initiative, CyberSecEval is designed to evaluate cybersecurity risks and capabilities in LLMs. Version 3 includes test suites for visual prompt injection, spear-phishing capability assessment, and autonomous offensive cyber operations.
"""Run the CyberSecEval benchmark through the benchthing client and fetch its result."""
from benchthing import Bench

# Single source of truth for the task identifier: the original passed the
# literal "1" to both run() and get_result(), which lets the two calls
# silently drift apart if one is edited.
TASK_ID = "1"

bench = Bench("cyberseceval")
bench.run(
    benchmark="cyberseceval",
    task_id=TASK_ID,
    models=["speculative-model-1", "speculative-model-2"],
)
# NOTE(review): assumes run() blocks until the task finishes, or that
# get_result() waits/polls internally — confirm against the benchthing API
# before relying on `result` being final here.
result = bench.get_result(TASK_ID)