A comprehensive benchmark of diverse information-retrieval (IR) tasks that provides a common framework for evaluating NLP-based retrieval models. It includes 15+ datasets spanning various domains and supports evaluation with metrics such as NDCG@K, MAP@K, Recall@K, and Precision@K.
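As a quick reference for the first of these metrics, NDCG@K discounts the graded relevance of each retrieved document by its rank position and normalizes against the ideal ordering. The sketch below is a generic reference implementation, independent of the benchthing API; the function name and inputs are illustrative.

import math

def ndcg_at_k(relevances, k):
    # relevances: graded relevance of each retrieved document, in rank order.
    dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(relevances[:k]))
    # Ideal DCG: the same gains under a perfect (descending) ranking.
    ideal = sorted(relevances, reverse=True)
    idcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(ideal[:k]))
    return dcg / idcg if idcg > 0 else 0.0

print(ndcg_at_k([3, 0, 2], k=3))  # DCG = 4.0, ideal DCG ≈ 4.262, so ≈ 0.9386

Returning to the benchthing usage example: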
from benchthing import Bench

bench = Bench("beir")

# Run the BEIR benchmark task against your retrieval models.
bench.run(
    benchmark="beir",
    task_id="1",
    models=your_retrieval_models,  # placeholder: the retrieval models to evaluate
)

# Fetch the results for the task once the run completes.
result = bench.get_result("1")
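The shape of the returned result is not shown above; assuming it behaves like a mapping from metric names (e.g. "NDCG@10") to scores, it could be inspected as follows. This is a hypothetical sketch, not a documented part of the benchthing API.

# Hypothetical: assumes `result` maps metric names to scores.
for metric, score in result.items():
    print(f"{metric}: {score:.4f}")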