# Spaces: Sleeping
import json | |
import subprocess | |
from pathlib import Path | |
import pytest | |
from evaluation.retrievers.bm25 import BM25Retriever | |
from evaluation.retrievers.base import Context | |
class DummyHit:
    """Minimal search-hit record exposing ``docid``, ``raw`` and ``score``."""

    def __init__(self, docid, raw, score):
        """Store the document id, raw text and score exactly as given."""
        self.docid, self.raw, self.score = docid, raw, score
class DummySearcher:
    """Offline stand-in for ``pyserini.search.SimpleSearcher``.

    It opens no index and performs no ranking; ``search`` always answers
    from the same fixed, predictable list of two hits.
    """

    def __init__(self, index_dir):
        """Accept ``index_dir`` for signature compatibility and ignore it."""

    def set_bm25(self, k1=0.9, b=0.4):
        """Accept BM25 parameters and ignore them.

        ``k1`` and ``b`` are optional so that callers which tune BM25 and
        callers which use the defaults both work against this stub.
        """

    def search(self, query, k=10):
        """Return at most ``k`` canned hits, regardless of ``query``.

        Args:
            query: Ignored.
            k: Maximum number of hits to return; non-positive values yield
                an empty list.

        Returns:
            A list of ``DummyHit`` objects in descending score order.
        """
        hits = [
            DummyHit(docid=0, raw="first doc text", score=2.0),
            DummyHit(docid=1, raw="second doc text", score=1.5),
        ]
        # Honor the requested cut-off; clamp so a negative k cannot slice
        # from the end of the list.
        return hits[:max(k, 0)]
@pytest.fixture(autouse=True)
def patch_subprocess_and_pyserini(monkeypatch):
    """Keep every test in this module from touching a real pyserini setup.

    Registered as an autouse fixture so the patches are applied to each test
    without the test having to request them; ``monkeypatch`` undoes them at
    teardown.

    Two things are replaced:

    * ``subprocess.run`` becomes a no-op returning ``None``, so nothing is
      actually executed when code shells out (e.g. to build an index).
    * ``pyserini.search.SimpleSearcher`` becomes ``DummySearcher``, when
      pyserini can be imported at all.
    """
    # Prevent subprocess.run from actually calling "pyserini.index".
    monkeypatch.setattr(subprocess, "run", lambda *args, **kwargs: None)

    # Only the import is guarded: a missing pyserini is an expected,
    # tolerated condition, whereas any other failure should surface.
    try:
        import pyserini.search
    except ImportError:
        return

    # raising=False installs the stub even if this pyserini build does not
    # export SimpleSearcher, instead of failing with AttributeError.
    monkeypatch.setattr(
        pyserini.search, "SimpleSearcher", DummySearcher, raising=False
    )
def test_bm25_index_build_and_query(tmp_path):
    """Construct a BM25Retriever over a tiny doc store and check ``retrieve``.

    The expected ids, texts and scores are the two canned hits that
    ``DummySearcher.search`` returns, so this test presumably relies on the
    stubs from ``patch_subprocess_and_pyserini`` being active.
    """
    # Two-record JSONL document store, one JSON object per line.
    corpus = [
        {"id": 0, "text": "Retrieval Augmented Generation"},
        {"id": 1, "text": "BM25 is strong"},
    ]
    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text(
        "".join(json.dumps(record) + "\n" for record in corpus)
    )

    # The index directory must not exist before the retriever is created...
    index_dir = tmp_path / "bm25_index"
    assert not index_dir.exists()

    retriever = BM25Retriever(
        bm25_idx=str(index_dir), doc_store=str(doc_store_path)
    )

    # ...and must exist once construction has finished.
    assert index_dir.exists()

    results = retriever.retrieve("any query", top_k=2)

    # Shape of the answer: a list of exactly two Context objects.
    assert isinstance(results, list)
    assert len(results) == 2
    assert all(isinstance(r, Context) for r in results)

    # Field-by-field comparison, in rank order (docid 0 first, then docid 1).
    expected = [
        ("0", "first doc text", 2.0),
        ("1", "second doc text", 1.5),
    ]
    for context, (doc_id, text, score) in zip(results, expected):
        assert context.id == doc_id
        assert context.text == text
        assert context.score == pytest.approx(score, rel=1e-6)
def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
    """Check that ``retrieve`` returns ``[]`` when pyserini.search cannot be imported."""
    import sys

    # A ``None`` entry in sys.modules makes any later
    # ``import pyserini.search`` raise ImportError; monkeypatch restores the
    # original entry (or its absence) at teardown.
    monkeypatch.setitem(sys.modules, "pyserini.search", None)

    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text('{"id":0,"text":"hello"}\n')
    index_dir = tmp_path / "bm25_index2"

    retriever = BM25Retriever(
        bm25_idx=str(index_dir), doc_store=str(doc_store_path)
    )

    # If SimpleSearcher failed to import, retrieve() returns [].
    assert retriever.retrieve("whatever", top_k=5) == []