# RAG_Eval/tests/test_sparse_retriever.py
# Rom89823974978's picture
# Updated and resolved issues
# 3b8840f
import json
import subprocess
from pathlib import Path
import pytest
from evaluation.retrievers.bm25 import BM25Retriever
from evaluation.retrievers.base import Context
class DummyHit:
    """Minimal stand-in for a single hit returned by a searcher.

    Stores exactly the three values it is constructed with, exposed as the
    attributes ``docid``, ``raw`` and ``score``.
    """

    def __init__(self, docid, raw, score):
        self.docid, self.raw, self.score = docid, raw, score
class DummySearcher:
    """Offline stand-in for ``pyserini.search.SimpleSearcher``.

    It never touches an index on disk: construction and BM25 configuration
    are no-ops, and ``search`` returns a fixed, predictable list of hits.
    """

    def __init__(self, index_dir):
        """Accept an index location for signature compatibility and ignore it."""
        pass

    def set_bm25(self, *args, **kwargs):
        """Accept and discard any BM25 tuning parameters.

        The Pyserini method this replaces takes ``k1`` and ``b``; accepting
        arbitrary arguments keeps the stub usable whether or not the caller
        passes them.
        """
        pass

    def search(self, query, k=10):
        """Return up to ``k`` canned hits, best score first.

        Args:
            query: Ignored; the same hits are returned for every query.
            k: Maximum number of hits to return. Values below zero are
                treated as zero.

        Returns:
            A list of at most two ``DummyHit`` objects.
        """
        canned_hits = [
            DummyHit(docid=0, raw="first doc text", score=2.0),
            DummyHit(docid=1, raw="second doc text", score=1.5),
        ]
        # Clamp so a negative k cannot slice from the end of the list.
        return canned_hits[: max(k, 0)]
@pytest.fixture(autouse=True)
def patch_subprocess_and_pyserini(monkeypatch):
    """Keep every test in this module from shelling out or using real Pyserini.

    ``subprocess.run`` is replaced with a function that does nothing and
    returns ``None``. When ``pyserini.search`` can be imported, its
    ``SimpleSearcher`` attribute is replaced with ``DummySearcher``; when it
    cannot, that step is skipped.
    """

    def _noop_run(*args, **kwargs):
        return None

    monkeypatch.setattr(subprocess, "run", _noop_run)

    try:
        import pyserini.search as pyserini_search
    except ImportError:
        # Pyserini is not installed, so there is nothing to stub.
        return
    monkeypatch.setattr(pyserini_search, "SimpleSearcher", DummySearcher)
def test_bm25_index_build_and_query(tmp_path):
    """Build a retriever over a two-document store and check the returned hits."""
    # Two-line JSONL doc store: one JSON object per line.
    docs = [
        {"id": 0, "text": "Retrieval Augmented Generation"},
        {"id": 1, "text": "BM25 is strong"},
    ]
    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text("".join(json.dumps(doc) + "\n" for doc in docs))

    # The index directory must be absent before the retriever is constructed.
    index_dir = tmp_path / "bm25_index"
    assert not index_dir.exists()

    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))

    # Construction is expected to leave the index directory in place even
    # though subprocess.run is stubbed out for this module.
    assert index_dir.exists()

    results = retriever.retrieve("any query", top_k=2)

    assert isinstance(results, list)
    assert len(results) == 2
    assert all(isinstance(ctx, Context) for ctx in results)

    # Mirrors the two hits DummySearcher.search produces, in order.
    expected = [
        ("0", "first doc text", 2.0),
        ("1", "second doc text", 1.5),
    ]
    for ctx, (doc_id, text, score) in zip(results, expected):
        assert ctx.id == doc_id
        assert ctx.text == text
        assert ctx.score == pytest.approx(score, rel=1e-6)
def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
    """Check that ``retrieve`` returns an empty list when Pyserini cannot be imported."""
    import sys

    # A None entry in sys.modules makes any later `import pyserini.search`
    # raise ImportError; monkeypatch restores the original entry afterwards.
    monkeypatch.setitem(sys.modules, "pyserini.search", None)

    doc_store_path = tmp_path / "docs.jsonl"
    doc_store_path.write_text('{"id":0,"text":"hello"}\n')
    index_dir = tmp_path / "bm25_index2"

    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))

    # With the searcher import failing, retrieve() is expected to return [].
    assert retriever.retrieve("whatever", top_k=5) == []