Spaces:

Rom89823974978
/

RAG_Eval

Sleeping

App Files Files Community

RAG_Eval / tests /test_sparse_retriever.py

Rom89823974978

Updated and resolved issues

3b8840f 4 months ago

raw

history blame contribute delete

3.06 kB

	import json
	import subprocess
	from pathlib import Path

	import pytest

	from evaluation.retrievers.bm25 import BM25Retriever
	from evaluation.retrievers.base import Context

	class DummyHit:
	    """Minimal stand-in for a search hit exposing ``docid``, ``raw`` and ``score``."""

	    def __init__(self, docid, raw, score):
	        # Keep the three constructor arguments unchanged as plain attributes.
	        self.docid, self.raw, self.score = docid, raw, score


	class DummySearcher:
	    """Test double used in place of ``pyserini.search.SimpleSearcher``.

	    It never touches an index: construction and BM25 configuration are
	    no-ops, and ``search`` always returns the same two hits.
	    """

	    def __init__(self, index_dir):
	        # The index location is accepted for signature compatibility only.
	        pass

	    def set_bm25(self, k1=0.9, b=0.4):
	        """Accept and ignore BM25 tuning parameters.

	        The parameters are optional so that both ``set_bm25()`` and
	        ``set_bm25(k1, b)`` call styles work against this stub.
	        NOTE(review): defaults are meant to mirror the real searcher's
	        signature; confirm against the installed pyserini version.
	        """
	        pass

	    def search(self, query, k=10):
	        """Return a fixed, predictable list of two hits.

	        ``query`` and ``k`` are ignored; ``k`` is optional so callers that
	        rely on a default result count do not fail on the stub.
	        """
	        return [
	            DummyHit(docid=0, raw="first doc text", score=2.0),
	            DummyHit(docid=1, raw="second doc text", score=1.5),
	        ]


	@pytest.fixture(autouse=True)
	def patch_subprocess_and_pyserini(monkeypatch):
	    """Isolate every test in this module from real indexing and searching.

	    Replaces ``subprocess.run`` with a no-op and, when pyserini can be
	    imported, swaps ``pyserini.search.SimpleSearcher`` for ``DummySearcher``.
	    Both patches are undone by ``monkeypatch`` after each test.
	    """
	    # (1) Prevent subprocess.run from actually calling "pyserini.index".
	    # The stub must accept arbitrary positional AND keyword arguments:
	    # subprocess.run is routinely called with keywords such as check=True.
	    monkeypatch.setattr(subprocess, "run", lambda *args, **kwargs: None)

	    # (2) Stub out pyserini.search.SimpleSearcher if available.
	    try:
	        import pyserini.search
	    except ImportError:
	        # pyserini is optional here; without it there is nothing to stub.
	        return
	    monkeypatch.setattr(pyserini.search, "SimpleSearcher", DummySearcher)


	def test_bm25_index_build_and_query(tmp_path):
	    """Build a BM25Retriever on a fresh index dir and check the stubbed hits.

	    Relies on the module's autouse fixture having replaced ``subprocess.run``
	    and ``pyserini.search.SimpleSearcher``; the expected ids, texts and
	    scores are the ones ``DummySearcher.search`` returns.
	    """
	    # The searcher stub is only installed when pyserini is importable, so
	    # skip (rather than fail) when it is not: the assertions below depend
	    # on the stubbed hits being returned.
	    pytest.importorskip("pyserini.search")

	    # Create a tiny doc_store JSONL
	    docs = [
	        {"id": 0, "text": "Retrieval Augmented Generation"},
	        {"id": 1, "text": "BM25 is strong"},
	    ]
	    doc_store_path = tmp_path / "docs.jsonl"
	    with doc_store_path.open("w", encoding="utf-8") as f:
	        f.writelines(json.dumps(obj) + "\n" for obj in docs)

	    # Point to a non-existent index directory
	    index_dir = tmp_path / "bm25_index"
	    assert not index_dir.exists()

	    # Instantiate BM25Retriever; __init__ should "build" the index
	    # (subprocess.run is a no-op under the fixture)
	    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))

	    # After init, index_dir is expected to exist
	    # NOTE(review): presumably created by the retriever's index build step - confirm
	    assert index_dir.exists()

	    # Now call retrieve(...)
	    results = retriever.retrieve("any query", top_k=2)

	    # Verify that we get two Context objects with correct fields
	    assert isinstance(results, list)
	    assert len(results) == 2
	    assert all(isinstance(r, Context) for r in results)

	    # DummySearcher returns docid=0 then docid=1, in that order
	    first, second = results
	    assert first.id == "0"
	    assert first.text == "first doc text"
	    assert first.score == pytest.approx(2.0, rel=1e-6)

	    assert second.id == "1"
	    assert second.text == "second doc text"
	    assert second.score == pytest.approx(1.5, rel=1e-6)

	def test_bm25_retrieve_when_pyserini_missing(monkeypatch, tmp_path):
	    """Check that retrieve() yields an empty list when pyserini.search cannot be imported."""
	    import sys

	    # Simulate ImportError for pyserini.search.SimpleSearcher: a None entry
	    # in sys.modules makes a subsequent import of that name fail.
	    monkeypatch.setitem(sys.modules, "pyserini.search", None)

	    doc_store_path = tmp_path / "docs.jsonl"
	    doc_store_path.write_text('{"id":0,"text":"hello"}\n')

	    index_dir = tmp_path / "bm25_index2"
	    retriever = BM25Retriever(bm25_idx=str(index_dir), doc_store=str(doc_store_path))

	    # If SimpleSearcher failed to import, retrieve() returns []
	    assert retriever.retrieve("whatever", top_k=5) == []