import json
import shutil
import tempfile
import zlib
from pathlib import Path
from types import SimpleNamespace
from typing import List

import numpy as np
import pytest


@pytest.fixture(scope="session")
def tmp_doc_store(tmp_path_factory):
    """Create a tiny JSONL doc store for testing.

    Writes three toy documents (each with ``id`` and ``text`` fields) to
    ``docs.jsonl`` inside a session-scoped temp directory and returns the
    path to that file.
    """
    docs = [
        {"id": 0, "text": "Retrieval Augmented Generation combines retrieval and generation."},
        {"id": 1, "text": "BM25 is a strong lexical baseline in information retrieval."},
        {"id": 2, "text": "FAISS enables efficient similarity search over dense embeddings."},
    ]
    doc_path = tmp_path_factory.mktemp("docs") / "docs.jsonl"
    with doc_path.open("w") as f:
        for doc in docs:
            f.write(json.dumps(doc) + "\n")
    return doc_path


class _DummyEmbedder:
    """Fast, deterministic replacement for SentenceTransformer during tests.

    * Encodes each text into a 16-dim vector seeded from a stable hash of
      the text's bytes, so the same text always yields the same vector —
      across calls AND across processes.
    * Normalises vectors so the retriever workflow (IP metric) is preserved.
    """

    _dim = 16

    def __init__(self, *args, **kwargs):
        # Accepts (and ignores) whatever arguments the real
        # SentenceTransformer constructor would receive.
        self.rs = np.random.RandomState(42)

    def encode(self, texts, **kw):
        """Return a ``(len(texts), 16)`` float32 array of unit-norm vectors.

        A single string input is treated as a one-element batch (so the
        result is always 2-D). Extra keyword arguments are ignored.
        """
        if isinstance(texts, str):
            texts = [texts]
        vecs = []
        for t in texts:
            # BUG FIX: the original seeded from abs(hash(t)) % 2**32, but
            # Python's str hash is randomized per process (PYTHONHASHSEED),
            # which silently broke the documented determinism between test
            # runs. zlib.crc32 is a stable, process-independent 32-bit hash
            # and is already a valid RandomState seed (0 <= h < 2**32).
            h = zlib.crc32(t.encode("utf-8"))
            self.rs.seed(h)
            v = self.rs.randn(self._dim)
            v = v / np.linalg.norm(v)
            vecs.append(v.astype("float32"))
        return np.stack(vecs)

    # SentenceTransformer compatibility: readable name in test/debug output.
    # (The original comment was garbled: "SentenceTransformer.elasticsearch".)
    def __str__(self):
        return "DummyEmbedder"


@pytest.fixture(autouse=True)
def patch_sentence_transformers(monkeypatch):
    """Monkeypatch SentenceTransformer to a lightweight dummy implementation."""
    # Import here (not at module top) so the project module is only required
    # when the fixture actually runs.
    from evaluation.retrievers import dense as dense_mod

    monkeypatch.setattr(dense_mod, "SentenceTransformer", _DummyEmbedder)
    yield