Commit cdf4160 by Rom89823974978 committed (parent: 8521f60)

Updated work
evaluation/config.py CHANGED
@@ -15,6 +15,13 @@ class RetrieverConfig:
     faiss_index: Optional[Path] = None
     doc_store: Optional[Path] = None
     device: str = "cpu"
+
+    # hybrid only
+    alpha: float = 0.5  # sparse ↔ dense weight
+
+    # dense only
+    model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
+    embedder_cache: Optional[Path] = None
 
 
 @dataclass
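A quick reference for the new fields (an illustrative sketch, not part of the commit; `name` and `top_k` are existing RetrieverConfig fields outside this hunk, and the paths are placeholders):

    from pathlib import Path

    from evaluation.config import RetrieverConfig

    dense_cfg = RetrieverConfig(
        name="dense",
        top_k=5,
        faiss_index=Path("indexes/dense.faiss"),  # built automatically if missing
        doc_store=Path("data/docs.jsonl"),        # JSONL, one {"id": ..., "text": ...} per line
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        embedder_cache=Path(".cache/embedder"),   # optional
        device="cpu",
    )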
evaluation/pipeline.py CHANGED
@@ -44,16 +44,16 @@ class RAGPipeline:
     # ---------------------------------------------------------------------
     def _build_retriever(self, cfg: PipelineConfig) -> Retriever:
         name = cfg.retriever.name
+        r = cfg.retriever
         if name == "bm25":
-            return bm25.BM25Retriever(str(cfg.retriever.index_path))
+            return bm25.BM25Retriever(index_path=str(r.bm25_index))
         if name == "dense":
-            return dense.DenseRetriever(str(cfg.retriever.index_path))
+            return dense.DenseRetriever(faiss_index=str(r.faiss_index), doc_store=r.doc_store, model_name=r.model_name, embedder_cache=r.embedder_cache, device=r.device)
         if name == "hybrid":
-            # In a real setting one would supply two paths; simplified here.
             return hybrid.HybridRetriever(
-                bm25_idx=str(cfg.retriever.index_path),
-                dense_idx=str(cfg.retriever.index_path),
-                alpha=0.5,
+                bm25_idx=str(r.bm25_index),
+                dense_idx=str(r.faiss_index),
+                alpha=r.alpha,
             )
         raise ValueError(f"Unsupported retriever '{name}'")
 
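The hybrid branch now pulls both index paths and the interpolation weight from the config instead of reusing a single `index_path`. A matching config would look roughly like this (sketch; `bm25_index`, `name` and `top_k` are RetrieverConfig fields defined outside the config.py hunk above, and how `alpha` is applied is up to HybridRetriever):

    hybrid_cfg = RetrieverConfig(
        name="hybrid",
        top_k=5,
        bm25_index=Path("indexes/bm25"),          # Lucene directory for the sparse side
        faiss_index=Path("indexes/dense.faiss"),  # FAISS index for the dense side
        alpha=0.5,                                # sparse/dense interpolation weight
    )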
evaluation/retrievers/bm25.py CHANGED
@@ -1,30 +1,88 @@
-"""BM25 sparse retriever backed by Pyserini SimpleSearcher."""
+"""BM25 sparse retriever backed by Pyserini SimpleSearcher, with auto-indexing."""
 
 from __future__ import annotations
-from typing import List
 import logging
+import os
+import subprocess
+from pathlib import Path
+from typing import List, Optional
 
 from pyserini.search import SimpleSearcher
 
 from .base import Retriever, Context
 
-
 logger = logging.getLogger(__name__)
 
 
 class BM25Retriever(Retriever):
-    """Thin wrapper around Pyserini's BM25 searcher."""
+    """Pyserini BM25 searcher that will create the Lucene index on-the-fly."""
 
-    def __init__(self, index_path: str | None):
+    def __init__(
+        self,
+        index_path: str | os.PathLike | None,
+        *,
+        doc_store_path: Optional[str | os.PathLike] = None,
+        threads: int = 4,
+    ):
         if index_path is None:
-            raise ValueError("BM25 retriever requires a path to a Pyserini index.")
-        self.searcher = SimpleSearcher(index_path)
+            raise ValueError("`index_path` (directory) is required.")
+
+        index_path = Path(index_path)
+
+        # ------------------------------------------------------------------
+        # Build index if it does not already exist
+        # ------------------------------------------------------------------
+        if not index_path.exists():
+            if doc_store_path is None:
+                raise FileNotFoundError(
+                    f"BM25 index {index_path} not found and no `doc_store_path` supplied."
+                )
+            logger.info("BM25 index %s missing – building from %s ...",
+                        index_path, doc_store_path)
+            self._build_index(Path(doc_store_path), index_path, threads)
+
+        # ------------------------------------------------------------------
+        # Searcher
+        # ------------------------------------------------------------------
+        self.searcher = SimpleSearcher(str(index_path))
         self.searcher.set_bm25()
         logger.info("BM25Retriever initialised with index: %s", index_path)
 
+    # ------------------------------------------------------------------ #
+    # Public API
+    # ------------------------------------------------------------------ #
     def retrieve(self, query: str, *, top_k: int = 5) -> List[Context]:
         hits = self.searcher.search(query, k=top_k)
         return [
             Context(id=str(hit.docid), text=hit.raw, score=hit.score)  # type: ignore[attr-defined]
             for hit in hits
         ]
+
+    # ------------------------------------------------------------------ #
+    # Helpers
+    # ------------------------------------------------------------------ #
+    @staticmethod
+    def _build_index(
+        doc_store: Path,
+        index_dir: Path,
+        threads: int,
+    ):
+        """Call Pyserini's CLI to build a Lucene index from JSONL documents.
+
+        `doc_store` must be a JSONL file or directory containing JSONL files
+        with at least {"id": ..., "text": ...} per line.
+        """
+        index_dir.mkdir(parents=True, exist_ok=True)
+
+        cmd = [
+            "python", "-m", "pyserini.index",
+            "-collection", "JsonCollection",
+            "-generator", "DefaultLuceneDocumentGenerator",
+            "-input", str(doc_store),
+            "-index", str(index_dir),
+            "-threads", str(threads),
+            "-storePositions", "-storeDocvectors", "-storeRaw",
+        ]
+        logger.info("Running Pyserini indexer: %s", " ".join(cmd))
+        subprocess.run(cmd, check=True)  # raises if indexing fails
+        logger.info("Finished building Lucene index in %s", index_dir)
evaluation/retrievers/dense.py CHANGED
@@ -1,25 +1,115 @@
-"""Dense vector retriever placeholder (FAISS)."""
+"""Dense vector retriever with automatic FAISS index construction."""
 
 from __future__ import annotations
-from typing import List
+
+import json
 import logging
+import os
+from pathlib import Path
+from typing import List, Optional, Sequence, Union
+
+import faiss  # type: ignore
+import numpy as np
+from sentence_transformers import SentenceTransformer
 
-from .base import Retriever, Context
+from .base import Context, Retriever
 
 logger = logging.getLogger(__name__)
 
 
 class DenseRetriever(Retriever):
-    """A dense vector retriever using FAISS (placeholder implementation)."""
-
-    def __init__(self, index_path: str | None):
-        if index_path is None:
-            raise ValueError("Dense retriever requires a FAISS index file.")
-        import faiss  # pylint: disable=import-error
-
-        self.index = faiss.read_index(index_path)
-        logger.info("DenseRetriever initialised with FAISS index: %s", index_path)
+    """Sentence-Transformers + FAISS ANN search.
+
+    * If `faiss_index` does **not** exist, it is built from `doc_store`.
+    * Embedding model (and its cache location) are configurable.
+    """
+
+    def __init__(
+        self,
+        faiss_index: Union[str, Path],
+        *,
+        doc_store: Union[str, Path],
+        model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
+        embedder_cache: Optional[Union[str, Path]] = None,
+        device: str = "cpu",
+    ):
+        self.faiss_index = Path(faiss_index)
+        self.doc_store = Path(doc_store)
+
+        # ------------------------------------------------------------------
+        # Sentence-Transformers embedder
+        # ------------------------------------------------------------------
+        self.embedder = SentenceTransformer(
+            model_name,
+            device=device,
+            cache_folder=str(embedder_cache) if embedder_cache else None,
+        )
+        logger.info("Embedder '%s' ready (device=%s)", model_name, device)
+
+        # ------------------------------------------------------------------
+        # Build FAISS index if absent
+        # ------------------------------------------------------------------
+        if not self.faiss_index.exists():
+            logger.info("FAISS index %s missing – building ...", self.faiss_index)
+            self._build_index()
 
+        self.index = faiss.read_index(str(self.faiss_index))
+        logger.info("Loaded FAISS index with %d vectors", self.index.ntotal)
+
+        # Keep doc texts in memory for convenience
+        self._texts: List[str] = []
+        with self.doc_store.open() as f:
+            for line in f:
+                obj = json.loads(line)
+                self._texts.append(obj.get("text", ""))
+
+    # ------------------------------------------------------------------ #
+    # Public API
+    # ------------------------------------------------------------------ #
     def retrieve(self, query: str, *, top_k: int = 5) -> List[Context]:
-        # TODO: embed the query via a sentence transformer or similar.
-        raise NotImplementedError("DenseRetriever embedding is not implemented yet.")
+        vec = self._embed(query)
+        vec = np.asarray(vec, dtype="float32")[None, :]
+        dists, idxs = self.index.search(vec, top_k)
+        dists, idxs = dists[0], idxs[0]
+
+        results: List[Context] = []
+        for i, score in zip(idxs, dists):
+            if i == -1:
+                continue
+            if self.index.metric_type == faiss.METRIC_L2:
+                score = -score
+            text = self._texts[i] if i < len(self._texts) else ""
+            results.append(Context(id=str(i), text=text, score=float(score)))
+
+        results.sort(key=lambda c: c.score, reverse=True)
+        return results
+
+    # ------------------------------------------------------------------ #
+    # Internal helpers
+    # ------------------------------------------------------------------ #
+    def _embed(self, text: str) -> Sequence[float]:
+        return self.embedder.encode(text, normalize_embeddings=True).tolist()
+
+    def _build_index(self):
+        """Read all texts, embed them, and write a FAISS IP index."""
+        logger.info("Reading documents from %s", self.doc_store)
+        ids, texts = [], []
+        with self.doc_store.open() as f:
+            for line in f:
+                obj = json.loads(line)
+                ids.append(int(obj["id"]))
+                texts.append(obj["text"])
+
+        logger.info("Embedding %d documents ...", len(ids))
+        embs = self.embedder.encode(
+            texts,
+            batch_size=128,
+            show_progress_bar=True,
+            normalize_embeddings=True,
+        ).astype("float32")
+
+        logger.info("Creating FAISS index (Inner-Product)")
+        index = faiss.IndexFlatIP(embs.shape[1])
+        index.add(embs)
+        faiss.write_index(index, str(self.faiss_index))
+        logger.info("Saved FAISS index to %s", self.faiss_index)
tests/conftest.py ADDED
@@ -0,0 +1,64 @@
+import json
+import shutil
+import tempfile
+from pathlib import Path
+from types import SimpleNamespace
+from typing import List
+
+import numpy as np
+import pytest
+
+
+@pytest.fixture(scope="session")
+def tmp_doc_store(tmp_path_factory):
+    """Create a tiny JSONL doc store for testing."""
+    docs = [
+        {"id": 0, "text": "Retrieval Augmented Generation combines retrieval and generation."},
+        {"id": 1, "text": "BM25 is a strong lexical baseline in information retrieval."},
+        {"id": 2, "text": "FAISS enables efficient similarity search over dense embeddings."},
+    ]
+    doc_path = tmp_path_factory.mktemp("docs") / "docs.jsonl"
+    with doc_path.open("w") as f:
+        for doc in docs:
+            f.write(json.dumps(doc) + "\n")
+    return doc_path
+
+
+class _DummyEmbedder:
+    """Fast, deterministic replacement for SentenceTransformer during tests.
+
+    * Encodes each text into a 16-dim vector seeded by a hash of the text.
+    * Normalises vectors so the retriever workflow (IP metric) is preserved.
+    * Mirrors SentenceTransformer.encode: a single string yields a 1-D vector,
+      a list of strings yields a 2-D array.
+    """
+
+    _dim = 16
+
+    def __init__(self, *args, **kwargs):
+        self.rs = np.random.RandomState(42)
+
+    def encode(self, texts, **kw):
+        single = isinstance(texts, str)
+        if single:
+            texts = [texts]
+        vecs = []
+        for t in texts:
+            # Simple hash-based seed for determinism within a test run
+            h = abs(hash(t)) % (2**32)
+            self.rs.seed(h)
+            v = self.rs.randn(self._dim)
+            v = v / np.linalg.norm(v)
+            vecs.append(v.astype("float32"))
+        out = np.stack(vecs)
+        return out[0] if single else out
+
+    def __str__(self):
+        return "DummyEmbedder"
+
+
+@pytest.fixture(autouse=True)
+def patch_sentence_transformers(monkeypatch):
+    """Monkeypatch SentenceTransformer to a lightweight dummy implementation."""
+    # Patch the name as imported inside our retriever module
+    from evaluation.retrievers import dense as dense_mod
+
+    monkeypatch.setattr(dense_mod, "SentenceTransformer", _DummyEmbedder)
+    yield
tests/test_dense_retriever.py ADDED
@@ -0,0 +1,26 @@
+import faiss
+import numpy as np
+from pathlib import Path
+
+from evaluation.retrievers.dense import DenseRetriever
+
+
+def test_dense_retriever_build_and_search(tmp_doc_store, tmp_path):
+    faiss_index = tmp_path / "dense.index"
+
+    # Build index automatically
+    retriever = DenseRetriever(
+        faiss_index=faiss_index,
+        doc_store=tmp_doc_store,
+        model_name="dummy/ignored",  # ignored by dummy embedder
+        device="cpu",
+    )
+    assert faiss_index.exists(), "FAISS index should have been auto-created"
+
+    # Basic retrieval
+    results = retriever.retrieve("What enables similarity search?", top_k=3)
+    assert results, "Should return at least one context"
+    # Check score ordering descending
+    assert all(results[i].score >= results[i + 1].score for i in range(len(results) - 1))
+    # IDs must be strings by contract
+    assert isinstance(results[0].id, str)
tests/test_metrics.py ADDED
@@ -0,0 +1,26 @@
+from evaluation.metrics import (
+    precision_at_k,
+    recall_at_k,
+    mean_reciprocal_rank,
+    average_precision,
+    rag_score,
+)
+
+
+def test_retrieval_metrics_simple():
+    retrieved = ["a", "b", "c", "d"]
+    relevant = {"b", "d"}
+
+    assert precision_at_k(retrieved, relevant, 2) == 0.5
+    assert recall_at_k(retrieved, relevant, 4) == 1.0
+    assert mean_reciprocal_rank(retrieved, relevant) == 1 / 2
+    assert 0 < average_precision(retrieved, relevant) <= 1
+
+
+def test_rag_score_harmonic_mean():
+    r = {"prec": 0.8, "rec": 0.6}
+    g = {"bleu": 0.7}
+    s = rag_score(r, g)
+    assert 0 <= s <= 1
+    # The harmonic mean is never larger than the arithmetic mean
+    assert s <= (sum(r.values()) / len(r) + sum(g.values()) / len(g)) / 2
tests/test_pipeline_end_to_end.py ADDED
@@ -0,0 +1,39 @@
+from pathlib import Path
+
+import pytest
+
+from evaluation.config import PipelineConfig, RetrieverConfig, GeneratorConfig
+from evaluation.pipeline import RAGPipeline
+
+
+class _DummyGenerator:
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def generate(self, *args, **kw):
+        return "dummy answer"
+
+
+def test_pipeline_with_dense(tmp_doc_store, monkeypatch, tmp_path):
+    # Monkeypatch HFGenerator with a dummy, fast implementation
+    from evaluation import generators as gens_pkg  # noqa: F401
+    from evaluation.generators import hf_generator
+
+    monkeypatch.setattr(hf_generator, "HFGenerator", _DummyGenerator)
+
+    cfg = PipelineConfig(
+        retriever=RetrieverConfig(
+            name="dense",
+            top_k=2,
+            faiss_index=tmp_path / "dense.idx",
+            doc_store=tmp_doc_store,
+            device="cpu",
+            model_name="dummy/ignored",
+        ),
+        generator=GeneratorConfig(model_name="dummy"),
+    )
+
+    pipeline = RAGPipeline(cfg)
+    result = pipeline("What is BM25?")
+    assert result["answer"] == "dummy answer"
+    assert len(result["contexts"]) > 0
tests/test_smoke.py DELETED
@@ -1,2 +0,0 @@
-def test_smoke():
-    assert True