Commit bdb49ae
Parent(s): cdf4160
Further development
Files changed:
- evaluation/config.py                   +31 -1
- evaluation/pipeline.py                 +19 -4
- evaluation/rerankers/cross_encoder.py  +34 -0
- evaluation/stats/__init__.py           +18 -0
- evaluation/stats/correlation.py        +81 -0
- evaluation/stats/robustness.py         +79 -0
- evaluation/stats/significance.py       +38 -0
- tests/test_reranker.py                  +7 -0
- tests/test_stats.py                     +48 -0
evaluation/config.py
CHANGED
@@ -4,6 +4,14 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional, Literal
 
+@dataclass
+class CrossEncoderConfig:
+    enable: bool = False                # master switch
+    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    device: str = "cpu"
+    max_length: int = 512               # truncation length
+    first_stage_k: int = 50             # how many docs to pass to re-ranker
+    final_k: Optional[int] = None       # override PipelineConfig.retriever.top_k
 
 @dataclass
 class RetrieverConfig:
@@ -34,9 +42,31 @@ class GeneratorConfig:
     temperature: float = 0.0
 
 
+@dataclass
+class StatsConfig:
+    """Configuration parameters for all statistical analyses."""
+
+    # Correlation (RQ1 & RQ2)
+    correlation_method: Literal["spearman", "kendall"] = "spearman"
+    n_boot: int = 1000                  # bootstrap replicates for CIs
+    ci: float = 0.95                    # confidence level (e.g. 0.95 = 95 %)
+
+    # Significance tests (RQ2)
+    wilcoxon_alternative: Literal["two-sided", "greater", "less"] = "two-sided"
+    multiple_correction: Literal["holm-bonferroni", "none"] = "holm-bonferroni"
+    alpha: float = 0.05                 # family-wise error rate
+
+    # Robustness / sensitivity (RQ3 & RQ4)
+    compute_effect_size: bool = True
+    report_conditional_rates: bool = True
+
+
 @dataclass
 class PipelineConfig:
     """Top‑level pipeline configuration."""
-
+    reranker: CrossEncoderConfig = CrossEncoderConfig()
     retriever: RetrieverConfig = RetrieverConfig()
     generator: GeneratorConfig = GeneratorConfig()
+    stats: StatsConfig = StatsConfig()
+
+
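Note: PipelineConfig assigns dataclass instances (CrossEncoderConfig(), RetrieverConfig(), ...) directly as field defaults. On Python 3.11+ dataclasses reject instance defaults of unhashable classes with a "mutable default" ValueError, so field(default_factory=...) is the usual workaround. A minimal sketch of that variant plus how the re-ranker would be switched on (class names taken from the file above, values illustrative):

    from dataclasses import dataclass, field
    from evaluation.config import (
        CrossEncoderConfig, RetrieverConfig, GeneratorConfig, StatsConfig
    )

    # Hypothetical Python 3.11-safe variant of PipelineConfig.
    @dataclass
    class PipelineConfigSketch:
        reranker: CrossEncoderConfig = field(default_factory=CrossEncoderConfig)
        retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
        generator: GeneratorConfig = field(default_factory=GeneratorConfig)
        stats: StatsConfig = field(default_factory=StatsConfig)

    # Opting in to the cross-encoder re-ranker (values illustrative):
    cfg = PipelineConfigSketch()
    cfg.reranker.enable = True
    cfg.reranker.first_stage_k = 50   # candidates handed to the cross-encoder
    cfg.reranker.final_k = 5          # contexts kept after re-scoring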
evaluation/pipeline.py
CHANGED
@@ -8,7 +8,7 @@ from .config import PipelineConfig
 from .retrievers import bm25, dense, hybrid
 from .generators.hf_generator import HFGenerator
 from .retrievers.base import Retriever, Context
-
+from .rerankers.cross_encoder import CrossEncoderReranker
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
@@ -22,6 +22,15 @@ class RAGPipeline:
         self.generator = HFGenerator(
             model_name=cfg.generator.model_name, device=cfg.generator.device
         )
+        self.reranker = (
+            CrossEncoderReranker(
+                cfg.reranker.model_name,
+                device=cfg.reranker.device,
+                max_len=cfg.reranker.max_length,
+            )
+            if cfg.reranker.enable
+            else None
+        )
 
     # ---------------------------------------------------------------------
     # Public API
@@ -59,9 +68,15 @@
 
     def _retrieve(self, question: str) -> List[Context]:
         logger.info("Retrieving top‑%d passages", self.cfg.retriever.top_k)
-
-
-
+        k_first = self.cfg.reranker.first_stage_k if self.reranker else self.cfg.retriever.top_k
+        initial = self.retriever.retrieve(question, top_k=k_first)
+
+        if self.reranker:
+            final_k = self.cfg.reranker.final_k or self.cfg.retriever.top_k
+            logger.info("Re-ranking %d docs with cross-encoder ...", len(initial))
+            initial = self.reranker.rerank(question, initial, k=final_k)
+
+        return initial
 
     def _generate(self, question: str, contexts: List[Context]) -> str:
         texts = [c.text for c in contexts]
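With the re-ranker enabled, _retrieve becomes a two-stage fetch: the retriever returns reranker.first_stage_k candidates, the cross-encoder re-scores them, and the list is cut down to reranker.final_k (falling back to retriever.top_k when final_k is None). A rough trace of the sizes under the defaults above, assuming retriever.top_k = 5:

    # reranker.enable=True, first_stage_k=50, final_k=None, retriever.top_k=5 (assumed)
    # 1. retriever.retrieve(question, top_k=50)     -> 50 Context objects
    # 2. reranker.rerank(question, contexts, k=5)   -> 5 best-scoring Context objects
    # With reranker.enable=False the retriever is queried with top_k=5 directly.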
evaluation/rerankers/cross_encoder.py
ADDED
@@ -0,0 +1,34 @@
+"""Cross-encoder re-ranker built on SentenceTransformers CrossEncoder."""
+
+from __future__ import annotations
+from typing import List
+import logging
+
+from sentence_transformers import CrossEncoder
+import torch
+
+from evaluation.retrievers.base import Context
+
+logger = logging.getLogger(__name__)
+
+
+class CrossEncoderReranker:
+    """Re-scores (query, passage) pairs and returns top-k Contexts."""
+
+    def __init__(self, model_name: str, device: str = "cpu", max_len: int = 512):
+        self.model = CrossEncoder(model_name, device=device)
+        self.max_len = max_len
+        logger.info("Cross-encoder '%s' loaded on %s", model_name, device)
+
+    def rerank(self, query: str, contexts: List[Context], k: int) -> List[Context]:
+        pairs = [[query, c.text] for c in contexts]
+        scores = self.model.predict(
+            pairs,
+            convert_to_numpy=True,
+            show_progress_bar=False,
+            max_length=self.max_len,
+        )
+        for c, s in zip(contexts, scores):
+            c.score = float(s)
+        contexts.sort(key=lambda c: c.score, reverse=True)
+        return contexts[:k]
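One caveat worth checking against the pinned sentence-transformers version: in the releases I am aware of, max_length is an argument of the CrossEncoder constructor rather than of predict(), so the predict(..., max_length=...) call above may raise a TypeError. A sketch of the alternative placement (model name and pair are illustrative):

    from sentence_transformers import CrossEncoder

    # Truncation length set at construction time; predict() only gets batching/output options.
    model = CrossEncoder(
        "cross-encoder/ms-marco-MiniLM-L-6-v2",
        device="cpu",
        max_length=512,
    )
    pairs = [["what is retrieval-augmented generation?", "some candidate passage"]]
    scores = model.predict(pairs, convert_to_numpy=True, show_progress_bar=False)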
evaluation/stats/__init__.py
ADDED
@@ -0,0 +1,18 @@
+"""Statistical utilities for analysis scripts."""
+
+from .correlation import corr_ci
+from .significance import wilcoxon_signed_rank, holm_bonferroni
+from .robustness import (
+    delta_metric,
+    conditional_failure_rate,
+    chi2_error_propagation,
+)
+
+__all__ = [
+    "corr_ci",
+    "wilcoxon_signed_rank",
+    "holm_bonferroni",
+    "delta_metric",
+    "conditional_failure_rate",
+    "chi2_error_propagation",
+]
evaluation/stats/correlation.py
ADDED
@@ -0,0 +1,81 @@
+"""Correlation helpers for RQ1 and RQ2 analyses.
+
+Functions here wrap `scipy.stats` to compute non‑parametric correlations
+(Spearman ρ, Kendall τ) with optional bootstrap confidence intervals so
+results can be reported with uncertainty estimates.
+
+Typical usage
+-------------
+>>> from evaluation.stats.correlation import corr_ci
+>>> rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000)
+"""
+
+from __future__ import annotations
+from typing import Tuple, Sequence, Literal
+
+import numpy as np
+from scipy import stats
+
+
+Method = Literal["spearman", "kendall"]
+
+
+def _correlate(x: Sequence[float], y: Sequence[float], method: Method):
+    if method == "spearman":
+        return stats.spearmanr(x, y, nan_policy="omit")
+    if method == "kendall":
+        return stats.kendalltau(x, y, nan_policy="omit")
+    raise ValueError(method)
+
+
+def corr_ci(
+    x: Sequence[float],
+    y: Sequence[float],
+    *,
+    method: Method = "spearman",
+    n_boot: int = 1000,
+    ci: float = 0.95,
+    random_state: int | None = None,
+) -> Tuple[float, Tuple[float, float], float]:
+    """Correlation coefficient, bootstrap CI, and p‑value.
+
+    Parameters
+    ----------
+    x, y
+        Numeric sequences of equal length.
+    method
+        'spearman' or 'kendall'.
+    n_boot
+        Number of bootstrap resamples for the CI. 0 → no CI.
+    ci
+        Confidence level (e.g. 0.95 for 95 %).
+    random_state
+        Seed for reproducibility.
+
+    Returns
+    -------
+    r : float
+        Correlation coefficient.
+    (lo, hi) : Tuple[float, float]
+        Lower/upper CI bounds. ``(nan, nan)`` if *n_boot* == 0.
+    p : float
+        Two‑sided p‑value from the correlation test.
+    """
+    x = np.asarray(x, dtype=float)
+    y = np.asarray(y, dtype=float)
+    if x.shape != y.shape:
+        raise ValueError("x and y must have the same length")
+
+    r, p = _correlate(x, y, method)
+
+    if n_boot == 0:
+        return float(r), (float("nan"), float("nan")), float(p)
+
+    rng = np.random.default_rng(random_state)
+    bs = []
+    for _ in range(n_boot):
+        idx = rng.integers(0, len(x), len(x))
+        r_bs, _ = _correlate(x[idx], y[idx], method)
+        bs.append(r_bs)
+    lo, hi = np.percentile(bs, [(1 - ci) / 2 * 100, (1 + ci) / 2 * 100])
+    return float(r), (float(lo), float(hi)), float(p)
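A small self-contained usage sketch (synthetic data, seed fixed for reproducibility): for ci=0.95 the reported bounds are the 2.5th and 97.5th percentiles of the bootstrap distribution.

    import numpy as np
    from evaluation.stats.correlation import corr_ci

    rng = np.random.default_rng(0)
    x = rng.normal(size=200)
    y = 0.7 * x + rng.normal(scale=0.5, size=200)   # illustrative noisy linear relation

    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, random_state=0)
    print(f"rho={rho:.3f}  95% CI=({lo:.3f}, {hi:.3f})  p={p:.2e}")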
evaluation/stats/robustness.py
ADDED
@@ -0,0 +1,79 @@
+"""Robustness & sensitivity analysis helpers (RQ3 / RQ4)."""
+
+from __future__ import annotations
+from typing import Sequence, Tuple, Mapping, Any
+import numpy as np
+from scipy import stats
+
+
+def delta_metric(
+    orig: Sequence[float], perturbed: Sequence[float]
+) -> Tuple[float, float]:
+    """Return mean delta and Cohen's *d* effect size.
+
+    *orig* and *perturbed* must be paired metric values (same length).
+    """
+    orig = np.asarray(orig, dtype=float)
+    perturbed = np.asarray(perturbed, dtype=float)
+    if orig.shape != perturbed.shape:
+        raise ValueError("orig and perturbed must have the same length")
+    delta = np.mean(perturbed - orig)
+    pooled_sd = np.sqrt(((orig.std(ddof=1) ** 2) + (perturbed.std(ddof=1) ** 2)) / 2)
+    cohen_d = delta / pooled_sd if pooled_sd else float("nan")
+    return float(delta), float(cohen_d)
+
+
+def conditional_failure_rate(
+    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
+) -> Mapping[str, float]:
+    """Fraction of hallucinations conditional on retrieval failure.
+
+    Returns
+    -------
+    dict with keys:
+        p_hallucination_given_error
+        p_hallucination_given_success
+    """
+    import numpy as np
+
+    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
+    hallucinations = np.asarray(hallucinations, dtype=bool)
+
+    if retrieval_errors.shape != hallucinations.shape:
+        raise ValueError("Input lengths differ")
+
+    err_idx = retrieval_errors
+    succ_idx = ~retrieval_errors
+
+    def _rate(mask):
+        if mask.sum() == 0:
+            return float("nan")
+        return float(hallucinations[mask].mean())
+
+    return {
+        "p_hallucination_given_error": _rate(err_idx),
+        "p_hallucination_given_success": _rate(succ_idx),
+    }
+
+
+def chi2_error_propagation(
+    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
+):
+    """Chi‑square test of independence between retrieval error and hallucination."""
+    from scipy.stats import chi2_contingency
+
+    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
+    hallucinations = np.asarray(hallucinations, dtype=bool)
+
+    table = [
+        [
+            ((~retrieval_errors) & (~hallucinations)).sum(),
+            ((~retrieval_errors) & hallucinations).sum(),
+        ],
+        [
+            (retrieval_errors & (~hallucinations)).sum(),
+            (retrieval_errors & hallucinations).sum(),
+        ],
+    ]
+    chi2, p, dof, expected = chi2_contingency(table)
+    return dict(chi2=chi2, p=p, dof=dof, expected=expected, table=table)
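For intuition, a tiny worked example with hand-made labels (purely illustrative). The 2x2 table assembled by chi2_error_propagation has retrieval success/error as rows and no-hallucination/hallucination as columns.

    from evaluation.stats.robustness import conditional_failure_rate, chi2_error_propagation

    retrieval_errors = [False, False, False, True, True, True]   # illustrative flags
    hallucinations   = [False, False, True,  True, True, False]

    rates = conditional_failure_rate(retrieval_errors, hallucinations)
    # P(hallucination | retrieval error)   = 2/3 ≈ 0.67
    # P(hallucination | retrieval success) = 1/3 ≈ 0.33

    out = chi2_error_propagation(retrieval_errors, hallucinations)
    # out["table"] == [[2, 1], [1, 2]]   rows: success/error, cols: ok/hallucination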
evaluation/stats/significance.py
ADDED
@@ -0,0 +1,38 @@
+"""Significance testing utilities (Wilcoxon, Holm‑Bonferroni)."""
+
+from __future__ import annotations
+from typing import Sequence, Mapping, List, Tuple
+
+import numpy as np
+from scipy import stats
+
+
+def wilcoxon_signed_rank(
+    x: Sequence[float],
+    y: Sequence[float],
+    *,
+    alternative: str = "two-sided",
+):
+    """Paired Wilcoxon signed‑rank test (wrapper)."""
+    return stats.wilcoxon(x, y, alternative=alternative)
+
+
+def holm_bonferroni(pvalues: Mapping[str, float]) -> Mapping[str, float]:
+    """Holm‑Bonferroni correction for multiple hypotheses.
+
+    Parameters
+    ----------
+    pvalues : dict
+        Mapping from *name* → raw p‑value.
+
+    Returns
+    -------
+    dict
+        Mapping from *name* → adjusted p‑value.
+    """
+    m = len(pvalues)
+    sorted_items: List[Tuple[str, float]] = sorted(pvalues.items(), key=lambda kv: kv[1])
+    adjusted = {}
+    for i, (name, p) in enumerate(sorted_items, start=1):
+        adjusted[name] = min((m - i + 1) * p, 1.0)
+    return adjusted
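The loop above applies the Holm step-down factor (m - i + 1) but not the monotonicity step of the textbook procedure, under which each adjusted p-value is at least as large as the one before it in sorted order. A sketch of that extra step, keeping the same dict-in/dict-out interface (function name is illustrative):

    import numpy as np
    from typing import Mapping

    def holm_bonferroni_monotone(pvalues: Mapping[str, float]) -> Mapping[str, float]:
        """Holm adjustment with the cumulative-maximum (monotonicity) step."""
        m = len(pvalues)
        items = sorted(pvalues.items(), key=lambda kv: kv[1])
        raw = np.array([p for _, p in items])
        factors = np.arange(m, 0, -1)                  # m, m-1, ..., 1
        adj = np.minimum(np.maximum.accumulate(factors * raw), 1.0)
        return {name: float(a) for (name, _), a in zip(items, adj)}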
tests/test_reranker.py
ADDED
@@ -0,0 +1,7 @@
+def test_rerank():
+    from evaluation.rerankers.cross_encoder import CrossEncoderReranker
+    from evaluation.retrievers.base import Context
+    rer = CrossEncoderReranker("cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu")
+    dummy = [Context(id=str(i), text=f"text {i}", score=1.0) for i in range(5)]
+    out = rer.rerank("dummy query", dummy, k=3)
+    assert len(out) == 3
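Note that this test loads the real cross-encoder, so the first run downloads the ms-marco MiniLM checkpoint from the Hugging Face Hub and needs network access. One common way to keep it out of fast CI runs is a marker (marker name is illustrative and would need registering in the pytest configuration):

    import pytest

    @pytest.mark.slow   # deselect with: pytest -m "not slow"
    def test_rerank():
        ...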
tests/test_stats.py
ADDED
@@ -0,0 +1,48 @@
+from evaluation.stats import (
+    corr_ci,
+    wilcoxon_signed_rank,
+    holm_bonferroni,
+    delta_metric,
+    conditional_failure_rate,
+    chi2_error_propagation,
+)
+import numpy as np
+
+
+def test_corr_ci():
+    x = np.arange(10)
+    y = np.arange(10)
+    r, (lo, hi), p = corr_ci(x, y, n_boot=100)
+    assert r > 0.9 and lo <= r <= hi
+
+
+def test_wilcoxon():
+    x = [1, 2, 3]
+    y = [1, 3, 5]
+    stat, p = wilcoxon_signed_rank(x, y)
+    assert p < 0.2  # not exact, just smoke
+
+
+def test_holm():
+    raw = {"a": 0.01, "b": 0.04, "c": 0.20}
+    adj = holm_bonferroni(raw)
+    assert adj["a"] <= raw["a"]
+
+
+def test_delta_metric():
+    d, eff = delta_metric([1, 2, 3], [2, 3, 4])
+    assert d > 0 and eff > 0
+
+
+def test_conditional_failure_rate():
+    r = [True, False, True, False]
+    h = [True, False, False, True]
+    rates = conditional_failure_rate(r, h)
+    assert "p_hallucination_given_error" in rates
+
+
+def test_chi2():
+    r = [True, True, False, False]
+    h = [True, False, True, False]
+    out = chi2_error_propagation(r, h)
+    assert out["dof"] == 1