Rom89823974978 committed on
Commit
bdb49ae
·
1 Parent(s): cdf4160

Further development

Browse files
evaluation/config.py CHANGED
@@ -4,6 +4,14 @@ from dataclasses import dataclass
4
  from pathlib import Path
5
  from typing import Optional, Literal
6
 
 
 
 
 
 
 
 
 
7
 
8
  @dataclass
9
  class RetrieverConfig:
@@ -34,9 +42,31 @@ class GeneratorConfig:
34
  temperature: float = 0.0
35
 
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  @dataclass
38
  class PipelineConfig:
39
  """Top‑level pipeline configuration."""
40
-
41
  retriever: RetrieverConfig = RetrieverConfig()
42
  generator: GeneratorConfig = GeneratorConfig()
 
 
 
 
4
from dataclasses import field
from pathlib import Path
from typing import Optional, Literal
6
 
7
@dataclass
class CrossEncoderConfig:
    """Settings for the optional cross-encoder re-ranking stage."""

    # Master switch for the re-ranker.
    enable: bool = False
    # Name of the cross-encoder checkpoint to load.
    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
    # Device string the model is loaded on.
    device: str = "cpu"
    # Truncation length.
    max_length: int = 512
    # Number of first-stage documents handed to the re-ranker.
    first_stage_k: int = 50
    # When set, overrides PipelineConfig.retriever.top_k.
    final_k: Optional[int] = None
15
 
16
  @dataclass
17
  class RetrieverConfig:
 
42
  temperature: float = 0.0
43
 
44
 
45
@dataclass
class StatsConfig:
    """Settings shared by the statistical analysis helpers."""

    # --- Correlation (RQ1 & RQ2) -------------------------------------
    correlation_method: Literal["spearman", "kendall"] = "spearman"
    # Bootstrap replicates used for confidence intervals.
    n_boot: int = 1000
    # Confidence level as a fraction (0.95 means 95 %).
    ci: float = 0.95

    # --- Significance tests (RQ2) ------------------------------------
    wilcoxon_alternative: Literal["two-sided", "greater", "less"] = "two-sided"
    multiple_correction: Literal["holm-bonferroni", "none"] = "holm-bonferroni"
    # Family-wise error rate.
    alpha: float = 0.05

    # --- Robustness / sensitivity (RQ3 & RQ4) ------------------------
    compute_effect_size: bool = True
    report_conditional_rates: bool = True
62
+
63
+
64
@dataclass
class PipelineConfig:
    """Top-level pipeline configuration.

    Every sub-config is built through ``default_factory``. A dataclass
    instance used directly as a field default is unhashable, which Python
    3.11+ rejects with ``ValueError`` at class-creation time; on older
    versions the single default instance would be shared (and mutated) by
    every ``PipelineConfig``. A factory gives each pipeline its own copies.
    """

    reranker: CrossEncoderConfig = field(default_factory=CrossEncoderConfig)
    retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
    generator: GeneratorConfig = field(default_factory=GeneratorConfig)
    stats: StatsConfig = field(default_factory=StatsConfig)
71
+
72
+
evaluation/pipeline.py CHANGED
@@ -8,7 +8,7 @@ from .config import PipelineConfig
8
  from .retrievers import bm25, dense, hybrid
9
  from .generators.hf_generator import HFGenerator
10
  from .retrievers.base import Retriever, Context
11
-
12
  logger = logging.getLogger(__name__)
13
  logging.basicConfig(level=logging.INFO)
14
 
@@ -22,6 +22,15 @@ class RAGPipeline:
22
  self.generator = HFGenerator(
23
  model_name=cfg.generator.model_name, device=cfg.generator.device
24
  )
 
 
 
 
 
 
 
 
 
25
 
26
  # ---------------------------------------------------------------------
27
  # Public API
@@ -59,9 +68,15 @@ class RAGPipeline:
59
 
60
  def _retrieve(self, question: str) -> List[Context]:
61
  logger.info("Retrieving top‑%d passages", self.cfg.retriever.top_k)
62
- return self.retriever.retrieve(
63
- question, top_k=self.cfg.retriever.top_k
64
- )
 
 
 
 
 
 
65
 
66
  def _generate(self, question: str, contexts: List[Context]) -> str:
67
  texts = [c.text for c in contexts]
 
8
  from .retrievers import bm25, dense, hybrid
9
  from .generators.hf_generator import HFGenerator
10
  from .retrievers.base import Retriever, Context
11
+ from .rerankers.cross_encoder import CrossEncoderReranker
12
  logger = logging.getLogger(__name__)
13
  logging.basicConfig(level=logging.INFO)
14
 
 
22
  self.generator = HFGenerator(
23
  model_name=cfg.generator.model_name, device=cfg.generator.device
24
  )
25
+ self.reranker = (
26
+ CrossEncoderReranker(
27
+ cfg.reranker.model_name,
28
+ device=cfg.reranker.device,
29
+ max_len=cfg.reranker.max_length,
30
+ )
31
+ if cfg.reranker.enable
32
+ else None
33
+ )
34
 
35
  # ---------------------------------------------------------------------
36
  # Public API
 
68
 
69
  def _retrieve(self, question: str) -> List[Context]:
70
  logger.info("Retrieving top‑%d passages", self.cfg.retriever.top_k)
71
+ k_first = self.cfg.reranker.first_stage_k if self.reranker else self.cfg.retriever.top_k
72
+ initial = self.retriever.retrieve(question, top_k=k_first)
73
+
74
+ if self.reranker:
75
+ final_k = self.cfg.reranker.final_k or self.cfg.retriever.top_k
76
+ logger.info("Re-ranking %d docs with cross-encoder ...", len(initial))
77
+ initial = self.reranker.rerank(question, initial, k=final_k)
78
+
79
+ return initial
80
 
81
  def _generate(self, question: str, contexts: List[Context]) -> str:
82
  texts = [c.text for c in contexts]
evaluation/rerankers/cross_encoder.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Cross-encoder re-ranker built on SentenceTransformers CrossEncoder."""
2
+
3
+ from __future__ import annotations
4
+ from typing import List
5
+ import logging
6
+
7
+ from sentence_transformers import CrossEncoder
8
+ import torch
9
+
10
+ from evaluation.retrievers.base import Context
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class CrossEncoderReranker:
    """Re-scores (query, passage) pairs and returns top-k Contexts."""

    def __init__(self, model_name: str, device: str = "cpu", max_len: int = 512):
        """Load the cross-encoder *model_name* on *device*.

        *max_len* is the truncation length.  It is handed to the
        ``CrossEncoder`` constructor (``max_length``) because truncation is
        configured when the model is built; it is not an argument of
        ``predict``.
        """
        self.model = CrossEncoder(model_name, device=device, max_length=max_len)
        self.max_len = max_len
        logger.info("Cross-encoder '%s' loaded on %s", model_name, device)

    def rerank(self, query: str, contexts: List[Context], k: int) -> List[Context]:
        """Score every context against *query* and return the *k* best.

        Each context's ``score`` attribute is overwritten with the
        cross-encoder score.  The caller's list keeps its order; a new list
        sorted by descending score is returned.  An empty *contexts* yields an
        empty list without calling the model.
        """
        if not contexts:
            return []

        pairs = [[query, c.text] for c in contexts]
        scores = self.model.predict(
            pairs,
            convert_to_numpy=True,
            show_progress_bar=False,
        )
        for c, s in zip(contexts, scores):
            c.score = float(s)

        # sorted() instead of list.sort(): do not reorder the caller's list.
        ranked = sorted(contexts, key=lambda c: c.score, reverse=True)
        return ranked[:k]
evaluation/stats/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Statistical utilities for analysis scripts."""
2
+
3
+ from .correlation import corr_ci
4
+ from .significance import wilcoxon_signed_rank, holm_bonferroni
5
+ from .robustness import (
6
+ delta_metric,
7
+ conditional_failure_rate,
8
+ chi2_error_propagation,
9
+ )
10
+
11
+ __all__ = [
12
+ "corr_ci",
13
+ "wilcoxon_signed_rank",
14
+ "holm_bonferroni",
15
+ "delta_metric",
16
+ "conditional_failure_rate",
17
+ "chi2_error_propagation",
18
+ ]
evaluation/stats/correlation.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Correlation helpers for RQ1 and RQ2 analyses.
2
+
3
+ Functions here wrap `scipy.stats` to compute non‑parametric correlations
4
+ (Spearman ρ, Kendall τ) with optional bootstrap confidence intervals so
5
+ results can be reported with uncertainty estimates.
6
+
7
+ Typical usage
8
+ -------------
9
+ >>> from evaluation.stats.correlation import corr_ci
10
+ >>> rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000)
11
+ """
12
+
13
+ from __future__ import annotations
14
+ from typing import Tuple, Sequence, Literal
15
+
16
+ import numpy as np
17
+ from scipy import stats
18
+
19
+
20
+ Method = Literal["spearman", "kendall"]
21
+
22
+
23
+ def _correlate(x: Sequence[float], y: Sequence[float], method: Method):
24
+ if method == "spearman":
25
+ return stats.spearmanr(x, y, nan_policy="omit")
26
+ if method == "kendall":
27
+ return stats.kendalltau(x, y, nan_policy="omit")
28
+ raise ValueError(method)
29
+
30
+
31
def corr_ci(
    x: Sequence[float],
    y: Sequence[float],
    *,
    method: Method = "spearman",
    n_boot: int = 1000,
    ci: float = 0.95,
    random_state: int | None = None,
) -> Tuple[float, Tuple[float, float], float]:
    """Correlation coefficient, percentile-bootstrap CI, and p-value.

    Parameters
    ----------
    x, y
        Numeric sequences of equal length.
    method
        'spearman' or 'kendall'.
    n_boot
        Number of bootstrap resamples for the CI.  0 (or less) -> no CI.
    ci
        Confidence level as a fraction (e.g. 0.95 for 95 %).
    random_state
        Seed for reproducibility.

    Returns
    -------
    r : float
        Correlation coefficient.
    (lo, hi) : Tuple[float, float]
        Lower/upper CI bounds.  ``(nan, nan)`` when no CI is requested, the
        input is empty, or no resample produced a finite coefficient.
    p : float
        p-value reported by the correlation test.

    Raises
    ------
    ValueError
        If *x* and *y* differ in shape, *ci* is outside [0, 1], or *method*
        is unknown.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same length")
    if not 0.0 <= ci <= 1.0:
        raise ValueError("ci must be a fraction between 0 and 1")

    r, p = _correlate(x, y, method)
    no_ci = (float("nan"), float("nan"))

    n = len(x)
    if n_boot <= 0 or n == 0:
        return float(r), no_ci, float(p)

    rng = np.random.default_rng(random_state)
    boot = np.empty(n_boot, dtype=float)
    for b in range(n_boot):
        idx = rng.integers(0, n, n)
        r_bs, _ = _correlate(x[idx], y[idx], method)
        boot[b] = r_bs

    # A resample in which x or y is constant has no defined rank correlation
    # (NaN).  Drop those replicates so one of them cannot turn the whole
    # interval into NaN.
    boot = boot[np.isfinite(boot)]
    if boot.size == 0:
        return float(r), no_ci, float(p)

    lo, hi = np.percentile(boot, [(1 - ci) / 2 * 100, (1 + ci) / 2 * 100])
    return float(r), (float(lo), float(hi)), float(p)
evaluation/stats/robustness.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Robustness & sensitivity analysis helpers (RQ3 / RQ4)."""
2
+
3
+ from __future__ import annotations
4
+ from typing import Sequence, Tuple, Mapping, Any
5
+ import numpy as np
6
+ from scipy import stats
7
+
8
+
9
def delta_metric(
    orig: Sequence[float], perturbed: Sequence[float]
) -> Tuple[float, float]:
    """Mean paired difference (perturbed - orig) and a Cohen's *d* effect size.

    The effect size divides the mean difference by the root of the averaged
    sample variances (``ddof=1``) of the two inputs; it is ``nan`` when that
    denominator is zero.  Raises ``ValueError`` if the inputs differ in shape.
    """
    base = np.asarray(orig, dtype=float)
    pert = np.asarray(perturbed, dtype=float)
    if base.shape != pert.shape:
        raise ValueError("orig and perturbed must have the same length")

    mean_shift = np.mean(pert - base)
    variance_sum = (base.std(ddof=1) ** 2) + (pert.std(ddof=1) ** 2)
    pooled_sd = np.sqrt(variance_sum / 2)

    if not pooled_sd:
        return float(mean_shift), float("nan")
    return float(mean_shift), float(mean_shift / pooled_sd)
24
+
25
+
26
def conditional_failure_rate(
    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
) -> Mapping[str, float]:
    """Hallucination rate split by whether retrieval failed.

    Returns
    -------
    dict with keys:
        p_hallucination_given_error
        p_hallucination_given_success
    A rate is ``nan`` when its conditioning group is empty.  Raises
    ``ValueError`` if the two inputs differ in shape.
    """
    failed = np.asarray(retrieval_errors, dtype=bool)
    halluc = np.asarray(hallucinations, dtype=bool)
    if failed.shape != halluc.shape:
        raise ValueError("Input lengths differ")

    groups = (
        ("p_hallucination_given_error", failed),
        ("p_hallucination_given_success", ~failed),
    )
    rates = {}
    for key, subset in groups:
        if subset.sum() == 0:
            rates[key] = float("nan")
        else:
            rates[key] = float(halluc[subset].mean())
    return rates
57
+
58
+
59
def chi2_error_propagation(
    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
):
    """Chi-square test of independence between retrieval error and hallucination.

    Builds the 2x2 contingency table (rows: retrieval ok / retrieval error;
    columns: no hallucination / hallucination) and passes it to
    ``scipy.stats.chi2_contingency``.

    Returns
    -------
    dict with keys ``chi2``, ``p``, ``dof``, ``expected`` and ``table``.
    ``chi2``/``p`` are Python floats, ``dof`` and the table cells are Python
    ints, and ``expected`` is passed through as returned by SciPy.

    Raises
    ------
    ValueError
        If the inputs differ in shape.  ``chi2_contingency`` itself may also
        raise (e.g. when a row or column of the table is all zeros).
    """
    from scipy.stats import chi2_contingency

    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
    hallucinations = np.asarray(hallucinations, dtype=bool)

    # Same guard as conditional_failure_rate: without it a length-1 input
    # would be broadcast silently against the other array.
    if retrieval_errors.shape != hallucinations.shape:
        raise ValueError("Input lengths differ")

    retrieved_ok = ~retrieval_errors
    faithful = ~hallucinations
    table = [
        [
            int((retrieved_ok & faithful).sum()),
            int((retrieved_ok & hallucinations).sum()),
        ],
        [
            int((retrieval_errors & faithful).sum()),
            int((retrieval_errors & hallucinations).sum()),
        ],
    ]
    chi2, p, dof, expected = chi2_contingency(table)
    return dict(
        chi2=float(chi2), p=float(p), dof=int(dof), expected=expected, table=table
    )
evaluation/stats/significance.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Significance testing utilities (Wilcoxon, Holm‑Bonferroni)."""
2
+
3
+ from __future__ import annotations
4
+ from typing import Sequence, Mapping, List, Tuple
5
+
6
+ import numpy as np
7
+ from scipy import stats
8
+
9
+
10
def wilcoxon_signed_rank(
    x: Sequence[float],
    y: Sequence[float],
    *,
    alternative: str = "two-sided",
):
    """Run ``scipy.stats.wilcoxon`` on the paired samples *x* and *y*.

    *alternative* is forwarded unchanged; the SciPy result object is returned
    as-is.
    """
    result = stats.wilcoxon(x, y, alternative=alternative)
    return result
18
+
19
+
20
def holm_bonferroni(pvalues: Mapping[str, float]) -> Mapping[str, float]:
    """Holm-Bonferroni correction for multiple hypotheses.

    The raw p-values are sorted ascending; the i-th smallest (1-based) is
    multiplied by ``m - i + 1``.  A running maximum is then applied so the
    adjusted values are non-decreasing in that order (the step-down rule),
    and each result is capped at 1.0.

    Parameters
    ----------
    pvalues : dict
        Mapping from *name* -> raw p-value.

    Returns
    -------
    dict
        Mapping from *name* -> adjusted p-value (empty for empty input).
    """
    m = len(pvalues)
    sorted_items: List[Tuple[str, float]] = sorted(
        pvalues.items(), key=lambda kv: kv[1]
    )
    adjusted = {}
    running_max = 0.0
    for i, (name, p) in enumerate(sorted_items, start=1):
        # Without the running maximum a larger raw p-value could end up with
        # a smaller adjusted value than one ranked before it.
        running_max = max(running_max, (m - i + 1) * p)
        adjusted[name] = min(running_max, 1.0)
    return adjusted
tests/test_reranker.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
def test_rerank():
    """Smoke test: re-ranking five dummy contexts with k=3 returns three."""
    from evaluation.rerankers.cross_encoder import CrossEncoderReranker
    from evaluation.retrievers.base import Context

    reranker = CrossEncoderReranker(
        "cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu"
    )
    candidates = []
    for i in range(5):
        candidates.append(Context(id=str(i), text=f"text {i}", score=1.0))

    top = reranker.rerank("dummy query", candidates, k=3)
    assert len(top) == 3
tests/test_stats.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from evaluation.stats import (
2
+ corr_ci,
3
+ wilcoxon_signed_rank,
4
+ holm_bonferroni,
5
+ delta_metric,
6
+ conditional_failure_rate,
7
+ chi2_error_propagation,
8
+ )
9
+ import numpy as np
10
+
11
+
12
def test_corr_ci():
    """Perfectly monotone data gives r close to 1 with a CI that brackets it."""
    x = np.arange(10)
    y = np.arange(10)
    # Seed the bootstrap so the test is deterministic.
    r, (lo, hi), p = corr_ci(x, y, n_boot=100, random_state=0)
    assert r > 0.9
    # Small tolerance: r and the bootstrap replicates all sit at ~1.0 and may
    # differ by floating-point rounding.
    tol = 1e-9
    assert lo - tol <= r <= hi + tol
17
+
18
+
19
def test_wilcoxon():
    """Eight pairs with distinct, same-sign differences are clearly significant."""
    # The differences are 1..8: no zeros and no ties, so the outcome does not
    # depend on which p-value method SciPy selects.  (The former 3-pair sample
    # had a zero difference, leaving two usable pairs, for which an exact
    # two-sided p-value cannot drop below 0.5.)
    x = [1, 2, 3, 4, 5, 6, 7, 8]
    y = [2, 4, 6, 8, 10, 12, 14, 16]
    stat, p = wilcoxon_signed_rank(x, y)
    assert stat == 0
    assert 0.0 <= p < 0.05
24
+
25
+
26
def test_holm():
    """Holm adjustment never lowers a p-value and stays within [0, 1]."""
    raw = {"a": 0.01, "b": 0.04, "c": 0.20}
    adj = holm_bonferroni(raw)
    assert set(adj) == set(raw)
    # Adjusted p-values are raw values scaled by a factor >= 1, so they can
    # only grow; the previous "<=" check was inverted.
    assert all(adj[name] >= raw[name] for name in raw)
    assert all(0.0 <= value <= 1.0 for value in adj.values())
    # Smallest p-value is multiplied by the number of hypotheses (3).
    assert abs(adj["a"] - 0.03) < 1e-12
30
+
31
+
32
def test_delta_metric():
    """A uniform +1 shift yields a positive delta and a positive effect size."""
    baseline = [1, 2, 3]
    shifted = [2, 3, 4]
    d, eff = delta_metric(baseline, shifted)
    assert d > 0
    assert eff > 0
35
+
36
+
37
def test_conditional_failure_rate():
    """The result exposes the error-conditioned hallucination rate."""
    retrieval_failed = [True, False, True, False]
    hallucinated = [True, False, False, True]
    rates = conditional_failure_rate(retrieval_failed, hallucinated)
    assert "p_hallucination_given_error" in rates
42
+
43
+
44
def test_chi2():
    """A 2x2 contingency table has exactly one degree of freedom."""
    retrieval_failed = [True, True, False, False]
    hallucinated = [True, False, True, False]
    result = chi2_error_propagation(retrieval_failed, hallucinated)
    assert result["dof"] == 1