Commit bdb49ae
Parent(s): cdf4160
Further development
Files changed:
- evaluation/config.py                   +31 -1
- evaluation/pipeline.py                 +19 -4
- evaluation/rerankers/cross_encoder.py  +34 -0
- evaluation/stats/__init__.py           +18 -0
- evaluation/stats/correlation.py        +81 -0
- evaluation/stats/robustness.py         +79 -0
- evaluation/stats/significance.py       +38 -0
- tests/test_reranker.py                  +7 -0
- tests/test_stats.py                     +48 -0
evaluation/config.py
CHANGED
@@ -4,6 +4,14 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional, Literal
 
+@dataclass
+class CrossEncoderConfig:
+    enable: bool = False                # master switch
+    model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    device: str = "cpu"
+    max_length: int = 512               # truncation length
+    first_stage_k: int = 50             # how many docs to pass to re-ranker
+    final_k: Optional[int] = None       # override PipelineConfig.retriever.top_k
 
 @dataclass
 class RetrieverConfig:
@@ -34,9 +42,31 @@ class GeneratorConfig:
     temperature: float = 0.0
 
 
+@dataclass
+class StatsConfig:
+    """Configuration parameters for all statistical analyses."""
+
+    # Correlation (RQ1 & RQ2)
+    correlation_method: Literal["spearman", "kendall"] = "spearman"
+    n_boot: int = 1000                  # bootstrap replicates for CIs
+    ci: float = 0.95                    # confidence level (e.g. 0.95 = 95 %)
+
+    # Significance tests (RQ2)
+    wilcoxon_alternative: Literal["two-sided", "greater", "less"] = "two-sided"
+    multiple_correction: Literal["holm-bonferroni", "none"] = "holm-bonferroni"
+    alpha: float = 0.05                 # family-wise error rate
+
+    # Robustness / sensitivity (RQ3 & RQ4)
+    compute_effect_size: bool = True
+    report_conditional_rates: bool = True
+
+
 @dataclass
 class PipelineConfig:
     """Top‑level pipeline configuration."""
-
+    reranker: CrossEncoderConfig = CrossEncoderConfig()
     retriever: RetrieverConfig = RetrieverConfig()
     generator: GeneratorConfig = GeneratorConfig()
+    stats: StatsConfig = StatsConfig()
+
+
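Note: PipelineConfig assigns dataclass instances (CrossEncoderConfig(), RetrieverConfig(), ...) directly as field defaults. On Python 3.11+ dataclasses reject instance defaults of unhashable classes with a "mutable default" ValueError, so field(default_factory=...) is the usual workaround. A minimal sketch of that variant plus how the re-ranker would be switched on (class names taken from the file above, values illustrative):

    from dataclasses import dataclass, field
    from evaluation.config import (
        CrossEncoderConfig, RetrieverConfig, GeneratorConfig, StatsConfig
    )

    # Hypothetical Python 3.11-safe variant of PipelineConfig.
    @dataclass
    class PipelineConfigSketch:
        reranker: CrossEncoderConfig = field(default_factory=CrossEncoderConfig)
        retriever: RetrieverConfig = field(default_factory=RetrieverConfig)
        generator: GeneratorConfig = field(default_factory=GeneratorConfig)
        stats: StatsConfig = field(default_factory=StatsConfig)

    # Opting in to the cross-encoder re-ranker (values illustrative):
    cfg = PipelineConfigSketch()
    cfg.reranker.enable = True
    cfg.reranker.first_stage_k = 50   # candidates handed to the cross-encoder
    cfg.reranker.final_k = 5          # contexts kept after re-scoring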
evaluation/pipeline.py
CHANGED
@@ -8,7 +8,7 @@ from .config import PipelineConfig
 from .retrievers import bm25, dense, hybrid
 from .generators.hf_generator import HFGenerator
 from .retrievers.base import Retriever, Context
-
+from .rerankers.cross_encoder import CrossEncoderReranker
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
@@ -22,6 +22,15 @@ class RAGPipeline:
         self.generator = HFGenerator(
             model_name=cfg.generator.model_name, device=cfg.generator.device
         )
+        self.reranker = (
+            CrossEncoderReranker(
+                cfg.reranker.model_name,
+                device=cfg.reranker.device,
+                max_len=cfg.reranker.max_length,
+            )
+            if cfg.reranker.enable
+            else None
+        )
 
     # ---------------------------------------------------------------------
     # Public API
@@ -59,9 +68,15 @@
 
     def _retrieve(self, question: str) -> List[Context]:
         logger.info("Retrieving top‑%d passages", self.cfg.retriever.top_k)
-
-
-
+        k_first = self.cfg.reranker.first_stage_k if self.reranker else self.cfg.retriever.top_k
+        initial = self.retriever.retrieve(question, top_k=k_first)
+
+        if self.reranker:
+            final_k = self.cfg.reranker.final_k or self.cfg.retriever.top_k
+            logger.info("Re-ranking %d docs with cross-encoder ...", len(initial))
+            initial = self.reranker.rerank(question, initial, k=final_k)
+
+        return initial
 
     def _generate(self, question: str, contexts: List[Context]) -> str:
         texts = [c.text for c in contexts]
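With the re-ranker enabled, _retrieve becomes a two-stage fetch: the retriever returns reranker.first_stage_k candidates, the cross-encoder re-scores them, and the list is cut down to reranker.final_k (falling back to retriever.top_k when final_k is None). A rough trace of the sizes under the defaults above, assuming retriever.top_k = 5:

    # reranker.enable=True, first_stage_k=50, final_k=None, retriever.top_k=5 (assumed)
    # 1. retriever.retrieve(question, top_k=50)     -> 50 Context objects
    # 2. reranker.rerank(question, contexts, k=5)   -> 5 best-scoring Context objects
    # With reranker.enable=False the retriever is queried with top_k=5 directly.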
evaluation/rerankers/cross_encoder.py
ADDED
@@ -0,0 +1,34 @@
+"""Cross-encoder re-ranker built on SentenceTransformers CrossEncoder."""
+
+from __future__ import annotations
+from typing import List
+import logging
+
+from sentence_transformers import CrossEncoder
+import torch
+
+from evaluation.retrievers.base import Context
+
+logger = logging.getLogger(__name__)
+
+
+class CrossEncoderReranker:
+    """Re-scores (query, passage) pairs and returns top-k Contexts."""
+
+    def __init__(self, model_name: str, device: str = "cpu", max_len: int = 512):
+        self.model = CrossEncoder(model_name, device=device)
+        self.max_len = max_len
+        logger.info("Cross-encoder '%s' loaded on %s", model_name, device)
+
+    def rerank(self, query: str, contexts: List[Context], k: int) -> List[Context]:
+        pairs = [[query, c.text] for c in contexts]
+        scores = self.model.predict(
+            pairs,
+            convert_to_numpy=True,
+            show_progress_bar=False,
+            max_length=self.max_len,
+        )
+        for c, s in zip(contexts, scores):
+            c.score = float(s)
+        contexts.sort(key=lambda c: c.score, reverse=True)
+        return contexts[:k]
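One caveat worth checking against the pinned sentence-transformers version: in the releases I am aware of, max_length is an argument of the CrossEncoder constructor rather than of predict(), so the predict(..., max_length=...) call above may raise a TypeError. A sketch of the alternative placement (model name and pair are illustrative):

    from sentence_transformers import CrossEncoder

    # Truncation length set at construction time; predict() only gets batching/output options.
    model = CrossEncoder(
        "cross-encoder/ms-marco-MiniLM-L-6-v2",
        device="cpu",
        max_length=512,
    )
    pairs = [["what is retrieval-augmented generation?", "some candidate passage"]]
    scores = model.predict(pairs, convert_to_numpy=True, show_progress_bar=False)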
evaluation/stats/__init__.py
ADDED
@@ -0,0 +1,18 @@
+"""Statistical utilities for analysis scripts."""
+
+from .correlation import corr_ci
+from .significance import wilcoxon_signed_rank, holm_bonferroni
+from .robustness import (
+    delta_metric,
+    conditional_failure_rate,
+    chi2_error_propagation,
+)
+
+__all__ = [
+    "corr_ci",
+    "wilcoxon_signed_rank",
+    "holm_bonferroni",
+    "delta_metric",
+    "conditional_failure_rate",
+    "chi2_error_propagation",
+]
evaluation/stats/correlation.py
ADDED
@@ -0,0 +1,81 @@
+"""Correlation helpers for RQ1 and RQ2 analyses.
+
+Functions here wrap `scipy.stats` to compute non‑parametric correlations
+(Spearman ρ, Kendall τ) with optional bootstrap confidence intervals so
+results can be reported with uncertainty estimates.
+
+Typical usage
+-------------
+>>> from evaluation.stats.correlation import corr_ci
+>>> rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000)
+"""
+
+from __future__ import annotations
+from typing import Tuple, Sequence, Literal
+
+import numpy as np
+from scipy import stats
+
+
+Method = Literal["spearman", "kendall"]
+
+
+def _correlate(x: Sequence[float], y: Sequence[float], method: Method):
+    if method == "spearman":
+        return stats.spearmanr(x, y, nan_policy="omit")
+    if method == "kendall":
+        return stats.kendalltau(x, y, nan_policy="omit")
+    raise ValueError(method)
+
+
+def corr_ci(
+    x: Sequence[float],
+    y: Sequence[float],
+    *,
+    method: Method = "spearman",
+    n_boot: int = 1000,
+    ci: float = 0.95,
+    random_state: int | None = None,
+) -> Tuple[float, Tuple[float, float], float]:
+    """Correlation coefficient, bootstrap CI, and p‑value.
+
+    Parameters
+    ----------
+    x, y
+        Numeric sequences of equal length.
+    method
+        'spearman' or 'kendall'.
+    n_boot
+        Number of bootstrap resamples for the CI. 0 → no CI.
+    ci
+        Confidence level (e.g. 0.95 for 95 %).
+    random_state
+        Seed for reproducibility.
+
+    Returns
+    -------
+    r : float
+        Correlation coefficient.
+    (lo, hi) : Tuple[float, float]
+        Lower/upper CI bounds. ``(nan, nan)`` if *n_boot* == 0.
+    p : float
+        Two‑sided p‑value from the correlation test.
+    """
+    x = np.asarray(x, dtype=float)
+    y = np.asarray(y, dtype=float)
+    if x.shape != y.shape:
+        raise ValueError("x and y must have the same length")
+
+    r, p = _correlate(x, y, method)
+
+    if n_boot == 0:
+        return float(r), (float("nan"), float("nan")), float(p)
+
+    rng = np.random.default_rng(random_state)
+    bs = []
+    for _ in range(n_boot):
+        idx = rng.integers(0, len(x), len(x))
+        r_bs, _ = _correlate(x[idx], y[idx], method)
+        bs.append(r_bs)
+    lo, hi = np.percentile(bs, [(1 - ci) / 2 * 100, (1 + ci) / 2 * 100])
+    return float(r), (float(lo), float(hi)), float(p)
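A small self-contained usage sketch (synthetic data, seed fixed for reproducibility): for ci=0.95 the reported bounds are the 2.5th and 97.5th percentiles of the bootstrap distribution.

    import numpy as np
    from evaluation.stats.correlation import corr_ci

    rng = np.random.default_rng(0)
    x = rng.normal(size=200)
    y = 0.7 * x + rng.normal(scale=0.5, size=200)   # illustrative noisy linear relation

    rho, (lo, hi), p = corr_ci(x, y, method="spearman", n_boot=1000, random_state=0)
    print(f"rho={rho:.3f}  95% CI=({lo:.3f}, {hi:.3f})  p={p:.2e}")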
evaluation/stats/robustness.py
ADDED
@@ -0,0 +1,79 @@
+"""Robustness & sensitivity analysis helpers (RQ3 / RQ4)."""
+
+from __future__ import annotations
+from typing import Sequence, Tuple, Mapping, Any
+import numpy as np
+from scipy import stats
+
+
+def delta_metric(
+    orig: Sequence[float], perturbed: Sequence[float]
+) -> Tuple[float, float]:
+    """Return mean delta and Cohen's *d* effect size.
+
+    *orig* and *perturbed* must be paired metric values (same length).
+    """
+    orig = np.asarray(orig, dtype=float)
+    perturbed = np.asarray(perturbed, dtype=float)
+    if orig.shape != perturbed.shape:
+        raise ValueError("orig and perturbed must have the same length")
+    delta = np.mean(perturbed - orig)
+    pooled_sd = np.sqrt(((orig.std(ddof=1) ** 2) + (perturbed.std(ddof=1) ** 2)) / 2)
+    cohen_d = delta / pooled_sd if pooled_sd else float("nan")
+    return float(delta), float(cohen_d)
+
+
+def conditional_failure_rate(
+    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
+) -> Mapping[str, float]:
+    """Fraction of hallucinations conditional on retrieval failure.
+
+    Returns
+    -------
+    dict with keys:
+        p_hallucination_given_error
+        p_hallucination_given_success
+    """
+    import numpy as np
+
+    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
+    hallucinations = np.asarray(hallucinations, dtype=bool)
+
+    if retrieval_errors.shape != hallucinations.shape:
+        raise ValueError("Input lengths differ")
+
+    err_idx = retrieval_errors
+    succ_idx = ~retrieval_errors
+
+    def _rate(mask):
+        if mask.sum() == 0:
+            return float("nan")
+        return float(hallucinations[mask].mean())
+
+    return {
+        "p_hallucination_given_error": _rate(err_idx),
+        "p_hallucination_given_success": _rate(succ_idx),
+    }
+
+
+def chi2_error_propagation(
+    retrieval_errors: Sequence[bool], hallucinations: Sequence[bool]
+):
+    """Chi‑square test of independence between retrieval error and hallucination."""
+    from scipy.stats import chi2_contingency
+
+    retrieval_errors = np.asarray(retrieval_errors, dtype=bool)
+    hallucinations = np.asarray(hallucinations, dtype=bool)
+
+    table = [
+        [
+            ((~retrieval_errors) & (~hallucinations)).sum(),
+            ((~retrieval_errors) & hallucinations).sum(),
+        ],
+        [
+            (retrieval_errors & (~hallucinations)).sum(),
+            (retrieval_errors & hallucinations).sum(),
+        ],
+    ]
+    chi2, p, dof, expected = chi2_contingency(table)
+    return dict(chi2=chi2, p=p, dof=dof, expected=expected, table=table)
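For intuition, a tiny worked example with hand-made labels (purely illustrative). The 2x2 table assembled by chi2_error_propagation has retrieval success/error as rows and no-hallucination/hallucination as columns.

    from evaluation.stats.robustness import conditional_failure_rate, chi2_error_propagation

    retrieval_errors = [False, False, False, True, True, True]   # illustrative flags
    hallucinations   = [False, False, True,  True, True, False]

    rates = conditional_failure_rate(retrieval_errors, hallucinations)
    # P(hallucination | retrieval error)   = 2/3 ≈ 0.67
    # P(hallucination | retrieval success) = 1/3 ≈ 0.33

    out = chi2_error_propagation(retrieval_errors, hallucinations)
    # out["table"] == [[2, 1], [1, 2]]   rows: success/error, cols: ok/hallucination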
evaluation/stats/significance.py
ADDED
@@ -0,0 +1,38 @@
+"""Significance testing utilities (Wilcoxon, Holm‑Bonferroni)."""
+
+from __future__ import annotations
+from typing import Sequence, Mapping, List, Tuple
+
+import numpy as np
+from scipy import stats
+
+
+def wilcoxon_signed_rank(
+    x: Sequence[float],
+    y: Sequence[float],
+    *,
+    alternative: str = "two-sided",
+):
+    """Paired Wilcoxon signed‑rank test (wrapper)."""
+    return stats.wilcoxon(x, y, alternative=alternative)
+
+
+def holm_bonferroni(pvalues: Mapping[str, float]) -> Mapping[str, float]:
+    """Holm‑Bonferroni correction for multiple hypotheses.
+
+    Parameters
+    ----------
+    pvalues : dict
+        Mapping from *name* → raw p‑value.
+
+    Returns
+    -------
+    dict
+        Mapping from *name* → adjusted p‑value.
+    """
+    m = len(pvalues)
+    sorted_items: List[Tuple[str, float]] = sorted(pvalues.items(), key=lambda kv: kv[1])
+    adjusted = {}
+    for i, (name, p) in enumerate(sorted_items, start=1):
+        adjusted[name] = min((m - i + 1) * p, 1.0)
+    return adjusted
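The loop above applies the Holm step-down factor (m - i + 1) but not the monotonicity step of the textbook procedure, under which each adjusted p-value is at least as large as the one before it in sorted order. A sketch of that extra step, keeping the same dict-in/dict-out interface (function name is illustrative):

    import numpy as np
    from typing import Mapping

    def holm_bonferroni_monotone(pvalues: Mapping[str, float]) -> Mapping[str, float]:
        """Holm adjustment with the cumulative-maximum (monotonicity) step."""
        m = len(pvalues)
        items = sorted(pvalues.items(), key=lambda kv: kv[1])
        raw = np.array([p for _, p in items])
        factors = np.arange(m, 0, -1)                  # m, m-1, ..., 1
        adj = np.minimum(np.maximum.accumulate(factors * raw), 1.0)
        return {name: float(a) for (name, _), a in zip(items, adj)}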
tests/test_reranker.py
ADDED
@@ -0,0 +1,7 @@
+def test_rerank():
+    from evaluation.rerankers.cross_encoder import CrossEncoderReranker
+    from evaluation.retrievers.base import Context
+    rer = CrossEncoderReranker("cross-encoder/ms-marco-MiniLM-L-6-v2", device="cpu")
+    dummy = [Context(id=str(i), text=f"text {i}", score=1.0) for i in range(5)]
+    out = rer.rerank("dummy query", dummy, k=3)
+    assert len(out) == 3
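Note that this test loads the real cross-encoder, so the first run downloads the ms-marco MiniLM checkpoint from the Hugging Face Hub and needs network access. One common way to keep it out of fast CI runs is a marker (marker name is illustrative and would need registering in the pytest configuration):

    import pytest

    @pytest.mark.slow   # deselect with: pytest -m "not slow"
    def test_rerank():
        ...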
tests/test_stats.py
ADDED
@@ -0,0 +1,48 @@
+from evaluation.stats import (
+    corr_ci,
+    wilcoxon_signed_rank,
+    holm_bonferroni,
+    delta_metric,
+    conditional_failure_rate,
+    chi2_error_propagation,
+)
+import numpy as np
+
+
+def test_corr_ci():
+    x = np.arange(10)
+    y = np.arange(10)
+    r, (lo, hi), p = corr_ci(x, y, n_boot=100)
+    assert r > 0.9 and lo <= r <= hi
+
+
+def test_wilcoxon():
+    x = [1, 2, 3]
+    y = [1, 3, 5]
+    stat, p = wilcoxon_signed_rank(x, y)
+    assert p < 0.2  # not exact, just smoke
+
+
+def test_holm():
+    raw = {"a": 0.01, "b": 0.04, "c": 0.20}
+    adj = holm_bonferroni(raw)
+    assert adj["a"] <= raw["a"]
+
+
+def test_delta_metric():
+    d, eff = delta_metric([1, 2, 3], [2, 3, 4])
+    assert d > 0 and eff > 0
+
+
+def test_conditional_failure_rate():
+    r = [True, False, True, False]
+    h = [True, False, False, True]
+    rates = conditional_failure_rate(r, h)
+    assert "p_hallucination_given_error" in rates
+
+
+def test_chi2():
+    r = [True, True, False, False]
+    h = [True, False, True, False]
+    out = chi2_error_propagation(r, h)
+    assert out["dof"] == 1