""" Launch with: streamlit run scripts/dashboard.py Relies on the directory structure produced by analysis.py: outputs/grid///{aggregates.yaml, rq1.yaml, ...} """ from __future__ import annotations import json import yaml from pathlib import Path import pandas as pd import streamlit as st import matplotlib.pyplot as plt BASE_DIR = Path("outputs/grid") METRIC_KEY = "rag_score" # --------------------------------------------------------------------- Sidebar st.sidebar.title("RAG-Eval Dashboard") if not BASE_DIR.exists(): st.sidebar.error(f"Folder {BASE_DIR} not found – run experiments first.") st.stop() datasets = sorted([p.name for p in BASE_DIR.iterdir() if p.is_dir()]) dataset = st.sidebar.selectbox("Dataset", datasets) conf_dir = BASE_DIR / dataset configs = sorted([p.name for p in conf_dir.iterdir() if p.is_dir()]) sel_cfgs = st.sidebar.multiselect("Configurations", configs, default=configs) if not sel_cfgs: st.warning("Select at least one configuration.") st.stop() # ---------------------------------------------------------------- Load helpers def _yaml(path: Path): return yaml.safe_load(path.read_text()) def _jsonl(path: Path): return [json.loads(l) for l in path.read_text().splitlines()] # ---------------------------------------------------------------- Main view st.title(f"Dataset: {dataset}") # ── Aggregated metrics table ──────────────────────────────────────────────── agg = {c: _yaml(conf_dir / c / "aggregates.yaml") for c in sel_cfgs} agg_df = pd.DataFrame(agg).T st.subheader("Aggregated metrics") st.dataframe(agg_df, use_container_width=True) # ── Bar chart of rag_score means ──────────────────────────────────────────── st.subheader(f"Mean {METRIC_KEY}") fig, ax = plt.subplots() agg_df[METRIC_KEY].plot.bar(ax=ax) ax.set_ylabel(METRIC_KEY) ax.set_ylim(0, 1) st.pyplot(fig) # ── Scatter MRR vs Correctness per config ─────────────────────────────────── st.subheader("MRR vs Human Correctness") cols = st.columns(len(sel_cfgs)) for col, cfg in zip(cols, sel_cfgs): rows = _jsonl(conf_dir / cfg / "results.jsonl") x = [r["metrics"].get("mrr", float("nan")) for r in rows] y = [1 if r.get("human_correct") else 0 for r in rows] fig, ax = plt.subplots() ax.scatter(x, y, alpha=0.5) ax.set(title=cfg, xlabel="MRR", ylabel="Correct?") col.pyplot(fig) # ── Pairwise Wilcoxon-Holm table (rag_score) ──────────────────────────────── wh_path = conf_dir / "wilcoxon_rag_holm.yaml" if wh_path.exists(): st.subheader("Pairwise Wilcoxon-Holm (rag_score)") wh_df = pd.Series(_yaml(wh_path), name="p_adj").to_frame() st.dataframe(wh_df) else: st.info("Wilcoxon table not found – run_grid_experiments.py computes it.") # ── Research-question YAMLs ───────────────────────────────────────────────── rq_tabs = st.tabs([f"{cfg}" for cfg in sel_cfgs]) for tab, cfg in zip(rq_tabs, sel_cfgs): with tab: for rq in ("rq1", "rq2", "rq3", "rq4"): path = conf_dir / cfg / f"{rq}.yaml" if path.exists(): st.markdown(f"**{rq.upper()}**") st.json(_yaml(path)) else: st.markdown(f"*{rq.upper()} – not available*") # ── Raw results download ──────────────────────────────────────────────────── st.sidebar.subheader("Download") for cfg in sel_cfgs: st.sidebar.download_button( label=f"{cfg} results.jsonl", data=(conf_dir / cfg / "results.jsonl").read_bytes(), file_name=f"{dataset}_{cfg}_results.jsonl", mime="application/jsonl", )