"""
Streamlit dashboard for RAG-Eval grid experiments.

Launch with:

    streamlit run scripts/dashboard.py

Relies on the directory structure produced by analysis.py:

    outputs/grid/<dataset>/<config>/{aggregates.yaml, rq1.yaml, ...}
"""
from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import streamlit as st
import yaml
# Root of the grid-experiment outputs (written by analysis.py).
BASE_DIR = Path("outputs/grid")
# Aggregate metric shown in the headline bar chart.
METRIC_KEY = "rag_score"

# --------------------------------------------------------------------- Sidebar
st.sidebar.title("RAG-Eval Dashboard")

if not BASE_DIR.exists():
    # Nothing to display until at least one experiment run has produced output.
    st.sidebar.error(f"Folder {BASE_DIR} not found — run experiments first.")
    st.stop()

# Each sub-directory of BASE_DIR is one dataset.
datasets = sorted(p.name for p in BASE_DIR.iterdir() if p.is_dir())
dataset = st.sidebar.selectbox("Dataset", datasets)

# Each sub-directory of the dataset folder is one experiment configuration.
conf_dir = BASE_DIR / dataset
configs = sorted(p.name for p in conf_dir.iterdir() if p.is_dir())
sel_cfgs = st.sidebar.multiselect("Configurations", configs, default=configs)
if not sel_cfgs:
    st.warning("Select at least one configuration.")
    st.stop()
# ---------------------------------------------------------------- Load helpers | |
def _yaml(path: Path):
    """Load and return the contents of a YAML file at *path*.

    Reads as UTF-8 explicitly so parsing does not depend on the
    platform's default locale encoding.
    """
    return yaml.safe_load(path.read_text(encoding="utf-8"))
def _jsonl(path: Path) -> list:
    """Load a JSON-Lines file at *path* and return a list of records.

    Blank lines (e.g. a stray trailing newline or separator) are skipped
    instead of raising ``json.JSONDecodeError``. Reads as UTF-8 explicitly.
    """
    text = path.read_text(encoding="utf-8")
    return [json.loads(line) for line in text.splitlines() if line.strip()]
# ---------------------------------------------------------------- Main view | |
# ---------------------------------------------------------------- Main view
st.title(f"Dataset: {dataset}")

# -- Aggregated metrics table ------------------------------------------------
# One row per selected configuration; columns are whatever metric keys the
# per-config aggregates.yaml files contain.
agg = {}
for cfg in sel_cfgs:
    agg[cfg] = _yaml(conf_dir / cfg / "aggregates.yaml")
agg_df = pd.DataFrame(agg).T

st.subheader("Aggregated metrics")
st.dataframe(agg_df, use_container_width=True)

# -- Bar chart of rag_score means --------------------------------------------
st.subheader(f"Mean {METRIC_KEY}")
fig, ax = plt.subplots()
agg_df[METRIC_KEY].plot.bar(ax=ax)
ax.set_ylabel(METRIC_KEY)
ax.set_ylim(0, 1)  # the metric is bounded to [0, 1]
st.pyplot(fig)
# ββ Scatter MRR vs Correctness per config βββββββββββββββββββββββββββββββββββ | |
# -- Scatter MRR vs Correctness per config -----------------------------------
# One column (and one scatter plot) per selected configuration.
st.subheader("MRR vs Human Correctness")
cols = st.columns(len(sel_cfgs))
for col, cfg in zip(cols, sel_cfgs):
    records = _jsonl(conf_dir / cfg / "results.jsonl")
    mrr_vals = []
    correct_vals = []
    for rec in records:
        # Missing MRR becomes NaN so matplotlib simply omits the point.
        mrr_vals.append(rec["metrics"].get("mrr", float("nan")))
        # Binary human label: truthy -> 1, falsy/absent -> 0.
        correct_vals.append(1 if rec.get("human_correct") else 0)
    fig, ax = plt.subplots()
    ax.scatter(mrr_vals, correct_vals, alpha=0.5)
    ax.set(title=cfg, xlabel="MRR", ylabel="Correct?")
    col.pyplot(fig)
# ββ Pairwise Wilcoxon-Holm table (rag_score) ββββββββββββββββββββββββββββββββ | |
# -- Pairwise Wilcoxon-Holm table (rag_score) --------------------------------
# Produced dataset-wide (not per-config) by run_grid_experiments.py.
wh_path = conf_dir / "wilcoxon_rag_holm.yaml"
if wh_path.exists():
    st.subheader("Pairwise Wilcoxon-Holm (rag_score)")
    # YAML maps config-pair labels to Holm-adjusted p-values.
    wh_df = pd.Series(_yaml(wh_path), name="p_adj").to_frame()
    st.dataframe(wh_df)
else:
    st.info("Wilcoxon table not found — run_grid_experiments.py computes it.")
# ββ Research-question YAMLs βββββββββββββββββββββββββββββββββββββββββββββββββ | |
# -- Research-question YAMLs -------------------------------------------------
# One tab per configuration, each showing whichever rq*.yaml files exist.
rq_tabs = st.tabs(list(sel_cfgs))
for tab, cfg in zip(rq_tabs, sel_cfgs):
    with tab:
        for rq in ("rq1", "rq2", "rq3", "rq4"):
            path = conf_dir / cfg / f"{rq}.yaml"
            if path.exists():
                st.markdown(f"**{rq.upper()}**")
                st.json(_yaml(path))
            else:
                st.markdown(f"*{rq.upper()} — not available*")
# ββ Raw results download ββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
# -- Raw results download ----------------------------------------------------
# Sidebar button per configuration offering the raw per-example results file.
st.sidebar.subheader("Download")
for cfg in sel_cfgs:
    payload = (conf_dir / cfg / "results.jsonl").read_bytes()
    st.sidebar.download_button(
        label=f"{cfg} results.jsonl",
        data=payload,
        file_name=f"{dataset}_{cfg}_results.jsonl",
        mime="application/jsonl",
    )