import json
import re
from pathlib import Path
from typing import Dict, List

import gradio as gr
import numpy as np
import pandas as pd

DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())

# Predefined parameter bins for filtering (in billions)
PARAM_BIN_CHOICES: list[str] = [
    "<10B",
    "10B-25B",
    "25B-50B",
    "50B-100B",
    "100B+",
]

def load_leaderboard_json(json_path: str) -> pd.DataFrame:
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return pd.DataFrame()
    try:
        with open(path, "r", encoding="utf-8") as f:
            records = json.load(f)
        # records should be a list of dicts; fall back gracefully if it is a dict
        if isinstance(records, dict):
            # If the list is wrapped, try to unwrap common keys
            for key in ["data", "records", "items", "leaderboard"]:
                if key in records and isinstance(records[key], list):
                    records = records[key]
                    break
        if not isinstance(records, list):
            return pd.DataFrame()
        return pd.DataFrame.from_records(records)
    except Exception:
        return pd.DataFrame()

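# A minimal sketch of the expected leaderboard.json payload (assumed shape, not
# taken from the actual data file): a list of per-model records, optionally
# wrapped under a key such as "data". Any column besides Model/Provider/Parameters
# is treated as a metric column downstream; the record below is purely illustrative.
#
#   [
#     {"Model": "example-model", "Provider": "ExampleOrg", "Parameters": 8000000000,
#      "src_clf": 0.71, "sum_rag": 0.64, "sum_rag_v2": 0.58}
#   ]
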
def _hex_from_rgb(r: float, g: float, b: float) -> str:
    r = max(0, min(255, int(round(r))))
    g = max(0, min(255, int(round(g))))
    b = max(0, min(255, int(round(b))))
    return f"#{r:02x}{g:02x}{b:02x}"


def _bg_color_from_t(t: float) -> str:
    t = max(0.0, min(1.0, float(t)))
    # Interpolate green (small) -> red (large)
    g_start = (34, 197, 94)  # #22c55e
    r_end = (239, 68, 68)    # #ef4444
    r = g_start[0] + t * (r_end[0] - g_start[0])
    g = g_start[1] + t * (r_end[1] - g_start[1])
    b = g_start[2] + t * (r_end[2] - g_start[2])
    return f"background-color: {_hex_from_rgb(r, g, b)}"

def _style_parameters(series: pd.Series) -> list[str]:
    s = pd.to_numeric(series, errors="coerce")
    s_pos = s[s > 0]
    if s_pos.empty:
        return [""] * len(series)
    logs = np.log10(s_pos)
    lmin = float(np.nanmin(logs))
    lmax = float(np.nanmax(logs))
    if not np.isfinite(lmin) or not np.isfinite(lmax):
        return [""] * len(series)
    colors: list[str] = []
    for v in s:
        if pd.isna(v) or v <= 0:
            colors.append("")
        else:
            lv = np.log10(v)
            if lmax == lmin:
                t = 0.0
            else:
                t = (lv - lmin) / (lmax - lmin)
            colors.append(_bg_color_from_t(float(t)))
    return colors

def _format_value_minimal(v) -> str:
    if pd.isna(v):
        return ""
    if isinstance(v, str):
        return v
    if isinstance(v, (int, np.integer)):
        return str(int(v))
    if isinstance(v, (float, np.floating)):
        if abs(v - round(v)) < 1e-9:
            return str(int(round(v)))
        s = f"{float(v):.6f}".rstrip("0").rstrip(".")
        return s
    # Fallback for any other type so the Styler always receives a string
    return str(v)

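# Formatting behaviour on a few representative values (illustrative only):
# >>> _format_value_minimal(3.0)
# '3'
# >>> _format_value_minimal(0.50)
# '0.5'
# >>> _format_value_minimal(float("nan"))
# ''
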
def _prepare_dataframe(json_path: str) -> pd.DataFrame:
    df = load_leaderboard_json(json_path)
    if df.empty:
        return df
    # Remove columns not to be displayed per schema (Quantization, any *_time or time)
    columns_to_exclude = [
        c for c in df.columns
        if c.lower() == "quantization" or c.lower().endswith("_time") or c.lower() == "time"
    ]
    df = df.drop(columns=columns_to_exclude, errors="ignore")
    # Normalize types
    if "Parameters" in df.columns:
        df["Parameters"] = pd.to_numeric(df["Parameters"], errors="coerce")
    if "src_clf" in df.columns:
        df["src_clf"] = pd.to_numeric(df["src_clf"], errors="coerce")
    # Compute avg_score across numeric metric columns (excluding meta columns)
    meta_cols = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    metric_candidates = [c for c in df.columns if c not in meta_cols]
    if metric_candidates:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in metric_candidates})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
    # Sort by avg_score descending by default if present
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")
    # Preferred column order: meta columns first, then avg_score, then the metrics
    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]
    # Insert a visual separator column after Parameters to split meta from scores
    if "Parameters" in df.columns:
        sep_col_name = "—"
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, sep_col_name, "")
    return df

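# After preparation the columns are laid out roughly as:
#   Model | Provider | Parameters | — | avg_score | <metric columns...>
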
def _param_bins_mask(param_series: pd.Series, selected_bins: list[str] | None) -> pd.Series:
    """Build a boolean mask for the selected parameter bins.

    Bins are in billions: <10B, 10B-25B, 25B-50B, 50B-100B, 100B+.
    Raw parameter counts are automatically converted to billions if the values look large.
    """
    if not selected_bins:
        return pd.Series(True, index=param_series.index)
    # Ensure numeric
    s = pd.to_numeric(param_series, errors="coerce")
    # Heuristic: if the median is large, assume raw parameter counts and convert to billions
    median_val = s.dropna().median()
    if pd.notna(median_val) and median_val > 1e6:
        s_b = s / 1e9
    else:
        s_b = s
    bin_map: dict[str, tuple[float, float | None]] = {
        "<10B": (0.0, 10.0),
        "10B-25B": (10.0, 25.0),
        "25B-50B": (25.0, 50.0),
        "50B-100B": (50.0, 100.0),
        "100B+": (100.0, None),
    }
    mask = pd.Series(False, index=s_b.index)
    for label in selected_bins:
        if label not in bin_map:
            continue
        low, high = bin_map[label]
        if high is None:
            mask |= s_b >= low
        else:
            mask |= (s_b >= low) & (s_b < high)
    # Drop NaNs from consideration
    mask &= s_b.notna()
    return mask

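# Illustrative use with hypothetical values: raw parameter counts are converted
# to billions before binning, so 8e9 lands in "<10B" while 70e9 does not.
# >>> _param_bins_mask(pd.Series([8e9, 70e9]), ["<10B"]).tolist()
# [True, False]
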
def _apply_filters(df: pd.DataFrame, name_filter: str | None, param_bins: list[str] | None) -> pd.DataFrame:
    if df.empty:
        return df
    mask = pd.Series(True, index=df.index)
    # Name filter (case-insensitive, literal substring match on Model)
    if name_filter:
        col = "Model" if "Model" in df.columns else None
        if col is not None:
            name_mask = df[col].astype(str).str.contains(name_filter, case=False, na=False, regex=False)
            mask &= name_mask
    # Parameter bins filter
    if param_bins and "Parameters" in df.columns:
        bins_mask = _param_bins_mask(df["Parameters"], param_bins)
        mask &= bins_mask
    return df[mask]

def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | None = None) -> object:
    df = _prepare_dataframe(json_path)
    # Drop rows missing values for the required tasks (guard against absent columns)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in df.columns]
    if required_cols:
        df = df.dropna(subset=required_cols)
    # Apply filters if provided
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
    # Produce a styled DataFrame (log-scale colors on Parameters, minimal-decimal formatting)
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        # Empty DataFrame fallback
        table_value = pd.DataFrame()
    return table_value

def build_view_only(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return only the table, without updating the exclude-tasks control.

    This prevents infinite loops when called from change handlers.
    """
    df = _prepare_dataframe(json_path)
    # Determine all task-like columns (before exclusion)
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]
    excluded_set = set(excluded_tasks or [])
    # Keep only excluded tasks that actually exist
    excluded_valid = [t for t in excluded_set if t in tasks_all]
    included_tasks = [c for c in tasks_all if c not in excluded_set]
    # Drop rows missing values for required tasks (only those that are included)
    required_cols = [c for c in ["src_clf", "sum_rag", "sum_rag_v2"] if c in included_tasks]
    if required_cols:
        df = df.dropna(subset=required_cols)
    # Apply filters
    df = _apply_filters(df, name_filter=name_filter, param_bins=param_bins)
    # Remove excluded task columns from the view
    if excluded_valid:
        df = df.drop(columns=[c for c in excluded_valid if c in df.columns], errors="ignore")
    # Recompute avg_score from only the included tasks.
    # Determine the tasks present in df after exclusion.
    meta_cols_after = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    current_metric_cols = [c for c in df.columns if c not in meta_cols_after]
    # Drop the existing avg_score before recomputation
    if "avg_score" in df.columns:
        df = df.drop(columns=["avg_score"])  # will be re-added below
    if current_metric_cols:
        numeric_df = pd.DataFrame({c: pd.to_numeric(df[c], errors="coerce") for c in current_metric_cols})
        df["avg_score"] = numeric_df.mean(axis=1, skipna=True).round(2)
    else:
        # No metrics left; fill avg_score with NaN to keep the schema consistent
        df["avg_score"] = np.nan
    # Sort and reorder columns, mirroring _prepare_dataframe
    if "avg_score" in df.columns:
        df = df.sort_values(by="avg_score", ascending=False, na_position="last")
    preferred_order = [c for c in ["Model", "Provider", "Parameters"] if c in df.columns]
    remaining_cols = [c for c in df.columns if c not in preferred_order]
    if "avg_score" in remaining_cols:
        remaining_cols = ["avg_score"] + [c for c in remaining_cols if c != "avg_score"]
    if preferred_order:
        df = df[preferred_order + remaining_cols]
    # Ensure the separator column exists right after Parameters
    if "Parameters" in df.columns and "—" not in df.columns:
        insert_at = df.columns.get_loc("Parameters") + 1
        df.insert(insert_at, "—", "")
    # Style for display
    if isinstance(df, pd.DataFrame) and not df.empty:
        styler = df.style
        if "Parameters" in df.columns:
            styler = styler.apply(_style_parameters, subset=["Parameters"])  # type: ignore
        styler = styler.format(_format_value_minimal)
        table_value: object = styler
    else:
        table_value = pd.DataFrame()
    return table_value

def initialize_tasks_choices(json_path: str):
    """Initialize the task choices for the exclude-tasks checkbox group.

    This is kept separate from table building to avoid infinite loops.
    """
    df = _prepare_dataframe(json_path)
    # Determine all task-like columns
    meta_cols_base = [c for c in ["Model", "Provider", "Parameters", "—", "avg_score"] if c in df.columns]
    tasks_all = [c for c in df.columns if c not in meta_cols_base]
    # Return an update for the exclude-tasks checkbox with just the choices, no value change
    tasks_update = gr.update(choices=tasks_all)
    return tasks_update

def build_view_and_tasks(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return the table and an update object for the exclude-tasks control.

    Used only on initial load to set up the choices.
    """
    table_value = build_view_only(json_path, name_filter, param_bins, excluded_tasks)
    tasks_update = initialize_tasks_choices(json_path)
    return table_value, tasks_update

# ---------------------- Failure cases handling ----------------------

def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
    """Load failure cases from a JSON file.

    Returns a dict mapping model_id -> list of failure cases.
    """
    path = Path(json_path)
    if not path.exists() or not path.is_file():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            return data
        return {}
    except Exception:
        return {}

def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
    """Get the list of available models from the failure cases data."""
    return sorted(failure_cases_data.keys()) if failure_cases_data else []

def render_failure_cases(
    json_path: str,
    selected_model: str,
) -> str:
    """Render the failure cases for the selected model as a JSON string."""
    if not selected_model:
        return "{}"
    failure_cases_data = load_failure_cases_json(json_path)
    if selected_model not in failure_cases_data:
        return "{}"
    cases = failure_cases_data[selected_model]
    if not cases:
        return "[]"
    # Surface the numeric judge score embedded in the free-text reasoning, if present
    for case in cases:
        score = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if score:
            case["score"] = float(score.group(1))
    # Return a formatted JSON string
    return json.dumps(cases, ensure_ascii=False, indent=2)

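# Assumed failure_cases.json layout (illustrative; only the "reasoning" key is
# actually relied on above): a mapping from model id to a list of case dicts, e.g.
#   {"some-model": [{"question": "...", "answer": "...", "reasoning": "... 0.5 ..."}]}
# When a decimal number appears in "reasoning", it is surfaced as a "score" field.
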
def initialize_failure_cases_dropdown(json_path: str):
    """Initialize the model dropdown for failure cases."""
    failure_cases_data = load_failure_cases_json(json_path)
    models = get_available_models(failure_cases_data)
    if models:
        return gr.update(choices=models, value=models[0])
    return gr.update(choices=[], value=None)

def ui() -> gr.Blocks:
    with gr.Blocks(title="Model Leaderboard") as demo:
        gr.Markdown("""
        ### Polish Legal RAG Leaderboard
        Explore and compare model performance on Polish legal QA tasks.
        """)
        # Fixed internal state for the JSON paths
        json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
        failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)

        with gr.Tabs():
            with gr.Tab("Leaderboard"):
                gr.Markdown("""
                - Use the filters to narrow by name and parameter bins.
                - Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
                - Click column headers to sort; the table updates automatically as filters change.
                """)
                # Filters
                with gr.Row():
                    name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
                    param_bins_in = gr.CheckboxGroup(
                        label="Parameter bins",
                        choices=PARAM_BIN_CHOICES,
                        value=[],
                        info="Select one or more bins",
                    )
                    excluded_tasks_in = gr.CheckboxGroup(
                        label="Exclude tasks",
                        choices=[],
                        value=[],
                        info="Select tasks to hide; all are shown by default",
                    )
                # Non-interactive so the pandas Styler is respected; header sorting remains available
                leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)

                demo.load(
                    fn=build_view_and_tasks,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out, excluded_tasks_in],
                )
                # Recompute the table on filter changes
                name_filter_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                param_bins_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                excluded_tasks_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
gr.Markdown(""" | |
### Methodology | |
- **`src_clf`**: Source classification of a fragment. | |
- **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score. | |
- **`sum_rag_v2`**: Advanced legal reasoning dataset with multiple question types: | |
- **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements | |
- **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions | |
""") | |
gr.Markdown(""" | |
### Notes | |
- GPT-5-nano sometimes fails to answer, responding with an empty string. | |
- GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall. | |
- Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0). | |
- Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for `src_clf` task. | |
""") | |
gr.Markdown(""" | |
### Language and RAG prompt | |
- All tasks, passages and questions are in Polish. The models are instructed to answer in Polish. | |
```text | |
Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości. | |
Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko. | |
<relevant_info> | |
{passages} | |
</relevant_info> | |
Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści. | |
Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach. | |
To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>. | |
``` | |
""") | |
with gr.Tab("Failure Cases"): | |
gr.Markdown(""" | |
### Failure Cases Analysis | |
Explore failure cases by model to understand where models struggle. | |
""") | |
with gr.Row(): | |
model_dropdown = gr.Dropdown( | |
label="Select Model", | |
choices=[], | |
value=None, | |
info="Choose a model to view its failure cases" | |
) | |
failure_cases_out = gr.Code( | |
label="Failure Cases", | |
language="json", | |
interactive=False, | |
lines=15 | |
) | |
# Initialize dropdown and load data | |
demo.load( | |
fn=initialize_failure_cases_dropdown, | |
inputs=[failure_cases_path_state], | |
outputs=[model_dropdown], | |
) | |
# Update failure cases when model selection changes | |
model_dropdown.change( | |
fn=render_failure_cases, | |
inputs=[failure_cases_path_state, model_dropdown], | |
outputs=[failure_cases_out], | |
) | |
return demo | |
if __name__ == "__main__": | |
app = ui() | |
app.queue().launch(server_name="0.0.0.0", server_port=7860, show_api=False) |