Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,8 +4,12 @@ from pathlib import Path
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import gradio as gr
|
|
|
|
|
|
|
7 |
|
8 |
DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
|
|
|
9 |
|
10 |
# Predefined parameter bins for filtering (in billions)
|
11 |
PARAM_BIN_CHOICES: list[str] = [
|
@@ -228,17 +232,15 @@ def build_view(json_path: str, name_filter: str = "", param_bins: list[str] | No
|
|
228 |
return table_value
|
229 |
|
230 |
|
231 |
-
def
|
232 |
json_path: str,
|
233 |
name_filter: str = "",
|
234 |
param_bins: list[str] | None = None,
|
235 |
excluded_tasks: list[str] | None = None,
|
236 |
):
|
237 |
-
"""Return the table
|
238 |
-
|
239 |
-
|
240 |
-
by excluding meta columns and helper columns.
|
241 |
-
- The table excludes the selected tasks and recomputes avg_score from only the included tasks.
|
242 |
"""
|
243 |
df = _prepare_dataframe(json_path)
|
244 |
|
@@ -305,97 +307,234 @@ def build_view_and_tasks(
|
|
305 |
else:
|
306 |
table_value = pd.DataFrame()
|
307 |
|
308 |
-
|
309 |
-
|
310 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
311 |
return table_value, tasks_update
|
312 |
|
313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
def ui() -> gr.Blocks:
|
315 |
with gr.Blocks(title="Model Leaderboard") as demo:
|
316 |
gr.Markdown("""
|
317 |
### Polish Legal RAG Leaderboard
|
318 |
|
319 |
Explore and compare model performance on Polish legal QA tasks.
|
320 |
-
- Use filters to narrow by name and parameter bins.
|
321 |
-
- Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
|
322 |
-
- Click column headers to sort; data updates automatically as filters change.
|
323 |
""")
|
324 |
|
325 |
-
# Fixed internal state for the JSON
|
326 |
json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
|
327 |
-
|
328 |
-
|
329 |
-
with gr.
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
399 |
|
400 |
return demo
|
401 |
|
|
|
4 |
import pandas as pd
|
5 |
import numpy as np
|
6 |
import gradio as gr
|
7 |
+
from typing import Dict, List
|
8 |
+
|
9 |
+
import re
|
10 |
|
11 |
DEFAULT_OUTPUT_JSON = str((Path(__file__).parent / "leaderboard.json").resolve())
|
12 |
+
DEFAULT_FAILURE_CASES_JSON = str((Path(__file__).parent / "failure_cases.json").resolve())
|
13 |
|
14 |
# Predefined parameter bins for filtering (in billions)
|
15 |
PARAM_BIN_CHOICES: list[str] = [
|
|
|
232 |
return table_value
|
233 |
|
234 |
|
235 |
+
def build_view_only(
|
236 |
json_path: str,
|
237 |
name_filter: str = "",
|
238 |
param_bins: list[str] | None = None,
|
239 |
excluded_tasks: list[str] | None = None,
|
240 |
):
|
241 |
+
"""Return only the table without updating the exclude-tasks control.
|
242 |
+
|
243 |
+
This prevents infinite loops when called from change handlers.
|
|
|
|
|
244 |
"""
|
245 |
df = _prepare_dataframe(json_path)
|
246 |
|
|
|
307 |
else:
|
308 |
table_value = pd.DataFrame()
|
309 |
|
310 |
+
return table_value
|
311 |
+
|
312 |
|
313 |
+
def initialize_tasks_choices(json_path: str):
    """Build a Gradio update carrying the task choices for the exclude-tasks checkbox.

    Kept separate from table construction so that refreshing the checkbox
    choices does not re-trigger the table change handlers (avoids infinite
    loops).
    """
    frame = _prepare_dataframe(json_path)

    # Everything that is not a known meta column counts as a task metric.
    known_meta = {"Model", "Provider", "Parameters", "—", "avg_score"}
    task_columns = [col for col in frame.columns if col not in known_meta]

    # Refresh only the choice list; the currently selected values are untouched.
    return gr.update(choices=task_columns)
|
328 |
+
|
329 |
+
|
330 |
+
def build_view_and_tasks(
    json_path: str,
    name_filter: str = "",
    param_bins: list[str] | None = None,
    excluded_tasks: list[str] | None = None,
):
    """Return the filtered leaderboard table and an update for the
    exclude-tasks control.

    Intended for the initial page load only, where the table and the
    checkbox choices must be populated in a single callback.
    """
    return (
        build_view_only(json_path, name_filter, param_bins, excluded_tasks),
        initialize_tasks_choices(json_path),
    )
|
344 |
|
345 |
|
346 |
+
# ---------------------- Failure cases handling ----------------------
|
347 |
+
|
348 |
+
def load_failure_cases_json(json_path: str) -> Dict[str, List[Dict[str, str]]]:
    """Load failure cases from a JSON file.

    Args:
        json_path: Path to the failure-cases JSON file.

    Returns:
        Dict mapping model_id -> list of failure cases. Best effort, never
        raises: an empty dict is returned when the file is missing,
        unreadable, not valid JSON, or not a JSON object at the top level.
    """
    path = Path(json_path)
    # is_file() already implies existence, so one check replaces the old
    # `not exists() or not is_file()` pair.
    if not path.is_file():
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    # Narrowed from a bare `except Exception`: only I/O and parse errors are
    # expected here; anything else would be a real bug worth surfacing.
    except (OSError, ValueError):
        return {}
    # Only a top-level object has the model_id -> cases shape callers expect.
    return data if isinstance(data, dict) else {}
|
364 |
+
|
365 |
+
|
366 |
+
def get_available_models(failure_cases_data: Dict[str, List[Dict[str, str]]]) -> List[str]:
    """Return the model ids present in *failure_cases_data*, sorted alphabetically."""
    if not failure_cases_data:
        # Covers both an empty dict and a None/falsy input.
        return []
    # Iterating the dict yields its keys; .keys() is unnecessary.
    return sorted(failure_cases_data)
|
369 |
+
|
370 |
+
|
371 |
+
def render_failure_cases(
    json_path: str,
    selected_model: str
) -> str:
    """Render the failure cases of *selected_model* as a formatted JSON string.

    Args:
        json_path: Path to the failure-cases JSON file.
        selected_model: Model id chosen in the dropdown (may be empty or None).

    Returns:
        "{}" when no model is selected or the model is unknown, "[]" when the
        model has no recorded cases, otherwise the pretty-printed list of
        cases, each augmented with a numeric "score" field when one can be
        extracted from its "reasoning" text.
    """
    if not selected_model:
        return "{}"

    failure_cases_data = load_failure_cases_json(json_path)

    if selected_model not in failure_cases_data:
        return "{}"

    cases = failure_cases_data[selected_model]
    if not cases:
        return "[]"

    # Best effort: surface the first decimal number (e.g. "0.5") found in the
    # judge's reasoning as a structured "score" field.
    for case in cases:
        # .get() guards against malformed cases missing the "reasoning" key;
        # the original indexed directly and raised KeyError on such data.
        match = re.search(r"(\d+\.\d+)", case.get("reasoning", ""))
        if match:
            case["score"] = float(match.group(1))

    # Return formatted JSON string
    return json.dumps(cases, ensure_ascii=False, indent=2)
|
395 |
+
|
396 |
+
|
397 |
+
def initialize_failure_cases_dropdown(json_path: str):
    """Create the initial state of the failure-cases model dropdown.

    Populates the choices from the failure-cases file and preselects the
    first model; an empty file yields an empty dropdown with no selection.
    """
    data = load_failure_cases_json(json_path)
    models = get_available_models(data)

    # When the model list is empty, choices=[] and value=None fall out
    # naturally, so both branches of the original collapse into one update.
    first = models[0] if models else None
    return gr.update(choices=models, value=first)
|
406 |
+
|
407 |
+
|
408 |
def ui() -> gr.Blocks:
    """Assemble the Gradio app: a leaderboard tab and a failure-cases tab.

    The leaderboard tab wires three filter controls to table rebuilds; the
    failure-cases tab wires a model dropdown to a JSON viewer. Returns the
    un-launched Blocks object.
    """
    with gr.Blocks(title="Model Leaderboard") as demo:
        gr.Markdown("""
### Polish Legal RAG Leaderboard

Explore and compare model performance on Polish legal QA tasks.
""")

        # Fixed internal state for the JSON paths
        json_path_state = gr.State(value=DEFAULT_OUTPUT_JSON)
        failure_cases_path_state = gr.State(value=DEFAULT_FAILURE_CASES_JSON)

        with gr.Tabs():
            with gr.Tab("Leaderboard"):
                gr.Markdown("""
- Use filters to narrow by name and parameter bins.
- Use "Exclude tasks" to hide selected metrics; avg_score updates accordingly.
- Click column headers to sort; data updates automatically as filters change.
""")

                # Filters
                with gr.Row():
                    name_filter_in = gr.Textbox(label="Filter by name", placeholder="e.g. llama", lines=1)
                    param_bins_in = gr.CheckboxGroup(
                        label="Parameter bins",
                        choices=PARAM_BIN_CHOICES,
                        value=[],
                        info="Select one or more bins"
                    )
                    excluded_tasks_in = gr.CheckboxGroup(
                        label="Exclude tasks",
                        choices=[],
                        value=[],
                        info="Select tasks to hide; all are shown by default",
                    )

                # Non-interactive so Pandas Styler is respected; header sorting remains available
                leaderboard_out = gr.Dataframe(label="Leaderboard", interactive=False)

                # Initial load populates both the table and the exclude-tasks
                # choices in one callback (build_view_and_tasks).
                demo.load(
                    fn=build_view_and_tasks,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out, excluded_tasks_in],
                )

                # Recompute table on filter changes; build_view_only does not
                # touch the checkbox, which prevents change-handler loops.
                name_filter_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                param_bins_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )
                excluded_tasks_in.change(
                    fn=build_view_only,
                    inputs=[json_path_state, name_filter_in, param_bins_in, excluded_tasks_in],
                    outputs=[leaderboard_out],
                )

                gr.Markdown("""
### Methodology
- **`src_clf`**: Source classification of a fragment.
- **`sum_rag`**: RAG-style QA strictly from provided passages. Answers are graded by a judge gpt-4o model on a 0-2 scale; we report F1 score.
- **`sum_rag_v2`**: Like `sum_rag` but harder - with longer, augmented contexts and strict deranged negatives built. Same generation and 0-2 judging; we report F1 score.
- **`rag_v2`**: Advanced legal reasoning dataset with multiple question types:
  - **Contradiction resolution**: Questions about resolving contradictions or ambiguities within legal texts, requiring analysis of conflicting rules or statements
  - **Legal inference**: Questions testing whether hypothetical situations meet specific legal criteria, requiring identification of legal prerequisites and exceptions
""")
                gr.Markdown("""
### Notes
- GPT-5-nano sometimes fails to answer, responding with an empty string.
- GPT-4o has 100% precision on the `sum_rag_v2` task, but seems to have surprisingly low recall.
- Llama-3-8B-Instruct family has limited context length (3 - 8k, 3.1 - 16k), so if the passages are too long, the model will not be able to answer (and will thus be given score 0).
- Gaius-Lex v0.8 model is based on Llama-3-8B-Instruct with RoPE scaling = 2.0. It wasn't trained for `src_clf` task.
""")
                gr.Markdown("""
### Language and RAG prompt
- All tasks, passages and questions are in Polish. The models are instructed to answer in Polish.

```text
Odpowiadasz tylko i wyłącznie po polsku. Twoim zadaniem jest odpowiedzieć na pytanie na podstawie źródeł. Podaj wszystkie interesujące informacje oraz argumenty i cytaty na dowód ich prawdziwości.
Nie odpowiadaj na podstawie własnej wiedzy. Jeżeli w źródłach nie ma wymaganych informacji, powiedz to krótko.
<relevant_info>
{passages}
</relevant_info>

Odpowiedz na pytanie: `{question}` tylko i wyłącznie na podstawie źródeł. Nie odbiegaj od ich treści.
Jeżeli odpowiedź nie jest zawarta w <relevant_info>, odpowiedz że nie ma odpowiedzi w źródłach.
To jest kluczowe, że odpowiedź musi być oparta wyłącznie na <relevant_info>.
```
""")

            with gr.Tab("Failure Cases"):
                gr.Markdown("""
### Failure Cases Analysis

Explore failure cases by model to understand where models struggle.
""")

                with gr.Row():
                    model_dropdown = gr.Dropdown(
                        label="Select Model",
                        choices=[],
                        value=None,
                        info="Choose a model to view its failure cases"
                    )

                failure_cases_out = gr.Code(
                    label="Failure Cases",
                    language="json",
                    interactive=False,
                    lines=15
                )

                # Initialize dropdown and load data
                demo.load(
                    fn=initialize_failure_cases_dropdown,
                    inputs=[failure_cases_path_state],
                    outputs=[model_dropdown],
                )

                # Update failure cases when model selection changes
                model_dropdown.change(
                    fn=render_failure_cases,
                    inputs=[failure_cases_path_state, model_dropdown],
                    outputs=[failure_cases_out],
                )

    return demo
|
540 |
|