rzanoli committed on
Commit 7a90675 · 1 Parent(s): ea6af72

Small Changes

Files changed (4)
  1. app.py +42 -11
  2. src/about.py +1 -1
  3. src/leaderboard/read_evals.py +10 -4
  4. src/tasks.py +1 -1
app.py CHANGED
@@ -12,13 +12,17 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 # Define task metadata (icons, names, descriptions)
-TASK_METADATA = {
+TASK_METADATA_MULTIPLECHOICE = {
     "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
     "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
     "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
     "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
     "WIC": {"icon": "🔀", "name": "Word in Context", "tooltip": ""},
-    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""},
+    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
+}
+
+# Define task metadata (icons, names, descriptions)
+TASK_METADATA_GENERATIVE = {
     "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
     "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
     "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
@@ -40,16 +44,16 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in field_list],
-        select_columns=SelectColumns(
-            default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
-            cant_deselect=[c.name for c in field_list if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
-            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)")
+            # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -109,7 +113,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
         # Main leaderboard tab
-        with gr.TabItem("🏅 EVALITA-LLM Benchmark"):
+        with gr.TabItem("🏅 Benchmark"):
 
             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
@@ -121,8 +125,13 @@ with demo:
         with gr.TabItem("📝 About"):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
+        # About tab
+        with gr.TabItem("║", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
         # Task-specific leaderboards
-        for task, metadata in TASK_METADATA.items():
+        for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
+
             with gr.TabItem(f"{metadata['icon']}{task}"):
 
                 task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
@@ -134,6 +143,28 @@ with demo:
                     hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
                 )
 
+        # About tab
+        with gr.TabItem("│", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
+        # Task-specific leaderboards
+        for task, metadata in TASK_METADATA_GENERATIVE.items():
+            with gr.TabItem(f"{metadata['icon']}{task}"):
+                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
+                gr.Markdown(task_description, elem_classes="markdown-text")
+
+                leaderboard = init_leaderboard(
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
+                                                   f"{task} Best Prompt": "Best Prompt",
+                                                   f"{task} Best Prompt Id": "Best Prompt Id",
+                                                   task: "Combined Performance"}),
+                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
+                                       'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if
+                                    col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average',
+                                                'Best Prompt', 'Best Prompt Id']]
+                )
+
         # Citation section
         with gr.Accordion("📙 Citation", open=False):
             gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
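Note on the generative-task loop added above: each per-task tab reuses the same generic column names ('Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id') by renaming the task-specific columns of LEADERBOARD_DF before passing the frame to init_leaderboard. A minimal sketch of that rename, on a hypothetical two-row frame (the column names follow the diff; the scores are made up):

    import pandas as pd

    task = "SU"  # Summarization, one of the TASK_METADATA_GENERATIVE keys

    # Hypothetical slice of LEADERBOARD_DF with made-up scores.
    df = pd.DataFrame({
        "Model": ["model-a", "model-b"],
        "FS": ["0-shot", "5-shot"],
        "SU": [41.2, 37.8],                  # per-task combined performance
        "SU Prompt Average": [38.5, 35.1],
        "SU Best Prompt": [44.0, 40.2],
        "SU Best Prompt Id": [3, 1],
    })

    # Same rename pattern as in the loop above: task-specific columns
    # become the generic names every task tab displays.
    renamed = df.rename(columns={
        f"{task} Prompt Average": "Prompt Average",
        f"{task} Best Prompt": "Best Prompt",
        f"{task} Best Prompt Id": "Best Prompt Id",
        task: "Combined Performance",
    })

    visible = ["FS", "Model", "Combined Performance", "Prompt Average",
               "Best Prompt", "Best Prompt Id"]
    print(renamed[visible])

Columns outside `visible` are what the loop passes as hidden_columns.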
src/about.py CHANGED
@@ -91,7 +91,7 @@ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
+Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) **all tasks are native Italian**, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well-established **multiple-choice** tasks (6 tasks), the benchmark includes **generative** tasks (4 tasks), enabling more natural interaction with LLMs; (iii) **all tasks are evaluated against multiple prompts**, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
src/leaderboard/read_evals.py CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 
 import dateutil
 import numpy as np
+from typing import Dict, Union
 
 #from get_model_info import num_params
 from src.display.formatting import make_clickable_model
@@ -22,8 +23,8 @@ class EvalResult:
     org: str
     model: str
     revision: str  # commit hash, "" if main
-    results: dict
-    average_CPS: str
+    results: Dict[str, Union[float, int]]  # float or int
+    average_CPS: float
     fewshot: int
     fewshot_type: FewShotType = FewShotType.Unknown
     weight_type: WeightType = WeightType.Original  # Original or Adapter
@@ -42,7 +43,9 @@ class EvalResult:
 
         config = data.get("config")
 
-        average_CPS = f"{data.get('average_CPS'):.2f}"
+        #average_CPS = f"{data.get('average_CPS'):.2f}"
+        # Read average_CPS as a float
+        average_CPS = float(data.get('average_CPS', 0.0))  # 0.0 as the default value
 
         num_fewshot = config.get("num_fewshot", 0)  # Default to 0
         try:
@@ -92,7 +95,10 @@ class EvalResult:
             if "Best Prompt Id" in task.col_name:
                 results[task.benchmark] = int(v[task.metric_type][-1:])
             else:
-                results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
+                #results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
+                results[task.benchmark] = float(v[task.metric_type])
+                #value = float(v[task.metric_type])
+                #results[task.benchmark] = round(value, 2)  # Round to 2 decimals
 
         return self(
             eval_name=result_key,
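For context on the read_evals.py change: the earlier code stored results and average_CPS as pre-formatted strings, which sort lexicographically in the leaderboard ("9.50" lands after "10.00"), whereas plain floats keep sorting and filtering numeric and leave rounding to the display layer. A minimal sketch of the difference, using made-up scores and the same defaulting pattern as in the diff:

    # Made-up scores; only the string-vs-float behaviour matters here.
    scores = ["9.50", "10.00", "61.37"]

    print(sorted(scores))                    # ['10.00', '61.37', '9.50'] (lexicographic)
    print(sorted(float(s) for s in scores))  # [9.5, 10.0, 61.37] (numeric)

    # Defaulting pattern from the diff: fall back to 0.0 if the key is missing.
    data = {"average_CPS": 61.37}            # hypothetical JSON payload
    average_CPS = float(data.get("average_CPS", 0.0))
    print(f"{average_CPS:.2f}")              # two-decimal formatting stays a display concern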
src/tasks.py CHANGED
@@ -16,7 +16,7 @@ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
+Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation.
 """
 
 #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"