rzanoli committed on
Commit 7a90675 · 1 Parent(s): ea6af72

Small Changes

Files changed (4)
  1. app.py +42 -11
  2. src/about.py +1 -1
  3. src/leaderboard/read_evals.py +10 -4
  4. src/tasks.py +1 -1
app.py CHANGED
@@ -12,13 +12,17 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
 
 # Define task metadata (icons, names, descriptions)
-TASK_METADATA = {
+TASK_METADATA_MULTIPLECHOICE = {
     "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
     "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
     "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
     "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
     "WIC": {"icon": "🔀", "name": "Word in Context", "tooltip": ""},
-    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""},
+    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""}
+}
+
+# Define task metadata (icons, names, descriptions)
+TASK_METADATA_GENERATIVE = {
     "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
     "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
     "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
@@ -40,16 +44,16 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in field_list],
-        select_columns=SelectColumns(
-            default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
-            cant_deselect=[c.name for c in field_list if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
-            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)")
+            # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -109,7 +113,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
 
         # Main leaderboard tab
-        with gr.TabItem("🏅 EVALITA-LLM Benchmark"):
+        with gr.TabItem("🏅 Benchmark"):
 
             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
@@ -121,8 +125,13 @@ with demo:
         with gr.TabItem("📝 About"):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
+        # About tab
+        with gr.TabItem("║", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
         # Task-specific leaderboards
-        for task, metadata in TASK_METADATA.items():
+        for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
+
             with gr.TabItem(f"{metadata['icon']}{task}"):
 
                 task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
@@ -134,6 +143,28 @@ with demo:
                     hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
                 )
 
+        # About tab
+        with gr.TabItem("│", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
+        # Task-specific leaderboards
+        for task, metadata in TASK_METADATA_GENERATIVE.items():
+            with gr.TabItem(f"{metadata['icon']}{task}"):
+                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
+                gr.Markdown(task_description, elem_classes="markdown-text")
+
+                leaderboard = init_leaderboard(
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
+                                                   f"{task} Best Prompt": "Best Prompt",
+                                                   f"{task} Best Prompt Id": "Best Prompt Id",
+                                                   task: "Combined Performance"}),
+                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
+                                       'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if
+                                    col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average',
+                                                'Best Prompt', 'Best Prompt Id']]
+                )
+
         # Citation section
         with gr.Accordion("📙 Citation", open=False):
             gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
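Note on the generative-task loop added above: each per-task tab reuses the same generic column names ('Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id') by renaming the task-specific columns of LEADERBOARD_DF before passing the frame to init_leaderboard. A minimal sketch of that rename, on a hypothetical two-row frame (the column names follow the diff; the scores are made up):

    import pandas as pd

    task = "SU"  # Summarization, one of the TASK_METADATA_GENERATIVE keys

    # Hypothetical slice of LEADERBOARD_DF with made-up scores.
    df = pd.DataFrame({
        "Model": ["model-a", "model-b"],
        "FS": ["0-shot", "5-shot"],
        "SU": [41.2, 37.8],                  # per-task combined performance
        "SU Prompt Average": [38.5, 35.1],
        "SU Best Prompt": [44.0, 40.2],
        "SU Best Prompt Id": [3, 1],
    })

    # Same rename pattern as in the loop above: task-specific columns
    # become the generic names every task tab displays.
    renamed = df.rename(columns={
        f"{task} Prompt Average": "Prompt Average",
        f"{task} Best Prompt": "Best Prompt",
        f"{task} Best Prompt Id": "Best Prompt Id",
        task: "Combined Performance",
    })

    visible = ["FS", "Model", "Combined Performance", "Prompt Average",
               "Best Prompt", "Best Prompt Id"]
    print(renamed[visible])

Columns outside `visible` are what the loop passes as hidden_columns.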
src/about.py CHANGED
@@ -91,7 +91,7 @@ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
+Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) **all tasks are native Italian**, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well-established **multiple-choice** tasks (6 tasks), the benchmark includes **generative** tasks (4 tasks), enabling more natural interaction with LLMs; (iii) **all tasks are evaluated against multiple prompts**, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
src/leaderboard/read_evals.py CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass
 
 import dateutil
 import numpy as np
+from typing import Dict, Union
 
 #from get_model_info import num_params
 from src.display.formatting import make_clickable_model
@@ -22,8 +23,8 @@ class EvalResult:
     org: str
     model: str
     revision: str  # commit hash, "" if main
-    results: dict
-    average_CPS: str
+    results: Dict[str, Union[float, int]]  # float or int
+    average_CPS: float
     fewshot: int
     fewshot_type: FewShotType = FewShotType.Unknown
     weight_type: WeightType = WeightType.Original  # Original or Adapter
@@ -42,7 +43,9 @@ class EvalResult:
 
         config = data.get("config")
 
-        average_CPS = f"{data.get('average_CPS'):.2f}"
+        #average_CPS = f"{data.get('average_CPS'):.2f}"
+        # Read average_CPS as a float
+        average_CPS = float(data.get('average_CPS', 0.0))  # 0.0 as the default value
 
         num_fewshot = config.get("num_fewshot", 0)  # Default to 0
         try:
@@ -92,7 +95,10 @@ class EvalResult:
             if "Best Prompt Id" in task.col_name:
                 results[task.benchmark] = int(v[task.metric_type][-1:])
             else:
-                results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
+                #results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
+                results[task.benchmark] = float(v[task.metric_type])
+                #value = float(v[task.metric_type])
+                #results[task.benchmark] = round(value, 2)  # Round to 2 decimals
 
         return self(
             eval_name=result_key,
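For context on the read_evals.py change: the earlier code stored results and average_CPS as pre-formatted strings, which sort lexicographically in the leaderboard ("9.50" lands after "10.00"), whereas plain floats keep sorting and filtering numeric and leave rounding to the display layer. A minimal sketch of the difference, using made-up scores and the same defaulting pattern as in the diff:

    # Made-up scores; only the string-vs-float behaviour matters here.
    scores = ["9.50", "10.00", "61.37"]

    print(sorted(scores))                    # ['10.00', '61.37', '9.50'] (lexicographic)
    print(sorted(float(s) for s in scores))  # [9.5, 10.0, 61.37] (numeric)

    # Defaulting pattern from the diff: fall back to 0.0 if the key is missing.
    data = {"average_CPS": 61.37}            # hypothetical JSON payload
    average_CPS = float(data.get("average_CPS", 0.0))
    print(f"{average_CPS:.2f}")              # two-decimal formatting stays a display concern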
src/tasks.py CHANGED
@@ -16,7 +16,7 @@ TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
+Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation.
 """
 
 #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"