Small Changes
- app.py +42 -11
- src/about.py +1 -1
- src/leaderboard/read_evals.py +10 -4
- src/tasks.py +1 -1
app.py
CHANGED
@@ -12,13 +12,17 @@ from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval

 # Define task metadata (icons, names, descriptions)
-
+TASK_METADATA_MULTIPLECHOICE = {
     "TE": {"icon": "π", "name": "Textual Entailment", "tooltip": ""},
     "SA": {"icon": "π", "name": "Sentiment Analysis", "tooltip": ""},
     "HS": {"icon": "β οΈ", "name": "Hate Speech", "tooltip": ""},
     "AT": {"icon": "π₯", "name": "Admission Test", "tooltip": ""},
     "WIC": {"icon": "π€", "name": "Word in Context", "tooltip": ""},
-    "FAQ": {"icon": "β", "name": "Frequently Asked Questions", "tooltip": ""},
+    "FAQ": {"icon": "β", "name": "Frequently Asked Questions", "tooltip": ""}
+}
+
+# Define task metadata (icons, names, descriptions)
+TASK_METADATA_GENERATIVE = {
     "LS": {"icon": "π", "name": "Lexical Substitution", "tooltip": ""},
     "SU": {"icon": "π", "name": "Summarization", "tooltip": ""},
     "NER": {"icon": "π·οΈ", "name": "Named Entity Recognition", "tooltip": ""},
@@ -40,16 +44,16 @@ def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in field_list],
-        select_columns=SelectColumns(
-            default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
-            cant_deselect=[c.name for c in field_list if c.never_hidden],
-            label="Select Columns to Display:",
-        ),
+        #select_columns=SelectColumns(
+        #    default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
+        #    cant_deselect=[c.name for c in field_list if c.never_hidden],
+        #    label="Select Columns to Display:",
+        #),
         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
         hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
-            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
+            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)")
+            # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
@@ -109,7 +113,7 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:

         # Main leaderboard tab
-        with gr.TabItem("π
+        with gr.TabItem("π Benchmark"):

             leaderboard = init_leaderboard(
                 LEADERBOARD_DF,
@@ -121,8 +125,13 @@ with demo:
         with gr.TabItem("π About"):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

+        # About tab
+        with gr.TabItem("β", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
         # Task-specific leaderboards
-        for task, metadata in
+        for task, metadata in TASK_METADATA_MULTIPLECHOICE.items():
+
             with gr.TabItem(f"{metadata['icon']}{task}"):

                 task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
@@ -134,6 +143,28 @@ with demo:
                     hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
                 )

+        # About tab
+        with gr.TabItem("β", interactive=False):
+            gr.Markdown("", elem_classes="markdown-text")
+
+        # Task-specific leaderboards
+        for task, metadata in TASK_METADATA_GENERATIVE.items():
+            with gr.TabItem(f"{metadata['icon']}{task}"):
+                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
+                gr.Markdown(task_description, elem_classes="markdown-text")
+
+                leaderboard = init_leaderboard(
+                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average",
+                                                   f"{task} Best Prompt": "Best Prompt",
+                                                   f"{task} Best Prompt Id": "Best Prompt Id",
+                                                   task: "Combined Performance"}),
+                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt',
+                                       'Best Prompt Id'],
+                    hidden_columns=[col for col in LEADERBOARD_DF.columns if
+                                    col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average',
+                                                'Best Prompt', 'Best Prompt Id']]
+                )
+
         # Citation section
         with gr.Accordion("π Citation", open=False):
             gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)
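Each task-specific tab hands init_leaderboard a view of LEADERBOARD_DF in which the task-prefixed columns are renamed to the generic names the tab displays, and every other column is hidden. A minimal sketch of that renaming step, using a made-up toy frame rather than the real LEADERBOARD_DF:

import pandas as pd

# Toy stand-in for LEADERBOARD_DF (the real frame is built elsewhere in the app).
df = pd.DataFrame({
    "FS": ["5"],
    "Model": ["example-model"],
    "SU": [63.0],                     # per-task Combined Performance
    "SU Prompt Average": [60.0],
    "SU Best Prompt": [70.0],
    "SU Best Prompt Id": [4],
})

task = "SU"
per_task_df = df.rename(columns={
    f"{task} Prompt Average": "Prompt Average",
    f"{task} Best Prompt": "Best Prompt",
    f"{task} Best Prompt Id": "Best Prompt Id",
    task: "Combined Performance",
})

# These are the columns each task tab keeps; everything else goes to hidden_columns.
print(per_task_df[["FS", "Model", "Combined Performance",
                   "Prompt Average", "Best Prompt", "Best Prompt Id"]])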
src/about.py
CHANGED
@@ -91,7 +91,7 @@ TITLE = """<h1 align="center" id="space-title">π EVALITA-LLM Leaderboard π

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian
+Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) **all tasks are native Italian**, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well-established **multiple-choice** tasks (6 tasks), the benchmark includes **generative** tasks (4 tasks), enabling more natural interaction with LLMs; (iii) **all tasks are evaluated against multiple prompts**, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
 """

 # Which evaluations are you running? how can people reproduce what you have?
src/leaderboard/read_evals.py
CHANGED
@@ -6,6 +6,7 @@ from dataclasses import dataclass

 import dateutil
 import numpy as np
+from typing import Dict, Union

 #from get_model_info import num_params
 from src.display.formatting import make_clickable_model
@@ -22,8 +23,8 @@ class EvalResult:
     org: str
     model: str
     revision: str  # commit hash, "" if main
-    results:
-    average_CPS:
+    results: Dict[str, Union[float, int]]  # float or int
+    average_CPS: float
     fewshot: int
     fewshot_type: FewShotType = FewShotType.Unknown
     weight_type: WeightType = WeightType.Original  # Original or Adapter
@@ -42,7 +43,9 @@ class EvalResult:

         config = data.get("config")

-        average_CPS = f"{data.get('average_CPS'):.2f}"
+        #average_CPS = f"{data.get('average_CPS'):.2f}"
+        # Get average_CPS as a float
+        average_CPS = float(data.get('average_CPS', 0.0))  # 0.0 as the default value

         num_fewshot = config.get("num_fewshot", 0)  # Set the default value to 0
         try:
@@ -92,7 +95,10 @@ class EvalResult:
             if "Best Prompt Id" in task.col_name:
                 results[task.benchmark] = int(v[task.metric_type][-1:])
             else:
-                results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
+                #results[task.benchmark] = f"{v[task.metric_type]:.2f}"  # Ensure two decimals for display
+                results[task.benchmark] = float(v[task.metric_type])
+                #value = float(v[task.metric_type])
+                #results[task.benchmark] = round(value, 2)  # Round to 2 decimals

         return self(
             eval_name=result_key,
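With these changes EvalResult carries numeric scores (float/int) instead of pre-formatted strings, so sorting and aggregation stay exact and rounding happens only at display time. A small illustrative sketch of that pattern; the MiniResult dataclass and the raw dict below are hypothetical stand-ins, not the project's real EvalResult or results JSON:

from dataclasses import dataclass
from typing import Dict, Union

@dataclass
class MiniResult:
    # Simplified stand-in for EvalResult: numeric fields, no display formatting.
    results: Dict[str, Union[float, int]]
    average_CPS: float

raw = {"average_CPS": 61.8437, "TE": 72.456, "TE Best Prompt Id": "p3"}
res = MiniResult(
    results={
        "TE": float(raw["TE"]),                                   # keep the raw float
        "TE Best Prompt Id": int(raw["TE Best Prompt Id"][-1:]),  # e.g. "p3" -> 3
    },
    average_CPS=float(raw.get("average_CPS", 0.0)),  # 0.0 as the default value
)

# Round only when rendering, as the leaderboard display does.
print(f"{res.average_CPS:.2f}", f"{res.results['TE']:.2f}")  # 61.84 72.46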
src/tasks.py
CHANGED
@@ -16,7 +16,7 @@ TITLE = """<h1 align="center" id="space-title">π EVALITA-LLM Leaderboard π

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Evalita-LLM
+Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation.
 """

 #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"