Add the results of GPT-4o for NER, REL and LS
Browse files
app.py
CHANGED
|
@@ -118,7 +118,7 @@ def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
|
|
| 118 |
return fig
|
| 119 |
|
| 120 |
|
| 121 |
-
def boxplot_per_task(dataframe=None, baselines=None):
|
| 122 |
|
| 123 |
#print(dataframe.columns)
|
| 124 |
|
|
@@ -176,6 +176,16 @@ def boxplot_per_task(dataframe=None, baselines=None):
|
|
| 176 |
)
|
| 177 |
'''
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
fig.update_layout(
|
| 180 |
title="Distribution of Model Accuracy by Task",
|
| 181 |
xaxis_title="Task",
|
|
@@ -190,7 +200,8 @@ def boxplot_per_task(dataframe=None, baselines=None):
|
|
| 190 |
fig.add_annotation(
|
| 191 |
text=(
|
| 192 |
"In tasks like TE and SA, models approach the accuracy of supervised <br>"
|
| 193 |
-
"models at EVALITA (dashed line); in NER and REL they remain lower."
|
|
|
|
| 194 |
),
|
| 195 |
xref="paper", yref="paper",
|
| 196 |
x=0.5, y=-0.30,
|
|
@@ -203,12 +214,19 @@ def boxplot_per_task(dataframe=None, baselines=None):
|
|
| 203 |
|
| 204 |
return fig
|
| 205 |
|
| 206 |
-
|
| 207 |
BASELINES = {
|
| 208 |
"TE":71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
|
| 209 |
"LS": 38.82, "SU": 38.91, "NER":88.00, "REL": 62.99
|
| 210 |
}
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
def boxplot_prompts_per_task(dataframe, tasks=None):
|
| 214 |
if tasks is None:
|
|
@@ -690,7 +708,7 @@ with demo:
|
|
| 690 |
# ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs
|
| 691 |
with gr.Row():
|
| 692 |
gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
|
| 693 |
-
gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES), elem_id="boxplot-task")
|
| 694 |
#gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
|
| 695 |
|
| 696 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
|
|
|
| 118 |
return fig
|
| 119 |
|
| 120 |
|
| 121 |
+
def boxplot_per_task(dataframe=None, baselines=None, references=None):
|
| 122 |
|
| 123 |
#print(dataframe.columns)
|
| 124 |
|
|
|
|
| 176 |
)
|
| 177 |
'''
|
| 178 |
|
| 179 |
+
# reference GPT-4o
|
| 180 |
+
if task in references and references[task] is not None:
|
| 181 |
+
fig.add_shape(
|
| 182 |
+
type="line",
|
| 183 |
+
x0=i - 0.3, x1=i + 0.3,
|
| 184 |
+
y0=references[task], y1=references[task],
|
| 185 |
+
line=dict(color="red", width=2, dash="dashdot"),
|
| 186 |
+
xref="x", yref="y"
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
fig.update_layout(
|
| 190 |
title="Distribution of Model Accuracy by Task",
|
| 191 |
xaxis_title="Task",
|
|
|
|
| 200 |
fig.add_annotation(
|
| 201 |
text=(
|
| 202 |
"In tasks like TE and SA, models approach the accuracy of supervised <br>"
|
| 203 |
+
"models at EVALITA (dashed black line); in NER and REL they remain lower. <br>"
|
| 204 |
+
"Dashed red lines indicate GPT-4o results on generative tasks."
|
| 205 |
),
|
| 206 |
xref="paper", yref="paper",
|
| 207 |
x=0.5, y=-0.30,
|
|
|
|
| 214 |
|
| 215 |
return fig
|
| 216 |
|
| 217 |
# EVALITA results: supervised-model accuracy (%) per task, drawn as the
# dashed reference line in boxplot_per_task (see the plot annotation:
# "accuracy of supervised models at EVALITA (dashed line)").
BASELINES = {
    "TE": 71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
    "LS": 38.82, "SU": 38.91, "NER": 88.00, "REL": 62.99,
}
|
| 222 |
|
| 223 |
# GPT-4o reference results (%) on the generative tasks, drawn as dashed
# red lines in boxplot_per_task (passed as its `references` argument).
REFERENCES = {
    "NER": 79.11,
    "REL": 63.32,
    "LS": 59.25,
}
|
| 229 |
+
|
| 230 |
|
| 231 |
def boxplot_prompts_per_task(dataframe, tasks=None):
|
| 232 |
if tasks is None:
|
|
|
|
| 708 |
# ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs
|
| 709 |
with gr.Row():
|
| 710 |
gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
|
| 711 |
+
gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
|
| 712 |
#gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
|
| 713 |
|
| 714 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|