Add the results of GPT-4o for NER, REL and LS
Browse files
app.py
CHANGED
|
@@ -118,7 +118,7 @@ def barplot_mean_few_minus_zero_shot(dataframe, tasks=None):
|
|
| 118 |
return fig
|
| 119 |
|
| 120 |
|
| 121 |
-
def boxplot_per_task(dataframe=None, baselines=None):
|
| 122 |
|
| 123 |
#print(dataframe.columns)
|
| 124 |
|
|
@@ -176,6 +176,16 @@ def boxplot_per_task(dataframe=None, baselines=None):
|
|
| 176 |
)
|
| 177 |
'''
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
fig.update_layout(
|
| 180 |
title="Distribution of Model Accuracy by Task",
|
| 181 |
xaxis_title="Task",
|
|
@@ -190,7 +200,8 @@ def boxplot_per_task(dataframe=None, baselines=None):
|
|
| 190 |
fig.add_annotation(
|
| 191 |
text=(
|
| 192 |
"In tasks like TE and SA, models approach the accuracy of supervised <br>"
|
| 193 |
-
"models at EVALITA (dashed line); in NER and REL they remain lower."
|
|
|
|
| 194 |
),
|
| 195 |
xref="paper", yref="paper",
|
| 196 |
x=0.5, y=-0.30,
|
|
@@ -203,12 +214,19 @@ def boxplot_per_task(dataframe=None, baselines=None):
|
|
| 203 |
|
| 204 |
return fig
|
| 205 |
|
| 206 |
-
|
| 207 |
BASELINES = {
|
| 208 |
"TE":71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
|
| 209 |
"LS": 38.82, "SU": 38.91, "NER":88.00, "REL": 62.99
|
| 210 |
}
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
def boxplot_prompts_per_task(dataframe, tasks=None):
|
| 214 |
if tasks is None:
|
|
@@ -690,7 +708,7 @@ with demo:
|
|
| 690 |
# ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs
|
| 691 |
with gr.Row():
|
| 692 |
gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
|
| 693 |
-
gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES), elem_id="boxplot-task")
|
| 694 |
#gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
|
| 695 |
|
| 696 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
|
|
|
| 118 |
return fig
|
| 119 |
|
| 120 |
|
| 121 |
+
def boxplot_per_task(dataframe=None, baselines=None, references=None):
|
| 122 |
|
| 123 |
#print(dataframe.columns)
|
| 124 |
|
|
|
|
| 176 |
)
|
| 177 |
'''
|
| 178 |
|
| 179 |
+
# reference GPT-4o
|
| 180 |
+
if task in references and references[task] is not None:
|
| 181 |
+
fig.add_shape(
|
| 182 |
+
type="line",
|
| 183 |
+
x0=i - 0.3, x1=i + 0.3,
|
| 184 |
+
y0=references[task], y1=references[task],
|
| 185 |
+
line=dict(color="red", width=2, dash="dashdot"),
|
| 186 |
+
xref="x", yref="y"
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
fig.update_layout(
|
| 190 |
title="Distribution of Model Accuracy by Task",
|
| 191 |
xaxis_title="Task",
|
|
|
|
| 200 |
fig.add_annotation(
|
| 201 |
text=(
|
| 202 |
"In tasks like TE and SA, models approach the accuracy of supervised <br>"
|
| 203 |
+
"models at EVALITA (dashed black line); in NER and REL they remain lower. <br>"
|
| 204 |
+
"Dashed red lines indicate GPT-4o results on generative tasks."
|
| 205 |
),
|
| 206 |
xref="paper", yref="paper",
|
| 207 |
x=0.5, y=-0.30,
|
|
|
|
| 214 |
|
| 215 |
return fig
|
| 216 |
|
| 217 |
# EVALITA results: supervised-model accuracy (%) per task, drawn as the
# dashed reference line in boxplot_per_task (see the plot annotation:
# "accuracy of supervised models at EVALITA (dashed line)").
BASELINES = {
    "TE": 71.00, "SA": 66.38, "HS": 80.88, "AT": 82.40, "WIC": 85.00,
    "LS": 38.82, "SU": 38.91, "NER": 88.00, "REL": 62.99,
}
|
| 222 |
|
| 223 |
# GPT-4o reference results (%) on the generative tasks, drawn as dashed
# red lines in boxplot_per_task (passed as its `references` argument).
REFERENCES = {
    "NER": 79.11,
    "REL": 63.32,
    "LS": 59.25,
}
|
| 229 |
+
|
| 230 |
|
| 231 |
def boxplot_prompts_per_task(dataframe, tasks=None):
|
| 232 |
if tasks is None:
|
|
|
|
| 708 |
# ⬇️ QUI aggiungiamo i grafici subito sotto la barra del titolo e sopra le tabs
|
| 709 |
with gr.Row():
|
| 710 |
gr.Plot(value=line_chart(LEADERBOARD_DF), elem_id="line-chart")
|
| 711 |
+
gr.Plot(value=boxplot_per_task(LEADERBOARD_DF, BASELINES, REFERENCES), elem_id="boxplot-task")
|
| 712 |
#gr.Plot(value=boxplot_prompts_per_task(LEADERBOARD_DF), elem_id="boxplot-prompt-task")
|
| 713 |
|
| 714 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|