Minor changes
Add get_model_info.py to get model information from HuggingFace
- get_model_info.py +1 -1
- src/about.py +0 -10
- src/populate.py +1 -1
- src/tasks.py +0 -10
get_model_info.py
CHANGED
@@ -84,4 +84,4 @@ for filename in os.listdir(input_folder):
     except Exception as e:
         print(f"Error retrieving info for {model_name}: {e}")
 
-print("Process
+print("Process completed!")
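Since only one line of get_model_info.py appears in this diff, here is a minimal sketch of what such a script plausibly does, based on the visible loop and error handling. The folder name, the JSON field, and the printed fields are assumptions; only huggingface_hub's HfApi.model_info() is a real API.

```python
# Sketch of a get_model_info.py-style loop; not the actual file contents.
# `input_folder` and the "model_name" field are assumptions from the diff context.
import json
import os

from huggingface_hub import HfApi  # pip install huggingface_hub

api = HfApi()
input_folder = "eval_requests"  # assumed location of the request files

for filename in os.listdir(input_folder):
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(input_folder, filename)) as f:
        model_name = json.load(f).get("model_name", "")
    try:
        # model_info() queries the Hub for metadata such as downloads, likes and tags
        info = api.model_info(model_name)
        print(f"{model_name}: downloads={info.downloads}, likes={info.likes}")
    except Exception as e:
        print(f"Error retrieving info for {model_name}: {e}")

print("Process completed!")
```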
src/about.py
CHANGED
@@ -64,7 +64,6 @@ class Tasks(Enum):
     task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
     task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
 
-
     '''
     task0 = Task("TextualEntailment", "acc", "Textual Entailment")
     task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best")
@@ -88,18 +87,9 @@ class Tasks(Enum):
     task19 = Task("REL_best", "acc", "REL_best")
     '''
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
-
-# Your leaderboard name
-
-#TITLE = """<h1 align="center" id="space-title">Work in progress!</h1>"""
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">π EVALITA-LLM Leaderboard π</h1>"""
 
-
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
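The Task definition itself is not part of this diff; the sketch below only illustrates the structure implied by the four-argument entries above. The field names (benchmark, metric, accuracy_type, col_name) are assumptions, not the repository's actual definition.

```python
# Assumed shape of the Task entries shown in the src/about.py diff above.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str       # e.g. "relation-extraction_3"
    metric: str          # e.g. "acc"
    accuracy_type: str   # e.g. "best_prompt" or "prompt_id" (assumed meaning)
    col_name: str        # column header shown on the leaderboard


class Tasks(Enum):
    task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
    task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
```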
src/populate.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.
+from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
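The body of get_leaderboard_df is not shown in this diff. The sketch below mirrors the imports visible above and follows the usual Hugging Face leaderboard template; treat the exact steps as an assumption rather than this Space's actual implementation.

```python
# Rough sketch of get_leaderboard_df based on the standard leaderboard template.
import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    # Parse every result file together with its request metadata
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [r.to_dict() for r in raw_data]

    # Build the table, sort by the average score, and keep only fully evaluated rows
    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
```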
src/tasks.py
CHANGED
@@ -22,7 +22,6 @@ Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) o
 #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"
 MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above)."
 
-
 # Tasks Descriptions
 TE_DESCRIPTION = """### Textual Entailment (TE)
 The input are two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
@@ -40,7 +39,6 @@ TE_DESCRIPTION = """### Textual Entailment (TE)
 
 """
 
-
 SA_DESCRIPTION = """### Sentiment Analysis (SA)
 The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.
 
@@ -57,7 +55,6 @@ SA_DESCRIPTION = """### Sentiment Analysis (SA)
 
 """
 
-
 HS_DESCRIPTION = """### Hate Speech (HS)
 The input is a tweet. The model has to determine whether the text contains hateful content directed at specific target groups: immigrants, Muslims, or Roma. The output is a binary classification: hateful or not hateful.
 
@@ -74,7 +71,6 @@ HS_DESCRIPTION = """### Hate Speech (HS)
 
 """
 
-
 AT_DESCRIPTION = """### Admission Tests (AT)
 The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.
 
@@ -107,7 +103,6 @@ WIC_DESCRIPTION = """### Word in Context (WIC)
 
 """
 
-
 FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
 The input is a user query made by customers to the Acquedotto Pugliese service. The model must determine which of the 4 possible answers is the correct response to the question.
 
@@ -124,7 +119,6 @@ FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
 
 """
 
-
 LS_DESCRIPTION = """### Lexical Substitution (LS)
 The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant.
 
@@ -137,7 +131,6 @@ LS_DESCRIPTION = """### Lexical Substitution (LS)
 
 """
 
-
 SU_DESCRIPTION = """### Summarization (SUM)
 The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.
 
@@ -150,7 +143,6 @@ SU_DESCRIPTION = """### Summarization (SUM)
 
 """
 
-
 NER_DESCRIPTION = """### Named Entity Recognition (NER)
 The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.
 
@@ -163,7 +155,6 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER)
 
 """
 
-
 REL_DESCRIPTION = """### Relation Extraction (REL)
 The task involves analyzing clinical text to extract relationships between laboratory test results (e.g., blood pressure) and the tests or procedures that produced them (e.g., blood pressure test).
 
@@ -176,7 +167,6 @@ REL_DESCRIPTION = """### Relation Extraction (REL)
 
 """
 
-
 # Create a dictionary to map task names to their descriptions
 TASK_DESCRIPTIONS = {
     "TE": TE_DESCRIPTION,
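For reference, the Combined Performance formula quoted in MEASURE_DESCRIPTION works out as in the small helper below. It is not part of src/tasks.py, and the example numbers are made up.

```python
# Illustrative helper for the Combined Performance formula from MEASURE_DESCRIPTION.
def combined_performance(best_prompt: float, prompt_average: float) -> float:
    """Penalize the best-prompt accuracy by its gap to the prompt average."""
    return (1 - (best_prompt - prompt_average) / 100) * best_prompt


# e.g. best prompt accuracy 70.0, average over prompts 60.0:
# (1 - (70 - 60) / 100) * 70 = 0.9 * 70 = 63.0
print(combined_performance(70.0, 60.0))  # 63.0
```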