rzanoli committed
Commit 12c62aa · 1 Parent(s): c996d40

Minor changes

Add get_model_info.py to retrieve model information from HuggingFace

Files changed (4)
  1. get_model_info.py +1 -1
  2. src/about.py +0 -10
  3. src/populate.py +1 -1
  4. src/tasks.py +0 -10
get_model_info.py CHANGED
@@ -84,4 +84,4 @@ for filename in os.listdir(input_folder):
     except Exception as e:
         print(f"Error retrieving info for {model_name}: {e}")
 
-print("Process completed1.")
+print("Process completed!")
src/about.py CHANGED
@@ -64,7 +64,6 @@ class Tasks(Enum):
     task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
     task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
 
-
 '''
 task0 = Task("TextualEntailment", "acc", "Textual Entailment")
 task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best")
@@ -88,18 +87,9 @@ class Tasks(Enum):
 task19 = Task("REL_best", "acc", "REL_best")
 '''
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
-
-# Your leaderboard name
-
-#TITLE = """<h1 align="center" id="space-title">Work in progress!</h1>"""
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>"""
 
-
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
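
The Task(...) entries edited above follow the leaderboard template's pattern of a small dataclass wrapped in an Enum, with one entry per reported measure (prompt average, best prompt, best prompt id). A sketch of that structure; the field names are assumptions inferred from the four positional arguments, not the repo's actual definitions:

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results files, e.g. "relation-extraction_3"
    metric: str      # metric read from the results, e.g. "acc"
    field: str       # assumed: which sub-score to surface ("best_prompt", "prompt_id", ...)
    col_name: str    # column header shown in the leaderboard UI

class Tasks(Enum):
    task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
    task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")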
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals_old import get_raw_eval_results
+from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
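
The populate.py change is a one-line import swap from the stale read_evals_old module to read_evals. For context, in the stock leaderboard template get_leaderboard_df builds the display table roughly as below; this is a sketch under that assumption, reusing the names imported at the top of the file, and the repo's actual body may differ.

def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    # Parse every result file into structured records.
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [v.to_dict() for v in raw_data]

    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)

    # Drop models that lack a score for any benchmark column.
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df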
src/tasks.py CHANGED
@@ -22,7 +22,6 @@ Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) o
 #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"
 MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above)."
 
-
 # Tasks Descriptions
 TE_DESCRIPTION = """### Textual Entailment (TE)
 The input are two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
@@ -40,7 +39,6 @@ TE_DESCRIPTION = """### Textual Entailment (TE)
 
 """
 
-
 SA_DESCRIPTION = """### Sentiment Analysis (SA)
 The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.
 
@@ -57,7 +55,6 @@ SA_DESCRIPTION = """### Sentiment Analysis (SA)
 
 """
 
-
 HS_DESCRIPTION = """### Hate Speech (HS)
 The input is a tweet. The model has to determine whether the text contains hateful content directed at specific target groups: immigrants, Muslims, or Roma. The output is a binary classification: hateful or not hateful.
 
@@ -74,7 +71,6 @@ HS_DESCRIPTION = """### Hate Speech (HS)
 
 """
 
-
 AT_DESCRIPTION = """### Admission Tests (AT)
 The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.
 
@@ -107,7 +103,6 @@ WIC_DESCRIPTION = """### Word in Context (WIC)
 
 """
 
-
 FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
 The input is a user query made by customers to the Acquedotto Pugliese service. The model must determine which of the 4 possible answers is the correct response to the question.
 
@@ -124,7 +119,6 @@ FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
 
 """
 
-
 LS_DESCRIPTION = """### Lexical Substitution (LS)
 The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant.
 
@@ -137,7 +131,6 @@ LS_DESCRIPTION = """### Lexical Substitution (LS)
 
 """
 
-
 SU_DESCRIPTION = """### Summarization (SUM)
 The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.
 
@@ -150,7 +143,6 @@ SU_DESCRIPTION = """### Summarization (SUM)
 
 """
 
-
 NER_DESCRIPTION = """### Named Entity Recognition (NER)
 The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.
 
@@ -163,7 +155,6 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER)
 
 """
 
-
 REL_DESCRIPTION = """### Relation Extraction (REL)
 The task involves analyzing clinical text to extract relationships between laboratory test results (e.g., blood pressure) and the tests or procedures that produced them (e.g., blood pressure test).
 
@@ -176,7 +167,6 @@ REL_DESCRIPTION = """### Relation Extraction (REL)
 
 """
 
-
 # Create a dictionary to map task names to their descriptions
 TASK_DESCRIPTIONS = {
     "TE": TE_DESCRIPTION,