Minor changes
Add get_model_info.py to get model information from HuggingFace
- get_model_info.py +1 -1
- src/about.py +0 -10
- src/populate.py +1 -1
- src/tasks.py +0 -10
get_model_info.py
CHANGED
@@ -84,4 +84,4 @@ for filename in os.listdir(input_folder):
     except Exception as e:
         print(f"Error retrieving info for {model_name}: {e}")
 
-print("Process
+print("Process completed!")
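Since only one line of get_model_info.py appears in this diff, here is a minimal sketch of what such a script plausibly does, based on the visible loop and error handling. The folder name, the JSON field, and the printed fields are assumptions; only huggingface_hub's HfApi.model_info() is a real API.

```python
# Sketch of a get_model_info.py-style loop; not the actual file contents.
# `input_folder` and the "model_name" field are assumptions from the diff context.
import json
import os

from huggingface_hub import HfApi  # pip install huggingface_hub

api = HfApi()
input_folder = "eval_requests"  # assumed location of the request files

for filename in os.listdir(input_folder):
    if not filename.endswith(".json"):
        continue
    with open(os.path.join(input_folder, filename)) as f:
        model_name = json.load(f).get("model_name", "")
    try:
        # model_info() queries the Hub for metadata such as downloads, likes and tags
        info = api.model_info(model_name)
        print(f"{model_name}: downloads={info.downloads}, likes={info.likes}")
    except Exception as e:
        print(f"Error retrieving info for {model_name}: {e}")

print("Process completed!")
```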
src/about.py
CHANGED
@@ -64,7 +64,6 @@ class Tasks(Enum):
     task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
     task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
 
-
     '''
     task0 = Task("TextualEntailment", "acc", "Textual Entailment")
     task1 = Task("TextualEntailment_best", "acc", "TextualEntailment Best")
@@ -88,18 +87,9 @@ class Tasks(Enum):
     task19 = Task("REL_best", "acc", "REL_best")
     '''
 
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
-
-# Your leaderboard name
-
-#TITLE = """<h1 align="center" id="space-title">Work in progress!</h1>"""
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">π EVALITA-LLM Leaderboard π</h1>"""
 
-
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
 Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing and innovative features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding issues of translating from Italian and potential cultural biases; (ii) in addition to well established multiple-choice tasks, the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer and objective evaluation.
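The Task definition itself is not part of this diff; the sketch below only illustrates the structure implied by the four-argument entries above. The field names (benchmark, metric, accuracy_type, col_name) are assumptions, not the repository's actual definition.

```python
# Assumed shape of the Task entries shown in the src/about.py diff above.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str       # e.g. "relation-extraction_3"
    metric: str          # e.g. "acc"
    accuracy_type: str   # e.g. "best_prompt" or "prompt_id" (assumed meaning)
    col_name: str        # column header shown on the leaderboard


class Tasks(Enum):
    task39 = Task("relation-extraction_3", "acc", "best_prompt", "REL Best Prompt")
    task40 = Task("relation-extraction_4", "acc", "prompt_id", "REL Best Prompt Id")
```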
src/populate.py
CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.
+from src.leaderboard.read_evals import get_raw_eval_results
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
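The body of get_leaderboard_df is not shown in this diff. The sketch below mirrors the imports visible above and follows the usual Hugging Face leaderboard template; treat the exact steps as an assumption rather than this Space's actual implementation.

```python
# Rough sketch of get_leaderboard_df based on the standard leaderboard template.
import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    # Parse every result file together with its request metadata
    raw_data = get_raw_eval_results(results_path, requests_path)
    all_data_json = [r.to_dict() for r in raw_data]

    # Build the table, sort by the average score, and keep only fully evaluated rows
    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    df = df[cols].round(decimals=2)
    df = df[has_no_nan_values(df, benchmark_cols)]
    return df
```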
src/tasks.py
CHANGED
@@ -22,7 +22,6 @@ Evalita-LLM, a new benchmark designed to evaluate Large Language Models (LLMs) o
 #MEASURE_DESCRIPTION = "Combined Performance = (1 - (Best_Prompt - Prompt_Average) / 100) * Best_Prompt. Prompt Average = accuracy averaged over the six prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above)"
 MEASURE_DESCRIPTION = "**Combined Performance** = (1 - (**Best Prompt** - **Prompt Average**) / 100) * **Best Prompt**. **Prompt Average** = accuracy averaged over the assessed prompts. **Best Prompt** = accuracy of the best prompt. **Prompt ID** = ID of the best prompt (see legend above)."
 
-
 # Tasks Descriptions
 TE_DESCRIPTION = """### Textual Entailment (TE)
 The input are two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.
@@ -40,7 +39,6 @@ TE_DESCRIPTION = """### Textual Entailment (TE)
 
 """
 
-
 SA_DESCRIPTION = """### Sentiment Analysis (SA)
 The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.
 
@@ -57,7 +55,6 @@ SA_DESCRIPTION = """### Sentiment Analysis (SA)
 
 """
 
-
 HS_DESCRIPTION = """### Hate Speech (HS)
 The input is a tweet. The model has to determine whether the text contains hateful content directed at specific target groups: immigrants, Muslims, or Roma. The output is a binary classification: hateful or not hateful.
 
@@ -74,7 +71,6 @@ HS_DESCRIPTION = """### Hate Speech (HS)
 
 """
 
-
 AT_DESCRIPTION = """### Admission Tests (AT)
 The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.
 
@@ -107,7 +103,6 @@ WIC_DESCRIPTION = """### Word in Context (WIC)
 
 """
 
-
 FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
 The input is a user query made by customers to the Acquedotto Pugliese service. The model must determine which of the 4 possible answers is the correct response to the question.
 
@@ -124,7 +119,6 @@ FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ)
 
 """
 
-
 LS_DESCRIPTION = """### Lexical Substitution (LS)
 The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant.
 
@@ -137,7 +131,6 @@ LS_DESCRIPTION = """### Lexical Substitution (LS)
 
 """
 
-
 SU_DESCRIPTION = """### Summarization (SUM)
 The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.
 
@@ -150,7 +143,6 @@ SU_DESCRIPTION = """### Summarization (SUM)
 
 """
 
-
 NER_DESCRIPTION = """### Named Entity Recognition (NER)
 The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.
 
@@ -163,7 +155,6 @@ NER_DESCRIPTION = """### Named Entity Recognition (NER)
 
 """
 
-
 REL_DESCRIPTION = """### Relation Extraction (REL)
 The task involves analyzing clinical text to extract relationships between laboratory test results (e.g., blood pressure) and the tests or procedures that produced them (e.g., blood pressure test).
 
@@ -176,7 +167,6 @@ REL_DESCRIPTION = """### Relation Extraction (REL)
 
 """
 
-
 # Create a dictionary to map task names to their descriptions
 TASK_DESCRIPTIONS = {
     "TE": TE_DESCRIPTION,
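For reference, the Combined Performance formula quoted in MEASURE_DESCRIPTION works out as in the small helper below. It is not part of src/tasks.py, and the example numbers are made up.

```python
# Illustrative helper for the Combined Performance formula from MEASURE_DESCRIPTION.
def combined_performance(best_prompt: float, prompt_average: float) -> float:
    """Penalize the best-prompt accuracy by its gap to the prompt average."""
    return (1 - (best_prompt - prompt_average) / 100) * best_prompt


# e.g. best prompt accuracy 70.0, average over prompts 60.0:
# (1 - (70 - 60) / 100) * 70 = 0.9 * 70 = 63.0
print(combined_performance(70.0, 60.0))  # 63.0
```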