Spaces:

evalitahf
/

evalita_llm_leaderboard

Running

App Files Files Community

evalita_llm_leaderboard / src /tasks.py

rzanoli

Small Changes

338193d 3 months ago

raw

history blame contribute delete

18.9 kB

	from dataclasses import dataclass
	from enum import Enum

	@dataclass
	class Task:
	    """Record describing one leaderboard task.

	    All three fields are plain strings. Their exact meaning is defined by the
	    code that builds ``Task`` instances, which is not part of this section.
	    """

	    # Benchmark identifier -- presumably the key used to look up results; confirm against callers.
	    benchmark: str
	    # metric: str
	    # Takes the place of the commented-out `metric` field above -- TODO confirm what callers store here.
	    accuracy: str
	    # Presumably the column header displayed for this task; confirm against callers.
	    col_name: str

	# Number of few-shot examples; adjust to match the evaluation setup.
	NUM_FEWSHOT = 0
	# ---------------------------------------------------

	# HTML heading used as the leaderboard title.
	TITLE = '<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>'

	# Introductory paragraph explaining what the leaderboard evaluates.
	INTRODUCTION_TEXT = """
	Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation.
	"""

	# Generic legend for the score columns (one sentence per adjacent literal).
	MEASURE_DESCRIPTION = (
	    "<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. "
	    "Prompt Average = accuracy averaged over the assessed prompts. "
	    "Best Prompt = accuracy of the best prompt. "
	    "Prompt ID = ID of the best prompt (see legend above).</small>"
	)

	# Tasks Descriptions
	# Markdown for the Textual Entailment task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	TE_DESCRIPTION = """### Textual Entailment (TE) --- Multiple-choice task
	The input are two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.

	| # | Prompt | Answer Choices |
	|-----|------------|--------------|
	| 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
	| 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
	| 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
	| 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
	| 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
	| 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = accuracy averaged over the 6 prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Sentiment Analysis task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	SA_DESCRIPTION = """### Sentiment Analysis (SA) --- Multiple-choice task
	The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-----------------------------|
	| 1 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
	| 2 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
	| 3 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
	| 4 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
	| 5 | Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
	| 6 | Devi svolgere un compito di analisi del sentiment. Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1-macro averaged over the 6 prompts. Best Prompt = F1-macro of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Hate Speech task: a short description, the table of the
	# six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	# In prompts 3 and 4 the option labels follow the answer-choice order used
	# by the other rows: ["B", "A"] lines up with ["Falso", "Vero"].
	HS_DESCRIPTION = """### Hate Speech (HS) --- Multiple-choice task
	The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-------------------------------------------------|
	| 1 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
	| 2 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
	| 3 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
	| 4 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
	| 5 | Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
	| 6 | Devi svolgere un compito di identificazione di incitamento all'odio. Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1-micro averaged over the 6 prompts. Best Prompt = F1-micro of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Admission Tests task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	# Every row uses the single '{{Question}}' placeholder, like prompts 1-3 and 6.
	AT_DESCRIPTION = """### Admission Tests (AT) --- Multiple-choice task
	The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-----------------------------|
	| 1 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
	| 2 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
	| 3 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
	| 4 | Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
	| 5 | Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
	| 6 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = accuracy averaged over the 6 prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Word in Context task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	WIC_DESCRIPTION = """### Word in Context (WIC) --- Multiple-choice task
	The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning).

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-------------------------------------------------|
	| 1 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
	| 2 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
	| 3 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
	| 4 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
	| 5 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
	| 6 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1-macro averaged over the 6 prompts. Best Prompt = F1-macro of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the FAQ task: a short description, the table of the six
	# prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) --- Multiple-choice task
	The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-----------------------------|
	| 1 | Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
	| 2 | Devi risolvere un compito di risposte a domande. Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
	| 3 | Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
	| 4 | Devi risolvere un compito a scelta multipla. Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
	| 5 | La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
	| 6 | Devi risolvere un compito di risposte a domande. La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = accuracy averaged over the 6 prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Lexical Substitution task: a short description, the
	# table of the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	LS_DESCRIPTION = """### Lexical Substitution (LS) --- Generative task
	The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant.

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
	| 2 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Summarization task: a short description, the table of
	# the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	SU_DESCRIPTION = """### Summarization (SUM) --- Generative task
	The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
	| 2 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Named Entity Recognition task: a short description, the
	# table of the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	NER_DESCRIPTION = """### Named Entity Recognition (NER) --- Generative task
	The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
	| 2 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Relation Extraction task: a short description, the table
	# of the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	REL_DESCRIPTION = """### Relation Extraction (REL) --- Generative task
	The input is a sentence of a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., blood pressure test).

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
	| 2 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Lookup table: task code -> Markdown description string for that task.
	TASK_DESCRIPTIONS = {
	    "TE": TE_DESCRIPTION,
	    "SA": SA_DESCRIPTION,
	    "HS": HS_DESCRIPTION,
	    "AT": AT_DESCRIPTION,
	    "WIC": WIC_DESCRIPTION,
	    "FAQ": FAQ_DESCRIPTION,
	    "LS": LS_DESCRIPTION,
	    "SU": SU_DESCRIPTION,
	    "NER": NER_DESCRIPTION,
	    "REL": REL_DESCRIPTION,
	}

	from dataclasses import dataclass
	from enum import Enum

	@dataclass
	class Task:
	    """Record describing one leaderboard task.

	    All three fields are plain strings. Their exact meaning is defined by the
	    code that builds ``Task`` instances, which is not part of this section.
	    """

	    # Benchmark identifier -- presumably the key used to look up results; confirm against callers.
	    benchmark: str
	    # metric: str
	    # Takes the place of the commented-out `metric` field above -- TODO confirm what callers store here.
	    accuracy: str
	    # Presumably the column header displayed for this task; confirm against callers.
	    col_name: str

	# Number of few-shot examples; adjust to match the evaluation setup.
	NUM_FEWSHOT = 0
	# ---------------------------------------------------

	# HTML heading used as the leaderboard title.
	TITLE = '<h1 align="center" id="space-title">🚀 EVALITA-LLM Leaderboard 🚀</h1>'

	# Introductory paragraph explaining what the leaderboard evaluates.
	INTRODUCTION_TEXT = """
	Evalita-LLM is a benchmark designed to evaluate Large Language Models (LLMs) on Italian tasks. The distinguishing features of Evalita-LLM are the following: (i) all tasks are native Italian, avoiding translation issues and potential cultural biases; (ii) the benchmark includes generative tasks, enabling more natural interaction with LLMs; (iii) all tasks are evaluated against multiple prompts, this way mitigating the model sensitivity to specific prompts and allowing a fairer evaluation.
	"""

	# Generic legend for the score columns (one sentence per adjacent literal).
	MEASURE_DESCRIPTION = (
	    "<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. "
	    "Prompt Average = accuracy averaged over the assessed prompts. "
	    "Best Prompt = accuracy of the best prompt. "
	    "Prompt ID = ID of the best prompt (see legend above).</small>"
	)

	# Tasks Descriptions
	# Markdown for the Textual Entailment task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	TE_DESCRIPTION = """### Textual Entailment (TE) --- Multiple-choice task
	The input are two sentences: the text (T) and the hypothesis (H). The model has to determine whether the meaning of the hypothesis is logically entailed by the text.

	| # | Prompt | Answer Choices |
	|-----|------------|--------------|
	| 1 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
	| 2 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera? | ["Sì", "No"] |
	| 3 | La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
	| 4 | Devi risolvere un compito di inferenza semantica. La frase: '{{text1}}' implica logicamente che la frase: '{{text2}}' sia vera?\\nA: Sì\\nB: No\\nRisposta: | ["A", "B"] |
	| 5 | Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |
	| 6 | Devi risolvere un compito di inferenza semantica. Frase 1: '{{text1}}' Frase 2: '{{text2}}' | ["La frase 1 implica logicamente che la frase 2 sia vera", "La frase 1 non implica logicamente che la frase 2 sia vera"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = accuracy averaged over the 6 prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Sentiment Analysis task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	SA_DESCRIPTION = """### Sentiment Analysis (SA) --- Multiple-choice task
	The input is a tweet. The model has to determine the sentiment polarity of the text, categorizing it into one of four classes: positive, negative, neutral, or mixed.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-----------------------------|
	| 1 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
	| 2 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'? | ["Positivo", "Negativo", "Neutro", "Misto"] |
	| 3 | Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
	| 4 | Devi svolgere un compito di analisi del sentiment. Qual è il sentiment espresso nel seguente tweet: '{{text}}'?\\nA: Positivo\\nB: Negativo\\nC: Neutro\\nD: Misto\\nRisposta: | ["A", "B", "C", "D"] |
	| 5 | Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |
	| 6 | Devi svolgere un compito di analisi del sentiment. Il seguente tweet: '{{text}}' esprime un sentiment | ["Positivo", "Negativo", "Neutro", "Misto"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1-macro averaged over the 6 prompts. Best Prompt = F1-macro of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Hate Speech task: a short description, the table of the
	# six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	# In prompts 3 and 4 the option labels follow the answer-choice order used
	# by the other rows: ["B", "A"] lines up with ["Falso", "Vero"].
	HS_DESCRIPTION = """### Hate Speech (HS) --- Multiple-choice task
	The input is a tweet. The model has to determine whether the text contains hateful content directed towards marginalized or minority groups. The output is a binary classification: hateful or not hateful.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-------------------------------------------------|
	| 1 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
	| 2 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'? | ["Falso", "Vero"] |
	| 3 | C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
	| 4 | Devi svolgere un compito di identificazione di incitamento all'odio. C'è incitamento all'odio nel seguente tweet: '{{full_text}}'?\\nA: Vero\\nB: Falso\\nRisposta: | ["B", "A"] |
	| 5 | Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |
	| 6 | Devi svolgere un compito di identificazione di incitamento all'odio. Il tweet: '{{full_text}}' | ["non contiene incitamento all'odio", "contiene incitamento all'odio"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1-micro averaged over the 6 prompts. Best Prompt = F1-micro of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Admission Tests task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	# Every row uses the single '{{Question}}' placeholder, like prompts 1-3 and 6.
	AT_DESCRIPTION = """### Admission Tests (AT) --- Multiple-choice task
	The input is a multiple-choice question with five options (A-E) from Italian medical specialty entrance exams, and the model must identify the correct answer.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-----------------------------|
	| 1 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
	| 2 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta? | ["A", "B", "C", "D", "E"] |
	| 3 | Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
	| 4 | Devi risolvere un compito a scelta multipla. Dato il seguente quesito di medicina: '{{Question}}' qual è la risposta corretta?\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nE: {{E}}\\nRisposta: | ["A", "B", "C", "D", "E"] |
	| 5 | Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |
	| 6 | Devi risolvere un compito di risposte a domande. Dato il seguente quesito di medicina '{{Question}}' la risposta corretta è: | ["A", "B", "C", "D", "E"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = accuracy averaged over the 6 prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Word in Context task: a short description, the table of
	# the six prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	WIC_DESCRIPTION = """### Word in Context (WIC) --- Multiple-choice task
	The input consists of a word (w) and two sentences. The model has to determine whether the word w has the same meaning in both sentences. The output is a binary classification: 1 (same meaning) or 0 (different meaning).

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-------------------------------------------------|
	| 1 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
	| 2 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'? | ["No", "Sì"] |
	| 3 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
	| 4 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' ha lo stesso significato della parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}'?\\nA: Sì\\nB: No\\nRisposta: | ["B", "A"] |
	| 5 | La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |
	| 6 | Devi determinare se una stessa parola usata in due frasi differenti ha lo stesso significato in entrambi i contesti. La parola: '{{sentence1[start1:end1]}}' nella frase: '{{sentence1}}' e la parola: '{{sentence2[start2:end2]}}' nella frase: '{{sentence2}}' | ["non hanno lo stesso significato", "hanno lo stesso significato"] |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1-macro averaged over the 6 prompts. Best Prompt = F1-macro of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the FAQ task: a short description, the table of the six
	# prompt templates with their answer choices, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	FAQ_DESCRIPTION = """### Frequently Asked Questions & Question Answering (FAQ) --- Multiple-choice task
	The input is a user query regarding the water supply service. The model must identify the correct answer from the 4 available options.

	| # | Prompt | Answer Choices |
	|-----|--------------------------------------------------------------------------------|-----------------------------|
	| 1 | Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
	| 2 | Devi risolvere un compito di risposte a domande. Rispondi alla seguente domanda: '{{question}}' | {{[A, B, C, D]}} |
	| 3 | Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
	| 4 | Devi risolvere un compito a scelta multipla. Rispondi alla seguente domanda: '{{question}}'\\nA: {{A}}\\nB: {{B}}\\nC: {{C}}\\nD: {{D}}\\nRisposta: | ["A", "B", "C", "D"] |
	| 5 | La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |
	| 6 | Devi risolvere un compito di risposte a domande. La risposta alla domanda: '{{question}}' è: | {{[A, B, C, D]}} |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = accuracy averaged over the 6 prompts. Best Prompt = accuracy of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Lexical Substitution task: a short description, the
	# table of the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	LS_DESCRIPTION = """### Lexical Substitution (LS) --- Generative task
	The input is a sentence containing a target word (w). The model has to replace the target word w with its most suitable synonyms that are contextually relevant.

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |
	| 2 | Devi risolvere un compito di sostituzione lessicale. Trova 10 parole che possono sostituire la parola racchiusa tra i marcatori `<head>` nella seguente frase: '{{context}}', mantenendo lo stesso significato. Elenca i lemmi (forme base) di queste parole, separandoli con una virgola, ad esempio: lemma1, lemma2, lemma3, lemma4, lemma5. Non aggiungere commenti o altro testo. Risposta: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Summarization task: a short description, the table of
	# the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	SU_DESCRIPTION = """### Summarization (SUM) --- Generative task
	The input is a news article. The model has to generate a concise summary of the input text, capturing the key information and main points.

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |
	| 2 | Devi risolvere un compito di sintesi automatica del testo. Riassumi il seguente articolo di giornale: '{{source}}'\\nRiassunto: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Named Entity Recognition task: a short description, the
	# table of the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	NER_DESCRIPTION = """### Named Entity Recognition (NER) --- Generative task
	The input is a sentence. The model has to identify and classify Named Entities into predefined categories such as person, organization, and location.

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |
	| 2 | Devi svolgere un compito di riconoscimento delle entità nei testi. Estrai tutte le entità di tipo PER (persona), LOC (luogo) e ORG (organizzazione) dal testo seguente. Riporta ogni entità con il formato: Entità$Tipo, separando ciascuna coppia con ','. Se non ci sono entità da estrarre, rispondi con '&&NOENT&&'.\\nTesto: '{{text}}'\\nEntità: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Markdown for the Relation Extraction task: a short description, the table
	# of the two prompt templates, and the scoring legend.
	# Table pipes are written unescaped so Markdown parses the rows as a table.
	# "\\n" is an escaped backslash, so the string holds a literal backslash-n
	# rather than a line break (a real newline would end the table row).
	REL_DESCRIPTION = """### Relation Extraction (REL) --- Generative task
	The input is a sentence of a clinical text. The model must identify and extract relationships between laboratory test results (e.g., blood pressure) and the corresponding tests or procedures that generated them (e.g., blood pressure test).

	| # | Prompt |
	|-----|--------------------------------------------------------------------------------|
	| 1 | Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |
	| 2 | Devi svolgere un compito di estrazione di relazioni da documenti medici. Dato un documento medico devi estrarre tutte le misurazioni degli esami medici presenti. Riporta ogni relazione nel formato: misurazione$esame, separando ciascuna coppia con '%'. Se non ci sono relazioni da estrarre, rispondi con '&&NOREL&&'.\\nTesto: '{{text}}'\\nRelazioni: |

	<small>Combined Performance = (1 - (Best Prompt - Prompt Average) / 100) * Best Prompt. Prompt Average = F1 averaged over the 2 prompts. Best Prompt = F1 of the best prompt. Prompt ID = ID of the best prompt (see legend above). </small>

	"""

	# Lookup table: task code -> Markdown description string for that task.
	TASK_DESCRIPTIONS = {
	    "TE": TE_DESCRIPTION,
	    "SA": SA_DESCRIPTION,
	    "HS": HS_DESCRIPTION,
	    "AT": AT_DESCRIPTION,
	    "WIC": WIC_DESCRIPTION,
	    "FAQ": FAQ_DESCRIPTION,
	    "LS": LS_DESCRIPTION,
	    "SU": SU_DESCRIPTION,
	    "NER": NER_DESCRIPTION,
	    "REL": REL_DESCRIPTION,
	}