import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


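# Display metadata for each Evalita-LLM task: the icon shown on its tab,
# the full task name, and a tooltip describing what the task evaluates.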
TASK_METADATA = {
    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": "Identify logical relationships between two text segments."},
    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": "Classify the sentiment (positive, negative, neutral) of a text."},
    "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": "Detect hate speech in a text."},
    "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": "Classify whether a clinical statement pertains to an admission test."},
    "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": "Identify words in context and their meaning."},
    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": "Answer frequently asked questions based on given text."},
    "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": "Identify alternative words in a given context."},
    "SU": {"icon": "📝", "name": "Summarization", "tooltip": "Summarize long text into a shorter version."},
    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": "Identify named entities (e.g., persons, locations, organizations) in text."},
    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": "Extract and link laboratory test results to the respective tests in clinical narratives."},
}


def restart_space():
    """Restart the Hugging Face Space."""
    API.restart_space(repo_id=REPO_ID)


def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """Initialize and return a leaderboard."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            # Columns shown by default; fall back to those flagged displayed_by_default.
            default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
        # Interactive filters: few-shot setting (checkbox group) and model size in billions of parameters (slider).
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="Few-Shot Learning (FS)"),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


def download_snapshot(repo, local_dir):
    """Try to download a snapshot from the Hugging Face Hub."""
    try:
        print(f"Downloading from {repo} to {local_dir}...")
        snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN)
    except Exception as e:
        print(f"Error downloading {repo}: {e}")
        # If the download fails, restart the Space so the sync is retried on the next startup.
        restart_space()


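# Sync the evaluation request queue and the published results from the Hub before building the leaderboard.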
download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

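# Build the main leaderboard dataframe and the finished/running/pending evaluation queue views.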
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

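# Gradio UI: title and introduction, the full-benchmark leaderboard, an About tab,
# one tab per task, and a citation accordion.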
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:

        with gr.TabItem("🏅 EVALITA-LLM Benchmark"):
            leaderboard = init_leaderboard(
                LEADERBOARD_DF,
                default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
                hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]]
            )

        with gr.TabItem("📝 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

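        # One tab per task: the task description, the shared measure description,
        # and a task-specific leaderboard with that task's columns renamed to generic labels.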
        for task, metadata in TASK_METADATA.items():
            with gr.TabItem(f"{metadata['icon']}{task}"):

                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                gr.Markdown(task_description, elem_classes="markdown-text")

                gr.Markdown(MEASURE_DESCRIPTION, elem_classes="markdown-text")

                leaderboard = init_leaderboard(
                    LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}),
                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
                    hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']]
                )

    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True)

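# Restart the Space every 30 minutes; each restart re-runs the snapshot downloads above,
# refreshing the leaderboard with newly published results.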
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

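# Enable the request queue (default concurrency limit of 40) and launch the app.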
demo.queue(default_concurrency_limit=40).launch()