import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

# Define task metadata (icons, names, descriptions)
TASK_METADATA = {
    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
    "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
    "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
    "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": ""},
    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""},
    "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
    "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
}


def restart_space():
    """Restart the Hugging Face Space."""
    API.restart_space(repo_id=REPO_ID)


# Helper function for leaderboard initialization
def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """Initialize and return a Leaderboard component."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
            ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


def download_snapshot(repo, local_dir):
    """Try to download a snapshot from the Hugging Face Hub; restart the Space on failure."""
    try:
        print(f"Downloading from {repo} to {local_dir}...")
        snapshot_download(
            repo_id=repo,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as e:
        print(f"Error downloading {repo}: {e}")
        restart_space()


# Initialize the app by downloading snapshots
download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

# Load leaderboard data
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

# Prepare the main interface
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Main leaderboard tab
        with gr.TabItem("🏅 EVALITA-LLM Benchmark"):
            leaderboard = init_leaderboard(
                LEADERBOARD_DF,
                default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"],
                hidden_columns=[
                    col for col in LEADERBOARD_DF.columns
                    if col not in ['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]
                ],
            )

        # About tab
        with gr.TabItem("📝 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        # Task-specific leaderboards
        for task, metadata in TASK_METADATA.items():
            with gr.TabItem(f"{metadata['icon']}{task}"):
                task_description = TASK_DESCRIPTIONS.get(task, "Description not available.")
                gr.Markdown(task_description, elem_classes="markdown-text")
                leaderboard = init_leaderboard(
                    LEADERBOARD_DF.rename(
                        columns={
                            f"{task} Prompt Average": "Prompt Average",
                            f"{task} Best Prompt": "Best Prompt",
                            f"{task} Best Prompt Id": "Best Prompt Id",
                            task: "Combined Performance",
                        }
                    ),
                    default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'],
                    hidden_columns=[
                        col for col in LEADERBOARD_DF.columns
                        if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']
                    ],
                )

    # Citation section
    with gr.Accordion("📙 Citation", open=False):
        gr.Textbox(
            value=CITATION_BUTTON_TEXT,
            label=CITATION_BUTTON_LABEL,
            lines=20,
            elem_id="citation-button",
            show_copy_button=True,
        )

# Background job to restart the Space every 30 minutes
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# Launch the app with concurrent queueing
demo.queue(default_concurrency_limit=40).launch()