import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.tasks import TASK_DESCRIPTIONS, MEASURE_DESCRIPTION
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval
import random

# Task metadata for the multiple-choice tasks (icons, names, descriptions)
TASK_METADATA_MULTIPLECHOICE = {
    "TE": {"icon": "📊", "name": "Textual Entailment", "tooltip": ""},
    "SA": {"icon": "😃", "name": "Sentiment Analysis", "tooltip": ""},
    "HS": {"icon": "⚠️", "name": "Hate Speech", "tooltip": ""},
    "AT": {"icon": "🏥", "name": "Admission Test", "tooltip": ""},
    "WIC": {"icon": "🔤", "name": "Word in Context", "tooltip": ""},
    "FAQ": {"icon": "❓", "name": "Frequently Asked Questions", "tooltip": ""},
}

# Task metadata for the generative tasks (icons, names, descriptions)
TASK_METADATA_GENERATIVE = {
    "LS": {"icon": "🔄", "name": "Lexical Substitution", "tooltip": ""},
    "SU": {"icon": "📝", "name": "Summarization", "tooltip": ""},
    "NER": {"icon": "🏷️", "name": "Named Entity Recognition", "tooltip": ""},
    "REL": {"icon": "🔗", "name": "Relation Extraction", "tooltip": ""},
}


def restart_space():
    """Restart the Hugging Face Space."""
    API.restart_space(repo_id=REPO_ID)


def init_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """
    Initialize and return the leaderboard shown when the app is first loaded or when 'Benchmark' is selected.
    The table is sorted by the "Avg. Combined Performance" field.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    field_list = fields(AutoEvalColumn)

    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in field_list],
        # select_columns=SelectColumns(
        #     default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default],
        #     cant_deselect=[c.name for c in field_list if c.never_hidden],
        #     label="Select Columns to Display:",
        # ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=hidden_columns or [c.name for c in field_list if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"),
            # ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)",
            #              default=[["0️⃣", "0️⃣"]]),
            # ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"),
        ],
        # filter_columns=[
        #     ColumnFilter("IS_FS", type="checkbox", default=False, label="5-Few-Shot")
        #     # ColumnFilter("FS", type="dropdown", label="5-Few-Shot")
        # ],
        bool_checkboxgroup_label="Evaluation Mode",
        interactive=False,
    )


def update_task_leaderboard(dataframe, default_selection=None, hidden_columns=None):
    """
    Update and return the leaderboard when a specific task is selected.
    The table is sorted by the "Combined Performance" field.
    """
""" if dataframe is None or dataframe.empty: raise ValueError("Leaderboard DataFrame is empty or None.") sorted_dataframe = dataframe.sort_values(by="Combined Performance", ascending=False) #print(sorted_dataframe['Combined Performance']) field_list = fields(AutoEvalColumn) return Leaderboard( value=sorted_dataframe, datatype=[c.type for c in field_list], #select_columns=SelectColumns( # default_selection=default_selection or [c.name for c in field_list if c.displayed_by_default], # cant_deselect=[c.name for c in field_list if c.never_hidden], # label="Select Columns to Display:", #), search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], hide_columns=hidden_columns or [c.name for c in field_list if c.hidden], filter_columns=[ ColumnFilter(AutoEvalColumn.fewshot_symbol.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"), ], bool_checkboxgroup_label="Evaluation Mode", interactive=False ) ''' # Helper function for leaderboard initialization def init_leaderboard(dataframe, default_selection=None, hidden_columns=None): """Initialize and return a leaderboard.""" if dataframe is None or dataframe.empty: raise ValueError("Leaderboard DataFrame is empty or None.") return Leaderboard( value=dataframe, datatype=[c.type for c in fields(AutoEvalColumn)], select_columns=SelectColumns( default_selection=default_selection or [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default], cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden], label="Select Columns to Display:", ), search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name], hide_columns=hidden_columns or [c.name for c in fields(AutoEvalColumn) if c.hidden], filter_columns=[ ColumnFilter(AutoEvalColumn.fewshot_type.name, type="checkboxgroup", label="N-Few-Shot Learning (FS)"), ColumnFilter(AutoEvalColumn.params.name, type="slider", min=0, max=150, label="Select the number of parameters (B)"), ], bool_checkboxgroup_label="Hide models", interactive=False, ) ''' def download_snapshot(repo, local_dir): """Try to download a snapshot from Hugging Face Hub.""" try: print(f"Downloading from {repo} to {local_dir}...") snapshot_download(repo_id=repo, local_dir=local_dir, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN) except Exception as e: print(f"Error downloading {repo}: {e}") restart_space() # Initialize the app by downloading snapshots download_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH) download_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH) # Load leaderboard data LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS) finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS) # Prepare the main interface demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Tabs(elem_classes="tab-buttons") as tabs: # Main leaderboard tab with gr.TabItem("🏅 Benchmark"): leaderboard = init_leaderboard( LEADERBOARD_DF, default_selection=['FS', 'Model', "Avg. Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"], hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', "Avg. 
Combined Performance ⬆️", "TE", "SA", "HS", "AT", "WIC", "FAQ", "LS", "SU", "NER", "REL"]] ) # About tab with gr.TabItem("📝 About"): gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") # About tab with gr.TabItem("║", interactive=False): gr.Markdown("", elem_classes="markdown-text") # Task-specific leaderboards for task, metadata in TASK_METADATA_MULTIPLECHOICE.items(): with gr.TabItem(f"{metadata['icon']}{task}"): task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") gr.Markdown(task_description, elem_classes="markdown-text") leaderboard = update_task_leaderboard( LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}), default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'], hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']] ) # About tab with gr.TabItem("│", interactive=False): gr.Markdown("", elem_classes="markdown-text") # Task-specific leaderboards for task, metadata in TASK_METADATA_GENERATIVE.items(): with gr.TabItem(f"{metadata['icon']}{task}"): task_description = TASK_DESCRIPTIONS.get(task, "Description not available.") gr.Markdown(task_description, elem_classes="markdown-text") leaderboard = update_task_leaderboard( LEADERBOARD_DF.rename(columns={f"{task} Prompt Average": "Prompt Average", f"{task} Best Prompt": "Best Prompt", f"{task} Best Prompt Id": "Best Prompt Id", task: "Combined Performance"}), default_selection=['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id'], hidden_columns=[col for col in LEADERBOARD_DF.columns if col not in ['FS', 'Model', 'Combined Performance', 'Prompt Average', 'Best Prompt', 'Best Prompt Id']] ) # Citation section with gr.Accordion("📙 Citation", open=False): gr.Textbox(value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True) # Background job to restart space scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=1800) scheduler.start() # Launch the app with concurrent queueing demo.queue(default_concurrency_limit=40).launch(debug=True, # Enable Gradio debug mode show_error=True)