import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    COLS,
    ST_BENCHMARK_COLS,
    AGENTIC_BENCHMARK_COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df, TASK_NAME_INVERSE_MAP
from src.submission.submit import add_new_eval
def restart_space():
    """Restart the Space identified by ``REPO_ID`` through the shared ``API`` client."""
    API.restart_space(repo_id=REPO_ID)
### Space initialisation
def _download_dataset_snapshot(repo_id, local_dir):
    """Download the dataset repo ``repo_id`` into ``local_dir``.

    A failed download is reported on stdout and followed by a Space restart,
    so the reason for the restart is visible in the logs instead of being
    silently discarded.
    """
    print(local_dir)
    try:
        snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception as err:
        # Top-level boundary: keep the original best-effort behaviour
        # (restart rather than crash), but say what went wrong first.
        print(f"Could not download {repo_id} into {local_dir}: {err!r}")
        restart_space()


# Same order as before: evaluation requests first, then evaluation results.
_download_dataset_snapshot(QUEUE_REPO, EVAL_REQUESTS_PATH)
_download_dataset_snapshot(RESULTS_REPO, EVAL_RESULTS_PATH)

# One leaderboard frame per benchmark family, both read from the same folders.
ST_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, ST_BENCHMARK_COLS)
AGENTIC_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, AGENTIC_BENCHMARK_COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard(dataframe, benchmark_type):
    """Build the Gradio table component for one benchmark family.

    Args:
        dataframe: Leaderboard data to display; must not be ``None`` or empty.
        benchmark_type: Value compared against the ``"type"`` entry that
            ``TASK_NAME_INVERSE_MAP`` stores for each column name.

    Returns:
        A ``gr.components.Dataframe`` showing ``dataframe``. Its datatypes and
        column widths are derived from the ``Model`` column plus every
        ``AutoEvalColumn`` field whose mapped type equals ``benchmark_type``.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    has_rows = dataframe is not None and not dataframe.empty
    if not has_rows:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    datatypes = []
    widths = []
    for column in fields(AutoEvalColumn):
        if column.name == "Model":
            # The model-name column gets extra room.
            width = "250px"
        elif TASK_NAME_INVERSE_MAP.get(column.name, {}).get("type", "") == benchmark_type:
            width = "150px"
        else:
            # Column belongs to another benchmark family (or is unmapped).
            continue
        datatypes.append(column.type)
        widths.append(width)

    return gr.components.Dataframe(
        value=dataframe,
        datatype=datatypes,
        column_widths=widths,
        wrap=False,
    )
# Assemble the page: header, introduction, then one tab per benchmark family
# plus an "About" tab.
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(f"""
    <div style="text-align:center; margin-bottom:1rem;">
        <h1 style="margin-bottom: 0.25rem;">{TITLE}</h1>
        <p style="color:#eb088a; margin:0; font-size:1.2rem;">Performance Insights & Comparison</p>
    </div>
    """)
    with gr.Group(elem_classes="intro-block"):
        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("Base Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(ST_LEADERBOARD_DF, "base")
        with gr.TabItem("Agentic Benchmark", elem_id="llm-benchmark-tab-table", id=1):
            leaderboard = init_leaderboard(AGENTIC_LEADERBOARD_DF, "agentic")
        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

# Restart the Space every 1800 seconds (30 minutes).
# NOTE(review): presumably so the start-up downloads pick up new data — confirm.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
