|
import gradio as gr |
|
import pandas as pd |
|
import json |
|
from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS, LEADERBOARD_CSS, EU_LANGUAGES, MULTILINGUAL_TAB_TEXT, LONGFORM_TAB_TEXT |
|
from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub |
|
from utils_display import AutoEvalColumn, MultilingualColumn, LongformColumn, fields, make_clickable_model, styled_error, styled_message |
|
import numpy as np |
|
from datetime import datetime, timezone |
|
|
|
# Displayed on the About tab.
LAST_UPDATED = "Aug 15th 2025"

# Mutable module-level state shared by the multilingual table code:
# benchmark_details maps model name -> language code -> per-benchmark scores
# (rebuilt by create_multilingual_dataframe); expanded_languages holds the
# language codes whose detail columns are currently shown.
benchmark_details = {}

expanded_languages = set()
|
|
|
# Maps raw CSV column names to the display names used in the leaderboard UI.
# NOTE(review): the arrow glyphs ("β¬οΈ") look mojibake-damaged (probably ⬇️);
# they are used verbatim as column keys elsewhere, so confirm before changing.
column_names = {
    "MODEL": "Model",
    "Avg. WER": "Average WER β¬οΈ",
    "RTFx": "RTFx β¬οΈοΈ",
    "AMI WER": "AMI",
    "Earnings22 WER": "Earnings22",
    "Gigaspeech WER": "Gigaspeech",
    "LS Clean WER": "LS Clean",
    "LS Other WER": "LS Other",
    "SPGISpeech WER": "SPGISpeech",
    "Tedlium WER": "Tedlium",
    "Voxpopuli WER": "Voxpopuli",
}
|
|
|
# Pull leaderboard assets from the dataset hub: the request-queue repo, the
# list of already-requested models, and the three result CSV paths.
eval_queue_repo, requested_models, csv_results, multilingual_csv_path, longform_csv_path = load_all_info_from_dataset_hub()

if not csv_results.exists():
    # A missing main-results file is unrecoverable at startup; raise the
    # specific exception type for a missing file rather than a bare Exception.
    raise FileNotFoundError(f"CSV file {csv_results} does not exist locally")

# Raw main-leaderboard results; formatted in place further below.
original_df = pd.read_csv(csv_results)
|
|
|
def formatter(x):
    """Format one leaderboard cell.

    Strings pass through unchanged, the -1 "missing" sentinel becomes "NA",
    and any other number is rounded to two decimal places.
    """
    if isinstance(x, str):
        return x
    if x == -1:
        # -1 is the sentinel for "no result available".
        return "NA"
    return round(x, 2)
|
|
|
# Render model names as clickable links and format every metric column.
for col in original_df.columns:
    if col == "model":
        # The original x.replace(x, make_clickable_model(x)) replaces the whole
        # string with the helper's output, i.e. it is just the helper call.
        original_df[col] = original_df[col].apply(make_clickable_model)
    else:
        original_df[col] = original_df[col].apply(formatter)

# Switch to display column names and rank by average WER (ascending = best first).
original_df.rename(columns=column_names, inplace=True)
original_df.sort_values(by='Average WER β¬οΈ', inplace=True)
|
|
|
# Column names and Gradio datatypes for the main leaderboard, taken from the
# column declarations in utils_display.
COLS = [c.name for c in fields(AutoEvalColumn)]
TYPES = [c.type for c in fields(AutoEvalColumn)]

# Multilingual tab: only the names are fixed here; datatypes are computed per
# call (see get_multilingual_datatypes) because its columns change when a
# language is expanded/collapsed.
MULTILINGUAL_COLS = [c.name for c in fields(MultilingualColumn)]

# Long-form tab columns and their Gradio datatypes.
LONGFORM_COLS = [c.name for c in fields(LongformColumn)]
LONGFORM_TYPES = [c.type for c in fields(LongformColumn)]
|
|
|
def create_multilingual_dataframe():
    """Create multilingual dataframe with CoVoST, MLS, and FLEURS benchmark data"""
    global benchmark_details, expanded_languages

    if multilingual_csv_path is None or not multilingual_csv_path.exists():
        raise Exception("Multilingual CSV file not found")

    multilingual_raw_df = pd.read_csv(multilingual_csv_path)

    # Rebuilt from scratch on every call (including expand/collapse toggles).
    benchmark_details = {}

    multilingual_data = []
    for _, row_data in multilingual_raw_df.iterrows():
        model_name = row_data['model']
        model_details = {}
        row = {"Model": make_clickable_model(model_name)}

        # Every valid per-language benchmark score feeds the model-level average.
        all_datapoints = []

        for lang_code, lang_info in EU_LANGUAGES.items():
            # Portuguese has no CoVoST split; German has no MLS split.
            if lang_code == "pt":
                covost_score = None
            else:
                covost_score = row_data.get(f"{lang_code}_covost", None)

            if lang_code == "de":
                mls_score = None
            else:
                mls_score = row_data.get(f"{lang_code}_mls", None)

            fleurs_score = row_data.get(f"{lang_code}_fleurs", None)

            # Treat zero or empty-string scores as missing data.
            for score_name, score_val in [("covost", covost_score), ("mls", mls_score), ("fleurs", fleurs_score)]:
                if score_val is not None and (score_val == 0.0 or score_val == "" or str(score_val).strip() == "0" or str(score_val).strip() == ""):
                    if score_name == "covost":
                        covost_score = None
                    elif score_name == "mls":
                        mls_score = None
                    elif score_name == "fleurs":
                        fleurs_score = None

            # Only strictly positive scores count toward averages; NaNs from
            # missing CSV columns fail the `> 0` comparison and are skipped.
            if covost_score is not None and covost_score > 0:
                all_datapoints.append(covost_score)
            if mls_score is not None and mls_score > 0:
                all_datapoints.append(mls_score)
            if fleurs_score is not None and fleurs_score > 0:
                all_datapoints.append(fleurs_score)

            # Per-language average over whatever benchmarks are available.
            available_scores = [s for s in [covost_score, mls_score, fleurs_score] if s is not None and s > 0]
            if available_scores:
                avg_score = round(sum(available_scores) / len(available_scores), 2)
            else:
                avg_score = None

            # Detail record kept for get_language_details() lookups.
            lang_data = {"average": avg_score if avg_score is not None else "NA"}

            # Benchmark keys are only present when the benchmark exists for
            # this language and produced a score.
            if lang_code != "pt" and covost_score is not None:
                lang_data["CoVoST"] = covost_score
            if lang_code != "de" and mls_score is not None:
                lang_data["MLS"] = mls_score
            if fleurs_score is not None:
                lang_data["FLEURS"] = fleurs_score

            model_details[lang_code] = lang_data

        if all_datapoints:
            row["Average WER β¬οΈ"] = round(np.mean(all_datapoints), 2)
        else:
            row["Average WER β¬οΈ"] = 0.0

        # RTFx may appear under either column name in the CSV.
        rtfx_value = row_data.get("rtfx", row_data.get("RTFx", 0.0))

        # 0 and -1 act as "not measured" sentinels (string forms included).
        if rtfx_value == 0.0 or rtfx_value == -1 or rtfx_value == 0 or rtfx_value == "0" or rtfx_value == "0.0":
            row["RTFx β¬οΈοΈ"] = "NA"
        else:
            row["RTFx β¬οΈοΈ"] = rtfx_value

        # Collapsed languages show a single average column; expanded languages
        # show the average plus one column per available benchmark.
        for lang_code, lang_info in EU_LANGUAGES.items():
            lang_col_name = f"{lang_info['flag']} {lang_info['name']}"
            model_data = model_details[lang_code]

            if lang_code in expanded_languages:
                row[f"{lang_col_name} Avg"] = model_data["average"]

                if "CoVoST" in model_data:
                    row[f"{lang_col_name} CoVoST"] = model_data["CoVoST"]
                if "MLS" in model_data:
                    row[f"{lang_col_name} MLS"] = model_data["MLS"]
                if "FLEURS" in model_data:
                    row[f"{lang_col_name} FLEURS"] = model_data["FLEURS"]
            else:
                row[lang_col_name] = model_data["average"]

        benchmark_details[model_name] = model_details
        multilingual_data.append(row)

    multilingual_df = pd.DataFrame(multilingual_data)
    multilingual_df = multilingual_df.sort_values(by='Average WER β¬οΈ')
    return multilingual_df
|
|
|
def get_multilingual_datatypes(df):
    """Return one Gradio datatype per column of *df*.

    The "Model" column holds HTML links and must render as markdown; every
    other column is numeric.
    """
    return ["markdown" if name == "Model" else "number" for name in df.columns]
|
|
|
def get_language_details(model, language_code):
    """Get detailed breakdown for a specific model and language.

    Returns a dict with the language label, model name, per-benchmark WERs and
    the language average, or None when nothing is recorded for the pair.
    """
    global benchmark_details

    if model not in benchmark_details or language_code not in benchmark_details[model]:
        return None

    language_info = EU_LANGUAGES.get(language_code, {})
    language_name = language_info.get("name", "Unknown")
    model_data = benchmark_details[model][language_code]

    details = {
        "Language": f"{language_info.get('flag', '')} {language_name}",
        "Model": model,
        # Not every benchmark exists for every language (pt rows carry no
        # "CoVoST" key, de rows no "MLS"); fall back to "NA" instead of
        # raising KeyError on direct indexing.
        "CoVoST WER": model_data.get("CoVoST", "NA"),
        "MLS WER": model_data.get("MLS", "NA"),
        "FLEURS WER": model_data.get("FLEURS", "NA"),
        "Average WER": model_data["average"]
    }

    return details
|
|
|
def toggle_language_expansion(language_code):
    """Toggle whether *language_code*'s detail columns are shown, then refresh.

    Flips the code's membership in the global expanded_languages set, rebuilds
    the multilingual table, and returns a Gradio update for the Dataframe.
    """
    global expanded_languages

    # Symmetric difference flips membership: present -> removed, absent -> added.
    expanded_languages.symmetric_difference_update({language_code})

    refreshed = create_multilingual_dataframe()
    return gr.update(value=refreshed, datatype=get_multilingual_datatypes(refreshed))
|
|
|
|
|
# Initial multilingual table shown at startup (all languages collapsed).
multilingual_df = create_multilingual_dataframe()
|
|
|
def create_longform_dataframe():
    """Build the long-form leaderboard table from the long-form CSV.

    Averages the available (positive) long-form WERs per model and formats
    missing values as "NA". Returns the table sorted by average WER.
    """
    if longform_csv_path is None or not longform_csv_path.exists():
        raise Exception(f"Longform CSV file {longform_csv_path} does not exist locally")

    raw = pd.read_csv(longform_csv_path)

    rows = []
    for _, record in raw.iterrows():
        earnings21 = record.get('earnings21_wer', -1)
        mustc = record.get('mustc_wer', -1)
        rtfx = record.get('rtfx', 0)

        # -1 is the "missing" sentinel; only real positive WERs are averaged.
        scored = [wer for wer in (earnings21, mustc) if wer != -1 and wer > 0]
        average = round(np.mean(scored), 2) if scored else 0.0

        rows.append({
            "Model": make_clickable_model(record['model']),
            "Average WER β¬οΈ": average,
            "RTFx β¬οΈοΈ": rtfx if rtfx > 0 else "NA",
            "Earnings21": earnings21 if earnings21 != -1 else "NA",
        })

    return pd.DataFrame(rows).sort_values(by='Average WER β¬οΈ')
|
|
|
|
|
# Long-form table is static; build it once at startup.
longform_df = create_longform_dataframe()
|
|
|
|
|
def request_model(model_text, chbcoco2017):
    """Handle a model-evaluation request submitted from the UI.

    Validates that a dataset is selected and the model exists on the hub,
    writes the request as a small JSON file, uploads it to the requests repo,
    and returns a styled HTML message describing the outcome.
    """
    # Map checkbox state to the dataset list (only one option exists today).
    dataset_selection = []
    if chbcoco2017:
        dataset_selection.append("ESB Datasets tests only")

    if len(dataset_selection) == 0:
        return styled_error("You need to select at least one dataset")

    base_model_on_hub, error_msg = is_model_on_hub(model_text)
    if not base_model_on_hub:
        return styled_error(f"Base model '{model_text}' {error_msg}")

    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    required_datasets = ', '.join(dataset_selection)
    eval_entry = {
        "date": current_time,
        "model": model_text,
        "datasets_selected": required_datasets
    }

    DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True)

    # Encode model + datasets into a repo-safe filename ("/" is not allowed).
    fn_datasets = '@ '.join(dataset_selection)
    filename = model_text.replace("/", "@") + "@@" + fn_datasets
    if filename in requested_models:
        return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.")

    try:
        out_filepath = DIR_OUTPUT_REQUESTS / (filename + ".txt")
        with open(out_filepath, "w") as f:
            f.write(json.dumps(eval_entry))

        upload_file(filename, out_filepath)

        # Remember the request so duplicates are rejected within this session.
        requested_models.append(filename)

        # The local copy is no longer needed once uploaded.
        out_filepath.unlink()

        return styled_message("π€ Your request has been submitted and will be evaluated soon!</p>")
    except Exception as e:
        # Keep the user-facing message generic, but don't silently discard the
        # cause (the original swallowed `e` and used an f-string with no
        # placeholder).
        print(f"Error submitting request for '{model_text}': {e}")
        return styled_error("Error submitting request!")
|
|
|
def filter_main_table(show_proprietary=True):
    """Return the main leaderboard, optionally restricted to open-license models.

    When *show_proprietary* is False and the table has a "License" column,
    only rows whose license is exactly "Open" are kept.
    """
    table = original_df.copy()

    # Filtering is only meaningful when the data actually carries licenses.
    if not show_proprietary and "License" in table.columns:
        table = table[table["License"] == "Open"]

    return table
|
|
|
# ---------------------------------------------------------------------------
# Gradio UI: tabs for the main, multilingual, and long-form leaderboards,
# plus metrics docs, a model-request form, and an About/citation section.
# ---------------------------------------------------------------------------
with gr.Blocks(css=LEADERBOARD_CSS) as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # NOTE(review): tab-label glyphs below look mojibake-damaged (source
        # extraction artifact); this label was also split across two lines in
        # the source and is rejoined here — confirm against the deployed app.
        with gr.TabItem("π Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.components.Dataframe(
                value=original_df,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )
            with gr.Row():
                show_proprietary_checkbox = gr.Checkbox(
                    label="Show proprietary models",
                    value=True,
                    elem_id="show-proprietary-checkbox"
                )

            # Re-filter the main table whenever the checkbox toggles.
            show_proprietary_checkbox.change(
                filter_main_table,
                inputs=[show_proprietary_checkbox],
                outputs=leaderboard_table
            )

        with gr.TabItem("π Multilingual", elem_id="multilingual-benchmark-tab-table", id=1):
            gr.Markdown(MULTILINGUAL_TAB_TEXT, elem_classes="markdown-text")

            gr.Markdown("Click on a language button to show/hide detailed benchmark scores (CoVoST, MLS, FLEURS):")

            language_buttons = {}
            lang_codes = list(EU_LANGUAGES.keys())

            # Language toggle buttons, laid out five per row.
            with gr.Row():
                for lang_code in lang_codes[:5]:
                    lang_info = EU_LANGUAGES[lang_code]
                    button_label = f"{lang_info['flag']} {lang_info['name']}"
                    language_buttons[lang_code] = gr.Button(
                        button_label,
                        variant="secondary",
                        size="sm"
                    )

            with gr.Row():
                for lang_code in lang_codes[5:]:
                    lang_info = EU_LANGUAGES[lang_code]
                    button_label = f"{lang_info['flag']} {lang_info['name']}"
                    language_buttons[lang_code] = gr.Button(
                        button_label,
                        variant="secondary",
                        size="sm"
                    )

            multilingual_table = gr.components.Dataframe(
                value=multilingual_df,
                datatype=get_multilingual_datatypes(multilingual_df),
                elem_id="multilingual-table",
                interactive=False,
                visible=True,
            )

            # Bind each button to its language via a factory function so the
            # closure captures the current lang_code (avoids Python's
            # late-binding-closure pitfall in loops).
            for lang_code, button in language_buttons.items():
                def create_toggle_func(code):
                    return lambda: toggle_language_expansion(code)

                button.click(
                    create_toggle_func(lang_code),
                    outputs=[multilingual_table]
                )

        with gr.TabItem("π Long-form", elem_id="longform-benchmark-tab-table", id=2):
            gr.Markdown(LONGFORM_TAB_TEXT, elem_classes="markdown-text")

            longform_table = gr.components.Dataframe(
                value=longform_df,
                datatype=LONGFORM_TYPES,
                elem_id="longform-table",
                interactive=False,
                visible=True,
            )

        with gr.TabItem("π Metrics", elem_id="od-benchmark-tab-table", id=4):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("βοΈβ¨ Request a model here!", elem_id="od-benchmark-tab-table", id=5):
            with gr.Column():
                gr.Markdown("# βοΈβ¨ Request results for a new model here!", elem_classes="markdown-text")
            with gr.Column():
                gr.Markdown("Select a dataset:", elem_classes="markdown-text")
            with gr.Column():
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                # Hidden always-on checkbox: the only dataset option today.
                chb_coco2017 = gr.Checkbox(label="COCO validation 2017 dataset", visible=False, value=True, interactive=False)
            with gr.Column():
                mdw_submission_result = gr.Markdown()
                btn_submitt = gr.Button(value="π Request")
                btn_submitt.click(request_model,
                                  [model_name_textbox, chb_coco2017],
                                  mdw_submission_result)

        with gr.TabItem("π€ About", elem_id="od-benchmark-tab-table", id=6):
            gr.Markdown("## About", elem_classes="markdown-text")

            gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT, lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                show_copy_button=True,
            )

# ssr_mode=False: serve the classic client-rendered app.
demo.launch(ssr_mode=False)
|
|