""" Copyright 2025 Balacoon Utils to get data to populate leaderboard. Communicates with `balacoon/speech_gen_baselines` dataset on Hugging Face, that contains evaluation results of different speech generation systems. """ import yaml import logging import requests import pandas as pd from huggingface_hub import hf_hub_url from huggingface_hub.hf_api import RepoFolder from api import api def make_pretty(df: pd.DataFrame) -> pd.DataFrame: """ Rename some columns """ df["whisperv3_cer"] = df["whisperv3_cer"] * 100 rename_map = { "whisperv3_cer": "WhisperV3 CER, %↓", "utmos_mos": "UTMOS MOS, ↑", "aesthetics_enjoyment": "Enjoyment, ↑", "aesthetics_usefullness": "Usefulness, ↑", "aesthetics_complexity": "Complexity, ↑", "aesthetics_quality": "Quality, ↑", "ecapa_secs": "ECAPA SECS, ↑", "redimnet_secs": "ReDimNet SECS ↑" } description_map = { "whisperv3_cer": "Character Error Rate in %, measured with [WhisperV3](https://huggingface.co/openai/whisper-large-v3-turbo)", "utmos_mos": "Mean Opinion Score, showing how natural the speech is, measured with [UMTOS](https://huggingface.co/balacoon/utmos).", "ecapa_secs": "Speaker Embedding Cosine Similarity between reference audio and generated speech, measured with [ECAPA](https://huggingface.co/balacoon/ecapa).", "redimnet_secs": "Speaker Embedding Cosine Similarity between reference audio and generated speech, measured with [ReDimNet](https://github.com/IDRnD/redimnet).", } # compose a description for columns description = "" for k in df.columns: if k in description_map: description += f"* {description_map[k]}\n" if any("aesthetics_" in k for k in df.columns): description += "* Enjoyment / Usefulness / Complexity / Quality are Aesthetics metrics, measured with [audiobox-aesthetics](https://github.com/facebookresearch/audiobox-aesthetics).\n" # Only rename columns that exist in the dataframe existing_columns = {k: v for k, v in rename_map.items() if k in df.columns} df = df.rename(columns=existing_columns) return df, description 
def get_leaderboard_data(system_type: str, dataset: str) -> "tuple[pd.DataFrame, list[str], str]":
    """
    Fetches metrics.yaml for all systems of given type
    if they have evaluation for the given dataset.

    Lists the folders directly under `system_type` in the
    `balacoon/speech_gen_baselines` dataset repo and, for each of them,
    downloads `<model_dir>/<dataset>/metrics.yaml`. Models without that file,
    or whose file cannot be fetched or parsed, are skipped.

    Args:
        system_type: folder in the dataset repo that groups the systems.
        dataset: name of the evaluation dataset sub-folder to look for.

    Returns:
        A `(df, datatypes, description)` tuple:
        `df` has one row per system with a "Model" column (markdown) followed
        by metric columns; `datatypes` is "markdown" for the first column and
        "number" for each remaining one; `description` is the markdown text
        describing the metric columns. When no system has metrics,
        an empty table with only the "Model" column is returned.
    """
    repo_id = "balacoon/speech_gen_baselines"
    # Keep a slow or unresponsive server from blocking the caller forever.
    request_timeout = 30

    # Get all models under the system type
    models_tree = api.list_repo_tree(
        repo_id,
        repo_type="dataset",
        path_in_repo=system_type,
        recursive=False,
    )
    model_dirs = [item.path for item in models_tree if isinstance(item, RepoFolder)]

    # Collect metrics for each model that has the dataset
    metrics_data = []
    for model_dir in model_dirs:
        model_name = model_dir.split("/")[-1]
        metrics_path = f"{model_dir}/{dataset}/metrics.yaml"

        try:
            url = hf_hub_url(
                repo_id=repo_id,
                filename=metrics_path,
                repo_type="dataset",
            )
            response = requests.get(url, timeout=request_timeout)
            if response.status_code == 404:
                # Skip if metrics.yaml doesn't exist for this model/dataset
                continue
            response.raise_for_status()
            metrics_content = yaml.safe_load(response.text)
        except (requests.RequestException, yaml.YAMLError) as err:
            # Best effort: one broken entry must not take the whole table down.
            logging.warning("Failed to fetch or parse %s: %s", metrics_path, err)
            continue

        # An empty or malformed YAML document is not a mapping.
        if not isinstance(metrics_content, dict) or "metrics" not in metrics_content:
            logging.error(f"`metrics` are missing from metrics.yaml ({metrics_path})")
            continue

        # prepare a row for the table
        if "model_name" in metrics_content:
            # overwrite model name
            model_name = metrics_content["model_name"]
        # add a link to a model if it is provided
        if "link" in metrics_content:
            model_name = f"[{model_name}]({metrics_content['link']})"
        row = {"Model": model_name}

        try:
            # Round all metric values to 4 decimal places
            rounded_metrics = {
                k: float(f"{float(v):.4f}")
                for k, v in metrics_content["metrics"].items()
            }
        except (AttributeError, TypeError, ValueError) as err:
            # `metrics` is not a mapping or holds a non-numeric value.
            logging.warning("Skipping %s, invalid metric values: %s", metrics_path, err)
            continue

        row.update(rounded_metrics)
        metrics_data.append(row)

    if not metrics_data:
        # Nothing to prettify: return an empty table that keeps the "Model" column.
        return pd.DataFrame(columns=["Model"]), ["markdown"], ""

    df = pd.DataFrame(metrics_data)
    # Rename metric columns to display names and get their markdown description
    df, description = make_pretty(df)

    # compose datatypes for the table: markdown for model name, and number for all other columns
    datatypes = ["markdown"] + ["number"] * (len(df.columns) - 1)
    return df, datatypes, description