Spaces:

balacoon
/

TTSLeaderboard

Running

App Files Files Community

clementruhm committed on Feb 15

Commit

0dc360b

1 Parent(s): c4d0aea

Initial version of leaderboard

Browse files

Files changed (10) hide show

.gitignore +1 -0
README.md +2 -1
api.py +11 -0
app.py +37 -0
dataset_utils.py +89 -0
leaderboard.py +56 -0
leaderboard_data.py +71 -0
requirements.txt +2 -0
samples.py +73 -0
samples_data.py +116 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

README.md CHANGED Viewed

@@ -11,4 +11,5 @@ license: apache-2.0
 short_description: Objective evaluations for speech generation systems
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: Objective evaluations for speech generation systems
 ---
+Fetches data from the `balacoon/speech_gen_baselines` and `balacoon/speech_gen_eval_testsets` datasets on Hugging Face to create a leaderboard for speech generation systems.
+Metrics are computed with the `speech_gen_eval` library. The leaderboard lets you compare the metrics and listen to the samples.

api.py ADDED Viewed

	@@ -0,0 +1,11 @@

+"""
+Copyright 2025 Balacoon
+api obj reused when interacting with datasets
+"""
+from huggingface_hub import HfApi
+# Single Hugging Face Hub client created at import time; the other modules
+# import this object (`from api import api`) instead of building their own.
+api = HfApi()

app.py ADDED Viewed

	@@ -0,0 +1,37 @@

+"""
+Copyright 2025 Balacoon
+TTSLeaderboard app.
+"""
+import gradio as gr
+from leaderboard import create_leaderboard_tab
+from samples import create_samples_tab
+def main():
+    with gr.Blocks(css="footer {visibility: hidden}") as demo:
+        gr.Markdown(
+            """
+            # TTSLeaderboard
+            ### Objective evaluation of speech generation systems
+            Evaluations are done with [speech_gen_eval](https://github.com/balacoon/speech_gen_eval),
+            on [speech_gen_eval_testsets](https://huggingface.co/datasets/balacoon/speech_gen_eval_testsets).
+            """
+        )
+        with gr.Tabs():
+            with gr.TabItem("📊 Leaderboard"):
+                create_leaderboard_tab()
+            with gr.TabItem("🔊 Samples"):
+                create_samples_tab()
+                gr.Markdown("""
+                Click on the audio files to play sample generations from each model.
+                """)
+    demo.launch()
+if __name__ == "__main__":
+    main()

dataset_utils.py ADDED Viewed

	@@ -0,0 +1,89 @@

+"""
+Copyright 2025 Balacoon
+Utils to list the contents of the `balacoon/speech_gen_baselines` dataset.
+"""
+from typing import Optional
+from huggingface_hub.hf_api import RepoFolder
+from api import api
+# Id of the Hugging Face dataset repo whose folder tree the functions below list.
+baselines_repo = "balacoon/speech_gen_baselines"
+def get_system_types() -> list[str]:
+    """
+    Get what types of systems user can check a leaderboard for.
+    We check `balacoon/speech_gen_baselines` dataset,
+    where synthesis from different models are stored.
+    For example, the dataset would have the following structure:
+    ```
+    speech_gen_baselines/
+        zero-tts/
+        vocoder/
+    ```
+    """
+    repo_tree = api.list_repo_tree(
+        baselines_repo,
+        repo_type="dataset",
+        recursive=False
+    )
+    top_level_dirs = [item.path for item in repo_tree if isinstance(item, RepoFolder)]
+    return top_level_dirs
+def get_models(system_type: str) -> list[str]:
+    """
+    Get all models under the given system type.
+    For example, for system_type="zero-tts", returns ["xtts", "yourtts"].
+    """
+    models_tree = api.list_repo_tree(
+        baselines_repo,
+        repo_type="dataset",
+        path_in_repo=system_type,
+        recursive=False
+    )
+    model_dirs = [item.path for item in models_tree if isinstance(item, RepoFolder)]
+    # Extract just the model names from the full paths
+    model_names = [path.split('/')[-1] for path in model_dirs]
+    return model_names
+def get_datasets(system_type: str, model_dirs: Optional[list[str]] = None, return_union: bool = True) -> list[str]:
+    """
+    Get what metrics on which datasets are available for the given system type.
+    Go through all systems under system type, and check datasets under each system.
+    The dataset would have the following structure:
+    ```
+    speech_gen_baselines/
+        zero-tts/
+            xtts/
+                vctk/
+                daps_celeb/
+            yourtts/
+                vctk/
+                daps_celeb/
+    ```
+    """
+    if model_dirs is None:
+        # Get all models under the system type
+        model_dirs = get_models(system_type)
+    # Get all unique datasets across all models
+    datasets_per_model = []
+    for model_dir in model_dirs:
+        datasets_tree = api.list_repo_tree(
+            baselines_repo,
+            repo_type="dataset",
+            path_in_repo=system_type + "/" + model_dir,
+            recursive=False
+        )
+        model_datasets = [item.path.split('/')[-1] for item in datasets_tree if isinstance(item, RepoFolder)]
+        datasets_per_model.append(model_datasets)
+    if return_union:
+        # return all possible datasets for these models
+        return sorted(list(set().union(*datasets_per_model)))
+    else:
+        # return only datasets which are present in all models
+        return sorted(list(set.intersection(*map(set, datasets_per_model))))

leaderboard.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""
+Copyright 2025 Balacoon
+Leaderboard tab for TTSLeaderboard.
+"""
+import gradio as gr
+from dataset_utils import get_system_types, get_datasets
+from leaderboard_data import get_leaderboard_data
+def create_leaderboard_tab():
+    with gr.Row():
+        gr.Markdown("""
+            Select system type and a dataset, to see the leaderboard.
+        """)
+    with gr.Row():
+        system_type = gr.Dropdown(choices=get_system_types(), label="System Type", value=None)
+        dataset = gr.Dropdown(choices=[], label="Dataset", value=None)
+    with gr.Row():
+        table = gr.DataFrame(
+            value=None,
+            row_count=0,
+            col_count=0,
+        )
+    def update_datasets(system_type: str):
+        # When system type changes, clear dataset and table
+        dataset_choices = get_datasets(system_type) if system_type else []
+        return [
+            gr.Dropdown(choices=dataset_choices, value=None),
+            gr.DataFrame(value=None, row_count=0, col_count=0)
+        ]
+    def update_table(system_type: str, dataset: str):
+        # Only populate table when both selections are made
+        if not system_type or not dataset:
+            return gr.DataFrame(value=None, row_count=0)
+        df, datatypes = get_leaderboard_data(system_type, dataset)
+        return gr.DataFrame(
+            value=df,
+            row_count=(len(df), "fixed"),
+            datatype=datatypes
+        )
+    system_type.change(
+        fn=update_datasets,
+        inputs=[system_type],
+        outputs=[dataset, table]
+    )
+    dataset.change(
+        fn=update_table,
+        inputs=[system_type, dataset],
+        outputs=[table]
+    )

leaderboard_data.py ADDED Viewed

	@@ -0,0 +1,71 @@

+"""
+Copyright 2025 Balacoon
+Utils to get data to populate leaderboard.
+Communicates with `balacoon/speech_gen_baselines` dataset on Hugging Face,
+that contains evaluation results of different speech generation systems.
+"""
+import yaml
+import logging
+import requests
+import pandas as pd
+from huggingface_hub import hf_hub_url
+from huggingface_hub.hf_api import RepoFolder
+from api import api
+def get_leaderboard_data(system_type: str, dataset: str) -> pd.DataFrame:
+    """
+    Fetches metrics.yaml for all systems of given type if they have evaluation for the given dataset.
+    Returns a DataFrame with metrics per system.
+    """
+    # Get all models under the system type
+    models_tree = api.list_repo_tree(
+        "balacoon/speech_gen_baselines",
+        repo_type="dataset",
+        path_in_repo=system_type,
+        recursive=False
+    )
+    model_dirs = [item.path for item in models_tree if isinstance(item, RepoFolder)]
+    # Collect metrics for each model that has the dataset
+    metrics_data = []
+    for model_dir in model_dirs:
+        model_name = model_dir.split('/')[-1]
+        metrics_path = f"{model_dir}/{dataset}/metrics.yaml"
+        try:
+            url = hf_hub_url(
+                repo_id="balacoon/speech_gen_baselines",
+                filename=metrics_path,
+                repo_type="dataset"
+            )
+            response = requests.get(url)
+            metrics_content = yaml.safe_load(response.text)
+            if "metrics" not in metrics_content:
+                logging.error(f"`metrics` are missing from metrics.yaml ({metrics_path})")
+                continue
+            # prepare a row for the table
+            if "model_name" in metrics_content:
+                # overwrite model name
+                model_name = metrics_content["model_name"]
+            # add a link to a model if it is provided
+            if "link" in metrics_content:
+                model_name = f"[{model_name}]({metrics_content['link']})"
+            row = {"Model": model_name}
+            # Round all metric values to 4 decimal places
+            rounded_metrics = {k: round(float(v), 4) for k, v in metrics_content["metrics"].items()}
+            row.update(rounded_metrics)
+            metrics_data.append(row)
+        except:
+            # Skip if metrics.yaml doesn't exist for this model/dataset
+            continue
+    df = pd.DataFrame(metrics_data)
+    # Remove 'aesthetics_' prefix from column names where applicable
+    df.columns = [col.replace('aesthetics_', '') if col.startswith('aesthetics_') else col for col in df.columns]
+    # compose datatypes for the table: markdown for model name, and number for all other columns
+    datatypes = ["markdown"] + ["number"] * (len(df.columns) - 1)
+    return df, datatypes

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ huggingface_hub
2	+ pandas

samples.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""
+Copyright 2025 Balacoon
+Samples tab for TTSLeaderboard.
+"""
+import gradio as gr
+from dataset_utils import get_system_types, get_models, get_datasets
+from samples_data import get_samples_data
+def create_samples_tab():
+    with gr.Row():
+        gr.Markdown("""
+            Select system type, models of interest, and a dataset, to see samples.
+        """)
+    with gr.Row():
+        system_type = gr.Dropdown(choices=get_system_types(), label="System Type", value=None)
+        models = gr.Dropdown(choices=[], label="Models", value=None, multiselect=True)
+        dataset = gr.Dropdown(choices=[], label="Dataset", value=None)
+    with gr.Row():
+        table = gr.DataFrame(
+            value=None,
+            row_count=0,
+            col_count=0,
+        )
+    def update_models(system_type: str):
+        # When system type changes, clear dataset and table
+        models = get_models(system_type) if system_type else []
+        return [
+            gr.Dropdown(choices=models, value=None),
+            gr.Dropdown(choices=[], value=None),
+            gr.DataFrame(value=None, row_count=0, col_count=0)
+        ]
+    def update_datasets(system_type: str, models: list[str]):
+        datasets = get_datasets(system_type, models, return_union=False)
+        return [
+            gr.Dropdown(choices=datasets, value=None),
+            gr.DataFrame(value=None, row_count=0, col_count=0)
+        ]
+    def update_table(system_type: str, models: list[str], dataset: str):
+        # Only populate table when both selections are made
+        if not system_type or not dataset or not models:
+            return gr.DataFrame(value=None, row_count=0)
+        df, datatypes = get_samples_data(system_type, models, dataset)
+        return gr.DataFrame(
+            value=df,
+            row_count=(len(df), "fixed"),
+            datatype=datatypes,
+            wrap=True,
+        )
+    system_type.change(
+        fn=update_models,
+        inputs=[system_type],
+        outputs=[models, dataset, table]
+    )
+    models.change(
+        fn=update_datasets,
+        inputs=[system_type, models],
+        outputs=[dataset, table]
+    )
+    dataset.change(
+        fn=update_table,
+        inputs=[system_type, models, dataset],
+        outputs=[table]
+    )

samples_data.py ADDED Viewed

	@@ -0,0 +1,116 @@

+"""
+Copyright 2025 Balacoon
+Fetches samples from `balacoon/speech_gen_baselines` and
+`balacoon/speech_gen_eval_testsets` datasets.
+"""
+import re
+import logging
+import requests
+import pandas as pd
+from huggingface_hub import hf_hub_url
+def get_samples_data(system_type: str, models: list[str], dataset: str) -> tuple[pd.DataFrame, list[str]]:
+    """
+    Fetches `demo` and `id_mapping` from `balacoon/speech_gen_eval_testsets` for the given dataset.
+    Then fetches reference files according to `id_mapping` from `balacoon/speech_gen_eval_testsets`.
+    Finally fetches synthetic samples for different models from `balacoon/speech_gen_baselines`
+    according to `demo`.
+    """
+    testsets_repo = "balacoon/speech_gen_eval_testsets"
+    # 1. get demo and id_mapping
+    demo_path = f"{dataset}/demo"
+    id_mapping_path = f"{dataset}/id_mapping"
+    try:
+        # read demo ids
+        url = hf_hub_url(
+            repo_id=testsets_repo,
+            filename=demo_path,
+            repo_type="dataset"
+        )
+        response = requests.get(url)
+        demo = response.text.splitlines()
+        demo = [re.split(r"\s+", x.strip(), maxsplit=1) for x in demo]
+        if system_type == "vocoder":
+            # no need for mapping, mapping is to itself
+            mapping = {name: name for name, _ in demo}
+        else:
+            # read id mapping
+            url = hf_hub_url(
+                repo_id=testsets_repo,
+                filename=id_mapping_path,
+                repo_type="dataset"
+            )
+            response = requests.get(url)
+            mapping = response.text.splitlines()
+            mapping = [x.split() for x in mapping]
+            mapping = {k: v for k, v in mapping}
+    except Exception as e:
+        logging.error(f"Failed to read demo / mapping for {dataset}: {e}")
+        return pd.DataFrame()
+    # 2. get reference files
+    if not all(x in mapping for x, _ in demo):
+        raise ValueError(f"Failed to fetch demo or mapping for {dataset}, refresh the page.")
+    ref_ids = list(set([mapping[x] for x, _ in demo]))
+    reference_samples = {}
+    for id in ref_ids:
+        try:
+            url = hf_hub_url(
+                repo_id=testsets_repo,
+                filename=f"{dataset}/wav/{id}.wav",
+                repo_type="dataset"
+            )
+            reference_samples[id] = f"<audio src='{url}' controls></audio>"
+        except Exception as e:
+            logging.error(f"Failed to read reference {id} for {dataset}: {e}")
+            continue
+    # 3. get synthetic samples
+    systems_samples = {model: {} for model in models}
+    baselines_repo = "balacoon/speech_gen_baselines"
+    for model in models:
+        for id, _ in demo:
+            try:
+                filename = f"{system_type}/{model}/{dataset}/wav/{id}.wav"
+                url = hf_hub_url(
+                    repo_id=baselines_repo,
+                    filename=filename,
+                    repo_type="dataset"
+                )
+                systems_samples[model][id] = f"<audio src='{url}' controls></audio>"
+            except Exception as e:
+                logging.error(f"Failed to read sample {id} from {filename} in {dataset}: {e}")
+                continue
+    # filter out demo ids, checking if all samples are present
+    filtered_demo = []
+    for id, txt in demo:
+        if id not in mapping:
+            continue
+        ref_id = mapping[id]
+        if ref_id not in reference_samples:
+            continue
+        if all(id in systems_samples[model] for model in models):
+            filtered_demo.append((id, txt))
+    # finally create a dataframe
+    rows = []
+    for id, txt in filtered_demo:
+        row = {
+            "id": id,
+            "text": txt,
+            "reference": reference_samples[mapping[id]],
+        }
+        for model in models:
+            row[model] = systems_samples[model][id]
+        rows.append(row)
+    datatypes = ["text", "text", "markdown"] + ["markdown"] * len(models)
+    return pd.DataFrame(rows), datatypes