clementruhm committed on
Commit
0dc360b
·
1 Parent(s): c4d0aea

Initial version of leaderboard

Browse files
Files changed (10) hide show
  1. .gitignore +1 -0
  2. README.md +2 -1
  3. api.py +11 -0
  4. app.py +37 -0
  5. dataset_utils.py +89 -0
  6. leaderboard.py +56 -0
  7. leaderboard_data.py +71 -0
  8. requirements.txt +2 -0
  9. samples.py +73 -0
  10. samples_data.py +116 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
README.md CHANGED
@@ -11,4 +11,5 @@ license: apache-2.0
11
  short_description: Objective evaluations for speech generation systems
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
11
  short_description: Objective evaluations for speech generation systems
12
  ---
13
 
14
+ Fetches data from `balacoon/speech_gen_baselines` and `balacoon/speech_gen_eval_testsets` datasets on Hugging Face to create a leaderboard for speech generation systems.
15
+ Metrics are computed with the `speech_gen_eval` library. The leaderboard lets you compare the metrics and listen to the samples.
api.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2025 Balacoon
3
+
4
+ api obj reused when interacting with datasets
5
+ """
6
+
7
+ from huggingface_hub import HfApi
8
+ api = HfApi()
9
+
10
+
11
+
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2025 Balacoon
3
+
4
+ TTSLeaderboard app.
5
+ """
6
+
7
+ import gradio as gr
8
+ from leaderboard import create_leaderboard_tab
9
+ from samples import create_samples_tab
10
+
11
+
12
def main():
    """
    Assemble the TTSLeaderboard page and start the Gradio server.

    The page consists of a header and two tabs: the metrics leaderboard
    and the audio samples browser.
    """
    # footer is hidden through custom css
    with gr.Blocks(css="footer {visibility: hidden}") as blocks:
        gr.Markdown(
            """
            # TTSLeaderboard
            ### Objective evaluation of speech generation systems
            Evaluations are done with [speech_gen_eval](https://github.com/balacoon/speech_gen_eval),
            on [speech_gen_eval_testsets](https://huggingface.co/datasets/balacoon/speech_gen_eval_testsets).
            """
        )

        with gr.Tabs():
            # tab with metrics table
            with gr.TabItem("📊 Leaderboard"):
                create_leaderboard_tab()

            # tab with playable samples, followed by a short usage hint
            with gr.TabItem("🔊 Samples"):
                create_samples_tab()
                gr.Markdown("""
                Click on the audio files to play sample generations from each model.
                """)

    blocks.launch()
34
+
35
+
36
+ if __name__ == "__main__":
37
+ main()
dataset_utils.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2025 Balacoon
3
+
4
+ Utils to interact with the `metrics` dataset.
5
+ """
6
+
7
+ from typing import Optional
8
+ from huggingface_hub.hf_api import RepoFolder
9
+
10
+ from api import api
11
+
12
+ baselines_repo = "balacoon/speech_gen_baselines"
13
+
14
def get_system_types() -> list[str]:
    """
    List system types for which a leaderboard can be shown.

    System types are the top-level folders of the
    `balacoon/speech_gen_baselines` dataset, which stores synthesis
    produced by different models, for example:
    ```
    speech_gen_baselines/
        zero-tts/
        vocoder/
    ```
    """
    entries = api.list_repo_tree(
        baselines_repo,
        repo_type="dataset",
        recursive=False,
    )
    # only folders are system types, plain files at the root are skipped
    return [entry.path for entry in entries if isinstance(entry, RepoFolder)]
33
+
34
+
35
def get_models(system_type: str) -> list[str]:
    """
    List names of the models stored under the given system type.
    For example, for system_type="zero-tts", returns ["xtts", "yourtts"].
    """
    entries = api.list_repo_tree(
        baselines_repo,
        repo_type="dataset",
        path_in_repo=system_type,
        recursive=False,
    )
    names = []
    for entry in entries:
        if not isinstance(entry, RepoFolder):
            continue
        # paths are relative to the repo root, keep the last component only
        names.append(entry.path.split('/')[-1])
    return names
50
+
51
+
52
def get_datasets(system_type: str, model_dirs: Optional[list[str]] = None, return_union: bool = True) -> list[str]:
    """
    Get datasets for which evaluation results are available for the given system type.

    Goes through the requested models under the system type and lists
    the dataset folders under each of them.
    The repository has the following structure:
    ```
    speech_gen_baselines/
        zero-tts/
            xtts/
                vctk/
                daps_celeb/
            yourtts/
                vctk/
                daps_celeb/
    ```

    Args:
        system_type: top-level folder of the baselines repo, e.g. "zero-tts".
        model_dirs: model names to inspect; when None, all models
            under `system_type` are used.
        return_union: if True, return datasets available for at least one
            of the models; otherwise only datasets available for all of them.

    Returns:
        Sorted list of dataset names; empty if there are no models to inspect.
    """
    if model_dirs is None:
        # Get all models under the system type
        model_dirs = get_models(system_type)

    # Collect the set of datasets available for each model
    datasets_per_model = []
    for model_dir in model_dirs:
        datasets_tree = api.list_repo_tree(
            baselines_repo,
            repo_type="dataset",
            path_in_repo=f"{system_type}/{model_dir}",
            recursive=False,
        )
        datasets_per_model.append(
            {item.path.split('/')[-1] for item in datasets_tree if isinstance(item, RepoFolder)}
        )

    if not datasets_per_model:
        # `set.intersection()` raises TypeError when called without arguments,
        # which happens when the model selection is empty
        return []

    if return_union:
        # all possible datasets for these models
        return sorted(set().union(*datasets_per_model))
    # only datasets which are present in all models
    return sorted(set.intersection(*datasets_per_model))
leaderboard.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2025 Balacoon
3
+
4
+ Leaderboard tab for TTSLeaderboard.
5
+ """
6
+
7
+ import gradio as gr
8
+
9
+ from dataset_utils import get_system_types, get_datasets
10
+ from leaderboard_data import get_leaderboard_data
11
+
12
def create_leaderboard_tab():
    """
    Build the leaderboard tab.

    Lays out a system type dropdown, a dataset dropdown and a metrics
    table, and wires the change events that keep them in sync.
    """
    with gr.Row():
        gr.Markdown("""
        Select system type and a dataset, to see the leaderboard.
        """)
    with gr.Row():
        system_type = gr.Dropdown(choices=get_system_types(), label="System Type", value=None)
        dataset = gr.Dropdown(choices=[], label="Dataset", value=None)
    with gr.Row():
        table = gr.DataFrame(value=None, row_count=0, col_count=0)

    def update_datasets(system_type: str):
        # a new system type invalidates both the dataset choice and the table
        if system_type:
            dataset_choices = get_datasets(system_type)
        else:
            dataset_choices = []
        refreshed_dropdown = gr.Dropdown(choices=dataset_choices, value=None)
        emptied_table = gr.DataFrame(value=None, row_count=0, col_count=0)
        return [refreshed_dropdown, emptied_table]

    def update_table(system_type: str, dataset: str):
        # the table stays empty until both selections are made
        selection_complete = bool(system_type) and bool(dataset)
        if not selection_complete:
            return gr.DataFrame(value=None, row_count=0)

        frame, column_types = get_leaderboard_data(system_type, dataset)
        n_rows = len(frame)
        return gr.DataFrame(value=frame, row_count=(n_rows, "fixed"), datatype=column_types)

    system_type.change(fn=update_datasets, inputs=[system_type], outputs=[dataset, table])
    dataset.change(fn=update_table, inputs=[system_type, dataset], outputs=[table])
leaderboard_data.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2025 Balacoon
3
+
4
+ Utils to get data to populate leaderboard.
5
+ Communicates with `balacoon/speech_gen_baselines` dataset on Hugging Face,
6
+ that contains evaluation results of different speech generation systems.
7
+ """
8
+
9
+ import yaml
10
+ import logging
11
+ import requests
12
+
13
+ import pandas as pd
14
+ from huggingface_hub import hf_hub_url
15
+ from huggingface_hub.hf_api import RepoFolder
16
+
17
+ from api import api
18
+
19
+
20
def get_leaderboard_data(system_type: str, dataset: str) -> tuple[pd.DataFrame, list[str]]:
    """
    Fetches metrics.yaml for all systems of given type if they have evaluation for the given dataset.

    Models without a metrics.yaml for the dataset are skipped silently;
    any other problem (network error, unexpected HTTP status, malformed
    YAML, non-numeric metric) is logged and the model is skipped.

    Args:
        system_type: top-level folder of the baselines repo, e.g. "zero-tts".
        dataset: name of the dataset folder under each model.

    Returns:
        A tuple of the DataFrame with one row per model ("Model" column
        followed by metric columns) and the list of column datatypes
        for the table: "markdown" for the model, "number" for metrics.
    """
    repo_id = "balacoon/speech_gen_baselines"

    # Get all models under the system type
    models_tree = api.list_repo_tree(
        repo_id,
        repo_type="dataset",
        path_in_repo=system_type,
        recursive=False,
    )
    model_dirs = [item.path for item in models_tree if isinstance(item, RepoFolder)]

    # Collect metrics for each model that has the dataset
    metrics_data = []
    for model_dir in model_dirs:
        model_name = model_dir.split('/')[-1]
        metrics_path = f"{model_dir}/{dataset}/metrics.yaml"
        url = hf_hub_url(repo_id=repo_id, filename=metrics_path, repo_type="dataset")

        try:
            # timeout so that a stalled connection does not freeze the UI callback
            response = requests.get(url, timeout=30)
        except requests.RequestException as err:
            logging.error(f"Failed to fetch metrics.yaml ({metrics_path}): {err}")
            continue
        if response.status_code == 404:
            # metrics.yaml doesn't exist for this model/dataset, nothing to show
            continue
        if not response.ok:
            logging.error(f"Failed to fetch metrics.yaml ({metrics_path}): HTTP {response.status_code}")
            continue

        try:
            metrics_content = yaml.safe_load(response.text)
        except yaml.YAMLError as err:
            logging.error(f"Failed to parse metrics.yaml ({metrics_path}): {err}")
            continue
        if not isinstance(metrics_content, dict) or "metrics" not in metrics_content:
            logging.error(f"`metrics` are missing from metrics.yaml ({metrics_path})")
            continue

        # prepare a row for the table
        if "model_name" in metrics_content:
            # overwrite model name
            model_name = metrics_content["model_name"]
        # add a link to a model if it is provided
        if "link" in metrics_content:
            model_name = f"[{model_name}]({metrics_content['link']})"
        row = {"Model": model_name}
        try:
            # Round all metric values to 4 decimal places
            rounded_metrics = {k: round(float(v), 4) for k, v in metrics_content["metrics"].items()}
        except (AttributeError, TypeError, ValueError) as err:
            logging.error(f"Malformed `metrics` in metrics.yaml ({metrics_path}): {err}")
            continue
        row.update(rounded_metrics)
        metrics_data.append(row)

    df = pd.DataFrame(metrics_data)
    # Remove 'aesthetics_' prefix from column names where applicable
    # (prefix only, the rest of the name is left untouched)
    df.columns = [col.removeprefix('aesthetics_') for col in df.columns]
    # compose datatypes for the table: markdown for model name, and number for all other columns
    datatypes = ["markdown"] + ["number"] * (len(df.columns) - 1)
    return df, datatypes
71
+
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ huggingface_hub
2
+ pandas
samples.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2025 Balacoon
3
+
4
+ Samples tab for TTSLeaderboard.
5
+ """
6
+
7
+ import gradio as gr
8
+
9
+ from dataset_utils import get_system_types, get_models, get_datasets
10
+ from samples_data import get_samples_data
11
+
12
def create_samples_tab():
    """
    Build the samples tab.

    Lays out dropdowns for system type, models (multi-select) and dataset,
    plus a table with playable audio samples, and wires the change events:
    system type -> models, models -> datasets, dataset -> table.
    """
    with gr.Row():
        gr.Markdown("""
        Select system type, models of interest, and a dataset, to see samples.
        """)
    with gr.Row():
        system_type = gr.Dropdown(choices=get_system_types(), label="System Type", value=None)
        models = gr.Dropdown(choices=[], label="Models", value=None, multiselect=True)
        dataset = gr.Dropdown(choices=[], label="Dataset", value=None)
    with gr.Row():
        table = gr.DataFrame(
            value=None,
            row_count=0,
            col_count=0,
        )

    def update_models(system_type: str):
        # When system type changes, refresh models and clear dataset and table
        models = get_models(system_type) if system_type else []
        return [
            gr.Dropdown(choices=models, value=None),
            gr.Dropdown(choices=[], value=None),
            gr.DataFrame(value=None, row_count=0, col_count=0)
        ]

    def update_datasets(system_type: str, models: list[str]):
        # Without a system type or any selected model there is nothing to look up:
        # skip the repo queries and just clear the dataset choices.
        if system_type and models:
            # only datasets shared by all selected models can be compared side by side
            datasets = get_datasets(system_type, models, return_union=False)
        else:
            datasets = []
        return [
            gr.Dropdown(choices=datasets, value=None),
            gr.DataFrame(value=None, row_count=0, col_count=0)
        ]

    def update_table(system_type: str, models: list[str], dataset: str):
        # Only populate table when all selections are made
        if not system_type or not dataset or not models:
            return gr.DataFrame(value=None, row_count=0)

        df, datatypes = get_samples_data(system_type, models, dataset)
        return gr.DataFrame(
            value=df,
            row_count=(len(df), "fixed"),
            datatype=datatypes,
            wrap=True,
        )

    system_type.change(
        fn=update_models,
        inputs=[system_type],
        outputs=[models, dataset, table]
    )

    models.change(
        fn=update_datasets,
        inputs=[system_type, models],
        outputs=[dataset, table]
    )

    dataset.change(
        fn=update_table,
        inputs=[system_type, models, dataset],
        outputs=[table]
    )
samples_data.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Copyright 2025 Balacoon
3
+
4
+ Fetches samples from `balacoon/speech_gen_baselines` and
5
+ `balacoon/speech_gen_eval_testsets` datasets.
6
+ """
7
+
8
+ import re
9
+ import logging
10
+ import requests
11
+
12
+ import pandas as pd
13
+
14
+ from huggingface_hub import hf_hub_url
15
+
16
+
17
def _fetch_lines(repo_id: str, filename: str) -> list[str]:
    """Download a text file from a dataset repo and return its non-blank lines."""
    url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset")
    response = requests.get(url, timeout=30)
    # without this check an error page would be parsed as if it were the file content
    response.raise_for_status()
    return [line for line in response.text.splitlines() if line.strip()]


def _audio_tag(repo_id: str, filename: str) -> str:
    """Compose an html audio player pointing to a file in a dataset repo."""
    url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset")
    return f"<audio src='{url}' controls></audio>"


def get_samples_data(system_type: str, models: list[str], dataset: str) -> tuple[pd.DataFrame, list[str]]:
    """
    Fetches `demo` and `id_mapping` from `balacoon/speech_gen_eval_testsets` for the given dataset.
    Then links reference files according to `id_mapping` from `balacoon/speech_gen_eval_testsets`.
    Finally links synthetic samples for different models from `balacoon/speech_gen_baselines`
    according to `demo`.

    Args:
        system_type: type of the systems, e.g. "zero-tts"; for "vocoder"
            each demo id is its own reference, so `id_mapping` is not read.
        models: names of the models to show samples for.
        dataset: name of the test set.

    Returns:
        A tuple of the DataFrame with columns "id", "text", "reference"
        and one column per model (audio players as html), and the list of
        column datatypes for the table. The DataFrame is empty if `demo`
        or `id_mapping` could not be downloaded.

    Raises:
        ValueError: if some demo id is missing from the id mapping.
    """
    testsets_repo = "balacoon/speech_gen_eval_testsets"
    baselines_repo = "balacoon/speech_gen_baselines"
    datatypes = ["text", "text", "markdown"] + ["markdown"] * len(models)

    # 1. get demo and id_mapping
    try:
        # each demo line is "<id> <text>"; text may be absent
        demo = []
        for line in _fetch_lines(testsets_repo, f"{dataset}/demo"):
            fields = re.split(r"\s+", line.strip(), maxsplit=1)
            demo.append((fields[0], fields[1] if len(fields) > 1 else ""))

        if system_type == "vocoder":
            # no need for mapping, mapping is to itself
            mapping = {name: name for name, _ in demo}
        else:
            # each id_mapping line is "<id> <reference id>"
            mapping = {}
            for line in _fetch_lines(testsets_repo, f"{dataset}/id_mapping"):
                fields = line.split()
                if len(fields) < 2:
                    logging.error(f"Malformed id_mapping line for {dataset}: {line!r}")
                    continue
                mapping[fields[0]] = fields[1]
    except requests.RequestException as e:
        logging.error(f"Failed to read demo / mapping for {dataset}: {e}")
        # callers unpack (df, datatypes), so keep the shape of the result
        return pd.DataFrame(), datatypes

    # 2. get reference files
    if not all(sample_id in mapping for sample_id, _ in demo):
        raise ValueError(f"Failed to fetch demo or mapping for {dataset}, refresh the page.")
    reference_samples = {
        ref_id: _audio_tag(testsets_repo, f"{dataset}/wav/{ref_id}.wav")
        for ref_id in {mapping[sample_id] for sample_id, _ in demo}
    }

    # 3. get synthetic samples and create a dataframe
    rows = []
    for sample_id, txt in demo:
        row = {
            "id": sample_id,
            "text": txt,
            "reference": reference_samples[mapping[sample_id]],
        }
        for model in models:
            row[model] = _audio_tag(
                baselines_repo, f"{system_type}/{model}/{dataset}/wav/{sample_id}.wav"
            )
        rows.append(row)
    return pd.DataFrame(rows), datatypes
115
+
116
+