""" Copyright 2025 Balacoon Utils to interact with the `metrics` dataset. """ from typing import Optional from huggingface_hub.hf_api import RepoFolder from api import api baselines_repo = "balacoon/speech_gen_baselines" def get_system_types() -> list[str]: """ Get what types of systems user can check a leaderboard for. We check `balacoon/speech_gen_baselines` dataset, where synthesis from different models are stored. For example, the dataset would have the following structure: ``` speech_gen_baselines/ zero-tts/ vocoder/ ``` """ repo_tree = api.list_repo_tree( baselines_repo, repo_type="dataset", recursive=False ) top_level_dirs = [item.path for item in repo_tree if isinstance(item, RepoFolder)] return top_level_dirs def get_models(system_type: str) -> list[str]: """ Get all models under the given system type. For example, for system_type="zero-tts", returns ["xtts", "yourtts"]. """ models_tree = api.list_repo_tree( baselines_repo, repo_type="dataset", path_in_repo=system_type, recursive=False ) model_dirs = [item.path for item in models_tree if isinstance(item, RepoFolder)] # Extract just the model names from the full paths model_names = [path.split('/')[-1] for path in model_dirs] return model_names def get_datasets(system_type: str, model_dirs: Optional[list[str]] = None, return_union: bool = True) -> list[str]: """ Get what metrics on which datasets are available for the given system type. Go through all systems under system type, and check datasets under each system. The dataset would have the following structure: ``` speech_gen_baselines/ zero-tts/ xtts/ vctk/ daps_celeb/ yourtts/ vctk/ daps_celeb/ ``` """ if model_dirs is None: # Get all models under the system type model_dirs = get_models(system_type) # Get all unique datasets across all models datasets_per_model = [] for model_dir in model_dirs: datasets_tree = api.list_repo_tree( baselines_repo, repo_type="dataset", path_in_repo=system_type + "/" + model_dir, recursive=False ) model_datasets = [item.path.split('/')[-1] for item in datasets_tree if isinstance(item, RepoFolder)] datasets_per_model.append(model_datasets) if return_union: # return all possible datasets for these models return sorted(list(set().union(*datasets_per_model))) else: # return only datasets which are present in all models return sorted(list(set.intersection(*map(set, datasets_per_model))))