""" Copyright 2025 Balacoon Fetches samples from `balacoon/speech_gen_baselines` and `balacoon/speech_gen_eval_testsets` datasets. """ import re import logging import requests import pandas as pd from huggingface_hub import hf_hub_url def get_samples_data(system_type: str, models: list[str], dataset: str) -> tuple[pd.DataFrame, list[str]]: """ Fetches `demo` and `id_mapping` from `balacoon/speech_gen_eval_testsets` for the given dataset. Then fetches reference files according to `id_mapping` from `balacoon/speech_gen_eval_testsets`. Finally fetches synthetic samples for different models from `balacoon/speech_gen_baselines` according to `demo`. """ testsets_repo = "balacoon/speech_gen_eval_testsets" # 1. get demo and id_mapping demo_path = f"{dataset}/demo" id_mapping_path = f"{dataset}/id_mapping" try: # read demo ids url = hf_hub_url( repo_id=testsets_repo, filename=demo_path, repo_type="dataset" ) response = requests.get(url) demo = response.text.splitlines() demo = [re.split(r"\s+", x.strip(), maxsplit=1) for x in demo] if system_type == "vocoder": # no need for mapping, mapping is to itself mapping = {name: name for name, _ in demo} else: # read id mapping url = hf_hub_url( repo_id=testsets_repo, filename=id_mapping_path, repo_type="dataset" ) response = requests.get(url) mapping = response.text.splitlines() mapping = [x.split() for x in mapping] mapping = {k: v for k, v in mapping} except Exception as e: logging.error(f"Failed to read demo / mapping for {dataset}: {e}") return pd.DataFrame() # 2. get reference files if not all(x in mapping for x, _ in demo): raise ValueError(f"Failed to fetch demo or mapping for {dataset}, refresh the page.") ref_ids = list(set([mapping[x] for x, _ in demo])) reference_samples = {} for id in ref_ids: try: url = hf_hub_url( repo_id=testsets_repo, filename=f"{dataset}/wav/{id}.wav", repo_type="dataset" ) reference_samples[id] = f"" except Exception as e: logging.error(f"Failed to read reference {id} for {dataset}: {e}") continue # 3. get synthetic samples systems_samples = {model: {} for model in models} baselines_repo = "balacoon/speech_gen_baselines" for model in models: for id, _ in demo: try: filename = f"{system_type}/{model}/{dataset}/wav/{id}.wav" url = hf_hub_url( repo_id=baselines_repo, filename=filename, repo_type="dataset" ) systems_samples[model][id] = f"" except Exception as e: logging.error(f"Failed to read sample {id} from {filename} in {dataset}: {e}") continue # filter out demo ids, checking if all samples are present filtered_demo = [] for id, txt in demo: if id not in mapping: continue ref_id = mapping[id] if ref_id not in reference_samples: continue if all(id in systems_samples[model] for model in models): filtered_demo.append((id, txt)) # finally create a dataframe rows = [] for id, txt in filtered_demo: row = { "id": id, "text": txt, "reference": reference_samples[mapping[id]], } for model in models: row[model] = systems_samples[model][id] rows.append(row) datatypes = ["text", "text", "markdown"] + ["markdown"] * len(models) return pd.DataFrame(rows), datatypes