"""
Copyright 2025 Balacoon
Fetches samples from `balacoon/speech_gen_baselines` and
`balacoon/speech_gen_eval_testsets` datasets.
"""
import re
import logging
import requests
import pandas as pd
from huggingface_hub import hf_hub_url
_TESTSETS_REPO = "balacoon/speech_gen_eval_testsets"
_BASELINES_REPO = "balacoon/speech_gen_baselines"
# Upper bound (seconds) for a single metadata download, so that a stalled
# connection cannot hang the caller forever.
_REQUEST_TIMEOUT_S = 30


def _fetch_lines(repo_id: str, filename: str) -> list[str]:
    """Download a text file from a dataset repo and return its stripped, non-blank lines."""
    url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset")
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_S)
    # Without this check an HTTP error page would be parsed as file content.
    response.raise_for_status()
    stripped = (line.strip() for line in response.text.splitlines())
    return [line for line in stripped if line]


def _audio_cell(repo_id: str, filename: str) -> str:
    """Return an HTML audio player snippet pointing at a wav file in a dataset repo."""
    url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset")
    # NOTE(review): the original stored an empty string here and discarded the
    # URL; the audio tag is the presumed intent for a "markdown" cell - confirm.
    return f"<audio src='{url}' controls></audio>"


def get_samples_data(
    system_type: str, models: list[str], dataset: str
) -> tuple[pd.DataFrame, list[str]]:
    """
    Builds a table of reference and synthetic audio samples for a dataset.

    Fetches `demo` and `id_mapping` from `balacoon/speech_gen_eval_testsets` for the
    given dataset. Then builds links to reference files according to `id_mapping`
    in `balacoon/speech_gen_eval_testsets`, and links to synthetic samples of each
    model in `balacoon/speech_gen_baselines` according to `demo`.
    Only URLs are constructed for the wav files; the audio itself is not
    downloaded by this function.

    Args:
        system_type: top-level directory of the system in the baselines repo.
            For "vocoder" every utterance is its own reference and `id_mapping`
            is not fetched.
        models: names of the models to add as columns.
        dataset: name of the dataset directory in both repos.

    Returns:
        A tuple of (dataframe, datatypes). The dataframe has columns `id`,
        `text`, `reference` and one column per model; datatypes holds one entry
        per column ("text" or "markdown"). If `demo` / `id_mapping` can't be
        fetched or parsed, the error is logged and an empty dataframe with an
        empty datatypes list is returned.

    Raises:
        ValueError: if some demo id is missing from the id mapping.
    """
    # 1. get demo and id_mapping
    try:
        demo = []
        for line in _fetch_lines(_TESTSETS_REPO, f"{dataset}/demo"):
            # each line is "<id> <text>"; tolerate a line without text
            parts = re.split(r"\s+", line, maxsplit=1)
            demo.append((parts[0], parts[1] if len(parts) > 1 else ""))
        if system_type == "vocoder":
            # no need for mapping, mapping is to itself
            mapping = {utt_id: utt_id for utt_id, _ in demo}
        else:
            # each line is "<id> <reference id>"; any other shape raises
            # ValueError, which is reported below
            mapping = dict(
                line.split()
                for line in _fetch_lines(_TESTSETS_REPO, f"{dataset}/id_mapping")
            )
    except Exception as e:
        logging.error(f"Failed to read demo / mapping for {dataset}: {e}")
        # keep the (dataframe, datatypes) shape so callers can always unpack
        return pd.DataFrame(), []

    # 2. get reference files
    if not all(utt_id in mapping for utt_id, _ in demo):
        raise ValueError(f"Failed to fetch demo or mapping for {dataset}, refresh the page.")
    reference_samples = {}
    # dict.fromkeys removes duplicate reference ids while keeping demo order
    for ref_id in dict.fromkeys(mapping[utt_id] for utt_id, _ in demo):
        try:
            reference_samples[ref_id] = _audio_cell(
                _TESTSETS_REPO, f"{dataset}/wav/{ref_id}.wav"
            )
        except Exception as e:
            logging.error(f"Failed to read reference {ref_id} for {dataset}: {e}")

    # 3. get synthetic samples
    systems_samples = {model: {} for model in models}
    for model in models:
        for utt_id, _ in demo:
            filename = f"{system_type}/{model}/{dataset}/wav/{utt_id}.wav"
            try:
                systems_samples[model][utt_id] = _audio_cell(_BASELINES_REPO, filename)
            except Exception as e:
                logging.error(f"Failed to read sample {utt_id} from {filename} in {dataset}: {e}")

    # keep only demo ids that have a reference and a sample from every model
    rows = []
    for utt_id, txt in demo:
        ref_id = mapping[utt_id]
        if ref_id not in reference_samples:
            continue
        if not all(utt_id in systems_samples[model] for model in models):
            continue
        row = {
            "id": utt_id,
            "text": txt,
            "reference": reference_samples[ref_id],
        }
        for model in models:
            row[model] = systems_samples[model][utt_id]
        rows.append(row)

    # id and text are plain text, audio players are rendered as markdown
    datatypes = ["text", "text", "markdown"] + ["markdown"] * len(models)
    return pd.DataFrame(rows), datatypes