Spaces:
Running
Running
""" | |
Copyright 2025 Balacoon
Fetches samples from `balacoon/speech_gen_baselines` and
`balacoon/speech_gen_eval_testsets` datasets.
""" | |
import re | |
import logging | |
import requests | |
import pandas as pd | |
from huggingface_hub import hf_hub_url | |
# Upper bound (seconds) for a single listing download, so an unresponsive
# server cannot block the caller indefinitely.
_REQUEST_TIMEOUT_S = 30


def _fetch_text(repo_id: str, filename: str) -> str:
    """Download a text file from a dataset repo on the hub and return its body."""
    url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset")
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_S)
    # Without this, the body of a 404/5xx error page would be parsed as data.
    response.raise_for_status()
    return response.text


def _audio_tag(repo_id: str, filename: str) -> str:
    """Build an HTML audio player whose source is a file in a dataset repo."""
    url = hf_hub_url(repo_id=repo_id, filename=filename, repo_type="dataset")
    return f"<audio src='{url}' controls></audio>"


def _parse_demo(text: str) -> list[tuple[str, str]]:
    """Parse `<id> <text>` lines into (id, text) pairs, skipping blank lines."""
    demo = []
    for line in text.splitlines():
        fields = line.strip().split(maxsplit=1)
        if not fields:
            # blank line (e.g. trailing newline at the end of the file)
            continue
        # a line carrying only an id gets an empty text instead of breaking
        # the `for utt_id, txt in demo` unpacking further down
        demo.append((fields[0], fields[1] if len(fields) > 1 else ""))
    return demo


def _parse_id_mapping(text: str) -> dict[str, str]:
    """Parse `<id> <reference id>` lines into a dict, skipping blank lines.

    Raises ValueError when a non-blank line does not hold exactly two fields.
    """
    return dict(line.split() for line in text.splitlines() if line.strip())


def get_samples_data(system_type: str, models: list[str], dataset: str) -> tuple[pd.DataFrame, list[str]]:
    """
    Builds a table of audio players for the demo utterances of a dataset.

    Fetches `demo` and `id_mapping` from `balacoon/speech_gen_eval_testsets` for the given dataset.
    Then builds players for reference files according to `id_mapping` from
    `balacoon/speech_gen_eval_testsets`.
    Finally builds players for synthetic samples of different models from
    `balacoon/speech_gen_baselines` according to `demo`.

    Args:
        system_type: first path component inside the baselines repo; for
            "vocoder" every demo id is used as its own reference id and
            `id_mapping` is not fetched.
        models: model names; each becomes a column of the resulting table.
        dataset: dataset directory name in both repos.

    Returns:
        A `(dataframe, datatypes)` pair. The dataframe has the columns `id`,
        `text`, `reference` and one column per model. `datatypes` holds one
        entry per column: "text" for `id` and `text`, "markdown" for the rest.
        When `demo` / `id_mapping` cannot be fetched or parsed, the error is
        logged and an empty dataframe with an empty datatypes list is returned.

    Raises:
        ValueError: if a demo id has no entry in the fetched mapping.
    """
    testsets_repo = "balacoon/speech_gen_eval_testsets"
    baselines_repo = "balacoon/speech_gen_baselines"

    # 1. get demo and id_mapping
    try:
        demo = _parse_demo(_fetch_text(testsets_repo, f"{dataset}/demo"))
        if system_type == "vocoder":
            # no need for mapping, mapping is to itself
            mapping = {utt_id: utt_id for utt_id, _ in demo}
        else:
            mapping = _parse_id_mapping(_fetch_text(testsets_repo, f"{dataset}/id_mapping"))
    except Exception as e:
        # Deliberately broad: any fetch or parse failure degrades to an empty
        # table. Return the same 2-tuple shape as the success path so callers
        # that unpack the result keep working.
        logging.error(f"Failed to read demo / mapping for {dataset}: {e}")
        return pd.DataFrame(), []

    # 2. get reference files
    if not all(utt_id in mapping for utt_id, _ in demo):
        raise ValueError(f"Failed to fetch demo or mapping for {dataset}, refresh the page.")
    reference_samples = {}
    for ref_id in {mapping[utt_id] for utt_id, _ in demo}:
        try:
            reference_samples[ref_id] = _audio_tag(testsets_repo, f"{dataset}/wav/{ref_id}.wav")
        except Exception as e:
            logging.error(f"Failed to read reference {ref_id} for {dataset}: {e}")

    # 3. get synthetic samples
    systems_samples = {model: {} for model in models}
    for model in models:
        for utt_id, _ in demo:
            filename = f"{system_type}/{model}/{dataset}/wav/{utt_id}.wav"
            try:
                systems_samples[model][utt_id] = _audio_tag(baselines_repo, filename)
            except Exception as e:
                logging.error(f"Failed to read sample {utt_id} from {filename} in {dataset}: {e}")

    # keep only demo ids that have a reference and a sample for every model,
    # and turn each of them into a table row
    rows = []
    for utt_id, txt in demo:
        ref_id = mapping[utt_id]
        if ref_id not in reference_samples:
            continue
        if not all(utt_id in systems_samples[model] for model in models):
            continue
        row = {
            "id": utt_id,
            "text": txt,
            "reference": reference_samples[ref_id],
        }
        for model in models:
            row[model] = systems_samples[model][utt_id]
        rows.append(row)

    datatypes = ["text", "text", "markdown"] + ["markdown"] * len(models)
    return pd.DataFrame(rows), datatypes