Spaces:
Running
Running
import os | |
import re | |
import uuid | |
import torch | |
import torchaudio | |
import soundfile as sf | |
from fastapi import FastAPI | |
from fastapi.responses import FileResponse | |
from pydantic import BaseModel | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
from speechbrain.inference.speaker import EncoderClassifier | |
# FastAPI application object for this service.
app = FastAPI()

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Directory passed as ``cache_dir`` to every ``from_pretrained`` call below.
CACHE_DIR = "/tmp/hf-cache"

# Load models (female only)
processor = SpeechT5Processor.from_pretrained(
    "microsoft/speecht5_tts",
    cache_dir=CACHE_DIR,
)

vocoder = SpeechT5HifiGan.from_pretrained(
    "microsoft/speecht5_hifigan",
    cache_dir=CACHE_DIR,
)
vocoder = vocoder.to(device)

model_female = SpeechT5ForTextToSpeech.from_pretrained(
    "Somalitts/8aad",
    cache_dir=CACHE_DIR,
)
model_female = model_female.to(device)

# Speaker encoder used by get_embedding() to turn reference audio into a
# speaker embedding; its files are stored under /tmp/spk_model.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="/tmp/spk_model",
    run_opts={"device": device},
)
# Load female embedding only | |
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for ``wav_path``, cached at ``pt_path``.

    When ``pt_path`` already exists it is loaded with ``torch.load`` and moved
    to ``device``. Otherwise the audio at ``wav_path`` is resampled to
    16000 Hz, averaged over its first dimension, encoded with
    ``speaker_model``, normalized along dim 2 and squeezed; the result is
    saved (as a CPU tensor) to ``pt_path`` and returned.

    Args:
        wav_path: Path of the reference audio file.
        pt_path: Path of the cached embedding tensor.

    Returns:
        The embedding tensor.
    """
    if not os.path.exists(pt_path):
        waveform, sample_rate = torchaudio.load(wav_path)
        resampled = torchaudio.functional.resample(waveform, sample_rate, 16000)
        # Average over dim 0 (presumably the channel axis -- TODO confirm),
        # then restore a leading dimension of size 1 for the encoder.
        mono_batch = resampled.mean(dim=0).unsqueeze(0).to(device)

        with torch.no_grad():
            raw_embedding = speaker_model.encode_batch(mono_batch)
            embedding = torch.nn.functional.normalize(raw_embedding, dim=2).squeeze()

        # Persist on CPU so the cache file can be loaded on any device.
        torch.save(embedding.cpu(), pt_path)
        return embedding

    # Cache hit: reuse the stored tensor instead of re-encoding the audio.
    return torch.load(pt_path).to(device)
# Speaker embedding used for every synthesis request, computed from
# "caasho.wav" and cached at /tmp/female_embedding.pt.
embedding_female = get_embedding(
    wav_path="caasho.wav",
    pt_path="/tmp/female_embedding.pt",
)
# Text normalization | |
# Spoken forms for the numbers that have a dedicated word: 0-10, the tens
# from 20 to 90, 100 and 1000. Everything else is composed from these.
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}


def number_to_words(n):
    """Spell out the integer ``n`` using the entries of ``number_words``.

    Args:
        n: Integer to convert.

    Returns:
        The spelled-out form for ``0 <= n < 1_000_000``. Values outside that
        range (negative numbers, one million and above) are returned as their
        decimal string, unchanged.
    """
    if n <= 10:
        # 0-10 come straight from the table; negatives have no entry and
        # fall back to their digit string.
        return number_words.get(n, str(n))

    if n < 100:
        # 11-19 have no table entry of their own, so they are composed as
        # "toban" + unit, the same tens-then-unit form used for 21-99.
        # Without this they would be emitted as raw digits.
        # NOTE(review): confirm the preferred spoken form for 11-19 with a
        # Somali speaker.
        tens, unit = divmod(n, 10)
        words = number_words[tens * 10]
        if unit:
            words += " " + number_words[unit]
        return words

    if n < 1000:
        hundreds, rem = divmod(n, 100)
        # Exactly one hundred is plain "boqol"; larger multiples are prefixed
        # with the word for the multiplier.
        words = number_words[hundreds] + " boqol" if hundreds > 1 else "boqol"
        if rem:
            words += " " + number_to_words(rem)
        return words

    if n < 1_000_000:
        thousands, rem = divmod(n, 1000)
        words = number_to_words(thousands) + " kun"
        if rem:
            words += " " + number_to_words(rem)
        return words

    # One million and above is not spelled out.
    return str(n)
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with its words.

    Each match of ``\\b\\d+\\b`` is converted to ``int`` and passed through
    ``number_to_words``; the rest of the string is left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with the digit runs substituted.
    """
    def _spell_out(match):
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell_out, text)
def normalize_text(text):
    """Prepare raw input text for the synthesizer.

    Steps, in order: lowercase the text, spell out digit runs with
    ``replace_numbers_with_words``, then delete every character that is
    neither a word character nor whitespace.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    spelled = replace_numbers_with_words(lowered)
    # NOTE(review): punctuation is deleted rather than replaced by a space,
    # so input such as "3.5" ends up as two number words joined together --
    # confirm this is intended.
    return re.sub(r'[^\w\s]', '', spelled)
# Request schema without voice choice | |
class TTSRequest(BaseModel):
    """Request body for speech synthesis: just the text to be spoken."""

    # Raw text; speak() normalizes it before synthesis.
    text: str
def _remove_file(path):
    """Delete ``path``; a file that is already gone is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        # Nothing left to clean up.
        pass


# NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible on this
# handler in this view -- confirm it is registered on ``app`` somewhere.
def speak(payload: TTSRequest):
    """Synthesize ``payload.text`` and return it as a WAV file response.

    The text is normalized with ``normalize_text``, tokenized by
    ``processor``, and turned into a waveform by ``model_female`` using
    ``embedding_female`` and ``vocoder``. The waveform is written to a
    uniquely named file under /tmp with a 16000 Hz sample rate and served as
    ``audio/wav``. The temporary file is deleted by a background task after
    the response has been sent, so repeated requests do not fill /tmp.

    Args:
        payload: Request body carrying the text to speak.

    Returns:
        A ``FileResponse`` for the generated audio, downloaded as
        ``voice.wav``.
    """
    # Imported locally because the file-level imports do not include it.
    from fastapi import BackgroundTasks

    clean_text = normalize_text(payload.text)
    inputs = processor(text=clean_text, return_tensors="pt").to(device)

    with torch.no_grad():
        waveform = model_female.generate_speech(
            inputs["input_ids"],
            embedding_female.unsqueeze(0),
            vocoder=vocoder,
        )

    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
    sf.write(out_path, waveform.cpu().numpy(), 16000)

    # The response streams the file from disk, so it can only be removed once
    # the response is complete; a background task runs at exactly that point.
    cleanup = BackgroundTasks()
    cleanup.add_task(_remove_file, out_path)

    return FileResponse(
        out_path,
        media_type="audio/wav",
        filename="voice.wav",
        background=cleanup,
    )