"""FastAPI service that synthesizes Somali speech with SpeechT5.

Importing this module loads the text processor, the vocoder, two fine-tuned
SpeechT5 checkpoints (one per voice) and a speaker encoder, then computes or
reloads one speaker embedding per voice. The single endpoint, ``POST /speak``,
normalizes the request text and returns a WAV file.
"""

import os
import re
import uuid

import soundfile as sf
import torch
import torchaudio
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel
from speechbrain.inference.speaker import EncoderClassifier
from starlette.background import BackgroundTask
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

app = FastAPI()
device = "cuda" if torch.cuda.is_available() else "cpu"
CACHE_DIR = "/tmp/hf-cache"

# Sample rate (Hz) used both when resampling the reference recordings and when
# writing the synthesized waveform.
SAMPLE_RATE = 16000

# Load models
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
model_male = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad", cache_dir=CACHE_DIR).to(device)
model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)

# Speaker encoder
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir="/tmp/spk_model",
)


# Load speaker embeddings
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for *wav_path*, cached on disk at *pt_path*.

    If *pt_path* already exists it is loaded and returned. Otherwise the
    recording is resampled to ``SAMPLE_RATE``, averaged across channels,
    encoded with ``speaker_model``, L2-normalized, saved to *pt_path* (as a
    CPU tensor) and returned.

    Args:
        wav_path: Path to the reference recording of the speaker.
        pt_path: Path of the cached embedding tensor.

    Returns:
        The embedding tensor, located on ``device``.
    """
    if os.path.exists(pt_path):
        # NOTE(review): torch.load unpickles the file and the cache lives in a
        # shared /tmp location -- consider weights_only=True or a private dir.
        # map_location makes the cache usable regardless of where it was saved.
        return torch.load(pt_path, map_location=device)

    audio, sr = torchaudio.load(wav_path)
    audio = torchaudio.functional.resample(audio, sr, SAMPLE_RATE)
    # Average over the first axis and restore a leading axis of size 1.
    audio = audio.mean(dim=0).unsqueeze(0).to(device)

    with torch.no_grad():
        emb = speaker_model.encode_batch(audio)
        # presumably encode_batch returns (batch, 1, dim), hence dim=2 -- TODO confirm
        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()

    torch.save(emb.cpu(), pt_path)
    return emb


embedding_male = get_embedding("Hussein.wav", "/tmp/male_embedding.pt")
embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")

# Text normalization
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}

_NUMBER_RE = re.compile(r'\b\d+\b')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')


def number_to_words(n: int) -> str:
    """Spell out the integer *n* using the words in ``number_words``.

    Values from 0 to 999,999 are verbalized; anything outside that range is
    returned unchanged as a decimal string.

    Args:
        n: The integer to verbalize.

    Returns:
        The space-separated words for *n*, or ``str(n)`` when out of range.
    """
    if n < 0 or n >= 1_000_000:
        return str(n)
    if n <= 10:
        return number_words[n]
    if n < 20:
        # 11-19 have no table entry; compose them like 21-99 (tens word, then
        # unit word) instead of leaking raw digits into the synthesizer input.
        # NOTE(review): confirm this wording with a Somali speaker.
        return f"{number_words[10]} {number_words[n - 10]}"
    if n < 100:
        tens, unit = divmod(n, 10)
        words = number_words[tens * 10]
        return f"{words} {number_words[unit]}" if unit else words

    if n < 1000:
        count, rem = divmod(n, 100)
        scale = number_words[100]
    else:
        count, rem = divmod(n, 1000)
        scale = number_words[1000]

    # A single hundred/thousand is just the scale word ("boqol", "kun"), so
    # both scales are handled the same way.
    words = scale if count == 1 else f"{number_to_words(count)} {scale}"
    return f"{words} {number_to_words(rem)}" if rem else words


def replace_numbers_with_words(text: str) -> str:
    """Replace every standalone run of digits in *text* with its spelled-out form."""
    return _NUMBER_RE.sub(lambda m: number_to_words(int(m.group())), text)


def normalize_text(text: str) -> str:
    """Lower-case *text*, spell out numbers, and drop punctuation.

    Every character that is neither a word character nor whitespace is removed.
    """
    text = text.lower()
    text = replace_numbers_with_words(text)
    return _PUNCTUATION_RE.sub('', text)


# API request schema
class TTSRequest(BaseModel):
    """Request body for ``POST /speak``."""

    text: str
    voice: str  # "Male" or "Female"


@app.post("/speak")
def speak(payload: TTSRequest):
    """Synthesize ``payload.text`` and return it as a WAV file.

    The voice is matched case-insensitively: ``"male"`` selects the male model
    and embedding, any other value selects the female ones.

    Raises:
        HTTPException: 400 if the text is empty once normalized.
    """
    clean_text = normalize_text(payload.text)
    if not clean_text.strip():
        # Nothing pronounceable is left (e.g. the text was only punctuation).
        raise HTTPException(status_code=400, detail="text must contain at least one word or number")

    inputs = processor(text=clean_text, return_tensors="pt").to(device)

    use_male = payload.voice.lower() == "male"
    model = model_male if use_male else model_female
    embedding = embedding_male if use_male else embedding_female

    with torch.no_grad():
        waveform = model.generate_speech(inputs["input_ids"], embedding.unsqueeze(0), vocoder=vocoder)

    out_path = f"/tmp/{uuid.uuid4().hex}.wav"
    sf.write(out_path, waveform.cpu().numpy(), SAMPLE_RATE)

    # Each request writes a uniquely named file; remove it once the response
    # has been sent so /tmp does not fill up.
    return FileResponse(
        out_path,
        media_type="audio/wav",
        filename="voice.wav",
        background=BackgroundTask(os.remove, out_path),
    )