Spaces:

Somalitts
/

labadaba_do

Running

App Files Community

Somalitts committed 3 days ago

Commit

c82ae02

verified ·

1 Parent(s): 3af67f8

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -9

app.py CHANGED Viewed

@@ -14,10 +14,9 @@ app = FastAPI()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 CACHE_DIR = "/tmp/hf-cache"
-# Load models
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
-model_male = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/5aad", cache_dir=CACHE_DIR).to(device)
 model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
 # Speaker encoder
@@ -27,7 +26,7 @@ speaker_model = EncoderClassifier.from_hparams(
     savedir="/tmp/spk_model"
 )
-# Load speaker embeddings
 def get_embedding(wav_path, pt_path):
     if os.path.exists(pt_path):
         return torch.load(pt_path).to(device)
@@ -39,7 +38,6 @@ def get_embedding(wav_path, pt_path):
     torch.save(emb.cpu(), pt_path)
     return emb
-embedding_male = get_embedding("Hussein.wav", "/tmp/male_embedding.pt")
 embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
 # Text normalization
@@ -75,20 +73,17 @@ def normalize_text(text):
     text = re.sub(r'[^\w\s]', '', text)
     return text
-# API request schema
 class TTSRequest(BaseModel):
     text: str
-    voice: str  # "Male" or "Female"
 @app.post("/speak")
 def speak(payload: TTSRequest):
     clean_text = normalize_text(payload.text)
     inputs = processor(text=clean_text, return_tensors="pt").to(device)
-    model = model_male if payload.voice.lower() == "male" else model_female
-    embedding = embedding_male if payload.voice.lower() == "male" else embedding_female
     with torch.no_grad():
-        waveform = model.generate_speech(inputs["input_ids"], embedding.unsqueeze(0), vocoder=vocoder)
     out_path = f"/tmp/{uuid.uuid4().hex}.wav"
     sf.write(out_path, waveform.cpu().numpy(), 16000)

 device = "cuda" if torch.cuda.is_available() else "cpu"
 CACHE_DIR = "/tmp/hf-cache"
+# Load models (female only)
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts", cache_dir=CACHE_DIR)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan", cache_dir=CACHE_DIR).to(device)
 model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad", cache_dir=CACHE_DIR).to(device)
 # Speaker encoder
     savedir="/tmp/spk_model"
 )
+# Load female embedding only
 def get_embedding(wav_path, pt_path):
     if os.path.exists(pt_path):
         return torch.load(pt_path).to(device)
     torch.save(emb.cpu(), pt_path)
     return emb
 embedding_female = get_embedding("caasho.wav", "/tmp/female_embedding.pt")
 # Text normalization
     text = re.sub(r'[^\w\s]', '', text)
     return text
+# Request schema without voice choice
 class TTSRequest(BaseModel):
     text: str
 @app.post("/speak")
 def speak(payload: TTSRequest):
     clean_text = normalize_text(payload.text)
     inputs = processor(text=clean_text, return_tensors="pt").to(device)
     with torch.no_grad():
+        waveform = model_female.generate_speech(inputs["input_ids"], embedding_female.unsqueeze(0), vocoder=vocoder)
     out_path = f"/tmp/{uuid.uuid4().hex}.wav"
     sf.write(out_path, waveform.cpu().numpy(), 16000)