import IPython
from huggingface_hub.inference_api import InferenceApi
import torch
from TTS.api import TTS
import wave
import espeakng
import subprocess
from scipy.io import wavfile
from transformers import pipeline
import os
import numpy as np


def synth_mms(text: str, model: str):
    '''
    Use the Hugging Face inference pipeline to synthesize text.
    (Can be replaced by the Inference API, but that requires a stored API token.)

    Inputs:
        text: Text to synthesize
        model: Model code of the form mms-tts-LAN
    Returns:
        Numpy waveform array and sampling rate.
    '''
    # inference = InferenceApi(repo_id=f"facebook/{model}",
    #                          token=API_TOKEN)
    # mms_tts = inference(inputs=text,
    #                     raw_response=True)._content

    if model is not None:
        pipe = pipeline("text-to-speech", model=model, device=-1)  # device=-1 runs on CPU; set a GPU index to use CUDA
        mms_tts = pipe(text)
        return mms_tts['audio'], mms_tts['sampling_rate']
    else:
        return None


def synth_coqui(text: str, model: str):
    '''
    Use the Coqui TTS API to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Model code
    Returns:
        Numpy waveform array and sampling rate.

    IMPORTANT: The current implementation assumes a 22050 Hz sampling rate;
    verify this when adding a new model.
    '''
    if model is not None:
        # Pick the device
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Initialize TTS
        tts = TTS(model, progress_bar=False).to(device)

        # Run inference
        wav = tts.tts(text=text)  # is_multi_speaker=False

        return np.array(wav), 22050
    else:
        return None


def synth_espeakng(text: str, model: str):
    '''
    Use eSpeak NG to synthesize text.

    Inputs:
        text: Text to synthesize
        model: Voice/model code
    Returns:
        Numpy waveform array and sampling rate.
    '''
    if model is not None:
        # "-w" and the output filename must be separate arguments;
        # passing "-w test.wav" as one argument makes espeak-ng write
        # to a file named " test.wav" (leading space), which then fails to load.
        subprocess.run(['espeak-ng', f'-v{model}', '-w', 'test.wav', text], check=True)
        # esng = espeakng.Speaker()
        # esng.voice = model
        # esng.say(text, export_path="test.wav")

        sampling_rate, wav = wavfile.read('test.wav')
        os.remove('test.wav')

        return wav, sampling_rate
    else:
        return None
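

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module API): calls each backend
# once and writes the result to a WAV file. The model codes below are
# assumptions for demonstration; substitute whichever MMS checkpoint,
# Coqui model ID, or eSpeak NG voice your deployment actually uses.
# Note that the active synth_mms pipeline path expects a full repo id
# (e.g. "facebook/mms-tts-eng"), whereas the commented-out Inference API
# path prepends "facebook/" itself.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    sample_text = "Hello, this is a synthesis test."

    # Hugging Face MMS pipeline (assumed model repo id)
    result = synth_mms(sample_text, "facebook/mms-tts-eng")
    if result is not None:
        audio, rate = result
        wavfile.write("mms_out.wav", rate, np.asarray(audio, dtype=np.float32).squeeze())

    # Coqui TTS (assumed model code; synth_coqui reports 22050 Hz regardless)
    result = synth_coqui(sample_text, "tts_models/en/ljspeech/tacotron2-DDC")
    if result is not None:
        audio, rate = result
        wavfile.write("coqui_out.wav", rate, np.asarray(audio, dtype=np.float32))

    # eSpeak NG (assumed voice code; any installed espeak-ng voice works)
    result = synth_espeakng(sample_text, "en")
    if result is not None:
        audio, rate = result
        wavfile.write("espeakng_out.wav", rate, audio)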