tts_mockingbird

Running

App Files Files Community

tts_mockingbird / src /synthesize.py

khof312

Add support for IMS Toucan.

593cb11 4 months ago

raw

history blame contribute delete

3.29 kB

	import IPython
	from huggingface_hub.inference_api import InferenceApi
	import torch
	from TTS.api import TTS
	import wave
	import espeakng
	import subprocess
	from scipy.io import wavfile
	from transformers import pipeline
	import os
	import numpy as np
	from gradio_client import Client, file


	def synth_mms(text:str, model:str):
	    '''
	    Synthesize text with a Hugging Face text-to-speech pipeline.
	    (The hosted inference API could be used instead, but it needs a stored API token.)

	    Inputs:
	        text: Text to synthesize
	        model: Model code of the form mms-tts-LAN
	    Returns:
	        The pipeline's 'audio' output and its 'sampling_rate',
	        or None when no model is given.
	    '''
	    # Nothing to synthesize without a model.
	    if model is None:
	        return None

	    # device=-1 keeps inference on the CPU; change it to run on a GPU.
	    synthesizer = pipeline("text-to-speech", model=model, device=-1)
	    output = synthesizer(text)
	    return output['audio'], output['sampling_rate']



	def synth_coqui(text:str, model:str):
	    '''
	    Use Coqui TTS to synthesize text.

	    Inputs:
	        text: Text to synthesize
	        model: Model code
	    Returns:
	        Waveform as a numpy array and its sampling rate,
	        or None when no model is given.

	    The sampling rate is taken from the loaded model's synthesizer
	    (output_sample_rate). 22050 is used only as a fallback when the
	    model does not expose a rate.
	    '''
	    if model is None:
	        return None

	    # Run on the GPU when one is available.
	    device = "cuda" if torch.cuda.is_available() else "cpu"

	    # Load the model and run inference.
	    tts = TTS(model, progress_bar=False).to(device)
	    wav = tts.tts(text=text)

	    # Ask the model for its real output rate instead of assuming 22050,
	    # so models trained at other rates are not played back at the wrong speed.
	    synthesizer = getattr(tts, "synthesizer", None)
	    sampling_rate = getattr(synthesizer, "output_sample_rate", None) or 22050

	    return np.array(wav), sampling_rate


	def synth_espeakng(text:str, model:str):
	    '''
	    Use eSpeak NG to synthesize text.

	    Inputs:
	        text: Text to synthesize
	        model: Model code (passed to espeak-ng as the voice, -v)
	    Returns:
	        Wav samples and sampling rate, or None when no model is given.
	    Raises:
	        subprocess.CalledProcessError: if espeak-ng exits with a non-zero status.
	    '''
	    if model is None:
	        return None

	    import tempfile

	    # Write into a private temporary directory so concurrent calls cannot
	    # clobber each other's output; the directory is removed on exit even
	    # when synthesis or reading fails.
	    with tempfile.TemporaryDirectory() as tmp_dir:
	        wav_path = os.path.join(tmp_dir, "espeakng.wav")

	        # '-w' and its path are separate argv entries so the file name has no
	        # stray leading space; '--' stops option parsing so text starting
	        # with '-' is spoken rather than interpreted as an option.
	        subprocess.run(
	            ['espeak-ng', f'-v{model}', '-w', wav_path, '--', text],
	            check=True,
	        )

	        # The samples are read into memory before the directory is removed.
	        sampling_rate, wav = wavfile.read(wav_path)

	    return wav, sampling_rate


	def synth_toucan(text:str, model:str):
	    '''
	    Use IMS Toucan (via the Flux9665/MassivelyMultilingualTTS Gradio app) to synthesize text.

	    Inputs:
	        text: Text to synthesize
	        model: Model code (sent to the API as the language)
	    Returns:
	        Wav samples and sampling rate, or None when no model is given.

	    NOTE: This wrapper does not let you explore the full range of options possible with the API. The API should allow you to generate female voices, however, it does not seem to be working at the moment.
	    '''
	    # Match the other synthesizers: no model means nothing to do, and
	    # avoids a remote call with language=None.
	    if model is None:
	        return None

	    client = Client("Flux9665/MassivelyMultilingualTTS")
	    result = client.predict(
	        prompt=text,
	        language=model,
	        reference_audio=file('https://github.com/gradio-app/gradio/raw/main/test/test_files/audio_sample.wav'),
	        voice_seed=123,
	        prosody_creativity=0.1,
	        duration_scaling_factor=1,
	        emb1=0,
	        emb2=0,
	        api_name="/predict"
	    )

	    # The first element of the result is the path of the generated wav file.
	    sampling_rate, wav = wavfile.read(result[0])
	    return wav, sampling_rate