# Page header residue: ruslanmv -- "First commit" (0a6371e)
import logging
import textwrap
from io import BytesIO

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Load models outside of function calls for efficiency
def load_models():
    """Load the pretrained SpeechT5 objects used for synthesis.

    Returns:
        A ``(model, processor, vocoder)`` tuple: the text-to-speech model and
        its processor from the ``microsoft/speecht5_tts`` checkpoint, and the
        HiFi-GAN vocoder from ``microsoft/speecht5_hifigan``.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
    text_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    hifigan = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint)
    return tts_model, text_processor, hifigan
# Runs once at import; text_to_speech reads these module-level objects on
# every call instead of reloading the checkpoints per request.
model, processor, vocoder = load_models()
# Load speaker embeddings
def get_speaker_embeddings(path="cmu_us_clb_arctic-wav-arctic_a0144.npy"):
    """Load a speaker embedding from a ``.npy`` file as a batched tensor.

    Args:
        path: Location of the NumPy file holding the embedding. Defaults to
            the file this script ships with, so existing zero-argument calls
            behave as before.

    Returns:
        A float32 ``torch.Tensor`` containing the file's array with one extra
        leading dimension of size 1 (added by ``unsqueeze(0)``).

    Raises:
        OSError: If ``path`` cannot be opened (raised by ``np.load``).
    """
    embedding = np.load(path)
    # Force float32 so the tensor matches the default precision of the
    # from_pretrained() models even if the file was saved as float64.
    return torch.tensor(embedding, dtype=torch.float32).unsqueeze(0)
# Loaded once at import; text_to_speech passes this same tensor to
# model.generate_speech for every segment.
speaker_embeddings = get_speaker_embeddings()
# Function to convert text to speech
def _split_text(text, max_length):
    """Split *text* into whitespace-delimited chunks of at most *max_length* chars."""
    # textwrap breaks at whitespace, so words are not cut in half the way a
    # fixed-stride slice would cut them; only a single word longer than
    # max_length is still split. Empty / whitespace-only text yields [].
    return textwrap.wrap(text, width=max_length) if text else []


def text_to_speech(text, max_length=100):
    """Synthesize speech for *text*, processing it in short segments.

    Args:
        text: The text to convert to speech.
        max_length: Maximum number of characters sent to the model per
            segment. Defaults to 100, the value previously hard-coded here.

    Returns:
        ``(16000, waveform)`` on success, where ``waveform`` is a NumPy array
        holding the concatenated audio of all segments. On failure, or when
        there is nothing to synthesize, ``(None, message)`` with an error
        string starting with "Error in text-to-speech conversion".
    """
    try:
        segments = _split_text(text, max_length)
        if not segments:
            # Nothing to feed the model; report it through the same
            # (None, message) channel used for other failures instead of
            # returning an empty waveform.
            return None, "Error in text-to-speech conversion: no text to synthesize"

        waveforms = []
        for segment in segments:
            inputs = processor(text=segment, return_tensors="pt")
            # Inference only: no autograd graph is needed, and the vocoder
            # output must not require grad for .numpy() to work.
            with torch.no_grad():
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                speech = vocoder(spectrogram)
            waveforms.append(speech.numpy())

        # One concatenate instead of extending a Python list sample by sample.
        return 16000, np.concatenate(waveforms)  # Sample rate and combined audio data
    except Exception as e:
        # UI boundary: keep the (None, message) contract, but record the
        # traceback so the failure is not lost.
        logging.getLogger(__name__).exception("Text-to-speech conversion failed")
        return None, f"Error in text-to-speech conversion: {e}"
# Gradio Interface
def gradio_interface(text):
    """Gradio callback: synthesize *text* and return audio, or None on failure.

    Returns:
        The ``(sample_rate, audio_data)`` pair from ``text_to_speech`` when the
        sample rate is truthy and the data is a NumPy array; otherwise ``None``
        (``text_to_speech`` signals errors as ``(None, message)``).
    """
    rate, payload = text_to_speech(text)
    failed = not rate or not isinstance(payload, np.ndarray)
    if failed:
        return None  # Error path: nothing to play
    return rate, payload
# Web UI: a multi-line text box as input, an audio component as output,
# wired to gradio_interface.
interface = gr.Interface(
    fn=gradio_interface,
    title="Text to Voice",  # Add a title to the interface
    description="High Fidelity TTS. Visit <a href='https://ruslanmv.com/' target='_blank'>ruslanmv.com</a> for more information.",
    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio"),
)
# Launched unconditionally at module level, as in the original script.
interface.launch()