Spaces:
Sleeping
Sleeping
import gradio as gr | |
import numpy as np | |
import torch | |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
from io import BytesIO | |
import soundfile as sf | |
# Load models outside of function calls for efficiency | |
def load_models():
    """Fetch the pretrained SpeechT5 components used for synthesis.

    Returns:
        A ``(model, processor, vocoder)`` tuple built from the
        ``microsoft/speecht5_tts`` checkpoint (model and processor) and the
        ``microsoft/speecht5_hifigan`` checkpoint (vocoder).
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
    text_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    hifigan = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint)
    return tts_model, text_processor, hifigan


# Instantiate once at import time so every request reuses the same objects.
model, processor, vocoder = load_models()
# Load speaker embeddings | |
def get_speaker_embeddings(path="cmu_us_clb_arctic-wav-arctic_a0144.npy"):
    """Load a speaker embedding from a ``.npy`` file as a batched tensor.

    Args:
        path: Location of the NumPy file holding the embedding. Defaults to
            the file name this script originally hard-coded, resolved
            relative to the current working directory.

    Returns:
        A ``torch.float32`` tensor holding the stored array with one extra
        leading dimension of size 1.

    Raises:
        OSError: If ``path`` cannot be opened (propagated from ``np.load``).
    """
    embedding = np.load(path)
    # torch.tensor() would otherwise inherit the array's dtype, so a file
    # saved as float64 would produce a float64 tensor. Cast explicitly so the
    # result is always float32, PyTorch's default floating-point dtype.
    return torch.tensor(embedding, dtype=torch.float32).unsqueeze(0)
speaker_embeddings = get_speaker_embeddings() | |
# Function to convert text to speech | |
def text_to_speech(text, max_length=100):
    """Synthesize speech for ``text`` and return it as one waveform.

    The text is broken into chunks of at most ``max_length`` characters,
    each chunk is synthesized separately, and the resulting waveforms are
    joined end to end.

    Args:
        text: The text to speak.
        max_length: Maximum number of characters per synthesized chunk.

    Returns:
        ``(16000, waveform)`` on success, where ``waveform`` is a NumPy
        array of audio samples, or ``(None, message)`` on failure, where
        ``message`` is a string describing the error. Empty, whitespace-only
        or ``None`` input is reported as a failure rather than returned as a
        zero-length waveform.
    """
    # This function never raises; the caller distinguishes success from
    # failure by checking whether the first tuple element is None.
    try:
        segments = _split_text(text or "", max_length)
        if not segments:
            raise ValueError("no text to synthesize")

        waveforms = []
        with torch.no_grad():
            for segment in segments:
                inputs = processor(text=segment, return_tensors="pt")
                spectrogram = model.generate_speech(
                    inputs["input_ids"], speaker_embeddings
                )
                # Keep each chunk as an array; converting sample-by-sample
                # into a Python list is needlessly slow for audio data.
                waveforms.append(vocoder(spectrogram).cpu().numpy())

        return 16000, np.concatenate(waveforms)
    except Exception as e:
        return None, f"Error in text-to-speech conversion: {e}"


def _split_text(text, max_length):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    Chunks break on whitespace so words are not cut in half; only a single
    word longer than ``max_length`` is split mid-word.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    chunks = []
    current = ""
    for word in text.split():
        # A word that cannot fit in any chunk is sliced into full-size pieces.
        while len(word) > max_length:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_length])
            word = word[max_length:]

        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_length:
            current = f"{current} {word}"
        else:
            chunks.append(current)
            current = word

    if current:
        chunks.append(current)
    return chunks
# Gradio Interface | |
def gradio_interface(text):
    """Gradio callback: turn ``text`` into audio for the output component.

    Args:
        text: The contents of the input text box.

    Returns:
        A ``(sample_rate, waveform)`` tuple when synthesis succeeds.

    Raises:
        gr.Error: When synthesis fails, carrying the message produced by
            ``text_to_speech`` so it is reported instead of being dropped.
    """
    sample_rate, audio_data = text_to_speech(text)
    if sample_rate and isinstance(audio_data, np.ndarray):
        return sample_rate, audio_data
    # On failure text_to_speech returns (None, message); surface that message
    # rather than silently returning no audio.
    raise gr.Error(str(audio_data))
# Web UI: a multi-line text box as input and an audio component as output,
# wired to gradio_interface.
interface = gr.Interface(
    fn=gradio_interface,
    title="Text to Voice",
    description=(
        "High Fidelity TTS. Visit <a href='https://ruslanmv.com/' "
        "target='_blank'>ruslanmv.com</a> for more information."
    ),
    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio"),
)

# Launched unconditionally at module level, as in the original script.
interface.launch()