"""Gradio demo that converts text to speech with SpeechT5 and a HiFi-GAN vocoder."""

import logging
import textwrap
from io import BytesIO

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

logger = logging.getLogger(__name__)

# Sample rate reported to Gradio alongside the generated waveform.
SAMPLE_RATE = 16000
# Upper bound, in characters, for the text passed to the model in one call.
MAX_SEGMENT_CHARS = 100
# Speaker embedding file, resolved relative to the current working directory.
SPEAKER_EMBEDDINGS_PATH = "cmu_us_clb_arctic-wav-arctic_a0144.npy"


# Load models outside of function calls for efficiency
def load_models():
    """Load the pretrained SpeechT5 model, its processor and the vocoder.

    Returns:
        A ``(model, processor, vocoder)`` tuple.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder


model, processor, vocoder = load_models()


# Load speaker embeddings
def get_speaker_embeddings():
    """Read the speaker embedding from disk as a tensor with a leading axis.

    Returns:
        The array stored in ``SPEAKER_EMBEDDINGS_PATH`` converted to a
        ``torch.Tensor`` with one extra dimension inserted at position 0.
    """
    speaker_embeddings = np.load(SPEAKER_EMBEDDINGS_PATH)
    return torch.tensor(speaker_embeddings).unsqueeze(0)


speaker_embeddings = get_speaker_embeddings()


def _split_text(text, max_length):
    """Split *text* into chunks of at most *max_length* characters.

    Chunks break at whitespace so words are not cut in half; a single token
    longer than *max_length* is still broken up. Runs of whitespace
    (including newlines) collapse to single spaces. Blank input yields ``[]``.
    """
    return textwrap.wrap(text, width=max_length)


# Function to convert text to speech
def text_to_speech(text):
    """Synthesize speech for *text*.

    The text is processed in segments of at most ``MAX_SEGMENT_CHARS``
    characters and the resulting waveforms are joined end to end.

    Args:
        text: The text to speak. ``None`` is treated like an empty string.

    Returns:
        ``(SAMPLE_RATE, waveform)`` on success, where ``waveform`` is a 1-D
        ``numpy.ndarray``. On failure, or when there is nothing to speak,
        ``(None, message)`` where ``message`` describes the problem.
    """
    try:
        segments = _split_text(text or "", MAX_SEGMENT_CHARS)
        if not segments:
            # Nothing to synthesize; use the error contract instead of
            # handing an empty array to the audio component.
            return None, "Error in text-to-speech conversion: no text provided"

        waveforms = []
        # Inference only: no gradients are needed for any of these calls.
        with torch.no_grad():
            for segment in segments:
                inputs = processor(text=segment, return_tensors="pt")
                spectrogram = model.generate_speech(
                    inputs["input_ids"], speaker_embeddings
                )
                speech = vocoder(spectrogram)
                # Flatten so every segment is 1-D before concatenation.
                # NOTE(review): assumes the vocoder yields a single mono
                # waveform per spectrogram -- confirm against the model docs.
                waveforms.append(speech.numpy().reshape(-1))

        # Combine audio data into a single numpy array
        return SAMPLE_RATE, np.concatenate(waveforms)
    except Exception as e:
        # Top-level boundary for the UI callback: record the traceback so the
        # failure is diagnosable, then report it through the return value.
        logger.exception("Text-to-speech conversion failed")
        return None, f"Error in text-to-speech conversion: {e}"


# Gradio Interface
def gradio_interface(text):
    """Gradio callback: return audio for *text*, or ``None`` on failure.

    Args:
        text: Contents of the input textbox.

    Returns:
        ``(sample_rate, waveform)`` when synthesis succeeded, otherwise
        ``None``.
    """
    sample_rate, audio_data = text_to_speech(text)
    if sample_rate and isinstance(audio_data, np.ndarray):
        return sample_rate, audio_data
    return None  # Return None if there's an error


interface = gr.Interface(
    fn=gradio_interface,
    title="Text to Voice",  # Add a title to the interface
    description=(
        "High Fidelity TTS. \n"
        "Visit ruslanmv.com for more information."
    ),
    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio"),
)

interface.launch()