Spaces:
Runtime error
Runtime error
import textwrap
from io import StringIO

import numpy as np
import soundfile as sf
import streamlit as st
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Load the models once per server process. Streamlit re-executes this script on
# every widget interaction, so without caching all three models would be
# re-created on each click.
@st.cache_resource
def load_models():
    """Load the SpeechT5 model, its processor and the HiFi-GAN vocoder.

    The result is cached by Streamlit, so the same objects are returned on
    every script re-run instead of being loaded again.

    Returns:
        A ``(model, processor, vocoder)`` tuple.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder


model, processor, vocoder = load_models()
# Load speaker embeddings
def get_speaker_embeddings():
    """Read the stored speaker embedding and return it as a batched tensor.

    Returns:
        The array saved in ``cmu_us_clb_arctic-wav-arctic_a0144.npy``,
        converted to a ``torch.Tensor`` with a new leading dimension of
        size 1.
    """
    raw_vector = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    embedding = torch.tensor(raw_vector)
    return embedding.unsqueeze(0)


speaker_embeddings = get_speaker_embeddings()
# Improved Styling
def local_css(file_name):
    """Inject the contents of a CSS file into the page.

    Args:
        file_name: Path of the stylesheet to read.
    """
    with open(file_name) as css_file:
        stylesheet = css_file.read()
    # unsafe_allow_html is needed so the <style> tag is emitted as markup.
    st.markdown(f'<style>{stylesheet}</style>', unsafe_allow_html=True)


local_css("style.css")
# Page heading and one-line description shown above the controls
page_title = "Text-to-Voice Conversion"
page_description = "Convert your text to speech using advanced AI models."
st.title(page_title)
st.markdown(page_description)
# Function to convert text to speech
def text_to_speech(text, max_length=100):
    """Synthesize ``text`` into one WAV file per text segment.

    The text is wrapped into segments of at most ``max_length`` characters,
    broken at whitespace so that words are not cut in half at a segment
    boundary (a single word longer than ``max_length`` is still split).
    Each segment is synthesized with the module-level ``processor``,
    ``model``, ``vocoder`` and ``speaker_embeddings`` and written to
    ``speech_segment_<index>.wav`` in the working directory at 16000 Hz.

    Args:
        text: The text to synthesize.
        max_length: Maximum number of characters per segment.

    Returns:
        The segment file paths in order. The list is empty when ``text``
        contains no words, or when any step fails; in the failure case the
        error is reported through ``st.error``.
    """
    try:
        # textwrap.wrap collapses whitespace runs and yields [] for blank text.
        segments = textwrap.wrap(text, width=max_length)
        audio_paths = []
        for index, segment in enumerate(segments):
            inputs = processor(text=segment, return_tensors="pt")
            # Inference only: no gradients are needed for either model.
            with torch.no_grad():
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                speech = vocoder(spectrogram)
            audio_path = f"speech_segment_{index}.wav"
            sf.write(audio_path, speech.numpy(), samplerate=16000)
            audio_paths.append(audio_path)
        return audio_paths
    except Exception as e:
        # UI boundary: surface the failure to the user instead of crashing the page.
        st.error(f"Error in text-to-speech conversion: {e}")
        return []
# Function to combine audio segments
def combine_audio_segments(paths):
    """Concatenate audio files, in order, into ``combined_speech.wav``.

    Args:
        paths: Iterable of audio file paths readable by ``soundfile``.

    Returns:
        The output path, ``"combined_speech.wav"``.

    Raises:
        ValueError: If ``paths`` is empty, or if the files do not all share
            the same sample rate.
    """
    paths = list(paths)
    if not paths:
        # Without at least one file there is no sample rate to write with.
        raise ValueError("No audio segments to combine.")

    chunks = []
    samplerate = None
    for path in paths:
        data, rate = sf.read(path)
        if samplerate is None:
            samplerate = rate
        elif rate != samplerate:
            raise ValueError(
                f"Sample rate mismatch: {path} is {rate} Hz, expected {samplerate} Hz."
            )
        chunks.append(data)

    # Join whole arrays at once instead of extending a list sample by sample.
    sf.write("combined_speech.wav", np.concatenate(chunks), samplerate)
    return "combined_speech.wav"
# Shared by both convert buttons
def _render_speech(source_text):
    """Synthesize ``source_text`` and show an audio player for the result."""
    audio_paths = text_to_speech(source_text)
    if not audio_paths:
        # text_to_speech returns [] after reporting a failure; there is
        # nothing to combine or play in that case.
        return
    combined_audio_path = combine_audio_segments(audio_paths)
    # Context manager so the file handle is closed on every script run.
    with open(combined_audio_path, 'rb') as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')


# Text Input
text = st.text_area("Type your text or upload a text file below.")

# Convert Button
if st.button("Convert"):
    if text.strip():
        _render_speech(text)
    else:
        st.error("Please enter some text to convert.")

# File Uploader
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    try:
        uploaded_text = uploaded_file.getvalue().decode("utf-8")
    except UnicodeDecodeError:
        st.error("The uploaded file is not valid UTF-8 text.")
    else:
        st.write(uploaded_text)
        if st.button("Convert Uploaded File", key=1):
            if uploaded_text.strip():
                _render_speech(uploaded_text)
            else:
                st.error("The uploaded file contains no text to convert.")