Spaces:

Nag189
/

Text-to-Voice

Runtime error

App Files Files Community

Nag189 committed on Nov 18, 2023

Commit

cdd5e99

1 Parent(s): 3818b2f

Updated app.py with the segmented streaming

Browse files

Files changed (1) hide show

app.py +54 -37

app.py CHANGED Viewed

@@ -1,25 +1,12 @@
 import streamlit as st
-import time
-from datetime import datetime
-from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, SpeechT5ForTextToSpeech
 import numpy as np
 import torch
 from io import StringIO
 import soundfile as sf
-# Improved Styling
-def local_css(file_name):
-    with open(file_name) as f:
-        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
-local_css("style.css")  # Assuming a CSS file named 'style.css' in the same directory
-# Streamlined Layout
-st.title("Text-to-Voice Conversion")
-st.markdown("Convert your text to speech using advanced AI models.")
 # Load models outside of function calls for efficiency
-@st.cache_data
 def load_models():
     model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
@@ -29,30 +16,64 @@ def load_models():
 model, processor, vocoder = load_models()
 # Load speaker embeddings
-@st.cache_data
 def get_speaker_embeddings():
     speaker_embeddings = np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy")
     return torch.tensor(speaker_embeddings).unsqueeze(0)
 speaker_embeddings = get_speaker_embeddings()
-# Text Input
-text = st.text_area("Type your text or upload a text file below.")
 # Function to convert text to speech
 def text_to_speech(text):
-    inputs = processor(text=text, return_tensors="pt")
-    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-    with torch.no_grad():
-        speech = vocoder(spectrogram)
-        sf.write("speech.wav", speech.numpy(), samplerate=16000)
-        return "speech.wav"
 # Convert Button
 if st.button("Convert"):
     if text:
-        audio_path = text_to_speech(text)
-        audio_file = open(audio_path, 'rb')
         audio_bytes = audio_file.read()
         st.audio(audio_bytes, format='audio/wav')
     else:
@@ -62,16 +83,12 @@ if st.button("Convert"):
 uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
 if uploaded_file is not None:
     stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
-    #To read file as string:
     text = stringio.read()
     st.write(text)
-    st.button("Convert",key=1)
-    inputs = processor(text=text, return_tensors="pt")
-    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
-    with torch.no_grad():
-        speech = vocoder(spectrogram)
-        sf.write("speech.wav", speech.numpy(), samplerate=16000)
-    audio_file = open('speech.wav', 'rb')
-    audio_bytes = audio_file.read()
-    st.audio(audio_bytes, format='audio/wav')

 import streamlit as st
 import numpy as np
 import torch
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from io import StringIO
 import soundfile as sf
 # Load models outside of function calls for efficiency
+@st.cache(allow_output_mutation=True)
 def load_models():
     model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 model, processor, vocoder = load_models()
 # Load speaker embeddings
+@st.cache(allow_output_mutation=True)
 def get_speaker_embeddings():
     speaker_embeddings = np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy")
     return torch.tensor(speaker_embeddings).unsqueeze(0)
 speaker_embeddings = get_speaker_embeddings()
+# Improved Styling
+def local_css(file_name):
+    with open(file_name) as f:
+        st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
+local_css("style.css")
+# Streamlined Layout
+st.title("Text-to-Voice Conversion")
+st.markdown("Convert your text to speech using advanced AI models.")
 # Function to convert text to speech
 def text_to_speech(text):
+    try:
+        # Segment the text if it's too long
+        max_length = 100  # Set a max length as per model's capability
+        segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+        audio_paths = []
+        for segment in segments:
+            inputs = processor(text=segment, return_tensors="pt")
+            spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+            with torch.no_grad():
+                speech = vocoder(spectrogram)
+                audio_path = f"speech_segment_{len(audio_paths)}.wav"
+                sf.write(audio_path, speech.numpy(), samplerate=16000)
+                audio_paths.append(audio_path)
+        return audio_paths
+    except Exception as e:
+        st.error(f"Error in text-to-speech conversion: {e}")
+        return []
+# Function to combine audio segments
+def combine_audio_segments(paths):
+    combined_speech = []
+    for path in paths:
+        data, samplerate = sf.read(path)
+        combined_speech.extend(data)
+    sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
+    return "combined_speech.wav"
+# Text Input
+text = st.text_area("Type your text or upload a text file below.")
 # Convert Button
 if st.button("Convert"):
     if text:
+        audio_paths = text_to_speech(text)
+        combined_audio_path = combine_audio_segments(audio_paths)
+        audio_file = open(combined_audio_path, 'rb')
         audio_bytes = audio_file.read()
         st.audio(audio_bytes, format='audio/wav')
     else:
 uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
 if uploaded_file is not None:
     stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
     text = stringio.read()
     st.write(text)
+    if st.button("Convert Uploaded File", key=1):
+        audio_paths = text_to_speech(text)
+        combined_audio_path = combine_audio_segments(audio_paths)
+        audio_file = open(combined_audio_path, 'rb')
+        audio_bytes = audio_file.read()
+        st.audio(audio_bytes, format='audio/wav')