Nag189 committed on
Commit
cdd5e99
·
1 Parent(s): 3818b2f

Updated app.py with the segmented streaming

Browse files
Files changed (1) hide show
  1. app.py +54 -37
app.py CHANGED
@@ -1,25 +1,12 @@
1
  import streamlit as st
2
- import time
3
- from datetime import datetime
4
- from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, SpeechT5ForTextToSpeech
5
  import numpy as np
6
  import torch
 
7
  from io import StringIO
8
  import soundfile as sf
9
 
10
- # Improved Styling
11
- def local_css(file_name):
12
- with open(file_name) as f:
13
- st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
14
-
15
- local_css("style.css") # Assuming a CSS file named 'style.css' in the same directory
16
-
17
- # Streamlined Layout
18
- st.title("Text-to-Voice Conversion")
19
- st.markdown("Convert your text to speech using advanced AI models.")
20
-
21
  # Load models outside of function calls for efficiency
22
- @st.cache_data
23
  def load_models():
24
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
25
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
@@ -29,30 +16,64 @@ def load_models():
29
  model, processor, vocoder = load_models()
30
 
31
  # Load speaker embeddings
32
- @st.cache_data
33
  def get_speaker_embeddings():
34
  speaker_embeddings = np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy")
35
  return torch.tensor(speaker_embeddings).unsqueeze(0)
36
 
37
  speaker_embeddings = get_speaker_embeddings()
38
 
39
- # Text Input
40
- text = st.text_area("Type your text or upload a text file below.")
 
 
 
 
 
 
 
 
41
 
42
  # Function to convert text to speech
43
  def text_to_speech(text):
44
- inputs = processor(text=text, return_tensors="pt")
45
- spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
46
- with torch.no_grad():
47
- speech = vocoder(spectrogram)
48
- sf.write("speech.wav", speech.numpy(), samplerate=16000)
49
- return "speech.wav"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
  # Convert Button
52
  if st.button("Convert"):
53
  if text:
54
- audio_path = text_to_speech(text)
55
- audio_file = open(audio_path, 'rb')
 
56
  audio_bytes = audio_file.read()
57
  st.audio(audio_bytes, format='audio/wav')
58
  else:
@@ -62,16 +83,12 @@ if st.button("Convert"):
62
  uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
63
  if uploaded_file is not None:
64
  stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
65
- #To read file as string:
66
  text = stringio.read()
67
  st.write(text)
68
-
69
- st.button("Convert",key=1)
70
- inputs = processor(text=text, return_tensors="pt")
71
- spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
72
- with torch.no_grad():
73
- speech = vocoder(spectrogram)
74
- sf.write("speech.wav", speech.numpy(), samplerate=16000)
75
- audio_file = open('speech.wav', 'rb')
76
- audio_bytes = audio_file.read()
77
- st.audio(audio_bytes, format='audio/wav')
 
1
  import streamlit as st
 
 
 
2
  import numpy as np
3
  import torch
4
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
  from io import StringIO
6
  import soundfile as sf
7
 
 
 
 
 
 
 
 
 
 
 
 
8
  # Load models outside of function calls for efficiency
9
+ @st.cache(allow_output_mutation=True)
10
  def load_models():
11
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
12
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 
16
  model, processor, vocoder = load_models()
17
 
18
  # Load speaker embeddings
@st.cache(allow_output_mutation=True)
def get_speaker_embeddings():
    """Load the speaker embedding from disk and return it as a batched tensor.

    Reads ``cmu_us_slt_arctic-wav-arctic_a0508.npy`` from the working
    directory, converts it to a torch tensor and prepends a dimension of
    size 1 with ``unsqueeze(0)``. The result is cached by Streamlit via the
    decorator above.
    """
    xvector = torch.tensor(np.load("cmu_us_slt_arctic-wav-arctic_a0508.npy"))
    return xvector.unsqueeze(0)
23
 
24
  speaker_embeddings = get_speaker_embeddings()
25
 
26
+ # Improved Styling
def local_css(file_name):
    """Inject the contents of the CSS file *file_name* into the page.

    The file is read in full and emitted through ``st.markdown`` wrapped in a
    ``<style>`` tag, with ``unsafe_allow_html=True`` so the tag is passed
    through as HTML.
    """
    with open(file_name) as css_file:
        stylesheet = css_file.read()
    st.markdown(f'<style>{stylesheet}</style>', unsafe_allow_html=True)
30
+
31
+ local_css("style.css")
32
+
33
+ # Streamlined Layout
34
+ st.title("Text-to-Voice Conversion")
35
+ st.markdown("Convert your text to speech using advanced AI models.")
36
 
37
  # Function to convert text to speech
def text_to_speech(text, max_length=100):
    """Synthesize *text* into one WAV file per text segment.

    The text is split into segments of at most ``max_length`` characters.
    Splits fall on whitespace so words are not cut in half at a segment
    boundary (a single word longer than ``max_length`` is still broken up).
    Each segment is run through the module-level ``processor``, ``model`` and
    ``vocoder`` and written to ``speech_segment_<index>.wav`` at 16 kHz.

    Args:
        text: Text to synthesize.
        max_length: Maximum number of characters per segment.

    Returns:
        List of WAV file paths in segment order. The list is empty when
        ``text`` is empty or whitespace-only, or when conversion failed (in
        which case the error has been shown with ``st.error``).
    """
    # Local import keeps this block independent of the file's import header.
    import textwrap

    try:
        # Whitespace-aware splitting: fixed-width slicing would cut words in
        # half and produce audible glitches where segments are joined.
        # wrap() returns [] for empty or whitespace-only text, so the model is
        # never invoked with nothing to say.
        segments = textwrap.wrap(text, width=max_length)
        audio_paths = []

        for index, segment in enumerate(segments):
            inputs = processor(text=segment, return_tensors="pt")
            spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
            with torch.no_grad():
                speech = vocoder(spectrogram)
            audio_path = f"speech_segment_{index}.wav"
            sf.write(audio_path, speech.numpy(), samplerate=16000)
            audio_paths.append(audio_path)

        return audio_paths
    except Exception as e:
        # UI boundary: report the failure in the page instead of crashing the
        # Streamlit script, and signal "no audio" to the caller.
        st.error(f"Error in text-to-speech conversion: {e}")
        return []
58
+
59
+ # Function to combine audio segments
def combine_audio_segments(paths):
    """Concatenate audio segments into a single ``combined_speech.wav`` file.

    Args:
        paths: Audio file paths, in playback order.

    Returns:
        The path of the written file, ``"combined_speech.wav"``. When *paths*
        is empty a zero-length file is written so the caller can still open
        the returned path.
    """
    chunks = []
    # Fallback rate for the empty case; 16000 is the rate text_to_speech
    # writes its segments at. Without it, an empty `paths` left `samplerate`
    # unbound and the write below raised UnboundLocalError.
    samplerate = 16000
    for path in paths:
        data, samplerate = sf.read(path)
        chunks.append(data)

    # Join the arrays directly instead of extending a Python list one sample
    # at a time.
    combined_speech = np.concatenate(chunks) if chunks else np.zeros(0)
    sf.write("combined_speech.wav", combined_speech, samplerate)
    return "combined_speech.wav"
67
+
68
+ # Text Input
69
+ text = st.text_area("Type your text or upload a text file below.")
70
 
71
  # Convert Button
72
  if st.button("Convert"):
73
  if text:
74
+ audio_paths = text_to_speech(text)
75
+ combined_audio_path = combine_audio_segments(audio_paths)
76
+ audio_file = open(combined_audio_path, 'rb')
77
  audio_bytes = audio_file.read()
78
  st.audio(audio_bytes, format='audio/wav')
79
  else:
 
# Alternative input path: synthesize speech from an uploaded .txt file.
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    # The upload arrives as bytes; decode once to get the text to synthesize.
    text = uploaded_file.getvalue().decode("utf-8")
    st.write(text)

    if st.button("Convert Uploaded File", key=1):
        audio_paths = text_to_speech(text)
        # text_to_speech returns [] after it has reported a failure (or when
        # there is nothing to say); there is no audio to combine or play then.
        if audio_paths:
            combined_audio_path = combine_audio_segments(audio_paths)
            # Context manager so the file handle is closed after reading.
            with open(combined_audio_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/wav')