Spaces:

stem-content-ai-project
/

swahili-tts-model

Running

mosha255 committed on Mar 25

Commit

19556fd

unverified ·

1 Parent(s): e088c4e

Streamline spaces app

Files changed (2) hide show

spaces_app.py CHANGED Viewed

@@ -41,7 +41,7 @@ def create_gradio_interface():
         with gr.Row():
             with gr.Column():
-                input_text = gr.Textbox(label="Input Text")
                 generate_btn = gr.Button("Generate Speech")
             with gr.Column():

         with gr.Row():
             with gr.Column():
+                input_text = gr.Textbox(label="Input Text", lines=5)
                 generate_btn = gr.Button("Generate Speech")
             with gr.Column():

tts.py CHANGED Viewed

@@ -9,6 +9,11 @@ import json
 import string
 from IPython.display import Audio
 import soundfile as sf
 # Load models
 lightspeech = ort.InferenceSession("./models/lightspeech_quant.onnx")
@@ -45,6 +50,11 @@ class TTS:
         # Remove empty sections
         sections = [section for section in sections if section]
         return sections
     @staticmethod
@@ -53,13 +63,13 @@ class TTS:
         for section in sections:
             if section == '**':
                 # Long pause
-                pause_duration = 1.0
                 sample_rate = 44100
                 pause = np.zeros(int(pause_duration * sample_rate))
                 audio_sections.append(pause)
             elif section == '*':
                 # Short pause
-                pause_duration = 0.4
                 sample_rate = 44100
                 pause = np.zeros(int(pause_duration * sample_rate))
                 audio_sections.append(pause)

 import string
 from IPython.display import Audio
 import soundfile as sf
+import logging
+# Configure logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 # Load models
 lightspeech = ort.InferenceSession("./models/lightspeech_quant.onnx")
         # Remove empty sections
         sections = [section for section in sections if section]
+        # Trim last long pause marker
+        if sections[-1] == '**':
+            sections = sections[:-1]
+        logger.info(f"Split text into sections: {sections}")
         return sections
     @staticmethod
         for section in sections:
             if section == '**':
                 # Long pause
+                pause_duration = 0.4
                 sample_rate = 44100
                 pause = np.zeros(int(pause_duration * sample_rate))
                 audio_sections.append(pause)
             elif section == '*':
                 # Short pause
+                pause_duration = 0.2
                 sample_rate = 44100
                 pause = np.zeros(int(pause_duration * sample_rate))
                 audio_sections.append(pause)