Spaces:

ixxan
/

uyghur-speech-models

Running

App Files Community

Irpan committed on Dec 24, 2024

Commit

81e83c9

1 Parent(s): 9db718b

asr

Browse files

Files changed (3) hide show

app.py +76 -70
asr.py +1 -0
tts.py +1 -2

app.py CHANGED Viewed

@@ -4,90 +4,96 @@ import tts
 import util
 # Define the Speech-to-Text tab
-with gr.Blocks() as mms_transcribe:
-    gr.Markdown("### Speech-To-Text")
-    with gr.Row():
-        audio_input = gr.Audio(
-            label="Record or Upload Uyghur Audio",
-            sources=["microphone", "upload"],
-            type="filepath",
-        )
-        model_selection_stt = gr.Dropdown(
-            choices=[model for model in asr.models_info],
-            label="Select a Model",
-            value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
-            interactive=True
-        )
-    with gr.Row():
-        arabic_output = gr.Textbox(label="Uyghur Arabic Transcription", interactive=False)
-        latin_output = gr.Textbox(label="Uyghur Latin Transcription", interactive=False)
-    with gr.Row():
-        stt_submit_btn = gr.Button("Submit")
-        stt_clear_btn = gr.Button("Clear")
-    # Example button to load examples
-    with gr.Row():
-        stt_examples = gr.Examples(
-            examples=util.asr_examples,
             inputs=[audio_input, model_selection_stt],
-            outputs=[arabic_output, latin_output],
-            label="Examples"
         )
-    # Define button functionality
-    stt_submit_btn.click(
-        asr.transcribe,
-        inputs=[audio_input, model_selection_stt],
-        outputs=[arabic_output, latin_output]
-    )
-    stt_clear_btn.click(
-        lambda: (None, None, None),  # Clear inputs and outputs
-        inputs=[],
-        outputs=[audio_input, arabic_output, latin_output]
-    )
 # Define the Text-to-Speech tab
-with gr.Blocks() as mms_synthesize:
-    gr.Markdown("### Text-To-Speech")
-    with gr.Row():
-        input_text = gr.Text(label="Input text")
-        model_selection_tts = gr.Dropdown(
-            choices=[model for model in tts.models_info],
-            label="Select a Model",
-            value="Meta-MMS",
-            interactive=True
-        )
-    with gr.Row():
-        generated_audio = gr.Audio(label="Generated Audio", interactive=False)
-    with gr.Row():
-        tts_submit_btn = gr.Button("Submit")
-        tts_clear_btn = gr.Button("Clear")
-    # Example button to load examples
-    with gr.Row():
-        tts_examples = gr.Examples(
-            examples=util.tts_examples,
             inputs=[input_text, model_selection_tts],
-            outputs=[generated_audio],
-            label="Examples"
         )
-    # Define button functionality
-    tts_submit_btn.click(
-        tts.synthesize,
-        inputs=[input_text, model_selection_tts],
-        outputs=[generated_audio]
-    )
-    tts_clear_btn.click(
-        lambda: (None, None),  # Clear inputs and outputs
-        inputs=[],
-        outputs=[input_text, generated_audio]
-    )
 # Combine tabs into a Tabbed Interface
 with gr.Blocks() as demo:
     gr.Markdown("### Uyghur Language Tools: STT and TTS")
-    with gr.TabbedInterface([mms_transcribe, mms_synthesize], ["Speech-To-Text", "Text-To-Speech"]):
         pass
 # Run the app

 import util
 # Define the Speech-to-Text tab
+def create_stt_tab():
+    with gr.Blocks() as mms_transcribe:
+        gr.Markdown("### Speech-To-Text")
+        with gr.Row():
+            audio_input = gr.Audio(
+                label="Record or Upload Uyghur Audio",
+                sources=["microphone", "upload"],
+                type="filepath",
+            )
+            model_selection_stt = gr.Dropdown(
+                choices=[model for model in asr.models_info],
+                label="Select a Model",
+                value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
+                interactive=True
+            )
+        with gr.Row():
+            arabic_output = gr.Textbox(label="Uyghur Arabic Transcription", interactive=False)
+            latin_output = gr.Textbox(label="Uyghur Latin Transcription", interactive=False)
+        with gr.Row():
+            stt_submit_btn = gr.Button("Submit")
+            stt_clear_btn = gr.Button("Clear")
+        # Example button to load examples
+        with gr.Row():
+            stt_examples = gr.Examples(
+                examples=util.asr_examples,
+                inputs=[audio_input, model_selection_stt],
+                outputs=[arabic_output, latin_output],
+                label="Examples"
+            )
+        # Define button functionality
+        stt_submit_btn.click(
+            asr.transcribe,
             inputs=[audio_input, model_selection_stt],
+            outputs=[arabic_output, latin_output]
+        )
+        stt_clear_btn.click(
+            lambda: (None, None, None),  # Clear inputs and outputs
+            inputs=[],
+            outputs=[audio_input, arabic_output, latin_output]
         )
+    return mms_transcribe
 # Define the Text-to-Speech tab
+def create_tts_tab():
+    with gr.Blocks() as mms_synthesize:
+        gr.Markdown("### Text-To-Speech")
+        with gr.Row():
+            input_text = gr.Text(label="Input text")
+            model_selection_tts = gr.Dropdown(
+                choices=[model for model in tts.models_info],
+                label="Select a Model",
+                value="Meta-MMS",
+                interactive=True
+            )
+        with gr.Row():
+            generated_audio = gr.Audio(label="Generated Audio", interactive=False)
+        with gr.Row():
+            tts_submit_btn = gr.Button("Submit")
+            tts_clear_btn = gr.Button("Clear")
+        # Example button to load examples
+        with gr.Row():
+            tts_examples = gr.Examples(
+                examples=util.tts_examples,
+                inputs=[input_text, model_selection_tts],
+                outputs=[generated_audio],
+                label="Examples"
+            )
+        # Define button functionality
+        tts_submit_btn.click(
+            tts.synthesize,
             inputs=[input_text, model_selection_tts],
+            outputs=[generated_audio]
+        )
+        tts_clear_btn.click(
+            lambda: (None, None),  # Clear inputs and outputs
+            inputs=[],
+            outputs=[input_text, generated_audio]
         )
+    return mms_synthesize
 # Combine tabs into a Tabbed Interface
 with gr.Blocks() as demo:
     gr.Markdown("### Uyghur Language Tools: STT and TTS")
+    with gr.TabbedInterface([create_stt_tab(), create_tts_tab()], ["Speech-To-Text", "Text-To-Speech"]):
         pass
 # Run the app

asr.py CHANGED Viewed

@@ -109,4 +109,5 @@ def transcribe(audio_data, model_id) -> str:
     else: # Latin script output
         transcription_arabic = util.ug_latn_to_arab(transcription)
         transcription_latin = transcription
     return transcription_arabic, transcription_latin

     else: # Latin script output
         transcription_arabic = util.ug_latn_to_arab(transcription)
         transcription_latin = transcription
+    print(model_id, transcription_arabic, transcription_latin)
     return transcription_arabic, transcription_latin

tts.py CHANGED Viewed

@@ -43,6 +43,7 @@ text2speech = Text2Speech(
 text2speech.spc2wav = None  ### disable griffin-lim
 def synthesize(text, model_id):
     if len(text) > 200:
         raise ValueError(f"Input text exceeds 200 characters. Please provide a shorter input text for faster processing.")
@@ -74,8 +75,6 @@ def synthesize_turkic_tts(text):
         wav = vocoder.inference(c_mel)
     output = wav.view(-1).cpu().numpy()
-    print(output.shape)
     output_path = "tts_output.wav"
     scipy.io.wavfile.write(output_path, rate=22050, data=output)

 text2speech.spc2wav = None  ### disable griffin-lim
 def synthesize(text, model_id):
+    print(text)
     if len(text) > 200:
         raise ValueError(f"Input text exceeds 200 characters. Please provide a shorter input text for faster processing.")
         wav = vocoder.inference(c_mel)
     output = wav.view(-1).cpu().numpy()
     output_path = "tts_output.wav"
     scipy.io.wavfile.write(output_path, rate=22050, data=output)