Spaces:

CLEAR-Global
/

TWB-Voice-TTS

Running

App Files Community

CLEARGlobal committed 17 days ago

Commit

e1bb1bf

verified ·

1 Parent(s): 54ee649

Upload app.py

Browse files

Files changed (1) hide show

app.py +151 -72

app.py CHANGED Viewed

@@ -21,19 +21,6 @@ MODELS = {
             "Lafiyarku tafi kuɗinku muhimmanci.",
             "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
         ]
-    },
-    "Kanuri": {
-        "model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
-        "model_name": "best_model_264313.pth",
-        "config_name": "config.json",
-        "speakers": {
-            "spk1": "Female"
-        },
-        "examples": [
-            "Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
-            "Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
-            "Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
-        ]
     }
 }
@@ -125,69 +112,161 @@ def get_example_text(language, example_idx):
     return ""
 def synthesize_speech(text, language, speaker):
-    """Sinteză vocală din text cu loguri detaliate pentru speakeri."""
-    if not text or not text.strip():
-        return None, "Te rog introdu text pentru sinteză."
     tts_model = load_model(language)
     if tts_model is None:
-        return None, f"Nu s-a putut încărca modelul pentru {language}."
     try:
         text = text.lower().strip()
-        print("=" * 60)
-        print("[DEBUG] START Synthesize")
-        print(f"[DEBUG] Text: '{text}'")
-        print(f"[DEBUG] Language: '{language}'")
-        print(f"[DEBUG] Speaker solicitat: '{speaker}'")
-        # 1) speakeri disponibili expuși de model
-        available_speakers = getattr(tts_model, "speakers", None)
-        if available_speakers is not None:
-            try:
-                n_speakers = len(available_speakers)
-            except Exception:
-                n_speakers = None
-            print(f"[DEBUG] Modelul expune 'speakers': {available_speakers}")
-            print(f"[DEBUG] Număr vorbitori (len): {n_speakers}")
-            if n_speakers and n_speakers > 0:
-                print(f"[DEBUG] Speaker valid? {speaker in available_speakers} (căutăm '{speaker}')")
-        else:
-            print("[DEBUG] Modelul NU expune lista de vorbitori (probabil single-speaker).")
-        # 2) încercăm să deducem și alte câmpuri posibile (unele modele folosesc 'speaker_manager' etc.)
-        speaker_manager = getattr(getattr(tts_model, "speaker_manager", None), "speakers", None)
-        if speaker_manager is not None:
-            try:
-                print(f"[DEBUG] speaker_manager.speakers keys: {list(speaker_manager.keys())}")
-            except Exception:
-                print("[DEBUG] speaker_manager.speakers există dar nu poate fi listat simplu.")
-        # 3) apelăm API-ul public
-        wav = None
-        if available_speakers and speaker in available_speakers:
-            print(f"[DEBUG] Apel: tts_model.tts(text=..., speaker='{speaker}')")
-            wav = tts_model.tts(text=text, speaker=speaker)
-        else:
-            print("[DEBUG] Apel: tts_model.tts(text=...) fără speaker (fallback)")
-            wav = tts_model.tts(text=text)
-        if wav is None:
-            print("[DEBUG] Eșec: tts_model.tts() a returnat None")
-            return None, "TTS a returnat None, verifică textul și/sau speakerul."
-        import numpy as np
-        import soundfile as sf
-        wav = np.array(wav, dtype=np.float32)
-        output_path = "output.wav"
-        sr = getattr(tts_model.synthesizer, "output_sample_rate", 22050)
-        print(f"[DEBUG] Scriem WAV la {output_path} cu sample_rate={sr}")
-        sf.write(output_path, wav, sr)
-        print("[DEBUG] SUCCES: tts_model.tts() a rulat corect")
-        print("=" * 60)
-        return output_path, None
-    except Exception as e:
-        print("[DEBUG] EXCEPȚIE în synthesize_speech:", repr(e))
-        return None, f"Eroare la sinteză: {str(e)}"

             "Lafiyarku tafi kuɗinku muhimmanci.",
             "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
         ]
     }
 }
     return ""
 def synthesize_speech(text, language, speaker):
+    """Synthesize speech from text"""
+    if not text.strip():
+        return None, "Please enter some text to synthesize."
+    # Load the model
     tts_model = load_model(language)
     if tts_model is None:
+        return None, f"Failed to load {language} model."
     try:
         text = text.lower().strip()
+        print(f"DEBUG: Processing text: '{text}'")
+        print(f"DEBUG: Speaker name: '{speaker}'")
+        synthesizer = tts_model.synthesizer
+        try:
+            wav = synthesizer.tts(text=text, speaker_name=speaker)
+        except TypeError:
+            wav = synthesizer.tts(text=text)
+        print(f"DEBUG: synthesizer.tts() completed successfully")
+        # Convert to numpy array and save to temporary file
+        wav_array = np.array(wav, dtype=np.float32)
+        # Create temporary file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        # Save audio using the synthesizer's sample rate
+        import scipy.io.wavfile as wavfile
+        wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)
+        print("Speech synthesized successfully!")
+        return temp_file.name, "Speech synthesized successfully!"
+    except Exception as e:
+        return None, f"Error during synthesis: {str(e)}"
+# Create Gradio interface
+with gr.Blocks(title="TWB Voice TTS Demo") as demo:
+    gr.Markdown("""
+    # TWB Voice Text-to-Speech Demo Space
+    This demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global.
+    Currently it supports **Hausa** and **Kanuri** languages, developed as part of the first phase of the project.
+    ### Features:
+    - **Hausa**: 3 speakers (1 female, 2 male)
+    - **Kanuri**: 1 female speaker
+    - High-quality 24kHz audio output
+    - Based on YourTTS architecture
+    ### Links:
+    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
+    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
+    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
+    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
+    - 🌐 [TWB Voice Project](https://twbvoice.org/)
+    ---
+    """)
+    with gr.Row():
+        with gr.Column():
+            # Language selection
+            language_dropdown = gr.Dropdown(
+                choices=list(MODELS.keys()),
+                value="Hausa",
+                label="Language",
+                info="Select the language for synthesis"
+            )
+            # Speaker selection
+            speaker_dropdown = gr.Dropdown(
+                choices=list(MODELS["Hausa"]["speakers"].keys()),
+                value="spk_f_1",
+                label="Speaker",
+                info="Select the voice speaker"
+            )
+            # Text input
+            text_input = gr.Textbox(
+                label="Text to synthesize",
+                placeholder="Enter text in the selected language (will be converted to lowercase)",
+                lines=3,
+                info="Note: Text will be automatically converted to lowercase as required by the models"
+            )
+            # Example buttons
+            gr.Markdown("**Press to load a sentence in selected language:**")
+            with gr.Row():
+                example_btn_1 = gr.Button("Example 1", size="sm")
+                example_btn_2 = gr.Button("Example 2", size="sm")
+                example_btn_3 = gr.Button("Example 3", size="sm")
+            # Synthesize button
+            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")
+        with gr.Column():
+            # Audio output
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                type="filepath"
+            )
+            # Status message
+            status_output = gr.Textbox(
+                label="Status",
+                interactive=False
+            )
+    # Event handlers
+    language_dropdown.change(
+        fn=update_speakers,
+        inputs=[language_dropdown],
+        outputs=[speaker_dropdown]
+    )
+    example_btn_1.click(
+        fn=lambda lang: get_example_text(lang, 0),
+        inputs=[language_dropdown],
+        outputs=[text_input]
+    )
+    example_btn_2.click(
+        fn=lambda lang: get_example_text(lang, 1),
+        inputs=[language_dropdown],
+        outputs=[text_input]
+    )
+    example_btn_3.click(
+        fn=lambda lang: get_example_text(lang, 2),
+        inputs=[language_dropdown],
+        outputs=[text_input]
+    )
+    synthesize_btn.click(
+        fn=synthesize_speech,
+        inputs=[text_input, language_dropdown, speaker_dropdown],
+        outputs=[audio_output, status_output]
+    )
+    gr.Markdown("""
+    ---
+    ### Notes:
+    - Models work with **lowercase input text** (automatically converted)
+    - Audio output is generated at 24kHz sample rate
+    ### License:
+    This app and the models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).
+    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
+    """)
+if __name__ == "__main__":
+    demo.launch()