Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
@@ -21,6 +21,19 @@ MODELS = {
|
|
21 |
"Lafiyarku tafi kuɗinku muhimmanci.",
|
22 |
"A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
|
23 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
}
|
25 |
}
|
26 |
|
@@ -112,161 +125,69 @@ def get_example_text(language, example_idx):
|
|
112 |
return ""
|
113 |
|
114 |
def synthesize_speech(text, language, speaker):
|
115 |
-
"""
|
116 |
-
if not text.strip():
|
117 |
-
return None, "
|
118 |
-
|
119 |
-
# Load the model
|
120 |
tts_model = load_model(language)
|
121 |
if tts_model is None:
|
122 |
-
return None, f"
|
123 |
-
|
124 |
try:
|
125 |
text = text.lower().strip()
|
126 |
-
print(
|
127 |
-
print(
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
try:
|
132 |
-
wav = synthesizer.tts(text=text, speaker_name=speaker)
|
133 |
-
except TypeError:
|
134 |
-
wav = synthesizer.tts(text=text)
|
135 |
-
|
136 |
-
print(f"DEBUG: synthesizer.tts() completed successfully")
|
137 |
-
|
138 |
-
# Convert to numpy array and save to temporary file
|
139 |
-
wav_array = np.array(wav, dtype=np.float32)
|
140 |
-
|
141 |
-
# Create temporary file
|
142 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
143 |
-
|
144 |
-
# Save audio using the synthesizer's sample rate
|
145 |
-
import scipy.io.wavfile as wavfile
|
146 |
-
wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)
|
147 |
-
|
148 |
-
print("Speech synthesized successfully!")
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
-
#
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
### Features:
|
164 |
-
- **Hausa**: 3 speakers (1 female, 2 male)
|
165 |
-
- **Kanuri**: 1 female speaker
|
166 |
-
- High-quality 24kHz audio output
|
167 |
-
- Based on YourTTS architecture
|
168 |
-
|
169 |
-
### Links:
|
170 |
-
- 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
|
171 |
-
- 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
|
172 |
-
- 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
|
173 |
-
- 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
|
174 |
-
- 🌐 [TWB Voice Project](https://twbvoice.org/)
|
175 |
-
|
176 |
-
---
|
177 |
-
""")
|
178 |
-
|
179 |
-
with gr.Row():
|
180 |
-
with gr.Column():
|
181 |
-
# Language selection
|
182 |
-
language_dropdown = gr.Dropdown(
|
183 |
-
choices=list(MODELS.keys()),
|
184 |
-
value="Hausa",
|
185 |
-
label="Language",
|
186 |
-
info="Select the language for synthesis"
|
187 |
-
)
|
188 |
-
|
189 |
-
# Speaker selection
|
190 |
-
speaker_dropdown = gr.Dropdown(
|
191 |
-
choices=list(MODELS["Hausa"]["speakers"].keys()),
|
192 |
-
value="spk_f_1",
|
193 |
-
label="Speaker",
|
194 |
-
info="Select the voice speaker"
|
195 |
-
)
|
196 |
-
|
197 |
-
# Text input
|
198 |
-
text_input = gr.Textbox(
|
199 |
-
label="Text to synthesize",
|
200 |
-
placeholder="Enter text in the selected language (will be converted to lowercase)",
|
201 |
-
lines=3,
|
202 |
-
info="Note: Text will be automatically converted to lowercase as required by the models"
|
203 |
-
)
|
204 |
-
|
205 |
-
# Example buttons
|
206 |
-
gr.Markdown("**Press to load a sentence in selected language:**")
|
207 |
-
with gr.Row():
|
208 |
-
example_btn_1 = gr.Button("Example 1", size="sm")
|
209 |
-
example_btn_2 = gr.Button("Example 2", size="sm")
|
210 |
-
example_btn_3 = gr.Button("Example 3", size="sm")
|
211 |
-
|
212 |
-
# Synthesize button
|
213 |
-
synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")
|
214 |
-
|
215 |
-
with gr.Column():
|
216 |
-
# Audio output
|
217 |
-
audio_output = gr.Audio(
|
218 |
-
label="Generated Speech",
|
219 |
-
type="filepath"
|
220 |
-
)
|
221 |
-
|
222 |
-
# Status message
|
223 |
-
status_output = gr.Textbox(
|
224 |
-
label="Status",
|
225 |
-
interactive=False
|
226 |
-
)
|
227 |
-
|
228 |
-
# Event handlers
|
229 |
-
language_dropdown.change(
|
230 |
-
fn=update_speakers,
|
231 |
-
inputs=[language_dropdown],
|
232 |
-
outputs=[speaker_dropdown]
|
233 |
-
)
|
234 |
-
|
235 |
-
example_btn_1.click(
|
236 |
-
fn=lambda lang: get_example_text(lang, 0),
|
237 |
-
inputs=[language_dropdown],
|
238 |
-
outputs=[text_input]
|
239 |
-
)
|
240 |
-
|
241 |
-
example_btn_2.click(
|
242 |
-
fn=lambda lang: get_example_text(lang, 1),
|
243 |
-
inputs=[language_dropdown],
|
244 |
-
outputs=[text_input]
|
245 |
-
)
|
246 |
-
|
247 |
-
example_btn_3.click(
|
248 |
-
fn=lambda lang: get_example_text(lang, 2),
|
249 |
-
inputs=[language_dropdown],
|
250 |
-
outputs=[text_input]
|
251 |
-
)
|
252 |
-
|
253 |
-
synthesize_btn.click(
|
254 |
-
fn=synthesize_speech,
|
255 |
-
inputs=[text_input, language_dropdown, speaker_dropdown],
|
256 |
-
outputs=[audio_output, status_output]
|
257 |
-
)
|
258 |
-
|
259 |
-
gr.Markdown("""
|
260 |
-
---
|
261 |
-
### Notes:
|
262 |
-
- Models work with **lowercase input text** (automatically converted)
|
263 |
-
- Audio output is generated at 24kHz sample rate
|
264 |
-
|
265 |
-
### License:
|
266 |
-
This app and the models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).
|
267 |
-
|
268 |
-
**Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
|
269 |
-
""")
|
270 |
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
"Lafiyarku tafi kuɗinku muhimmanci.",
|
22 |
"A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
|
23 |
]
|
24 |
+
},
|
25 |
+
"Kanuri": {
|
26 |
+
"model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
|
27 |
+
"model_name": "best_model_264313.pth",
|
28 |
+
"config_name": "config.json",
|
29 |
+
"speakers": {
|
30 |
+
"spk1": "Female"
|
31 |
+
},
|
32 |
+
"examples": [
|
33 |
+
"Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
|
34 |
+
"Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
|
35 |
+
"Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
|
36 |
+
]
|
37 |
}
|
38 |
}
|
39 |
|
|
|
125 |
return ""
|
126 |
|
127 |
def synthesize_speech(text, language, speaker):
    """Synthesize speech from text with detailed speaker debug logging.

    Parameters
    ----------
    text : str or None
        Input text; lowercased and stripped before synthesis.
    language : str
        Key selecting which TTS model to load (via ``load_model``).
    speaker : str
        Requested speaker id; only passed through if the model
        actually advertises it in its speaker list.

    Returns
    -------
    tuple
        ``(wav_file_path, None)`` on success,
        ``(None, error_message)`` on failure.
    """
    if not text or not text.strip():
        return None, "Te rog introdu text pentru sinteză."

    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Nu s-a putut încărca modelul pentru {language}."

    try:
        text = text.lower().strip()
        print("=" * 60)
        print("[DEBUG] START Synthesize")
        print(f"[DEBUG] Text: '{text}'")
        print(f"[DEBUG] Language: '{language}'")
        print(f"[DEBUG] Speaker solicitat: '{speaker}'")

        # 1) Speakers exposed directly by the model (multi-speaker models).
        available_speakers = getattr(tts_model, "speakers", None)
        if available_speakers is not None:
            try:
                n_speakers = len(available_speakers)
            except Exception:
                # Some speaker containers are not sized; log "unknown".
                n_speakers = None
            print(f"[DEBUG] Modelul expune 'speakers': {available_speakers}")
            print(f"[DEBUG] Număr vorbitori (len): {n_speakers}")
            if n_speakers and n_speakers > 0:
                print(f"[DEBUG] Speaker valid? {speaker in available_speakers} (căutăm '{speaker}')")
        else:
            print("[DEBUG] Modelul NU expune lista de vorbitori (probabil single-speaker).")

        # 2) Some models keep their speakers on a 'speaker_manager' instead;
        #    nested getattr so a missing manager just yields None.
        speaker_manager = getattr(getattr(tts_model, "speaker_manager", None), "speakers", None)
        if speaker_manager is not None:
            try:
                print(f"[DEBUG] speaker_manager.speakers keys: {list(speaker_manager.keys())}")
            except Exception:
                print("[DEBUG] speaker_manager.speakers există dar nu poate fi listat simplu.")

        # 3) Call the public API; fall back to a speaker-less call when the
        #    requested speaker is not advertised by the model.
        wav = None
        if available_speakers and speaker in available_speakers:
            print(f"[DEBUG] Apel: tts_model.tts(text=..., speaker='{speaker}')")
            wav = tts_model.tts(text=text, speaker=speaker)
        else:
            print("[DEBUG] Apel: tts_model.tts(text=...) fără speaker (fallback)")
            wav = tts_model.tts(text=text)

        if wav is None:
            print("[DEBUG] Eșec: tts_model.tts() a returnat None")
            return None, "TTS a returnat None, verifică textul și/sau speakerul."

        import tempfile

        import numpy as np
        import soundfile as sf

        wav = np.array(wav, dtype=np.float32)

        # FIX: write to a unique temp file instead of a fixed "output.wav" —
        # with a shared path, concurrent Gradio requests clobber each other's
        # audio. The caller consumes the path (gr.Audio type="filepath").
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        output_path = temp_file.name
        temp_file.close()

        # FIX: nested getattr so a model object without a '.synthesizer'
        # attribute falls back to 22050 Hz instead of raising AttributeError.
        sr = getattr(getattr(tts_model, "synthesizer", None), "output_sample_rate", 22050)
        print(f"[DEBUG] Scriem WAV la {output_path} cu sample_rate={sr}")
        sf.write(output_path, wav, sr)

        print("[DEBUG] SUCCES: tts_model.tts() a rulat corect")
        print("=" * 60)
        return output_path, None

    except Exception as e:
        print("[DEBUG] EXCEPȚIE în synthesize_speech:", repr(e))
        return None, f"Eroare la sinteză: {str(e)}"