Spaces:

CLEAR-Global
/

TWB-Voice-TTS

Running

App Files Files Community

Alp committed on Jul 30

Commit

7a55510

1 Parent(s): 77455c1

first try

Browse files

Files changed (3) hide show

README.md +49 -2
app.py +233 -0
requirements.txt +6 -0

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: TWB Voice TTS
 emoji: 🌍
 colorFrom: blue
 colorTo: gray
@@ -11,4 +11,51 @@ license: cc-by-4.0
 short_description: 'Space to demo TTS models '
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: TWB Voice TTS Demo
 emoji: 🌍
 colorFrom: blue
 colorTo: gray
 short_description: 'Space to demo TTS models '
 ---
+# TWB Voice 1.0 - TTS Demo Space
+This Gradio demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global. It currently supports **Hausa** and **Kanuri**, whose models were developed during the first phase of the project.
+## Features
+- **Hausa TTS**: 3 speakers (1 female in Kenanci dialect, 2 male speakers from open.bible)
+- **Kanuri TTS**: 1 female speaker
+- High-quality 24kHz audio synthesis
+- Based on YourTTS architecture (VITS-based)
+## Models
+- 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
+- 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
+## Datasets
+- 📊 [Hausa Dataset Samples](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
+- 📊 [Kanuri Dataset Samples](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
+## Usage
+1. Select your desired language (Hausa or Kanuri)
+2. Choose a speaker from the available options
+3. Enter text or use the example sentences
+4. Click "Synthesize Speech" to generate audio
+## Technical Details
+- **Architecture**: YourTTS (VITS-based) fine-tuned from CML-TTS multilingual checkpoint
+- **Sample Rate**: 24 kHz
+- **Input**: Lowercase text with preserved diacritics
+- **Framework**: Coqui TTS
+## License
+These models are released under the **CC-BY-NC-4.0** license for non-commercial use only.
+## Acknowledgments
+Created by CLEAR Global with support from the Patrick J. McGovern Foundation.
+Special thanks to:
+- TWB Voice Project for high-quality voice data
+- Idiap Coqui TTS for the YourTTS architecture
+- CML-TTS Dataset for the multilingual base model
+- Biblica open.bible for additional Hausa recordings

app.py ADDED Viewed

	@@ -0,0 +1,233 @@

+import gradio as gr
+import torch
+from TTS.api import TTS
+import numpy as np
+import tempfile
+import os
+# Model configurations, keyed by the language name shown in the UI.
+# Each entry holds:
+#   "model_repo"  - Hugging Face repo id that load_model() downloads from
+#   "model_name"  - checkpoint filename inside that repo
+#   "config_name" - model config filename inside that repo
+#   "speakers"    - mapping of speaker id -> human-readable label
+#   "examples"    - sample sentences served by get_example_text()
+MODELS = {
+    "Hausa": {
+        "model_repo": "CLEAR-Global/TWB-Voice-Hausa-TTS-1.0",
+        "model_name": "best_model_498283.pth",
+        "config_name": "config.json",
+        "speakers": {
+            "spk_f_1": "Female",
+            "spk_m_1": "Male 1",
+            "spk_m_2": "Male 2"
+        },
+        "examples": [
+            "Lokacin damuna shuka kan koriya shar.",
+            "Lafiyarku tafi kuɗinku muhimmanci.",
+            "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
+        ]
+    },
+    "Kanuri": {
+        "model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
+        "model_name": "best_model_264313.pth",
+        "config_name": "config.json",
+        "speakers": {
+            "spk1": "Female"
+        },
+        "examples": [
+            "Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
+            "Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
+            "Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
+        ]
+    }
+}
+# Initialize models
+# NOTE(review): `device` is not referenced anywhere else in the visible code;
+# load_model() calls torch.cuda.is_available() directly instead.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# Cache of loaded TTS objects keyed by language name; filled by load_model().
+loaded_models = {}
+def load_model(language):
+    """Load TTS model for the specified language"""
+    if language not in loaded_models:
+        model_repo = MODELS[language]["model_repo"]
+        model_name = MODELS[language]["model_name"]
+        config_name = MODELS[language]["config_name"]
+        try:
+            from huggingface_hub import hf_hub_download
+            # Download specific model and config files from HuggingFace repo
+            model_path = hf_hub_download(repo_id=model_repo, filename=model_name)
+            config_path = hf_hub_download(repo_id=model_repo, filename=config_name)
+            # Load TTS model with specific model and config paths
+            loaded_models[language] = TTS(model_path=model_path, config_path=config_path, gpu=torch.cuda.is_available())
+        except Exception as e:
+            print(f"Error loading {language} model: {e}")
+            return None
+    return loaded_models[language]
+def update_speakers(language):
+    """Update speaker dropdown based on selected language"""
+    if language in MODELS:
+        speakers = MODELS[language]["speakers"]
+        choices = [(f"{speaker_id}: {description}", speaker_id)
+                  for speaker_id, description in speakers.items()]
+        return gr.Dropdown(choices=choices, value=choices[0][1], interactive=True)
+    return gr.Dropdown(choices=[], interactive=False)
+def get_example_text(language, example_idx):
+    """Get example text for the selected language"""
+    if language in MODELS and 0 <= example_idx < len(MODELS[language]["examples"]):
+        return MODELS[language]["examples"][example_idx]
+    return ""
+def synthesize_speech(text, language, speaker):
+    """Synthesize speech from text"""
+    if not text.strip():
+        return None, "Please enter some text to synthesize."
+    # Load the model
+    tts_model = load_model(language)
+    if tts_model is None:
+        return None, f"Failed to load {language} model."
+    try:
+        # Convert text to lowercase as required by the models
+        text = text.lower().strip()
+        # Generate speech
+        wav = tts_model.tts(text=text, speaker=speaker)
+        # Convert to numpy array and save to temporary file
+        wav_array = np.array(wav, dtype=np.float32)
+        # Create temporary file
+        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+        # Save audio (assuming 24kHz sample rate as specified in model cards)
+        import scipy.io.wavfile as wavfile
+        wavfile.write(temp_file.name, 24000, wav_array)
+        return temp_file.name, "Speech synthesized successfully!"
+    except Exception as e:
+        return None, f"Error during synthesis: {str(e)}"
+# Create Gradio interface
+with gr.Blocks(title="TWB Voice TTS Demo", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # TWB Voice 1.0 - Hausa & Kanuri Text-to-Speech
+    This demo showcases neural Text-to-Speech models for **Hausa** and **Kanuri** languages,
+    developed as part of the TWB Voice 1.0 project by CLEAR Global.
+    ### Features:
+    - **Hausa**: 3 speakers (1 female, 2 male) - Kenanci dialect
+    - **Kanuri**: 1 female speaker
+    - High-quality 24kHz audio output
+    - Based on YourTTS architecture
+    ### Links:
+    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
+    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
+    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
+    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
+    - 🌐 [TWB Voice Project](https://twbvoice.org/)
+    ---
+    """)
+    with gr.Row():
+        with gr.Column():
+            # Language selection
+            language_dropdown = gr.Dropdown(
+                choices=list(MODELS.keys()),
+                value="Hausa",
+                label="Language",
+                info="Select the language for synthesis"
+            )
+            # Speaker selection
+            speaker_dropdown = gr.Dropdown(
+                choices=[(f"spk_f_1: Female (Kenanci dialect)", "spk_f_1")],
+                value="spk_f_1",
+                label="Speaker",
+                info="Select the voice speaker"
+            )
+            # Text input
+            text_input = gr.Textbox(
+                label="Text to synthesize",
+                placeholder="Enter text in the selected language (will be converted to lowercase)",
+                lines=3,
+                info="Note: Text will be automatically converted to lowercase as required by the models"
+            )
+            # Example buttons
+            gr.Markdown("**Quick examples:**")
+            with gr.Row():
+                example_btn_1 = gr.Button("Example 1", size="sm")
+                example_btn_2 = gr.Button("Example 2", size="sm")
+                example_btn_3 = gr.Button("Example 3", size="sm")
+            # Synthesize button
+            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")
+        with gr.Column():
+            # Audio output
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                type="filepath"
+            )
+            # Status message
+            status_output = gr.Textbox(
+                label="Status",
+                interactive=False
+            )
+    # Event handlers
+    language_dropdown.change(
+        fn=update_speakers,
+        inputs=[language_dropdown],
+        outputs=[speaker_dropdown]
+    )
+    example_btn_1.click(
+        fn=lambda lang: get_example_text(lang, 0),
+        inputs=[language_dropdown],
+        outputs=[text_input]
+    )
+    example_btn_2.click(
+        fn=lambda lang: get_example_text(lang, 1),
+        inputs=[language_dropdown],
+        outputs=[text_input]
+    )
+    example_btn_3.click(
+        fn=lambda lang: get_example_text(lang, 2),
+        inputs=[language_dropdown],
+        outputs=[text_input]
+    )
+    synthesize_btn.click(
+        fn=synthesize_speech,
+        inputs=[text_input, language_dropdown, speaker_dropdown],
+        outputs=[audio_output, status_output]
+    )
+    gr.Markdown("""
+    ---
+    ### Notes:
+    - Models work best with **lowercase input text** (automatically converted)
+    - **Hausa model** supports diacritics: `ăāɓɗƙƴū`
+    - **Kanuri model** supports diacritics: `áúǝəә`
+    - Audio output is generated at 24kHz sample rate
+    - Models are optimized for educational and general content
+    ### License:
+    These models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).
+    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
+    """)
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+gradio
+TTS
+torch
+scipy
+numpy
+huggingface_hub