import gradio as gr
import torch
from TTS.api import TTS
import numpy as np
import scipy.io.wavfile as wavfile
import tempfile
import json
import os
from huggingface_hub import hf_hub_download

# Model configurations
MODELS = {
    "Hausa": {
        "model_repo": "CLEAR-Global/TWB-Voice-Hausa-TTS-1.0",
        "model_name": "best_model_498283.pth",
        "config_name": "config.json",
        "speakers_pth_name": "speakers.pth",
        "speakers": {
            "spk_f_1": "Female",
            "spk_m_1": "Male 1",
            "spk_m_2": "Male 2"
        },
        "examples": [
            "Lokacin damuna shuka kan koriya shar.",
            "Lafiyarku tafi kuɗinku muhimmanci.",
            "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
        ]
    },
    "Kanuri": {
        "model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
        "model_name": "best_model_264313.pth",
        "config_name": "config.json",
        "speakers": {
            "spk1": "Female"
        },
        "examples": [
            "Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
            "Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
            "Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
        ]
    }
}
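
# Each MODELS entry names the HF repo, checkpoint file, and config for one language,
# plus a speaker-id -> display-name map and the example sentences offered in the UI.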

# Initialize models
device = "cuda" if torch.cuda.is_available() else "cpu"
loaded_models = {}
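
# Models are loaded lazily on first use and cached in loaded_models, so each
# language pays its download/initialization cost only once per process.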

def load_model(language):
    """Load TTS model for the specified language"""
    if language not in loaded_models:
        model_repo = MODELS[language]["model_repo"]
        model_name = MODELS[language]["model_name"] 
        config_name = MODELS[language]["config_name"]


        try:
            # First download and read the config to get the required filenames
            config_path = hf_hub_download(repo_id=model_repo, filename=config_name)
            
            with open(config_path, 'r') as f:
                config = json.load(f)
            
            # Extract filenames from config (get just the filename, not the full path)
            speakers_filename = os.path.basename(config.get("speakers_file", "speakers.pth"))
            language_ids_filename = os.path.basename(config.get("language_ids_file", "language_ids.json"))
            d_vector_filename = os.path.basename(config.get("d_vector_file", ["d_vector.pth"])[0])
            config_se_filename = os.path.basename(config.get("model_args", {}).get("speaker_encoder_config_path", "config_se.json"))
            model_se_filename = os.path.basename(config.get("model_args", {}).get("speaker_encoder_model_path", "model_se.pth"))
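            # basename() is used because these config entries may hold absolute paths
            # from the training environment; only the bare filename exists in the repo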
            
            # Download specific model and config files from HuggingFace repo
            model_path = hf_hub_download(repo_id=model_repo, filename=model_name)
            speakers_file = hf_hub_download(repo_id=model_repo, filename=speakers_filename)
            language_ids_file = hf_hub_download(repo_id=model_repo, filename=language_ids_filename)
            d_vector_file = hf_hub_download(repo_id=model_repo, filename=d_vector_filename)
            config_se_file = hf_hub_download(repo_id=model_repo, filename=config_se_filename)
            model_se_file = hf_hub_download(repo_id=model_repo, filename=model_se_filename)
            
            # Update the config paths to point to the downloaded files
            config["speakers_file"] = speakers_file
            config["language_ids_file"] = language_ids_file
            config["d_vector_file"] = [d_vector_file]
            config["model_args"]["speakers_file"] = speakers_file
            config["model_args"]["language_ids_file"] = language_ids_file
            config["model_args"]["d_vector_file"] = [d_vector_file]
            config["model_args"]["speaker_encoder_config_path"] = config_se_file
            config["model_args"]["speaker_encoder_model_path"] = model_se_file
            
            # Save the patched config to a temporary file for the TTS loader
            temp_config = tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False)
            json.dump(config, temp_config, indent=2)
            temp_config.close()

            print(f"Loading {language} model with config:")
            print(f"- language_ids_file: {config.get('language_ids_file')}")
            print(f"- use_speaker_embedding: {config.get('use_speaker_embedding')}")
            print(f"- speakers_file: {config.get('speakers_file')}")
            print(f"- d_vector_file: {config.get('d_vector_file')}")

            # Load the TTS model from the downloaded checkpoint and patched config
            loaded_models[language] = TTS(model_path=model_path,
                                          config_path=temp_config.name,
                                          gpu=(device == "cuda"))
            
        except Exception as e:
            print(f"Error loading {language} model: {e}")
            import traceback
            traceback.print_exc()
            return None

    return loaded_models[language]
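
# A minimal usage sketch outside the UI (assumes the HF repos are reachable and
# the speaker id exists; see MODELS above):
#
#   tts = load_model("Hausa")
#   if tts is not None:
#       wav = tts.synthesizer.tts(text=MODELS["Hausa"]["examples"][0].lower(),
#                                 speaker_name="spk_f_1")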

def update_speakers(language):
    """Update speaker dropdown based on selected language"""
    if language in MODELS:
        speakers = MODELS[language]["speakers"]
        choices = [(f"{speaker_id}: {description}", speaker_id) 
                  for speaker_id, description in speakers.items()]
        return gr.Dropdown(choices=choices, value=choices[0][1], interactive=True)
    return gr.Dropdown(choices=[], interactive=False)

def get_example_text(language, example_idx):
    """Get example text for the selected language"""
    if language in MODELS and 0 <= example_idx < len(MODELS[language]["examples"]):
        return MODELS[language]["examples"][example_idx]
    return ""

def synthesize_speech(text, language, speaker):
    """Synthesize speech from text"""
    if not text.strip():
        return None, "Please enter some text to synthesize."
    
    # Load the model
    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Failed to load {language} model."
    
    try:
        # Convert text to lowercase as required by the models
        text = text.lower().strip()
        
        # Call the synthesizer directly so the speaker can be selected by name
        synthesizer = tts_model.synthesizer
        wav = synthesizer.tts(text=text, speaker_name=speaker)
        
        # Convert to numpy array and save to temporary file
        wav_array = np.array(wav, dtype=np.float32)
        
        # Create a temporary output file; close it immediately so wavfile.write
        # can reopen the path (open handles are locked on Windows)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_file.close()
        
        # Save audio at the synthesizer's output sample rate (24 kHz for these models)
        wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)
        
        return temp_file.name, "Speech synthesized successfully!"
        
    except Exception as e:
        return None, f"Error during synthesis: {str(e)}"

# Create Gradio interface
with gr.Blocks(title="TWB Voice TTS Demo") as demo:
    gr.Markdown("""
    # TWB Voice Text-to-Speech Demo Space
    
    This demo showcases neural text-to-speech models developed within the TWB Voice project by CLEAR Global.
    It currently supports **Hausa** and **Kanuri**, the two languages covered in the first phase of the project.
    
    ### Features:
    - **Hausa**: 3 speakers (1 female, 2 male)
    - **Kanuri**: 1 female speaker
    - High-quality 24kHz audio output
    - Based on the YourTTS architecture
    
    ### Links:
    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
    - 🌐 [TWB Voice Project](https://twbvoice.org/)
    
    ---
    """)
    
    with gr.Row():
        with gr.Column():
            # Language selection
            language_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Hausa",
                label="Language",
                info="Select the language for synthesis"
            )
            
            # Speaker selection (uses the same "id: description" labels as update_speakers)
            speaker_dropdown = gr.Dropdown(
                choices=[(f"{speaker_id}: {description}", speaker_id)
                         for speaker_id, description in MODELS["Hausa"]["speakers"].items()],
                value="spk_f_1",
                label="Speaker",
                info="Select the voice speaker"
            )
            
            # Text input
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text in the selected language (will be converted to lowercase)",
                lines=3,
                info="Note: Text will be automatically converted to lowercase as required by the models"
            )
            
            # Example buttons
            gr.Markdown("**Quick examples (press to load):**")
            with gr.Row():
                example_btn_1 = gr.Button("Example 1", size="sm")
                example_btn_2 = gr.Button("Example 2", size="sm")
                example_btn_3 = gr.Button("Example 3", size="sm")
            
            # Synthesize button
            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")
            
        with gr.Column():
            # Audio output
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )
            
            # Status message
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )
    
    # Event handlers
    language_dropdown.change(
        fn=update_speakers,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown]
    )
    
    example_btn_1.click(
        fn=lambda lang: get_example_text(lang, 0),
        inputs=[language_dropdown],
        outputs=[text_input]
    )
    
    example_btn_2.click(
        fn=lambda lang: get_example_text(lang, 1),
        inputs=[language_dropdown],
        outputs=[text_input]
    )
    
    example_btn_3.click(
        fn=lambda lang: get_example_text(lang, 2),
        inputs=[language_dropdown],
        outputs=[text_input]
    )
    
    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, language_dropdown, speaker_dropdown],
        outputs=[audio_output, status_output]
    )
    
    gr.Markdown("""
    ---
    ### Notes:
    - Models work with **lowercase input text** (automatically converted)
    - Audio output is generated at 24kHz sample rate
    
    ### License:
    This app and the models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).
    
    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
    """)

if __name__ == "__main__":
    demo.launch()
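
# To drive a running instance programmatically, the gradio_client package can be
# used. A sketch, assuming the default host/port and Gradio's auto-generated
# endpoint name for synthesize_speech (verify with client.view_api()):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860/")
#   audio_path, status = client.predict(
#       "Lafiyarku tafi kuɗinku muhimmanci.", "Hausa", "spk_f_1",
#       api_name="/synthesize_speech",
#   )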