CLEARGlobal committed on
Commit
54ee649
·
verified ·
1 Parent(s): 7432dc3

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -151
app.py CHANGED
@@ -21,6 +21,19 @@ MODELS = {
21
  "Lafiyarku tafi kuɗinku muhimmanci.",
22
  "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
23
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
  }
26
 
@@ -112,161 +125,69 @@ def get_example_text(language, example_idx):
112
  return ""
113
 
114
  def synthesize_speech(text, language, speaker):
115
- """Synthesize speech from text"""
116
- if not text.strip():
117
- return None, "Please enter some text to synthesize."
118
-
119
- # Load the model
120
  tts_model = load_model(language)
121
  if tts_model is None:
122
- return None, f"Failed to load {language} model."
123
-
124
  try:
125
  text = text.lower().strip()
126
- print(f"DEBUG: Processing text: '{text}'")
127
- print(f"DEBUG: Speaker name: '{speaker}'")
128
-
129
- synthesizer = tts_model.synthesizer
130
-
131
- try:
132
- wav = synthesizer.tts(text=text, speaker_name=speaker)
133
- except TypeError:
134
- wav = synthesizer.tts(text=text)
135
-
136
- print(f"DEBUG: synthesizer.tts() completed successfully")
137
-
138
- # Convert to numpy array and save to temporary file
139
- wav_array = np.array(wav, dtype=np.float32)
140
-
141
- # Create temporary file
142
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
143
-
144
- # Save audio using the synthesizer's sample rate
145
- import scipy.io.wavfile as wavfile
146
- wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)
147
-
148
- print("Speech synthesized successfully!")
149
 
150
- return temp_file.name, "Speech synthesized successfully!"
151
-
152
- except Exception as e:
153
- return None, f"Error during synthesis: {str(e)}"
 
 
 
 
 
 
 
 
 
154
 
155
- # Create Gradio interface
156
- with gr.Blocks(title="TWB Voice TTS Demo") as demo:
157
- gr.Markdown("""
158
- # TWB Voice Text-to-Speech Demo Space
159
-
160
- This demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global.
161
- Currently it supports **Hausa** and **Kanuri** languages, developed as part of the first phase of the project.
162
-
163
- ### Features:
164
- - **Hausa**: 3 speakers (1 female, 2 male)
165
- - **Kanuri**: 1 female speaker
166
- - High-quality 24kHz audio output
167
- - Based on YourTTS architecture
168
-
169
- ### Links:
170
- - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
171
- - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
172
- - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
173
- - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
174
- - 🌐 [TWB Voice Project](https://twbvoice.org/)
175
-
176
- ---
177
- """)
178
-
179
- with gr.Row():
180
- with gr.Column():
181
- # Language selection
182
- language_dropdown = gr.Dropdown(
183
- choices=list(MODELS.keys()),
184
- value="Hausa",
185
- label="Language",
186
- info="Select the language for synthesis"
187
- )
188
-
189
- # Speaker selection
190
- speaker_dropdown = gr.Dropdown(
191
- choices=list(MODELS["Hausa"]["speakers"].keys()),
192
- value="spk_f_1",
193
- label="Speaker",
194
- info="Select the voice speaker"
195
- )
196
-
197
- # Text input
198
- text_input = gr.Textbox(
199
- label="Text to synthesize",
200
- placeholder="Enter text in the selected language (will be converted to lowercase)",
201
- lines=3,
202
- info="Note: Text will be automatically converted to lowercase as required by the models"
203
- )
204
-
205
- # Example buttons
206
- gr.Markdown("**Press to load a sentence in selected language:**")
207
- with gr.Row():
208
- example_btn_1 = gr.Button("Example 1", size="sm")
209
- example_btn_2 = gr.Button("Example 2", size="sm")
210
- example_btn_3 = gr.Button("Example 3", size="sm")
211
-
212
- # Synthesize button
213
- synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")
214
-
215
- with gr.Column():
216
- # Audio output
217
- audio_output = gr.Audio(
218
- label="Generated Speech",
219
- type="filepath"
220
- )
221
-
222
- # Status message
223
- status_output = gr.Textbox(
224
- label="Status",
225
- interactive=False
226
- )
227
-
228
- # Event handlers
229
- language_dropdown.change(
230
- fn=update_speakers,
231
- inputs=[language_dropdown],
232
- outputs=[speaker_dropdown]
233
- )
234
-
235
- example_btn_1.click(
236
- fn=lambda lang: get_example_text(lang, 0),
237
- inputs=[language_dropdown],
238
- outputs=[text_input]
239
- )
240
-
241
- example_btn_2.click(
242
- fn=lambda lang: get_example_text(lang, 1),
243
- inputs=[language_dropdown],
244
- outputs=[text_input]
245
- )
246
-
247
- example_btn_3.click(
248
- fn=lambda lang: get_example_text(lang, 2),
249
- inputs=[language_dropdown],
250
- outputs=[text_input]
251
- )
252
-
253
- synthesize_btn.click(
254
- fn=synthesize_speech,
255
- inputs=[text_input, language_dropdown, speaker_dropdown],
256
- outputs=[audio_output, status_output]
257
- )
258
-
259
- gr.Markdown("""
260
- ---
261
- ### Notes:
262
- - Models work with **lowercase input text** (automatically converted)
263
- - Audio output is generated at 24kHz sample rate
264
-
265
- ### License:
266
- This app and the models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).
267
-
268
- **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
269
- """)
270
 
271
- if __name__ == "__main__":
272
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "Lafiyarku tafi kuɗinku muhimmanci.",
22
  "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
23
  ]
24
+ },
25
+ "Kanuri": {
26
+ "model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
27
+ "model_name": "best_model_264313.pth",
28
+ "config_name": "config.json",
29
+ "speakers": {
30
+ "spk1": "Female"
31
+ },
32
+ "examples": [
33
+ "Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
34
+ "Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
35
+ "Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
36
+ ]
37
  }
38
  }
39
 
 
125
  return ""
126
 
127
  def synthesize_speech(text, language, speaker):
128
+ """Sinteză vocală din text cu loguri detaliate pentru speakeri."""
129
+ if not text or not text.strip():
130
+ return None, "Te rog introdu text pentru sinteză."
131
+
 
132
  tts_model = load_model(language)
133
  if tts_model is None:
134
+ return None, f"Nu s-a putut încărca modelul pentru {language}."
135
+
136
  try:
137
  text = text.lower().strip()
138
+ print("=" * 60)
139
+ print("[DEBUG] START Synthesize")
140
+ print(f"[DEBUG] Text: '{text}'")
141
+ print(f"[DEBUG] Language: '{language}'")
142
+ print(f"[DEBUG] Speaker solicitat: '{speaker}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
+ # 1) speakeri disponibili expuși de model
145
+ available_speakers = getattr(tts_model, "speakers", None)
146
+ if available_speakers is not None:
147
+ try:
148
+ n_speakers = len(available_speakers)
149
+ except Exception:
150
+ n_speakers = None
151
+ print(f"[DEBUG] Modelul expune 'speakers': {available_speakers}")
152
+ print(f"[DEBUG] Număr vorbitori (len): {n_speakers}")
153
+ if n_speakers and n_speakers > 0:
154
+ print(f"[DEBUG] Speaker valid? {speaker in available_speakers} (căutăm '{speaker}')")
155
+ else:
156
+ print("[DEBUG] Modelul NU expune lista de vorbitori (probabil single-speaker).")
157
 
158
+ # 2) încercăm să deducem și alte câmpuri posibile (unele modele folosesc 'speaker_manager' etc.)
159
+ speaker_manager = getattr(getattr(tts_model, "speaker_manager", None), "speakers", None)
160
+ if speaker_manager is not None:
161
+ try:
162
+ print(f"[DEBUG] speaker_manager.speakers keys: {list(speaker_manager.keys())}")
163
+ except Exception:
164
+ print("[DEBUG] speaker_manager.speakers există dar nu poate fi listat simplu.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ # 3) apelăm API-ul public
167
+ wav = None
168
+ if available_speakers and speaker in available_speakers:
169
+ print(f"[DEBUG] Apel: tts_model.tts(text=..., speaker='{speaker}')")
170
+ wav = tts_model.tts(text=text, speaker=speaker)
171
+ else:
172
+ print("[DEBUG] Apel: tts_model.tts(text=...) fără speaker (fallback)")
173
+ wav = tts_model.tts(text=text)
174
+
175
+ if wav is None:
176
+ print("[DEBUG] Eșec: tts_model.tts() a returnat None")
177
+ return None, "TTS a returnat None, verifică textul și/sau speakerul."
178
+
179
+ import numpy as np
180
+ import soundfile as sf
181
+ wav = np.array(wav, dtype=np.float32)
182
+ output_path = "output.wav"
183
+ sr = getattr(tts_model.synthesizer, "output_sample_rate", 22050)
184
+ print(f"[DEBUG] Scriem WAV la {output_path} cu sample_rate={sr}")
185
+ sf.write(output_path, wav, sr)
186
+
187
+ print("[DEBUG] SUCCES: tts_model.tts() a rulat corect")
188
+ print("=" * 60)
189
+ return output_path, None
190
+
191
+ except Exception as e:
192
+ print("[DEBUG] EXCEPȚIE în synthesize_speech:", repr(e))
193
+ return None, f"Eroare la sinteză: {str(e)}"