CLEARGlobal committed on
Commit
e1bb1bf
·
verified ·
1 Parent(s): 54ee649

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -72
app.py CHANGED
@@ -21,19 +21,6 @@ MODELS = {
21
  "Lafiyarku tafi kuɗinku muhimmanci.",
22
  "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
23
  ]
24
- },
25
- "Kanuri": {
26
- "model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
27
- "model_name": "best_model_264313.pth",
28
- "config_name": "config.json",
29
- "speakers": {
30
- "spk1": "Female"
31
- },
32
- "examples": [
33
- "Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
34
- "Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
35
- "Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
36
- ]
37
  }
38
  }
39
 
@@ -125,69 +112,161 @@ def get_example_text(language, example_idx):
125
  return ""
126
 
127
  def synthesize_speech(text, language, speaker):
128
- """Sinteză vocală din text cu loguri detaliate pentru speakeri."""
129
- if not text or not text.strip():
130
- return None, "Te rog introdu text pentru sinteză."
131
-
 
132
  tts_model = load_model(language)
133
  if tts_model is None:
134
- return None, f"Nu s-a putut încărca modelul pentru {language}."
135
-
136
  try:
137
  text = text.lower().strip()
138
- print("=" * 60)
139
- print("[DEBUG] START Synthesize")
140
- print(f"[DEBUG] Text: '{text}'")
141
- print(f"[DEBUG] Language: '{language}'")
142
- print(f"[DEBUG] Speaker solicitat: '{speaker}'")
143
-
144
- # 1) speakeri disponibili expuși de model
145
- available_speakers = getattr(tts_model, "speakers", None)
146
- if available_speakers is not None:
147
- try:
148
- n_speakers = len(available_speakers)
149
- except Exception:
150
- n_speakers = None
151
- print(f"[DEBUG] Modelul expune 'speakers': {available_speakers}")
152
- print(f"[DEBUG] Număr vorbitori (len): {n_speakers}")
153
- if n_speakers and n_speakers > 0:
154
- print(f"[DEBUG] Speaker valid? {speaker in available_speakers} (căutăm '{speaker}')")
155
- else:
156
- print("[DEBUG] Modelul NU expune lista de vorbitori (probabil single-speaker).")
157
-
158
- # 2) încercăm să deducem și alte câmpuri posibile (unele modele folosesc 'speaker_manager' etc.)
159
- speaker_manager = getattr(getattr(tts_model, "speaker_manager", None), "speakers", None)
160
- if speaker_manager is not None:
161
- try:
162
- print(f"[DEBUG] speaker_manager.speakers keys: {list(speaker_manager.keys())}")
163
- except Exception:
164
- print("[DEBUG] speaker_manager.speakers există dar nu poate fi listat simplu.")
165
-
166
- # 3) apelăm API-ul public
167
- wav = None
168
- if available_speakers and speaker in available_speakers:
169
- print(f"[DEBUG] Apel: tts_model.tts(text=..., speaker='{speaker}')")
170
- wav = tts_model.tts(text=text, speaker=speaker)
171
- else:
172
- print("[DEBUG] Apel: tts_model.tts(text=...) fără speaker (fallback)")
173
- wav = tts_model.tts(text=text)
174
-
175
- if wav is None:
176
- print("[DEBUG] Eșec: tts_model.tts() a returnat None")
177
- return None, "TTS a returnat None, verifică textul și/sau speakerul."
178
 
179
- import numpy as np
180
- import soundfile as sf
181
- wav = np.array(wav, dtype=np.float32)
182
- output_path = "output.wav"
183
- sr = getattr(tts_model.synthesizer, "output_sample_rate", 22050)
184
- print(f"[DEBUG] Scriem WAV la {output_path} cu sample_rate={sr}")
185
- sf.write(output_path, wav, sr)
186
 
187
- print("[DEBUG] SUCCES: tts_model.tts() a rulat corect")
188
- print("=" * 60)
189
- return output_path, None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
- except Exception as e:
192
- print("[DEBUG] EXCEPȚIE în synthesize_speech:", repr(e))
193
- return None, f"Eroare la sinteză: {str(e)}"
 
21
  "Lafiyarku tafi kuɗinku muhimmanci.",
22
  "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
23
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  }
25
  }
26
 
 
112
  return ""
113
 
114
def synthesize_speech(text, language, speaker):
    """Synthesize speech from text.

    Args:
        text: Input text; lowercased/stripped before synthesis. May be
            None or empty (e.g. a cleared Gradio textbox).
        language: Key into MODELS selecting which TTS model to load.
        speaker: Speaker name; passed to the synthesizer only when the
            model supports multi-speaker synthesis.

    Returns:
        (audio_path, status_message) on success, or (None, error_message)
        on failure — matching the (audio_output, status_output) pair the
        Gradio click handler expects.
    """
    # Guard against None as well as empty/whitespace-only input;
    # calling .strip() on None would raise AttributeError.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    # Load the model
    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Failed to load {language} model."

    try:
        text = text.lower().strip()
        print(f"DEBUG: Processing text: '{text}'")
        print(f"DEBUG: Speaker name: '{speaker}'")

        synthesizer = tts_model.synthesizer

        # Multi-speaker models accept speaker_name; single-speaker models
        # raise TypeError on the extra kwarg, so retry without it.
        try:
            wav = synthesizer.tts(text=text, speaker_name=speaker)
        except TypeError:
            wav = synthesizer.tts(text=text)

        print("DEBUG: synthesizer.tts() completed successfully")

        # Convert to numpy array and save to a temporary file
        wav_array = np.array(wav, dtype=np.float32)

        # delete=False so Gradio can read the file after we return its path.
        # Close the handle right away: wavfile.write below re-opens the path
        # itself, and a dangling open handle leaks (and blocks on Windows).
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_file.close()

        # Save audio using the synthesizer's sample rate
        import scipy.io.wavfile as wavfile
        wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)

        print("Speech synthesized successfully!")

        return temp_file.name, "Speech synthesized successfully!"

    except Exception as e:
        # Top-level boundary for the UI: report the error instead of crashing.
        return None, f"Error during synthesis: {str(e)}"
 
 
 
154
 
155
# Create Gradio interface.
# Layout: a header, a two-column row (controls on the left, outputs on the
# right), event wiring, and a footer — all inside one Blocks context so the
# components share state.
with gr.Blocks(title="TWB Voice TTS Demo") as demo:
    gr.Markdown("""
    # TWB Voice Text-to-Speech Demo Space

    This demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global.
    Currently it supports **Hausa** and **Kanuri** languages, developed as part of the first phase of the project.

    ### Features:
    - **Hausa**: 3 speakers (1 female, 2 male)
    - **Kanuri**: 1 female speaker
    - High-quality 24kHz audio output
    - Based on YourTTS architecture

    ### Links:
    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
    - 🌐 [TWB Voice Project](https://twbvoice.org/)

    ---
    """)

    with gr.Row():
        with gr.Column():
            # Language selection — choices come from the MODELS registry keys.
            language_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Hausa",
                label="Language",
                info="Select the language for synthesis"
            )

            # Speaker selection — initialized for the default ("Hausa")
            # language; repopulated by update_speakers on language change.
            speaker_dropdown = gr.Dropdown(
                choices=list(MODELS["Hausa"]["speakers"].keys()),
                value="spk_f_1",
                label="Speaker",
                info="Select the voice speaker"
            )

            # Text input
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text in the selected language (will be converted to lowercase)",
                lines=3,
                info="Note: Text will be automatically converted to lowercase as required by the models"
            )

            # Example buttons — each loads one of the language's example
            # sentences into the text box.
            gr.Markdown("**Press to load a sentence in selected language:**")
            with gr.Row():
                example_btn_1 = gr.Button("Example 1", size="sm")
                example_btn_2 = gr.Button("Example 2", size="sm")
                example_btn_3 = gr.Button("Example 3", size="sm")

            # Synthesize button
            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")

        with gr.Column():
            # Audio output — type="filepath" matches synthesize_speech,
            # which returns a path to a temporary WAV file.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

            # Status message (read-only; shows success or error text).
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Event handlers
    # Changing the language repopulates the speaker dropdown.
    language_dropdown.change(
        fn=update_speakers,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown]
    )

    example_btn_1.click(
        fn=lambda lang: get_example_text(lang, 0),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    example_btn_2.click(
        fn=lambda lang: get_example_text(lang, 1),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    example_btn_3.click(
        fn=lambda lang: get_example_text(lang, 2),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    # Main action: synthesize_speech returns (audio_path, status_message),
    # which map onto the two outputs in order.
    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, language_dropdown, speaker_dropdown],
        outputs=[audio_output, status_output]
    )

    gr.Markdown("""
    ---
    ### Notes:
    - Models work with **lowercase input text** (automatically converted)
    - Audio output is generated at 24kHz sample rate

    ### License:
    This app and the models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).

    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
    """)
270
 
271
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()