Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
@@ -21,6 +21,19 @@ MODELS = {
|
|
21 |
"Lafiyarku tafi kuɗinku muhimmanci.",
|
22 |
"A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
|
23 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
}
|
25 |
}
|
26 |
|
@@ -112,161 +125,69 @@ def get_example_text(language, example_idx):
|
|
112 |
return ""
|
113 |
|
114 |
def synthesize_speech(text, language, speaker):
|
115 |
-
"""
|
116 |
-
if not text.strip():
|
117 |
-
return None, "
|
118 |
-
|
119 |
-
# Load the model
|
120 |
tts_model = load_model(language)
|
121 |
if tts_model is None:
|
122 |
-
return None, f"
|
123 |
-
|
124 |
try:
|
125 |
text = text.lower().strip()
|
126 |
-
print(
|
127 |
-
print(
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
try:
|
132 |
-
wav = synthesizer.tts(text=text, speaker_name=speaker)
|
133 |
-
except TypeError:
|
134 |
-
wav = synthesizer.tts(text=text)
|
135 |
-
|
136 |
-
print(f"DEBUG: synthesizer.tts() completed successfully")
|
137 |
-
|
138 |
-
# Convert to numpy array and save to temporary file
|
139 |
-
wav_array = np.array(wav, dtype=np.float32)
|
140 |
-
|
141 |
-
# Create temporary file
|
142 |
-
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
|
143 |
-
|
144 |
-
# Save audio using the synthesizer's sample rate
|
145 |
-
import scipy.io.wavfile as wavfile
|
146 |
-
wavfile.write(temp_file.name, synthesizer.output_sample_rate, wav_array)
|
147 |
-
|
148 |
-
print("Speech synthesized successfully!")
|
149 |
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
-
#
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
### Features:
|
164 |
-
- **Hausa**: 3 speakers (1 female, 2 male)
|
165 |
-
- **Kanuri**: 1 female speaker
|
166 |
-
- High-quality 24kHz audio output
|
167 |
-
- Based on YourTTS architecture
|
168 |
-
|
169 |
-
### Links:
|
170 |
-
- 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
|
171 |
-
- 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
|
172 |
-
- 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
|
173 |
-
- 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
|
174 |
-
- 🌐 [TWB Voice Project](https://twbvoice.org/)
|
175 |
-
|
176 |
-
---
|
177 |
-
""")
|
178 |
-
|
179 |
-
with gr.Row():
|
180 |
-
with gr.Column():
|
181 |
-
# Language selection
|
182 |
-
language_dropdown = gr.Dropdown(
|
183 |
-
choices=list(MODELS.keys()),
|
184 |
-
value="Hausa",
|
185 |
-
label="Language",
|
186 |
-
info="Select the language for synthesis"
|
187 |
-
)
|
188 |
-
|
189 |
-
# Speaker selection
|
190 |
-
speaker_dropdown = gr.Dropdown(
|
191 |
-
choices=list(MODELS["Hausa"]["speakers"].keys()),
|
192 |
-
value="spk_f_1",
|
193 |
-
label="Speaker",
|
194 |
-
info="Select the voice speaker"
|
195 |
-
)
|
196 |
-
|
197 |
-
# Text input
|
198 |
-
text_input = gr.Textbox(
|
199 |
-
label="Text to synthesize",
|
200 |
-
placeholder="Enter text in the selected language (will be converted to lowercase)",
|
201 |
-
lines=3,
|
202 |
-
info="Note: Text will be automatically converted to lowercase as required by the models"
|
203 |
-
)
|
204 |
-
|
205 |
-
# Example buttons
|
206 |
-
gr.Markdown("**Press to load a sentence in selected language:**")
|
207 |
-
with gr.Row():
|
208 |
-
example_btn_1 = gr.Button("Example 1", size="sm")
|
209 |
-
example_btn_2 = gr.Button("Example 2", size="sm")
|
210 |
-
example_btn_3 = gr.Button("Example 3", size="sm")
|
211 |
-
|
212 |
-
# Synthesize button
|
213 |
-
synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")
|
214 |
-
|
215 |
-
with gr.Column():
|
216 |
-
# Audio output
|
217 |
-
audio_output = gr.Audio(
|
218 |
-
label="Generated Speech",
|
219 |
-
type="filepath"
|
220 |
-
)
|
221 |
-
|
222 |
-
# Status message
|
223 |
-
status_output = gr.Textbox(
|
224 |
-
label="Status",
|
225 |
-
interactive=False
|
226 |
-
)
|
227 |
-
|
228 |
-
# Event handlers
|
229 |
-
language_dropdown.change(
|
230 |
-
fn=update_speakers,
|
231 |
-
inputs=[language_dropdown],
|
232 |
-
outputs=[speaker_dropdown]
|
233 |
-
)
|
234 |
-
|
235 |
-
example_btn_1.click(
|
236 |
-
fn=lambda lang: get_example_text(lang, 0),
|
237 |
-
inputs=[language_dropdown],
|
238 |
-
outputs=[text_input]
|
239 |
-
)
|
240 |
-
|
241 |
-
example_btn_2.click(
|
242 |
-
fn=lambda lang: get_example_text(lang, 1),
|
243 |
-
inputs=[language_dropdown],
|
244 |
-
outputs=[text_input]
|
245 |
-
)
|
246 |
-
|
247 |
-
example_btn_3.click(
|
248 |
-
fn=lambda lang: get_example_text(lang, 2),
|
249 |
-
inputs=[language_dropdown],
|
250 |
-
outputs=[text_input]
|
251 |
-
)
|
252 |
-
|
253 |
-
synthesize_btn.click(
|
254 |
-
fn=synthesize_speech,
|
255 |
-
inputs=[text_input, language_dropdown, speaker_dropdown],
|
256 |
-
outputs=[audio_output, status_output]
|
257 |
-
)
|
258 |
-
|
259 |
-
gr.Markdown("""
|
260 |
-
---
|
261 |
-
### Notes:
|
262 |
-
- Models work with **lowercase input text** (automatically converted)
|
263 |
-
- Audio output is generated at 24kHz sample rate
|
264 |
-
|
265 |
-
### License:
|
266 |
-
This app and the models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).
|
267 |
-
|
268 |
-
**Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
|
269 |
-
""")
|
270 |
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
"Lafiyarku tafi kuɗinku muhimmanci.",
|
22 |
"A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
|
23 |
]
|
24 |
+
},
|
25 |
+
"Kanuri": {
|
26 |
+
"model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
|
27 |
+
"model_name": "best_model_264313.pth",
|
28 |
+
"config_name": "config.json",
|
29 |
+
"speakers": {
|
30 |
+
"spk1": "Female"
|
31 |
+
},
|
32 |
+
"examples": [
|
33 |
+
"Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
|
34 |
+
"Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
|
35 |
+
"Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
|
36 |
+
]
|
37 |
}
|
38 |
}
|
39 |
|
|
|
125 |
return ""
|
126 |
|
127 |
def synthesize_speech(text, language, speaker):
    """Synthesize speech from text with detailed speaker debug logging.

    Parameters
    ----------
    text : str or None
        Input text; lowercased and stripped before synthesis.
    language : str
        Key selecting which TTS model to load (via ``load_model``).
    speaker : str
        Requested speaker id; only passed through if the model
        actually advertises it in its speaker list.

    Returns
    -------
    tuple
        ``(wav_file_path, None)`` on success,
        ``(None, error_message)`` on failure.
    """
    if not text or not text.strip():
        return None, "Te rog introdu text pentru sinteză."

    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Nu s-a putut încărca modelul pentru {language}."

    try:
        text = text.lower().strip()
        print("=" * 60)
        print("[DEBUG] START Synthesize")
        print(f"[DEBUG] Text: '{text}'")
        print(f"[DEBUG] Language: '{language}'")
        print(f"[DEBUG] Speaker solicitat: '{speaker}'")

        # 1) Speakers exposed directly by the model (multi-speaker models).
        available_speakers = getattr(tts_model, "speakers", None)
        if available_speakers is not None:
            try:
                n_speakers = len(available_speakers)
            except Exception:
                # Some speaker containers are not sized; log "unknown".
                n_speakers = None
            print(f"[DEBUG] Modelul expune 'speakers': {available_speakers}")
            print(f"[DEBUG] Număr vorbitori (len): {n_speakers}")
            if n_speakers and n_speakers > 0:
                print(f"[DEBUG] Speaker valid? {speaker in available_speakers} (căutăm '{speaker}')")
        else:
            print("[DEBUG] Modelul NU expune lista de vorbitori (probabil single-speaker).")

        # 2) Some models keep their speakers on a 'speaker_manager' instead;
        #    nested getattr so a missing manager just yields None.
        speaker_manager = getattr(getattr(tts_model, "speaker_manager", None), "speakers", None)
        if speaker_manager is not None:
            try:
                print(f"[DEBUG] speaker_manager.speakers keys: {list(speaker_manager.keys())}")
            except Exception:
                print("[DEBUG] speaker_manager.speakers există dar nu poate fi listat simplu.")

        # 3) Call the public API; fall back to a speaker-less call when the
        #    requested speaker is not advertised by the model.
        wav = None
        if available_speakers and speaker in available_speakers:
            print(f"[DEBUG] Apel: tts_model.tts(text=..., speaker='{speaker}')")
            wav = tts_model.tts(text=text, speaker=speaker)
        else:
            print("[DEBUG] Apel: tts_model.tts(text=...) fără speaker (fallback)")
            wav = tts_model.tts(text=text)

        if wav is None:
            print("[DEBUG] Eșec: tts_model.tts() a returnat None")
            return None, "TTS a returnat None, verifică textul și/sau speakerul."

        import tempfile

        import numpy as np
        import soundfile as sf

        wav = np.array(wav, dtype=np.float32)

        # FIX: write to a unique temp file instead of a fixed "output.wav" —
        # with a shared path, concurrent Gradio requests clobber each other's
        # audio. The caller consumes the path (gr.Audio type="filepath").
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        output_path = temp_file.name
        temp_file.close()

        # FIX: nested getattr so a model object without a '.synthesizer'
        # attribute falls back to 22050 Hz instead of raising AttributeError.
        sr = getattr(getattr(tts_model, "synthesizer", None), "output_sample_rate", 22050)
        print(f"[DEBUG] Scriem WAV la {output_path} cu sample_rate={sr}")
        sf.write(output_path, wav, sr)

        print("[DEBUG] SUCCES: tts_model.tts() a rulat corect")
        print("=" * 60)
        return output_path, None

    except Exception as e:
        print("[DEBUG] EXCEPȚIE în synthesize_speech:", repr(e))
        return None, f"Eroare la sinteză: {str(e)}"