Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
@@ -21,19 +21,6 @@ MODELS = {
|
|
21 |
"Lafiyarku tafi kuɗinku muhimmanci.",
|
22 |
"A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
|
23 |
]
|
24 |
-
},
|
25 |
-
"Kanuri": {
|
26 |
-
"model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
|
27 |
-
"model_name": "best_model_264313.pth",
|
28 |
-
"config_name": "config.json",
|
29 |
-
"speakers": {
|
30 |
-
"spk1": "Female"
|
31 |
-
},
|
32 |
-
"examples": [
|
33 |
-
"Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
|
34 |
-
"Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
|
35 |
-
"Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
|
36 |
-
]
|
37 |
}
|
38 |
}
|
39 |
|
@@ -125,69 +112,161 @@ def get_example_text(language, example_idx):
|
|
125 |
return ""
|
126 |
|
127 |
def synthesize_speech(text, language, speaker):
|
128 |
-
"""
|
129 |
-
if not text
|
130 |
-
return None, "
|
131 |
-
|
|
|
132 |
tts_model = load_model(language)
|
133 |
if tts_model is None:
|
134 |
-
return None, f"
|
135 |
-
|
136 |
try:
|
137 |
text = text.lower().strip()
|
138 |
-
print("
|
139 |
-
print("
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
try:
|
162 |
-
print(f"[DEBUG] speaker_manager.speakers keys: {list(speaker_manager.keys())}")
|
163 |
-
except Exception:
|
164 |
-
print("[DEBUG] speaker_manager.speakers există dar nu poate fi listat simplu.")
|
165 |
-
|
166 |
-
# 3) apelăm API-ul public
|
167 |
-
wav = None
|
168 |
-
if available_speakers and speaker in available_speakers:
|
169 |
-
print(f"[DEBUG] Apel: tts_model.tts(text=..., speaker='{speaker}')")
|
170 |
-
wav = tts_model.tts(text=text, speaker=speaker)
|
171 |
-
else:
|
172 |
-
print("[DEBUG] Apel: tts_model.tts(text=...) fără speaker (fallback)")
|
173 |
-
wav = tts_model.tts(text=text)
|
174 |
-
|
175 |
-
if wav is None:
|
176 |
-
print("[DEBUG] Eșec: tts_model.tts() a returnat None")
|
177 |
-
return None, "TTS a returnat None, verifică textul și/sau speakerul."
|
178 |
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
sr = getattr(tts_model.synthesizer, "output_sample_rate", 22050)
|
184 |
-
print(f"[DEBUG] Scriem WAV la {output_path} cu sample_rate={sr}")
|
185 |
-
sf.write(output_path, wav, sr)
|
186 |
|
187 |
-
|
188 |
-
|
189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
-
|
192 |
-
|
193 |
-
return None, f"Eroare la sinteză: {str(e)}"
|
|
|
21 |
"Lafiyarku tafi kuɗinku muhimmanci.",
|
22 |
"A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
|
23 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
}
|
25 |
}
|
26 |
|
|
|
112 |
return ""
|
113 |
|
114 |
def synthesize_speech(text, language, speaker):
    """Synthesize speech for ``text`` and save it to a temporary WAV file.

    Args:
        text: Text to synthesize. It is stripped and lowercased before being
            passed to the synthesizer. ``None`` is treated like empty input.
        language: Language key handed to ``load_model``.
        speaker: Speaker name forwarded to the synthesizer as ``speaker_name``.

    Returns:
        A ``(path, status)`` tuple. On success ``path`` is the name of the
        generated WAV file and ``status`` a success message; on any failure
        ``path`` is ``None`` and ``status`` explains what went wrong.
    """
    # Guard against None as well as blank input so the UI gets a friendly
    # message instead of an AttributeError from None.strip().
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    # Load the model
    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Failed to load {language} model."

    try:
        text = text.lower().strip()
        print(f"DEBUG: Processing text: '{text}'")
        print(f"DEBUG: Speaker name: '{speaker}'")

        synthesizer = tts_model.synthesizer

        try:
            wav = synthesizer.tts(text=text, speaker_name=speaker)
        except TypeError:
            # Best-effort fallback: retry without a speaker, presumably for
            # synthesizers whose tts() rejects speaker_name -- TODO confirm.
            wav = synthesizer.tts(text=text)

        print("DEBUG: synthesizer.tts() completed successfully")

        # Convert to a float32 numpy array before writing it out
        wav_array = np.array(wav, dtype=np.float32)

        # Reserve a unique path, then close the handle right away so no file
        # descriptor is leaked and the file can be reopened by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
            output_path = temp_file.name

        # Save audio using the synthesizer's sample rate
        import scipy.io.wavfile as wavfile
        wavfile.write(output_path, synthesizer.output_sample_rate, wav_array)

        print("Speech synthesized successfully!")

        return output_path, "Speech synthesized successfully!"

    except Exception as e:
        # UI boundary: report the failure as a status message rather than
        # letting the exception propagate.
        return None, f"Error during synthesis: {str(e)}"
|
|
|
|
|
|
|
154 |
|
155 |
+
# Build the Gradio UI: inputs (language, speaker, text, example buttons) in the
# left column, generated audio and a status box in the right column.
# NOTE(review): the intro text advertises Kanuri, but the Kanuri entry appears
# to have been removed from MODELS in this revision -- confirm which is intended.
with gr.Blocks(title="TWB Voice TTS Demo") as demo:
    gr.Markdown("""
    # TWB Voice Text-to-Speech Demo Space

    This demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global.
    Currently it supports **Hausa** and **Kanuri** languages, developed as part of the first phase of the project.

    ### Features:
    - **Hausa**: 3 speakers (1 female, 2 male)
    - **Kanuri**: 1 female speaker
    - High-quality 24kHz audio output
    - Based on YourTTS architecture

    ### Links:
    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
    - 🌐 [TWB Voice Project](https://twbvoice.org/)

    ---
    """)

    with gr.Row():
        with gr.Column():
            # Language selection: one entry per key of MODELS
            language_choices = list(MODELS.keys())
            language_dropdown = gr.Dropdown(
                label="Language",
                info="Select the language for synthesis",
                choices=language_choices,
                value="Hausa",
            )

            # Speaker selection: starts with the Hausa speakers and is
            # refreshed by update_speakers when the language changes
            hausa_speaker_ids = list(MODELS["Hausa"]["speakers"].keys())
            speaker_dropdown = gr.Dropdown(
                label="Speaker",
                info="Select the voice speaker",
                choices=hausa_speaker_ids,
                value="spk_f_1",
            )

            # Text input
            text_input = gr.Textbox(
                label="Text to synthesize",
                info="Note: Text will be automatically converted to lowercase as required by the models",
                placeholder="Enter text in the selected language (will be converted to lowercase)",
                lines=3,
            )

            # Example buttons, created left to right inside a single row
            gr.Markdown("**Press to load a sentence in selected language:**")
            with gr.Row():
                example_btn_1 = gr.Button("Example 1", size="sm")
                example_btn_2 = gr.Button("Example 2", size="sm")
                example_btn_3 = gr.Button("Example 3", size="sm")

            # Synthesize button
            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")

        with gr.Column():
            # Audio output, delivered to the component as a file path
            audio_output = gr.Audio(type="filepath", label="Generated Speech")

            # Read-only status message
            status_output = gr.Textbox(interactive=False, label="Status")

    # Event handlers
    language_dropdown.change(
        fn=update_speakers,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown],
    )

    # Each example button loads the sentence at its own index; the index is
    # bound as a default argument so every handler keeps its own value.
    for example_idx, example_btn in enumerate(
        (example_btn_1, example_btn_2, example_btn_3)
    ):
        example_btn.click(
            fn=lambda lang, idx=example_idx: get_example_text(lang, idx),
            inputs=[language_dropdown],
            outputs=[text_input],
        )

    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, language_dropdown, speaker_dropdown],
        outputs=[audio_output, status_output],
    )

    gr.Markdown("""
    ---
    ### Notes:
    - Models work with **lowercase input text** (automatically converted)
    - Audio output is generated at 24kHz sample rate

    ### License:
    This app and the models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).

    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
    """)

# Launch the app only when the file is run as a script
if __name__ == "__main__":
    demo.launch()
|
|