Alp committed on
Commit
7a55510
Β·
1 Parent(s): 77455c1
Files changed (3) hide show
  1. README.md +49 -2
  2. app.py +233 -0
  3. requirements.txt +6 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: TWB Voice TTS
3
  emoji: 🌍
4
  colorFrom: blue
5
  colorTo: gray
@@ -11,4 +11,51 @@ license: cc-by-4.0
11
  short_description: 'Space to demo TTS models '
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: TWB Voice TTS Demo
3
  emoji: 🌍
4
  colorFrom: blue
5
  colorTo: gray
 
11
  short_description: 'Space to demo TTS models '
12
  ---
13
 
14
+ # TWB Voice 1.0 - TTS Demo Space
15
+
16
+ This Gradio demo showcases neural Text-to-Speech models developed within the TWB Voice project by CLEAR Global. Currently it supports **Hausa** and **Kanuri** languages, developed as part of the first phase of the project.
17
+
18
+ ## Features
19
+
20
+ - **Hausa TTS**: 3 speakers (1 female in Kenanci dialect, 2 male speakers from open.bible)
21
+ - **Kanuri TTS**: 1 female speaker
22
+ - High-quality 24kHz audio synthesis
23
+ - Based on YourTTS architecture (VITS-based)
24
+
25
+ ## Models
26
+
27
+ - πŸ€— [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
28
+ - πŸ€— [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
29
+
30
+ ## Datasets
31
+
32
+ - πŸ“Š [Hausa Dataset Samples](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
33
+ - πŸ“Š [Kanuri Dataset Samples](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
34
+
35
+ ## Usage
36
+
37
+ 1. Select your desired language (Hausa or Kanuri)
38
+ 2. Choose a speaker from the available options
39
+ 3. Enter text or use the example sentences
40
+ 4. Click "Synthesize Speech" to generate audio
41
+
42
+ ## Technical Details
43
+
44
+ - **Architecture**: YourTTS (VITS-based) fine-tuned from CML-TTS multilingual checkpoint
45
+ - **Sample Rate**: 24 kHz
46
+ - **Input**: Lowercase text with preserved diacritics
47
+ - **Framework**: Coqui TTS
48
+
49
+ ## License
50
+
51
+ These models are released under **CC-BY-NC-4.0** license for non-commercial use only.
52
+
53
+ ## Acknowledgments
54
+
55
+ Created by CLEAR Global with support from the Patrick J. McGovern Foundation.
56
+
57
+ Special thanks to:
58
+ - TWB Voice Project for high-quality voice data
59
+ - Idiap Coqui TTS for the YourTTS architecture
60
+ - CML-TTS Dataset for the multilingual base model
61
+ - Biblica open.bible for additional Hausa recordings
app.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from TTS.api import TTS
4
+ import numpy as np
5
+ import tempfile
6
+ import os
7
+
8
# Model configurations
# Per-language registry driving the whole app: the Hugging Face repo to pull
# from, the exact checkpoint/config filenames inside that repo, the speaker
# ids the checkpoint was trained with (mapped to human-readable labels for
# the dropdown), and three canned example sentences for the "Example" buttons.
MODELS = {
    "Hausa": {
        "model_repo": "CLEAR-Global/TWB-Voice-Hausa-TTS-1.0",
        "model_name": "best_model_498283.pth",
        "config_name": "config.json",
        # Speaker id -> label shown in the UI dropdown.
        "speakers": {
            "spk_f_1": "Female",
            "spk_m_1": "Male 1",
            "spk_m_2": "Male 2"
        },
        "examples": [
            "Lokacin damuna shuka kan koriya shar.",
            "Lafiyarku tafi kuɗinku muhimmanci.",
            "A kiyayi inda ake samun labarun magani ko kariya da cututtuka."
        ]
    },
    "Kanuri": {
        "model_repo": "CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0",
        "model_name": "best_model_264313.pth",
        "config_name": "config.json",
        "speakers": {
            "spk1": "Female"
        },
        "examples": [
            "Loktu nǝngriyi ye lan, nǝyama kulo ye dǝ so shawwa ro wurazen.",
            "Nǝlewa nǝm dǝ, kunguna nǝm wa faidan kozǝna.",
            "Na done hawar kattu ye so kǝla kurun nǝlewa ye tarzeyen so dǝa wane."
        ]
    }
}

# Initialize models
# NOTE(review): `device` is computed but never read — model placement is done
# via the `gpu=` flag in load_model(). Kept for now; consider removing or
# wiring it through.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Lazy per-language cache of instantiated TTS objects (filled by load_model).
loaded_models = {}
44
def load_model(language):
    """Return the TTS model for *language*, loading and caching it on first use.

    Downloads the published checkpoint and config from the Hugging Face Hub,
    then instantiates a Coqui ``TTS`` object. Returns ``None`` when loading
    fails (the failure is printed, not raised, so the UI can report it).
    """
    cached = loaded_models.get(language)
    if cached is not None:
        return cached

    cfg = MODELS[language]
    try:
        from huggingface_hub import hf_hub_download

        # Fetch the exact checkpoint and config files named in the registry.
        checkpoint = hf_hub_download(repo_id=cfg["model_repo"], filename=cfg["model_name"])
        config_file = hf_hub_download(repo_id=cfg["model_repo"], filename=cfg["config_name"])

        # Instantiate from explicit local paths; use the GPU when available.
        model = TTS(model_path=checkpoint, config_path=config_file, gpu=torch.cuda.is_available())
    except Exception as e:
        print(f"Error loading {language} model: {e}")
        return None

    loaded_models[language] = model
    return model
65
+
66
def update_speakers(language):
    """Rebuild the speaker dropdown to match the newly selected language.

    Returns a Gradio Dropdown update: the known speakers for *language* with
    the first one preselected, or an empty disabled dropdown for an unknown
    language.
    """
    if language not in MODELS:
        return gr.Dropdown(choices=[], interactive=False)

    # (label, value) pairs, e.g. ("spk_f_1: Female", "spk_f_1").
    options = [
        (f"{spk_id}: {desc}", spk_id)
        for spk_id, desc in MODELS[language]["speakers"].items()
    ]
    return gr.Dropdown(choices=options, value=options[0][1], interactive=True)
74
+
75
def get_example_text(language, example_idx):
    """Return the canned example sentence at *example_idx* for *language*.

    Yields the empty string for an unknown language or an out-of-range index.
    """
    if language not in MODELS:
        return ""
    examples = MODELS[language]["examples"]
    if not 0 <= example_idx < len(examples):
        return ""
    return examples[example_idx]
80
+
81
def synthesize_speech(text, language, speaker):
    """Synthesize *text* with the selected language model and speaker.

    Returns a ``(wav_path, status_message)`` tuple for the Gradio outputs;
    ``wav_path`` is ``None`` on any failure so the audio widget stays empty
    and the status box explains what went wrong.
    """
    import scipy.io.wavfile as wavfile  # hoisted: was re-imported on every call

    if not text.strip():
        return None, "Please enter some text to synthesize."

    # Load the model (cached after the first call per language).
    tts_model = load_model(language)
    if tts_model is None:
        return None, f"Failed to load {language} model."

    try:
        # The models were trained on lowercase text, so normalize the input.
        text = text.lower().strip()

        # Generate speech samples (list of floats from Coqui TTS).
        wav = tts_model.tts(text=text, speaker=speaker)
        wav_array = np.array(wav, dtype=np.float32)

        # Create the temp file and close the handle immediately: the original
        # left the fd open (leak) and an open NamedTemporaryFile cannot be
        # reopened for writing on Windows. delete=False keeps the file around
        # so Gradio can serve it after we return the path.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
        temp_file.close()

        # 24 kHz sample rate, as specified in the model cards.
        wavfile.write(temp_file.name, 24000, wav_array)

        return temp_file.name, "Speech synthesized successfully!"

    except Exception as e:
        return None, f"Error during synthesis: {str(e)}"
112
+
113
# Create Gradio interface
# Layout: header markdown, then a two-column row (inputs on the left, audio +
# status on the right), then the event wiring, then a footer markdown.
with gr.Blocks(title="TWB Voice TTS Demo", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # TWB Voice 1.0 - Hausa & Kanuri Text-to-Speech

    This demo showcases neural Text-to-Speech models for **Hausa** and **Kanuri** languages,
    developed as part of the TWB Voice 1.0 project by CLEAR Global.

    ### Features:
    - **Hausa**: 3 speakers (1 female, 2 male) - Kenanci dialect
    - **Kanuri**: 1 female speaker
    - High-quality 24kHz audio output
    - Based on YourTTS architecture

    ### Links:
    - 🤗 [Hausa Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Hausa-TTS-1.0)
    - 🤗 [Kanuri Model](https://huggingface.co/CLEAR-Global/TWB-Voice-Kanuri-TTS-1.0)
    - 📊 [Hausa Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Hausa-1.0-sampleset)
    - 📊 [Kanuri Dataset](https://huggingface.co/datasets/CLEAR-Global/TWB-voice-TTS-Kanuri-1.0-sampleset)
    - 🌐 [TWB Voice Project](https://twbvoice.org/)

    ---
    """)

    with gr.Row():
        with gr.Column():
            # Language selection — choices come straight from the MODELS registry.
            language_dropdown = gr.Dropdown(
                choices=list(MODELS.keys()),
                value="Hausa",
                label="Language",
                info="Select the language for synthesis"
            )

            # Speaker selection.
            # NOTE(review): the initial choices are hard-coded to match the
            # default "Hausa" language but only list one speaker (with a
            # "(Kenanci dialect)" label that differs from MODELS' "Female");
            # the full list only appears after the first language change event.
            speaker_dropdown = gr.Dropdown(
                choices=[(f"spk_f_1: Female (Kenanci dialect)", "spk_f_1")],
                value="spk_f_1",
                label="Speaker",
                info="Select the voice speaker"
            )

            # Text input — synthesize_speech lowercases it before synthesis.
            text_input = gr.Textbox(
                label="Text to synthesize",
                placeholder="Enter text in the selected language (will be converted to lowercase)",
                lines=3,
                info="Note: Text will be automatically converted to lowercase as required by the models"
            )

            # Example buttons — each one fills text_input with a canned sentence
            # for the currently selected language (see wiring below).
            gr.Markdown("**Quick examples:**")
            with gr.Row():
                example_btn_1 = gr.Button("Example 1", size="sm")
                example_btn_2 = gr.Button("Example 2", size="sm")
                example_btn_3 = gr.Button("Example 3", size="sm")

            # Synthesize button — triggers synthesize_speech.
            synthesize_btn = gr.Button("🎤 Synthesize Speech", variant="primary")

        with gr.Column():
            # Audio output — receives the temp-file path from synthesize_speech.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath"
            )

            # Status message — success or error text from synthesize_speech.
            status_output = gr.Textbox(
                label="Status",
                interactive=False
            )

    # Event handlers
    # Changing the language rebuilds the speaker dropdown.
    language_dropdown.change(
        fn=update_speakers,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown]
    )

    # Each example button loads example i (0-based) for the current language.
    example_btn_1.click(
        fn=lambda lang: get_example_text(lang, 0),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    example_btn_2.click(
        fn=lambda lang: get_example_text(lang, 1),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    example_btn_3.click(
        fn=lambda lang: get_example_text(lang, 2),
        inputs=[language_dropdown],
        outputs=[text_input]
    )

    # Main action: (text, language, speaker) -> (audio path, status message).
    synthesize_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, language_dropdown, speaker_dropdown],
        outputs=[audio_output, status_output]
    )

    gr.Markdown("""
    ---
    ### Notes:
    - Models work best with **lowercase input text** (automatically converted)
    - **Hausa model** supports diacritics: `ăāɓɗƙƴū`
    - **Kanuri model** supports diacritics: `Ñúǝəә`
    - Audio output is generated at 24kHz sample rate
    - Models are optimized for educational and general content

    ### License:
    These models are released under **CC-BY-NC-4.0** license (Non-Commercial use only).

    **Created by:** CLEAR Global with support from the Patrick J. McGovern Foundation
    """)

# Script entry point: launch the Gradio server.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ TTS
3
+ torch
4
+ scipy
5
+ numpy
6
+ huggingface_hub