Spaces:
Running
Running
Added TTS: MaskGCT & StyleTTS kokoro; Edge space fixed
Browse files
app.py
CHANGED
|
@@ -98,16 +98,24 @@ AVAILABLE_MODELS = {
|
|
| 98 |
# 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0 # overly jolly
|
| 99 |
|
| 100 |
# # Microsoft Edge TTS
|
| 101 |
-
|
| 102 |
|
| 103 |
# IMS-Toucan
|
| 104 |
-
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
|
| 105 |
|
| 106 |
# IMS-Toucan English non-artificial
|
| 107 |
'Flux9665/EnglishToucan': 'Flux9665/EnglishToucan', # 5.1
|
| 108 |
|
| 109 |
# StyleTTS v2
|
| 110 |
-
'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
# HF TTS w issues
|
| 113 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
|
@@ -276,10 +284,38 @@ HF_SPACES = {
|
|
| 276 |
'function': '/synthesize',
|
| 277 |
'text_param_index': 0,
|
| 278 |
'return_audio_index': 0,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
'is_zero_gpu_space': True,
|
| 280 |
'series': 'StyleTTS',
|
| 281 |
},
|
| 282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
# TTS w issues
|
| 284 |
# 'PolyAI/pheme': '/predict#0', #sleepy HF Space
|
| 285 |
# 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
|
|
@@ -411,6 +447,31 @@ OVERRIDE_INPUTS = {
|
|
| 411 |
3: 8, # lngsteps
|
| 412 |
},
|
| 413 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
}
|
| 415 |
|
| 416 |
hf_clients: Tuple[Client] = {}
|
|
|
|
| 98 |
# 'parler-tts/parler-tts-expresso': 'parler-tts/parler-tts-expresso', # 4.29 4.32 4.36.1 4.42.0 # overly jolly
|
| 99 |
|
| 100 |
# # Microsoft Edge TTS
|
| 101 |
+
'innoai/Edge-TTS-Text-to-Speech': 'innoai/Edge-TTS-Text-to-Speech', # 4.29
|
| 102 |
|
| 103 |
# IMS-Toucan
|
| 104 |
+
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1 # randomly changes pitch
|
| 105 |
|
| 106 |
# IMS-Toucan English non-artificial
|
| 107 |
'Flux9665/EnglishToucan': 'Flux9665/EnglishToucan', # 5.1
|
| 108 |
|
| 109 |
# StyleTTS v2
|
| 110 |
+
# 'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2',
|
| 111 |
+
# StyleTTS kokoro
|
| 112 |
+
'hexgrad/kokoro': 'hexgrad/kokoro',
|
| 113 |
+
|
| 114 |
+
# MaskGCT (by Amphion)
|
| 115 |
+
# DEMANDS 300 seconds of ZeroGPU
|
| 116 |
+
# 'amphion/maskgct': 'amphion/maskgct',
|
| 117 |
+
# default ZeroGPU borrow time
|
| 118 |
+
# 'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab',
|
| 119 |
|
| 120 |
# HF TTS w issues
|
| 121 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
|
|
|
| 284 |
'function': '/synthesize',
|
| 285 |
'text_param_index': 0,
|
| 286 |
'return_audio_index': 0,
|
| 287 |
+
# 'is_zero_gpu_space': True,
|
| 288 |
+
'series': 'StyleTTS',
|
| 289 |
+
},
|
| 290 |
+
|
| 291 |
+
# StyleTTS v2 kokoro fine tune
|
| 292 |
+
'hexgrad/kokoro': {
|
| 293 |
+
'name': 'StyleTTS kokoro',
|
| 294 |
+
'function': '/generate',
|
| 295 |
+
'text_param_index': 0,
|
| 296 |
+
'return_audio_index': 0,
|
| 297 |
'is_zero_gpu_space': True,
|
| 298 |
'series': 'StyleTTS',
|
| 299 |
},
|
| 300 |
|
| 301 |
+
# MaskGCT (by Amphion)
|
| 302 |
+
'amphion/maskgct': {
|
| 303 |
+
'name': 'MaskGCT',
|
| 304 |
+
'function': '/predict',
|
| 305 |
+
'text_param_index': 1,
|
| 306 |
+
'return_audio_index': 0,
|
| 307 |
+
'is_zero_gpu_space': True,
|
| 308 |
+
'series': 'MaskGCT',
|
| 309 |
+
},
|
| 310 |
+
'Svngoku/maskgct-audio-lab': {
|
| 311 |
+
'name': 'MaskGCT',
|
| 312 |
+
'function': '/predict',
|
| 313 |
+
'text_param_index': 1,
|
| 314 |
+
'return_audio_index': 0,
|
| 315 |
+
'is_zero_gpu_space': True,
|
| 316 |
+
'series': 'MaskGCT',
|
| 317 |
+
},
|
| 318 |
+
|
| 319 |
# TTS w issues
|
| 320 |
# 'PolyAI/pheme': '/predict#0', #sleepy HF Space
|
| 321 |
# 'amphion/Text-to-Speech': '/predict#0', #takes a whole minute to synthesize
|
|
|
|
| 447 |
3: 8, # lngsteps
|
| 448 |
},
|
| 449 |
|
| 450 |
+
# StyleTTS 2 kokoro
|
| 451 |
+
'hexgrad/kokoro': {
|
| 452 |
+
1: "af_0", #voice
|
| 453 |
+
2: None, #ps
|
| 454 |
+
3: 1, #speed
|
| 455 |
+
4: 0.5, #reduce_noise
|
| 456 |
+
5: 4000, #opening_cut
|
| 457 |
+
6: 2000, #closing_cut
|
| 458 |
+
7: 3000, #ease_in
|
| 459 |
+
8: 1000, #ease_out
|
| 460 |
+
9: 5000, #pad_before
|
| 461 |
+
10: 5000, #pad_after
|
| 462 |
+
},
|
| 463 |
+
|
| 464 |
+
# maskGCT (by amphion)
|
| 465 |
+
'amphion/maskgct': {
|
| 466 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
| 467 |
+
2: -1, #target_len
|
| 468 |
+
3: 25, #n_timesteps
|
| 469 |
+
},
|
| 470 |
+
'Svngoku/maskgct-audio-lab': {
|
| 471 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
| 472 |
+
2: -1, #target_len
|
| 473 |
+
3: 25, #n_timesteps
|
| 474 |
+
},
|
| 475 |
}
|
| 476 |
|
| 477 |
hf_clients: Tuple[Client] = {}
|