Spaces:
Running
Running
New TTS: Zonos both archs; disabled xVASynth v3
Browse files
- app/models.py +57 -2
- test_tts_zonos.py +52 -0
app/models.py
CHANGED
|
@@ -31,7 +31,7 @@ AVAILABLE_MODELS = {
|
|
| 31 |
#'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # extra_headers error appears for 5.13+
|
| 32 |
# 'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29 4.32
|
| 33 |
'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
|
| 34 |
-
'Pendrokar/xVASynth-TTS/NoDeepMoji': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
|
| 35 |
# 'coqui/CoquiTTS': 'coqui/CoquiTTS',
|
| 36 |
'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29 4.32
|
| 37 |
# 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # Queue ERROR
|
|
@@ -90,6 +90,10 @@ AVAILABLE_MODELS = {
|
|
| 90 |
# Mars6
|
| 91 |
'CAMB-AI/mars6-turbo-demo': 'CAMB-AI/mars6-turbo-demo',
|
| 92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
# HF TTS w issues
|
| 94 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
| 95 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
@@ -417,6 +421,24 @@ HF_SPACES = {
|
|
| 417 |
'is_closed_source': True,
|
| 418 |
'series': 'MARS',
|
| 419 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
}
|
| 421 |
|
| 422 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
@@ -658,8 +680,41 @@ OVERRIDE_INPUTS = {
|
|
| 658 |
'quality_prefix': "48000",
|
| 659 |
'clone_method': "deep-clone",
|
| 660 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
}
|
| 662 |
|
|
|
|
|
|
|
|
|
|
| 663 |
|
| 664 |
# Model name mapping, can include models that users cannot vote on
|
| 665 |
model_names = {
|
|
@@ -719,7 +774,7 @@ closed_source = [
|
|
| 719 |
]
|
| 720 |
|
| 721 |
# top five models in order to always have one of them picked and scrutinized
|
| 722 |
-
top_five = []
|
| 723 |
|
| 724 |
# prioritize low vote models
|
| 725 |
sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
|
|
|
|
| 31 |
#'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # extra_headers error appears for 5.13+
|
| 32 |
# 'mrfakename/MetaVoice-1B-v0.1': 'mrfakename/MetaVoice-1B-v0.1', # 4.29 4.32
|
| 33 |
'Pendrokar/xVASynth-TTS': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
|
| 34 |
+
# 'Pendrokar/xVASynth-TTS/NoDeepMoji': 'Pendrokar/xVASynth-TTS', # 4.29 4.32 4.42.0
|
| 35 |
# 'coqui/CoquiTTS': 'coqui/CoquiTTS',
|
| 36 |
'mrfakename/MeloTTS': 'mrfakename/MeloTTS', # 4.29 4.32
|
| 37 |
# 'fishaudio/fish-speech-1': 'fishaudio/fish-speech-1', # Queue ERROR
|
|
|
|
| 90 |
# Mars6
|
| 91 |
'CAMB-AI/mars6-turbo-demo': 'CAMB-AI/mars6-turbo-demo',
|
| 92 |
|
| 93 |
+
# Zonos
|
| 94 |
+
'Steveeeeeeen/Zonos': 'Steveeeeeeen/Zonos',
|
| 95 |
+
'Steveeeeeeen/Zonos/hybrid': 'Steveeeeeeen/Zonos',
|
| 96 |
+
|
| 97 |
# HF TTS w issues
|
| 98 |
# 'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
| 99 |
# 'PolyAI/pheme': '/predict#0', # sleepy HF Space
|
|
|
|
| 421 |
'is_closed_source': True,
|
| 422 |
'series': 'MARS',
|
| 423 |
},
|
| 424 |
+
|
| 425 |
+
# Zonos
|
| 426 |
+
'Steveeeeeeen/Zonos': {
|
| 427 |
+
'name': 'Zonos T',
|
| 428 |
+
'function': '/generate_audio',
|
| 429 |
+
'text_param_index': 'text',
|
| 430 |
+
'return_audio_index': 0,
|
| 431 |
+
'is_zero_gpu_space': True,
|
| 432 |
+
'series': 'Zonos',
|
| 433 |
+
},
|
| 434 |
+
'Steveeeeeeen/Zonos/hybrid': {
|
| 435 |
+
'name': 'Zonos H',
|
| 436 |
+
'function': '/generate_audio',
|
| 437 |
+
'text_param_index': 'text',
|
| 438 |
+
'return_audio_index': 0,
|
| 439 |
+
'is_zero_gpu_space': True,
|
| 440 |
+
'series': 'Zonos',
|
| 441 |
+
},
|
| 442 |
}
|
| 443 |
|
| 444 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
|
|
| 680 |
'quality_prefix': "48000",
|
| 681 |
'clone_method': "deep-clone",
|
| 682 |
},
|
| 683 |
+
|
| 684 |
+
# Zonos
|
| 685 |
+
'Steveeeeeeen/Zonos': {
|
| 686 |
+
'model_choice':"Zyphra/Zonos-v0.1-transformer",
|
| 687 |
+
'language': "en-us",
|
| 688 |
+
'speaker_audio': None, # optional
|
| 689 |
+
'prefix_audio': handle_file('https://huggingface.co/spaces/Steveeeeeeen/Zonos/resolve/main/assets/silence_100ms.wav'),
|
| 690 |
+
# 'e1': 1,
|
| 691 |
+
# 'e2': 0.05,
|
| 692 |
+
# 'e3': 0.05,
|
| 693 |
+
# 'e4': 0.05,
|
| 694 |
+
# 'e5': 0.05,
|
| 695 |
+
# 'e6': 0.05,
|
| 696 |
+
# 'e7': 0.1,
|
| 697 |
+
# 'e8': 0.2,
|
| 698 |
+
'vq_single': 0.78,
|
| 699 |
+
'fmax': 24000,
|
| 700 |
+
'pitch_std': 45,
|
| 701 |
+
'speaking_rate': 15,
|
| 702 |
+
'dnsmos_ovrl': 4,
|
| 703 |
+
'speaker_noised': False,
|
| 704 |
+
'cfg_scale': 2,
|
| 705 |
+
'min_p': 0.15,
|
| 706 |
+
'seed': 420,
|
| 707 |
+
'randomize_seed': False, # Set to False to easily recreate the state
|
| 708 |
+
'unconditional_keys': ["emotion"], # makes it ignore e1-e8
|
| 709 |
+
},
|
| 710 |
+
# 'Steveeeeeeen/Zonos/hybrid': {
|
| 711 |
+
# 'model_choice': 'Zyphra/Zonos-v0.1-hybrid',
|
| 712 |
+
# },
|
| 713 |
}
|
| 714 |
|
| 715 |
+
# minor mods to model from the same space
|
| 716 |
+
OVERRIDE_INPUTS['Steveeeeeeen/Zonos/hybrid'] = dict(OVERRIDE_INPUTS['Steveeeeeeen/Zonos'])  # shallow copy: a plain assignment would alias the transformer entry, so changing 'model_choice' for hybrid would change it for both
|
| 717 |
+
OVERRIDE_INPUTS['Steveeeeeeen/Zonos/hybrid']['model_choice'] = 'Zyphra/Zonos-v0.1-hybrid'
|
| 718 |
|
| 719 |
# Model name mapping, can include models that users cannot vote on
|
| 720 |
model_names = {
|
|
|
|
| 774 |
]
|
| 775 |
|
| 776 |
# top five models in order to always have one of them picked and scrutinized
|
| 777 |
+
top_five = ['Steveeeeeeen/Zonos', 'Steveeeeeeen/Zonos/hybrid']
|
| 778 |
|
| 779 |
# prioritize low vote models
|
| 780 |
sql = 'SELECT name FROM model WHERE (upvote + downvote) < 750 ORDER BY (upvote + downvote) ASC'
|
test_tts_zonos.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from test_overrides import _get_param_examples, _override_params
|
| 3 |
+
from gradio_client import Client, file
|
| 4 |
+
|
| 5 |
+
model = "Steveeeeeeen/Zonos/hybrid"
|
| 6 |
+
# client = Client("Pendrokar/Zonos", hf_token=os.getenv('HF_TOKEN'))
|
| 7 |
+
client = Client("Steveeeeeeen/Zonos", hf_token=os.getenv('HF_TOKEN'))
|
| 8 |
+
# client = Client(model, hf_token=os.getenv('HF_TOKEN'))
|
| 9 |
+
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
| 10 |
+
# print(endpoints)
|
| 11 |
+
|
| 12 |
+
api_name = '/generate_audio'
|
| 13 |
+
fn_index = None
|
| 14 |
+
end_parameters = None
|
| 15 |
+
text = 'This is what my voice sounds like.'
|
| 16 |
+
|
| 17 |
+
end_parameters = _get_param_examples(
|
| 18 |
+
endpoints['named_endpoints'][api_name]['parameters']
|
| 19 |
+
)
|
| 20 |
+
print(end_parameters)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
space_inputs = end_parameters
|
| 24 |
+
# override some or all default parameters
|
| 25 |
+
space_inputs = _override_params(end_parameters, model)
|
| 26 |
+
|
| 27 |
+
if(type(space_inputs) == dict):
|
| 28 |
+
space_inputs['text'] = text
|
| 29 |
+
result = client.predict(
|
| 30 |
+
**space_inputs,
|
| 31 |
+
api_name=api_name,
|
| 32 |
+
fn_index=fn_index
|
| 33 |
+
)
|
| 34 |
+
else:
|
| 35 |
+
space_inputs[0] = text
|
| 36 |
+
result = client.predict(
|
| 37 |
+
*space_inputs,
|
| 38 |
+
api_name=api_name,
|
| 39 |
+
fn_index=fn_index
|
| 40 |
+
)
|
| 41 |
+
# space_inputs = {str(i): value for i, value in enumerate(space_inputs)}
|
| 42 |
+
|
| 43 |
+
print(space_inputs)
|
| 44 |
+
# print(*space_inputs)
|
| 45 |
+
# print(**space_inputs)
|
| 46 |
+
|
| 47 |
+
# result = client.predict(
|
| 48 |
+
# **space_inputs,
|
| 49 |
+
# api_name=api_name,
|
| 50 |
+
# fn_index=fn_index
|
| 51 |
+
# )
|
| 52 |
+
print(result)
|