Spaces:
Running
on
Zero
Running
on
Zero
Update inference_webui.py
Browse files- inference_webui.py +39 -31
inference_webui.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
|
| 3 |
os.makedirs("pretrained_models", exist_ok=True)
|
| 4 |
from huggingface_hub import snapshot_download
|
|
@@ -84,15 +85,22 @@ from module.mel_processing import spectrogram_torch
|
|
| 84 |
from module.models import SynthesizerTrn
|
| 85 |
from text import cleaned_text_to_sequence
|
| 86 |
from text.cleaner import clean_text
|
| 87 |
-
from tools.i18n.i18n import I18nAuto, scan_language_list
|
| 88 |
from tools.my_utils import load_audio
|
| 89 |
|
| 90 |
-
language=os.environ.get("language","Auto")
|
| 91 |
-
language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
| 92 |
-
i18n = I18nAuto(language="Auto")
|
| 93 |
|
| 94 |
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
if torch.cuda.is_available():
|
| 98 |
device = "cuda"
|
|
@@ -102,25 +110,25 @@ else:
|
|
| 102 |
is_half = False
|
| 103 |
|
| 104 |
dict_language_v1 = {
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
}
|
| 112 |
dict_language_v2 = {
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
}
|
| 125 |
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
| 126 |
|
|
@@ -211,7 +219,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
|
| 211 |
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
|
| 212 |
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
| 213 |
if prompt_language is not None and text_language is not None:
|
| 214 |
-
if prompt_language in
|
| 215 |
prompt_text_update, prompt_language_update = (
|
| 216 |
{"__type__": "update"},
|
| 217 |
{"__type__": "update", "value": prompt_language},
|
|
@@ -219,7 +227,7 @@ def change_sovits_weights(sovits_path, prompt_language=None, text_language=None)
|
|
| 219 |
else:
|
| 220 |
prompt_text_update = {"__type__": "update", "value": ""}
|
| 221 |
prompt_language_update = {"__type__": "update", "value": i18n("中文")}
|
| 222 |
-
if text_language in
|
| 223 |
text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language}
|
| 224 |
else:
|
| 225 |
text_update = {"__type__": "update", "value": ""}
|
|
@@ -468,12 +476,12 @@ def get_tts_wav(
|
|
| 468 |
prompt_text = prompt_text.strip("\n")
|
| 469 |
if prompt_text[-1] not in splits:
|
| 470 |
prompt_text += "。" if prompt_language != "en" else "."
|
| 471 |
-
print(i18n("实际输入的参考文本:"), prompt_text)
|
| 472 |
text = text.strip("\n")
|
| 473 |
if text[0] not in splits and len(get_first(text)) < 4:
|
| 474 |
text = "。" + text if text_language != "en" else "." + text
|
| 475 |
|
| 476 |
-
print(i18n("实际输入的目标文本:"), text)
|
| 477 |
zero_wav = np.zeros(
|
| 478 |
int(hps.data.sampling_rate * 0.3),
|
| 479 |
dtype=np.float16 if is_half == True else np.float32,
|
|
@@ -517,7 +525,7 @@ def get_tts_wav(
|
|
| 517 |
text = cut5(text)
|
| 518 |
while "\n\n" in text:
|
| 519 |
text = text.replace("\n\n", "\n")
|
| 520 |
-
print(i18n("实际输入的目标文本(切句后):"), text)
|
| 521 |
texts = text.split("\n")
|
| 522 |
texts = process_text(texts)
|
| 523 |
texts = merge_short_text_in_array(texts, 5)
|
|
@@ -533,9 +541,9 @@ def get_tts_wav(
|
|
| 533 |
continue
|
| 534 |
if text[-1] not in splits:
|
| 535 |
text += "。" if text_language != "en" else "."
|
| 536 |
-
print(i18n("实际输入的目标文本(每句):"), text)
|
| 537 |
phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version)
|
| 538 |
-
print(i18n("前端处理后的文本(每句):"), norm_text2)
|
| 539 |
if not ref_free:
|
| 540 |
bert = torch.cat([bert1, bert2], 1)
|
| 541 |
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
|
|
@@ -892,9 +900,9 @@ if __name__ == "__main__":
|
|
| 892 |
gen = get_tts_wav(
|
| 893 |
ref_wav_path=file_name,
|
| 894 |
prompt_text="",
|
| 895 |
-
prompt_language=
|
| 896 |
text="犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之.你好世界 Love you 世界へ 안녕하세요",
|
| 897 |
-
text_language=
|
| 898 |
inp_refs=[],
|
| 899 |
)
|
| 900 |
next(gen)
|
|
@@ -903,5 +911,5 @@ if __name__ == "__main__":
|
|
| 903 |
server_name="0.0.0.0",
|
| 904 |
inbrowser=True,
|
| 905 |
show_api=False,
|
| 906 |
-
allowed_paths=["/"]
|
| 907 |
)
|
|
|
|
| 1 |
import os
|
| 2 |
+
# Runs pip as a shell command when this module is loaded, installing
# gradio-client 1.10.4 and the wheel file gradio-5.35.0-py3-none-any.whl
# (given as a relative path). The command's exit status is not checked.
os.system("pip install gradio-client==1.10.4 gradio-5.35.0-py3-none-any.whl")
|
| 3 |
|
| 4 |
os.makedirs("pretrained_models", exist_ok=True)
|
| 5 |
from huggingface_hub import snapshot_download
|
|
|
|
| 85 |
from module.models import SynthesizerTrn
|
| 86 |
from text import cleaned_text_to_sequence
|
| 87 |
from text.cleaner import clean_text
|
| 88 |
+
# from tools.i18n.i18n import I18nAuto, scan_language_list
|
| 89 |
from tools.my_utils import load_audio
|
| 90 |
|
| 91 |
+
# language=os.environ.get("language","Auto")
|
| 92 |
+
# language=sys.argv[-1] if sys.argv[-1] in scan_language_list() else language
|
| 93 |
+
# i18n = I18nAuto(language="Auto")
|
| 94 |
|
| 95 |
# os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1' # 确保直接启动推理UI时也能够设置。
|
| 96 |
|
| 97 |
+
# Build the translation table handed to Gradio.
# Every "<name>.json" file in tools/i18n/locale is parsed and registered under
# its file name without the ".json" suffix and with "_" replaced by "-"
# (so a file named like "zh_CN.json" would be registered as "zh-CN").
i18n_dict = {}
json_root = "tools/i18n/locale"
for name in os.listdir(json_root):
    # Skip anything that is not a locale JSON file so a stray file in the
    # directory cannot abort start-up with a JSON decode error.
    if not name.endswith(".json"):
        continue
    # Read as UTF-8 explicitly: the lookups in this file use Chinese keys, so
    # the platform default encoding must not be relied upon.
    with open(os.path.join(json_root, name), "r", encoding="utf-8") as f:
        data = json.load(f)
    i18n_dict[name.split(".json")[0].replace("_", "-")] = data
# Locale names may contain "-", so they are passed via ** rather than as
# literal keyword arguments.
i18n = gr.I18n(**i18n_dict)
|
| 104 |
|
| 105 |
if torch.cuda.is_available():
|
| 106 |
device = "cuda"
|
|
|
|
| 110 |
is_half = False
|
| 111 |
|
| 112 |
# Language display name -> language code table; selected as dict_language when version == "v1".
dict_language_v1 = {
|
| 113 |
+
"中文": "all_zh", # Treat all text as Chinese
|
| 114 |
+
"英文": "en", # Treat all text as English (unchanged)
|
| 115 |
+
"日文": "all_ja", # Treat all text as Japanese
|
| 116 |
+
"中英混合": "zh", # Treat text as mixed Chinese/English (unchanged)
|
| 117 |
+
"日英混合": "ja", # Treat text as mixed Japanese/English (unchanged)
|
| 118 |
+
"多语种混合": "auto", # Multilingual: split the text and detect each segment's language
|
| 119 |
}
|
| 120 |
# Language display name -> language code table; selected as dict_language when version is not "v1".
dict_language_v2 = {
|
| 121 |
+
"中文": "all_zh", # Treat all text as Chinese
|
| 122 |
+
"英文": "en", # Treat all text as English (unchanged)
|
| 123 |
+
"日文": "all_ja", # Treat all text as Japanese
|
| 124 |
+
"粤语": "all_yue", # Treat all text as Cantonese
|
| 125 |
+
"韩文": "all_ko", # Treat all text as Korean
|
| 126 |
+
"中英混合": "zh", # Treat text as mixed Chinese/English (unchanged)
|
| 127 |
+
"日英混合": "ja", # Treat text as mixed Japanese/English (unchanged)
|
| 128 |
+
"粤英混合": "yue", # Treat text as mixed Cantonese/English (unchanged)
|
| 129 |
+
"韩英混合": "ko", # Treat text as mixed Korean/English (unchanged)
|
| 130 |
+
"多语种混合": "auto", # Multilingual: split the text and detect each segment's language
|
| 131 |
+
"多语种混合(粤语)": "auto_yue", # Multilingual (Cantonese variant): split the text and detect each segment's language
|
| 132 |
}
|
| 133 |
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
| 134 |
|
|
|
|
| 219 |
print(vq_model.load_state_dict(dict_s2["weight"], strict=False))
|
| 220 |
dict_language = dict_language_v1 if version == "v1" else dict_language_v2
|
| 221 |
if prompt_language is not None and text_language is not None:
|
| 222 |
+
if prompt_language in dict_language:
|
| 223 |
prompt_text_update, prompt_language_update = (
|
| 224 |
{"__type__": "update"},
|
| 225 |
{"__type__": "update", "value": prompt_language},
|
|
|
|
| 227 |
else:
|
| 228 |
prompt_text_update = {"__type__": "update", "value": ""}
|
| 229 |
prompt_language_update = {"__type__": "update", "value": i18n("中文")}
|
| 230 |
+
if text_language in dict_language:
|
| 231 |
text_update, text_language_update = {"__type__": "update"}, {"__type__": "update", "value": text_language}
|
| 232 |
else:
|
| 233 |
text_update = {"__type__": "update", "value": ""}
|
|
|
|
| 476 |
prompt_text = prompt_text.strip("\n")
|
| 477 |
if prompt_text[-1] not in splits:
|
| 478 |
prompt_text += "。" if prompt_language != "en" else "."
|
| 479 |
+
print(i18n("实际输入的参考文本:").key, prompt_text)
|
| 480 |
text = text.strip("\n")
|
| 481 |
if text[0] not in splits and len(get_first(text)) < 4:
|
| 482 |
text = "。" + text if text_language != "en" else "." + text
|
| 483 |
|
| 484 |
+
print(i18n("实际输入的目标文本:").key, text)
|
| 485 |
zero_wav = np.zeros(
|
| 486 |
int(hps.data.sampling_rate * 0.3),
|
| 487 |
dtype=np.float16 if is_half == True else np.float32,
|
|
|
|
| 525 |
text = cut5(text)
|
| 526 |
while "\n\n" in text:
|
| 527 |
text = text.replace("\n\n", "\n")
|
| 528 |
+
print(i18n("实际输入的目标文本(切句后):").key, text)
|
| 529 |
texts = text.split("\n")
|
| 530 |
texts = process_text(texts)
|
| 531 |
texts = merge_short_text_in_array(texts, 5)
|
|
|
|
| 541 |
continue
|
| 542 |
if text[-1] not in splits:
|
| 543 |
text += "。" if text_language != "en" else "."
|
| 544 |
+
print(i18n("实际输入的目标文本(每句):").key, text)
|
| 545 |
phones2, bert2, norm_text2 = get_phones_and_bert(text, text_language, version)
|
| 546 |
+
print(i18n("前端处理后的文本(每句):").key, norm_text2)
|
| 547 |
if not ref_free:
|
| 548 |
bert = torch.cat([bert1, bert2], 1)
|
| 549 |
all_phoneme_ids = torch.LongTensor(phones1 + phones2).to(device).unsqueeze(0)
|
|
|
|
| 900 |
gen = get_tts_wav(
|
| 901 |
ref_wav_path=file_name,
|
| 902 |
prompt_text="",
|
| 903 |
+
prompt_language="中文",
|
| 904 |
text="犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之,犯大吴疆土者,盛必击而破之.你好世界 Love you 世界へ 안녕하세요",
|
| 905 |
+
text_language="多语种混合",
|
| 906 |
inp_refs=[],
|
| 907 |
)
|
| 908 |
next(gen)
|
|
|
|
| 911 |
server_name="0.0.0.0",
|
| 912 |
inbrowser=True,
|
| 913 |
show_api=False,
|
| 914 |
+
allowed_paths=["/"],i18n=i18n
|
| 915 |
)
|