Spaces:

Genius-Society
/

hoyoTTS

Running

App Files Files

admin committed on Apr 22

Commit

a71a582

1 Parent(s): be79c50

sync ms

Browse files

Files changed (4) hide show

app.py +51 -82
requirements.txt +4 -4
text/chinese_bert.py +7 -1
utils.py +37 -1

app.py CHANGED Viewed

@@ -1,19 +1,18 @@
-import re
-import os
-import sys
 import utils
 import torch
-import random
-import commons
-import numpy as np
-import gradio as gr
-from tqdm import tqdm
-from models import SynthesizerTrn
-from huggingface_hub import snapshot_download
-from text import cleaned_text_to_sequence, get_bert
-from text.cleaner import clean_text
-from text.symbols import symbols
 if sys.platform == "darwin":
     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
@@ -25,7 +24,8 @@ logging.getLogger("markdown_it").setLevel(logging.WARNING)
 logging.getLogger("urllib3").setLevel(logging.WARNING)
 logging.getLogger("matplotlib").setLevel(logging.WARNING)
 logging.basicConfig(
-    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
 )
 logger = logging.getLogger(__name__)
@@ -102,34 +102,39 @@ def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
 def text_splitter(text: str):
     # Split `text` into trimmed, non-empty pieces at punctuation and whitespace.
     # Every character inside the brackets is a delimiter, including the literal
     # commas that separate the entries, so an ASCII "," also splits the text.
     punctuation = r"[。,；,！,？,〜,\n,\r,\t,.,!,;,?,~, ]"
     sentences = re.split(punctuation, text.strip())
     # Adjacent delimiters produce empty strings; keep only pieces with content.
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 def concatenate_audios(audio_samples, sample_rate=44100):
     # Join the samples in order with a block of zeros between consecutive
     # samples and return the tuple (sample_rate, joined_audio).
     # The silence block holds int(sample_rate / 2) zeros, i.e. half a second
     # at `sample_rate` samples per second.
     half_second_silence = np.zeros(int(sample_rate / 2))
     # NOTE(review): indexing [0] raises IndexError when audio_samples is empty;
     # presumably callers always pass at least one sample — confirm.
     final_audio = audio_samples[0]
     for sample in audio_samples[1:]:
         final_audio = np.concatenate((final_audio, half_second_silence, sample))
-    print("Audio pieces concatenated!")
     return (sample_rate, final_audio)
 def read_text(file_path: str):
     # Read the file at `file_path` as UTF-8 text and return its full content.
     # Every handler below only prints a message, so on any failure the
     # function falls through and implicitly returns None.
     try:
         with open(file_path, "r", encoding="utf-8") as file:
             content = file.read()
             return content
     except FileNotFoundError:
-        print(f"File Not Found: {file_path}")
     except IOError:
-        print(f"An error occurred reading the file: {file_path}")
     except Exception as e:
-        print(f"An unknown error has occurred: {e}")
 def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
@@ -137,7 +142,7 @@ def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scal
         content = read_text(text)
         sentences = text_splitter(content)
         audios = []
-        for sentence in tqdm(sentences, desc="TTS inferring..."):
             with torch.no_grad():
                 audios.append(
                     infer(
@@ -160,7 +165,7 @@ def infer_tab2(content, speaker, sdp_ratio, noise_scale, noise_scale_w, length_s
     try:
         sentences = text_splitter(content)
         audios = []
-        for sentence in tqdm(sentences, desc="TTS inferring..."):
             with torch.no_grad():
                 audios.append(
                     infer(
@@ -181,12 +186,11 @@ def infer_tab2(content, speaker, sdp_ratio, noise_scale, noise_scale_w, length_s
 if __name__ == "__main__":
-    model_dir = snapshot_download("Genius-Society/hoyoTTS", cache_dir="./__pycache__")
     if debug:
         logger.info("Enable DEBUG-LEVEL log")
         logging.basicConfig(level=logging.DEBUG)
-    hps = utils.get_hparams_from_dir(model_dir)
     device = (
         "cuda:0"
         if torch.cuda.is_available()
@@ -204,105 +208,70 @@ if __name__ == "__main__":
         **hps.model,
     ).to(device)
     net_g.eval()
-    utils.load_checkpoint(f"{model_dir}/G_78000.pth", net_g, None, skip_optimizer=True)
     speaker_ids = hps.data.spk2id
     speakers = list(speaker_ids.keys())
     random.shuffle(speakers)
     with gr.Blocks() as app:
         gr.Markdown(
             """
-Welcome to the Space, which is based on the open source project <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a>, and moved to the bottom for an explanation of the principle. This Space must be used in accordance with local laws and regulations, prohibiting the use of it for any criminal activities."""
         )
-        with gr.Tab("Input Mode"):
             gr.Interface(
-                fn=infer_tab2,
                 inputs=[
                     gr.TextArea(
-                        label="Please input the Simplified Chinese text",
-                        placeholder="The first inference takes time to download the model, so be patient.",
                         show_copy_button=True,
                     ),
-                    gr.Dropdown(choices=speakers, value="莱依拉", label="Role"),
                     gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        value=0.2,
-                        step=0.1,
-                        label="Modulation of intonation",
-                    ),  # SDP/DP Mix Ratio
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.6,
-                        step=0.1,
-                        label="Emotional adjustment",
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.8,
-                        step=0.1,
-                        label="Phoneme length",
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=1,
-                        step=0.1,
-                        label="Output duration",
                     ),
                 ],
-                outputs=gr.Audio(label="Output Audio", show_share_button=False),
                 flagging_mode="never",
                 concurrency_limit=4,
             )
-        with gr.Tab("Upload Mode"):
             gr.Interface(
-                fn=infer_tab1,  # Use text_to_speech func
                 inputs=[
                     gr.components.File(
-                        label="Please upload a simplified Chinese TXT",
                         type="filepath",
                         file_types=[".txt"],
                     ),
-                    gr.Dropdown(choices=speakers, value="莱依拉", label="Role"),
                     gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        value=0.2,
-                        step=0.1,
-                        label="Modulation of intonation",
-                    ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.6,
-                        step=0.1,
-                        label="Emotional adjustment",
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.8,
-                        step=0.1,
-                        label="Phoneme length",
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=1,
-                        step=0.1,
-                        label="Output duration",
                     ),
                 ],
                 outputs=[
-                    gr.Audio(label="Output Audio", show_share_button=False),
-                    gr.TextArea(
-                        label="Result of TXT extraction",
-                        show_copy_button=True,
-                    ),
                 ],
                 flagging_mode="never",
                 concurrency_limit=4,

+from text.symbols import symbols
+from text.cleaner import clean_text
+from text import cleaned_text_to_sequence, get_bert
+from models import SynthesizerTrn
+from tqdm import tqdm
+from utils import _L, MODEL_DIR
+import gradio as gr
+import numpy as np
+import commons
+import random
 import utils
 import torch
+import sys
+import re
+import os
 if sys.platform == "darwin":
     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 logging.getLogger("urllib3").setLevel(logging.WARNING)
 logging.getLogger("matplotlib").setLevel(logging.WARNING)
 logging.basicConfig(
+    level=logging.INFO,
+    format="| %(name)s | %(levelname)s | %(message)s",
 )
 logger = logging.getLogger(__name__)
 def text_splitter(text: str):
     # Split `text` into trimmed, non-empty pieces at punctuation and whitespace.
     # Every character inside the brackets is a delimiter, including the literal
     # commas that separate the entries, so an ASCII "," also splits the text.
     punctuation = r"[。,；,！,？,〜,\n,\r,\t,.,!,;,?,~, ]"
+    # Split the text at each delimiter character using the regular expression.
     sentences = re.split(punctuation, text.strip())
+    # Filter out the empty strings left by adjacent delimiters.
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 def concatenate_audios(audio_samples, sample_rate=44100):
     # Join the samples in order with a block of zeros between consecutive
     # samples and return the tuple (sample_rate, joined_audio).
     # The silence block holds int(sample_rate / 2) zeros, i.e. half a second
     # at `sample_rate` samples per second.
     half_second_silence = np.zeros(int(sample_rate / 2))
+    # Start the result with the first sample (IndexError if the list is empty).
     final_audio = audio_samples[0]
+    # Append each remaining sample, preceded by the half-second silence.
     for sample in audio_samples[1:]:
         final_audio = np.concatenate((final_audio, half_second_silence, sample))
+    print("音频片段连接完成！")
     return (sample_rate, final_audio)
 def read_text(file_path: str):
     # Read the file at `file_path` as UTF-8 text and return its full content.
     # Every handler below only prints a message, so on any failure the
     # function falls through and implicitly returns None.
     try:
+        # Open the file and read its whole content.
         with open(file_path, "r", encoding="utf-8") as file:
             content = file.read()
             return content
     except FileNotFoundError:
+        print(f"文件未找到: {file_path}")
     except IOError:
+        print(f"读取文件时发生错误: {file_path}")
     except Exception as e:
+        print(f"发生未知错误: {e}")
 def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
         content = read_text(text)
         sentences = text_splitter(content)
         audios = []
+        for sentence in tqdm(sentences, desc="TTS 推理中..."):
             with torch.no_grad():
                 audios.append(
                     infer(
     try:
         sentences = text_splitter(content)
         audios = []
+        for sentence in tqdm(sentences, desc="TTS 推理中..."):
             with torch.no_grad():
                 audios.append(
                     infer(
 if __name__ == "__main__":
     if debug:
         logger.info("Enable DEBUG-LEVEL log")
         logging.basicConfig(level=logging.DEBUG)
+    hps = utils.get_hparams_from_dir(MODEL_DIR)
     device = (
         "cuda:0"
         if torch.cuda.is_available()
         **hps.model,
     ).to(device)
     net_g.eval()
+    utils.load_checkpoint(f"{MODEL_DIR}/G_78000.pth", net_g, None, skip_optimizer=True)
     speaker_ids = hps.data.spk2id
     speakers = list(speaker_ids.keys())
     random.shuffle(speakers)
     with gr.Blocks() as app:
         gr.Markdown(
             """
+欢迎使用此创空间，此创空间基于 <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a> 开源项目制作，移至最底端有原理浅讲。使用此创空间必须遵守当地相关法律法规，禁止用其从事任何违法犯罪活动。"""
         )
+        with gr.Tab("输入模式"):
             gr.Interface(
+                fn=infer_tab2,  # 使用 text_to_speech 函数
                 inputs=[
                     gr.TextArea(
+                        label="请输入简体中文文案",
+                        placeholder="首次推理需耗时下载模型，还请耐心等待。",
                         show_copy_button=True,
                     ),
+                    gr.Dropdown(choices=speakers, value="莱依拉", label="角色"),
                     gr.Slider(
+                        minimum=0, maximum=1, value=0.2, step=0.1, label="语调调节"
+                    ),  # SDP/DP混合比
                     gr.Slider(
+                        minimum=0.1, maximum=2, value=0.6, step=0.1, label="感情调节"
                     ),
                     gr.Slider(
+                        minimum=0.1, maximum=2, value=0.8, step=0.1, label="音素长度"
                     ),
                     gr.Slider(
+                        minimum=0.1, maximum=2, value=1, step=0.1, label="生成时长"
                     ),
                 ],
+                outputs=gr.Audio(label="输出音频"),
                 flagging_mode="never",
                 concurrency_limit=4,
             )
+        with gr.Tab("上传模式"):
             gr.Interface(
+                fn=infer_tab1,  # 使用 text_to_speech 函数
                 inputs=[
                     gr.components.File(
+                        label="请上传简体中文 TXT 文案",
                         type="filepath",
                         file_types=[".txt"],
                     ),
+                    gr.Dropdown(choices=speakers, value="莱依拉", label="角色"),
                     gr.Slider(
+                        minimum=0, maximum=1, value=0.2, step=0.1, label="语调调节"
+                    ),  # SDP/DP混合比
                     gr.Slider(
+                        minimum=0.1, maximum=2, value=0.6, step=0.1, label="感情调节"
                     ),
                     gr.Slider(
+                        minimum=0.1, maximum=2, value=0.8, step=0.1, label="音素长度"
                     ),
                     gr.Slider(
+                        minimum=0.1, maximum=2, value=1, step=0.1, label="生成时长"
                     ),
                 ],
                 outputs=[
+                    gr.Audio(label="输出音频"),
+                    gr.TextArea(label="文案提取结果", show_copy_button=True),
                 ],
                 flagging_mode="never",
                 concurrency_limit=4,

requirements.txt CHANGED Viewed

@@ -1,9 +1,11 @@
 av
 cn2an
 jieba
 numba
 scipy
-gradio
 pypinyin
 Unidecode
 matplotlib
@@ -11,6 +13,4 @@ phonemizer
 tensorboard
 amfm_decompy
 transformers
-torch==2.3.1
-numpy==1.26.4
-librosa==0.9.1

+torch==2.6.0+cu118
+-f https://download.pytorch.org/whl/torch
 av
 cn2an
 jieba
 numba
 scipy
+librosa
 pypinyin
 Unidecode
 matplotlib
 tensorboard
 amfm_decompy
 transformers
+numpy==1.26.4

text/chinese_bert.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import sys
 import torch
 from transformers import AutoTokenizer, AutoModelForMaskedLM
 device = torch.device(
     "cuda"
@@ -13,7 +15,11 @@ device = torch.device(
 )
 # 模型下载
-model_dir = "hfl/chinese-roberta-wwm-ext-large"
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 model = AutoModelForMaskedLM.from_pretrained(model_dir).to(device)

 import sys
 import torch
+from modelscope import snapshot_download
 from transformers import AutoTokenizer, AutoModelForMaskedLM
+from utils import EN_US
 device = torch.device(
     "cuda"
 )
 # 模型下载
+model_dir = (
+    "hfl/chinese-roberta-wwm-ext-large"
+    if EN_US
+    else snapshot_download("dienstag/chinese-roberta-wwm-ext-large")
+)
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 model = AutoModelForMaskedLM.from_pretrained(model_dir).to(device)

utils.py CHANGED Viewed

@@ -6,14 +6,50 @@ import logging
 import argparse
 import requests
 import subprocess
 import numpy as np
 from tqdm import tqdm
 from scipy.io.wavfile import read
 MATPLOTLIB_FLAG = False
 logger = logging.getLogger(__name__)
 def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):

 import argparse
 import requests
 import subprocess
+import modelscope
+import huggingface_hub
 import numpy as np
 from tqdm import tqdm
 from scipy.io.wavfile import read
 MATPLOTLIB_FLAG = False
 logger = logging.getLogger(__name__)
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"  # True unless LANG is exactly "zh_CN.UTF-8" (also True when LANG is unset)
 # Maps each Chinese UI string to its English translation; _L() looks keys up here.
+ZH2EN = {
+    "输入模式": "Input Mode",
+    "请输入简体中文文案": "Please input the Simplified Chinese text",
+    "首次推理需耗时下载模型，还请耐心等待。": "The first inference takes time to download the model, so be patient.",
+    "角色": "Role",
+    "状态栏": "Status",
+    "语调调节": "Modulation of intonation",
+    "感情调节": "Emotional adjustment",
+    "音素长度": "Phoneme length",
+    "生成时长": "Output duration",
+    "输出音频": "Output Audio",
+    "上传模式": "Upload Mode",
+    "请上传简体中文 TXT 文案": "Please upload a simplified Chinese TXT",
+    "文案提取结果": "Result of TXT extraction",
+    """
+欢迎使用此创空间，此创空间基于 <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a> 开源项目制作，移至最底端有原理浅讲。使用此创空间必须遵守当地相关法律法规，禁止用其从事任何违法犯罪活动。""": """
+Welcome to the Space, which is based on the open source project <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a>, and moved to the bottom for an explanation of the principle. This Space must be used in accordance with local laws and regulations, prohibiting the use of it for any criminal activities.""",
+}
 # Evaluated at import time: calls huggingface_hub.snapshot_download when EN_US
 # is true, otherwise modelscope.snapshot_download, with the same repo id and
 # cache_dir. NOTE(review): importing this module therefore triggers the call.
 # presumably the result is the local snapshot directory — app.py builds the
 # checkpoint path from it; confirm against both libraries' return values.
+MODEL_DIR = (
+    huggingface_hub.snapshot_download(
+        "Genius-Society/hoyoTTS",
+        cache_dir="./__pycache__",
+    )
+    if EN_US
+    else modelscope.snapshot_download(
+        "Genius-Society/hoyoTTS",
+        cache_dir="./__pycache__",
+    )
+)
+def _L(zh_txt: str):
     # Return the English entry for `zh_txt` from ZH2EN when EN_US is true;
     # otherwise return `zh_txt` unchanged.
     # A string missing from ZH2EN raises KeyError in the EN_US case.
+    return ZH2EN[zh_txt] if EN_US else zh_txt
 def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):