Spaces:
Running
Running
admin
committed on
Commit
·
a71a582
1
Parent(s):
be79c50
sync ms
Browse files- app.py +51 -82
- requirements.txt +4 -4
- text/chinese_bert.py +7 -1
- utils.py +37 -1
app.py
CHANGED
@@ -1,19 +1,18 @@
|
|
1 |
-
import
|
2 |
-
import
|
3 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
import utils
|
5 |
import torch
|
6 |
-
import
|
7 |
-
import
|
8 |
-
import
|
9 |
-
import gradio as gr
|
10 |
-
from tqdm import tqdm
|
11 |
-
from models import SynthesizerTrn
|
12 |
-
from huggingface_hub import snapshot_download
|
13 |
-
from text import cleaned_text_to_sequence, get_bert
|
14 |
-
from text.cleaner import clean_text
|
15 |
-
from text.symbols import symbols
|
16 |
-
|
17 |
|
18 |
if sys.platform == "darwin":
|
19 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
@@ -25,7 +24,8 @@ logging.getLogger("markdown_it").setLevel(logging.WARNING)
|
|
25 |
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
26 |
logging.getLogger("matplotlib").setLevel(logging.WARNING)
|
27 |
logging.basicConfig(
|
28 |
-
level=logging.INFO,
|
|
|
29 |
)
|
30 |
|
31 |
logger = logging.getLogger(__name__)
|
@@ -102,34 +102,39 @@ def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
|
|
102 |
|
103 |
def text_splitter(text: str):
|
104 |
punctuation = r"[。,;,!,?,〜,\n,\r,\t,.,!,;,?,~, ]"
|
|
|
105 |
sentences = re.split(punctuation, text.strip())
|
|
|
106 |
return [sentence.strip() for sentence in sentences if sentence.strip()]
|
107 |
|
108 |
|
109 |
def concatenate_audios(audio_samples, sample_rate=44100):
|
110 |
half_second_silence = np.zeros(int(sample_rate / 2))
|
|
|
111 |
final_audio = audio_samples[0]
|
|
|
112 |
for sample in audio_samples[1:]:
|
113 |
final_audio = np.concatenate((final_audio, half_second_silence, sample))
|
114 |
|
115 |
-
print("
|
116 |
return (sample_rate, final_audio)
|
117 |
|
118 |
|
119 |
def read_text(file_path: str):
|
120 |
try:
|
|
|
121 |
with open(file_path, "r", encoding="utf-8") as file:
|
122 |
content = file.read()
|
123 |
return content
|
124 |
|
125 |
except FileNotFoundError:
|
126 |
-
print(f"
|
127 |
|
128 |
except IOError:
|
129 |
-
print(f"
|
130 |
|
131 |
except Exception as e:
|
132 |
-
print(f"
|
133 |
|
134 |
|
135 |
def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
|
@@ -137,7 +142,7 @@ def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scal
|
|
137 |
content = read_text(text)
|
138 |
sentences = text_splitter(content)
|
139 |
audios = []
|
140 |
-
for sentence in tqdm(sentences, desc="TTS
|
141 |
with torch.no_grad():
|
142 |
audios.append(
|
143 |
infer(
|
@@ -160,7 +165,7 @@ def infer_tab2(content, speaker, sdp_ratio, noise_scale, noise_scale_w, length_s
|
|
160 |
try:
|
161 |
sentences = text_splitter(content)
|
162 |
audios = []
|
163 |
-
for sentence in tqdm(sentences, desc="TTS
|
164 |
with torch.no_grad():
|
165 |
audios.append(
|
166 |
infer(
|
@@ -181,12 +186,11 @@ def infer_tab2(content, speaker, sdp_ratio, noise_scale, noise_scale_w, length_s
|
|
181 |
|
182 |
|
183 |
if __name__ == "__main__":
|
184 |
-
model_dir = snapshot_download("Genius-Society/hoyoTTS", cache_dir="./__pycache__")
|
185 |
if debug:
|
186 |
logger.info("Enable DEBUG-LEVEL log")
|
187 |
logging.basicConfig(level=logging.DEBUG)
|
188 |
|
189 |
-
hps = utils.get_hparams_from_dir(
|
190 |
device = (
|
191 |
"cuda:0"
|
192 |
if torch.cuda.is_available()
|
@@ -204,105 +208,70 @@ if __name__ == "__main__":
|
|
204 |
**hps.model,
|
205 |
).to(device)
|
206 |
net_g.eval()
|
207 |
-
utils.load_checkpoint(f"{
|
208 |
speaker_ids = hps.data.spk2id
|
209 |
speakers = list(speaker_ids.keys())
|
210 |
random.shuffle(speakers)
|
211 |
with gr.Blocks() as app:
|
212 |
gr.Markdown(
|
213 |
"""
|
214 |
-
|
215 |
)
|
216 |
|
217 |
-
with gr.Tab("
|
218 |
gr.Interface(
|
219 |
-
fn=infer_tab2,
|
220 |
inputs=[
|
221 |
gr.TextArea(
|
222 |
-
label="
|
223 |
-
placeholder="
|
224 |
show_copy_button=True,
|
225 |
),
|
226 |
-
gr.Dropdown(choices=speakers, value="莱依拉", label="
|
227 |
gr.Slider(
|
228 |
-
minimum=0,
|
229 |
-
|
230 |
-
value=0.2,
|
231 |
-
step=0.1,
|
232 |
-
label="Modulation of intonation",
|
233 |
-
), # SDP/DP Mix Ratio
|
234 |
gr.Slider(
|
235 |
-
minimum=0.1,
|
236 |
-
maximum=2,
|
237 |
-
value=0.6,
|
238 |
-
step=0.1,
|
239 |
-
label="Emotional adjustment",
|
240 |
),
|
241 |
gr.Slider(
|
242 |
-
minimum=0.1,
|
243 |
-
maximum=2,
|
244 |
-
value=0.8,
|
245 |
-
step=0.1,
|
246 |
-
label="Phoneme length",
|
247 |
),
|
248 |
gr.Slider(
|
249 |
-
minimum=0.1,
|
250 |
-
maximum=2,
|
251 |
-
value=1,
|
252 |
-
step=0.1,
|
253 |
-
label="Output duration",
|
254 |
),
|
255 |
],
|
256 |
-
outputs=gr.Audio(label="
|
257 |
flagging_mode="never",
|
258 |
concurrency_limit=4,
|
259 |
)
|
260 |
|
261 |
-
with gr.Tab("
|
262 |
gr.Interface(
|
263 |
-
fn=infer_tab1, #
|
264 |
inputs=[
|
265 |
gr.components.File(
|
266 |
-
label="
|
267 |
type="filepath",
|
268 |
file_types=[".txt"],
|
269 |
),
|
270 |
-
gr.Dropdown(choices=speakers, value="莱依拉", label="
|
271 |
gr.Slider(
|
272 |
-
minimum=0,
|
273 |
-
|
274 |
-
value=0.2,
|
275 |
-
step=0.1,
|
276 |
-
label="Modulation of intonation",
|
277 |
-
),
|
278 |
gr.Slider(
|
279 |
-
minimum=0.1,
|
280 |
-
maximum=2,
|
281 |
-
value=0.6,
|
282 |
-
step=0.1,
|
283 |
-
label="Emotional adjustment",
|
284 |
),
|
285 |
gr.Slider(
|
286 |
-
minimum=0.1,
|
287 |
-
maximum=2,
|
288 |
-
value=0.8,
|
289 |
-
step=0.1,
|
290 |
-
label="Phoneme length",
|
291 |
),
|
292 |
gr.Slider(
|
293 |
-
minimum=0.1,
|
294 |
-
maximum=2,
|
295 |
-
value=1,
|
296 |
-
step=0.1,
|
297 |
-
label="Output duration",
|
298 |
),
|
299 |
],
|
300 |
outputs=[
|
301 |
-
gr.Audio(label="
|
302 |
-
gr.TextArea(
|
303 |
-
label="Result of TXT extraction",
|
304 |
-
show_copy_button=True,
|
305 |
-
),
|
306 |
],
|
307 |
flagging_mode="never",
|
308 |
concurrency_limit=4,
|
|
|
1 |
+
from text.symbols import symbols
|
2 |
+
from text.cleaner import clean_text
|
3 |
+
from text import cleaned_text_to_sequence, get_bert
|
4 |
+
from models import SynthesizerTrn
|
5 |
+
from tqdm import tqdm
|
6 |
+
from utils import _L, MODEL_DIR
|
7 |
+
import gradio as gr
|
8 |
+
import numpy as np
|
9 |
+
import commons
|
10 |
+
import random
|
11 |
import utils
|
12 |
import torch
|
13 |
+
import sys
|
14 |
+
import re
|
15 |
+
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
if sys.platform == "darwin":
|
18 |
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
|
|
24 |
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
25 |
logging.getLogger("matplotlib").setLevel(logging.WARNING)
|
26 |
logging.basicConfig(
|
27 |
+
level=logging.INFO,
|
28 |
+
format="| %(name)s | %(levelname)s | %(message)s",
|
29 |
)
|
30 |
|
31 |
logger = logging.getLogger(__name__)
|
|
|
102 |
|
103 |
def text_splitter(text: str):
    """Split text into trimmed, non-empty fragments at punctuation/whitespace.

    Args:
        text: Raw input text. ``None`` or an empty string is tolerated and
            yields an empty list (``read_text`` returns ``None`` when the
            file could not be read, so this guard avoids an AttributeError).

    Returns:
        A list of fragments in their original order, each stripped of
        surrounding whitespace; empty fragments produced by consecutive
        delimiters are dropped.
    """
    if not text:
        return []

    # Character class of delimiters; the pattern is kept exactly as-is.
    punctuation = r"[。,;,!,?,〜,\n,\r,\t,.,!,;,?,~, ]"
    # Strip each piece once, then discard the ones that end up empty.
    fragments = (piece.strip() for piece in re.split(punctuation, text.strip()))
    return [fragment for fragment in fragments if fragment]
|
109 |
|
110 |
|
111 |
def concatenate_audios(audio_samples, sample_rate=44100):
    """Join audio clips into one array with half a second of silence between.

    Args:
        audio_samples: Non-empty sequence of 1-D sample arrays.
        sample_rate: Samples per second; also determines the length of the
            silence gap (``int(sample_rate / 2)`` zero samples).

    Returns:
        A ``(sample_rate, audio)`` tuple. When only one clip is given it is
        returned unchanged (no copy).

    Raises:
        IndexError: If ``audio_samples`` is empty.
    """
    half_second_silence = np.zeros(int(sample_rate / 2))

    # Indexing first keeps the IndexError on empty input.
    pieces = [audio_samples[0]]
    for clip in audio_samples[1:]:
        pieces.append(half_second_silence)
        pieces.append(clip)

    # One concatenate call instead of one per clip: the accumulated audio is
    # copied once rather than re-copied on every iteration.
    final_audio = pieces[0] if len(pieces) == 1 else np.concatenate(pieces)

    print("音频片段连接完成!")
    return (sample_rate, final_audio)
|
121 |
|
122 |
|
123 |
def read_text(file_path: str):
    """Read a UTF-8 text file and return its full contents.

    Failures are reported on stdout instead of being raised.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file contents as a string, or ``None`` if the file was missing,
        unreadable, or any other error occurred.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as handle:
            return handle.read()

    except FileNotFoundError:
        print(f"文件未找到: {file_path}")

    except OSError:  # IOError is an alias of OSError
        print(f"读取文件时发生错误: {file_path}")

    except Exception as err:
        print(f"发生未知错误: {err}")

    # Every error path ends here; callers must be prepared for None.
    return None
|
138 |
|
139 |
|
140 |
def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
|
|
|
142 |
content = read_text(text)
|
143 |
sentences = text_splitter(content)
|
144 |
audios = []
|
145 |
+
for sentence in tqdm(sentences, desc="TTS 推理中..."):
|
146 |
with torch.no_grad():
|
147 |
audios.append(
|
148 |
infer(
|
|
|
165 |
try:
|
166 |
sentences = text_splitter(content)
|
167 |
audios = []
|
168 |
+
for sentence in tqdm(sentences, desc="TTS 推理中..."):
|
169 |
with torch.no_grad():
|
170 |
audios.append(
|
171 |
infer(
|
|
|
186 |
|
187 |
|
188 |
if __name__ == "__main__":
|
|
|
189 |
if debug:
|
190 |
logger.info("Enable DEBUG-LEVEL log")
|
191 |
logging.basicConfig(level=logging.DEBUG)
|
192 |
|
193 |
+
hps = utils.get_hparams_from_dir(MODEL_DIR)
|
194 |
device = (
|
195 |
"cuda:0"
|
196 |
if torch.cuda.is_available()
|
|
|
208 |
**hps.model,
|
209 |
).to(device)
|
210 |
net_g.eval()
|
211 |
+
utils.load_checkpoint(f"{MODEL_DIR}/G_78000.pth", net_g, None, skip_optimizer=True)
|
212 |
speaker_ids = hps.data.spk2id
|
213 |
speakers = list(speaker_ids.keys())
|
214 |
random.shuffle(speakers)
|
215 |
with gr.Blocks() as app:
|
216 |
gr.Markdown(
|
217 |
"""
|
218 |
+
欢迎使用此创空间,此创空间基于 <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a> 开源项目制作,移至最底端有原理浅讲。使用此创空间必须遵守当地相关法律法规,禁止用其从事任何违法犯罪活动。"""
|
219 |
)
|
220 |
|
221 |
+
with gr.Tab("输入模式"):
|
222 |
gr.Interface(
|
223 |
+
fn=infer_tab2, # 使用 text_to_speech 函数
|
224 |
inputs=[
|
225 |
gr.TextArea(
|
226 |
+
label="请输入简体中文文案",
|
227 |
+
placeholder="首次推理需耗时下载模型,还请耐心等待。",
|
228 |
show_copy_button=True,
|
229 |
),
|
230 |
+
gr.Dropdown(choices=speakers, value="莱依拉", label="角色"),
|
231 |
gr.Slider(
|
232 |
+
minimum=0, maximum=1, value=0.2, step=0.1, label="语调调节"
|
233 |
+
), # SDP/DP混合比
|
|
|
|
|
|
|
|
|
234 |
gr.Slider(
|
235 |
+
minimum=0.1, maximum=2, value=0.6, step=0.1, label="感情调节"
|
|
|
|
|
|
|
|
|
236 |
),
|
237 |
gr.Slider(
|
238 |
+
minimum=0.1, maximum=2, value=0.8, step=0.1, label="音素长度"
|
|
|
|
|
|
|
|
|
239 |
),
|
240 |
gr.Slider(
|
241 |
+
minimum=0.1, maximum=2, value=1, step=0.1, label="生成时长"
|
|
|
|
|
|
|
|
|
242 |
),
|
243 |
],
|
244 |
+
outputs=gr.Audio(label="输出音频"),
|
245 |
flagging_mode="never",
|
246 |
concurrency_limit=4,
|
247 |
)
|
248 |
|
249 |
+
with gr.Tab("上传模式"):
|
250 |
gr.Interface(
|
251 |
+
fn=infer_tab1, # 使用 text_to_speech 函数
|
252 |
inputs=[
|
253 |
gr.components.File(
|
254 |
+
label="请上传简体中文 TXT 文案",
|
255 |
type="filepath",
|
256 |
file_types=[".txt"],
|
257 |
),
|
258 |
+
gr.Dropdown(choices=speakers, value="莱依拉", label="角色"),
|
259 |
gr.Slider(
|
260 |
+
minimum=0, maximum=1, value=0.2, step=0.1, label="语调调节"
|
261 |
+
), # SDP/DP混合比
|
|
|
|
|
|
|
|
|
262 |
gr.Slider(
|
263 |
+
minimum=0.1, maximum=2, value=0.6, step=0.1, label="感情调节"
|
|
|
|
|
|
|
|
|
264 |
),
|
265 |
gr.Slider(
|
266 |
+
minimum=0.1, maximum=2, value=0.8, step=0.1, label="音素长度"
|
|
|
|
|
|
|
|
|
267 |
),
|
268 |
gr.Slider(
|
269 |
+
minimum=0.1, maximum=2, value=1, step=0.1, label="生成时长"
|
|
|
|
|
|
|
|
|
270 |
),
|
271 |
],
|
272 |
outputs=[
|
273 |
+
gr.Audio(label="输出音频"),
|
274 |
+
gr.TextArea(label="文案提取结果", show_copy_button=True),
|
|
|
|
|
|
|
275 |
],
|
276 |
flagging_mode="never",
|
277 |
concurrency_limit=4,
|
requirements.txt
CHANGED
@@ -1,9 +1,11 @@
|
|
|
|
|
|
1 |
av
|
2 |
cn2an
|
3 |
jieba
|
4 |
numba
|
5 |
scipy
|
6 |
-
|
7 |
pypinyin
|
8 |
Unidecode
|
9 |
matplotlib
|
@@ -11,6 +13,4 @@ phonemizer
|
|
11 |
tensorboard
|
12 |
amfm_decompy
|
13 |
transformers
|
14 |
-
|
15 |
-
numpy==1.26.4
|
16 |
-
librosa==0.9.1
|
|
|
1 |
+
torch==2.6.0+cu118
|
2 |
+
-f https://download.pytorch.org/whl/torch
|
3 |
av
|
4 |
cn2an
|
5 |
jieba
|
6 |
numba
|
7 |
scipy
|
8 |
+
librosa
|
9 |
pypinyin
|
10 |
Unidecode
|
11 |
matplotlib
|
|
|
13 |
tensorboard
|
14 |
amfm_decompy
|
15 |
transformers
|
16 |
+
numpy==1.26.4
|
|
|
|
text/chinese_bert.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import sys
|
2 |
import torch
|
|
|
3 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
|
|
4 |
|
5 |
device = torch.device(
|
6 |
"cuda"
|
@@ -13,7 +15,11 @@ device = torch.device(
|
|
13 |
)
|
14 |
|
15 |
# 模型下载
|
16 |
-
model_dir =
|
|
|
|
|
|
|
|
|
17 |
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
18 |
model = AutoModelForMaskedLM.from_pretrained(model_dir).to(device)
|
19 |
|
|
|
1 |
import sys
|
2 |
import torch
|
3 |
+
from modelscope import snapshot_download
|
4 |
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
5 |
+
from utils import EN_US
|
6 |
|
7 |
device = torch.device(
|
8 |
"cuda"
|
|
|
15 |
)
|
16 |
|
17 |
# 模型下载
|
18 |
+
model_dir = (
|
19 |
+
"hfl/chinese-roberta-wwm-ext-large"
|
20 |
+
if EN_US
|
21 |
+
else snapshot_download("dienstag/chinese-roberta-wwm-ext-large")
|
22 |
+
)
|
23 |
tokenizer = AutoTokenizer.from_pretrained(model_dir)
|
24 |
model = AutoModelForMaskedLM.from_pretrained(model_dir).to(device)
|
25 |
|
utils.py
CHANGED
@@ -6,14 +6,50 @@ import logging
|
|
6 |
import argparse
|
7 |
import requests
|
8 |
import subprocess
|
|
|
|
|
9 |
import numpy as np
|
10 |
from tqdm import tqdm
|
11 |
from scipy.io.wavfile import read
|
12 |
|
13 |
|
14 |
MATPLOTLIB_FLAG = False
|
15 |
-
|
16 |
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
|
19 |
def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
|
|
|
6 |
import argparse
|
7 |
import requests
|
8 |
import subprocess
|
9 |
+
import modelscope
|
10 |
+
import huggingface_hub
|
11 |
import numpy as np
|
12 |
from tqdm import tqdm
|
13 |
from scipy.io.wavfile import read
|
14 |
|
15 |
|
16 |
MATPLOTLIB_FLAG = False
|
|
|
17 |
logger = logging.getLogger(__name__)
|
18 |
+
EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
|
19 |
+
ZH2EN = {
|
20 |
+
"输入模式": "Input Mode",
|
21 |
+
"请输入简体中文文案": "Please input the Simplified Chinese text",
|
22 |
+
"首次推理需耗时下载模型,还请耐心等待。": "The first inference takes time to download the model, so be patient.",
|
23 |
+
"角色": "Role",
|
24 |
+
"状态栏": "Status",
|
25 |
+
"语调调节": "Modulation of intonation",
|
26 |
+
"感情调节": "Emotional adjustment",
|
27 |
+
"音素长度": "Phoneme length",
|
28 |
+
"生成时长": "Output duration",
|
29 |
+
"输出音频": "Output Audio",
|
30 |
+
"上传模式": "Upload Mode",
|
31 |
+
"请上传简体中文 TXT 文案": "Please upload a simplified Chinese TXT",
|
32 |
+
"文案提取结果": "Result of TXT extraction",
|
33 |
+
"""
|
34 |
+
欢迎使用此创空间,此创空间基于 <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a> 开源项目制作,移至最底端有原理浅讲。使用此创空间必须遵守当地相关法律法规,禁止用其从事任何违法犯罪活动。""": """
|
35 |
+
Welcome to the Space, which is based on the open source project <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a>, and moved to the bottom for an explanation of the principle. This Space must be used in accordance with local laws and regulations, prohibiting the use of it for any criminal activities.""",
|
36 |
+
}
|
37 |
+
|
38 |
+
MODEL_DIR = (
|
39 |
+
huggingface_hub.snapshot_download(
|
40 |
+
"Genius-Society/hoyoTTS",
|
41 |
+
cache_dir="./__pycache__",
|
42 |
+
)
|
43 |
+
if EN_US
|
44 |
+
else modelscope.snapshot_download(
|
45 |
+
"Genius-Society/hoyoTTS",
|
46 |
+
cache_dir="./__pycache__",
|
47 |
+
)
|
48 |
+
)
|
49 |
+
|
50 |
+
|
51 |
+
def _L(zh_txt: str):
    """Return the UI text for the active locale.

    Args:
        zh_txt: The Chinese source string, used as the lookup key in
            ``ZH2EN``.

    Returns:
        The English translation when ``EN_US`` is true, otherwise
        ``zh_txt`` unchanged. A string with no entry in ``ZH2EN`` is
        returned as-is rather than raising ``KeyError``, so an untranslated
        label cannot break construction of the UI.
    """
    if not EN_US:
        return zh_txt
    return ZH2EN.get(zh_txt, zh_txt)
|
53 |
|
54 |
|
55 |
def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
|