admin committed · Commit a71a582 · 1 parent: be79c50

Files changed (4):
  1. app.py +51 -82
  2. requirements.txt +4 -4
  3. text/chinese_bert.py +7 -1
  4. utils.py +37 -1
app.py CHANGED
@@ -1,19 +1,18 @@
-import re
-import os
-import sys
+from text.symbols import symbols
+from text.cleaner import clean_text
+from text import cleaned_text_to_sequence, get_bert
+from models import SynthesizerTrn
+from tqdm import tqdm
+from utils import _L, MODEL_DIR
+import gradio as gr
+import numpy as np
+import commons
+import random
 import utils
 import torch
-import random
-import commons
-import numpy as np
-import gradio as gr
-from tqdm import tqdm
-from models import SynthesizerTrn
-from huggingface_hub import snapshot_download
-from text import cleaned_text_to_sequence, get_bert
-from text.cleaner import clean_text
-from text.symbols import symbols
-
+import sys
+import re
+import os
 
 if sys.platform == "darwin":
     os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
@@ -25,7 +24,8 @@ logging.getLogger("markdown_it").setLevel(logging.WARNING)
 logging.getLogger("urllib3").setLevel(logging.WARNING)
 logging.getLogger("matplotlib").setLevel(logging.WARNING)
 logging.basicConfig(
-    level=logging.INFO, format="| %(name)s | %(levelname)s | %(message)s"
+    level=logging.INFO,
+    format="| %(name)s | %(levelname)s | %(message)s",
 )
 
 logger = logging.getLogger(__name__)
@@ -102,34 +102,39 @@ def tts_fn(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
 
 def text_splitter(text: str):
     punctuation = r"[。,;,!,?,〜,\n,\r,\t,.,!,;,?,~, ]"
+    # 使用正则表达式根据标点符号分割文本, 并忽略重叠的分隔符
     sentences = re.split(punctuation, text.strip())
+    # 过滤掉空字符串
     return [sentence.strip() for sentence in sentences if sentence.strip()]
 
 
 def concatenate_audios(audio_samples, sample_rate=44100):
     half_second_silence = np.zeros(int(sample_rate / 2))
+    # 初始化最终的音频数组
     final_audio = audio_samples[0]
+    # 遍历音频样本列表, 并将它们连接起来, 每个样本之间插入半秒钟的静音
     for sample in audio_samples[1:]:
         final_audio = np.concatenate((final_audio, half_second_silence, sample))
 
-    print("Audio pieces concatenated!")
+    print("音频片段连接完成!")
     return (sample_rate, final_audio)
 
 
 def read_text(file_path: str):
     try:
+        # 打开文件并读取内容
         with open(file_path, "r", encoding="utf-8") as file:
             content = file.read()
             return content
 
     except FileNotFoundError:
-        print(f"File Not Found: {file_path}")
+        print(f"文件未找到: {file_path}")
 
     except IOError:
-        print(f"An error occurred reading the file: {file_path}")
+        print(f"读取文件时发生错误: {file_path}")
 
     except Exception as e:
-        print(f"An unknown error has occurred: {e}")
+        print(f"发生未知错误: {e}")
 
 
 def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
@@ -137,7 +142,7 @@ def infer_tab1(text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
     content = read_text(text)
     sentences = text_splitter(content)
     audios = []
-    for sentence in tqdm(sentences, desc="TTS inferring..."):
+    for sentence in tqdm(sentences, desc="TTS 推理中..."):
         with torch.no_grad():
             audios.append(
                 infer(
@@ -160,7 +165,7 @@ def infer_tab2(content, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
     try:
         sentences = text_splitter(content)
         audios = []
-        for sentence in tqdm(sentences, desc="TTS inferring..."):
+        for sentence in tqdm(sentences, desc="TTS 推理中..."):
             with torch.no_grad():
                 audios.append(
                     infer(
@@ -181,12 +186,11 @@ def infer_tab2(content, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale):
 
 
 if __name__ == "__main__":
-    model_dir = snapshot_download("Genius-Society/hoyoTTS", cache_dir="./__pycache__")
     if debug:
         logger.info("Enable DEBUG-LEVEL log")
         logging.basicConfig(level=logging.DEBUG)
 
-    hps = utils.get_hparams_from_dir(model_dir)
+    hps = utils.get_hparams_from_dir(MODEL_DIR)
     device = (
         "cuda:0"
         if torch.cuda.is_available()
@@ -204,105 +208,70 @@ if __name__ == "__main__":
         **hps.model,
     ).to(device)
     net_g.eval()
-    utils.load_checkpoint(f"{model_dir}/G_78000.pth", net_g, None, skip_optimizer=True)
+    utils.load_checkpoint(f"{MODEL_DIR}/G_78000.pth", net_g, None, skip_optimizer=True)
     speaker_ids = hps.data.spk2id
     speakers = list(speaker_ids.keys())
     random.shuffle(speakers)
     with gr.Blocks() as app:
         gr.Markdown(
             """
-        Welcome to the Space, which is based on the open source project <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a>, and moved to the bottom for an explanation of the principle. This Space must be used in accordance with local laws and regulations, prohibiting the use of it for any criminal activities."""
+        欢迎使用此创空间,此创空间基于 <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a> 开源项目制作,移至最底端有原理浅讲。使用此创空间必须遵守当地相关法律法规,禁止用其从事任何违法犯罪活动。"""
         )
 
-        with gr.Tab("Input Mode"):
+        with gr.Tab("输入模式"):
             gr.Interface(
-                fn=infer_tab2,
+                fn=infer_tab2,  # 使用 text_to_speech 函数
                 inputs=[
                     gr.TextArea(
-                        label="Please input the Simplified Chinese text",
-                        placeholder="The first inference takes time to download the model, so be patient.",
+                        label="请输入简体中文文案",
+                        placeholder="首次推理需耗时下载模型,还请耐心等待。",
                         show_copy_button=True,
                     ),
-                    gr.Dropdown(choices=speakers, value="莱依拉", label="Role"),
+                    gr.Dropdown(choices=speakers, value="莱依拉", label="角色"),
                     gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        value=0.2,
-                        step=0.1,
-                        label="Modulation of intonation",
-                    ),  # SDP/DP Mix Ratio
+                        minimum=0, maximum=1, value=0.2, step=0.1, label="语调调节"
+                    ),  # SDP/DP混合比
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.6,
-                        step=0.1,
-                        label="Emotional adjustment",
+                        minimum=0.1, maximum=2, value=0.6, step=0.1, label="感情调节"
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.8,
-                        step=0.1,
-                        label="Phoneme length",
+                        minimum=0.1, maximum=2, value=0.8, step=0.1, label="音素长度"
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=1,
-                        step=0.1,
-                        label="Output duration",
+                        minimum=0.1, maximum=2, value=1, step=0.1, label="生成时长"
                     ),
                 ],
-                outputs=gr.Audio(label="Output Audio", show_share_button=False),
+                outputs=gr.Audio(label="输出音频"),
                 flagging_mode="never",
                 concurrency_limit=4,
             )
 
-        with gr.Tab("Upload Mode"):
+        with gr.Tab("上传模式"):
             gr.Interface(
-                fn=infer_tab1,  # Use text_to_speech func
+                fn=infer_tab1,  # 使用 text_to_speech 函数
                 inputs=[
                     gr.components.File(
-                        label="Please upload a simplified Chinese TXT",
+                        label="请上传简体中文 TXT 文案",
                         type="filepath",
                         file_types=[".txt"],
                     ),
-                    gr.Dropdown(choices=speakers, value="莱依拉", label="Role"),
+                    gr.Dropdown(choices=speakers, value="莱依拉", label="角色"),
                     gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        value=0.2,
-                        step=0.1,
-                        label="Modulation of intonation",
-                    ),
+                        minimum=0, maximum=1, value=0.2, step=0.1, label="语调调节"
+                    ),  # SDP/DP混合比
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.6,
-                        step=0.1,
-                        label="Emotional adjustment",
+                        minimum=0.1, maximum=2, value=0.6, step=0.1, label="感情调节"
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=0.8,
-                        step=0.1,
-                        label="Phoneme length",
+                        minimum=0.1, maximum=2, value=0.8, step=0.1, label="音素长度"
                     ),
                     gr.Slider(
-                        minimum=0.1,
-                        maximum=2,
-                        value=1,
-                        step=0.1,
-                        label="Output duration",
+                        minimum=0.1, maximum=2, value=1, step=0.1, label="生成时长"
                     ),
                 ],
                 outputs=[
-                    gr.Audio(label="Output Audio", show_share_button=False),
-                    gr.TextArea(
-                        label="Result of TXT extraction",
-                        show_copy_button=True,
-                    ),
+                    gr.Audio(label="输出音频"),
+                    gr.TextArea(label="文案提取结果", show_copy_button=True),
                 ],
                 flagging_mode="never",
                 concurrency_limit=4,
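
Note for reviewers: the two helpers touched above compose as follows. A minimal runnable sketch, not part of the commit — the sine clips are hypothetical stand-ins for real infer() output, and the 44100 Hz rate comes from concatenate_audios' own default:

    import re
    import numpy as np

    def text_splitter(text: str):
        # same splitting rule as in the hunk above: CJK and ASCII punctuation
        punctuation = r"[。,;,!,?,〜,\n,\r,\t,.,!,;,?,~, ]"
        return [s.strip() for s in re.split(punctuation, text.strip()) if s.strip()]

    def concatenate_audios(audio_samples, sample_rate=44100):
        # join consecutive clips with half a second of silence between them
        half_second_silence = np.zeros(int(sample_rate / 2))
        final_audio = audio_samples[0]
        for sample in audio_samples[1:]:
            final_audio = np.concatenate((final_audio, half_second_silence, sample))
        return (sample_rate, final_audio)

    sentences = text_splitter("你好。世界!")  # -> ["你好", "世界"]
    clips = [np.sin(np.linspace(0, 440, 44100)) for _ in sentences]  # stand-ins for infer() output
    rate, audio = concatenate_audios(clips)
    assert (rate, len(audio)) == (44100, 44100 + 22050 + 44100)  # two clips + 0.5 s gap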
requirements.txt CHANGED
@@ -1,9 +1,11 @@
+torch==2.6.0+cu118
+-f https://download.pytorch.org/whl/torch
 av
 cn2an
 jieba
 numba
 scipy
-gradio
+librosa
 pypinyin
 Unidecode
 matplotlib
@@ -11,6 +13,4 @@ phonemizer
 tensorboard
 amfm_decompy
 transformers
-torch==2.3.1
-numpy==1.26.4
-librosa==0.9.1
+numpy==1.26.4
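
The new torch==2.6.0+cu118 pin only resolves because of the accompanying -f https://download.pytorch.org/whl/torch find-links line; PyPI itself does not host +cu118 local-version wheels. A quick post-install sanity check (assumes a CUDA 11.8-capable host; illustrative only, not part of the commit):

    # run after: pip install -r requirements.txt
    import numpy
    import torch

    print(torch.__version__)          # expected: 2.6.0+cu118
    print(numpy.__version__)          # expected: 1.26.4 (pinned)
    print(torch.cuda.is_available())  # True only where a compatible CUDA driver is present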
 
 
text/chinese_bert.py CHANGED
@@ -1,6 +1,8 @@
 import sys
 import torch
+from modelscope import snapshot_download
 from transformers import AutoTokenizer, AutoModelForMaskedLM
+from utils import EN_US
 
 device = torch.device(
     "cuda"
@@ -13,7 +15,11 @@
 )
 
 # 模型下载
-model_dir = "hfl/chinese-roberta-wwm-ext-large"
+model_dir = (
+    "hfl/chinese-roberta-wwm-ext-large"
+    if EN_US
+    else snapshot_download("dienstag/chinese-roberta-wwm-ext-large")
+)
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 model = AutoModelForMaskedLM.from_pretrained(model_dir).to(device)
 
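The hunk above switches the BERT weight source on the EN_US flag imported from utils.py: the Hugging Face Hub id on non-zh_CN locales, a ModelScope mirror (dienstag/chinese-roberta-wwm-ext-large, materialized to a local path by snapshot_download) otherwise. A minimal illustration of the gate itself, with hypothetical locale values:

    def pick_model_dir(lang: str) -> str:
        en_us = lang != "zh_CN.UTF-8"  # mirrors EN_US in utils.py
        # the real else branch returns snapshot_download(...)'s local path, not the bare id
        return (
            "hfl/chinese-roberta-wwm-ext-large"
            if en_us
            else "dienstag/chinese-roberta-wwm-ext-large"
        )

    print(pick_model_dir("en_US.UTF-8"))  # hfl/chinese-roberta-wwm-ext-large
    print(pick_model_dir("zh_CN.UTF-8"))  # dienstag/chinese-roberta-wwm-ext-large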
utils.py CHANGED
@@ -6,14 +6,50 @@ import logging
 import argparse
 import requests
 import subprocess
+import modelscope
+import huggingface_hub
 import numpy as np
 from tqdm import tqdm
 from scipy.io.wavfile import read
 
 
 MATPLOTLIB_FLAG = False
-
 logger = logging.getLogger(__name__)
+EN_US = os.getenv("LANG") != "zh_CN.UTF-8"
+ZH2EN = {
+    "输入模式": "Input Mode",
+    "请输入简体中文文案": "Please input the Simplified Chinese text",
+    "首次推理需耗时下载模型,还请耐心等待。": "The first inference takes time to download the model, so be patient.",
+    "角色": "Role",
+    "状态栏": "Status",
+    "语调调节": "Modulation of intonation",
+    "感情调节": "Emotional adjustment",
+    "音素长度": "Phoneme length",
+    "生成时长": "Output duration",
+    "输出音频": "Output Audio",
+    "上传模式": "Upload Mode",
+    "请上传简体中文 TXT 文案": "Please upload a simplified Chinese TXT",
+    "文案提取结果": "Result of TXT extraction",
+    """
+        欢迎使用此创空间,此创空间基于 <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a> 开源项目制作,移至最底端有原理浅讲。使用此创空间必须遵守当地相关法律法规,禁止用其从事任何违法犯罪活动。""": """
+        Welcome to the Space, which is based on the open source project <a href="https://github.com/fishaudio/Bert-VITS2">Bert-vits2</a>, and moved to the bottom for an explanation of the principle. This Space must be used in accordance with local laws and regulations, prohibiting the use of it for any criminal activities.""",
+}
+
+MODEL_DIR = (
+    huggingface_hub.snapshot_download(
+        "Genius-Society/hoyoTTS",
+        cache_dir="./__pycache__",
+    )
+    if EN_US
+    else modelscope.snapshot_download(
+        "Genius-Society/hoyoTTS",
+        cache_dir="./__pycache__",
+    )
+)
+
+
+def _L(zh_txt: str):
+    return ZH2EN[zh_txt] if EN_US else zh_txt
 
 
 def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
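
For context, the new _L helper lets the UI author every string once in Chinese: it returns the source string on zh_CN locales and the ZH2EN translation elsewhere, and strings missing from the table raise KeyError under EN_US. A usage sketch (hypothetical locale; note EN_US is evaluated when utils is first imported, and importing utils also kicks off the MODEL_DIR snapshot download):

    import os

    os.environ["LANG"] = "en_US.UTF-8"  # must be set before utils is first imported

    from utils import _L

    print(_L("角色"))      # -> "Role"
    print(_L("输出音频"))  # -> "Output Audio"
    # with LANG=zh_CN.UTF-8 both calls return the Chinese strings unchanged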