txya900619 committed

Commit 0c074b9 · Parent: 947b905

feat: add app.py
DEMO.md ADDED
@@ -0,0 +1,17 @@
+ # 原語會族語語音合成系統
+
+ ILRDF Formosan Text-To-Speech System
+
+ ## 研發團隊 (Development Team)
+
+ - [李鴻欣 Hung-Shin Lee](mailto:[email protected])
+ - [陳力瑋 Li-Wei Chen](mailto:[email protected])
+ - [意傳科技](https://ithuan.tw/)
+ - [原住民族語言研究發展基金會](https://www.ilrdf.org.tw/)
+
+ ## 特別致謝 (Special Thanks)
+ - [聯和科創](https://www.104.com.tw/company/1a2x6bmu75)
+ - [Pipalofasaran to Sowal no Pangcah/'Amis 台灣阿美族語言永續發展學會](https://www.facebook.com/groups/ypspt/about)
+ - [台灣太魯閣族語言發展學會](https://qkktt.com/)
+ - [台灣原住民族賽德克族語言文化學會](https://www.facebook.com/3S3TBL/)
+ - 族語老師們 (the Indigenous language teachers)
app.py ADDED
@@ -0,0 +1,274 @@
+ import tempfile
+
+ import gradio as gr
+ import soundfile as sf
+ import torchaudio
+ from cached_path import cached_path
+ from omegaconf import OmegaConf
+
+ from ipa.ipa import text_to_ipa
+
+ try:
+     import spaces
+
+     USING_SPACES = True
+ except ImportError:
+     USING_SPACES = False
+
+ from f5_tts.infer.utils_infer import (
+     infer_process,
+     load_model,
+     load_vocoder,
+     preprocess_ref_audio_text,
+     remove_silence_for_generated_wav,
+     save_spectrogram,
+ )
+ from f5_tts.model import DiT
+
+
+ def gpu_decorator(func):
+     # Only wrap with spaces.GPU when running on Hugging Face Spaces.
+     if USING_SPACES:
+         return spaces.GPU(func)
+     else:
+         return func
+
+
+ vocoder = load_vocoder()
+
+
+ def load_f5tts(ckpt_path, vocab_path):
+     ckpt_path = str(cached_path(ckpt_path))
+     F5TTS_model_cfg = dict(
+         dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4
+     )
+     vocab_path = str(cached_path(vocab_path))
+     return load_model(DiT, F5TTS_model_cfg, ckpt_path, vocab_file=vocab_path)
+
+
+ OmegaConf.register_new_resolver("load_f5tts", load_f5tts)
+
+ models_config = OmegaConf.to_object(OmegaConf.load("configs/models.yaml"))
+ # NOTE: configs/dialects.yaml is expected alongside models.yaml.
+ dialects = OmegaConf.to_object(OmegaConf.load("configs/dialects.yaml"))
+
+
+ DEFAULT_MODEL_ID = list(models_config.keys())[0]
+ DEFAULT_DIALECT = list(dialects.values())[0]
+
+
+ @gpu_decorator
+ def infer(
+     ref_audio_orig,
+     ref_text,
+     gen_text,
+     model,
+     remove_silence,
+     cross_fade_duration=0.15,
+     nfe_step=32,
+     fix_duration=None,  # total (reference + generated) length in seconds; None lets the model estimate it
+     show_info=gr.Info,
+ ):
+     if not ref_audio_orig:
+         gr.Warning("Please provide reference audio.")
+         return gr.update(), gr.update()
+
+     if not gen_text.strip():
+         gr.Warning("Please enter text to generate.")
+         return gr.update(), gr.update()
+
+     ref_audio, ref_text = preprocess_ref_audio_text(
+         ref_audio_orig, ref_text, show_info=show_info
+     )
+
+     final_wave, final_sample_rate, combined_spectrogram = infer_process(
+         ref_audio,
+         ref_text,
+         gen_text,
+         model,
+         vocoder,
+         cross_fade_duration=cross_fade_duration,
+         nfe_step=nfe_step,
+         fix_duration=fix_duration,
+         show_info=show_info,
+         progress=gr.Progress(),
+     )
+
+     # Optionally remove silences the model tends to insert in long generations.
+     if remove_silence:
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+             sf.write(f.name, final_wave, final_sample_rate)
+             remove_silence_for_generated_wav(f.name)
+             final_wave, _ = torchaudio.load(f.name)
+         final_wave = final_wave.squeeze().cpu().numpy()
+
+     print(f"Final wave duration: {final_wave.shape[0] / final_sample_rate:.2f}s")
+
+     # Save the spectrogram to a temporary PNG for display.
+     with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
+         spectrogram_path = tmp_spectrogram.name
+     save_spectrogram(combined_spectrogram, spectrogram_path)
+
+     return (final_sample_rate, final_wave), spectrogram_path
+
+
+ def get_title():
+     # The page title is the first heading of DEMO.md.
+     with open("DEMO.md") as tong:
+         return tong.readline().strip().lstrip("# ")
+
+
+ demo = gr.Blocks(
+     title=get_title(),
+     css="@import url(https://tauhu.tw/tauhu-oo.css);",
+     theme=gr.themes.Default(
+         font=(
+             "tauhu-oo",
+             gr.themes.GoogleFont("Source Sans Pro"),
+             "ui-sans-serif",
+             "system-ui",
+             "sans-serif",
+         )
+     ),
+ )
+
+ with demo:
+     with open("DEMO.md") as tong:
+         gr.Markdown(tong.read())
+
+     with gr.Row():
+         with gr.Column():
+             model_drop_down = gr.Dropdown(
+                 models_config.keys(),
+                 value=DEFAULT_MODEL_ID,
+                 label="模型",  # "Model"
+             )
+
+             ref_audio_input = gr.Audio(
+                 type="filepath",
+                 waveform_options=gr.WaveformOptions(
+                     sample_rate=24000,
+                 ),
+                 label="Reference Audio",
+             )
+             ref_text_input = gr.Textbox(
+                 value="",
+                 label="Reference Text",
+             )
+
+             gen_text_input = gr.Textbox(
+                 label="Text to Generate",
+                 value="",
+             )
+
+             generate_btn = gr.Button("Synthesize", variant="primary")
+
+             with gr.Accordion("Advanced Settings", open=False):
+                 remove_silence = gr.Checkbox(
+                     label="Remove Silences",
+                     info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
+                     value=False,
+                 )
+                 speed_slider = gr.Slider(
+                     label="Speed",
+                     minimum=0.3,
+                     maximum=2.0,
+                     value=1.0,
+                     step=0.1,
+                     info="語速(越小越慢)",  # "Speech rate (lower is slower)"
+                 )
+                 nfe_slider = gr.Slider(
+                     label="NFE Steps",
+                     minimum=4,
+                     maximum=64,
+                     value=32,
+                     step=2,
+                     info="Set the number of denoising steps.",
+                 )
+                 cross_fade_duration_slider = gr.Slider(
+                     label="Cross-Fade Duration (s)",
+                     minimum=0.0,
+                     maximum=1.0,
+                     value=0.15,
+                     step=0.01,
+                     info="Set the duration of the cross-fade between audio clips.",
+                 )
+         with gr.Column():
+             audio_output = gr.Audio(label="Synthesized Audio")
+             spectrogram_output = gr.Image(label="Spectrogram")
+
+     @gpu_decorator
+     def basic_tts(
+         model_drop_down: str,
+         ref_audio_input: str,
+         ref_text_input: str,
+         gen_text_input: str,
+         remove_silence: bool,
+         cross_fade_duration_slider: float,
+         nfe_slider: int,
+         speed_slider: float,
+     ):
+         # Validate before computing durations, so an empty reference text
+         # cannot cause a division by zero below.
+         if len(ref_text_input) == 0 or len(gen_text_input) == 0:
+             raise gr.Error("請勿輸入空字串。")  # "Please do not enter an empty string."
+
+         # Estimate the generated duration from the character-length ratio of
+         # the two texts, scaled by the requested speed.
+         ref_audio_info = torchaudio.info(ref_audio_input)
+         ref_duration = ref_audio_info.num_frames / ref_audio_info.sample_rate
+         target_duration = (
+             ref_duration
+             * len(gen_text_input.replace(" ", ""))
+             / len(ref_text_input.replace(" ", ""))
+             / speed_slider
+         )
+         print(f"Reference duration: {ref_duration}")
+         print(f"Target duration: {target_duration}")
+
+         ref_text_input = text_to_ipa(ref_text_input)
+         gen_text_input = text_to_ipa(gen_text_input)
+
+         audio_out, spectrogram_path = infer(
+             ref_audio_input,
+             ref_text_input,
+             gen_text_input,
+             models_config[model_drop_down],
+             remove_silence,
+             cross_fade_duration=cross_fade_duration_slider,
+             nfe_step=nfe_slider,
+             # fix_duration is the total clip length: reference plus generated.
+             fix_duration=ref_duration + target_duration,
+         )
+         return audio_out, spectrogram_path
+
+     generate_btn.click(
+         basic_tts,
+         inputs=[
+             model_drop_down,
+             ref_audio_input,
+             ref_text_input,
+             gen_text_input,
+             remove_silence,
+             cross_fade_duration_slider,
+             nfe_slider,
+             speed_slider,
+         ],
+         outputs=[audio_output, spectrogram_output],
+     )
+     gr.Examples(
+         [
+             [
+                 "./ref_wav/E-PV001-0085.wav",
+                 "romakat kako a talapicodadan to romi’ami’ad",
+                 "Mafana’ kiso a misanoPangcah haw?",
+             ],
+             [
+                 "./ref_wav/E-PV001-0254.wav",
+                 "kering sa masoni^ to ko pipahanhanan a tatokian o fe:soc no niyam a tayra i piondoan",
+                 "Pafelien cingra to misapoeneray a faloco', nanay mada'oc matilid i faloco' nira konini.",
+             ],
+         ],
+         label="範例",  # "Examples"
+         inputs=[
+             ref_audio_input,
+             ref_text_input,
+             gen_text_input,
+         ],
+     )
+
+ demo.launch()
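The duration arithmetic in `basic_tts` is worth reading on its own: the `fix_duration` handed to `infer` covers the whole output clip (reference prompt plus generated speech), so the script budgets the generated part from the character-length ratio of the two texts. A minimal sketch with illustrative numbers (the texts come from the examples above; the 3.2 s reference duration is made up):

```python
ref_duration = 3.2  # seconds of reference audio (illustrative value)
ref_text = "romakat kako a talapicodadan to romi’ami’ad"
gen_text = "Mafana’ kiso a misanoPangcah haw?"
speed = 1.0  # UI default

# Same formula as basic_tts: scale by non-space character counts and speed.
target_duration = (
    ref_duration
    * len(gen_text.replace(" ", ""))
    / len(ref_text.replace(" ", ""))
    / speed
)
# infer() then receives the *total* clip length: reference + generated.
fix_duration = ref_duration + target_duration
print(f"generated ≈ {target_duration:.2f}s, total ≈ {fix_duration:.2f}s")
```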
configs/g2p.yaml ADDED
@@ -0,0 +1 @@
+ g2p: ${load_g2p:${gh_download:FormoSpeech/FormoG2P, formosan/g2p.csv}}
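The nested resolvers mean that simply loading this config fetches and parses the grapheme-to-phoneme table. Unfolded into plain Python, using the `gh_download` and `load_g2p` helpers that `ipa/__init__.py` registers later in this commit, the line is roughly equivalent to:

```python
from ipa import gh_download, load_g2p  # importing also registers the resolvers

# What ${load_g2p:${gh_download:...}} expands to when the config is resolved:
csv_text = gh_download("FormoSpeech/FormoG2P", "formosan/g2p.csv")
g2p = load_g2p(csv_text)  # {lang_tag: {grapheme: phoneme}}
```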
configs/models.yaml ADDED
@@ -0,0 +1,4 @@
+ step-24000: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_24000.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
+ step-48000: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_48000.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
+ step-60000: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_60000.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
+ step-76500: ${load_f5tts:hf://united-link/f5-tts-ami-xiuguluan-finetune/model_76500.safetensors,hf://united-link/f5-tts-ami-xiuguluan-finetune/vocab.txt}
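Each entry runs through the `load_f5tts` resolver registered in `app.py`. OmegaConf resolves values lazily, but `app.py` calls `OmegaConf.to_object(...)` on this file, which forces every entry to resolve, so all four checkpoints are downloaded and loaded at startup. A sketch of loading a single checkpoint instead (assuming the resolver from `app.py` has already been registered):

```python
from omegaconf import OmegaConf

# "load_f5tts" must already be registered, as app.py does at import time:
# OmegaConf.register_new_resolver("load_f5tts", load_f5tts)
cfg = OmegaConf.load("configs/models.yaml")
model = cfg["step-76500"]  # resolves (downloads and loads) only this entry
```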
ipa/__init__.py ADDED
@@ -0,0 +1,72 @@
+ import csv
+
+ import requests
+ from omegaconf import OmegaConf
+
+ # Fallback grapheme-to-phoneme entries for symbols missing from the sheet.
+ EXTRA_G2P = {
+     "z": "z",
+     "o": "o",
+     "h": "h",
+     "g": "g",
+     "y": "j",
+     "w": "w",
+     "c": "ʦ",
+     "u": "u",
+     "f": "f",
+     "v": "v",
+     "j": "ɟ",
+     "b": "b",
+     "q": "q",
+     "e": "e",
+     ",": ",",
+ }
+
+
+ def gh_download(repo, path):
+     """Fetch a file's raw text from a GitHub repository."""
+     headers = {
+         "Accept": "application/vnd.github.raw+json",
+     }
+
+     url = f"https://api.github.com/repos/{repo}/contents/{path}"
+     response = requests.get(url, headers=headers)
+     if response.status_code != 200:
+         raise Exception(f"Failed to download {path} from {repo}, response: {response}")
+     response.encoding = "utf-8-sig"
+
+     return response.text
+
+
+ def load_g2p(g2p_string):
+     """Parse the G2P CSV into {lang_tag: {grapheme: phoneme}}."""
+     g2p = dict()
+
+     csv_reader = csv.DictReader(g2p_string.split("\n"))
+
+     for row in csv_reader:
+         language = row["Language"]
+         dialect = row["Dialect"]
+
+         if dialect == "-":
+             lang_tag = f"{language}"
+         else:
+             lang_tag = f"{language}_{dialect}"
+
+         g2p[lang_tag] = g2p.get(lang_tag, {})
+
+         for key in row:
+             if key in ["Language", "Dialect"]:
+                 continue
+
+             if row[key] == "-":
+                 continue
+
+             # A cell may hold comma-separated variants; keep only the first.
+             g2p[lang_tag][key] = row[key].split(",")[0]
+
+         # Fill in fallbacks for graphemes the sheet does not define.
+         for g, p in EXTRA_G2P.items():
+             if g not in g2p[lang_tag]:
+                 g2p[lang_tag][g] = p
+
+     return g2p
+
+
+ OmegaConf.register_new_resolver("gh_download", gh_download)
+ OmegaConf.register_new_resolver("load_g2p", load_g2p)
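The shape `load_g2p` produces is easiest to see on a tiny hand-made CSV (the `ng`/`a` columns and the single 阿美/秀姑巒 row below are illustrative, not the real sheet):

```python
from ipa import load_g2p

sample_csv = "Language,Dialect,ng,a\n阿美,秀姑巒,ŋ,a\n"
g2p = load_g2p(sample_csv)

assert g2p["阿美_秀姑巒"]["ng"] == "ŋ"  # first variant from the cell
assert g2p["阿美_秀姑巒"]["c"] == "ʦ"   # filled in from EXTRA_G2P
```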
ipa/ipa.py ADDED
@@ -0,0 +1,58 @@
+ import re
+
+ from omegaconf import OmegaConf
+
+ # Resolving configs/g2p.yaml downloads and parses the G2P table; keep only
+ # the Amis Xiuguluan (阿美_秀姑巒) mapping.
+ XIUGULUAN_G2P = OmegaConf.to_object(OmegaConf.load("configs/g2p.yaml"))["g2p"][
+     "阿美_秀姑巒"
+ ]
+
+
+ def can_form_string(x, symbol_dict):
+     """Try to segment `x` into keys of `symbol_dict`.
+
+     Keys are tried in dictionary order with backtracking; returns
+     (True, matched_keys) for the first complete segmentation, else (False, []).
+     """
+     def helper(x, symbol_dict, matched_parts):
+         if not x:
+             return True, matched_parts
+
+         for key in symbol_dict.keys():
+             if x.startswith(key):
+                 result, parts = helper(
+                     x[len(key) :], symbol_dict, matched_parts + [key]
+                 )
+                 if result:
+                     return True, parts
+
+         return False, []
+
+     return helper(x, symbol_dict, [])
+
+
+ def text_to_ipa(text, ignore_comma=True):
+     ipa = []
+     text = text.lower()
+     text = re.sub(r"[.?!]", "", text)
+     text = text.replace("'", "’")
+     words = text.split()  # simple whitespace tokenization; may change in future
+
+     print(f"ipa: {words}")
+
+     extended_g2p = {**XIUGULUAN_G2P, ",": "" if ignore_comma else ","}
+
+     for word in words:
+         result, matched_parts = can_form_string(word, extended_g2p)
+
+         if result is False:
+             # Bail out with an empty string if any word cannot be segmented.
+             print(f"no match g2p : {word}")
+             return ""
+
+         ipa.append("".join(extended_g2p[part] for part in matched_parts))
+
+     # Normalize to standard IPA symbols.
+     ipa = (
+         " ".join(ipa)
+         .replace("g", "ɡ")
+         .replace("ʦ", "t͡s")
+         .replace("ʨ", "t͡ɕ")
+         .replace("R", "ʀ")
+         .replace("ʤ", "dʒ")
+     )
+     return ipa
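Note that `can_form_string` does not prefer the longest grapheme: it returns the first complete segmentation it finds, trying keys in dictionary order and backtracking out of dead ends. A toy illustration (the mapping is made up; importing `ipa.ipa` resolves `configs/g2p.yaml` at import time):

```python
from ipa.ipa import can_form_string

toy = {"a": "a", "ab": "b", "c": "ʦ"}
ok, parts = can_form_string("abc", toy)
print(ok, parts)  # True ['ab', 'c'] -- 'a' alone dead-ends at 'b', so it backtracks
```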
ref_wav/E-PV001-0085.wav ADDED
Binary file (309 kB).
ref_wav/E-PV001-0254.wav ADDED
Binary file (548 kB).
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ omegaconf
+ opencc
+ requests  # imported by ipa/__init__.py
+ git+https://github.com/SWivid/F5-TTS.git