Billpai committed · Commit 4e04d23 · 1 Parent(s): 0312eff
processors/__init__.py ADDED
File without changes
processors/acoustic_extractor.py ADDED
@@ -0,0 +1,903 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import torch
8
+ import numpy as np
9
+
10
+ import json
11
+ from tqdm import tqdm
12
+ from sklearn.preprocessing import StandardScaler
13
+ from utils.io import save_feature, save_txt
14
+ from utils.util import has_existed
15
+ from utils.tokenizer import extract_encodec_token
16
+ from utils.stft import TacotronSTFT
17
+ from utils.dsp import compress, audio_to_label
18
+ from utils.data_utils import remove_outlier
19
+ from preprocessors.metadata import replace_augment_name
20
+ from scipy.interpolate import interp1d
21
+
22
+ ZERO = 1e-12
23
+
24
+
25
+ def extract_utt_acoustic_features_parallel(metadata, dataset_output, cfg, n_workers=1):
26
+ """Extract acoustic features from utterances (multi-process version; currently iterates over utterances serially)
27
+
28
+ Args:
29
+ metadata (dict): dictionary that stores data in train.json and test.json files
30
+ dataset_output (str): directory to store acoustic features
31
+ cfg (dict): dictionary that stores configurations
32
+ n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
33
+
34
+ Returns:
35
+ None: the extracted features are written to disk under dataset_output
36
+ """
37
+ for utt in tqdm(metadata):
38
+ if cfg.task_type == "tts":
39
+ extract_utt_acoustic_features_tts(dataset_output, cfg, utt)
40
+ if cfg.task_type == "svc":
41
+ extract_utt_acoustic_features_svc(dataset_output, cfg, utt)
42
+ if cfg.task_type == "vocoder":
43
+ extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt)
44
+ if cfg.task_type == "tta":
45
+ extract_utt_acoustic_features_tta(dataset_output, cfg, utt)
46
+
47
+
48
+ def avg_phone_feature(feature, duration, interpolation=False):
49
+ feature = feature[: sum(duration)]
50
+ if interpolation:
51
+ nonzero_ids = np.where(feature != 0)[0]
52
+ interp_fn = interp1d(
53
+ nonzero_ids,
54
+ feature[nonzero_ids],
55
+ fill_value=(feature[nonzero_ids[0]], feature[nonzero_ids[-1]]),
56
+ bounds_error=False,
57
+ )
58
+ feature = interp_fn(np.arange(0, len(feature)))
59
+
60
+ # Phoneme-level average
61
+ pos = 0
62
+ for i, d in enumerate(duration):
63
+ if d > 0:
64
+ feature[i] = np.mean(feature[pos : pos + d])
65
+ else:
66
+ feature[i] = 0
67
+ pos += d
68
+ feature = feature[: len(duration)]
69
+ return feature
70
+
71
+
72
+ def extract_utt_acoustic_features_serial(metadata, dataset_output, cfg):
73
+ """Extract acoustic features from utterances (in single process)
74
+
75
+ Args:
76
+ metadata (dict): dictionary that stores data in train.json and test.json files
77
+ dataset_output (str): directory to store acoustic features
78
+ cfg (dict): dictionary that stores configurations
79
+
80
+ """
81
+ for utt in tqdm(metadata):
82
+ if cfg.task_type == "tts":
83
+ extract_utt_acoustic_features_tts(dataset_output, cfg, utt)
84
+ if cfg.task_type == "svc":
85
+ extract_utt_acoustic_features_svc(dataset_output, cfg, utt)
86
+ if cfg.task_type == "vocoder":
87
+ extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt)
88
+ if cfg.task_type == "tta":
89
+ extract_utt_acoustic_features_tta(dataset_output, cfg, utt)
90
+
91
+
92
+ def __extract_utt_acoustic_features(dataset_output, cfg, utt):
93
+ """Extract acoustic features from utterances (in single process)
94
+
95
+ Args:
96
+ dataset_output (str): directory to store acoustic features
97
+ cfg (dict): dictionary that stores configurations
98
+ utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
99
+ path to utterance, duration, utterance index
100
+
101
+ """
102
+ from utils import audio, f0, world, duration
103
+
104
+ uid = utt["Uid"]
105
+ wav_path = utt["Path"]
106
+ if os.path.exists(os.path.join(dataset_output, cfg.preprocess.raw_data)):
107
+ wav_path = os.path.join(
108
+ dataset_output, cfg.preprocess.raw_data, utt["Singer"], uid + ".wav"
109
+ )
110
+
111
+ with torch.no_grad():
112
+ # Load audio data into tensor with sample rate of the config file
113
+ wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
114
+ wav = wav_torch.cpu().numpy()
115
+
116
+ # extract features
117
+ if cfg.preprocess.extract_duration:
118
+ durations, phones, start, end = duration.get_duration(
119
+ utt, wav, cfg.preprocess
120
+ )
121
+ save_feature(dataset_output, cfg.preprocess.duration_dir, uid, durations)
122
+ save_txt(dataset_output, cfg.preprocess.lab_dir, uid, phones)
123
+ wav = wav[start:end].astype(np.float32)
124
+ wav_torch = torch.from_numpy(wav).to(wav_torch.device)
125
+
126
+ if cfg.preprocess.extract_linear_spec:
127
+ from utils.mel import extract_linear_features
128
+
129
+ linear = extract_linear_features(wav_torch.unsqueeze(0), cfg.preprocess)
130
+ save_feature(
131
+ dataset_output, cfg.preprocess.linear_dir, uid, linear.cpu().numpy()
132
+ )
133
+
134
+ if cfg.preprocess.extract_mel:
135
+ from utils.mel import extract_mel_features
136
+
137
+ if cfg.preprocess.mel_extract_mode == "taco":
138
+ _stft = TacotronSTFT(
139
+ sampling_rate=cfg.preprocess.sample_rate,
140
+ win_length=cfg.preprocess.win_size,
141
+ hop_length=cfg.preprocess.hop_size,
142
+ filter_length=cfg.preprocess.n_fft,
143
+ n_mel_channels=cfg.preprocess.n_mel,
144
+ mel_fmin=cfg.preprocess.fmin,
145
+ mel_fmax=cfg.preprocess.fmax,
146
+ )
147
+ mel = extract_mel_features(
148
+ wav_torch.unsqueeze(0), cfg.preprocess, taco=True, _stft=_stft
149
+ )
150
+ if cfg.preprocess.extract_duration:
151
+ mel = mel[:, : sum(durations)]
152
+ else:
153
+ mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
154
+ save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())
155
+
156
+ if cfg.preprocess.extract_energy:
157
+ if (
158
+ cfg.preprocess.energy_extract_mode == "from_mel"
159
+ and cfg.preprocess.extract_mel
160
+ ):
161
+ energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
162
+ elif cfg.preprocess.energy_extract_mode == "from_waveform":
163
+ energy = audio.energy(wav, cfg.preprocess)
164
+ elif cfg.preprocess.energy_extract_mode == "from_tacotron_stft":
165
+ _stft = TacotronSTFT(
166
+ sampling_rate=cfg.preprocess.sample_rate,
167
+ win_length=cfg.preprocess.win_size,
168
+ hop_length=cfg.preprocess.hop_size,
169
+ filter_length=cfg.preprocess.n_fft,
170
+ n_mel_channels=cfg.preprocess.n_mel,
171
+ mel_fmin=cfg.preprocess.fmin,
172
+ mel_fmax=cfg.preprocess.fmax,
173
+ )
174
+ _, energy = audio.get_energy_from_tacotron(wav, _stft)
175
+ else:
176
+ assert cfg.preprocess.energy_extract_mode in [
177
+ "from_mel",
178
+ "from_waveform",
179
+ "from_tacotron_stft",
180
+ ], f"{cfg.preprocess.energy_extract_mode} not in supported energy_extract_mode [from_mel, from_waveform, from_tacotron_stft]"
181
+ if cfg.preprocess.extract_duration:
182
+ energy = energy[: sum(durations)]
183
+ phone_energy = avg_phone_feature(energy, durations)
184
+ save_feature(
185
+ dataset_output, cfg.preprocess.phone_energy_dir, uid, phone_energy
186
+ )
187
+
188
+ save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)
189
+
190
+ if cfg.preprocess.extract_pitch:
191
+ pitch = f0.get_f0(wav, cfg.preprocess)
192
+ if cfg.preprocess.extract_duration:
193
+ pitch = pitch[: sum(durations)]
194
+ phone_pitch = avg_phone_feature(pitch, durations, interpolation=True)
195
+ save_feature(
196
+ dataset_output, cfg.preprocess.phone_pitch_dir, uid, phone_pitch
197
+ )
198
+ save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)
199
+
200
+ if cfg.preprocess.extract_uv:
201
+ assert isinstance(pitch, np.ndarray)
202
+ uv = pitch != 0
203
+ save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)
204
+
205
+ if cfg.preprocess.extract_audio:
206
+ save_feature(dataset_output, cfg.preprocess.audio_dir, uid, wav)
207
+
208
+ if cfg.preprocess.extract_label:
209
+ if cfg.preprocess.is_mu_law:
210
+ # compress audio
211
+ wav = compress(wav, cfg.preprocess.bits)
212
+ label = audio_to_label(wav, cfg.preprocess.bits)
213
+ save_feature(dataset_output, cfg.preprocess.label_dir, uid, label)
214
+
215
+ if cfg.preprocess.extract_acoustic_token:
216
+ if cfg.preprocess.acoustic_token_extractor == "Encodec":
217
+ codes = extract_encodec_token(wav_path)
218
+ save_feature(
219
+ dataset_output, cfg.preprocess.acoustic_token_dir, uid, codes
220
+ )
221
+
222
+
223
+ def extract_utt_acoustic_features_tts(dataset_output, cfg, utt):
224
+ __extract_utt_acoustic_features(dataset_output, cfg, utt)
225
+
226
+
227
+ def extract_utt_acoustic_features_svc(dataset_output, cfg, utt):
228
+ """Extract acoustic features from utterances (in single process)
229
+
230
+ Args:
231
+ dataset_output (str): directory to store acoustic features
232
+ cfg (dict): dictionary that stores configurations
233
+ utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
234
+ path to utterance, duration, utterance index
235
+
236
+ """
237
+ from utils import audio, f0, world, duration
238
+
239
+ uid = utt["Uid"]
240
+ wav_path = utt["Path"]
241
+
242
+ with torch.no_grad():
243
+ # Load audio data into tensor with sample rate of the config file
244
+ wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
245
+ wav = wav_torch.cpu().numpy()
246
+
247
+ # extract features
248
+ if cfg.preprocess.extract_mel:
249
+ from utils.mel import extract_mel_features
250
+
251
+ mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
252
+ save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())
253
+
254
+ if cfg.preprocess.extract_energy:
255
+ energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
256
+ save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)
257
+
258
+ if cfg.preprocess.extract_pitch:
259
+ pitch = f0.get_f0(wav, cfg.preprocess)
260
+ save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)
261
+
262
+ if cfg.preprocess.extract_uv:
263
+ assert isinstance(pitch, np.ndarray)
264
+ uv = pitch != 0
265
+ save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)
266
+
267
+
268
+ def extract_utt_acoustic_features_tta(dataset_output, cfg, utt):
269
+ __extract_utt_acoustic_features(dataset_output, cfg, utt)
270
+
271
+
272
+ def extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt):
273
+ """Extract acoustic features from utterances (in single process)
274
+
275
+ Args:
276
+ dataset_output (str): directory to store acoustic features
277
+ cfg (dict): dictionary that stores configurations
278
+ utt (dict): utterance info including dataset, singer, uid:{singer}_{song}_{index},
279
+ path to utterance, duration, utterance index
280
+
281
+ """
282
+ from utils import audio, f0, world, duration
283
+
284
+ uid = utt["Uid"]
285
+ wav_path = utt["Path"]
286
+
287
+ with torch.no_grad():
288
+ # Load audio data into tensor with sample rate of the config file
289
+ wav_torch, _ = audio.load_audio_torch(wav_path, cfg.preprocess.sample_rate)
290
+ wav = wav_torch.cpu().numpy()
291
+
292
+ # extract features
293
+ if cfg.preprocess.extract_mel:
294
+ from utils.mel import extract_mel_features
295
+
296
+ mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
297
+ save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())
298
+
299
+ if cfg.preprocess.extract_energy:
300
+ if (
301
+ cfg.preprocess.energy_extract_mode == "from_mel"
302
+ and cfg.preprocess.extract_mel
303
+ ):
304
+ energy = (mel.exp() ** 2).sum(0).sqrt().cpu().numpy()
305
+ elif cfg.preprocess.energy_extract_mode == "from_waveform":
306
+ energy = audio.energy(wav, cfg.preprocess)
307
+ else:
308
+ assert cfg.preprocess.energy_extract_mode in [
309
+ "from_mel",
310
+ "from_waveform",
311
+ ], f"{cfg.preprocess.energy_extract_mode} not in supported energy_extract_mode [from_mel, from_waveform, from_tacotron_stft]"
312
+
313
+ save_feature(dataset_output, cfg.preprocess.energy_dir, uid, energy)
314
+
315
+ if cfg.preprocess.extract_pitch:
316
+ pitch = f0.get_f0(wav, cfg.preprocess)
317
+ save_feature(dataset_output, cfg.preprocess.pitch_dir, uid, pitch)
318
+
319
+ if cfg.preprocess.extract_uv:
320
+ assert isinstance(pitch, np.ndarray)
321
+ uv = pitch != 0
322
+ save_feature(dataset_output, cfg.preprocess.uv_dir, uid, uv)
323
+
324
+ if cfg.preprocess.extract_audio:
325
+ save_feature(dataset_output, cfg.preprocess.audio_dir, uid, wav)
326
+
327
+ if cfg.preprocess.extract_label:
328
+ if cfg.preprocess.is_mu_law:
329
+ # compress audio
330
+ wav = compress(wav, cfg.preprocess.bits)
331
+ label = audio_to_label(wav, cfg.preprocess.bits)
332
+ save_feature(dataset_output, cfg.preprocess.label_dir, uid, label)
333
+
334
+
335
+ def cal_normalized_mel(mel, dataset_name, cfg):
336
+ mel_min, mel_max = load_mel_extrema(cfg, dataset_name)
337
+ mel_norm = normalize_mel_channel(mel, mel_min, mel_max)
338
+ return mel_norm
339
+
340
+
341
+ def cal_mel_min_max(dataset, output_path, cfg, metadata=None):
342
+ dataset_output = os.path.join(output_path, dataset)
343
+
344
+ if metadata is None:
345
+ metadata = []
346
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
347
+ dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
348
+ with open(dataset_file, "r") as f:
349
+ metadata.extend(json.load(f))
350
+
351
+ tmp_mel_min = []
352
+ tmp_mel_max = []
353
+ for item in metadata:
354
+ mel_path = os.path.join(
355
+ dataset_output, cfg.preprocess.mel_dir, item["Uid"] + ".npy"
356
+ )
357
+ if not os.path.exists(mel_path):
358
+ continue
359
+ mel = np.load(mel_path)
360
+ if mel.shape[0] != cfg.preprocess.n_mel:
361
+ mel = mel.T
362
+ # mel: (n_mels, T)
363
+ assert mel.shape[0] == cfg.preprocess.n_mel
364
+
365
+ tmp_mel_min.append(np.min(mel, axis=-1))
366
+ tmp_mel_max.append(np.max(mel, axis=-1))
367
+
368
+ mel_min = np.min(tmp_mel_min, axis=0)
369
+ mel_max = np.max(tmp_mel_max, axis=0)
370
+
371
+ ## save mel min max data
372
+ mel_min_max_dir = os.path.join(dataset_output, cfg.preprocess.mel_min_max_stats_dir)
373
+ os.makedirs(mel_min_max_dir, exist_ok=True)
374
+
375
+ mel_min_path = os.path.join(mel_min_max_dir, "mel_min.npy")
376
+ mel_max_path = os.path.join(mel_min_max_dir, "mel_max.npy")
377
+ np.save(mel_min_path, mel_min)
378
+ np.save(mel_max_path, mel_max)
379
+
380
+
381
+ def denorm_for_pred_mels(cfg, dataset_name, split, pred):
382
+ """
383
+ Args:
384
+ pred: a list whose every element is (frame_len, n_mels)
385
+ Return:
386
+ similar like pred
387
+ """
388
+ mel_min, mel_max = load_mel_extrema(cfg.preprocess, dataset_name)
389
+ recovered_mels = [
390
+ denormalize_mel_channel(mel.T, mel_min, mel_max).T for mel in pred
391
+ ]
392
+
393
+ return recovered_mels
394
+
395
+
396
+ def load_mel_extrema(cfg, dataset_name):
397
+ data_dir = os.path.join(cfg.processed_dir, dataset_name, cfg.mel_min_max_stats_dir)
398
+
399
+ min_file = os.path.join(data_dir, "mel_min.npy")
400
+ max_file = os.path.join(data_dir, "mel_max.npy")
401
+
402
+ mel_min = np.load(min_file)
403
+ mel_max = np.load(max_file)
404
+
405
+ return mel_min, mel_max
406
+
407
+
408
+ def denormalize_mel_channel(mel, mel_min, mel_max):
409
+ mel_min = np.expand_dims(mel_min, -1)
410
+ mel_max = np.expand_dims(mel_max, -1)
411
+ return (mel + 1) / 2 * (mel_max - mel_min + ZERO) + mel_min
412
+
413
+
414
+ def normalize_mel_channel(mel, mel_min, mel_max):
415
+ mel_min = np.expand_dims(mel_min, -1)
416
+ mel_max = np.expand_dims(mel_max, -1)
417
+ return (mel - mel_min) / (mel_max - mel_min + ZERO) * 2 - 1
418
+
419
+
420
+ def normalize(dataset, feat_dir, cfg):
421
+ dataset_output = os.path.join(cfg.preprocess.processed_dir, dataset)
422
+ print(f"normalize {feat_dir}")
423
+
424
+ max_value = np.finfo(np.float64).min
425
+ min_value = np.finfo(np.float64).max
426
+
427
+ scaler = StandardScaler()
428
+ feat_files = os.listdir(os.path.join(dataset_output, feat_dir))
429
+
430
+ for feat_file in tqdm(feat_files):
431
+ feat_file = os.path.join(dataset_output, feat_dir, feat_file)
432
+ if not feat_file.endswith(".npy"):
433
+ continue
434
+ feat = np.load(feat_file)
435
+ max_value = max(max_value, max(feat))
436
+ min_value = min(min_value, min(feat))
437
+ scaler.partial_fit(feat.reshape((-1, 1)))
438
+ mean = scaler.mean_[0]
439
+ std = scaler.scale_[0]
440
+ stat = np.array([min_value, max_value, mean, std])
441
+ stat_npy = os.path.join(dataset_output, f"{feat_dir}_stat.npy")
442
+ np.save(stat_npy, stat)
443
+ return mean, std, min_value, max_value
444
+
445
+
446
+ def load_normalized(feat_dir, dataset_name, cfg):
447
+ dataset_output = os.path.join(cfg.preprocess.processed_dir, dataset_name)
448
+ stat_npy = os.path.join(dataset_output, f"{feat_dir}_stat.npy")
449
+ min_value, max_value, mean, std = np.load(stat_npy)
450
+ return mean, std, min_value, max_value
451
+
452
+
453
+ def cal_pitch_statistics_svc(dataset, output_path, cfg, metadata=None):
454
+ # path of dataset
455
+ dataset_dir = os.path.join(output_path, dataset)
456
+ save_dir = os.path.join(dataset_dir, cfg.preprocess.pitch_dir)
457
+ os.makedirs(save_dir, exist_ok=True)
458
+ if has_existed(os.path.join(save_dir, "statistics.json")):
459
+ return
460
+
461
+ if metadata is None:
462
+ # load singers and ids
463
+ singers = json.load(open(os.path.join(dataset_dir, "singers.json"), "r"))
464
+
465
+ # combine train and test metadata
466
+ metadata = []
467
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
468
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
469
+ with open(dataset_file, "r") as f:
470
+ metadata.extend(json.load(f))
471
+ else:
472
+ singers = list(set([item["Singer"] for item in metadata]))
473
+ singers = {
474
+ "{}_{}".format(dataset, name): idx for idx, name in enumerate(singers)
475
+ }
476
+
477
+ # use different scalers for each singer
478
+ pitch_scalers = [[] for _ in range(len(singers))]
479
+ total_pitch_scalers = [[] for _ in range(len(singers))]
480
+
481
+ for utt_info in tqdm(metadata, desc="Loading F0..."):
482
+ # utt = f'{utt_info["Dataset"]}_{utt_info["Uid"]}'
483
+ singer = utt_info["Singer"]
484
+ pitch_path = os.path.join(
485
+ dataset_dir, cfg.preprocess.pitch_dir, utt_info["Uid"] + ".npy"
486
+ )
487
+ # total_pitch contains all pitch including unvoiced frames
488
+ if not os.path.exists(pitch_path):
489
+ continue
490
+ total_pitch = np.load(pitch_path)
491
+ assert len(total_pitch) > 0
492
+ # pitch contains only voiced frames
493
+ pitch = total_pitch[total_pitch != 0]
494
+ spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]
495
+
496
+ # update pitch scalers
497
+ pitch_scalers[spkid].extend(pitch.tolist())
498
+ # update total pitch scalers
499
+ total_pitch_scalers[spkid].extend(total_pitch.tolist())
500
+
501
+ # save pitch statistics for each singer in dict
502
+ sta_dict = {}
503
+ for singer in tqdm(singers, desc="Singers statistics"):
504
+ spkid = singers[singer]
505
+ # voiced pitch statistics
506
+ mean, std, min, max, median = (
507
+ np.mean(pitch_scalers[spkid]),
508
+ np.std(pitch_scalers[spkid]),
509
+ np.min(pitch_scalers[spkid]),
510
+ np.max(pitch_scalers[spkid]),
511
+ np.median(pitch_scalers[spkid]),
512
+ )
513
+
514
+ # total pitch statistics
515
+ mean_t, std_t, min_t, max_t, median_t = (
516
+ np.mean(total_pitch_scalers[spkid]),
517
+ np.std(total_pitch_scalers[spkid]),
518
+ np.min(total_pitch_scalers[spkid]),
519
+ np.max(total_pitch_scalers[spkid]),
520
+ np.median(total_pitch_scalers[spkid]),
521
+ )
522
+ sta_dict[singer] = {
523
+ "voiced_positions": {
524
+ "mean": mean,
525
+ "std": std,
526
+ "median": median,
527
+ "min": min,
528
+ "max": max,
529
+ },
530
+ "total_positions": {
531
+ "mean": mean_t,
532
+ "std": std_t,
533
+ "median": median_t,
534
+ "min": min_t,
535
+ "max": max_t,
536
+ },
537
+ }
538
+
539
+ # save statistics
540
+ with open(os.path.join(save_dir, "statistics.json"), "w") as f:
541
+ json.dump(sta_dict, f, indent=4, ensure_ascii=False)
542
+
543
+
544
+ def cal_pitch_statistics(dataset, output_path, cfg):
545
+ # path of dataset
546
+ dataset_dir = os.path.join(output_path, dataset)
547
+ if cfg.preprocess.use_phone_pitch:
548
+ pitch_dir = cfg.preprocess.phone_pitch_dir
549
+ else:
550
+ pitch_dir = cfg.preprocess.pitch_dir
551
+ save_dir = os.path.join(dataset_dir, pitch_dir)
552
+
553
+ os.makedirs(save_dir, exist_ok=True)
554
+ if has_existed(os.path.join(save_dir, "statistics.json")):
555
+ return
556
+ # load singers and ids
557
+ singers = json.load(open(os.path.join(dataset_dir, "singers.json"), "r"))
558
+
559
+ # combine train and test metadata
560
+ metadata = []
561
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
562
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
563
+ with open(dataset_file, "r") as f:
564
+ metadata.extend(json.load(f))
565
+
566
+ # use different scalers for each singer
567
+ pitch_scalers = [[] for _ in range(len(singers))]
568
+ total_pitch_scalers = [[] for _ in range(len(singers))]
569
+
570
+ for utt_info in metadata:
571
+ utt = f'{utt_info["Dataset"]}_{utt_info["Uid"]}'
572
+ singer = utt_info["Singer"]
573
+ pitch_path = os.path.join(dataset_dir, pitch_dir, utt_info["Uid"] + ".npy")
574
+ # total_pitch contains all pitch including unvoiced frames
575
+ if not os.path.exists(pitch_path):
576
+ continue
577
+ total_pitch = np.load(pitch_path)
578
+ assert len(total_pitch) > 0
579
+ # pitch contains only voiced frames
580
+ pitch = total_pitch[total_pitch != 0]
581
+ if cfg.preprocess.pitch_remove_outlier:
582
+ pitch = remove_outlier(total_pitch)
583
+ spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]
584
+
585
+ # update pitch scalers
586
+ pitch_scalers[spkid].extend(pitch.tolist())
587
+ # update total pitch scalers
588
+ total_pitch_scalers[spkid].extend(total_pitch.tolist())
589
+
590
+ # save pitch statistics for each singer in dict
591
+ sta_dict = {}
592
+ for singer in singers:
593
+ spkid = singers[singer]
594
+ # voiced pitch statistics
595
+ mean, std, min, max, median = (
596
+ np.mean(pitch_scalers[spkid]),
597
+ np.std(pitch_scalers[spkid]),
598
+ np.min(pitch_scalers[spkid]),
599
+ np.max(pitch_scalers[spkid]),
600
+ np.median(pitch_scalers[spkid]),
601
+ )
602
+
603
+ # total pitch statistics
604
+ mean_t, std_t, min_t, max_t, median_t = (
605
+ np.mean(total_pitch_scalers[spkid]),
606
+ np.std(total_pitch_scalers[spkid]),
607
+ np.min(total_pitch_scalers[spkid]),
608
+ np.max(total_pitch_scalers[spkid]),
609
+ np.median(total_pitch_scalers[spkid]),
610
+ )
611
+ sta_dict[singer] = {
612
+ "voiced_positions": {
613
+ "mean": mean,
614
+ "std": std,
615
+ "median": median,
616
+ "min": min,
617
+ "max": max,
618
+ },
619
+ "total_positions": {
620
+ "mean": mean_t,
621
+ "std": std_t,
622
+ "median": median_t,
623
+ "min": min_t,
624
+ "max": max_t,
625
+ },
626
+ }
627
+
628
+ # save statistics
629
+ with open(os.path.join(save_dir, "statistics.json"), "w") as f:
630
+ json.dump(sta_dict, f, indent=4, ensure_ascii=False)
631
+
632
+
633
+ def cal_energy_statistics(dataset, output_path, cfg):
634
+ # path of dataset
635
+ dataset_dir = os.path.join(output_path, dataset)
636
+ if cfg.preprocess.use_phone_energy:
637
+ energy_dir = cfg.preprocess.phone_energy_dir
638
+ else:
639
+ energy_dir = cfg.preprocess.energy_dir
640
+ save_dir = os.path.join(dataset_dir, energy_dir)
641
+ os.makedirs(save_dir, exist_ok=True)
642
+ print(os.path.join(save_dir, "statistics.json"))
643
+ if has_existed(os.path.join(save_dir, "statistics.json")):
644
+ return
645
+ # load singers and ids
646
+ singers = json.load(open(os.path.join(dataset_dir, "singers.json"), "r"))
647
+
648
+ # combine train and test metadata
649
+ metadata = []
650
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
651
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
652
+ with open(dataset_file, "r") as f:
653
+ metadata.extend(json.load(f))
654
+
655
+ # use different scalers for each singer
656
+ energy_scalers = [[] for _ in range(len(singers))]
657
+ total_energy_scalers = [[] for _ in range(len(singers))]
658
+
659
+ for utt_info in metadata:
660
+ utt = f'{utt_info["Dataset"]}_{utt_info["Uid"]}'
661
+ singer = utt_info["Singer"]
662
+ energy_path = os.path.join(dataset_dir, energy_dir, utt_info["Uid"] + ".npy")
663
+ # total_energy contains all energy including unvoiced frames
664
+ if not os.path.exists(energy_path):
665
+ continue
666
+ total_energy = np.load(energy_path)
667
+ assert len(total_energy) > 0
668
+ # energy contains only voiced frames
669
+ energy = total_energy[total_energy != 0]
670
+ if cfg.preprocess.energy_remove_outlier:
671
+ energy = remove_outlier(total_energy)
672
+ spkid = singers[f"{replace_augment_name(dataset)}_{singer}"]
673
+
674
+ # update energy scalers
675
+ energy_scalers[spkid].extend(energy.tolist())
676
+ # update total energy scalers
677
+ total_energy_scalers[spkid].extend(total_energy.tolist())
678
+
679
+ # save energy statistics for each singer in dict
680
+ sta_dict = {}
681
+ for singer in singers:
682
+ spkid = singers[singer]
683
+ # voiced energy statistics
684
+ mean, std, min, max, median = (
685
+ np.mean(energy_scalers[spkid]),
686
+ np.std(energy_scalers[spkid]),
687
+ np.min(energy_scalers[spkid]),
688
+ np.max(energy_scalers[spkid]),
689
+ np.median(energy_scalers[spkid]),
690
+ )
691
+
692
+ # total energy statistics
693
+ mean_t, std_t, min_t, max_t, median_t = (
694
+ np.mean(total_energy_scalers[spkid]),
695
+ np.std(total_energy_scalers[spkid]),
696
+ np.min(total_energy_scalers[spkid]),
697
+ np.max(total_energy_scalers[spkid]),
698
+ np.median(total_energy_scalers[spkid]),
699
+ )
700
+ sta_dict[singer] = {
701
+ "voiced_positions": {
702
+ "mean": mean,
703
+ "std": std,
704
+ "median": median,
705
+ "min": min,
706
+ "max": max,
707
+ },
708
+ "total_positions": {
709
+ "mean": mean_t,
710
+ "std": std_t,
711
+ "median": median_t,
712
+ "min": min_t,
713
+ "max": max_t,
714
+ },
715
+ }
716
+
717
+ # save statistics
718
+ with open(os.path.join(save_dir, "statistics.json"), "w") as f:
719
+ json.dump(sta_dict, f, indent=4, ensure_ascii=False)
720
+
721
+
722
+ def copy_acoustic_features(metadata, dataset_dir, src_dataset_dir, cfg):
723
+ """Copy acoustic features from src_dataset_dir to dataset_dir
724
+
725
+ Args:
726
+ metadata (dict): dictionary that stores data in train.json and test.json files
727
+ dataset_dir (str): destination directory where soft links to the features are created
728
+ src_dataset_dir (str): source directory that already contains the extracted features
729
+ cfg (dict): dictionary that stores configurations
730
+
731
+ """
732
+
733
+ if cfg.preprocess.extract_mel:
734
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.mel_dir)):
735
+ os.makedirs(
736
+ os.path.join(dataset_dir, cfg.preprocess.mel_dir), exist_ok=True
737
+ )
738
+ print(
739
+ "Copying mel features from {} to {}...".format(
740
+ src_dataset_dir, dataset_dir
741
+ )
742
+ )
743
+ for utt_info in tqdm(metadata):
744
+ src_mel_path = os.path.join(
745
+ src_dataset_dir, cfg.preprocess.mel_dir, utt_info["Uid"] + ".npy"
746
+ )
747
+ dst_mel_path = os.path.join(
748
+ dataset_dir, cfg.preprocess.mel_dir, utt_info["Uid"] + ".npy"
749
+ )
750
+ # create soft-links
751
+ if not os.path.exists(dst_mel_path):
752
+ os.symlink(src_mel_path, dst_mel_path)
753
+ if cfg.preprocess.extract_energy:
754
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.energy_dir)):
755
+ os.makedirs(
756
+ os.path.join(dataset_dir, cfg.preprocess.energy_dir), exist_ok=True
757
+ )
758
+ print(
759
+ "Copying energy features from {} to {}...".format(
760
+ src_dataset_dir, dataset_dir
761
+ )
762
+ )
763
+ for utt_info in tqdm(metadata):
764
+ src_energy_path = os.path.join(
765
+ src_dataset_dir, cfg.preprocess.energy_dir, utt_info["Uid"] + ".npy"
766
+ )
767
+ dst_energy_path = os.path.join(
768
+ dataset_dir, cfg.preprocess.energy_dir, utt_info["Uid"] + ".npy"
769
+ )
770
+ # create soft-links
771
+ if not os.path.exists(dst_energy_path):
772
+ os.symlink(src_energy_path, dst_energy_path)
773
+ if cfg.preprocess.extract_pitch:
774
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.pitch_dir)):
775
+ os.makedirs(
776
+ os.path.join(dataset_dir, cfg.preprocess.pitch_dir), exist_ok=True
777
+ )
778
+ print(
779
+ "Copying pitch features from {} to {}...".format(
780
+ src_dataset_dir, dataset_dir
781
+ )
782
+ )
783
+ for utt_info in tqdm(metadata):
784
+ src_pitch_path = os.path.join(
785
+ src_dataset_dir, cfg.preprocess.pitch_dir, utt_info["Uid"] + ".npy"
786
+ )
787
+ dst_pitch_path = os.path.join(
788
+ dataset_dir, cfg.preprocess.pitch_dir, utt_info["Uid"] + ".npy"
789
+ )
790
+ # create soft-links
791
+ if not os.path.exists(dst_pitch_path):
792
+ os.symlink(src_pitch_path, dst_pitch_path)
793
+ if cfg.preprocess.extract_uv:
794
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.uv_dir)):
795
+ os.makedirs(
796
+ os.path.join(dataset_dir, cfg.preprocess.uv_dir), exist_ok=True
797
+ )
798
+ print(
799
+ "Copying uv features from {} to {}...".format(
800
+ src_dataset_dir, dataset_dir
801
+ )
802
+ )
803
+ for utt_info in tqdm(metadata):
804
+ src_uv_path = os.path.join(
805
+ src_dataset_dir, cfg.preprocess.uv_dir, utt_info["Uid"] + ".npy"
806
+ )
807
+ dst_uv_path = os.path.join(
808
+ dataset_dir, cfg.preprocess.uv_dir, utt_info["Uid"] + ".npy"
809
+ )
810
+ # create soft-links
811
+ if not os.path.exists(dst_uv_path):
812
+ os.symlink(src_uv_path, dst_uv_path)
813
+ if cfg.preprocess.extract_audio:
814
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.audio_dir)):
815
+ os.makedirs(
816
+ os.path.join(dataset_dir, cfg.preprocess.audio_dir), exist_ok=True
817
+ )
818
+ print(
819
+ "Copying audio features from {} to {}...".format(
820
+ src_dataset_dir, dataset_dir
821
+ )
822
+ )
823
+ for utt_info in tqdm(metadata):
824
+ src_audio_path = os.path.join(
825
+ src_dataset_dir, cfg.preprocess.audio_dir, utt_info["Uid"] + ".npy"
826
+ )
827
+ dst_audio_path = os.path.join(
828
+ dataset_dir, cfg.preprocess.audio_dir, utt_info["Uid"] + ".npy"
829
+ )
830
+ # create soft-links
831
+ if not os.path.exists(dst_audio_path):
832
+ os.symlink(src_audio_path, dst_audio_path)
833
+ if cfg.preprocess.extract_label:
834
+ if not has_existed(os.path.join(dataset_dir, cfg.preprocess.label_dir)):
835
+ os.makedirs(
836
+ os.path.join(dataset_dir, cfg.preprocess.label_dir), exist_ok=True
837
+ )
838
+ print(
839
+ "Copying label features from {} to {}...".format(
840
+ src_dataset_dir, dataset_dir
841
+ )
842
+ )
843
+ for utt_info in tqdm(metadata):
844
+ src_label_path = os.path.join(
845
+ src_dataset_dir, cfg.preprocess.label_dir, utt_info["Uid"] + ".npy"
846
+ )
847
+ dst_label_path = os.path.join(
848
+ dataset_dir, cfg.preprocess.label_dir, utt_info["Uid"] + ".npy"
849
+ )
850
+ # create soft-links
851
+ if not os.path.exists(dst_label_path):
852
+ os.symlink(src_label_path, dst_label_path)
853
+
854
+
855
+ def align_duration_mel(dataset, output_path, cfg):
856
+ print("align the duration and mel")
857
+
858
+ dataset_dir = os.path.join(output_path, dataset)
859
+ metadata = []
860
+ for dataset_type in ["train", "test"] if "eval" not in dataset else ["test"]:
861
+ dataset_file = os.path.join(dataset_dir, "{}.json".format(dataset_type))
862
+ with open(dataset_file, "r") as f:
863
+ metadata.extend(json.load(f))
864
+
865
+ utt2dur = {}
866
+ for index in tqdm(range(len(metadata))):
867
+ utt_info = metadata[index]
868
+ dataset = utt_info["Dataset"]
869
+ uid = utt_info["Uid"]
870
+ utt = "{}_{}".format(dataset, uid)
871
+
872
+ mel_path = os.path.join(dataset_dir, cfg.preprocess.mel_dir, uid + ".npy")
873
+ mel = np.load(mel_path).transpose(1, 0)
874
+ duration_path = os.path.join(
875
+ dataset_dir, cfg.preprocess.duration_dir, uid + ".npy"
876
+ )
877
+ duration = np.load(duration_path)
878
+ if sum(duration) != mel.shape[0]:
879
+ duration_sum = sum(duration)
880
+ mel_len = mel.shape[0]
881
+ mismatch = abs(duration_sum - mel_len)
882
+ assert mismatch <= 5, "duration and mel length mismatch!"
883
+ cloned = np.array(duration, copy=True)
884
+ if duration_sum > mel_len:
885
+ for j in range(1, len(duration) - 1):
886
+ if mismatch == 0:
887
+ break
888
+ dur_val = cloned[-j]
889
+ if dur_val >= mismatch:
890
+ cloned[-j] -= mismatch
891
+ mismatch -= dur_val
892
+ break
893
+ else:
894
+ cloned[-j] = 0
895
+ mismatch -= dur_val
896
+
897
+ elif duration_sum < mel_len:
898
+ cloned[-1] += mismatch
899
+ duration = cloned
900
+ utt2dur[utt] = duration
901
+ np.save(duration_path, duration)
902
+
903
+ return utt2dur
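A quick usage sketch (not part of the commit) of the mel min/max normalization helpers defined in this file. It assumes the Amphion repository root and its dependencies are importable; the random mel and the shapes are purely illustrative.

import numpy as np
from processors.acoustic_extractor import normalize_mel_channel, denormalize_mel_channel

# fake (n_mels, T) log-mel and its per-channel extrema, as computed in cal_mel_min_max
mel = np.random.uniform(-6.0, 1.0, size=(80, 200))
mel_min = np.min(mel, axis=-1)
mel_max = np.max(mel, axis=-1)

# map each channel into [-1, 1], then invert the mapping
mel_norm = normalize_mel_channel(mel, mel_min, mel_max)
mel_rec = denormalize_mel_channel(mel_norm, mel_min, mel_max)

assert mel_norm.min() >= -1.0 and mel_norm.max() <= 1.0
assert np.allclose(mel, mel_rec, atol=1e-6)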
processors/content_extractor.py ADDED
@@ -0,0 +1,540 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import torch
8
+ import numpy as np
9
+ import yaml
10
+ import copy
11
+ from tqdm import tqdm
12
+ from torchaudio.compliance import kaldi
13
+ from torch.nn.utils.rnn import pad_sequence
14
+ from torch.utils.data import DataLoader
15
+ from fairseq import checkpoint_utils
16
+ from transformers import AutoModel, Wav2Vec2FeatureExtractor
17
+
18
+ from utils.io_optim import (
19
+ TorchaudioDataset,
20
+ LibrosaDataset,
21
+ FFmpegDataset,
22
+ collate_batch,
23
+ )
24
+ from modules import whisper_extractor as whisper
25
+ from modules.wenet_extractor.utils.init_model import init_model
26
+ from modules.wenet_extractor.utils.checkpoint import load_checkpoint
27
+
28
+ """
29
+ Extractor for content features
30
+ 1. whisper
31
+ 2. contentvec
32
+ 3. wenet
33
+ 4. mert
34
+
35
+ Pipeline:
36
+ in preprocess.py:
37
+ call extract_utt_content_features() to extract content features for each utterance
38
+ extract_utt_content_features() envelopes the following steps:
39
+ 1. load the model (whisper, contentvec, wenet)
40
+ 2. extract the content features
41
+ 3. save the content features into files
42
+ in svc_dataset.py:
43
+ call offline_align() to align the content features to the given target length
44
+
45
+ """
46
+
47
+ """
48
+ Extractor Usage:
49
+ 1. initialize an instance of extractor
50
+ extractor = WhisperExtractor(cfg)
51
+ 2. load the specified model
52
+ extractor.load_model()
53
+ 3. extract the content features
54
+ extractor.extract_content(utt) for single utterance
55
+ extractor.extract_content_batch(utts) for batch utterances
56
+ 4. save the content features
57
+ extractor.save_feature(utt, content_feature) for single utterance
58
+ """
59
+
60
+
61
+ class BaseExtractor:
62
+ def __init__(self, cfg):
63
+ self.cfg = cfg
64
+ self.extractor_type = None
65
+ self.model = None
66
+
67
+ def offline_align(self, content, target_len):
68
+ """
69
+ args:
70
+ content: (source_len, dim)
71
+ target_len: target length
72
+ return:
73
+ mapped_feature: (target_len, dim)
74
+ """
75
+ target_hop = self.cfg.preprocess.hop_size
76
+
77
+ assert self.extractor_type in ["whisper", "contentvec", "wenet"]
78
+ if self.extractor_type == "whisper":
79
+ source_hop = (
80
+ self.cfg.preprocess.whisper_frameshift
81
+ * self.cfg.preprocess.whisper_downsample_rate
82
+ * self.cfg.preprocess.sample_rate
83
+ )
84
+ elif self.extractor_type == "contentvec":
85
+ source_hop = (
86
+ self.cfg.preprocess.contentvec_frameshift
87
+ * self.cfg.preprocess.sample_rate
88
+ )
89
+ elif self.extractor_type == "wenet":
90
+ source_hop = (
91
+ self.cfg.preprocess.wenet_frameshift
92
+ * self.cfg.preprocess.wenet_downsample_rate
93
+ * self.cfg.preprocess.sample_rate
94
+ )
95
+ source_hop = int(source_hop)
96
+ factor = np.gcd(source_hop, target_hop)
97
+ source_hop //= factor
98
+ target_hop //= factor
99
+
100
+ # (source_len, 256)
101
+ _, width = content.shape
102
+ # slice the content from padded feature
103
+ source_len = min(target_len * target_hop // source_hop + 1, len(content))
104
+
105
+ # const ~= target_len * target_hop
106
+ const = source_len * source_hop // target_hop * target_hop
107
+
108
+ # (source_len * source_hop, dim)
109
+ up_sampling_feats = np.repeat(content, source_hop, axis=0)
110
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
111
+ down_sampling_feats = np.average(
112
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
113
+ )
114
+
115
+ err = abs(target_len - len(down_sampling_feats))
116
+ if err > 8:
117
+ # record the largest alignment error seen so far (log file lives under processed_dir)
118
+ err_log_dir = os.path.join(
119
+ self.cfg.preprocess.processed_dir, "align_max_err.log"
120
+ )
121
+ try:
122
+ with open(err_log_dir, "r") as f:
123
+ err_num = int(f.read())
124
+ except (FileNotFoundError, ValueError):
125
+ with open(err_log_dir, "w") as f:
126
+ f.write("0")
127
+ err_num = 0
128
+ if err > err_num:
129
+ with open(err_log_dir, "w") as f:
130
+ f.write(str(err))
131
+
132
+ if len(down_sampling_feats) < target_len:
133
+ # (1, dim) -> (err, dim)
134
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
135
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
136
+
137
+ # (target_len, dim)
138
+ mapped_feature = down_sampling_feats[:target_len]
139
+
140
+ return mapped_feature
141
+
142
+ def save_feature(self, utt, content_feature):
143
+ """Save a single utterance to path {cfg.preprocess.processed_dir}
144
+
145
+ Args:
146
+ utt (dict): one item in metadata, containing information for one utterance
147
+ content_feature (tensor): content feature of one utterance
148
+ """
149
+ uid = utt["Uid"]
150
+ assert self.extractor_type is not None
151
+ out_dir = os.path.join(
152
+ self.cfg.preprocess.processed_dir, utt["Dataset"], self.extractor_type
153
+ )
154
+ os.makedirs(out_dir, exist_ok=True)
155
+ save_path = os.path.join(out_dir, uid + ".npy")
156
+ # only keep effective parts
157
+ duration = utt["Duration"]
158
+ if self.extractor_type == "whisper":
159
+ frameshift = (
160
+ self.cfg.preprocess.whisper_frameshift
161
+ * self.cfg.preprocess.whisper_downsample_rate
162
+ ) # 20ms
163
+ elif self.extractor_type == "contentvec":
164
+ frameshift = self.cfg.preprocess.contentvec_frameshift # 20ms
165
+ elif self.extractor_type == "wenet":
166
+ frameshift = (
167
+ self.cfg.preprocess.wenet_frameshift
168
+ * self.cfg.preprocess.wenet_downsample_rate
169
+ ) # 40ms
170
+ elif self.extractor_type == "mert":
171
+ frameshift = self.cfg.preprocess.mert_frameshift
172
+ else:
173
+ raise NotImplementedError
174
+ # calculate the number of valid frames
175
+ num_frames = int(np.ceil((duration - frameshift) / frameshift)) + 1
176
+ # (num_frames, dim) -> (valid_frames, dim)
177
+ assert (
178
+ len(content_feature.shape) == 2
179
+ ), "content feature shape error, it should be (num_frames, dim)"
180
+ content_feature = content_feature[:num_frames, :]
181
+ np.save(save_path, content_feature.cpu().detach().numpy())
182
+
183
+
184
+ class WhisperExtractor(BaseExtractor):
185
+ def __init__(self, config):
186
+ super(WhisperExtractor, self).__init__(config)
187
+ self.extractor_type = "whisper"
188
+
189
+ def load_model(self):
190
+ # load whisper checkpoint
191
+ print("Loading Whisper Model...")
192
+
193
+ checkpoint_file = (
194
+ self.cfg.preprocess.whisper_model_path
195
+ if "whisper_model_path" in self.cfg.preprocess
196
+ else None
197
+ )
198
+ model = whisper.load_model(
199
+ self.cfg.preprocess.whisper_model, checkpoint_file=checkpoint_file
200
+ )
201
+ if torch.cuda.is_available():
202
+ print("Using GPU...\n")
203
+ model = model.cuda()
204
+ else:
205
+ print("Using CPU...\n")
206
+
207
+ self.model = model.eval()
208
+
209
+ def extract_content_features(self, wavs, lens):
210
+ """extract content features from a batch of dataloader
211
+ Args:
212
+ wavs: tensor (batch_size, T)
213
+ lens: list
214
+ """
215
+ # wavs: (batch, max_len)
216
+ wavs = whisper.pad_or_trim(wavs)
217
+ # batch_mel: (batch, 80, 3000)
218
+ batch_mel = whisper.log_mel_spectrogram(wavs).to(self.model.device)
219
+ with torch.no_grad():
220
+ # (batch, 1500, 1024)
221
+ features = self.model.embed_audio(batch_mel)
222
+ return features
223
+
224
+
225
+ class ContentvecExtractor(BaseExtractor):
226
+ def __init__(self, cfg):
227
+ super(ContentvecExtractor, self).__init__(cfg)
228
+ self.extractor_type = "contentvec"
229
+
230
+ def load_model(self):
231
+ assert self.model is None
232
+ # Load model
233
+ ckpt_path = self.cfg.preprocess.contentvec_file
234
+ print("Load Contentvec Model...")
235
+
236
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
237
+ [ckpt_path],
238
+ suffix="",
239
+ )
240
+ model = models[0]
241
+ model.eval()
242
+
243
+ if torch.cuda.is_available():
244
+ # print("Using GPU...\n")
245
+ model = model.cuda()
246
+
247
+ self.model = model
248
+
249
+ def extract_content_features(self, wavs, lens):
250
+ """extract content features from a batch of dataloader
251
+ Args:
252
+ wavs: tensor (batch, T)
253
+ lens: list
254
+ """
255
+ device = next(self.model.parameters()).device
256
+ wavs = wavs.to(device) # (batch, max_len)
257
+ padding_mask = torch.eq(wavs, torch.zeros_like(wavs)).to(device)
258
+ with torch.no_grad():
259
+ logits = self.model.extract_features(
260
+ source=wavs, padding_mask=padding_mask, output_layer=12
261
+ )
262
+ # feats: (batch, T, 256)
263
+ feats = self.model.final_proj(logits[0])
264
+ return feats
265
+
266
+
267
+ class WenetExtractor(BaseExtractor):
268
+ def __init__(self, config):
269
+ super(WenetExtractor, self).__init__(config)
270
+ self.extractor_type = "wenet"
271
+
272
+ def load_model(self):
273
+ wenet_cfg = self.cfg.preprocess.wenet_config
274
+ wenet_model_path = self.cfg.preprocess.wenet_model_path
275
+ # load Wenet config
276
+ with open(wenet_cfg, "r") as w:
277
+ wenet_configs = yaml.load(w, Loader=yaml.FullLoader)
278
+ self.extract_conf = copy.deepcopy(wenet_configs["dataset_conf"])
279
+ print("Loading Wenet Model...")
280
+ self.model = init_model(wenet_configs)
281
+ load_checkpoint(self.model, wenet_model_path)
282
+
283
+ if torch.cuda.is_available():
284
+ print("Using GPU...\n")
285
+ self.model = self.model.cuda()
286
+ else:
287
+ print("Using CPU...\n")
288
+
289
+ self.model = self.model.eval()
290
+
291
+ def extract_content_features(self, wavs, lens):
292
+ """extract content features from a batch of dataloader
293
+ Args:
294
+ wavs: tensor
295
+ lens: list
296
+ """
297
+ feats_list = []
298
+ lengths_list = []
299
+
300
+ device = next(self.model.parameters()).device
301
+ # Extract fbank/mfcc features by kaldi
302
+ assert self.extract_conf is not None, "load model first!"
303
+ feats_type = self.extract_conf.get("feats_type", "fbank")
304
+ assert feats_type in ["fbank", "mfcc"]
305
+
306
+ for idx, wav in enumerate(wavs):
307
+ # wav: (T)
308
+ wav = wav[: lens[idx]].to(device)
309
+
310
+ # pad one frame to compensate for the frame cut off after feature extraction
311
+ pad_tensor = torch.zeros(160, device=wav.device)
312
+ wav = torch.cat((wav, pad_tensor), dim=-1)
313
+ wav *= 1 << 15
314
+
315
+ wav = wav.unsqueeze(0) # (T) -> (1, T)
316
+ if feats_type == "fbank":
317
+ fbank_conf = self.extract_conf.get("fbank_conf", {})
318
+ feat = kaldi.fbank(
319
+ wav,
320
+ sample_frequency=16000,
321
+ num_mel_bins=fbank_conf["num_mel_bins"],
322
+ frame_length=fbank_conf["frame_length"],
323
+ frame_shift=fbank_conf["frame_shift"],
324
+ dither=fbank_conf["dither"],
325
+ )
326
+ elif feats_type == "mfcc":
327
+ mfcc_conf = self.extract_conf.get("mfcc", {})
328
+ feat = kaldi.mfcc(
329
+ wav,
330
+ sample_frequency=16000,
331
+ num_mel_bins=mfcc_conf["num_mel_bins"],
332
+ frame_length=mfcc_conf["frame_length"],
333
+ frame_shift=mfcc_conf["frame_shift"],
334
+ dither=mfcc_conf["dither"],
335
+ num_ceps=mfcc_conf.get("num_ceps", 40),
336
+ high_freq=mfcc_conf.get("high_freq", 0.0),
337
+ low_freq=mfcc_conf.get("low_freq", 20.0),
338
+ )
339
+ feats_list.append(feat)
340
+ lengths_list.append(feat.shape[0])
341
+
342
+ feats_lengths = torch.tensor(lengths_list, dtype=torch.int32).to(device)
343
+ feats_tensor = pad_sequence(feats_list, batch_first=True).to(
344
+ device
345
+ ) # (batch, len, 80)
346
+
347
+ features = self.model.encoder_extractor(
348
+ feats_tensor,
349
+ feats_lengths,
350
+ decoding_chunk_size=-1,
351
+ num_decoding_left_chunks=-1,
352
+ simulate_streaming=False,
353
+ )
354
+ return features
355
+
356
+
357
+ class MertExtractor(BaseExtractor):
358
+ def __init__(self, cfg):
359
+ super(MertExtractor, self).__init__(cfg)
360
+ self.extractor_type = "mert"
361
+ self.preprocessor = None
362
+
363
+ def load_model(self):
364
+ assert self.model is None
365
+ assert self.preprocessor is None
366
+
367
+ print("Loading MERT Model: ...", self.cfg.preprocess.mert_model)
368
+
369
+ # use the model id/path from the config instead of a hard-coded local path
370
+ model_name = self.cfg.preprocess.mert_model
371
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
373
+
374
+ if torch.cuda.is_available():
375
+ model = model.cuda()
376
+ preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(
377
+ model_name, trust_remote_code=True
378
+ )
379
+
380
+ self.model = model
381
+ self.preprocessor = preprocessor
382
+
383
+ def extract_content_features(self, wavs, lens):
384
+ """extract content features from a batch of dataloader
385
+ Args:
386
+ wavs: tensor (batch, T)
387
+ lens: list
388
+ """
389
+ with torch.no_grad():
390
+ sample_rate = self.preprocessor.sampling_rate
391
+ device = next(self.model.parameters()).device
392
+ assert (
393
+ sample_rate == self.cfg.preprocess.mert_sample_rate
394
+ ), "mert sample rate mismatch, expected {}, got {}".format(
395
+ self.cfg.preprocess.mert_sample_rate, sample_rate
396
+ )
397
+ mert_features = []
398
+ # wav: (len)
399
+ for wav in wavs:
400
+ # {input_values: tensor, attention_mask: tensor}
401
+ inputs = self.preprocessor(
402
+ wav, sampling_rate=sample_rate, return_tensors="pt"
403
+ ).to(device)
404
+
405
+ outputs = self.model(**inputs, output_hidden_states=True)
406
+ # (25 layers, time steps, 1024 feature_dim)
407
+ all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
408
+ # (1, frame_len, 1024) -> (frame_len, 1024)
409
+ feature = outputs.hidden_states[
410
+ self.cfg.preprocess.mert_feature_layer
411
+ ].squeeze(0)
412
+ mert_features.append(feature)
413
+
414
+ return mert_features
415
+
416
+
417
+ def extract_utt_content_features_dataloader(cfg, metadata, num_workers):
418
+ dataset_name = metadata[0]["Dataset"]
419
+
420
+ if cfg.preprocess.extract_whisper_feature:
421
+ feat_dir = os.path.join(cfg.preprocess.processed_dir, dataset_name, "whisper")
422
+ os.makedirs(feat_dir, exist_ok=True)
423
+ feat_files_num = len(os.listdir(feat_dir))
424
+
425
+ if feat_files_num != len(metadata):
426
+ whisper_waveforms = FFmpegDataset(
427
+ cfg, dataset_name, cfg.preprocess.whisper_sample_rate, metadata=metadata
428
+ )
429
+ data_loader = DataLoader(
430
+ whisper_waveforms,
431
+ num_workers=num_workers,
432
+ shuffle=False,
433
+ pin_memory=cfg.preprocess.pin_memory,
434
+ batch_size=cfg.preprocess.content_feature_batch_size,
435
+ collate_fn=collate_batch,
436
+ drop_last=False,
437
+ )
438
+ extractor = WhisperExtractor(cfg)
439
+ extractor.load_model()
440
+ for batch_idx, items in enumerate(tqdm(data_loader)):
441
+ _metadata, wavs, lens = items
442
+
443
+ batch_content_features = extractor.extract_content_features(
444
+ wavs,
445
+ lens,
446
+ )
447
+ for index, utt in enumerate(_metadata):
448
+ extractor.save_feature(utt, batch_content_features[index])
449
+
450
+ if cfg.preprocess.extract_contentvec_feature:
451
+ feat_dir = os.path.join(
452
+ cfg.preprocess.processed_dir, dataset_name, "contentvec"
453
+ )
454
+ os.makedirs(feat_dir, exist_ok=True)
455
+ feat_files_num = len(os.listdir(feat_dir))
456
+
457
+ if feat_files_num != len(metadata):
458
+ contentvec_waveforms = LibrosaDataset(
459
+ cfg,
460
+ dataset_name,
461
+ cfg.preprocess.contentvec_sample_rate,
462
+ metadata=metadata,
463
+ )
464
+ data_loader = DataLoader(
465
+ contentvec_waveforms,
466
+ num_workers=num_workers,
467
+ shuffle=False,
468
+ pin_memory=cfg.preprocess.pin_memory,
469
+ batch_size=cfg.preprocess.content_feature_batch_size,
470
+ collate_fn=collate_batch,
471
+ drop_last=False,
472
+ )
473
+ extractor = ContentvecExtractor(cfg)
474
+ extractor.load_model()
475
+ for batch_idx, items in enumerate(tqdm(data_loader)):
476
+ _metadata, wavs, lens = items
477
+
478
+ batch_content_features = extractor.extract_content_features(wavs, lens)
479
+ for index, utt in enumerate(_metadata):
480
+ extractor.save_feature(utt, batch_content_features[index])
481
+
482
+ if cfg.preprocess.extract_wenet_feature:
483
+ feat_dir = os.path.join(cfg.preprocess.processed_dir, dataset_name, "wenet")
484
+ os.makedirs(feat_dir, exist_ok=True)
485
+ feat_files_num = len(os.listdir(feat_dir))
486
+
487
+ if feat_files_num != len(metadata):
488
+ wenet_waveforms = TorchaudioDataset(
489
+ cfg, dataset_name, cfg.preprocess.wenet_sample_rate, metadata=metadata
490
+ )
491
+ data_loader = DataLoader(
492
+ wenet_waveforms,
493
+ num_workers=num_workers,
494
+ shuffle=False,
495
+ pin_memory=cfg.preprocess.pin_memory,
496
+ batch_size=cfg.preprocess.content_feature_batch_size,
497
+ collate_fn=collate_batch,
498
+ drop_last=False,
499
+ )
500
+ extractor = WenetExtractor(cfg)
501
+ extractor.load_model()
502
+ for batch_idx, items in enumerate(tqdm(data_loader)):
503
+ _metadata, wavs, lens = items
504
+
505
+ batch_content_features = extractor.extract_content_features(
506
+ wavs,
507
+ lens,
508
+ )
509
+ for index, utt in enumerate(_metadata):
510
+ extractor.save_feature(utt, batch_content_features[index])
511
+
512
+ if cfg.preprocess.extract_mert_feature:
513
+ feat_dir = os.path.join(cfg.preprocess.processed_dir, dataset_name, "mert")
514
+ os.makedirs(feat_dir, exist_ok=True)
515
+ feat_files_num = len(os.listdir(feat_dir))
516
+
517
+ if feat_files_num != len(metadata):
518
+ mert_waveforms = TorchaudioDataset(
519
+ cfg, dataset_name, cfg.preprocess.mert_sample_rate, metadata=metadata
520
+ )
521
+ data_loader = DataLoader(
522
+ mert_waveforms,
523
+ num_workers=num_workers,
524
+ shuffle=False,
525
+ pin_memory=cfg.preprocess.pin_memory,
526
+ batch_size=cfg.preprocess.content_feature_batch_size,
527
+ collate_fn=collate_batch,
528
+ drop_last=False,
529
+ )
530
+ extractor = MertExtractor(cfg)
531
+ extractor.load_model()
532
+ for batch_idx, items in enumerate(tqdm(data_loader)):
533
+ _metadata, wavs, lens = items
534
+
535
+ batch_content_features = extractor.extract_content_features(
536
+ wavs,
537
+ lens,
538
+ )
539
+ for index, utt in enumerate(_metadata):
540
+ extractor.save_feature(utt, batch_content_features[index])
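A rough sketch (not part of the commit) of the "Extractor Usage" steps described in the module docstring, using the WhisperExtractor API defined above. The load_config helper, config path, dataset name, Uid, duration, and the random waveforms are placeholders/assumptions; a real run uses an Amphion preprocess config (whisper_model, whisper_frameshift, processed_dir, ...).

import torch
from utils.util import load_config            # assumed Amphion config loader
from processors.content_extractor import WhisperExtractor

cfg = load_config("config/whisper_svc.json")  # placeholder path to a config with a `preprocess` section
extractor = WhisperExtractor(cfg)             # 1. initialize an extractor
extractor.load_model()                        # 2. load the Whisper checkpoint (GPU if available)

wavs = torch.randn(2, 16000 * 3)              # fake batch of two 3-second, 16 kHz waveforms
lens = [wavs.shape[1]] * 2                    # per-utterance lengths in samples
features = extractor.extract_content_features(wavs, lens)   # 3. (batch, frames, dim) Whisper features

utt = {"Dataset": "placeholder", "Uid": "0001", "Duration": 3.0}  # one metadata entry
extractor.save_feature(utt, features[0])      # 4. crop to the valid frames and save <Uid>.npy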
processors/data_augment.py ADDED
@@ -0,0 +1,378 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import random
8
+ import os
9
+ import json
10
+
11
+ import numpy as np
12
+ import parselmouth
13
+ import torch
14
+ import torchaudio
15
+ from tqdm import tqdm
16
+
17
+ from audiomentations import TimeStretch
18
+
19
+ from pedalboard import (
20
+ Pedalboard,
21
+ HighShelfFilter,
22
+ LowShelfFilter,
23
+ PeakFilter,
24
+ PitchShift,
25
+ )
26
+
27
+ from utils.util import has_existed
28
+
29
+ PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT = 0.0
30
+ PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT = 1.0
31
+ PRAAT_CHANGEGENDER_PITCHSHIFTRATIO_DEFAULT = 1.0
32
+ PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT = 1.0
33
+ PRAAT_CHANGEGENDER_DURATIONFACTOR_DEFAULT = 1.0
34
+
35
+
36
+ def wav_to_Sound(wav, sr: int) -> parselmouth.Sound:
37
+ """Convert a waveform to a parselmouth.Sound object
38
+
39
+ Args:
40
+ wav (np.ndarray/torch.Tensor): waveform of shape (n_channels, n_samples)
41
+ sr (int): sampling rate.
42
+
43
+ Returns:
44
+ parselmouth.Sound: a parselmouth.Sound object
45
+ """
46
+ assert wav.shape == (1, len(wav[0])), "wav must be of shape (1, n_samples)"
47
+ sound = None
48
+ if isinstance(wav, np.ndarray):
49
+ sound = parselmouth.Sound(wav[0], sampling_frequency=sr)
50
+ elif isinstance(wav, torch.Tensor):
51
+ sound = parselmouth.Sound(wav[0].numpy(), sampling_frequency=sr)
52
+ assert sound is not None, "wav must be either np.ndarray or torch.Tensor"
53
+ return sound
54
+
55
+
56
+ def get_pitch_median(wav, sr: int):
57
+ """Get the median pitch of a waveform
58
+
59
+ Args:
60
+ wav (np.ndarray/torch.Tensor): waveform of shape (n_channels, n_samples)
61
+ sr (int): sampling rate.
62
+
63
+ Returns:
64
+ parselmouth.Pitch, float: a parselmouth.Pitch object and the median pitch
65
+ """
66
+ if not isinstance(wav, parselmouth.Sound):
67
+ sound = wav_to_Sound(wav, sr)
68
+ else:
69
+ sound = wav
70
+ pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
71
+
72
+ # To Pitch: Time step(s)(standard value: 0.0), Pitch floor (Hz)(standard value: 75), Pitch ceiling (Hz)(standard value: 600.0)
73
+ pitch = parselmouth.praat.call(sound, "To Pitch", 0.8 / 75, 75, 600)
74
+ # Get quantile: From time (s), To time (s), Quantile(0.5 is then the 50% quantile, i.e., the median), Units (Hertz or Bark)
75
+ pitch_median = parselmouth.praat.call(pitch, "Get quantile", 0.0, 0.0, 0.5, "Hertz")
76
+
77
+ return pitch, pitch_median
78
+
79
+
80
+ def change_gender(
81
+ sound,
82
+ pitch=None,
83
+ formant_shift_ratio: float = PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT,
84
+ new_pitch_median: float = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT,
85
+ pitch_range_ratio: float = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT,
86
+ duration_factor: float = PRAAT_CHANGEGENDER_DURATIONFACTOR_DEFAULT,
87
+ ) -> parselmouth.Sound:
88
+ """Invoke change gender function in praat
89
+
90
+ Args:
91
+ sound (parselmouth.Sound): a parselmouth.Sound object
92
+ pitch (parselmouth.Pitch, optional): a parselmouth.Pitch object. Defaults to None.
93
+ formant_shift_ratio (float, optional): formant shift ratio. A value of 1.0 means no change. Greater than 1.0 shifts formants up; less than 1.0 shifts them down.
94
+ new_pitch_median (float, optional): new pitch median.
95
+ pitch_range_ratio (float, optional): pitch range ratio. A value of 1.0 means no change. Greater than 1.0 means higher pitch range. Less than 1.0 means lower pitch range.
96
+ duration_factor (float, optional): duration factor. A value of 1.0 means no change. Greater than 1.0 means longer duration. Less than 1.0 means shorter duration.
97
+
98
+ Returns:
99
+ parselmouth.Sound: a parselmouth.Sound object
100
+ """
101
+ if pitch is None:
102
+ new_sound = parselmouth.praat.call(
103
+ sound,
104
+ "Change gender",
105
+ 75,
106
+ 600,
107
+ formant_shift_ratio,
108
+ new_pitch_median,
109
+ pitch_range_ratio,
110
+ duration_factor,
111
+ )
112
+ else:
113
+ new_sound = parselmouth.praat.call(
114
+ (sound, pitch),
115
+ "Change gender",
116
+ formant_shift_ratio,
117
+ new_pitch_median,
118
+ pitch_range_ratio,
119
+ duration_factor,
120
+ )
121
+ return new_sound
122
+
123
+
124
+ def apply_formant_and_pitch_shift(
125
+ sound: parselmouth.Sound,
126
+ formant_shift_ratio: float = PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT,
127
+ pitch_shift_ratio: float = PRAAT_CHANGEGENDER_PITCHSHIFTRATIO_DEFAULT,
128
+ pitch_range_ratio: float = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT,
129
+ duration_factor: float = PRAAT_CHANGEGENDER_DURATIONFACTOR_DEFAULT,
130
+ ) -> parselmouth.Sound:
131
+ """use Praat "Changer gender" command to manipulate pitch and formant
132
+ "Change gender": Praat -> Sound Object -> Convert -> Change gender
133
+ Refer to the Praat help for more details.
134
+ # https://github.com/YannickJadoul/Parselmouth/issues/25#issuecomment-608632887 might help
135
+ """
136
+ pitch = None
137
+ new_pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
138
+ if pitch_shift_ratio != 1.0:
139
+ pitch, pitch_median = get_pitch_median(sound, sound.sampling_frequency)
140
+ new_pitch_median = pitch_median * pitch_shift_ratio
141
+
142
+ # refer to https://github.com/praat/praat/issues/1926#issuecomment-974909408
143
+ pitch_minimum = parselmouth.praat.call(
144
+ pitch, "Get minimum", 0.0, 0.0, "Hertz", "Parabolic"
145
+ )
146
+ new_median = pitch_median * pitch_shift_ratio
147
+ scaled_minimum = pitch_minimum * pitch_shift_ratio
148
+ result_minimum = new_median + (scaled_minimum - new_median) * pitch_range_ratio
149
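+ # Guard against an invalid (negative or NaN) pitch target: fall back to the Praat defaults.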
+ if result_minimum < 0:
150
+ new_pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
151
+ pitch_range_ratio = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT
152
+
153
+ if math.isnan(new_pitch_median):
154
+ new_pitch_median = PRAAT_CHANGEGENDER_PITCHMEDIAN_DEFAULT
155
+ pitch_range_ratio = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT
156
+
157
+ new_sound = change_gender(
158
+ sound,
159
+ pitch,
160
+ formant_shift_ratio,
161
+ new_pitch_median,
162
+ pitch_range_ratio,
163
+ duration_factor,
164
+ )
165
+ return new_sound
166
+
167
+
168
+ # Function used in EQ
169
+ def pedalboard_equalizer(wav: np.ndarray, sr: int) -> np.ndarray:
170
+ """Use pedalboard to do equalizer"""
171
+ board = Pedalboard()
172
+
173
+ cutoff_low_freq = 60
174
+ cutoff_high_freq = 10000
175
+
176
+ q_min = 2
177
+ q_max = 5
178
+
179
+ random_all_freq = True
180
+ num_filters = 10
181
+ if random_all_freq:
182
+ key_freqs = [random.uniform(1, 12000) for _ in range(num_filters)]
183
+ else:
184
+ key_freqs = [
185
+ power_ratio(float(z) / (num_filters - 1), cutoff_low_freq, cutoff_high_freq)
186
+ for z in range(num_filters)
187
+ ]
188
+ q_values = [
189
+ power_ratio(random.uniform(0, 1), q_min, q_max) for _ in range(num_filters)
190
+ ]
191
+ gains = [random.uniform(-12, 12) for _ in range(num_filters)]
192
+ # low-shelving filter
193
+ board.append(
194
+ LowShelfFilter(
195
+ cutoff_frequency_hz=key_freqs[0], gain_db=gains[0], q=q_values[0]
196
+ )
197
+ )
198
+ # peaking filters
199
+ for i in range(1, 9):
200
+ board.append(
201
+ PeakFilter(
202
+ cutoff_frequency_hz=key_freqs[i], gain_db=gains[i], q=q_values[i]
203
+ )
204
+ )
205
+ # high-shelving filter
206
+ board.append(
207
+ HighShelfFilter(
208
+ cutoff_frequency_hz=key_freqs[9], gain_db=gains[9], q=q_values[9]
209
+ )
210
+ )
211
+
212
+ # Apply the pedalboard to the audio
213
+ processed_audio = board(wav, sr)
214
+ return processed_audio
215
+
216
+
217
+ def power_ratio(r: float, a: float, b: float):
218
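+ # Geometric interpolation: r=0 gives a, r=1 gives b, intermediate values are log-spaced.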
+ return a * math.pow((b / a), r)
219
+
220
+
221
+ def audiomentations_time_stretch(wav: np.ndarray, sr: int) -> np.ndarray:
222
+ """Use audiomentations to do time stretch"""
223
+ transform = TimeStretch(
224
+ min_rate=0.8, max_rate=1.25, leave_length_unchanged=False, p=1.0
225
+ )
226
+ augmented_wav = transform(wav, sample_rate=sr)
227
+ return augmented_wav
228
+
229
+
230
+ def formant_and_pitch_shift(
231
+ sound: parselmouth.Sound, fs: bool, ps: bool
232
+ ) -> parselmouth.Sound:
233
+ """ """
234
+ formant_shift_ratio = PRAAT_CHANGEGENDER_FORMANTSHIFTRATIO_DEFAULT
235
+ pitch_shift_ratio = PRAAT_CHANGEGENDER_PITCHSHIFTRATIO_DEFAULT
236
+ pitch_range_ratio = PRAAT_CHANGEGENDER_PITCHRANGERATIO_DEFAULT
237
+
238
+ assert fs != ps, "fs, ps are mutually exclusive"
239
+
240
+ if fs:
241
+ formant_shift_ratio = random.uniform(1.0, 1.4)
242
+ use_reciprocal = random.uniform(-1, 1) > 0
243
+ if use_reciprocal:
244
+ formant_shift_ratio = 1.0 / formant_shift_ratio
245
+ # only use praat to change formant
246
+ new_sound = apply_formant_and_pitch_shift(
247
+ sound,
248
+ formant_shift_ratio=formant_shift_ratio,
249
+ )
250
+ return new_sound
251
+
252
+ if ps:
253
+ board = Pedalboard()
254
+ board.append(PitchShift(random.uniform(-12, 12)))
255
+ wav_numpy = sound.values
256
+ wav_numpy = board(wav_numpy, sound.sampling_frequency)
257
+ # use pedalboard to change pitch
258
+ new_sound = parselmouth.Sound(
259
+ wav_numpy, sampling_frequency=sound.sampling_frequency
260
+ )
261
+ return new_sound
262
+
263
+
264
+ def wav_manipulation(
265
+ wav: torch.Tensor,
266
+ sr: int,
267
+ aug_type: str = "None",
268
+ formant_shift: bool = False,
269
+ pitch_shift: bool = False,
270
+ time_stretch: bool = False,
271
+ equalizer: bool = False,
272
+ ) -> torch.Tensor:
273
+ assert aug_type == "None" or aug_type in [
274
+ "formant_shift",
275
+ "pitch_shift",
276
+ "time_stretch",
277
+ "equalizer",
278
+ ], "aug_type must be one of formant_shift, pitch_shift, time_stretch, equalizer"
279
+
280
+ assert aug_type == "None" or (
281
+ formant_shift == False
282
+ and pitch_shift == False
283
+ and time_stretch == False
284
+ and equalizer == False
285
+ ), "if aug_type is specified, other argument must be False"
286
+
287
+ if aug_type != "None":
288
+ if aug_type == "formant_shift":
289
+ formant_shift = True
290
+ if aug_type == "pitch_shift":
291
+ pitch_shift = True
292
+ if aug_type == "equalizer":
293
+ equalizer = True
294
+ if aug_type == "time_stretch":
295
+ time_stretch = True
296
+
297
+ wav_numpy = wav.numpy()
298
+
299
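+ # Augmentations are applied in sequence: equalizer, then time stretch, then formant or pitch shift.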
+ if equalizer:
300
+ wav_numpy = pedalboard_equalizer(wav_numpy, sr)
301
+
302
+ if time_stretch:
303
+ wav_numpy = audiomentations_time_stretch(wav_numpy, sr)
304
+
305
+ sound = wav_to_Sound(wav_numpy, sr)
306
+
307
+ if formant_shift or pitch_shift:
308
+ sound = formant_and_pitch_shift(sound, formant_shift, pitch_shift)
309
+
310
+ wav = torch.from_numpy(sound.values).float()
311
+ # shape (1, n_samples)
312
+ return wav
313
+
314
+
315
+ def augment_dataset(cfg, dataset) -> list:
316
+ """Augment dataset with formant_shift, pitch_shift, time_stretch, equalizer
317
+
318
+ Args:
319
+ cfg (dict): configuration
320
+ dataset (str): dataset name
321
+
322
+ Returns:
323
+ list: augmented dataset names
324
+ """
325
+ # load metadata
326
+ dataset_path = os.path.join(cfg.preprocess.processed_dir, dataset)
327
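+ # Datasets whose name contains "eval" are assumed to have only a test split.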
+ split = ["train", "test"] if "eval" not in dataset else ["test"]
328
+ augment_datasets = []
329
+ aug_types = [
330
+ "formant_shift" if cfg.preprocess.use_formant_shift else None,
331
+ "pitch_shift" if cfg.preprocess.use_pitch_shift else None,
332
+ "time_stretch" if cfg.preprocess.use_time_stretch else None,
333
+ "equalizer" if cfg.preprocess.use_equalizer else None,
334
+ ]
335
+ aug_types = filter(None, aug_types)
336
+ for aug_type in aug_types:
337
+ print("Augmenting {} with {}...".format(dataset, aug_type))
338
+ new_dataset = dataset + "_" + aug_type
339
+ augment_datasets.append(new_dataset)
340
+ new_dataset_path = os.path.join(cfg.preprocess.processed_dir, new_dataset)
341
+
342
+ for dataset_type in split:
343
+ metadata_path = os.path.join(dataset_path, "{}.json".format(dataset_type))
344
+ augmented_metadata = []
345
+ new_metadata_path = os.path.join(
346
+ new_dataset_path, "{}.json".format(dataset_type)
347
+ )
348
+ os.makedirs(new_dataset_path, exist_ok=True)
349
+ new_dataset_wav_dir = os.path.join(new_dataset_path, "wav")
350
+ os.makedirs(new_dataset_wav_dir, exist_ok=True)
351
+
352
+ if has_existed(new_metadata_path):
353
+ continue
354
+
355
+ with open(metadata_path, "r") as f:
356
+ metadata = json.load(f)
357
+
358
+ for utt in tqdm(metadata):
359
+ original_wav_path = utt["Path"]
360
+ original_wav, sr = torchaudio.load(original_wav_path)
361
+ new_wav = wav_manipulation(original_wav, sr, aug_type=aug_type)
362
+ new_wav_path = os.path.join(new_dataset_wav_dir, utt["Uid"] + ".wav")
363
+ torchaudio.save(new_wav_path, new_wav, sr)
364
+ new_utt = {
365
+ "Dataset": utt["Dataset"] + "_" + aug_type,
366
+ "index": utt["index"],
367
+ "Singer": utt["Singer"],
368
+ "Uid": utt["Uid"],
369
+ "Path": new_wav_path,
370
+ "Duration": utt["Duration"],
371
+ }
372
+ augmented_metadata.append(new_utt)
373
+ new_metadata_path = os.path.join(
374
+ new_dataset_path, "{}.json".format(dataset_type)
375
+ )
376
+ with open(new_metadata_path, "w") as f:
377
+ json.dump(augmented_metadata, f, indent=4, ensure_ascii=False)
378
+ return augment_datasets
processors/phone_extractor.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ from tqdm import tqdm
8
+ from text.g2p_module import G2PModule, LexiconModule
9
+ from text.symbol_table import SymbolTable
10
+
11
+ '''
12
+ phoneExtractor: extract phone from text
13
+ '''
14
+ class phoneExtractor:
15
+ def __init__(self, cfg, dataset_name=None, phone_symbol_file=None):
16
+ '''
17
+ Args:
18
+ cfg: config
19
+ dataset_name: name of dataset
20
+ '''
21
+ self.cfg = cfg
22
+
23
+ # phone symbols dict
24
+ self.phone_symbols = set()
25
+
26
+ # phone symbols dict file
27
+ if phone_symbol_file is not None:
28
+ self.phone_symbols_file = phone_symbol_file
29
+ elif dataset_name is not None:
30
+ self.dataset_name = dataset_name
31
+ self.phone_symbols_file = os.path.join(cfg.preprocess.processed_dir,
32
+ dataset_name,
33
+ cfg.preprocess.symbols_dict)
34
+
35
+
36
+ # initialize g2p module
37
+ if cfg.preprocess.phone_extractor in ["espeak", "pypinyin", "pypinyin_initials_finals"]:
38
+ self.g2p_module = G2PModule(backend=cfg.preprocess.phone_extractor)
39
+ elif cfg.preprocess.phone_extractor == 'lexicon':
40
+ assert cfg.preprocess.lexicon_path != ""
41
+ self.g2p_module = LexiconModule(cfg.preprocess.lexicon_path)
42
+ else:
43
+ print('No support for', cfg.preprocess.phone_extractor)
44
+ raise NotImplementedError
45
+
46
+
47
+ def extract_phone(self, text):
48
+ '''
49
+ Extract phone from text
50
+ Args:
51
+
52
+ text: text of utterance
53
+
54
+ Returns:
55
+ phone_seq: list of phones of the utterance
56
+ (for g2p backends, self.phone_symbols is also updated with any new symbols)
57
+ '''
58
+
59
+ if self.cfg.preprocess.phone_extractor in ["espeak", "pypinyin", "pypinyin_initials_finals"]:
60
+ text = text.replace("”", '"').replace("“", '"')
61
+ phone = self.g2p_module.g2p_conversion(text=text)
62
+ self.phone_symbols.update(phone)
63
+ phone_seq = [phn for phn in phone]
64
+
65
+ elif self.cfg.preprocess.phone_extractor == 'lexicon':
66
+ phone_seq = self.g2p_module.g2p_conversion(text)
67
+ phone = phone_seq
68
+ if not isinstance(phone_seq, list):
69
+ phone_seq = phone_seq.split()
70
+
71
+ return phone_seq
72
+
73
+ def save_dataset_phone_symbols_to_table(self):
74
+ # load and merge saved phone symbols
75
+ if os.path.exists(self.phone_symbols_file):
76
+ phone_symbol_dict_saved = SymbolTable.from_file(self.phone_symbols_file)._sym2id.keys()
77
+ self.phone_symbols.update(set(phone_symbol_dict_saved))
78
+
79
+ # save phone symbols
80
+ phone_symbol_dict = SymbolTable()
81
+ for s in sorted(list(self.phone_symbols)):
82
+ phone_symbol_dict.add(s)
83
+ phone_symbol_dict.to_file(self.phone_symbols_file)
84
+
85
+
86
+ def extract_utt_phone_sequence(cfg, metadata):
87
+ '''
88
+ Extract phone sequence from text
89
+ Args:
90
+ cfg: config
91
+ metadata: list of dict, each dict contains "Uid", "Text"
92
+
93
+ '''
94
+
95
+ dataset_name = cfg.dataset[0]
96
+
97
+ # output path
98
+ out_path = os.path.join(cfg.preprocess.processed_dir, dataset_name, cfg.preprocess.phone_dir)
99
+ os.makedirs(out_path, exist_ok=True)
100
+
101
+ phone_extractor = phoneExtractor(cfg, dataset_name)
102
+
103
+ for utt in tqdm(metadata):
104
+ uid = utt["Uid"]
105
+ text = utt["Text"]
106
+
107
+ phone_seq = phone_extractor.extract_phone(text)
108
+
109
+ phone_path = os.path.join(out_path, uid+'.phone')
110
+ with open(phone_path, 'w') as fout:
111
+ fout.write(' '.join(phone_seq))
112
+
113
+ if cfg.preprocess.phone_extractor != 'lexicon':
114
+ phone_extractor.save_dataset_phone_symbols_to_table()
115
+
116
+
117
+
118
+ def save_all_dataset_phone_symbols_to_table(self, cfg, dataset):
119
+ # phone symbols dict
120
+ phone_symbols = set()
121
+
122
+ for dataset_name in dataset:
123
+ phone_symbols_file = os.path.join(cfg.preprocess.processed_dir,
124
+ dataset_name,
125
+ cfg.preprocess.symbols_dict)
126
+
127
+ # load and merge saved phone symbols
128
+ assert os.path.exists(phone_symbols_file)
129
+ phone_symbol_dict_saved = SymbolTable.from_file(phone_symbols_file)._sym2id.keys()
130
+ phone_symbols.update(set(phone_symbol_dict_saved))
131
+
132
+ # save all phone symbols to each dataset
133
+ phone_symbol_dict = SymbolTable()
134
+ for s in sorted(list(phone_symbols)):
135
+ phone_symbol_dict.add(s)
136
+ for dataset_name in dataset:
137
+ phone_symbols_file = os.path.join(cfg.preprocess.processed_dir,
138
+ dataset_name,
139
+ cfg.preprocess.symbols_dict)
140
+ phone_symbol_dict.to_file(phone_symbols_file)
141
+
142
+
text/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+ import re
8
+ from text import cleaners
9
+ from text.symbols import symbols
10
+
11
+
12
+ # Mappings from symbol to numeric ID and vice versa:
13
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
14
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
15
+
16
+ # Regular expression matching text enclosed in curly braces:
17
+ _curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")
18
+
19
+
20
+ def text_to_sequence(text, cleaner_names):
21
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
22
+
23
+ The text can optionally have ARPAbet sequences enclosed in curly braces embedded
24
+ in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
25
+
26
+ Args:
27
+ text: string to convert to a sequence
28
+ cleaner_names: names of the cleaner functions to run the text through
29
+
30
+ Returns:
31
+ List of integers corresponding to the symbols in the text
32
+ """
33
+ sequence = []
34
+
35
+ # Check for curly braces and treat their contents as ARPAbet:
36
+ while len(text):
37
+ m = _curly_re.match(text)
38
+
39
+ if not m:
40
+ sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
41
+ break
42
+ sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
43
+ sequence += _arpabet_to_sequence(m.group(2))
44
+ text = m.group(3)
45
+ return sequence
46
+
47
+
48
+ def sequence_to_text(sequence):
49
+ """Converts a sequence of IDs back to a string"""
50
+ result = ""
51
+ for symbol_id in sequence:
52
+ if symbol_id in _id_to_symbol:
53
+ s = _id_to_symbol[symbol_id]
54
+ # Enclose ARPAbet back in curly braces:
55
+ if len(s) > 1 and s[0] == "@":
56
+ s = "{%s}" % s[1:]
57
+ result += s
58
+ return result.replace("}{", " ")
59
+
60
+
61
+ def _clean_text(text, cleaner_names):
62
+ for name in cleaner_names:
63
+ cleaner = getattr(cleaners, name)
64
+ if not cleaner:
65
+ raise Exception("Unknown cleaner: %s" % name)
66
+ text = cleaner(text)
67
+ return text
68
+
69
+
70
+ def _symbols_to_sequence(symbols):
71
+ return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]
72
+
73
+
74
+ def _arpabet_to_sequence(text):
75
+ return _symbols_to_sequence(["@" + s for s in text.split()])
76
+
77
+
78
+ def _should_keep_symbol(s):
79
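+ # Drop unknown symbols as well as the padding ("_") and EOS ("~") tokens.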
+ return s in _symbol_to_id and s != "_" and s != "~"
text/cleaners.py ADDED
@@ -0,0 +1,98 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ """
9
+ Cleaners are transformations that run over the input text at both training and eval time.
10
+
11
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
12
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
13
+ 1. "english_cleaners" for English text
14
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
15
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
16
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
17
+ the symbols in symbols.py to match your data).
18
+ """
19
+
20
+
21
+ # Regular expression matching whitespace:
22
+ import re
23
+ from unidecode import unidecode
24
+ from .numbers import normalize_numbers
25
+
26
+ _whitespace_re = re.compile(r"\s+")
27
+
28
+ # List of (regular expression, replacement) pairs for abbreviations:
29
+ _abbreviations = [
30
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
31
+ for x in [
32
+ ("mrs", "misess"),
33
+ ("mr", "mister"),
34
+ ("dr", "doctor"),
35
+ ("st", "saint"),
36
+ ("co", "company"),
37
+ ("jr", "junior"),
38
+ ("maj", "major"),
39
+ ("gen", "general"),
40
+ ("drs", "doctors"),
41
+ ("rev", "reverend"),
42
+ ("lt", "lieutenant"),
43
+ ("hon", "honorable"),
44
+ ("sgt", "sergeant"),
45
+ ("capt", "captain"),
46
+ ("esq", "esquire"),
47
+ ("ltd", "limited"),
48
+ ("col", "colonel"),
49
+ ("ft", "fort"),
50
+ ]
51
+ ]
52
+
53
+
54
+ def expand_abbreviations(text):
55
+ for regex, replacement in _abbreviations:
56
+ text = re.sub(regex, replacement, text)
57
+ return text
58
+
59
+
60
+ def expand_numbers(text):
61
+ return normalize_numbers(text)
62
+
63
+
64
+ def lowercase(text):
65
+ return text.lower()
66
+
67
+
68
+ def collapse_whitespace(text):
69
+ return re.sub(_whitespace_re, " ", text)
70
+
71
+
72
+ def convert_to_ascii(text):
73
+ return unidecode(text)
74
+
75
+
76
+ def basic_cleaners(text):
77
+ """Basic pipeline that lowercases and collapses whitespace without transliteration."""
78
+ text = lowercase(text)
79
+ text = collapse_whitespace(text)
80
+ return text
81
+
82
+
83
+ def transliteration_cleaners(text):
84
+ """Pipeline for non-English text that transliterates to ASCII."""
85
+ text = convert_to_ascii(text)
86
+ text = lowercase(text)
87
+ text = collapse_whitespace(text)
88
+ return text
89
+
90
+
91
+ def english_cleaners(text):
92
+ """Pipeline for English text, including number and abbreviation expansion."""
93
+ text = convert_to_ascii(text)
94
+ text = lowercase(text)
95
+ text = expand_numbers(text)
96
+ text = expand_abbreviations(text)
97
+ text = collapse_whitespace(text)
98
+ return text
text/cmudict.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ import re
9
+
10
+
11
+ valid_symbols = [
12
+ "AA",
13
+ "AA0",
14
+ "AA1",
15
+ "AA2",
16
+ "AE",
17
+ "AE0",
18
+ "AE1",
19
+ "AE2",
20
+ "AH",
21
+ "AH0",
22
+ "AH1",
23
+ "AH2",
24
+ "AO",
25
+ "AO0",
26
+ "AO1",
27
+ "AO2",
28
+ "AW",
29
+ "AW0",
30
+ "AW1",
31
+ "AW2",
32
+ "AY",
33
+ "AY0",
34
+ "AY1",
35
+ "AY2",
36
+ "B",
37
+ "CH",
38
+ "D",
39
+ "DH",
40
+ "EH",
41
+ "EH0",
42
+ "EH1",
43
+ "EH2",
44
+ "ER",
45
+ "ER0",
46
+ "ER1",
47
+ "ER2",
48
+ "EY",
49
+ "EY0",
50
+ "EY1",
51
+ "EY2",
52
+ "F",
53
+ "G",
54
+ "HH",
55
+ "IH",
56
+ "IH0",
57
+ "IH1",
58
+ "IH2",
59
+ "IY",
60
+ "IY0",
61
+ "IY1",
62
+ "IY2",
63
+ "JH",
64
+ "K",
65
+ "L",
66
+ "M",
67
+ "N",
68
+ "NG",
69
+ "OW",
70
+ "OW0",
71
+ "OW1",
72
+ "OW2",
73
+ "OY",
74
+ "OY0",
75
+ "OY1",
76
+ "OY2",
77
+ "P",
78
+ "R",
79
+ "S",
80
+ "SH",
81
+ "T",
82
+ "TH",
83
+ "UH",
84
+ "UH0",
85
+ "UH1",
86
+ "UH2",
87
+ "UW",
88
+ "UW0",
89
+ "UW1",
90
+ "UW2",
91
+ "V",
92
+ "W",
93
+ "Y",
94
+ "Z",
95
+ "ZH",
96
+ ]
97
+
98
+ _valid_symbol_set = set(valid_symbols)
99
+
100
+
101
+ class CMUDict:
102
+ """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict"""
103
+
104
+ def __init__(self, file_or_path, keep_ambiguous=True):
105
+ if isinstance(file_or_path, str):
106
+ with open(file_or_path, encoding="latin-1") as f:
107
+ entries = _parse_cmudict(f)
108
+ else:
109
+ entries = _parse_cmudict(file_or_path)
110
+ if not keep_ambiguous:
111
+ entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
112
+ self._entries = entries
113
+
114
+ def __len__(self):
115
+ return len(self._entries)
116
+
117
+ def lookup(self, word):
118
+ """Returns list of ARPAbet pronunciations of the given word."""
119
+ return self._entries.get(word.upper())
120
+
121
+
122
+ _alt_re = re.compile(r"\([0-9]+\)")
123
+
124
+
125
+ def _parse_cmudict(file):
126
+ cmudict = {}
127
+ for line in file:
128
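+ # Dictionary entries start with an uppercase letter or an apostrophe; other lines are skipped.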
+ if len(line) and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"):
129
+ parts = line.split(" ")
130
+ word = re.sub(_alt_re, "", parts[0])
131
+ pronunciation = _get_pronunciation(parts[1])
132
+ if pronunciation:
133
+ if word in cmudict:
134
+ cmudict[word].append(pronunciation)
135
+ else:
136
+ cmudict[word] = [pronunciation]
137
+ return cmudict
138
+
139
+
140
+ def _get_pronunciation(s):
141
+ parts = s.strip().split(" ")
142
+ for part in parts:
143
+ if part not in _valid_symbol_set:
144
+ return None
145
+ return " ".join(parts)
text/g2p.py ADDED
@@ -0,0 +1,38 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import re
7
+ from g2p_en import G2p
8
+ from string import punctuation
9
+
10
+
11
+ def read_lexicon(lex_path):
12
+ lexicon = {}
13
+ with open(lex_path) as f:
14
+ for line in f:
15
+ temp = re.split(r"\s+", line.strip("\n"))
16
+ word = temp[0]
17
+ phones = temp[1:]
18
+ if word.lower() not in lexicon:
19
+ lexicon[word.lower()] = phones
20
+ return lexicon
21
+
22
+
23
+ def preprocess_english(text, lexicon):
24
+ text = text.rstrip(punctuation)
25
+
26
+ g2p = G2p()
27
+ phones = []
28
+ words = re.split(r"([,;.\-\?\!\s+])", text)
29
+ for w in words:
30
+ if w.lower() in lexicon:
31
+ phones += lexicon[w.lower()]
32
+ else:
33
+ phones += list(filter(lambda p: p != " ", g2p(w)))
34
+ phones = "}{".join(phones)
35
+ phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
36
+ phones = phones.replace("}{", " ")
37
+
38
+ return phones
text/g2p_module.py ADDED
@@ -0,0 +1,221 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import re
8
+ from g2p_en import G2p
9
+ from string import punctuation
10
+ from typing import Any, Dict, List, Optional, Pattern, Union
11
+
12
+ from phonemizer.backend import EspeakBackend
13
+ from phonemizer.backend.espeak.language_switch import LanguageSwitch
14
+ from phonemizer.backend.espeak.words_mismatch import WordMismatch
15
+ from phonemizer.punctuation import Punctuation
16
+ from phonemizer.separator import Separator
17
+
18
+ try:
19
+ from pypinyin import Style, pinyin
20
+ from pypinyin.style._utils import get_finals, get_initials
21
+ except Exception:
22
+ pass
23
+
24
+
25
+ # This code is modified from
26
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py
27
+
28
+ class PypinyinBackend:
29
+ """PypinyinBackend for Chinese. Most codes is referenced from espnet.
30
+ There are two modes, pinyin or initials_finals: one produces
31
+ just like "ni1 hao3", the other is like "n i1 h ao3".
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ backend="initials_finals",
37
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
38
+ ) -> None:
39
+ self.backend = backend
40
+ self.punctuation_marks = punctuation_marks
41
+
42
+ def phonemize(
43
+ self, text: List[str], separator: Separator, strip=True, njobs=1
44
+ ) -> List[str]:
45
+ assert isinstance(text, List)
46
+ phonemized = []
47
+ for _text in text:
48
+ _text = re.sub(" +", " ", _text.strip())
49
+ _text = _text.replace(" ", separator.word)
50
+ phones = []
51
+ if self.backend == "pypinyin":
52
+ for n, py in enumerate(
53
+ pinyin(
54
+ _text, style=Style.TONE3, neutral_tone_with_five=True
55
+ )
56
+ ):
57
+ if all([c in self.punctuation_marks for c in py[0]]):
58
+ if len(phones):
59
+ assert phones[-1] == separator.syllable
60
+ phones.pop(-1)
61
+
62
+ phones.extend(list(py[0]))
63
+ else:
64
+ phones.extend([py[0], separator.syllable])
65
+ elif self.backend == "pypinyin_initials_finals":
66
+ for n, py in enumerate(
67
+ pinyin(
68
+ _text, style=Style.TONE3, neutral_tone_with_five=True
69
+ )
70
+ ):
71
+ if all([c in self.punctuation_marks for c in py[0]]):
72
+ if len(phones):
73
+ assert phones[-1] == separator.syllable
74
+ phones.pop(-1)
75
+ phones.extend(list(py[0]))
76
+ else:
77
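+ # Split each alphanumeric pinyin syllable into initial and final; the tone digit stays on the final.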
+ if py[0][-1].isalnum():
78
+ initial = get_initials(py[0], strict=False)
79
+ if py[0][-1].isdigit():
80
+ final = (
81
+ get_finals(py[0][:-1], strict=False)
82
+ + py[0][-1]
83
+ )
84
+ else:
85
+ final = get_finals(py[0], strict=False)
86
+ phones.extend(
87
+ [
88
+ initial,
89
+ separator.phone,
90
+ final,
91
+ separator.syllable,
92
+ ]
93
+ )
94
+ else:
95
+ assert ValueError
96
+ else:
97
+ raise NotImplementedError
98
+ phonemized.append(
99
+ "".join(phones).rstrip(f"{separator.word}{separator.syllable}")
100
+ )
101
+ return phonemized
102
+
103
+
104
+ class G2PModule:
105
+ """Phonemize Text."""
106
+
107
+ def __init__(
108
+ self,
109
+ language="en-us",
110
+ backend="espeak",
111
+ separator=Separator(word="_", syllable="-", phone="|"),
112
+ preserve_punctuation=True,
113
+ punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
114
+ with_stress: bool = False,
115
+ tie: Union[bool, str] = False,
116
+ language_switch: LanguageSwitch = "keep-flags",
117
+ words_mismatch: WordMismatch = "ignore",
118
+ ) -> None:
119
+
120
+ self.backend = self._initialize_backend(
121
+ backend, language, punctuation_marks, preserve_punctuation,
122
+ with_stress, tie, language_switch, words_mismatch
123
+ )
124
+ self.separator = separator
125
+
126
+ def _initialize_backend(
127
+ self, backend, language, punctuation_marks, preserve_punctuation,
128
+ with_stress, tie, language_switch, words_mismatch
129
+ ):
130
+ if backend == "espeak":
131
+ return EspeakBackend(
132
+ language,
133
+ punctuation_marks=punctuation_marks,
134
+ preserve_punctuation=preserve_punctuation,
135
+ with_stress=with_stress,
136
+ tie=tie,
137
+ language_switch=language_switch,
138
+ words_mismatch=words_mismatch,
139
+ )
140
+ elif backend in ["pypinyin", "pypinyin_initials_finals"]:
141
+ return PypinyinBackend(
142
+ backend=backend,
143
+ punctuation_marks=punctuation_marks + self.separator.word,
144
+ )
145
+ else:
146
+ raise NotImplementedError(f"{backend}")
147
+
148
+ def to_list(self, phonemized: str) -> List[str]:
149
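+ # Split a phonemized string into phone tokens, keeping the word separator as its own token.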
+ fields = []
150
+ for word in phonemized.split(self.separator.word):
151
+ pp = re.findall(r"\w+|[^\w\s]", word, re.UNICODE)
152
+ fields.extend(
153
+ [p for p in pp if p != self.separator.phone]
154
+ + [self.separator.word]
155
+ )
156
+ assert len("".join(fields[:-1])) == len(phonemized) - phonemized.count(
157
+ self.separator.phone
158
+ )
159
+ return fields[:-1]
160
+
161
+
162
+ def phonemization(self, text, strip=True) -> List[List[str]]:
163
+ if isinstance(text, str):
164
+ text = [text]
165
+
166
+ phonemized = self.backend.phonemize(
167
+ text, separator=self.separator, strip=strip, njobs=1
168
+ )
169
+ phonemes = [self.to_list(p) for p in phonemized]
170
+ return phonemes
171
+
172
+ def g2p_conversion(self, text: str) -> List[str]:
173
+ phonemes = self.phonemization([text.strip()])
174
+ return phonemes[0]
175
+
176
+
177
+ class LexiconModule:
178
+ def __init__(self, lex_path, language="en-us") -> None:
179
+
180
+ # todo: check lexicon derivation, merge with G2PModule?
181
+ lexicon = {}
182
+ with open(lex_path) as f:
183
+ for line in f:
184
+ temp = re.split(r"\s+", line.strip("\n"))
185
+ word = temp[0]
186
+ phones = temp[1:]
187
+ if word.lower() not in lexicon:
188
+ lexicon[word.lower()] = phones
189
+ self.lexicon = lexicon
190
+ self.language = language
191
+
192
+ def g2p_conversion(self, text):
193
+ phone = None
194
+
195
+ # todo: preprocess with other languages
196
+ if self.language == 'en-us':
197
+ phone = self.preprocess_english(text)
198
+ else:
199
+ print('No support for', self.language)
200
+ raise NotImplementedError
201
+
202
+ return phone
203
+
204
+
205
+ def preprocess_english(self, text):
206
+ text = text.rstrip(punctuation)
207
+
208
+ g2p = G2p()
209
+ phones = []
210
+ words = re.split(r"([,;.\-\?\!\s+])", text)
211
+ for w in words:
212
+ if w.lower() in self.lexicon:
213
+ phones += self.lexicon[w.lower()]
214
+ else:
215
+ phones += list(filter(lambda p: p != " ", g2p(w)))
216
+ phones = "}{".join(phones)
217
+ phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
218
+ phones = phones.replace("}{", " ")
219
+
220
+
221
+ return phones
text/lexicon/librispeech-lexicon.txt ADDED
The diff for this file is too large to render. See raw diff
 
text/lexicon/pinyin-lexicon-r.txt ADDED
@@ -0,0 +1,4120 @@
1
+ a1 a1
2
+ a2 a2
3
+ a3 a3
4
+ a4 a4
5
+ a5 a5
6
+ ai1 ai1
7
+ ai2 ai2
8
+ ai3 ai3
9
+ ai4 ai4
10
+ ai5 ai5
11
+ an1 an1
12
+ an2 an2
13
+ an3 an3
14
+ an4 an4
15
+ an5 an5
16
+ ang1 ang1
17
+ ang2 ang2
18
+ ang3 ang3
19
+ ang4 ang4
20
+ ang5 ang5
21
+ ao1 ao1
22
+ ao2 ao2
23
+ ao3 ao3
24
+ ao4 ao4
25
+ ao5 ao5
26
+ ba1 b a1
27
+ ba2 b a2
28
+ ba3 b a3
29
+ ba4 b a4
30
+ ba5 b a5
31
+ bai1 b ai1
32
+ bai2 b ai2
33
+ bai3 b ai3
34
+ bai4 b ai4
35
+ bai5 b ai5
36
+ ban1 b an1
37
+ ban2 b an2
38
+ ban3 b an3
39
+ ban4 b an4
40
+ ban5 b an5
41
+ bang1 b ang1
42
+ bang2 b ang2
43
+ bang3 b ang3
44
+ bang4 b ang4
45
+ bang5 b ang5
46
+ bao1 b ao1
47
+ bao2 b ao2
48
+ bao3 b ao3
49
+ bao4 b ao4
50
+ bao5 b ao5
51
+ bei1 b ei1
52
+ bei2 b ei2
53
+ bei3 b ei3
54
+ bei4 b ei4
55
+ bei5 b ei5
56
+ ben1 b en1
57
+ ben2 b en2
58
+ ben3 b en3
59
+ ben4 b en4
60
+ ben5 b en5
61
+ beng1 b eng1
62
+ beng2 b eng2
63
+ beng3 b eng3
64
+ beng4 b eng4
65
+ beng5 b eng5
66
+ bi1 b i1
67
+ bi2 b i2
68
+ bi3 b i3
69
+ bi4 b i4
70
+ bi5 b i5
71
+ bian1 b ian1
72
+ bian2 b ian2
73
+ bian3 b ian3
74
+ bian4 b ian4
75
+ bian5 b ian5
76
+ biao1 b iao1
77
+ biao2 b iao2
78
+ biao3 b iao3
79
+ biao4 b iao4
80
+ biao5 b iao5
81
+ bie1 b ie1
82
+ bie2 b ie2
83
+ bie3 b ie3
84
+ bie4 b ie4
85
+ bie5 b ie5
86
+ bin1 b in1
87
+ bin2 b in2
88
+ bin3 b in3
89
+ bin4 b in4
90
+ bin5 b in5
91
+ bing1 b ing1
92
+ bing2 b ing2
93
+ bing3 b ing3
94
+ bing4 b ing4
95
+ bing5 b ing5
96
+ bo1 b o1
97
+ bo2 b o2
98
+ bo3 b o3
99
+ bo4 b o4
100
+ bo5 b o5
101
+ bu1 b u1
102
+ bu2 b u2
103
+ bu3 b u3
104
+ bu4 b u4
105
+ bu5 b u5
106
+ ca1 c a1
107
+ ca2 c a2
108
+ ca3 c a3
109
+ ca4 c a4
110
+ ca5 c a5
111
+ cai1 c ai1
112
+ cai2 c ai2
113
+ cai3 c ai3
114
+ cai4 c ai4
115
+ cai5 c ai5
116
+ can1 c an1
117
+ can2 c an2
118
+ can3 c an3
119
+ can4 c an4
120
+ can5 c an5
121
+ cang1 c ang1
122
+ cang2 c ang2
123
+ cang3 c ang3
124
+ cang4 c ang4
125
+ cang5 c ang5
126
+ cao1 c ao1
127
+ cao2 c ao2
128
+ cao3 c ao3
129
+ cao4 c ao4
130
+ cao5 c ao5
131
+ ce1 c e1
132
+ ce2 c e2
133
+ ce3 c e3
134
+ ce4 c e4
135
+ ce5 c e5
136
+ cen1 c en1
137
+ cen2 c en2
138
+ cen3 c en3
139
+ cen4 c en4
140
+ cen5 c en5
141
+ ceng1 c eng1
142
+ ceng2 c eng2
143
+ ceng3 c eng3
144
+ ceng4 c eng4
145
+ ceng5 c eng5
146
+ cha1 ch a1
147
+ cha2 ch a2
148
+ cha3 ch a3
149
+ cha4 ch a4
150
+ cha5 ch a5
151
+ chai1 ch ai1
152
+ chai2 ch ai2
153
+ chai3 ch ai3
154
+ chai4 ch ai4
155
+ chai5 ch ai5
156
+ chan1 ch an1
157
+ chan2 ch an2
158
+ chan3 ch an3
159
+ chan4 ch an4
160
+ chan5 ch an5
161
+ chang1 ch ang1
162
+ chang2 ch ang2
163
+ chang3 ch ang3
164
+ chang4 ch ang4
165
+ chang5 ch ang5
166
+ chao1 ch ao1
167
+ chao2 ch ao2
168
+ chao3 ch ao3
169
+ chao4 ch ao4
170
+ chao5 ch ao5
171
+ che1 ch e1
172
+ che2 ch e2
173
+ che3 ch e3
174
+ che4 ch e4
175
+ che5 ch e5
176
+ chen1 ch en1
177
+ chen2 ch en2
178
+ chen3 ch en3
179
+ chen4 ch en4
180
+ chen5 ch en5
181
+ cheng1 ch eng1
182
+ cheng2 ch eng2
183
+ cheng3 ch eng3
184
+ cheng4 ch eng4
185
+ cheng5 ch eng5
186
+ chi1 ch iii1
187
+ chi2 ch iii2
188
+ chi3 ch iii3
189
+ chi4 ch iii4
190
+ chi5 ch iii5
191
+ chong1 ch ong1
192
+ chong2 ch ong2
193
+ chong3 ch ong3
194
+ chong4 ch ong4
195
+ chong5 ch ong5
196
+ chou1 ch ou1
197
+ chou2 ch ou2
198
+ chou3 ch ou3
199
+ chou4 ch ou4
200
+ chou5 ch ou5
201
+ chu1 ch u1
202
+ chu2 ch u2
203
+ chu3 ch u3
204
+ chu4 ch u4
205
+ chu5 ch u5
206
+ chuai1 ch uai1
207
+ chuai2 ch uai2
208
+ chuai3 ch uai3
209
+ chuai4 ch uai4
210
+ chuai5 ch uai5
211
+ chuan1 ch uan1
212
+ chuan2 ch uan2
213
+ chuan3 ch uan3
214
+ chuan4 ch uan4
215
+ chuan5 ch uan5
216
+ chuang1 ch uang1
217
+ chuang2 ch uang2
218
+ chuang3 ch uang3
219
+ chuang4 ch uang4
220
+ chuang5 ch uang5
221
+ chui1 ch uei1
222
+ chui2 ch uei2
223
+ chui3 ch uei3
224
+ chui4 ch uei4
225
+ chui5 ch uei5
226
+ chun1 ch uen1
227
+ chun2 ch uen2
228
+ chun3 ch uen3
229
+ chun4 ch uen4
230
+ chun5 ch uen5
231
+ chuo1 ch uo1
232
+ chuo2 ch uo2
233
+ chuo3 ch uo3
234
+ chuo4 ch uo4
235
+ chuo5 ch uo5
236
+ ci1 c ii1
237
+ ci2 c ii2
238
+ ci3 c ii3
239
+ ci4 c ii4
240
+ ci5 c ii5
241
+ cong1 c ong1
242
+ cong2 c ong2
243
+ cong3 c ong3
244
+ cong4 c ong4
245
+ cong5 c ong5
246
+ cou1 c ou1
247
+ cou2 c ou2
248
+ cou3 c ou3
249
+ cou4 c ou4
250
+ cou5 c ou5
251
+ cu1 c u1
252
+ cu2 c u2
253
+ cu3 c u3
254
+ cu4 c u4
255
+ cu5 c u5
256
+ cuan1 c uan1
257
+ cuan2 c uan2
258
+ cuan3 c uan3
259
+ cuan4 c uan4
260
+ cuan5 c uan5
261
+ cui1 c uei1
262
+ cui2 c uei2
263
+ cui3 c uei3
264
+ cui4 c uei4
265
+ cui5 c uei5
266
+ cun1 c uen1
267
+ cun2 c uen2
268
+ cun3 c uen3
269
+ cun4 c uen4
270
+ cun5 c uen5
271
+ cuo1 c uo1
272
+ cuo2 c uo2
273
+ cuo3 c uo3
274
+ cuo4 c uo4
275
+ cuo5 c uo5
276
+ da1 d a1
277
+ da2 d a2
278
+ da3 d a3
279
+ da4 d a4
280
+ da5 d a5
281
+ dai1 d ai1
282
+ dai2 d ai2
283
+ dai3 d ai3
284
+ dai4 d ai4
285
+ dai5 d ai5
286
+ dan1 d an1
287
+ dan2 d an2
288
+ dan3 d an3
289
+ dan4 d an4
290
+ dan5 d an5
291
+ dang1 d ang1
292
+ dang2 d ang2
293
+ dang3 d ang3
294
+ dang4 d ang4
295
+ dang5 d ang5
296
+ dao1 d ao1
297
+ dao2 d ao2
298
+ dao3 d ao3
299
+ dao4 d ao4
300
+ dao5 d ao5
301
+ de1 d e1
302
+ de2 d e2
303
+ de3 d e3
304
+ de4 d e4
305
+ de5 d e5
306
+ dei1 d ei1
307
+ dei2 d ei2
308
+ dei3 d ei3
309
+ dei4 d ei4
310
+ dei5 d ei5
311
+ den1 d en1
312
+ den2 d en2
313
+ den3 d en3
314
+ den4 d en4
315
+ den5 d en5
316
+ deng1 d eng1
317
+ deng2 d eng2
318
+ deng3 d eng3
319
+ deng4 d eng4
320
+ deng5 d eng5
321
+ di1 d i1
322
+ di2 d i2
323
+ di3 d i3
324
+ di4 d i4
325
+ di5 d i5
326
+ dia1 d ia1
327
+ dia2 d ia2
328
+ dia3 d ia3
329
+ dia4 d ia4
330
+ dia5 d ia5
331
+ dian1 d ian1
332
+ dian2 d ian2
333
+ dian3 d ian3
334
+ dian4 d ian4
335
+ dian5 d ian5
336
+ diao1 d iao1
337
+ diao2 d iao2
338
+ diao3 d iao3
339
+ diao4 d iao4
340
+ diao5 d iao5
341
+ die1 d ie1
342
+ die2 d ie2
343
+ die3 d ie3
344
+ die4 d ie4
345
+ die5 d ie5
346
+ ding1 d ing1
347
+ ding2 d ing2
348
+ ding3 d ing3
349
+ ding4 d ing4
350
+ ding5 d ing5
351
+ diu1 d iou1
352
+ diu2 d iou2
353
+ diu3 d iou3
354
+ diu4 d iou4
355
+ diu5 d iou5
356
+ dong1 d ong1
357
+ dong2 d ong2
358
+ dong3 d ong3
359
+ dong4 d ong4
360
+ dong5 d ong5
361
+ dou1 d ou1
362
+ dou2 d ou2
363
+ dou3 d ou3
364
+ dou4 d ou4
365
+ dou5 d ou5
366
+ du1 d u1
367
+ du2 d u2
368
+ du3 d u3
369
+ du4 d u4
370
+ du5 d u5
371
+ duan1 d uan1
372
+ duan2 d uan2
373
+ duan3 d uan3
374
+ duan4 d uan4
375
+ duan5 d uan5
376
+ dui1 d uei1
377
+ dui2 d uei2
378
+ dui3 d uei3
379
+ dui4 d uei4
380
+ dui5 d uei5
381
+ dun1 d uen1
382
+ dun2 d uen2
383
+ dun3 d uen3
384
+ dun4 d uen4
385
+ dun5 d uen5
386
+ duo1 d uo1
387
+ duo2 d uo2
388
+ duo3 d uo3
389
+ duo4 d uo4
390
+ duo5 d uo5
391
+ e1 e1
392
+ e2 e2
393
+ e3 e3
394
+ e4 e4
395
+ e5 e5
396
+ ei1 ei1
397
+ ei2 ei2
398
+ ei3 ei3
399
+ ei4 ei4
400
+ ei5 ei5
401
+ en1 en1
402
+ en2 en2
403
+ en3 en3
404
+ en4 en4
405
+ en5 en5
406
+ eng1 eng1
407
+ eng2 eng2
408
+ eng3 eng3
409
+ eng4 eng4
410
+ eng5 eng5
411
+ r1 er1
412
+ r2 er2
413
+ r3 er3
414
+ r4 er4
415
+ r5 er5
416
+ er1 er1
417
+ er2 er2
418
+ er3 er3
419
+ er4 er4
420
+ er5 er5
421
+ fa1 f a1
422
+ fa2 f a2
423
+ fa3 f a3
424
+ fa4 f a4
425
+ fa5 f a5
426
+ fan1 f an1
427
+ fan2 f an2
428
+ fan3 f an3
429
+ fan4 f an4
430
+ fan5 f an5
431
+ fang1 f ang1
432
+ fang2 f ang2
433
+ fang3 f ang3
434
+ fang4 f ang4
435
+ fang5 f ang5
436
+ fei1 f ei1
437
+ fei2 f ei2
438
+ fei3 f ei3
439
+ fei4 f ei4
440
+ fei5 f ei5
441
+ fen1 f en1
442
+ fen2 f en2
443
+ fen3 f en3
444
+ fen4 f en4
445
+ fen5 f en5
446
+ feng1 f eng1
447
+ feng2 f eng2
448
+ feng3 f eng3
449
+ feng4 f eng4
450
+ feng5 f eng5
451
+ fo1 f o1
452
+ fo2 f o2
453
+ fo3 f o3
454
+ fo4 f o4
455
+ fo5 f o5
456
+ fou1 f ou1
457
+ fou2 f ou2
458
+ fou3 f ou3
459
+ fou4 f ou4
460
+ fou5 f ou5
461
+ fu1 f u1
462
+ fu2 f u2
463
+ fu3 f u3
464
+ fu4 f u4
465
+ fu5 f u5
466
+ ga1 g a1
467
+ ga2 g a2
468
+ ga3 g a3
469
+ ga4 g a4
470
+ ga5 g a5
471
+ gai1 g ai1
472
+ gai2 g ai2
473
+ gai3 g ai3
474
+ gai4 g ai4
475
+ gai5 g ai5
476
+ gan1 g an1
477
+ gan2 g an2
478
+ gan3 g an3
479
+ gan4 g an4
480
+ gan5 g an5
481
+ gang1 g ang1
482
+ gang2 g ang2
483
+ gang3 g ang3
484
+ gang4 g ang4
485
+ gang5 g ang5
486
+ gao1 g ao1
487
+ gao2 g ao2
488
+ gao3 g ao3
489
+ gao4 g ao4
490
+ gao5 g ao5
491
+ ge1 g e1
492
+ ge2 g e2
493
+ ge3 g e3
494
+ ge4 g e4
495
+ ge5 g e5
496
+ gei1 g ei1
497
+ gei2 g ei2
498
+ gei3 g ei3
499
+ gei4 g ei4
500
+ gei5 g ei5
501
+ gen1 g en1
502
+ gen2 g en2
503
+ gen3 g en3
504
+ gen4 g en4
505
+ gen5 g en5
506
+ geng1 g eng1
507
+ geng2 g eng2
508
+ geng3 g eng3
509
+ geng4 g eng4
510
+ geng5 g eng5
511
+ gong1 g ong1
512
+ gong2 g ong2
513
+ gong3 g ong3
514
+ gong4 g ong4
515
+ gong5 g ong5
516
+ gou1 g ou1
517
+ gou2 g ou2
518
+ gou3 g ou3
519
+ gou4 g ou4
520
+ gou5 g ou5
521
+ gu1 g u1
522
+ gu2 g u2
523
+ gu3 g u3
524
+ gu4 g u4
525
+ gu5 g u5
526
+ gua1 g ua1
527
+ gua2 g ua2
528
+ gua3 g ua3
529
+ gua4 g ua4
530
+ gua5 g ua5
531
+ guai1 g uai1
532
+ guai2 g uai2
533
+ guai3 g uai3
534
+ guai4 g uai4
535
+ guai5 g uai5
536
+ guan1 g uan1
537
+ guan2 g uan2
538
+ guan3 g uan3
539
+ guan4 g uan4
540
+ guan5 g uan5
541
+ guang1 g uang1
542
+ guang2 g uang2
543
+ guang3 g uang3
544
+ guang4 g uang4
545
+ guang5 g uang5
546
+ gui1 g uei1
547
+ gui2 g uei2
548
+ gui3 g uei3
549
+ gui4 g uei4
550
+ gui5 g uei5
551
+ gun1 g uen1
552
+ gun2 g uen2
553
+ gun3 g uen3
554
+ gun4 g uen4
555
+ gun5 g uen5
556
+ guo1 g uo1
557
+ guo2 g uo2
558
+ guo3 g uo3
559
+ guo4 g uo4
560
+ guo5 g uo5
561
+ ha1 h a1
562
+ ha2 h a2
563
+ ha3 h a3
564
+ ha4 h a4
565
+ ha5 h a5
566
+ hai1 h ai1
567
+ hai2 h ai2
568
+ hai3 h ai3
569
+ hai4 h ai4
570
+ hai5 h ai5
571
+ han1 h an1
572
+ han2 h an2
573
+ han3 h an3
574
+ han4 h an4
575
+ han5 h an5
576
+ hang1 h ang1
577
+ hang2 h ang2
578
+ hang3 h ang3
579
+ hang4 h ang4
580
+ hang5 h ang5
581
+ hao1 h ao1
582
+ hao2 h ao2
583
+ hao3 h ao3
584
+ hao4 h ao4
585
+ hao5 h ao5
586
+ he1 h e1
587
+ he2 h e2
588
+ he3 h e3
589
+ he4 h e4
590
+ he5 h e5
591
+ hei1 h ei1
592
+ hei2 h ei2
593
+ hei3 h ei3
594
+ hei4 h ei4
595
+ hei5 h ei5
596
+ hen1 h en1
597
+ hen2 h en2
598
+ hen3 h en3
599
+ hen4 h en4
600
+ hen5 h en5
601
+ heng1 h eng1
602
+ heng2 h eng2
603
+ heng3 h eng3
604
+ heng4 h eng4
605
+ heng5 h eng5
606
+ hong1 h ong1
607
+ hong2 h ong2
608
+ hong3 h ong3
609
+ hong4 h ong4
610
+ hong5 h ong5
611
+ hou1 h ou1
612
+ hou2 h ou2
613
+ hou3 h ou3
614
+ hou4 h ou4
615
+ hou5 h ou5
616
+ hu1 h u1
617
+ hu2 h u2
618
+ hu3 h u3
619
+ hu4 h u4
620
+ hu5 h u5
621
+ hua1 h ua1
622
+ hua2 h ua2
623
+ hua3 h ua3
624
+ hua4 h ua4
625
+ hua5 h ua5
626
+ huai1 h uai1
627
+ huai2 h uai2
628
+ huai3 h uai3
629
+ huai4 h uai4
630
+ huai5 h uai5
631
+ huan1 h uan1
632
+ huan2 h uan2
633
+ huan3 h uan3
634
+ huan4 h uan4
635
+ huan5 h uan5
636
+ huang1 h uang1
637
+ huang2 h uang2
638
+ huang3 h uang3
639
+ huang4 h uang4
640
+ huang5 h uang5
641
+ hui1 h uei1
642
+ hui2 h uei2
643
+ hui3 h uei3
644
+ hui4 h uei4
645
+ hui5 h uei5
646
+ hun1 h uen1
647
+ hun2 h uen2
648
+ hun3 h uen3
649
+ hun4 h uen4
650
+ hun5 h uen5
651
+ huo1 h uo1
652
+ huo2 h uo2
653
+ huo3 h uo3
654
+ huo4 h uo4
655
+ huo5 h uo5
656
+ ji1 j i1
657
+ ji2 j i2
658
+ ji3 j i3
659
+ ji4 j i4
660
+ ji5 j i5
661
+ jia1 j ia1
662
+ jia2 j ia2
663
+ jia3 j ia3
664
+ jia4 j ia4
665
+ jia5 j ia5
666
+ jian1 j ian1
667
+ jian2 j ian2
668
+ jian3 j ian3
669
+ jian4 j ian4
670
+ jian5 j ian5
671
+ jiang1 j iang1
672
+ jiang2 j iang2
673
+ jiang3 j iang3
674
+ jiang4 j iang4
675
+ jiang5 j iang5
676
+ jiao1 j iao1
677
+ jiao2 j iao2
678
+ jiao3 j iao3
679
+ jiao4 j iao4
680
+ jiao5 j iao5
681
+ jie1 j ie1
682
+ jie2 j ie2
683
+ jie3 j ie3
684
+ jie4 j ie4
685
+ jie5 j ie5
686
+ jin1 j in1
687
+ jin2 j in2
688
+ jin3 j in3
689
+ jin4 j in4
690
+ jin5 j in5
691
+ jing1 j ing1
692
+ jing2 j ing2
693
+ jing3 j ing3
694
+ jing4 j ing4
695
+ jing5 j ing5
696
+ jiong1 j iong1
697
+ jiong2 j iong2
698
+ jiong3 j iong3
699
+ jiong4 j iong4
700
+ jiong5 j iong5
701
+ jiu1 j iou1
702
+ jiu2 j iou2
703
+ jiu3 j iou3
704
+ jiu4 j iou4
705
+ jiu5 j iou5
706
+ ju1 j v1
707
+ ju2 j v2
708
+ ju3 j v3
709
+ ju4 j v4
710
+ ju5 j v5
711
+ juan1 j van1
712
+ juan2 j van2
713
+ juan3 j van3
714
+ juan4 j van4
715
+ juan5 j van5
716
+ jue1 j ve1
717
+ jue2 j ve2
718
+ jue3 j ve3
719
+ jue4 j ve4
720
+ jue5 j ve5
721
+ jun1 j vn1
722
+ jun2 j vn2
723
+ jun3 j vn3
724
+ jun4 j vn4
725
+ jun5 j vn5
726
+ ka1 k a1
727
+ ka2 k a2
728
+ ka3 k a3
729
+ ka4 k a4
730
+ ka5 k a5
731
+ kai1 k ai1
732
+ kai2 k ai2
733
+ kai3 k ai3
734
+ kai4 k ai4
735
+ kai5 k ai5
736
+ kan1 k an1
737
+ kan2 k an2
738
+ kan3 k an3
739
+ kan4 k an4
740
+ kan5 k an5
741
+ kang1 k ang1
742
+ kang2 k ang2
743
+ kang3 k ang3
744
+ kang4 k ang4
745
+ kang5 k ang5
746
+ kao1 k ao1
747
+ kao2 k ao2
748
+ kao3 k ao3
749
+ kao4 k ao4
750
+ kao5 k ao5
751
+ ke1 k e1
752
+ ke2 k e2
753
+ ke3 k e3
754
+ ke4 k e4
755
+ ke5 k e5
756
+ kei1 k ei1
757
+ kei2 k ei2
758
+ kei3 k ei3
759
+ kei4 k ei4
760
+ kei5 k ei5
761
+ ken1 k en1
762
+ ken2 k en2
763
+ ken3 k en3
764
+ ken4 k en4
765
+ ken5 k en5
766
+ keng1 k eng1
767
+ keng2 k eng2
768
+ keng3 k eng3
769
+ keng4 k eng4
770
+ keng5 k eng5
771
+ kong1 k ong1
772
+ kong2 k ong2
773
+ kong3 k ong3
774
+ kong4 k ong4
775
+ kong5 k ong5
776
+ kou1 k ou1
777
+ kou2 k ou2
778
+ kou3 k ou3
779
+ kou4 k ou4
780
+ kou5 k ou5
781
+ ku1 k u1
782
+ ku2 k u2
783
+ ku3 k u3
784
+ ku4 k u4
785
+ ku5 k u5
786
+ kua1 k ua1
787
+ kua2 k ua2
788
+ kua3 k ua3
789
+ kua4 k ua4
790
+ kua5 k ua5
791
+ kuai1 k uai1
792
+ kuai2 k uai2
793
+ kuai3 k uai3
794
+ kuai4 k uai4
795
+ kuai5 k uai5
796
+ kuan1 k uan1
797
+ kuan2 k uan2
798
+ kuan3 k uan3
799
+ kuan4 k uan4
800
+ kuan5 k uan5
801
+ kuang1 k uang1
802
+ kuang2 k uang2
803
+ kuang3 k uang3
804
+ kuang4 k uang4
805
+ kuang5 k uang5
806
+ kui1 k uei1
807
+ kui2 k uei2
808
+ kui3 k uei3
809
+ kui4 k uei4
810
+ kui5 k uei5
811
+ kun1 k uen1
812
+ kun2 k uen2
813
+ kun3 k uen3
814
+ kun4 k uen4
815
+ kun5 k uen5
816
+ kuo1 k uo1
817
+ kuo2 k uo2
818
+ kuo3 k uo3
819
+ kuo4 k uo4
820
+ kuo5 k uo5
821
+ la1 l a1
822
+ la2 l a2
823
+ la3 l a3
824
+ la4 l a4
825
+ la5 l a5
826
+ lai1 l ai1
827
+ lai2 l ai2
828
+ lai3 l ai3
829
+ lai4 l ai4
830
+ lai5 l ai5
831
+ lan1 l an1
832
+ lan2 l an2
833
+ lan3 l an3
834
+ lan4 l an4
835
+ lan5 l an5
836
+ lang1 l ang1
837
+ lang2 l ang2
838
+ lang3 l ang3
839
+ lang4 l ang4
840
+ lang5 l ang5
841
+ lao1 l ao1
842
+ lao2 l ao2
843
+ lao3 l ao3
844
+ lao4 l ao4
845
+ lao5 l ao5
846
+ le1 l e1
847
+ le2 l e2
848
+ le3 l e3
849
+ le4 l e4
850
+ le5 l e5
851
+ lei1 l ei1
852
+ lei2 l ei2
853
+ lei3 l ei3
854
+ lei4 l ei4
855
+ lei5 l ei5
856
+ leng1 l eng1
857
+ leng2 l eng2
858
+ leng3 l eng3
859
+ leng4 l eng4
860
+ leng5 l eng5
861
+ li1 l i1
862
+ li2 l i2
863
+ li3 l i3
864
+ li4 l i4
865
+ li5 l i5
866
+ lia1 l ia1
867
+ lia2 l ia2
868
+ lia3 l ia3
869
+ lia4 l ia4
870
+ lia5 l ia5
871
+ lian1 l ian1
872
+ lian2 l ian2
873
+ lian3 l ian3
874
+ lian4 l ian4
875
+ lian5 l ian5
876
+ liang1 l iang1
877
+ liang2 l iang2
878
+ liang3 l iang3
879
+ liang4 l iang4
880
+ liang5 l iang5
881
+ liao1 l iao1
882
+ liao2 l iao2
883
+ liao3 l iao3
884
+ liao4 l iao4
885
+ liao5 l iao5
886
+ lie1 l ie1
887
+ lie2 l ie2
888
+ lie3 l ie3
889
+ lie4 l ie4
890
+ lie5 l ie5
891
+ lin1 l in1
892
+ lin2 l in2
893
+ lin3 l in3
894
+ lin4 l in4
895
+ lin5 l in5
896
+ ling1 l ing1
897
+ ling2 l ing2
898
+ ling3 l ing3
899
+ ling4 l ing4
900
+ ling5 l ing5
901
+ liu1 l iou1
902
+ liu2 l iou2
903
+ liu3 l iou3
904
+ liu4 l iou4
905
+ liu5 l iou5
906
+ lo1 l o1
907
+ lo2 l o2
908
+ lo3 l o3
909
+ lo4 l o4
910
+ lo5 l o5
911
+ long1 l ong1
912
+ long2 l ong2
913
+ long3 l ong3
914
+ long4 l ong4
915
+ long5 l ong5
916
+ lou1 l ou1
917
+ lou2 l ou2
918
+ lou3 l ou3
919
+ lou4 l ou4
920
+ lou5 l ou5
921
+ lu1 l u1
922
+ lu2 l u2
923
+ lu3 l u3
924
+ lu4 l u4
925
+ lu5 l u5
926
+ luan1 l uan1
927
+ luan2 l uan2
928
+ luan3 l uan3
929
+ luan4 l uan4
930
+ luan5 l uan5
931
+ lue1 l ve1
932
+ lue2 l ve2
933
+ lue3 l ve3
934
+ lue4 l ve4
935
+ lue5 l ve5
936
+ lve1 l ve1
937
+ lve2 l ve2
938
+ lve3 l ve3
939
+ lve4 l ve4
940
+ lve5 l ve5
941
+ lun1 l uen1
942
+ lun2 l uen2
943
+ lun3 l uen3
944
+ lun4 l uen4
945
+ lun5 l uen5
946
+ luo1 l uo1
947
+ luo2 l uo2
948
+ luo3 l uo3
949
+ luo4 l uo4
950
+ luo5 l uo5
951
+ lv1 l v1
952
+ lv2 l v2
953
+ lv3 l v3
954
+ lv4 l v4
955
+ lv5 l v5
956
+ ma1 m a1
957
+ ma2 m a2
958
+ ma3 m a3
959
+ ma4 m a4
960
+ ma5 m a5
961
+ mai1 m ai1
962
+ mai2 m ai2
963
+ mai3 m ai3
964
+ mai4 m ai4
965
+ mai5 m ai5
966
+ man1 m an1
967
+ man2 m an2
968
+ man3 m an3
969
+ man4 m an4
970
+ man5 m an5
971
+ mang1 m ang1
972
+ mang2 m ang2
973
+ mang3 m ang3
974
+ mang4 m ang4
975
+ mang5 m ang5
976
+ mao1 m ao1
977
+ mao2 m ao2
978
+ mao3 m ao3
979
+ mao4 m ao4
980
+ mao5 m ao5
981
+ me1 m e1
982
+ me2 m e2
983
+ me3 m e3
984
+ me4 m e4
985
+ me5 m e5
986
+ mei1 m ei1
987
+ mei2 m ei2
988
+ mei3 m ei3
989
+ mei4 m ei4
990
+ mei5 m ei5
991
+ men1 m en1
992
+ men2 m en2
993
+ men3 m en3
994
+ men4 m en4
995
+ men5 m en5
996
+ meng1 m eng1
997
+ meng2 m eng2
998
+ meng3 m eng3
999
+ meng4 m eng4
1000
+ meng5 m eng5
1001
+ mi1 m i1
1002
+ mi2 m i2
1003
+ mi3 m i3
1004
+ mi4 m i4
1005
+ mi5 m i5
1006
+ mian1 m ian1
1007
+ mian2 m ian2
1008
+ mian3 m ian3
1009
+ mian4 m ian4
1010
+ mian5 m ian5
1011
+ miao1 m iao1
1012
+ miao2 m iao2
1013
+ miao3 m iao3
1014
+ miao4 m iao4
1015
+ miao5 m iao5
1016
+ mie1 m ie1
1017
+ mie2 m ie2
1018
+ mie3 m ie3
1019
+ mie4 m ie4
1020
+ mie5 m ie5
1021
+ min1 m in1
1022
+ min2 m in2
1023
+ min3 m in3
1024
+ min4 m in4
1025
+ min5 m in5
1026
+ ming1 m ing1
1027
+ ming2 m ing2
1028
+ ming3 m ing3
1029
+ ming4 m ing4
1030
+ ming5 m ing5
1031
+ miu1 m iou1
1032
+ miu2 m iou2
1033
+ miu3 m iou3
1034
+ miu4 m iou4
1035
+ miu5 m iou5
1036
+ mo1 m o1
1037
+ mo2 m o2
1038
+ mo3 m o3
1039
+ mo4 m o4
1040
+ mo5 m o5
1041
+ mou1 m ou1
1042
+ mou2 m ou2
1043
+ mou3 m ou3
1044
+ mou4 m ou4
1045
+ mou5 m ou5
1046
+ mu1 m u1
1047
+ mu2 m u2
1048
+ mu3 m u3
1049
+ mu4 m u4
1050
+ mu5 m u5
1051
+ na1 n a1
1052
+ na2 n a2
1053
+ na3 n a3
1054
+ na4 n a4
1055
+ na5 n a5
1056
+ nai1 n ai1
1057
+ nai2 n ai2
1058
+ nai3 n ai3
1059
+ nai4 n ai4
1060
+ nai5 n ai5
1061
+ nan1 n an1
1062
+ nan2 n an2
1063
+ nan3 n an3
1064
+ nan4 n an4
1065
+ nan5 n an5
1066
+ nang1 n ang1
1067
+ nang2 n ang2
1068
+ nang3 n ang3
1069
+ nang4 n ang4
1070
+ nang5 n ang5
1071
+ nao1 n ao1
1072
+ nao2 n ao2
1073
+ nao3 n ao3
1074
+ nao4 n ao4
1075
+ nao5 n ao5
1076
+ ne1 n e1
1077
+ ne2 n e2
1078
+ ne3 n e3
1079
+ ne4 n e4
1080
+ ne5 n e5
1081
+ nei1 n ei1
1082
+ nei2 n ei2
1083
+ nei3 n ei3
1084
+ nei4 n ei4
1085
+ nei5 n ei5
1086
+ nen1 n en1
1087
+ nen2 n en2
1088
+ nen3 n en3
1089
+ nen4 n en4
1090
+ nen5 n en5
1091
+ neng1 n eng1
1092
+ neng2 n eng2
1093
+ neng3 n eng3
1094
+ neng4 n eng4
1095
+ neng5 n eng5
1096
+ ni1 n i1
1097
+ ni2 n i2
1098
+ ni3 n i3
1099
+ ni4 n i4
1100
+ ni5 n i5
1101
+ nian1 n ian1
1102
+ nian2 n ian2
1103
+ nian3 n ian3
1104
+ nian4 n ian4
1105
+ nian5 n ian5
1106
+ niang1 n iang1
1107
+ niang2 n iang2
1108
+ niang3 n iang3
1109
+ niang4 n iang4
1110
+ niang5 n iang5
1111
+ niao1 n iao1
1112
+ niao2 n iao2
1113
+ niao3 n iao3
1114
+ niao4 n iao4
1115
+ niao5 n iao5
1116
+ nie1 n ie1
1117
+ nie2 n ie2
1118
+ nie3 n ie3
1119
+ nie4 n ie4
1120
+ nie5 n ie5
1121
+ nin1 n in1
1122
+ nin2 n in2
1123
+ nin3 n in3
1124
+ nin4 n in4
1125
+ nin5 n in5
1126
+ ning1 n ing1
1127
+ ning2 n ing2
1128
+ ning3 n ing3
1129
+ ning4 n ing4
1130
+ ning5 n ing5
1131
+ niu1 n iou1
1132
+ niu2 n iou2
1133
+ niu3 n iou3
1134
+ niu4 n iou4
1135
+ niu5 n iou5
1136
+ nong1 n ong1
1137
+ nong2 n ong2
1138
+ nong3 n ong3
1139
+ nong4 n ong4
1140
+ nong5 n ong5
1141
+ nou1 n ou1
1142
+ nou2 n ou2
1143
+ nou3 n ou3
1144
+ nou4 n ou4
1145
+ nou5 n ou5
1146
+ nu1 n u1
1147
+ nu2 n u2
1148
+ nu3 n u3
1149
+ nu4 n u4
1150
+ nu5 n u5
1151
+ nuan1 n uan1
1152
+ nuan2 n uan2
1153
+ nuan3 n uan3
1154
+ nuan4 n uan4
1155
+ nuan5 n uan5
1156
+ nue1 n ve1
1157
+ nue2 n ve2
1158
+ nue3 n ve3
1159
+ nue4 n ve4
1160
+ nue5 n ve5
1161
+ nve1 n ve1
1162
+ nve2 n ve2
1163
+ nve3 n ve3
1164
+ nve4 n ve4
1165
+ nve5 n ve5
1166
+ nuo1 n uo1
1167
+ nuo2 n uo2
1168
+ nuo3 n uo3
1169
+ nuo4 n uo4
1170
+ nuo5 n uo5
1171
+ nv1 n v1
1172
+ nv2 n v2
1173
+ nv3 n v3
1174
+ nv4 n v4
1175
+ nv5 n v5
1176
+ o1 o1
1177
+ o2 o2
1178
+ o3 o3
1179
+ o4 o4
1180
+ o5 o5
1181
+ ou1 ou1
1182
+ ou2 ou2
1183
+ ou3 ou3
1184
+ ou4 ou4
1185
+ ou5 ou5
1186
+ pa1 p a1
1187
+ pa2 p a2
1188
+ pa3 p a3
1189
+ pa4 p a4
1190
+ pa5 p a5
1191
+ pai1 p ai1
1192
+ pai2 p ai2
1193
+ pai3 p ai3
1194
+ pai4 p ai4
1195
+ pai5 p ai5
1196
+ pan1 p an1
1197
+ pan2 p an2
1198
+ pan3 p an3
1199
+ pan4 p an4
1200
+ pan5 p an5
1201
+ pang1 p ang1
1202
+ pang2 p ang2
1203
+ pang3 p ang3
1204
+ pang4 p ang4
1205
+ pang5 p ang5
1206
+ pao1 p ao1
1207
+ pao2 p ao2
1208
+ pao3 p ao3
1209
+ pao4 p ao4
1210
+ pao5 p ao5
1211
+ pei1 p ei1
1212
+ pei2 p ei2
1213
+ pei3 p ei3
1214
+ pei4 p ei4
1215
+ pei5 p ei5
1216
+ pen1 p en1
1217
+ pen2 p en2
1218
+ pen3 p en3
1219
+ pen4 p en4
1220
+ pen5 p en5
1221
+ peng1 p eng1
1222
+ peng2 p eng2
1223
+ peng3 p eng3
1224
+ peng4 p eng4
1225
+ peng5 p eng5
1226
+ pi1 p i1
1227
+ pi2 p i2
1228
+ pi3 p i3
1229
+ pi4 p i4
1230
+ pi5 p i5
1231
+ pian1 p ian1
1232
+ pian2 p ian2
1233
+ pian3 p ian3
1234
+ pian4 p ian4
1235
+ pian5 p ian5
1236
+ piao1 p iao1
1237
+ piao2 p iao2
1238
+ piao3 p iao3
1239
+ piao4 p iao4
1240
+ piao5 p iao5
1241
+ pie1 p ie1
1242
+ pie2 p ie2
1243
+ pie3 p ie3
1244
+ pie4 p ie4
1245
+ pie5 p ie5
1246
+ pin1 p in1
1247
+ pin2 p in2
1248
+ pin3 p in3
1249
+ pin4 p in4
1250
+ pin5 p in5
1251
+ ping1 p ing1
1252
+ ping2 p ing2
1253
+ ping3 p ing3
1254
+ ping4 p ing4
1255
+ ping5 p ing5
1256
+ po1 p o1
1257
+ po2 p o2
1258
+ po3 p o3
1259
+ po4 p o4
1260
+ po5 p o5
1261
+ pou1 p ou1
1262
+ pou2 p ou2
1263
+ pou3 p ou3
1264
+ pou4 p ou4
1265
+ pou5 p ou5
1266
+ pu1 p u1
1267
+ pu2 p u2
1268
+ pu3 p u3
1269
+ pu4 p u4
1270
+ pu5 p u5
1271
+ qi1 q i1
1272
+ qi2 q i2
1273
+ qi3 q i3
1274
+ qi4 q i4
1275
+ qi5 q i5
1276
+ qia1 q ia1
1277
+ qia2 q ia2
1278
+ qia3 q ia3
1279
+ qia4 q ia4
1280
+ qia5 q ia5
1281
+ qian1 q ian1
1282
+ qian2 q ian2
1283
+ qian3 q ian3
1284
+ qian4 q ian4
1285
+ qian5 q ian5
1286
+ qiang1 q iang1
1287
+ qiang2 q iang2
1288
+ qiang3 q iang3
1289
+ qiang4 q iang4
1290
+ qiang5 q iang5
1291
+ qiao1 q iao1
1292
+ qiao2 q iao2
1293
+ qiao3 q iao3
1294
+ qiao4 q iao4
1295
+ qiao5 q iao5
1296
+ qie1 q ie1
1297
+ qie2 q ie2
1298
+ qie3 q ie3
1299
+ qie4 q ie4
1300
+ qie5 q ie5
1301
+ qin1 q in1
1302
+ qin2 q in2
1303
+ qin3 q in3
1304
+ qin4 q in4
1305
+ qin5 q in5
1306
+ qing1 q ing1
1307
+ qing2 q ing2
1308
+ qing3 q ing3
1309
+ qing4 q ing4
1310
+ qing5 q ing5
1311
+ qiong1 q iong1
1312
+ qiong2 q iong2
1313
+ qiong3 q iong3
1314
+ qiong4 q iong4
1315
+ qiong5 q iong5
1316
+ qiu1 q iou1
1317
+ qiu2 q iou2
1318
+ qiu3 q iou3
1319
+ qiu4 q iou4
1320
+ qiu5 q iou5
1321
+ qu1 q v1
1322
+ qu2 q v2
1323
+ qu3 q v3
1324
+ qu4 q v4
1325
+ qu5 q v5
1326
+ quan1 q van1
1327
+ quan2 q van2
1328
+ quan3 q van3
1329
+ quan4 q van4
1330
+ quan5 q van5
1331
+ que1 q ve1
1332
+ que2 q ve2
1333
+ que3 q ve3
1334
+ que4 q ve4
1335
+ que5 q ve5
1336
+ qun1 q vn1
1337
+ qun2 q vn2
1338
+ qun3 q vn3
1339
+ qun4 q vn4
1340
+ qun5 q vn5
1341
+ ran1 r an1
1342
+ ran2 r an2
1343
+ ran3 r an3
1344
+ ran4 r an4
1345
+ ran5 r an5
1346
+ rang1 r ang1
1347
+ rang2 r ang2
1348
+ rang3 r ang3
1349
+ rang4 r ang4
1350
+ rang5 r ang5
1351
+ rao1 r ao1
1352
+ rao2 r ao2
1353
+ rao3 r ao3
1354
+ rao4 r ao4
1355
+ rao5 r ao5
1356
+ re1 r e1
1357
+ re2 r e2
1358
+ re3 r e3
1359
+ re4 r e4
1360
+ re5 r e5
1361
+ ren1 r en1
1362
+ ren2 r en2
1363
+ ren3 r en3
1364
+ ren4 r en4
1365
+ ren5 r en5
1366
+ reng1 r eng1
1367
+ reng2 r eng2
1368
+ reng3 r eng3
1369
+ reng4 r eng4
1370
+ reng5 r eng5
1371
+ ri1 r iii1
1372
+ ri2 r iii2
1373
+ ri3 r iii3
1374
+ ri4 r iii4
1375
+ ri5 r iii5
1376
+ rong1 r ong1
1377
+ rong2 r ong2
1378
+ rong3 r ong3
1379
+ rong4 r ong4
1380
+ rong5 r ong5
1381
+ rou1 r ou1
1382
+ rou2 r ou2
1383
+ rou3 r ou3
1384
+ rou4 r ou4
1385
+ rou5 r ou5
1386
+ ru1 r u1
1387
+ ru2 r u2
1388
+ ru3 r u3
1389
+ ru4 r u4
1390
+ ru5 r u5
1391
+ rua1 r ua1
1392
+ rua2 r ua2
1393
+ rua3 r ua3
1394
+ rua4 r ua4
1395
+ rua5 r ua5
1396
+ ruan1 r uan1
1397
+ ruan2 r uan2
1398
+ ruan3 r uan3
1399
+ ruan4 r uan4
1400
+ ruan5 r uan5
1401
+ rui1 r uei1
1402
+ rui2 r uei2
1403
+ rui3 r uei3
1404
+ rui4 r uei4
1405
+ rui5 r uei5
1406
+ run1 r uen1
1407
+ run2 r uen2
1408
+ run3 r uen3
1409
+ run4 r uen4
1410
+ run5 r uen5
1411
+ ruo1 r uo1
1412
+ ruo2 r uo2
1413
+ ruo3 r uo3
1414
+ ruo4 r uo4
1415
+ ruo5 r uo5
1416
+ sa1 s a1
1417
+ sa2 s a2
1418
+ sa3 s a3
1419
+ sa4 s a4
1420
+ sa5 s a5
1421
+ sai1 s ai1
1422
+ sai2 s ai2
1423
+ sai3 s ai3
1424
+ sai4 s ai4
1425
+ sai5 s ai5
1426
+ san1 s an1
1427
+ san2 s an2
1428
+ san3 s an3
1429
+ san4 s an4
1430
+ san5 s an5
1431
+ sang1 s ang1
1432
+ sang2 s ang2
1433
+ sang3 s ang3
1434
+ sang4 s ang4
1435
+ sang5 s ang5
1436
+ sao1 s ao1
1437
+ sao2 s ao2
1438
+ sao3 s ao3
1439
+ sao4 s ao4
1440
+ sao5 s ao5
1441
+ se1 s e1
1442
+ se2 s e2
1443
+ se3 s e3
1444
+ se4 s e4
1445
+ se5 s e5
1446
+ sen1 s en1
1447
+ sen2 s en2
1448
+ sen3 s en3
1449
+ sen4 s en4
1450
+ sen5 s en5
1451
+ seng1 s eng1
1452
+ seng2 s eng2
1453
+ seng3 s eng3
1454
+ seng4 s eng4
1455
+ seng5 s eng5
1456
+ sha1 sh a1
1457
+ sha2 sh a2
1458
+ sha3 sh a3
1459
+ sha4 sh a4
1460
+ sha5 sh a5
1461
+ shai1 sh ai1
1462
+ shai2 sh ai2
1463
+ shai3 sh ai3
1464
+ shai4 sh ai4
1465
+ shai5 sh ai5
1466
+ shan1 sh an1
1467
+ shan2 sh an2
1468
+ shan3 sh an3
1469
+ shan4 sh an4
1470
+ shan5 sh an5
1471
+ shang1 sh ang1
1472
+ shang2 sh ang2
1473
+ shang3 sh ang3
1474
+ shang4 sh ang4
1475
+ shang5 sh ang5
1476
+ shao1 sh ao1
1477
+ shao2 sh ao2
1478
+ shao3 sh ao3
1479
+ shao4 sh ao4
1480
+ shao5 sh ao5
1481
+ she1 sh e1
1482
+ she2 sh e2
1483
+ she3 sh e3
1484
+ she4 sh e4
1485
+ she5 sh e5
1486
+ shei1 sh ei1
1487
+ shei2 sh ei2
1488
+ shei3 sh ei3
1489
+ shei4 sh ei4
1490
+ shei5 sh ei5
1491
+ shen1 sh en1
1492
+ shen2 sh en2
1493
+ shen3 sh en3
1494
+ shen4 sh en4
1495
+ shen5 sh en5
1496
+ sheng1 sh eng1
1497
+ sheng2 sh eng2
1498
+ sheng3 sh eng3
1499
+ sheng4 sh eng4
1500
+ sheng5 sh eng5
1501
+ shi1 sh iii1
1502
+ shi2 sh iii2
1503
+ shi3 sh iii3
1504
+ shi4 sh iii4
1505
+ shi5 sh iii5
1506
+ shou1 sh ou1
1507
+ shou2 sh ou2
1508
+ shou3 sh ou3
1509
+ shou4 sh ou4
1510
+ shou5 sh ou5
1511
+ shu1 sh u1
1512
+ shu2 sh u2
1513
+ shu3 sh u3
1514
+ shu4 sh u4
1515
+ shu5 sh u5
1516
+ shua1 sh ua1
1517
+ shua2 sh ua2
1518
+ shua3 sh ua3
1519
+ shua4 sh ua4
1520
+ shua5 sh ua5
1521
+ shuai1 sh uai1
1522
+ shuai2 sh uai2
1523
+ shuai3 sh uai3
1524
+ shuai4 sh uai4
1525
+ shuai5 sh uai5
1526
+ shuan1 sh uan1
1527
+ shuan2 sh uan2
1528
+ shuan3 sh uan3
1529
+ shuan4 sh uan4
1530
+ shuan5 sh uan5
1531
+ shuang1 sh uang1
1532
+ shuang2 sh uang2
1533
+ shuang3 sh uang3
1534
+ shuang4 sh uang4
1535
+ shuang5 sh uang5
1536
+ shui1 sh uei1
1537
+ shui2 sh uei2
1538
+ shui3 sh uei3
1539
+ shui4 sh uei4
1540
+ shui5 sh uei5
1541
+ shun1 sh uen1
1542
+ shun2 sh uen2
1543
+ shun3 sh uen3
1544
+ shun4 sh uen4
1545
+ shun5 sh uen5
1546
+ shuo1 sh uo1
1547
+ shuo2 sh uo2
1548
+ shuo3 sh uo3
1549
+ shuo4 sh uo4
1550
+ shuo5 sh uo5
1551
+ si1 s ii1
1552
+ si2 s ii2
1553
+ si3 s ii3
1554
+ si4 s ii4
1555
+ si5 s ii5
1556
+ song1 s ong1
1557
+ song2 s ong2
1558
+ song3 s ong3
1559
+ song4 s ong4
1560
+ song5 s ong5
1561
+ sou1 s ou1
1562
+ sou2 s ou2
1563
+ sou3 s ou3
1564
+ sou4 s ou4
1565
+ sou5 s ou5
1566
+ su1 s u1
1567
+ su2 s u2
1568
+ su3 s u3
1569
+ su4 s u4
1570
+ su5 s u5
1571
+ suan1 s uan1
1572
+ suan2 s uan2
1573
+ suan3 s uan3
1574
+ suan4 s uan4
1575
+ suan5 s uan5
1576
+ sui1 s uei1
1577
+ sui2 s uei2
1578
+ sui3 s uei3
1579
+ sui4 s uei4
1580
+ sui5 s uei5
1581
+ sun1 s uen1
1582
+ sun2 s uen2
1583
+ sun3 s uen3
1584
+ sun4 s uen4
1585
+ sun5 s uen5
1586
+ suo1 s uo1
1587
+ suo2 s uo2
1588
+ suo3 s uo3
1589
+ suo4 s uo4
1590
+ suo5 s uo5
1591
+ ta1 t a1
1592
+ ta2 t a2
1593
+ ta3 t a3
1594
+ ta4 t a4
1595
+ ta5 t a5
1596
+ tai1 t ai1
1597
+ tai2 t ai2
1598
+ tai3 t ai3
1599
+ tai4 t ai4
1600
+ tai5 t ai5
1601
+ tan1 t an1
1602
+ tan2 t an2
1603
+ tan3 t an3
1604
+ tan4 t an4
1605
+ tan5 t an5
1606
+ tang1 t ang1
1607
+ tang2 t ang2
1608
+ tang3 t ang3
1609
+ tang4 t ang4
1610
+ tang5 t ang5
1611
+ tao1 t ao1
1612
+ tao2 t ao2
1613
+ tao3 t ao3
1614
+ tao4 t ao4
1615
+ tao5 t ao5
1616
+ te1 t e1
1617
+ te2 t e2
1618
+ te3 t e3
1619
+ te4 t e4
1620
+ te5 t e5
1621
+ tei1 t ei1
1622
+ tei2 t ei2
1623
+ tei3 t ei3
1624
+ tei4 t ei4
1625
+ tei5 t ei5
1626
+ teng1 t eng1
1627
+ teng2 t eng2
1628
+ teng3 t eng3
1629
+ teng4 t eng4
1630
+ teng5 t eng5
1631
+ ti1 t i1
1632
+ ti2 t i2
1633
+ ti3 t i3
1634
+ ti4 t i4
1635
+ ti5 t i5
1636
+ tian1 t ian1
1637
+ tian2 t ian2
1638
+ tian3 t ian3
1639
+ tian4 t ian4
1640
+ tian5 t ian5
1641
+ tiao1 t iao1
1642
+ tiao2 t iao2
1643
+ tiao3 t iao3
1644
+ tiao4 t iao4
1645
+ tiao5 t iao5
1646
+ tie1 t ie1
1647
+ tie2 t ie2
1648
+ tie3 t ie3
1649
+ tie4 t ie4
1650
+ tie5 t ie5
1651
+ ting1 t ing1
1652
+ ting2 t ing2
1653
+ ting3 t ing3
1654
+ ting4 t ing4
1655
+ ting5 t ing5
1656
+ tong1 t ong1
1657
+ tong2 t ong2
1658
+ tong3 t ong3
1659
+ tong4 t ong4
1660
+ tong5 t ong5
1661
+ tou1 t ou1
1662
+ tou2 t ou2
1663
+ tou3 t ou3
1664
+ tou4 t ou4
1665
+ tou5 t ou5
1666
+ tu1 t u1
1667
+ tu2 t u2
1668
+ tu3 t u3
1669
+ tu4 t u4
1670
+ tu5 t u5
1671
+ tuan1 t uan1
1672
+ tuan2 t uan2
1673
+ tuan3 t uan3
1674
+ tuan4 t uan4
1675
+ tuan5 t uan5
1676
+ tui1 t uei1
1677
+ tui2 t uei2
1678
+ tui3 t uei3
1679
+ tui4 t uei4
1680
+ tui5 t uei5
1681
+ tun1 t uen1
1682
+ tun2 t uen2
1683
+ tun3 t uen3
1684
+ tun4 t uen4
1685
+ tun5 t uen5
1686
+ tuo1 t uo1
1687
+ tuo2 t uo2
1688
+ tuo3 t uo3
1689
+ tuo4 t uo4
1690
+ tuo5 t uo5
1691
+ wa1 w ua1
1692
+ wa2 w ua2
1693
+ wa3 w ua3
1694
+ wa4 w ua4
1695
+ wa5 w ua5
1696
+ wai1 w uai1
1697
+ wai2 w uai2
1698
+ wai3 w uai3
1699
+ wai4 w uai4
1700
+ wai5 w uai5
1701
+ wan1 w uan1
1702
+ wan2 w uan2
1703
+ wan3 w uan3
1704
+ wan4 w uan4
1705
+ wan5 w uan5
1706
+ wang1 w uang1
1707
+ wang2 w uang2
1708
+ wang3 w uang3
1709
+ wang4 w uang4
1710
+ wang5 w uang5
1711
+ wei1 w uei1
1712
+ wei2 w uei2
1713
+ wei3 w uei3
1714
+ wei4 w uei4
1715
+ wei5 w uei5
1716
+ wen1 w uen1
1717
+ wen2 w uen2
1718
+ wen3 w uen3
1719
+ wen4 w uen4
1720
+ wen5 w uen5
1721
+ weng1 w uen1
1722
+ weng2 w uen2
1723
+ weng3 w uen3
1724
+ weng4 w uen4
1725
+ weng5 w uen5
1726
+ wo1 w uo1
1727
+ wo2 w uo2
1728
+ wo3 w uo3
1729
+ wo4 w uo4
1730
+ wo5 w uo5
1731
+ wu1 w u1
1732
+ wu2 w u2
1733
+ wu3 w u3
1734
+ wu4 w u4
1735
+ wu5 w u5
1736
+ xi1 x i1
1737
+ xi2 x i2
1738
+ xi3 x i3
1739
+ xi4 x i4
1740
+ xi5 x i5
1741
+ xia1 x ia1
1742
+ xia2 x ia2
1743
+ xia3 x ia3
1744
+ xia4 x ia4
1745
+ xia5 x ia5
1746
+ xian1 x ian1
1747
+ xian2 x ian2
1748
+ xian3 x ian3
1749
+ xian4 x ian4
1750
+ xian5 x ian5
1751
+ xiang1 x iang1
1752
+ xiang2 x iang2
1753
+ xiang3 x iang3
1754
+ xiang4 x iang4
1755
+ xiang5 x iang5
1756
+ xiao1 x iao1
1757
+ xiao2 x iao2
1758
+ xiao3 x iao3
1759
+ xiao4 x iao4
1760
+ xiao5 x iao5
1761
+ xie1 x ie1
1762
+ xie2 x ie2
1763
+ xie3 x ie3
1764
+ xie4 x ie4
1765
+ xie5 x ie5
1766
+ xin1 x in1
1767
+ xin2 x in2
1768
+ xin3 x in3
1769
+ xin4 x in4
1770
+ xin5 x in5
1771
+ xing1 x ing1
1772
+ xing2 x ing2
1773
+ xing3 x ing3
1774
+ xing4 x ing4
1775
+ xing5 x ing5
1776
+ xiong1 x iong1
1777
+ xiong2 x iong2
1778
+ xiong3 x iong3
1779
+ xiong4 x iong4
1780
+ xiong5 x iong5
1781
+ xiu1 x iou1
1782
+ xiu2 x iou2
1783
+ xiu3 x iou3
1784
+ xiu4 x iou4
1785
+ xiu5 x iou5
1786
+ xu1 x v1
1787
+ xu2 x v2
1788
+ xu3 x v3
1789
+ xu4 x v4
1790
+ xu5 x v5
1791
+ xuan1 x van1
1792
+ xuan2 x van2
1793
+ xuan3 x van3
1794
+ xuan4 x van4
1795
+ xuan5 x van5
1796
+ xue1 x ve1
1797
+ xue2 x ve2
1798
+ xue3 x ve3
1799
+ xue4 x ve4
1800
+ xue5 x ve5
1801
+ xun1 x vn1
1802
+ xun2 x vn2
1803
+ xun3 x vn3
1804
+ xun4 x vn4
1805
+ xun5 x vn5
1806
+ ya1 y ia1
1807
+ ya2 y ia2
1808
+ ya3 y ia3
1809
+ ya4 y ia4
1810
+ ya5 y ia5
1811
+ yan1 y ian1
1812
+ yan2 y ian2
1813
+ yan3 y ian3
1814
+ yan4 y ian4
1815
+ yan5 y ian5
1816
+ yang1 y iang1
1817
+ yang2 y iang2
1818
+ yang3 y iang3
1819
+ yang4 y iang4
1820
+ yang5 y iang5
1821
+ yao1 y iao1
1822
+ yao2 y iao2
1823
+ yao3 y iao3
1824
+ yao4 y iao4
1825
+ yao5 y iao5
1826
+ ye1 y ie1
1827
+ ye2 y ie2
1828
+ ye3 y ie3
1829
+ ye4 y ie4
1830
+ ye5 y ie5
1831
+ yi1 y i1
1832
+ yi2 y i2
1833
+ yi3 y i3
1834
+ yi4 y i4
1835
+ yi5 y i5
1836
+ yin1 y in1
1837
+ yin2 y in2
1838
+ yin3 y in3
1839
+ yin4 y in4
1840
+ yin5 y in5
1841
+ ying1 y ing1
1842
+ ying2 y ing2
1843
+ ying3 y ing3
1844
+ ying4 y ing4
1845
+ ying5 y ing5
1846
+ yo1 y iou1
1847
+ yo2 y iou2
1848
+ yo3 y iou3
1849
+ yo4 y iou4
1850
+ yo5 y iou5
1851
+ yong1 y iong1
1852
+ yong2 y iong2
1853
+ yong3 y iong3
1854
+ yong4 y iong4
1855
+ yong5 y iong5
1856
+ you1 y iou1
1857
+ you2 y iou2
1858
+ you3 y iou3
1859
+ you4 y iou4
1860
+ you5 y iou5
1861
+ yu1 y v1
1862
+ yu2 y v2
1863
+ yu3 y v3
1864
+ yu4 y v4
1865
+ yu5 y v5
1866
+ yuan1 y van1
1867
+ yuan2 y van2
1868
+ yuan3 y van3
1869
+ yuan4 y van4
1870
+ yuan5 y van5
1871
+ yue1 y ve1
1872
+ yue2 y ve2
1873
+ yue3 y ve3
1874
+ yue4 y ve4
1875
+ yue5 y ve5
1876
+ yun1 y vn1
1877
+ yun2 y vn2
1878
+ yun3 y vn3
1879
+ yun4 y vn4
1880
+ yun5 y vn5
1881
+ za1 z a1
1882
+ za2 z a2
1883
+ za3 z a3
1884
+ za4 z a4
1885
+ za5 z a5
1886
+ zai1 z ai1
1887
+ zai2 z ai2
1888
+ zai3 z ai3
1889
+ zai4 z ai4
1890
+ zai5 z ai5
1891
+ zan1 z an1
1892
+ zan2 z an2
1893
+ zan3 z an3
1894
+ zan4 z an4
1895
+ zan5 z an5
1896
+ zang1 z ang1
1897
+ zang2 z ang2
1898
+ zang3 z ang3
1899
+ zang4 z ang4
1900
+ zang5 z ang5
1901
+ zao1 z ao1
1902
+ zao2 z ao2
1903
+ zao3 z ao3
1904
+ zao4 z ao4
1905
+ zao5 z ao5
1906
+ ze1 z e1
1907
+ ze2 z e2
1908
+ ze3 z e3
1909
+ ze4 z e4
1910
+ ze5 z e5
1911
+ zei1 z ei1
1912
+ zei2 z ei2
1913
+ zei3 z ei3
1914
+ zei4 z ei4
1915
+ zei5 z ei5
1916
+ zen1 z en1
1917
+ zen2 z en2
1918
+ zen3 z en3
1919
+ zen4 z en4
1920
+ zen5 z en5
1921
+ zeng1 z eng1
1922
+ zeng2 z eng2
1923
+ zeng3 z eng3
1924
+ zeng4 z eng4
1925
+ zeng5 z eng5
1926
+ zha1 zh a1
1927
+ zha2 zh a2
1928
+ zha3 zh a3
1929
+ zha4 zh a4
1930
+ zha5 zh a5
1931
+ zhai1 zh ai1
1932
+ zhai2 zh ai2
1933
+ zhai3 zh ai3
1934
+ zhai4 zh ai4
1935
+ zhai5 zh ai5
1936
+ zhan1 zh an1
1937
+ zhan2 zh an2
1938
+ zhan3 zh an3
1939
+ zhan4 zh an4
1940
+ zhan5 zh an5
1941
+ zhang1 zh ang1
1942
+ zhang2 zh ang2
1943
+ zhang3 zh ang3
1944
+ zhang4 zh ang4
1945
+ zhang5 zh ang5
1946
+ zhao1 zh ao1
1947
+ zhao2 zh ao2
1948
+ zhao3 zh ao3
1949
+ zhao4 zh ao4
1950
+ zhao5 zh ao5
1951
+ zhe1 zh e1
1952
+ zhe2 zh e2
1953
+ zhe3 zh e3
1954
+ zhe4 zh e4
1955
+ zhe5 zh e5
1956
+ zhei1 zh ei1
1957
+ zhei2 zh ei2
1958
+ zhei3 zh ei3
1959
+ zhei4 zh ei4
1960
+ zhei5 zh ei5
1961
+ zhen1 zh en1
1962
+ zhen2 zh en2
1963
+ zhen3 zh en3
1964
+ zhen4 zh en4
1965
+ zhen5 zh en5
1966
+ zheng1 zh eng1
1967
+ zheng2 zh eng2
1968
+ zheng3 zh eng3
1969
+ zheng4 zh eng4
1970
+ zheng5 zh eng5
1971
+ zhi1 zh iii1
1972
+ zhi2 zh iii2
1973
+ zhi3 zh iii3
1974
+ zhi4 zh iii4
1975
+ zhi5 zh iii5
1976
+ zhong1 zh ong1
1977
+ zhong2 zh ong2
1978
+ zhong3 zh ong3
1979
+ zhong4 zh ong4
1980
+ zhong5 zh ong5
1981
+ zhou1 zh ou1
1982
+ zhou2 zh ou2
1983
+ zhou3 zh ou3
1984
+ zhou4 zh ou4
1985
+ zhou5 zh ou5
1986
+ zhu1 zh u1
1987
+ zhu2 zh u2
1988
+ zhu3 zh u3
1989
+ zhu4 zh u4
1990
+ zhu5 zh u5
1991
+ zhua1 zh ua1
1992
+ zhua2 zh ua2
1993
+ zhua3 zh ua3
1994
+ zhua4 zh ua4
1995
+ zhua5 zh ua5
1996
+ zhuai1 zh uai1
1997
+ zhuai2 zh uai2
1998
+ zhuai3 zh uai3
1999
+ zhuai4 zh uai4
2000
+ zhuai5 zh uai5
2001
+ zhuan1 zh uan1
2002
+ zhuan2 zh uan2
2003
+ zhuan3 zh uan3
2004
+ zhuan4 zh uan4
2005
+ zhuan5 zh uan5
2006
+ zhuang1 zh uang1
2007
+ zhuang2 zh uang2
2008
+ zhuang3 zh uang3
2009
+ zhuang4 zh uang4
2010
+ zhuang5 zh uang5
2011
+ zhui1 zh uei1
2012
+ zhui2 zh uei2
2013
+ zhui3 zh uei3
2014
+ zhui4 zh uei4
2015
+ zhui5 zh uei5
2016
+ zhun1 zh uen1
2017
+ zhun2 zh uen2
2018
+ zhun3 zh uen3
2019
+ zhun4 zh uen4
2020
+ zhun5 zh uen5
2021
+ zhuo1 zh uo1
2022
+ zhuo2 zh uo2
2023
+ zhuo3 zh uo3
2024
+ zhuo4 zh uo4
2025
+ zhuo5 zh uo5
2026
+ zi1 z ii1
2027
+ zi2 z ii2
2028
+ zi3 z ii3
2029
+ zi4 z ii4
2030
+ zi5 z ii5
2031
+ zong1 z ong1
2032
+ zong2 z ong2
2033
+ zong3 z ong3
2034
+ zong4 z ong4
2035
+ zong5 z ong5
2036
+ zou1 z ou1
2037
+ zou2 z ou2
2038
+ zou3 z ou3
2039
+ zou4 z ou4
2040
+ zou5 z ou5
2041
+ zu1 z u1
2042
+ zu2 z u2
2043
+ zu3 z u3
2044
+ zu4 z u4
2045
+ zu5 z u5
2046
+ zuan1 z uan1
2047
+ zuan2 z uan2
2048
+ zuan3 z uan3
2049
+ zuan4 z uan4
2050
+ zuan5 z uan5
2051
+ zui1 z uei1
2052
+ zui2 z uei2
2053
+ zui3 z uei3
2054
+ zui4 z uei4
2055
+ zui5 z uei5
2056
+ zun1 z uen1
2057
+ zun2 z uen2
2058
+ zun3 z uen3
2059
+ zun4 z uen4
2060
+ zun5 z uen5
2061
+ zuo1 z uo1
2062
+ zuo2 z uo2
2063
+ zuo3 z uo3
2064
+ zuo4 z uo4
2065
+ zuo5 z uo5
2066
+ ar1 a1 rr
2067
+ ar2 a2 rr
2068
+ ar3 a3 rr
2069
+ ar4 a4 rr
2070
+ ar5 a5 rr
2071
+ air1 ai1 rr
2072
+ air2 ai2 rr
2073
+ air3 ai3 rr
2074
+ air4 ai4 rr
2075
+ air5 ai5 rr
2076
+ anr1 an1 rr
2077
+ anr2 an2 rr
2078
+ anr3 an3 rr
2079
+ anr4 an4 rr
2080
+ anr5 an5 rr
2081
+ angr1 ang1 rr
2082
+ angr2 ang2 rr
2083
+ angr3 ang3 rr
2084
+ angr4 ang4 rr
2085
+ angr5 ang5 rr
2086
+ aor1 ao1 rr
2087
+ aor2 ao2 rr
2088
+ aor3 ao3 rr
2089
+ aor4 ao4 rr
2090
+ aor5 ao5 rr
2091
+ bar1 b a1 rr
2092
+ bar2 b a2 rr
2093
+ bar3 b a3 rr
2094
+ bar4 b a4 rr
2095
+ bar5 b a5 rr
2096
+ bair1 b ai1 rr
2097
+ bair2 b ai2 rr
2098
+ bair3 b ai3 rr
2099
+ bair4 b ai4 rr
2100
+ bair5 b ai5 rr
2101
+ banr1 b an1 rr
2102
+ banr2 b an2 rr
2103
+ banr3 b an3 rr
2104
+ banr4 b an4 rr
2105
+ banr5 b an5 rr
2106
+ bangr1 b ang1 rr
2107
+ bangr2 b ang2 rr
2108
+ bangr3 b ang3 rr
2109
+ bangr4 b ang4 rr
2110
+ bangr5 b ang5 rr
2111
+ baor1 b ao1 rr
2112
+ baor2 b ao2 rr
2113
+ baor3 b ao3 rr
2114
+ baor4 b ao4 rr
2115
+ baor5 b ao5 rr
2116
+ beir1 b ei1 rr
2117
+ beir2 b ei2 rr
2118
+ beir3 b ei3 rr
2119
+ beir4 b ei4 rr
2120
+ beir5 b ei5 rr
2121
+ benr1 b en1 rr
2122
+ benr2 b en2 rr
2123
+ benr3 b en3 rr
2124
+ benr4 b en4 rr
2125
+ benr5 b en5 rr
2126
+ bengr1 b eng1 rr
2127
+ bengr2 b eng2 rr
2128
+ bengr3 b eng3 rr
2129
+ bengr4 b eng4 rr
2130
+ bengr5 b eng5 rr
2131
+ bir1 b i1 rr
2132
+ bir2 b i2 rr
2133
+ bir3 b i3 rr
2134
+ bir4 b i4 rr
2135
+ bir5 b i5 rr
2136
+ bianr1 b ian1 rr
2137
+ bianr2 b ian2 rr
2138
+ bianr3 b ian3 rr
2139
+ bianr4 b ian4 rr
2140
+ bianr5 b ian5 rr
2141
+ biaor1 b iao1 rr
2142
+ biaor2 b iao2 rr
2143
+ biaor3 b iao3 rr
2144
+ biaor4 b iao4 rr
2145
+ biaor5 b iao5 rr
2146
+ bier1 b ie1 rr
2147
+ bier2 b ie2 rr
2148
+ bier3 b ie3 rr
2149
+ bier4 b ie4 rr
2150
+ bier5 b ie5 rr
2151
+ binr1 b in1 rr
2152
+ binr2 b in2 rr
2153
+ binr3 b in3 rr
2154
+ binr4 b in4 rr
2155
+ binr5 b in5 rr
2156
+ bingr1 b ing1 rr
2157
+ bingr2 b ing2 rr
2158
+ bingr3 b ing3 rr
2159
+ bingr4 b ing4 rr
2160
+ bingr5 b ing5 rr
2161
+ bor1 b o1 rr
2162
+ bor2 b o2 rr
2163
+ bor3 b o3 rr
2164
+ bor4 b o4 rr
2165
+ bor5 b o5 rr
2166
+ bur1 b u1 rr
2167
+ bur2 b u2 rr
2168
+ bur3 b u3 rr
2169
+ bur4 b u4 rr
2170
+ bur5 b u5 rr
2171
+ car1 c a1 rr
2172
+ car2 c a2 rr
2173
+ car3 c a3 rr
2174
+ car4 c a4 rr
2175
+ car5 c a5 rr
2176
+ cair1 c ai1 rr
2177
+ cair2 c ai2 rr
2178
+ cair3 c ai3 rr
2179
+ cair4 c ai4 rr
2180
+ cair5 c ai5 rr
2181
+ canr1 c an1 rr
2182
+ canr2 c an2 rr
2183
+ canr3 c an3 rr
2184
+ canr4 c an4 rr
2185
+ canr5 c an5 rr
2186
+ cangr1 c ang1 rr
2187
+ cangr2 c ang2 rr
2188
+ cangr3 c ang3 rr
2189
+ cangr4 c ang4 rr
2190
+ cangr5 c ang5 rr
2191
+ caor1 c ao1 rr
2192
+ caor2 c ao2 rr
2193
+ caor3 c ao3 rr
2194
+ caor4 c ao4 rr
2195
+ caor5 c ao5 rr
2196
+ cer1 c e1 rr
2197
+ cer2 c e2 rr
2198
+ cer3 c e3 rr
2199
+ cer4 c e4 rr
2200
+ cer5 c e5 rr
2201
+ cenr1 c en1 rr
2202
+ cenr2 c en2 rr
2203
+ cenr3 c en3 rr
2204
+ cenr4 c en4 rr
2205
+ cenr5 c en5 rr
2206
+ cengr1 c eng1 rr
2207
+ cengr2 c eng2 rr
2208
+ cengr3 c eng3 rr
2209
+ cengr4 c eng4 rr
2210
+ cengr5 c eng5 rr
2211
+ char1 ch a1 rr
2212
+ char2 ch a2 rr
2213
+ char3 ch a3 rr
2214
+ char4 ch a4 rr
2215
+ char5 ch a5 rr
2216
+ chair1 ch ai1 rr
2217
+ chair2 ch ai2 rr
2218
+ chair3 ch ai3 rr
2219
+ chair4 ch ai4 rr
2220
+ chair5 ch ai5 rr
2221
+ chanr1 ch an1 rr
2222
+ chanr2 ch an2 rr
2223
+ chanr3 ch an3 rr
2224
+ chanr4 ch an4 rr
2225
+ chanr5 ch an5 rr
2226
+ changr1 ch ang1 rr
2227
+ changr2 ch ang2 rr
2228
+ changr3 ch ang3 rr
2229
+ changr4 ch ang4 rr
2230
+ changr5 ch ang5 rr
2231
+ chaor1 ch ao1 rr
2232
+ chaor2 ch ao2 rr
2233
+ chaor3 ch ao3 rr
2234
+ chaor4 ch ao4 rr
2235
+ chaor5 ch ao5 rr
2236
+ cher1 ch e1 rr
2237
+ cher2 ch e2 rr
2238
+ cher3 ch e3 rr
2239
+ cher4 ch e4 rr
2240
+ cher5 ch e5 rr
2241
+ chenr1 ch en1 rr
2242
+ chenr2 ch en2 rr
2243
+ chenr3 ch en3 rr
2244
+ chenr4 ch en4 rr
2245
+ chenr5 ch en5 rr
2246
+ chengr1 ch eng1 rr
2247
+ chengr2 ch eng2 rr
2248
+ chengr3 ch eng3 rr
2249
+ chengr4 ch eng4 rr
2250
+ chengr5 ch eng5 rr
2251
+ chir1 ch iii1 rr
2252
+ chir2 ch iii2 rr
2253
+ chir3 ch iii3 rr
2254
+ chir4 ch iii4 rr
2255
+ chir5 ch iii5 rr
2256
+ chongr1 ch ong1 rr
2257
+ chongr2 ch ong2 rr
2258
+ chongr3 ch ong3 rr
2259
+ chongr4 ch ong4 rr
2260
+ chongr5 ch ong5 rr
2261
+ chour1 ch ou1 rr
2262
+ chour2 ch ou2 rr
2263
+ chour3 ch ou3 rr
2264
+ chour4 ch ou4 rr
2265
+ chour5 ch ou5 rr
2266
+ chur1 ch u1 rr
2267
+ chur2 ch u2 rr
2268
+ chur3 ch u3 rr
2269
+ chur4 ch u4 rr
2270
+ chur5 ch u5 rr
2271
+ chuair1 ch uai1 rr
2272
+ chuair2 ch uai2 rr
2273
+ chuair3 ch uai3 rr
2274
+ chuair4 ch uai4 rr
2275
+ chuair5 ch uai5 rr
2276
+ chuanr1 ch uan1 rr
2277
+ chuanr2 ch uan2 rr
2278
+ chuanr3 ch uan3 rr
2279
+ chuanr4 ch uan4 rr
2280
+ chuanr5 ch uan5 rr
2281
+ chuangr1 ch uang1 rr
2282
+ chuangr2 ch uang2 rr
2283
+ chuangr3 ch uang3 rr
2284
+ chuangr4 ch uang4 rr
2285
+ chuangr5 ch uang5 rr
2286
+ chuir1 ch uei1 rr
2287
+ chuir2 ch uei2 rr
2288
+ chuir3 ch uei3 rr
2289
+ chuir4 ch uei4 rr
2290
+ chuir5 ch uei5 rr
2291
+ chunr1 ch uen1 rr
2292
+ chunr2 ch uen2 rr
2293
+ chunr3 ch uen3 rr
2294
+ chunr4 ch uen4 rr
2295
+ chunr5 ch uen5 rr
2296
+ chuor1 ch uo1 rr
2297
+ chuor2 ch uo2 rr
2298
+ chuor3 ch uo3 rr
2299
+ chuor4 ch uo4 rr
2300
+ chuor5 ch uo5 rr
2301
+ cir1 c ii1 rr
2302
+ cir2 c ii2 rr
2303
+ cir3 c ii3 rr
2304
+ cir4 c ii4 rr
2305
+ cir5 c ii5 rr
2306
+ congr1 c ong1 rr
2307
+ congr2 c ong2 rr
2308
+ congr3 c ong3 rr
2309
+ congr4 c ong4 rr
2310
+ congr5 c ong5 rr
2311
+ cour1 c ou1 rr
2312
+ cour2 c ou2 rr
2313
+ cour3 c ou3 rr
2314
+ cour4 c ou4 rr
2315
+ cour5 c ou5 rr
2316
+ cur1 c u1 rr
2317
+ cur2 c u2 rr
2318
+ cur3 c u3 rr
2319
+ cur4 c u4 rr
2320
+ cur5 c u5 rr
2321
+ cuanr1 c uan1 rr
2322
+ cuanr2 c uan2 rr
2323
+ cuanr3 c uan3 rr
2324
+ cuanr4 c uan4 rr
2325
+ cuanr5 c uan5 rr
2326
+ cuir1 c uei1 rr
2327
+ cuir2 c uei2 rr
2328
+ cuir3 c uei3 rr
2329
+ cuir4 c uei4 rr
2330
+ cuir5 c uei5 rr
2331
+ cunr1 c uen1 rr
2332
+ cunr2 c uen2 rr
2333
+ cunr3 c uen3 rr
2334
+ cunr4 c uen4 rr
2335
+ cunr5 c uen5 rr
2336
+ cuor1 c uo1 rr
2337
+ cuor2 c uo2 rr
2338
+ cuor3 c uo3 rr
2339
+ cuor4 c uo4 rr
2340
+ cuor5 c uo5 rr
2341
+ dar1 d a1 rr
2342
+ dar2 d a2 rr
2343
+ dar3 d a3 rr
2344
+ dar4 d a4 rr
2345
+ dar5 d a5 rr
2346
+ dair1 d ai1 rr
2347
+ dair2 d ai2 rr
2348
+ dair3 d ai3 rr
2349
+ dair4 d ai4 rr
2350
+ dair5 d ai5 rr
2351
+ danr1 d an1 rr
2352
+ danr2 d an2 rr
2353
+ danr3 d an3 rr
2354
+ danr4 d an4 rr
2355
+ danr5 d an5 rr
2356
+ dangr1 d ang1 rr
2357
+ dangr2 d ang2 rr
2358
+ dangr3 d ang3 rr
2359
+ dangr4 d ang4 rr
2360
+ dangr5 d ang5 rr
2361
+ daor1 d ao1 rr
2362
+ daor2 d ao2 rr
2363
+ daor3 d ao3 rr
2364
+ daor4 d ao4 rr
2365
+ daor5 d ao5 rr
2366
+ der1 d e1 rr
2367
+ der2 d e2 rr
2368
+ der3 d e3 rr
2369
+ der4 d e4 rr
2370
+ der5 d e5 rr
2371
+ deir1 d ei1 rr
2372
+ deir2 d ei2 rr
2373
+ deir3 d ei3 rr
2374
+ deir4 d ei4 rr
2375
+ deir5 d ei5 rr
2376
+ denr1 d en1 rr
2377
+ denr2 d en2 rr
2378
+ denr3 d en3 rr
2379
+ denr4 d en4 rr
2380
+ denr5 d en5 rr
2381
+ dengr1 d eng1 rr
2382
+ dengr2 d eng2 rr
2383
+ dengr3 d eng3 rr
2384
+ dengr4 d eng4 rr
2385
+ dengr5 d eng5 rr
2386
+ dir1 d i1 rr
2387
+ dir2 d i2 rr
2388
+ dir3 d i3 rr
2389
+ dir4 d i4 rr
2390
+ dir5 d i5 rr
2391
+ diar1 d ia1 rr
2392
+ diar2 d ia2 rr
2393
+ diar3 d ia3 rr
2394
+ diar4 d ia4 rr
2395
+ diar5 d ia5 rr
2396
+ dianr1 d ian1 rr
2397
+ dianr2 d ian2 rr
2398
+ dianr3 d ian3 rr
2399
+ dianr4 d ian4 rr
2400
+ dianr5 d ian5 rr
2401
+ diaor1 d iao1 rr
2402
+ diaor2 d iao2 rr
2403
+ diaor3 d iao3 rr
2404
+ diaor4 d iao4 rr
2405
+ diaor5 d iao5 rr
2406
+ dier1 d ie1 rr
2407
+ dier2 d ie2 rr
2408
+ dier3 d ie3 rr
2409
+ dier4 d ie4 rr
2410
+ dier5 d ie5 rr
2411
+ dingr1 d ing1 rr
2412
+ dingr2 d ing2 rr
2413
+ dingr3 d ing3 rr
2414
+ dingr4 d ing4 rr
2415
+ dingr5 d ing5 rr
2416
+ diur1 d iou1 rr
2417
+ diur2 d iou2 rr
2418
+ diur3 d iou3 rr
2419
+ diur4 d iou4 rr
2420
+ diur5 d iou5 rr
2421
+ dongr1 d ong1 rr
2422
+ dongr2 d ong2 rr
2423
+ dongr3 d ong3 rr
2424
+ dongr4 d ong4 rr
2425
+ dongr5 d ong5 rr
2426
+ dour1 d ou1 rr
2427
+ dour2 d ou2 rr
2428
+ dour3 d ou3 rr
2429
+ dour4 d ou4 rr
2430
+ dour5 d ou5 rr
2431
+ dur1 d u1 rr
2432
+ dur2 d u2 rr
2433
+ dur3 d u3 rr
2434
+ dur4 d u4 rr
2435
+ dur5 d u5 rr
2436
+ duanr1 d uan1 rr
2437
+ duanr2 d uan2 rr
2438
+ duanr3 d uan3 rr
2439
+ duanr4 d uan4 rr
2440
+ duanr5 d uan5 rr
2441
+ duir1 d uei1 rr
2442
+ duir2 d uei2 rr
2443
+ duir3 d uei3 rr
2444
+ duir4 d uei4 rr
2445
+ duir5 d uei5 rr
2446
+ dunr1 d uen1 rr
2447
+ dunr2 d uen2 rr
2448
+ dunr3 d uen3 rr
2449
+ dunr4 d uen4 rr
2450
+ dunr5 d uen5 rr
2451
+ duor1 d uo1 rr
2452
+ duor2 d uo2 rr
2453
+ duor3 d uo3 rr
2454
+ duor4 d uo4 rr
2455
+ duor5 d uo5 rr
2456
+ er1 e1 rr
2457
+ er2 e2 rr
2458
+ er3 e3 rr
2459
+ er4 e4 rr
2460
+ er5 e5 rr
2461
+ eir1 ei1 rr
2462
+ eir2 ei2 rr
2463
+ eir3 ei3 rr
2464
+ eir4 ei4 rr
2465
+ eir5 ei5 rr
2466
+ enr1 en1 rr
2467
+ enr2 en2 rr
2468
+ enr3 en3 rr
2469
+ enr4 en4 rr
2470
+ enr5 en5 rr
2471
+ engr1 eng1 rr
2472
+ engr2 eng2 rr
2473
+ engr3 eng3 rr
2474
+ engr4 eng4 rr
2475
+ engr5 eng5 rr
2476
+ far1 f a1 rr
2477
+ far2 f a2 rr
2478
+ far3 f a3 rr
2479
+ far4 f a4 rr
2480
+ far5 f a5 rr
2481
+ fanr1 f an1 rr
2482
+ fanr2 f an2 rr
2483
+ fanr3 f an3 rr
2484
+ fanr4 f an4 rr
2485
+ fanr5 f an5 rr
2486
+ fangr1 f ang1 rr
2487
+ fangr2 f ang2 rr
2488
+ fangr3 f ang3 rr
2489
+ fangr4 f ang4 rr
2490
+ fangr5 f ang5 rr
2491
+ feir1 f ei1 rr
2492
+ feir2 f ei2 rr
2493
+ feir3 f ei3 rr
2494
+ feir4 f ei4 rr
2495
+ feir5 f ei5 rr
2496
+ fenr1 f en1 rr
2497
+ fenr2 f en2 rr
2498
+ fenr3 f en3 rr
2499
+ fenr4 f en4 rr
2500
+ fenr5 f en5 rr
2501
+ fengr1 f eng1 rr
2502
+ fengr2 f eng2 rr
2503
+ fengr3 f eng3 rr
2504
+ fengr4 f eng4 rr
2505
+ fengr5 f eng5 rr
2506
+ for1 f o1 rr
2507
+ for2 f o2 rr
2508
+ for3 f o3 rr
2509
+ for4 f o4 rr
2510
+ for5 f o5 rr
2511
+ four1 f ou1 rr
2512
+ four2 f ou2 rr
2513
+ four3 f ou3 rr
2514
+ four4 f ou4 rr
2515
+ four5 f ou5 rr
2516
+ fur1 f u1 rr
2517
+ fur2 f u2 rr
2518
+ fur3 f u3 rr
2519
+ fur4 f u4 rr
2520
+ fur5 f u5 rr
2521
+ gar1 g a1 rr
2522
+ gar2 g a2 rr
2523
+ gar3 g a3 rr
2524
+ gar4 g a4 rr
2525
+ gar5 g a5 rr
2526
+ gair1 g ai1 rr
2527
+ gair2 g ai2 rr
2528
+ gair3 g ai3 rr
2529
+ gair4 g ai4 rr
2530
+ gair5 g ai5 rr
2531
+ ganr1 g an1 rr
2532
+ ganr2 g an2 rr
2533
+ ganr3 g an3 rr
2534
+ ganr4 g an4 rr
2535
+ ganr5 g an5 rr
2536
+ gangr1 g ang1 rr
2537
+ gangr2 g ang2 rr
2538
+ gangr3 g ang3 rr
2539
+ gangr4 g ang4 rr
2540
+ gangr5 g ang5 rr
2541
+ gaor1 g ao1 rr
2542
+ gaor2 g ao2 rr
2543
+ gaor3 g ao3 rr
2544
+ gaor4 g ao4 rr
2545
+ gaor5 g ao5 rr
2546
+ ger1 g e1 rr
2547
+ ger2 g e2 rr
2548
+ ger3 g e3 rr
2549
+ ger4 g e4 rr
2550
+ ger5 g e5 rr
2551
+ geir1 g ei1 rr
2552
+ geir2 g ei2 rr
2553
+ geir3 g ei3 rr
2554
+ geir4 g ei4 rr
2555
+ geir5 g ei5 rr
2556
+ genr1 g en1 rr
2557
+ genr2 g en2 rr
2558
+ genr3 g en3 rr
2559
+ genr4 g en4 rr
2560
+ genr5 g en5 rr
2561
+ gengr1 g eng1 rr
2562
+ gengr2 g eng2 rr
2563
+ gengr3 g eng3 rr
2564
+ gengr4 g eng4 rr
2565
+ gengr5 g eng5 rr
2566
+ gongr1 g ong1 rr
2567
+ gongr2 g ong2 rr
2568
+ gongr3 g ong3 rr
2569
+ gongr4 g ong4 rr
2570
+ gongr5 g ong5 rr
2571
+ gour1 g ou1 rr
2572
+ gour2 g ou2 rr
2573
+ gour3 g ou3 rr
2574
+ gour4 g ou4 rr
2575
+ gour5 g ou5 rr
2576
+ gur1 g u1 rr
2577
+ gur2 g u2 rr
2578
+ gur3 g u3 rr
2579
+ gur4 g u4 rr
2580
+ gur5 g u5 rr
2581
+ guar1 g ua1 rr
2582
+ guar2 g ua2 rr
2583
+ guar3 g ua3 rr
2584
+ guar4 g ua4 rr
2585
+ guar5 g ua5 rr
2586
+ guair1 g uai1 rr
2587
+ guair2 g uai2 rr
2588
+ guair3 g uai3 rr
2589
+ guair4 g uai4 rr
2590
+ guair5 g uai5 rr
2591
+ guanr1 g uan1 rr
2592
+ guanr2 g uan2 rr
2593
+ guanr3 g uan3 rr
2594
+ guanr4 g uan4 rr
2595
+ guanr5 g uan5 rr
2596
+ guangr1 g uang1 rr
2597
+ guangr2 g uang2 rr
2598
+ guangr3 g uang3 rr
2599
+ guangr4 g uang4 rr
2600
+ guangr5 g uang5 rr
2601
+ guir1 g uei1 rr
2602
+ guir2 g uei2 rr
2603
+ guir3 g uei3 rr
2604
+ guir4 g uei4 rr
2605
+ guir5 g uei5 rr
2606
+ gunr1 g uen1 rr
2607
+ gunr2 g uen2 rr
2608
+ gunr3 g uen3 rr
2609
+ gunr4 g uen4 rr
2610
+ gunr5 g uen5 rr
2611
+ guor1 g uo1 rr
2612
+ guor2 g uo2 rr
2613
+ guor3 g uo3 rr
2614
+ guor4 g uo4 rr
2615
+ guor5 g uo5 rr
2616
+ har1 h a1 rr
2617
+ har2 h a2 rr
2618
+ har3 h a3 rr
2619
+ har4 h a4 rr
2620
+ har5 h a5 rr
2621
+ hair1 h ai1 rr
2622
+ hair2 h ai2 rr
2623
+ hair3 h ai3 rr
2624
+ hair4 h ai4 rr
2625
+ hair5 h ai5 rr
2626
+ hanr1 h an1 rr
2627
+ hanr2 h an2 rr
2628
+ hanr3 h an3 rr
2629
+ hanr4 h an4 rr
2630
+ hanr5 h an5 rr
2631
+ hangr1 h ang1 rr
2632
+ hangr2 h ang2 rr
2633
+ hangr3 h ang3 rr
2634
+ hangr4 h ang4 rr
2635
+ hangr5 h ang5 rr
2636
+ haor1 h ao1 rr
2637
+ haor2 h ao2 rr
2638
+ haor3 h ao3 rr
2639
+ haor4 h ao4 rr
2640
+ haor5 h ao5 rr
2641
+ her1 h e1 rr
2642
+ her2 h e2 rr
2643
+ her3 h e3 rr
2644
+ her4 h e4 rr
2645
+ her5 h e5 rr
2646
+ heir1 h ei1 rr
2647
+ heir2 h ei2 rr
2648
+ heir3 h ei3 rr
2649
+ heir4 h ei4 rr
2650
+ heir5 h ei5 rr
2651
+ henr1 h en1 rr
2652
+ henr2 h en2 rr
2653
+ henr3 h en3 rr
2654
+ henr4 h en4 rr
2655
+ henr5 h en5 rr
2656
+ hengr1 h eng1 rr
2657
+ hengr2 h eng2 rr
2658
+ hengr3 h eng3 rr
2659
+ hengr4 h eng4 rr
2660
+ hengr5 h eng5 rr
2661
+ hongr1 h ong1 rr
2662
+ hongr2 h ong2 rr
2663
+ hongr3 h ong3 rr
2664
+ hongr4 h ong4 rr
2665
+ hongr5 h ong5 rr
2666
+ hour1 h ou1 rr
2667
+ hour2 h ou2 rr
2668
+ hour3 h ou3 rr
2669
+ hour4 h ou4 rr
2670
+ hour5 h ou5 rr
2671
+ hur1 h u1 rr
2672
+ hur2 h u2 rr
2673
+ hur3 h u3 rr
2674
+ hur4 h u4 rr
2675
+ hur5 h u5 rr
2676
+ huar1 h ua1 rr
2677
+ huar2 h ua2 rr
2678
+ huar3 h ua3 rr
2679
+ huar4 h ua4 rr
2680
+ huar5 h ua5 rr
2681
+ huair1 h uai1 rr
2682
+ huair2 h uai2 rr
2683
+ huair3 h uai3 rr
2684
+ huair4 h uai4 rr
2685
+ huair5 h uai5 rr
2686
+ huanr1 h uan1 rr
2687
+ huanr2 h uan2 rr
2688
+ huanr3 h uan3 rr
2689
+ huanr4 h uan4 rr
2690
+ huanr5 h uan5 rr
2691
+ huangr1 h uang1 rr
2692
+ huangr2 h uang2 rr
2693
+ huangr3 h uang3 rr
2694
+ huangr4 h uang4 rr
2695
+ huangr5 h uang5 rr
2696
+ huir1 h uei1 rr
2697
+ huir2 h uei2 rr
2698
+ huir3 h uei3 rr
2699
+ huir4 h uei4 rr
2700
+ huir5 h uei5 rr
2701
+ hunr1 h uen1 rr
2702
+ hunr2 h uen2 rr
2703
+ hunr3 h uen3 rr
2704
+ hunr4 h uen4 rr
2705
+ hunr5 h uen5 rr
2706
+ huor1 h uo1 rr
2707
+ huor2 h uo2 rr
2708
+ huor3 h uo3 rr
2709
+ huor4 h uo4 rr
2710
+ huor5 h uo5 rr
2711
+ jir1 j i1 rr
2712
+ jir2 j i2 rr
2713
+ jir3 j i3 rr
2714
+ jir4 j i4 rr
2715
+ jir5 j i5 rr
2716
+ jiar1 j ia1 rr
2717
+ jiar2 j ia2 rr
2718
+ jiar3 j ia3 rr
2719
+ jiar4 j ia4 rr
2720
+ jiar5 j ia5 rr
2721
+ jianr1 j ian1 rr
2722
+ jianr2 j ian2 rr
2723
+ jianr3 j ian3 rr
2724
+ jianr4 j ian4 rr
2725
+ jianr5 j ian5 rr
2726
+ jiangr1 j iang1 rr
2727
+ jiangr2 j iang2 rr
2728
+ jiangr3 j iang3 rr
2729
+ jiangr4 j iang4 rr
2730
+ jiangr5 j iang5 rr
2731
+ jiaor1 j iao1 rr
2732
+ jiaor2 j iao2 rr
2733
+ jiaor3 j iao3 rr
2734
+ jiaor4 j iao4 rr
2735
+ jiaor5 j iao5 rr
2736
+ jier1 j ie1 rr
2737
+ jier2 j ie2 rr
2738
+ jier3 j ie3 rr
2739
+ jier4 j ie4 rr
2740
+ jier5 j ie5 rr
2741
+ jinr1 j in1 rr
2742
+ jinr2 j in2 rr
2743
+ jinr3 j in3 rr
2744
+ jinr4 j in4 rr
2745
+ jinr5 j in5 rr
2746
+ jingr1 j ing1 rr
2747
+ jingr2 j ing2 rr
2748
+ jingr3 j ing3 rr
2749
+ jingr4 j ing4 rr
2750
+ jingr5 j ing5 rr
2751
+ jiongr1 j iong1 rr
2752
+ jiongr2 j iong2 rr
2753
+ jiongr3 j iong3 rr
2754
+ jiongr4 j iong4 rr
2755
+ jiongr5 j iong5 rr
2756
+ jiur1 j iou1 rr
2757
+ jiur2 j iou2 rr
2758
+ jiur3 j iou3 rr
2759
+ jiur4 j iou4 rr
2760
+ jiur5 j iou5 rr
2761
+ jur1 j v1 rr
2762
+ jur2 j v2 rr
2763
+ jur3 j v3 rr
2764
+ jur4 j v4 rr
2765
+ jur5 j v5 rr
2766
+ juanr1 j van1 rr
2767
+ juanr2 j van2 rr
2768
+ juanr3 j van3 rr
2769
+ juanr4 j van4 rr
2770
+ juanr5 j van5 rr
2771
+ juer1 j ve1 rr
2772
+ juer2 j ve2 rr
2773
+ juer3 j ve3 rr
2774
+ juer4 j ve4 rr
2775
+ juer5 j ve5 rr
2776
+ junr1 j vn1 rr
2777
+ junr2 j vn2 rr
2778
+ junr3 j vn3 rr
2779
+ junr4 j vn4 rr
2780
+ junr5 j vn5 rr
2781
+ kar1 k a1 rr
2782
+ kar2 k a2 rr
2783
+ kar3 k a3 rr
2784
+ kar4 k a4 rr
2785
+ kar5 k a5 rr
2786
+ kair1 k ai1 rr
2787
+ kair2 k ai2 rr
2788
+ kair3 k ai3 rr
2789
+ kair4 k ai4 rr
2790
+ kair5 k ai5 rr
2791
+ kanr1 k an1 rr
2792
+ kanr2 k an2 rr
2793
+ kanr3 k an3 rr
2794
+ kanr4 k an4 rr
2795
+ kanr5 k an5 rr
2796
+ kangr1 k ang1 rr
2797
+ kangr2 k ang2 rr
2798
+ kangr3 k ang3 rr
2799
+ kangr4 k ang4 rr
2800
+ kangr5 k ang5 rr
2801
+ kaor1 k ao1 rr
2802
+ kaor2 k ao2 rr
2803
+ kaor3 k ao3 rr
2804
+ kaor4 k ao4 rr
2805
+ kaor5 k ao5 rr
2806
+ ker1 k e1 rr
2807
+ ker2 k e2 rr
2808
+ ker3 k e3 rr
2809
+ ker4 k e4 rr
2810
+ ker5 k e5 rr
2811
+ keir1 k ei1 rr
2812
+ keir2 k ei2 rr
2813
+ keir3 k ei3 rr
2814
+ keir4 k ei4 rr
2815
+ keir5 k ei5 rr
2816
+ kenr1 k en1 rr
2817
+ kenr2 k en2 rr
2818
+ kenr3 k en3 rr
2819
+ kenr4 k en4 rr
2820
+ kenr5 k en5 rr
2821
+ kengr1 k eng1 rr
2822
+ kengr2 k eng2 rr
2823
+ kengr3 k eng3 rr
2824
+ kengr4 k eng4 rr
2825
+ kengr5 k eng5 rr
2826
+ kongr1 k ong1 rr
2827
+ kongr2 k ong2 rr
2828
+ kongr3 k ong3 rr
2829
+ kongr4 k ong4 rr
2830
+ kongr5 k ong5 rr
2831
+ kour1 k ou1 rr
2832
+ kour2 k ou2 rr
2833
+ kour3 k ou3 rr
2834
+ kour4 k ou4 rr
2835
+ kour5 k ou5 rr
2836
+ kur1 k u1 rr
2837
+ kur2 k u2 rr
2838
+ kur3 k u3 rr
2839
+ kur4 k u4 rr
2840
+ kur5 k u5 rr
2841
+ kuar1 k ua1 rr
2842
+ kuar2 k ua2 rr
2843
+ kuar3 k ua3 rr
2844
+ kuar4 k ua4 rr
2845
+ kuar5 k ua5 rr
2846
+ kuair1 k uai1 rr
2847
+ kuair2 k uai2 rr
2848
+ kuair3 k uai3 rr
2849
+ kuair4 k uai4 rr
2850
+ kuair5 k uai5 rr
2851
+ kuanr1 k uan1 rr
2852
+ kuanr2 k uan2 rr
2853
+ kuanr3 k uan3 rr
2854
+ kuanr4 k uan4 rr
2855
+ kuanr5 k uan5 rr
2856
+ kuangr1 k uang1 rr
2857
+ kuangr2 k uang2 rr
2858
+ kuangr3 k uang3 rr
2859
+ kuangr4 k uang4 rr
2860
+ kuangr5 k uang5 rr
2861
+ kuir1 k uei1 rr
2862
+ kuir2 k uei2 rr
2863
+ kuir3 k uei3 rr
2864
+ kuir4 k uei4 rr
2865
+ kuir5 k uei5 rr
2866
+ kunr1 k uen1 rr
2867
+ kunr2 k uen2 rr
2868
+ kunr3 k uen3 rr
2869
+ kunr4 k uen4 rr
2870
+ kunr5 k uen5 rr
2871
+ kuor1 k uo1 rr
2872
+ kuor2 k uo2 rr
2873
+ kuor3 k uo3 rr
2874
+ kuor4 k uo4 rr
2875
+ kuor5 k uo5 rr
2876
+ lar1 l a1 rr
2877
+ lar2 l a2 rr
2878
+ lar3 l a3 rr
2879
+ lar4 l a4 rr
2880
+ lar5 l a5 rr
2881
+ lair1 l ai1 rr
2882
+ lair2 l ai2 rr
2883
+ lair3 l ai3 rr
2884
+ lair4 l ai4 rr
2885
+ lair5 l ai5 rr
2886
+ lanr1 l an1 rr
2887
+ lanr2 l an2 rr
2888
+ lanr3 l an3 rr
2889
+ lanr4 l an4 rr
2890
+ lanr5 l an5 rr
2891
+ langr1 l ang1 rr
2892
+ langr2 l ang2 rr
2893
+ langr3 l ang3 rr
2894
+ langr4 l ang4 rr
2895
+ langr5 l ang5 rr
2896
+ laor1 l ao1 rr
2897
+ laor2 l ao2 rr
2898
+ laor3 l ao3 rr
2899
+ laor4 l ao4 rr
2900
+ laor5 l ao5 rr
2901
+ ler1 l e1 rr
2902
+ ler2 l e2 rr
2903
+ ler3 l e3 rr
2904
+ ler4 l e4 rr
2905
+ ler5 l e5 rr
2906
+ leir1 l ei1 rr
2907
+ leir2 l ei2 rr
2908
+ leir3 l ei3 rr
2909
+ leir4 l ei4 rr
2910
+ leir5 l ei5 rr
2911
+ lengr1 l eng1 rr
2912
+ lengr2 l eng2 rr
2913
+ lengr3 l eng3 rr
2914
+ lengr4 l eng4 rr
2915
+ lengr5 l eng5 rr
2916
+ lir1 l i1 rr
2917
+ lir2 l i2 rr
2918
+ lir3 l i3 rr
2919
+ lir4 l i4 rr
2920
+ lir5 l i5 rr
2921
+ liar1 l ia1 rr
2922
+ liar2 l ia2 rr
2923
+ liar3 l ia3 rr
2924
+ liar4 l ia4 rr
2925
+ liar5 l ia5 rr
2926
+ lianr1 l ian1 rr
2927
+ lianr2 l ian2 rr
2928
+ lianr3 l ian3 rr
2929
+ lianr4 l ian4 rr
2930
+ lianr5 l ian5 rr
2931
+ liangr1 l iang1 rr
2932
+ liangr2 l iang2 rr
2933
+ liangr3 l iang3 rr
2934
+ liangr4 l iang4 rr
2935
+ liangr5 l iang5 rr
2936
+ liaor1 l iao1 rr
2937
+ liaor2 l iao2 rr
2938
+ liaor3 l iao3 rr
2939
+ liaor4 l iao4 rr
2940
+ liaor5 l iao5 rr
2941
+ lier1 l ie1 rr
2942
+ lier2 l ie2 rr
2943
+ lier3 l ie3 rr
2944
+ lier4 l ie4 rr
2945
+ lier5 l ie5 rr
2946
+ linr1 l in1 rr
2947
+ linr2 l in2 rr
2948
+ linr3 l in3 rr
2949
+ linr4 l in4 rr
2950
+ linr5 l in5 rr
2951
+ lingr1 l ing1 rr
2952
+ lingr2 l ing2 rr
2953
+ lingr3 l ing3 rr
2954
+ lingr4 l ing4 rr
2955
+ lingr5 l ing5 rr
2956
+ liur1 l iou1 rr
2957
+ liur2 l iou2 rr
2958
+ liur3 l iou3 rr
2959
+ liur4 l iou4 rr
2960
+ liur5 l iou5 rr
2961
+ lor1 l o1 rr
2962
+ lor2 l o2 rr
2963
+ lor3 l o3 rr
2964
+ lor4 l o4 rr
2965
+ lor5 l o5 rr
2966
+ longr1 l ong1 rr
2967
+ longr2 l ong2 rr
2968
+ longr3 l ong3 rr
2969
+ longr4 l ong4 rr
2970
+ longr5 l ong5 rr
2971
+ lour1 l ou1 rr
2972
+ lour2 l ou2 rr
2973
+ lour3 l ou3 rr
2974
+ lour4 l ou4 rr
2975
+ lour5 l ou5 rr
2976
+ lur1 l u1 rr
2977
+ lur2 l u2 rr
2978
+ lur3 l u3 rr
2979
+ lur4 l u4 rr
2980
+ lur5 l u5 rr
2981
+ luanr1 l uan1 rr
2982
+ luanr2 l uan2 rr
2983
+ luanr3 l uan3 rr
2984
+ luanr4 l uan4 rr
2985
+ luanr5 l uan5 rr
2986
+ luer1 l ve1 rr
2987
+ luer2 l ve2 rr
2988
+ luer3 l ve3 rr
2989
+ luer4 l ve4 rr
2990
+ luer5 l ve5 rr
2991
+ lver1 l ve1 rr
2992
+ lver2 l ve2 rr
2993
+ lver3 l ve3 rr
2994
+ lver4 l ve4 rr
2995
+ lver5 l ve5 rr
2996
+ lunr1 l uen1 rr
2997
+ lunr2 l uen2 rr
2998
+ lunr3 l uen3 rr
2999
+ lunr4 l uen4 rr
3000
+ lunr5 l uen5 rr
3001
+ luor1 l uo1 rr
3002
+ luor2 l uo2 rr
3003
+ luor3 l uo3 rr
3004
+ luor4 l uo4 rr
3005
+ luor5 l uo5 rr
3006
+ lvr1 l v1 rr
3007
+ lvr2 l v2 rr
3008
+ lvr3 l v3 rr
3009
+ lvr4 l v4 rr
3010
+ lvr5 l v5 rr
3011
+ mar1 m a1 rr
3012
+ mar2 m a2 rr
3013
+ mar3 m a3 rr
3014
+ mar4 m a4 rr
3015
+ mar5 m a5 rr
3016
+ mair1 m ai1 rr
3017
+ mair2 m ai2 rr
3018
+ mair3 m ai3 rr
3019
+ mair4 m ai4 rr
3020
+ mair5 m ai5 rr
3021
+ manr1 m an1 rr
3022
+ manr2 m an2 rr
3023
+ manr3 m an3 rr
3024
+ manr4 m an4 rr
3025
+ manr5 m an5 rr
3026
+ mangr1 m ang1 rr
3027
+ mangr2 m ang2 rr
3028
+ mangr3 m ang3 rr
3029
+ mangr4 m ang4 rr
3030
+ mangr5 m ang5 rr
3031
+ maor1 m ao1 rr
3032
+ maor2 m ao2 rr
3033
+ maor3 m ao3 rr
3034
+ maor4 m ao4 rr
3035
+ maor5 m ao5 rr
3036
+ mer1 m e1 rr
3037
+ mer2 m e2 rr
3038
+ mer3 m e3 rr
3039
+ mer4 m e4 rr
3040
+ mer5 m e5 rr
3041
+ meir1 m ei1 rr
3042
+ meir2 m ei2 rr
3043
+ meir3 m ei3 rr
3044
+ meir4 m ei4 rr
3045
+ meir5 m ei5 rr
3046
+ menr1 m en1 rr
3047
+ menr2 m en2 rr
3048
+ menr3 m en3 rr
3049
+ menr4 m en4 rr
3050
+ menr5 m en5 rr
3051
+ mengr1 m eng1 rr
3052
+ mengr2 m eng2 rr
3053
+ mengr3 m eng3 rr
3054
+ mengr4 m eng4 rr
3055
+ mengr5 m eng5 rr
3056
+ mir1 m i1 rr
3057
+ mir2 m i2 rr
3058
+ mir3 m i3 rr
3059
+ mir4 m i4 rr
3060
+ mir5 m i5 rr
3061
+ mianr1 m ian1 rr
3062
+ mianr2 m ian2 rr
3063
+ mianr3 m ian3 rr
3064
+ mianr4 m ian4 rr
3065
+ mianr5 m ian5 rr
3066
+ miaor1 m iao1 rr
3067
+ miaor2 m iao2 rr
3068
+ miaor3 m iao3 rr
3069
+ miaor4 m iao4 rr
3070
+ miaor5 m iao5 rr
3071
+ mier1 m ie1 rr
3072
+ mier2 m ie2 rr
3073
+ mier3 m ie3 rr
3074
+ mier4 m ie4 rr
3075
+ mier5 m ie5 rr
3076
+ minr1 m in1 rr
3077
+ minr2 m in2 rr
3078
+ minr3 m in3 rr
3079
+ minr4 m in4 rr
3080
+ minr5 m in5 rr
3081
+ mingr1 m ing1 rr
3082
+ mingr2 m ing2 rr
3083
+ mingr3 m ing3 rr
3084
+ mingr4 m ing4 rr
3085
+ mingr5 m ing5 rr
3086
+ miur1 m iou1 rr
3087
+ miur2 m iou2 rr
3088
+ miur3 m iou3 rr
3089
+ miur4 m iou4 rr
3090
+ miur5 m iou5 rr
3091
+ mor1 m o1 rr
3092
+ mor2 m o2 rr
3093
+ mor3 m o3 rr
3094
+ mor4 m o4 rr
3095
+ mor5 m o5 rr
3096
+ mour1 m ou1 rr
3097
+ mour2 m ou2 rr
3098
+ mour3 m ou3 rr
3099
+ mour4 m ou4 rr
3100
+ mour5 m ou5 rr
3101
+ mur1 m u1 rr
3102
+ mur2 m u2 rr
3103
+ mur3 m u3 rr
3104
+ mur4 m u4 rr
3105
+ mur5 m u5 rr
3106
+ nar1 n a1 rr
3107
+ nar2 n a2 rr
3108
+ nar3 n a3 rr
3109
+ nar4 n a4 rr
3110
+ nar5 n a5 rr
3111
+ nair1 n ai1 rr
3112
+ nair2 n ai2 rr
3113
+ nair3 n ai3 rr
3114
+ nair4 n ai4 rr
3115
+ nair5 n ai5 rr
3116
+ nanr1 n an1 rr
3117
+ nanr2 n an2 rr
3118
+ nanr3 n an3 rr
3119
+ nanr4 n an4 rr
3120
+ nanr5 n an5 rr
3121
+ nangr1 n ang1 rr
3122
+ nangr2 n ang2 rr
3123
+ nangr3 n ang3 rr
3124
+ nangr4 n ang4 rr
3125
+ nangr5 n ang5 rr
3126
+ naor1 n ao1 rr
3127
+ naor2 n ao2 rr
3128
+ naor3 n ao3 rr
3129
+ naor4 n ao4 rr
3130
+ naor5 n ao5 rr
3131
+ ner1 n e1 rr
3132
+ ner2 n e2 rr
3133
+ ner3 n e3 rr
3134
+ ner4 n e4 rr
3135
+ ner5 n e5 rr
3136
+ neir1 n ei1 rr
3137
+ neir2 n ei2 rr
3138
+ neir3 n ei3 rr
3139
+ neir4 n ei4 rr
3140
+ neir5 n ei5 rr
3141
+ nenr1 n en1 rr
3142
+ nenr2 n en2 rr
3143
+ nenr3 n en3 rr
3144
+ nenr4 n en4 rr
3145
+ nenr5 n en5 rr
3146
+ nengr1 n eng1 rr
3147
+ nengr2 n eng2 rr
3148
+ nengr3 n eng3 rr
3149
+ nengr4 n eng4 rr
3150
+ nengr5 n eng5 rr
3151
+ nir1 n i1 rr
3152
+ nir2 n i2 rr
3153
+ nir3 n i3 rr
3154
+ nir4 n i4 rr
3155
+ nir5 n i5 rr
3156
+ nianr1 n ian1 rr
3157
+ nianr2 n ian2 rr
3158
+ nianr3 n ian3 rr
3159
+ nianr4 n ian4 rr
3160
+ nianr5 n ian5 rr
3161
+ niangr1 n iang1 rr
3162
+ niangr2 n iang2 rr
3163
+ niangr3 n iang3 rr
3164
+ niangr4 n iang4 rr
3165
+ niangr5 n iang5 rr
3166
+ niaor1 n iao1 rr
3167
+ niaor2 n iao2 rr
3168
+ niaor3 n iao3 rr
3169
+ niaor4 n iao4 rr
3170
+ niaor5 n iao5 rr
3171
+ nier1 n ie1 rr
3172
+ nier2 n ie2 rr
3173
+ nier3 n ie3 rr
3174
+ nier4 n ie4 rr
3175
+ nier5 n ie5 rr
3176
+ ninr1 n in1 rr
3177
+ ninr2 n in2 rr
3178
+ ninr3 n in3 rr
3179
+ ninr4 n in4 rr
3180
+ ninr5 n in5 rr
3181
+ ningr1 n ing1 rr
3182
+ ningr2 n ing2 rr
3183
+ ningr3 n ing3 rr
3184
+ ningr4 n ing4 rr
3185
+ ningr5 n ing5 rr
3186
+ niur1 n iou1 rr
3187
+ niur2 n iou2 rr
3188
+ niur3 n iou3 rr
3189
+ niur4 n iou4 rr
3190
+ niur5 n iou5 rr
3191
+ nongr1 n ong1 rr
3192
+ nongr2 n ong2 rr
3193
+ nongr3 n ong3 rr
3194
+ nongr4 n ong4 rr
3195
+ nongr5 n ong5 rr
3196
+ nour1 n ou1 rr
3197
+ nour2 n ou2 rr
3198
+ nour3 n ou3 rr
3199
+ nour4 n ou4 rr
3200
+ nour5 n ou5 rr
3201
+ nur1 n u1 rr
3202
+ nur2 n u2 rr
3203
+ nur3 n u3 rr
3204
+ nur4 n u4 rr
3205
+ nur5 n u5 rr
3206
+ nuanr1 n uan1 rr
3207
+ nuanr2 n uan2 rr
3208
+ nuanr3 n uan3 rr
3209
+ nuanr4 n uan4 rr
3210
+ nuanr5 n uan5 rr
3211
+ nuer1 n ve1 rr
3212
+ nuer2 n ve2 rr
3213
+ nuer3 n ve3 rr
3214
+ nuer4 n ve4 rr
3215
+ nuer5 n ve5 rr
3216
+ nver1 n ve1 rr
3217
+ nver2 n ve2 rr
3218
+ nver3 n ve3 rr
3219
+ nver4 n ve4 rr
3220
+ nver5 n ve5 rr
3221
+ nuor1 n uo1 rr
3222
+ nuor2 n uo2 rr
3223
+ nuor3 n uo3 rr
3224
+ nuor4 n uo4 rr
3225
+ nuor5 n uo5 rr
3226
+ nvr1 n v1 rr
3227
+ nvr2 n v2 rr
3228
+ nvr3 n v3 rr
3229
+ nvr4 n v4 rr
3230
+ nvr5 n v5 rr
3231
+ or1 o1 rr
3232
+ or2 o2 rr
3233
+ or3 o3 rr
3234
+ or4 o4 rr
3235
+ or5 o5 rr
3236
+ our1 ou1 rr
3237
+ our2 ou2 rr
3238
+ our3 ou3 rr
3239
+ our4 ou4 rr
3240
+ our5 ou5 rr
3241
+ par1 p a1 rr
3242
+ par2 p a2 rr
3243
+ par3 p a3 rr
3244
+ par4 p a4 rr
3245
+ par5 p a5 rr
3246
+ pair1 p ai1 rr
3247
+ pair2 p ai2 rr
3248
+ pair3 p ai3 rr
3249
+ pair4 p ai4 rr
3250
+ pair5 p ai5 rr
3251
+ panr1 p an1 rr
3252
+ panr2 p an2 rr
3253
+ panr3 p an3 rr
3254
+ panr4 p an4 rr
3255
+ panr5 p an5 rr
3256
+ pangr1 p ang1 rr
3257
+ pangr2 p ang2 rr
3258
+ pangr3 p ang3 rr
3259
+ pangr4 p ang4 rr
3260
+ pangr5 p ang5 rr
3261
+ paor1 p ao1 rr
3262
+ paor2 p ao2 rr
3263
+ paor3 p ao3 rr
3264
+ paor4 p ao4 rr
3265
+ paor5 p ao5 rr
3266
+ peir1 p ei1 rr
3267
+ peir2 p ei2 rr
3268
+ peir3 p ei3 rr
3269
+ peir4 p ei4 rr
3270
+ peir5 p ei5 rr
3271
+ penr1 p en1 rr
3272
+ penr2 p en2 rr
3273
+ penr3 p en3 rr
3274
+ penr4 p en4 rr
3275
+ penr5 p en5 rr
3276
+ pengr1 p eng1 rr
3277
+ pengr2 p eng2 rr
3278
+ pengr3 p eng3 rr
3279
+ pengr4 p eng4 rr
3280
+ pengr5 p eng5 rr
3281
+ pir1 p i1 rr
3282
+ pir2 p i2 rr
3283
+ pir3 p i3 rr
3284
+ pir4 p i4 rr
3285
+ pir5 p i5 rr
3286
+ pianr1 p ian1 rr
3287
+ pianr2 p ian2 rr
3288
+ pianr3 p ian3 rr
3289
+ pianr4 p ian4 rr
3290
+ pianr5 p ian5 rr
3291
+ piaor1 p iao1 rr
3292
+ piaor2 p iao2 rr
3293
+ piaor3 p iao3 rr
3294
+ piaor4 p iao4 rr
3295
+ piaor5 p iao5 rr
3296
+ pier1 p ie1 rr
3297
+ pier2 p ie2 rr
3298
+ pier3 p ie3 rr
3299
+ pier4 p ie4 rr
3300
+ pier5 p ie5 rr
3301
+ pinr1 p in1 rr
3302
+ pinr2 p in2 rr
3303
+ pinr3 p in3 rr
3304
+ pinr4 p in4 rr
3305
+ pinr5 p in5 rr
3306
+ pingr1 p ing1 rr
3307
+ pingr2 p ing2 rr
3308
+ pingr3 p ing3 rr
3309
+ pingr4 p ing4 rr
3310
+ pingr5 p ing5 rr
3311
+ por1 p o1 rr
3312
+ por2 p o2 rr
3313
+ por3 p o3 rr
3314
+ por4 p o4 rr
3315
+ por5 p o5 rr
3316
+ pour1 p ou1 rr
3317
+ pour2 p ou2 rr
3318
+ pour3 p ou3 rr
3319
+ pour4 p ou4 rr
3320
+ pour5 p ou5 rr
3321
+ pur1 p u1 rr
3322
+ pur2 p u2 rr
3323
+ pur3 p u3 rr
3324
+ pur4 p u4 rr
3325
+ pur5 p u5 rr
3326
+ qir1 q i1 rr
3327
+ qir2 q i2 rr
3328
+ qir3 q i3 rr
3329
+ qir4 q i4 rr
3330
+ qir5 q i5 rr
3331
+ qiar1 q ia1 rr
3332
+ qiar2 q ia2 rr
3333
+ qiar3 q ia3 rr
3334
+ qiar4 q ia4 rr
3335
+ qiar5 q ia5 rr
3336
+ qianr1 q ian1 rr
3337
+ qianr2 q ian2 rr
3338
+ qianr3 q ian3 rr
3339
+ qianr4 q ian4 rr
3340
+ qianr5 q ian5 rr
3341
+ qiangr1 q iang1 rr
3342
+ qiangr2 q iang2 rr
3343
+ qiangr3 q iang3 rr
3344
+ qiangr4 q iang4 rr
3345
+ qiangr5 q iang5 rr
3346
+ qiaor1 q iao1 rr
3347
+ qiaor2 q iao2 rr
3348
+ qiaor3 q iao3 rr
3349
+ qiaor4 q iao4 rr
3350
+ qiaor5 q iao5 rr
3351
+ qier1 q ie1 rr
3352
+ qier2 q ie2 rr
3353
+ qier3 q ie3 rr
3354
+ qier4 q ie4 rr
3355
+ qier5 q ie5 rr
3356
+ qinr1 q in1 rr
3357
+ qinr2 q in2 rr
3358
+ qinr3 q in3 rr
3359
+ qinr4 q in4 rr
3360
+ qinr5 q in5 rr
3361
+ qingr1 q ing1 rr
3362
+ qingr2 q ing2 rr
3363
+ qingr3 q ing3 rr
3364
+ qingr4 q ing4 rr
3365
+ qingr5 q ing5 rr
3366
+ qiongr1 q iong1 rr
3367
+ qiongr2 q iong2 rr
3368
+ qiongr3 q iong3 rr
3369
+ qiongr4 q iong4 rr
3370
+ qiongr5 q iong5 rr
3371
+ qiur1 q iou1 rr
3372
+ qiur2 q iou2 rr
3373
+ qiur3 q iou3 rr
3374
+ qiur4 q iou4 rr
3375
+ qiur5 q iou5 rr
3376
+ qur1 q v1 rr
3377
+ qur2 q v2 rr
3378
+ qur3 q v3 rr
3379
+ qur4 q v4 rr
3380
+ qur5 q v5 rr
3381
+ quanr1 q van1 rr
3382
+ quanr2 q van2 rr
3383
+ quanr3 q van3 rr
3384
+ quanr4 q van4 rr
3385
+ quanr5 q van5 rr
3386
+ quer1 q ve1 rr
3387
+ quer2 q ve2 rr
3388
+ quer3 q ve3 rr
3389
+ quer4 q ve4 rr
3390
+ quer5 q ve5 rr
3391
+ qunr1 q vn1 rr
3392
+ qunr2 q vn2 rr
3393
+ qunr3 q vn3 rr
3394
+ qunr4 q vn4 rr
3395
+ qunr5 q vn5 rr
3396
+ ranr1 r an1 rr
3397
+ ranr2 r an2 rr
3398
+ ranr3 r an3 rr
3399
+ ranr4 r an4 rr
3400
+ ranr5 r an5 rr
3401
+ rangr1 r ang1 rr
3402
+ rangr2 r ang2 rr
3403
+ rangr3 r ang3 rr
3404
+ rangr4 r ang4 rr
3405
+ rangr5 r ang5 rr
3406
+ raor1 r ao1 rr
3407
+ raor2 r ao2 rr
3408
+ raor3 r ao3 rr
3409
+ raor4 r ao4 rr
3410
+ raor5 r ao5 rr
3411
+ rer1 r e1 rr
3412
+ rer2 r e2 rr
3413
+ rer3 r e3 rr
3414
+ rer4 r e4 rr
3415
+ rer5 r e5 rr
3416
+ renr1 r en1 rr
3417
+ renr2 r en2 rr
3418
+ renr3 r en3 rr
3419
+ renr4 r en4 rr
3420
+ renr5 r en5 rr
3421
+ rengr1 r eng1 rr
3422
+ rengr2 r eng2 rr
3423
+ rengr3 r eng3 rr
3424
+ rengr4 r eng4 rr
3425
+ rengr5 r eng5 rr
3426
+ rir1 r iii1 rr
3427
+ rir2 r iii2 rr
3428
+ rir3 r iii3 rr
3429
+ rir4 r iii4 rr
3430
+ rir5 r iii5 rr
3431
+ rongr1 r ong1 rr
3432
+ rongr2 r ong2 rr
3433
+ rongr3 r ong3 rr
3434
+ rongr4 r ong4 rr
3435
+ rongr5 r ong5 rr
3436
+ rour1 r ou1 rr
3437
+ rour2 r ou2 rr
3438
+ rour3 r ou3 rr
3439
+ rour4 r ou4 rr
3440
+ rour5 r ou5 rr
3441
+ rur1 r u1 rr
3442
+ rur2 r u2 rr
3443
+ rur3 r u3 rr
3444
+ rur4 r u4 rr
3445
+ rur5 r u5 rr
3446
+ ruar1 r ua1 rr
3447
+ ruar2 r ua2 rr
3448
+ ruar3 r ua3 rr
3449
+ ruar4 r ua4 rr
3450
+ ruar5 r ua5 rr
3451
+ ruanr1 r uan1 rr
3452
+ ruanr2 r uan2 rr
3453
+ ruanr3 r uan3 rr
3454
+ ruanr4 r uan4 rr
3455
+ ruanr5 r uan5 rr
3456
+ ruir1 r uei1 rr
3457
+ ruir2 r uei2 rr
3458
+ ruir3 r uei3 rr
3459
+ ruir4 r uei4 rr
3460
+ ruir5 r uei5 rr
3461
+ runr1 r uen1 rr
3462
+ runr2 r uen2 rr
3463
+ runr3 r uen3 rr
3464
+ runr4 r uen4 rr
3465
+ runr5 r uen5 rr
3466
+ ruor1 r uo1 rr
3467
+ ruor2 r uo2 rr
3468
+ ruor3 r uo3 rr
3469
+ ruor4 r uo4 rr
3470
+ ruor5 r uo5 rr
3471
+ sar1 s a1 rr
3472
+ sar2 s a2 rr
3473
+ sar3 s a3 rr
3474
+ sar4 s a4 rr
3475
+ sar5 s a5 rr
3476
+ sair1 s ai1 rr
3477
+ sair2 s ai2 rr
3478
+ sair3 s ai3 rr
3479
+ sair4 s ai4 rr
3480
+ sair5 s ai5 rr
3481
+ sanr1 s an1 rr
3482
+ sanr2 s an2 rr
3483
+ sanr3 s an3 rr
3484
+ sanr4 s an4 rr
3485
+ sanr5 s an5 rr
3486
+ sangr1 s ang1 rr
3487
+ sangr2 s ang2 rr
3488
+ sangr3 s ang3 rr
3489
+ sangr4 s ang4 rr
3490
+ sangr5 s ang5 rr
3491
+ saor1 s ao1 rr
3492
+ saor2 s ao2 rr
3493
+ saor3 s ao3 rr
3494
+ saor4 s ao4 rr
3495
+ saor5 s ao5 rr
3496
+ ser1 s e1 rr
3497
+ ser2 s e2 rr
3498
+ ser3 s e3 rr
3499
+ ser4 s e4 rr
3500
+ ser5 s e5 rr
3501
+ senr1 s en1 rr
3502
+ senr2 s en2 rr
3503
+ senr3 s en3 rr
3504
+ senr4 s en4 rr
3505
+ senr5 s en5 rr
3506
+ sengr1 s eng1 rr
3507
+ sengr2 s eng2 rr
3508
+ sengr3 s eng3 rr
3509
+ sengr4 s eng4 rr
3510
+ sengr5 s eng5 rr
3511
+ shar1 sh a1 rr
3512
+ shar2 sh a2 rr
3513
+ shar3 sh a3 rr
3514
+ shar4 sh a4 rr
3515
+ shar5 sh a5 rr
3516
+ shair1 sh ai1 rr
3517
+ shair2 sh ai2 rr
3518
+ shair3 sh ai3 rr
3519
+ shair4 sh ai4 rr
3520
+ shair5 sh ai5 rr
3521
+ shanr1 sh an1 rr
3522
+ shanr2 sh an2 rr
3523
+ shanr3 sh an3 rr
3524
+ shanr4 sh an4 rr
3525
+ shanr5 sh an5 rr
3526
+ shangr1 sh ang1 rr
3527
+ shangr2 sh ang2 rr
3528
+ shangr3 sh ang3 rr
3529
+ shangr4 sh ang4 rr
3530
+ shangr5 sh ang5 rr
3531
+ shaor1 sh ao1 rr
3532
+ shaor2 sh ao2 rr
3533
+ shaor3 sh ao3 rr
3534
+ shaor4 sh ao4 rr
3535
+ shaor5 sh ao5 rr
3536
+ sher1 sh e1 rr
3537
+ sher2 sh e2 rr
3538
+ sher3 sh e3 rr
3539
+ sher4 sh e4 rr
3540
+ sher5 sh e5 rr
3541
+ sheir1 sh ei1 rr
3542
+ sheir2 sh ei2 rr
3543
+ sheir3 sh ei3 rr
3544
+ sheir4 sh ei4 rr
3545
+ sheir5 sh ei5 rr
3546
+ shenr1 sh en1 rr
3547
+ shenr2 sh en2 rr
3548
+ shenr3 sh en3 rr
3549
+ shenr4 sh en4 rr
3550
+ shenr5 sh en5 rr
3551
+ shengr1 sh eng1 rr
3552
+ shengr2 sh eng2 rr
3553
+ shengr3 sh eng3 rr
3554
+ shengr4 sh eng4 rr
3555
+ shengr5 sh eng5 rr
3556
+ shir1 sh iii1 rr
3557
+ shir2 sh iii2 rr
3558
+ shir3 sh iii3 rr
3559
+ shir4 sh iii4 rr
3560
+ shir5 sh iii5 rr
3561
+ shour1 sh ou1 rr
3562
+ shour2 sh ou2 rr
3563
+ shour3 sh ou3 rr
3564
+ shour4 sh ou4 rr
3565
+ shour5 sh ou5 rr
3566
+ shur1 sh u1 rr
3567
+ shur2 sh u2 rr
3568
+ shur3 sh u3 rr
3569
+ shur4 sh u4 rr
3570
+ shur5 sh u5 rr
3571
+ shuar1 sh ua1 rr
3572
+ shuar2 sh ua2 rr
3573
+ shuar3 sh ua3 rr
3574
+ shuar4 sh ua4 rr
3575
+ shuar5 sh ua5 rr
3576
+ shuair1 sh uai1 rr
3577
+ shuair2 sh uai2 rr
3578
+ shuair3 sh uai3 rr
3579
+ shuair4 sh uai4 rr
3580
+ shuair5 sh uai5 rr
3581
+ shuanr1 sh uan1 rr
3582
+ shuanr2 sh uan2 rr
3583
+ shuanr3 sh uan3 rr
3584
+ shuanr4 sh uan4 rr
3585
+ shuanr5 sh uan5 rr
3586
+ shuangr1 sh uang1 rr
3587
+ shuangr2 sh uang2 rr
3588
+ shuangr3 sh uang3 rr
3589
+ shuangr4 sh uang4 rr
3590
+ shuangr5 sh uang5 rr
3591
+ shuir1 sh uei1 rr
3592
+ shuir2 sh uei2 rr
3593
+ shuir3 sh uei3 rr
3594
+ shuir4 sh uei4 rr
3595
+ shuir5 sh uei5 rr
3596
+ shunr1 sh uen1 rr
3597
+ shunr2 sh uen2 rr
3598
+ shunr3 sh uen3 rr
3599
+ shunr4 sh uen4 rr
3600
+ shunr5 sh uen5 rr
3601
+ shuor1 sh uo1 rr
3602
+ shuor2 sh uo2 rr
3603
+ shuor3 sh uo3 rr
3604
+ shuor4 sh uo4 rr
3605
+ shuor5 sh uo5 rr
3606
+ sir1 s ii1 rr
3607
+ sir2 s ii2 rr
3608
+ sir3 s ii3 rr
3609
+ sir4 s ii4 rr
3610
+ sir5 s ii5 rr
3611
+ songr1 s ong1 rr
3612
+ songr2 s ong2 rr
3613
+ songr3 s ong3 rr
3614
+ songr4 s ong4 rr
3615
+ songr5 s ong5 rr
3616
+ sour1 s ou1 rr
3617
+ sour2 s ou2 rr
3618
+ sour3 s ou3 rr
3619
+ sour4 s ou4 rr
3620
+ sour5 s ou5 rr
3621
+ sur1 s u1 rr
3622
+ sur2 s u2 rr
3623
+ sur3 s u3 rr
3624
+ sur4 s u4 rr
3625
+ sur5 s u5 rr
3626
+ suanr1 s uan1 rr
3627
+ suanr2 s uan2 rr
3628
+ suanr3 s uan3 rr
3629
+ suanr4 s uan4 rr
3630
+ suanr5 s uan5 rr
3631
+ suir1 s uei1 rr
3632
+ suir2 s uei2 rr
3633
+ suir3 s uei3 rr
3634
+ suir4 s uei4 rr
3635
+ suir5 s uei5 rr
3636
+ sunr1 s uen1 rr
3637
+ sunr2 s uen2 rr
3638
+ sunr3 s uen3 rr
3639
+ sunr4 s uen4 rr
3640
+ sunr5 s uen5 rr
3641
+ suor1 s uo1 rr
3642
+ suor2 s uo2 rr
3643
+ suor3 s uo3 rr
3644
+ suor4 s uo4 rr
3645
+ suor5 s uo5 rr
3646
+ tar1 t a1 rr
3647
+ tar2 t a2 rr
3648
+ tar3 t a3 rr
3649
+ tar4 t a4 rr
3650
+ tar5 t a5 rr
3651
+ tair1 t ai1 rr
3652
+ tair2 t ai2 rr
3653
+ tair3 t ai3 rr
3654
+ tair4 t ai4 rr
3655
+ tair5 t ai5 rr
3656
+ tanr1 t an1 rr
3657
+ tanr2 t an2 rr
3658
+ tanr3 t an3 rr
3659
+ tanr4 t an4 rr
3660
+ tanr5 t an5 rr
3661
+ tangr1 t ang1 rr
3662
+ tangr2 t ang2 rr
3663
+ tangr3 t ang3 rr
3664
+ tangr4 t ang4 rr
3665
+ tangr5 t ang5 rr
3666
+ taor1 t ao1 rr
3667
+ taor2 t ao2 rr
3668
+ taor3 t ao3 rr
3669
+ taor4 t ao4 rr
3670
+ taor5 t ao5 rr
3671
+ ter1 t e1 rr
3672
+ ter2 t e2 rr
3673
+ ter3 t e3 rr
3674
+ ter4 t e4 rr
3675
+ ter5 t e5 rr
3676
+ teir1 t ei1 rr
3677
+ teir2 t ei2 rr
3678
+ teir3 t ei3 rr
3679
+ teir4 t ei4 rr
3680
+ teir5 t ei5 rr
3681
+ tengr1 t eng1 rr
3682
+ tengr2 t eng2 rr
3683
+ tengr3 t eng3 rr
3684
+ tengr4 t eng4 rr
3685
+ tengr5 t eng5 rr
3686
+ tir1 t i1 rr
3687
+ tir2 t i2 rr
3688
+ tir3 t i3 rr
3689
+ tir4 t i4 rr
3690
+ tir5 t i5 rr
3691
+ tianr1 t ian1 rr
3692
+ tianr2 t ian2 rr
3693
+ tianr3 t ian3 rr
3694
+ tianr4 t ian4 rr
3695
+ tianr5 t ian5 rr
3696
+ tiaor1 t iao1 rr
3697
+ tiaor2 t iao2 rr
3698
+ tiaor3 t iao3 rr
3699
+ tiaor4 t iao4 rr
3700
+ tiaor5 t iao5 rr
3701
+ tier1 t ie1 rr
3702
+ tier2 t ie2 rr
3703
+ tier3 t ie3 rr
3704
+ tier4 t ie4 rr
3705
+ tier5 t ie5 rr
3706
+ tingr1 t ing1 rr
3707
+ tingr2 t ing2 rr
3708
+ tingr3 t ing3 rr
3709
+ tingr4 t ing4 rr
3710
+ tingr5 t ing5 rr
3711
+ tongr1 t ong1 rr
3712
+ tongr2 t ong2 rr
3713
+ tongr3 t ong3 rr
3714
+ tongr4 t ong4 rr
3715
+ tongr5 t ong5 rr
3716
+ tour1 t ou1 rr
3717
+ tour2 t ou2 rr
3718
+ tour3 t ou3 rr
3719
+ tour4 t ou4 rr
3720
+ tour5 t ou5 rr
3721
+ tur1 t u1 rr
3722
+ tur2 t u2 rr
3723
+ tur3 t u3 rr
3724
+ tur4 t u4 rr
3725
+ tur5 t u5 rr
3726
+ tuanr1 t uan1 rr
3727
+ tuanr2 t uan2 rr
3728
+ tuanr3 t uan3 rr
3729
+ tuanr4 t uan4 rr
3730
+ tuanr5 t uan5 rr
3731
+ tuir1 t uei1 rr
3732
+ tuir2 t uei2 rr
3733
+ tuir3 t uei3 rr
3734
+ tuir4 t uei4 rr
3735
+ tuir5 t uei5 rr
3736
+ tunr1 t uen1 rr
3737
+ tunr2 t uen2 rr
3738
+ tunr3 t uen3 rr
3739
+ tunr4 t uen4 rr
3740
+ tunr5 t uen5 rr
3741
+ tuor1 t uo1 rr
3742
+ tuor2 t uo2 rr
3743
+ tuor3 t uo3 rr
3744
+ tuor4 t uo4 rr
3745
+ tuor5 t uo5 rr
3746
+ war1 w ua1 rr
3747
+ war2 w ua2 rr
3748
+ war3 w ua3 rr
3749
+ war4 w ua4 rr
3750
+ war5 w ua5 rr
3751
+ wair1 w uai1 rr
3752
+ wair2 w uai2 rr
3753
+ wair3 w uai3 rr
3754
+ wair4 w uai4 rr
3755
+ wair5 w uai5 rr
3756
+ wanr1 w uan1 rr
3757
+ wanr2 w uan2 rr
3758
+ wanr3 w uan3 rr
3759
+ wanr4 w uan4 rr
3760
+ wanr5 w uan5 rr
3761
+ wangr1 w uang1 rr
3762
+ wangr2 w uang2 rr
3763
+ wangr3 w uang3 rr
3764
+ wangr4 w uang4 rr
3765
+ wangr5 w uang5 rr
3766
+ weir1 w uei1 rr
3767
+ weir2 w uei2 rr
3768
+ weir3 w uei3 rr
3769
+ weir4 w uei4 rr
3770
+ weir5 w uei5 rr
3771
+ wenr1 w uen1 rr
3772
+ wenr2 w uen2 rr
3773
+ wenr3 w uen3 rr
3774
+ wenr4 w uen4 rr
3775
+ wenr5 w uen5 rr
3776
+ wengr1 w uen1 rr
3777
+ wengr2 w uen2 rr
3778
+ wengr3 w uen3 rr
3779
+ wengr4 w uen4 rr
3780
+ wengr5 w uen5 rr
3781
+ wor1 w uo1 rr
3782
+ wor2 w uo2 rr
3783
+ wor3 w uo3 rr
3784
+ wor4 w uo4 rr
3785
+ wor5 w uo5 rr
3786
+ wur1 w u1 rr
3787
+ wur2 w u2 rr
3788
+ wur3 w u3 rr
3789
+ wur4 w u4 rr
3790
+ wur5 w u5 rr
3791
+ xir1 x i1 rr
3792
+ xir2 x i2 rr
3793
+ xir3 x i3 rr
3794
+ xir4 x i4 rr
3795
+ xir5 x i5 rr
3796
+ xiar1 x ia1 rr
3797
+ xiar2 x ia2 rr
3798
+ xiar3 x ia3 rr
3799
+ xiar4 x ia4 rr
3800
+ xiar5 x ia5 rr
3801
+ xianr1 x ian1 rr
3802
+ xianr2 x ian2 rr
3803
+ xianr3 x ian3 rr
3804
+ xianr4 x ian4 rr
3805
+ xianr5 x ian5 rr
3806
+ xiangr1 x iang1 rr
3807
+ xiangr2 x iang2 rr
3808
+ xiangr3 x iang3 rr
3809
+ xiangr4 x iang4 rr
3810
+ xiangr5 x iang5 rr
3811
+ xiaor1 x iao1 rr
3812
+ xiaor2 x iao2 rr
3813
+ xiaor3 x iao3 rr
3814
+ xiaor4 x iao4 rr
3815
+ xiaor5 x iao5 rr
3816
+ xier1 x ie1 rr
3817
+ xier2 x ie2 rr
3818
+ xier3 x ie3 rr
3819
+ xier4 x ie4 rr
3820
+ xier5 x ie5 rr
3821
+ xinr1 x in1 rr
3822
+ xinr2 x in2 rr
3823
+ xinr3 x in3 rr
3824
+ xinr4 x in4 rr
3825
+ xinr5 x in5 rr
3826
+ xingr1 x ing1 rr
3827
+ xingr2 x ing2 rr
3828
+ xingr3 x ing3 rr
3829
+ xingr4 x ing4 rr
3830
+ xingr5 x ing5 rr
3831
+ xiongr1 x iong1 rr
3832
+ xiongr2 x iong2 rr
3833
+ xiongr3 x iong3 rr
3834
+ xiongr4 x iong4 rr
3835
+ xiongr5 x iong5 rr
3836
+ xiur1 x iou1 rr
3837
+ xiur2 x iou2 rr
3838
+ xiur3 x iou3 rr
3839
+ xiur4 x iou4 rr
3840
+ xiur5 x iou5 rr
3841
+ xur1 x v1 rr
3842
+ xur2 x v2 rr
3843
+ xur3 x v3 rr
3844
+ xur4 x v4 rr
3845
+ xur5 x v5 rr
3846
+ xuanr1 x van1 rr
3847
+ xuanr2 x van2 rr
3848
+ xuanr3 x van3 rr
3849
+ xuanr4 x van4 rr
3850
+ xuanr5 x van5 rr
3851
+ xuer1 x ve1 rr
3852
+ xuer2 x ve2 rr
3853
+ xuer3 x ve3 rr
3854
+ xuer4 x ve4 rr
3855
+ xuer5 x ve5 rr
3856
+ xunr1 x vn1 rr
3857
+ xunr2 x vn2 rr
3858
+ xunr3 x vn3 rr
3859
+ xunr4 x vn4 rr
3860
+ xunr5 x vn5 rr
3861
+ yar1 y ia1 rr
3862
+ yar2 y ia2 rr
3863
+ yar3 y ia3 rr
3864
+ yar4 y ia4 rr
3865
+ yar5 y ia5 rr
3866
+ yanr1 y ian1 rr
3867
+ yanr2 y ian2 rr
3868
+ yanr3 y ian3 rr
3869
+ yanr4 y ian4 rr
3870
+ yanr5 y ian5 rr
3871
+ yangr1 y iang1 rr
3872
+ yangr2 y iang2 rr
3873
+ yangr3 y iang3 rr
3874
+ yangr4 y iang4 rr
3875
+ yangr5 y iang5 rr
3876
+ yaor1 y iao1 rr
3877
+ yaor2 y iao2 rr
3878
+ yaor3 y iao3 rr
3879
+ yaor4 y iao4 rr
3880
+ yaor5 y iao5 rr
3881
+ yer1 y ie1 rr
3882
+ yer2 y ie2 rr
3883
+ yer3 y ie3 rr
3884
+ yer4 y ie4 rr
3885
+ yer5 y ie5 rr
3886
+ yir1 y i1 rr
3887
+ yir2 y i2 rr
3888
+ yir3 y i3 rr
3889
+ yir4 y i4 rr
3890
+ yir5 y i5 rr
3891
+ yinr1 y in1 rr
3892
+ yinr2 y in2 rr
3893
+ yinr3 y in3 rr
3894
+ yinr4 y in4 rr
3895
+ yinr5 y in5 rr
3896
+ yingr1 y ing1 rr
3897
+ yingr2 y ing2 rr
3898
+ yingr3 y ing3 rr
3899
+ yingr4 y ing4 rr
3900
+ yingr5 y ing5 rr
3901
+ yor1 y iou1 rr
3902
+ yor2 y iou2 rr
3903
+ yor3 y iou3 rr
3904
+ yor4 y iou4 rr
3905
+ yor5 y iou5 rr
3906
+ yongr1 y iong1 rr
3907
+ yongr2 y iong2 rr
3908
+ yongr3 y iong3 rr
3909
+ yongr4 y iong4 rr
3910
+ yongr5 y iong5 rr
3911
+ your1 y iou1 rr
3912
+ your2 y iou2 rr
3913
+ your3 y iou3 rr
3914
+ your4 y iou4 rr
3915
+ your5 y iou5 rr
3916
+ yur1 y v1 rr
3917
+ yur2 y v2 rr
3918
+ yur3 y v3 rr
3919
+ yur4 y v4 rr
3920
+ yur5 y v5 rr
3921
+ yuanr1 y van1 rr
3922
+ yuanr2 y van2 rr
3923
+ yuanr3 y van3 rr
3924
+ yuanr4 y van4 rr
3925
+ yuanr5 y van5 rr
3926
+ yuer1 y ve1 rr
3927
+ yuer2 y ve2 rr
3928
+ yuer3 y ve3 rr
3929
+ yuer4 y ve4 rr
3930
+ yuer5 y ve5 rr
3931
+ yunr1 y vn1 rr
3932
+ yunr2 y vn2 rr
3933
+ yunr3 y vn3 rr
3934
+ yunr4 y vn4 rr
3935
+ yunr5 y vn5 rr
3936
+ zar1 z a1 rr
3937
+ zar2 z a2 rr
3938
+ zar3 z a3 rr
3939
+ zar4 z a4 rr
3940
+ zar5 z a5 rr
3941
+ zair1 z ai1 rr
3942
+ zair2 z ai2 rr
3943
+ zair3 z ai3 rr
3944
+ zair4 z ai4 rr
3945
+ zair5 z ai5 rr
3946
+ zanr1 z an1 rr
3947
+ zanr2 z an2 rr
3948
+ zanr3 z an3 rr
3949
+ zanr4 z an4 rr
3950
+ zanr5 z an5 rr
3951
+ zangr1 z ang1 rr
3952
+ zangr2 z ang2 rr
3953
+ zangr3 z ang3 rr
3954
+ zangr4 z ang4 rr
3955
+ zangr5 z ang5 rr
3956
+ zaor1 z ao1 rr
3957
+ zaor2 z ao2 rr
3958
+ zaor3 z ao3 rr
3959
+ zaor4 z ao4 rr
3960
+ zaor5 z ao5 rr
3961
+ zer1 z e1 rr
3962
+ zer2 z e2 rr
3963
+ zer3 z e3 rr
3964
+ zer4 z e4 rr
3965
+ zer5 z e5 rr
3966
+ zeir1 z ei1 rr
3967
+ zeir2 z ei2 rr
3968
+ zeir3 z ei3 rr
3969
+ zeir4 z ei4 rr
3970
+ zeir5 z ei5 rr
3971
+ zenr1 z en1 rr
3972
+ zenr2 z en2 rr
3973
+ zenr3 z en3 rr
3974
+ zenr4 z en4 rr
3975
+ zenr5 z en5 rr
3976
+ zengr1 z eng1 rr
3977
+ zengr2 z eng2 rr
3978
+ zengr3 z eng3 rr
3979
+ zengr4 z eng4 rr
3980
+ zengr5 z eng5 rr
3981
+ zhar1 zh a1 rr
3982
+ zhar2 zh a2 rr
3983
+ zhar3 zh a3 rr
3984
+ zhar4 zh a4 rr
3985
+ zhar5 zh a5 rr
3986
+ zhair1 zh ai1 rr
3987
+ zhair2 zh ai2 rr
3988
+ zhair3 zh ai3 rr
3989
+ zhair4 zh ai4 rr
3990
+ zhair5 zh ai5 rr
3991
+ zhanr1 zh an1 rr
3992
+ zhanr2 zh an2 rr
3993
+ zhanr3 zh an3 rr
3994
+ zhanr4 zh an4 rr
3995
+ zhanr5 zh an5 rr
3996
+ zhangr1 zh ang1 rr
3997
+ zhangr2 zh ang2 rr
3998
+ zhangr3 zh ang3 rr
3999
+ zhangr4 zh ang4 rr
4000
+ zhangr5 zh ang5 rr
4001
+ zhaor1 zh ao1 rr
4002
+ zhaor2 zh ao2 rr
4003
+ zhaor3 zh ao3 rr
4004
+ zhaor4 zh ao4 rr
4005
+ zhaor5 zh ao5 rr
4006
+ zher1 zh e1 rr
4007
+ zher2 zh e2 rr
4008
+ zher3 zh e3 rr
4009
+ zher4 zh e4 rr
4010
+ zher5 zh e5 rr
4011
+ zheir1 zh ei1 rr
4012
+ zheir2 zh ei2 rr
4013
+ zheir3 zh ei3 rr
4014
+ zheir4 zh ei4 rr
4015
+ zheir5 zh ei5 rr
4016
+ zhenr1 zh en1 rr
4017
+ zhenr2 zh en2 rr
4018
+ zhenr3 zh en3 rr
4019
+ zhenr4 zh en4 rr
4020
+ zhenr5 zh en5 rr
4021
+ zhengr1 zh eng1 rr
4022
+ zhengr2 zh eng2 rr
4023
+ zhengr3 zh eng3 rr
4024
+ zhengr4 zh eng4 rr
4025
+ zhengr5 zh eng5 rr
4026
+ zhir1 zh iii1 rr
4027
+ zhir2 zh iii2 rr
4028
+ zhir3 zh iii3 rr
4029
+ zhir4 zh iii4 rr
4030
+ zhir5 zh iii5 rr
4031
+ zhongr1 zh ong1 rr
4032
+ zhongr2 zh ong2 rr
4033
+ zhongr3 zh ong3 rr
4034
+ zhongr4 zh ong4 rr
4035
+ zhongr5 zh ong5 rr
4036
+ zhour1 zh ou1 rr
4037
+ zhour2 zh ou2 rr
4038
+ zhour3 zh ou3 rr
4039
+ zhour4 zh ou4 rr
4040
+ zhour5 zh ou5 rr
4041
+ zhur1 zh u1 rr
4042
+ zhur2 zh u2 rr
4043
+ zhur3 zh u3 rr
4044
+ zhur4 zh u4 rr
4045
+ zhur5 zh u5 rr
4046
+ zhuar1 zh ua1 rr
4047
+ zhuar2 zh ua2 rr
4048
+ zhuar3 zh ua3 rr
4049
+ zhuar4 zh ua4 rr
4050
+ zhuar5 zh ua5 rr
4051
+ zhuair1 zh uai1 rr
4052
+ zhuair2 zh uai2 rr
4053
+ zhuair3 zh uai3 rr
4054
+ zhuair4 zh uai4 rr
4055
+ zhuair5 zh uai5 rr
4056
+ zhuanr1 zh uan1 rr
4057
+ zhuanr2 zh uan2 rr
4058
+ zhuanr3 zh uan3 rr
4059
+ zhuanr4 zh uan4 rr
4060
+ zhuanr5 zh uan5 rr
4061
+ zhuangr1 zh uang1 rr
4062
+ zhuangr2 zh uang2 rr
4063
+ zhuangr3 zh uang3 rr
4064
+ zhuangr4 zh uang4 rr
4065
+ zhuangr5 zh uang5 rr
4066
+ zhuir1 zh uei1 rr
4067
+ zhuir2 zh uei2 rr
4068
+ zhuir3 zh uei3 rr
4069
+ zhuir4 zh uei4 rr
4070
+ zhuir5 zh uei5 rr
4071
+ zhunr1 zh uen1 rr
4072
+ zhunr2 zh uen2 rr
4073
+ zhunr3 zh uen3 rr
4074
+ zhunr4 zh uen4 rr
4075
+ zhunr5 zh uen5 rr
4076
+ zhuor1 zh uo1 rr
4077
+ zhuor2 zh uo2 rr
4078
+ zhuor3 zh uo3 rr
4079
+ zhuor4 zh uo4 rr
4080
+ zhuor5 zh uo5 rr
4081
+ zir1 z ii1 rr
4082
+ zir2 z ii2 rr
4083
+ zir3 z ii3 rr
4084
+ zir4 z ii4 rr
4085
+ zir5 z ii5 rr
4086
+ zongr1 z ong1 rr
4087
+ zongr2 z ong2 rr
4088
+ zongr3 z ong3 rr
4089
+ zongr4 z ong4 rr
4090
+ zongr5 z ong5 rr
4091
+ zour1 z ou1 rr
4092
+ zour2 z ou2 rr
4093
+ zour3 z ou3 rr
4094
+ zour4 z ou4 rr
4095
+ zour5 z ou5 rr
4096
+ zur1 z u1 rr
4097
+ zur2 z u2 rr
4098
+ zur3 z u3 rr
4099
+ zur4 z u4 rr
4100
+ zur5 z u5 rr
4101
+ zuanr1 z uan1 rr
4102
+ zuanr2 z uan2 rr
4103
+ zuanr3 z uan3 rr
4104
+ zuanr4 z uan4 rr
4105
+ zuanr5 z uan5 rr
4106
+ zuir1 z uei1 rr
4107
+ zuir2 z uei2 rr
4108
+ zuir3 z uei3 rr
4109
+ zuir4 z uei4 rr
4110
+ zuir5 z uei5 rr
4111
+ zunr1 z uen1 rr
4112
+ zunr2 z uen2 rr
4113
+ zunr3 z uen3 rr
4114
+ zunr4 z uen4 rr
4115
+ zunr5 z uen5 rr
4116
+ zuor1 z uo1 rr
4117
+ zuor2 z uo2 rr
4118
+ zuor3 z uo3 rr
4119
+ zuor4 z uo4 rr
4120
+ zuor5 z uo5 rr
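
Each lexicon entry above maps an erhua ("-r" suffixed) pinyin syllable to its phone sequence: an initial, a tone-numbered final, and the retroflex marker "rr". A minimal parsing sketch (the helper name is illustrative, not part of this commit):

    def parse_lexicon_line(line):
        # e.g. "zhuangr4 zh uang4 rr" -> ("zhuangr4", ["zh", "uang4", "rr"])
        fields = line.split()
        return fields[0], fields[1:]

    syllable, phones = parse_lexicon_line("zhuangr4 zh uang4 rr")
    # syllable == "zhuangr4"; phones == ["zh", "uang4", "rr"]
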
text/numbers.py ADDED
@@ -0,0 +1,77 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ import inflect
9
+ import re
10
+
11
+ _inflect = inflect.engine()
12
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
13
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
14
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
15
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
16
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
17
+ _number_re = re.compile(r"[0-9]+")
18
+
19
+
20
+ def _remove_commas(m):
21
+ return m.group(1).replace(",", "")
22
+
23
+
24
+ def _expand_decimal_point(m):
25
+ return m.group(1).replace(".", " point ")
26
+
27
+
28
+ def _expand_dollars(m):
29
+ match = m.group(1)
30
+ parts = match.split(".")
31
+ if len(parts) > 2:
32
+ return match + " dollars" # Unexpected format
33
+ dollars = int(parts[0]) if parts[0] else 0
34
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
35
+ if dollars and cents:
36
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
37
+ cent_unit = "cent" if cents == 1 else "cents"
38
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
39
+ elif dollars:
40
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
41
+ return "%s %s" % (dollars, dollar_unit)
42
+ elif cents:
43
+ cent_unit = "cent" if cents == 1 else "cents"
44
+ return "%s %s" % (cents, cent_unit)
45
+ else:
46
+ return "zero dollars"
47
+
48
+
49
+ def _expand_ordinal(m):
50
+ return _inflect.number_to_words(m.group(0))
51
+
52
+
53
+ def _expand_number(m):
54
+ num = int(m.group(0))
55
+ if num > 1000 and num < 3000:
56
+ if num == 2000:
57
+ return "two thousand"
58
+ elif num > 2000 and num < 2010:
59
+ return "two thousand " + _inflect.number_to_words(num % 100)
60
+ elif num % 100 == 0:
61
+ return _inflect.number_to_words(num // 100) + " hundred"
62
+ else:
63
+ return _inflect.number_to_words(
64
+ num, andword="", zero="oh", group=2
65
+ ).replace(", ", " ")
66
+ else:
67
+ return _inflect.number_to_words(num, andword="")
68
+
69
+
70
+ def normalize_numbers(text):
71
+ text = re.sub(_comma_number_re, _remove_commas, text)
72
+ text = re.sub(_pounds_re, r"\1 pounds", text)
73
+ text = re.sub(_dollars_re, _expand_dollars, text)
74
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
75
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
76
+ text = re.sub(_number_re, _expand_number, text)
77
+ return text
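
A minimal usage sketch of normalize_numbers; the exact output wording assumes inflect's defaults:

    from text.numbers import normalize_numbers

    normalize_numbers("I paid $3.50 for the 2nd ticket")
    # -> "I paid three dollars, fifty cents for the second ticket"
    normalize_numbers("1,000 copies")
    # -> "one thousand copies"
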
text/pinyin.py ADDED
@@ -0,0 +1,218 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ initials = [
7
+ "b",
8
+ "c",
9
+ "ch",
10
+ "d",
11
+ "f",
12
+ "g",
13
+ "h",
14
+ "j",
15
+ "k",
16
+ "l",
17
+ "m",
18
+ "n",
19
+ "p",
20
+ "q",
21
+ "r",
22
+ "s",
23
+ "sh",
24
+ "t",
25
+ "w",
26
+ "x",
27
+ "y",
28
+ "z",
29
+ "zh",
30
+ ]
31
+ finals = [
32
+ "a1",
33
+ "a2",
34
+ "a3",
35
+ "a4",
36
+ "a5",
37
+ "ai1",
38
+ "ai2",
39
+ "ai3",
40
+ "ai4",
41
+ "ai5",
42
+ "an1",
43
+ "an2",
44
+ "an3",
45
+ "an4",
46
+ "an5",
47
+ "ang1",
48
+ "ang2",
49
+ "ang3",
50
+ "ang4",
51
+ "ang5",
52
+ "ao1",
53
+ "ao2",
54
+ "ao3",
55
+ "ao4",
56
+ "ao5",
57
+ "e1",
58
+ "e2",
59
+ "e3",
60
+ "e4",
61
+ "e5",
62
+ "ei1",
63
+ "ei2",
64
+ "ei3",
65
+ "ei4",
66
+ "ei5",
67
+ "en1",
68
+ "en2",
69
+ "en3",
70
+ "en4",
71
+ "en5",
72
+ "eng1",
73
+ "eng2",
74
+ "eng3",
75
+ "eng4",
76
+ "eng5",
77
+ "er1",
78
+ "er2",
79
+ "er3",
80
+ "er4",
81
+ "er5",
82
+ "i1",
83
+ "i2",
84
+ "i3",
85
+ "i4",
86
+ "i5",
87
+ "ia1",
88
+ "ia2",
89
+ "ia3",
90
+ "ia4",
91
+ "ia5",
92
+ "ian1",
93
+ "ian2",
94
+ "ian3",
95
+ "ian4",
96
+ "ian5",
97
+ "iang1",
98
+ "iang2",
99
+ "iang3",
100
+ "iang4",
101
+ "iang5",
102
+ "iao1",
103
+ "iao2",
104
+ "iao3",
105
+ "iao4",
106
+ "iao5",
107
+ "ie1",
108
+ "ie2",
109
+ "ie3",
110
+ "ie4",
111
+ "ie5",
112
+ "ii1",
113
+ "ii2",
114
+ "ii3",
115
+ "ii4",
116
+ "ii5",
117
+ "iii1",
118
+ "iii2",
119
+ "iii3",
120
+ "iii4",
121
+ "iii5",
122
+ "in1",
123
+ "in2",
124
+ "in3",
125
+ "in4",
126
+ "in5",
127
+ "ing1",
128
+ "ing2",
129
+ "ing3",
130
+ "ing4",
131
+ "ing5",
132
+ "iong1",
133
+ "iong2",
134
+ "iong3",
135
+ "iong4",
136
+ "iong5",
137
+ "iou1",
138
+ "iou2",
139
+ "iou3",
140
+ "iou4",
141
+ "iou5",
142
+ "o1",
143
+ "o2",
144
+ "o3",
145
+ "o4",
146
+ "o5",
147
+ "ong1",
148
+ "ong2",
149
+ "ong3",
150
+ "ong4",
151
+ "ong5",
152
+ "ou1",
153
+ "ou2",
154
+ "ou3",
155
+ "ou4",
156
+ "ou5",
157
+ "u1",
158
+ "u2",
159
+ "u3",
160
+ "u4",
161
+ "u5",
162
+ "ua1",
163
+ "ua2",
164
+ "ua3",
165
+ "ua4",
166
+ "ua5",
167
+ "uai1",
168
+ "uai2",
169
+ "uai3",
170
+ "uai4",
171
+ "uai5",
172
+ "uan1",
173
+ "uan2",
174
+ "uan3",
175
+ "uan4",
176
+ "uan5",
177
+ "uang1",
178
+ "uang2",
179
+ "uang3",
180
+ "uang4",
181
+ "uang5",
182
+ "uei1",
183
+ "uei2",
184
+ "uei3",
185
+ "uei4",
186
+ "uei5",
187
+ "uen1",
188
+ "uen2",
189
+ "uen3",
190
+ "uen4",
191
+ "uen5",
192
+ "uo1",
193
+ "uo2",
194
+ "uo3",
195
+ "uo4",
196
+ "uo5",
197
+ "v1",
198
+ "v2",
199
+ "v3",
200
+ "v4",
201
+ "v5",
202
+ "van1",
203
+ "van2",
204
+ "van3",
205
+ "van4",
206
+ "van5",
207
+ "ve1",
208
+ "ve2",
209
+ "ve3",
210
+ "ve4",
211
+ "ve5",
212
+ "vn1",
213
+ "vn2",
214
+ "vn3",
215
+ "vn4",
216
+ "vn5",
217
+ ]
218
+ valid_symbols = initials + finals + ["rr"]
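
valid_symbols is the phone inventory that the erhua lexicon above draws from; a quick consistency check (the phone list below is just one lexicon entry):

    from text.pinyin import valid_symbols

    phones = ["zh", "uang4", "rr"]    # e.g. from "zhuangr4 zh uang4 rr"
    assert all(p in valid_symbols for p in phones)
    # 23 initials + 185 tonal finals + "rr" = 209 symbols in this inventory
    assert len(valid_symbols) == 209
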
text/symbol_table.py ADDED
@@ -0,0 +1,284 @@
1
+ # Copyright 2020 Mobvoi Inc. (authors: Fangjun Kuang)
2
+ #
3
+ # See ../../../LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from dataclasses import dataclass
18
+ from dataclasses import field
19
+ from typing import Dict
20
+ from typing import Generic
21
+ from typing import List
22
+ from typing import Optional
23
+ from typing import TypeVar
24
+ from typing import Union
25
+
26
+ Symbol = TypeVar('Symbol')
27
+
28
+ # SymbolTable is copied from
29
+ # https://github.com/k2-fsa/k2/blob/master/k2/python/k2/symbol_table.py
30
+
31
+ '''
32
+ SymbolTable: map symbol to id
33
+ '''
34
+ @dataclass(repr=False)
35
+ class SymbolTable(Generic[Symbol]):
36
+ '''SymbolTable that maps symbol IDs, found on the FSA arcs to
37
+ actual objects. These objects can be arbitrary Python objects
38
+ that can serve as keys in a dictionary (i.e. they need to be
39
+ hashable and immutable).
40
+
41
+ The SymbolTable can only be written to/read from disk if the
42
+ symbols are strings.
43
+ '''
44
+ _id2sym: Dict[int, Symbol] = field(default_factory=dict)
45
+ '''Map an integer to a symbol.
46
+ '''
47
+
48
+ _sym2id: Dict[Symbol, int] = field(default_factory=dict)
49
+ '''Map a symbol to an integer.
50
+ '''
51
+
52
+ _next_available_id: int = 1
53
+ '''A helper internal field that helps adding new symbols
54
+ to the table efficiently.
55
+ '''
56
+
57
+ eps: Symbol = '<eps>'
58
+ '''Null symbol, always mapped to index 0.
59
+ '''
60
+
61
+ def __post_init__(self):
62
+ assert all(self._sym2id[sym] == idx for idx, sym in self._id2sym.items())
63
+ assert all(self._id2sym[idx] == sym for sym, idx in self._sym2id.items())
64
+ assert 0 not in self._id2sym or self._id2sym[0] == self.eps
65
+
66
+ self._next_available_id = max(self._id2sym, default=0) + 1
67
+ self._id2sym.setdefault(0, self.eps)
68
+ self._sym2id.setdefault(self.eps, 0)
69
+
70
+
71
+ @staticmethod
72
+ def from_str(s: str) -> 'SymbolTable':
73
+ '''Build a symbol table from a string.
74
+
75
+ The string consists of lines. Every line has two fields separated
76
+ by space(s), tab(s) or both. The first field is the symbol and the
77
+ second the integer id of the symbol.
78
+
79
+ Args:
80
+ s:
81
+ The input string with the format described above.
82
+ Returns:
83
+ An instance of :class:`SymbolTable`.
84
+ '''
85
+ id2sym: Dict[int, str] = dict()
86
+ sym2id: Dict[str, int] = dict()
87
+
88
+ for line in s.split('\n'):
89
+ fields = line.split()
90
+ if len(fields) == 0:
91
+ continue # skip empty lines
92
+ assert len(fields) == 2, \
93
+ f'Expect a line with 2 fields. Given: {len(fields)}'
94
+ sym, idx = fields[0], int(fields[1])
95
+ assert sym not in sym2id, f'Duplicated symbol {sym}'
96
+ assert idx not in id2sym, f'Duplicated id {idx}'
97
+ id2sym[idx] = sym
98
+ sym2id[sym] = idx
99
+
100
+ eps = id2sym.get(0, '<eps>')
101
+
102
+ return SymbolTable(_id2sym=id2sym, _sym2id=sym2id, eps=eps)
103
+
104
+ @staticmethod
105
+ def from_file(filename: str) -> 'SymbolTable':
106
+ '''Build a symbol table from file.
107
+
108
+ Every line in the symbol table file has two fields separated by
109
+ space(s), tab(s) or both. The following is an example file:
110
+
111
+ .. code-block::
112
+
113
+ <eps> 0
114
+ a 1
115
+ b 2
116
+ c 3
117
+
118
+ Args:
119
+ filename:
120
+ Name of the symbol table file. Its format is documented above.
121
+
122
+ Returns:
123
+ An instance of :class:`SymbolTable`.
124
+
125
+ '''
126
+ with open(filename, 'r', encoding='utf-8') as f:
127
+ return SymbolTable.from_str(f.read().strip())
128
+
129
+ def to_str(self) -> str:
130
+ '''
131
+ Returns:
132
+ Return a string representation of this object. You can pass
133
+ it to the method ``from_str`` to recreate an identical object.
134
+ '''
135
+ s = ''
136
+ for idx, symbol in sorted(self._id2sym.items()):
137
+ s += f'{symbol} {idx}\n'
138
+ return s
139
+
140
+ def to_file(self, filename: str):
141
+ '''Serialize the SymbolTable to a file.
142
+
143
+ Every line in the symbol table file has two fields separated by
144
+ space(s), tab(s) or both. The following is an example file:
145
+
146
+ .. code-block::
147
+
148
+ <eps> 0
149
+ a 1
150
+ b 2
151
+ c 3
152
+
153
+ Args:
154
+ filename:
155
+ Name of the symbol table file. Its format is documented above.
156
+ '''
157
+ with open(filename, 'w') as f:
158
+ for idx, symbol in sorted(self._id2sym.items()):
159
+ print(symbol, idx, file=f)
160
+
161
+ def add(self, symbol: Symbol, index: Optional[int] = None) -> int:
162
+ '''Add a new symbol to the SymbolTable.
163
+
164
+ Args:
165
+ symbol:
166
+ The symbol to be added.
167
+ index:
168
+ Optional int id to which the symbol should be assigned.
169
+ If it is not available, a ValueError will be raised.
170
+
171
+ Returns:
172
+ The int id to which the symbol has been assigned.
173
+ '''
174
+ # Already in the table? Return its ID.
175
+ if symbol in self._sym2id:
176
+ return self._sym2id[symbol]
177
+ # Specific ID not provided - use next available.
178
+ if index is None:
179
+ index = self._next_available_id
180
+ # Specific ID provided but not available.
181
+ if index in self._id2sym:
182
+ raise ValueError(f"Cannot assign id '{index}' to '{symbol}' - "
183
+ f"already occupied by {self._id2sym[index]}")
184
+ self._sym2id[symbol] = index
185
+ self._id2sym[index] = symbol
186
+
187
+ # Update next available ID if needed
188
+ if self._next_available_id <= index:
189
+ self._next_available_id = index + 1
190
+
191
+ return index
192
+
193
+ def get(self, k: Union[int, Symbol]) -> Union[Symbol, int]:
194
+ '''Get a symbol for an id or get an id for a symbol
195
+
196
+ Args:
197
+ k:
198
+ If it is an id, it tries to find the symbol corresponding
199
+ to the id; if it is a symbol, it tries to find the id
200
+ corresponding to the symbol.
201
+
202
+ Returns:
203
+ An id or a symbol depending on the given `k`.
204
+ '''
205
+ if isinstance(k, int):
206
+ return self._id2sym[k]
207
+ else:
208
+ return self._sym2id[k]
209
+
210
+ def merge(self, other: 'SymbolTable') -> 'SymbolTable':
211
+ '''Create a union of two SymbolTables.
212
+ Raises an AssertionError if the same IDs are occupied by
213
+ different symbols.
214
+
215
+ Args:
216
+ other:
217
+ A symbol table to merge with ``self``.
218
+
219
+ Returns:
220
+ A new symbol table.
221
+ '''
222
+ self._check_compatible(other)
223
+ return SymbolTable(
224
+ _id2sym={**self._id2sym, **other._id2sym},
225
+ _sym2id={**self._sym2id, **other._sym2id},
226
+ eps=self.eps
227
+ )
228
+
229
+ def _check_compatible(self, other: 'SymbolTable') -> None:
230
+ # Epsilon compatibility
231
+ assert self.eps == other.eps, f'Mismatched epsilon symbol: ' \
232
+ f'{self.eps} != {other.eps}'
233
+ # IDs compatibility
234
+ common_ids = set(self._id2sym).intersection(other._id2sym)
235
+ for idx in common_ids:
236
+ assert self[idx] == other[idx], f'ID conflict for id: {idx}, ' \
237
+ f'self[idx] = "{self[idx]}", ' \
238
+ f'other[idx] = "{other[idx]}"'
239
+ # Symbols compatibility
240
+ common_symbols = set(self._sym2id).intersection(other._sym2id)
241
+ for sym in common_symbols:
242
+ assert self[sym] == other[sym], f'ID conflict for id: {sym}, ' \
243
+ f'self[sym] = "{self[sym]}", ' \
244
+ f'other[sym] = "{other[sym]}"'
245
+
246
+ def __getitem__(self, item: Union[int, Symbol]) -> Union[Symbol, int]:
247
+ return self.get(item)
248
+
249
+ def __contains__(self, item: Union[int, Symbol]) -> bool:
250
+ if isinstance(item, int):
251
+ return item in self._id2sym
252
+ else:
253
+ return item in self._sym2id
254
+
255
+ def __len__(self) -> int:
256
+ return len(self._id2sym)
257
+
258
+ def __eq__(self, other: 'SymbolTable') -> bool:
259
+ if len(self) != len(other):
260
+ return False
261
+
262
+ for s in self.symbols:
263
+ if self[s] != other[s]:
264
+ return False
265
+
266
+ return True
267
+
268
+ @property
269
+ def ids(self) -> List[int]:
270
+ '''Returns a list of integer IDs corresponding to the symbols.
271
+ '''
272
+ ans = list(self._id2sym.keys())
273
+ ans.sort()
274
+ return ans
275
+
276
+ @property
277
+ def symbols(self) -> List[Symbol]:
278
+ '''Returns a list of symbols (e.g., strings) corresponding to
279
+ the integer IDs.
280
+ '''
281
+ ans = list(self._sym2id.keys())
282
+ ans.sort()
283
+ return ans
284
+
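
A minimal usage sketch of SymbolTable; the mapping below is illustrative:

    from text.symbol_table import SymbolTable

    table = SymbolTable.from_str("<eps> 0\na 1\nb 2")
    table.add("c")     # assigns the next free id -> 3
    table["a"]         # -> 1
    table[2]           # -> "b"
    table.to_str()     # "<eps> 0\na 1\nb 2\nc 3\n"
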
text/symbols.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """ This code is modified from https://github.com/keithito/tacotron """
7
+
8
+ """
9
+ Defines the set of symbols used in text input to the model.
10
+
11
+ The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. """
12
+
13
+ from text import cmudict, pinyin
14
+
15
+ _pad = "_"
16
+ _punctuation = "!'(),.:;? "
17
+ _special = "-"
18
+ _letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
19
+ _silences = ["@sp", "@spn", "@sil"]
20
+
21
+ # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
22
+ _arpabet = ["@" + s for s in cmudict.valid_symbols]
23
+ _pinyin = ["@" + s for s in pinyin.valid_symbols]
24
+
25
+ # Export all symbols:
26
+ symbols = (
27
+ [_pad]
28
+ + list(_special)
29
+ + list(_punctuation)
30
+ + list(_letters)
31
+ + _arpabet
32
+ + _silences
33
+ # + _pinyin # for chinese
34
+ )
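
Consumers of this module typically build a symbol-to-id lookup from the exported list; a minimal sketch (_symbol_to_id is an illustrative name, not defined in this file):

    from text.symbols import symbols

    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
    _symbol_to_id["_"]    # 0: the padding symbol comes first
    # ARPAbet phones are prefixed with "@", e.g. _symbol_to_id["@AA1"]
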
text/text_token_collation.py ADDED
@@ -0,0 +1,131 @@
1
+ # Copyright (c) 2023 Amphion.
2
+
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from pathlib import Path
7
+ from typing import List, Tuple
8
+ import os
9
+ import numpy as np
10
+ import torch
11
+ from text.symbol_table import SymbolTable
12
+ from text import text_to_sequence
13
+
14
+
15
+ '''
16
+ TextToken: map text to id
17
+ '''
18
+ # TextTokenCollator is modified from
19
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/collation.py
20
+ class TextTokenCollator:
21
+ def __init__(
22
+ self,
23
+ text_tokens: List[str],
24
+ add_eos: bool = True,
25
+ add_bos: bool = True,
26
+ pad_symbol: str = "<pad>",
27
+ bos_symbol: str = "<bos>",
28
+ eos_symbol: str = "<eos>",
29
+ ):
30
+ self.pad_symbol = pad_symbol
31
+ self.add_eos = add_eos
32
+ self.add_bos = add_bos
33
+ self.bos_symbol = bos_symbol
34
+ self.eos_symbol = eos_symbol
35
+
36
+ unique_tokens = [pad_symbol]
37
+ if add_bos:
38
+ unique_tokens.append(bos_symbol)
39
+ if add_eos:
40
+ unique_tokens.append(eos_symbol)
41
+ unique_tokens.extend(sorted(text_tokens))
42
+
43
+ self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
44
+ self.idx2token = unique_tokens
45
+
46
+ def index(
47
+ self, tokens_list: List[str]
48
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
49
+ seqs, seq_lens = [], []
50
+ for tokens in tokens_list:
51
+ assert all(
52
+ s in self.token2idx
53
+ for s in tokens
54
+ )
55
+ seq = (
56
+ ([self.bos_symbol] if self.add_bos else [])
57
+ + list(tokens)
58
+ + ([self.eos_symbol] if self.add_eos else [])
59
+ )
60
+ seqs.append(seq)
61
+ seq_lens.append(len(seq))
62
+
63
+ max_len = max(seq_lens)
64
+ for k, (seq, seq_len) in enumerate(zip(seqs, seq_lens)):
65
+ seq.extend([self.pad_symbol] * (max_len - seq_len))
66
+
67
+ tokens = torch.from_numpy(
68
+ np.array(
69
+ [[self.token2idx[token] for token in seq] for seq in seqs],
70
+ dtype=np.int64,
71
+ )
72
+ )
73
+ tokens_lens = torch.IntTensor(seq_lens)
74
+
75
+ return tokens, tokens_lens
76
+
77
+ def __call__(self, text):
78
+ tokens_seq = [p for p in text]
79
+ seq = (
80
+ ([self.bos_symbol] if self.add_bos else [])
81
+ + tokens_seq
82
+ + ([self.eos_symbol] if self.add_eos else [])
83
+ )
84
+
85
+ token_ids = [self.token2idx[token] for token in seq]
86
+ token_lens = len(tokens_seq) + self.add_eos + self.add_bos
87
+
88
+ return token_ids, token_lens
89
+
90
+
91
+ def get_text_token_collater(text_tokens_file: str) -> TextTokenCollator:
92
+ text_tokens_path = Path(text_tokens_file)
93
+ unique_tokens = SymbolTable.from_file(text_tokens_path)
94
+ collater = TextTokenCollator(
95
+ unique_tokens.symbols, add_bos=True, add_eos=True
96
+ )
97
+ token2idx = collater.token2idx
98
+ return collater, token2idx
99
+
100
+
101
+ class phoneIDCollation:
102
+ def __init__(self, cfg, dataset=None, symbols_dict_file=None) -> None:
103
+
104
+ if cfg.preprocess.phone_extractor != 'lexicon':
105
+ ### get text token collator
106
+ if symbols_dict_file is None:
107
+ assert dataset is not None
108
+ symbols_dict_file = os.path.join(
109
+ cfg.preprocess.processed_dir,
110
+ dataset,
111
+ cfg.preprocess.symbols_dict
112
+ )
113
+ self.text_token_colloator, token2idx = get_text_token_collater(symbols_dict_file)
114
+ # # unique_tokens = SymbolTable.from_file(symbols_dict_path)
115
+ # # text_tokenizer = TextToken(unique_tokens.symbols, add_bos=True, add_eos=True)
116
+
117
+ # # update phone symbols dict file with pad_symbol or optional tokens (add_bos and add_eos) in TextTokenCollator
118
+ # phone_symbol_dict = SymbolTable()
119
+ # for s in sorted(list(set(token2idx.keys()))):
120
+ # phone_symbol_dict.add(s)
121
+ # phone_symbol_dict.to_file(symbols_dict_file)
122
+
123
+ def get_phone_id_sequence(self, cfg, phones_seq):
124
+
125
+ if cfg.preprocess.phone_extractor == 'lexicon':
126
+ phones_seq = ' '.join(phones_seq)
127
+ sequence = text_to_sequence(phones_seq, cfg.preprocess.text_cleaners)
128
+ else:
129
+ sequence, seq_len = self.text_token_colloator(phones_seq)
130
+ return sequence
131
+
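
A minimal usage sketch of TextTokenCollator with a tiny token set (the token list is illustrative):

    from text.text_token_collation import TextTokenCollator

    collator = TextTokenCollator(["a", "b", "c"], add_bos=True, add_eos=True)
    # token2idx: {"<pad>": 0, "<bos>": 1, "<eos>": 2, "a": 3, "b": 4, "c": 5}
    token_ids, token_len = collator(["a", "c"])
    # token_ids == [1, 3, 5, 2]  (<bos>, a, c, <eos>); token_len == 4
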
utils/HyperParams/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ from .hps import HyperParams
utils/HyperParams/hps.py ADDED
@@ -0,0 +1,43 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ class HyperParams:
8
+ """The class to store hyperparameters. The key is case-insensitive.
9
+
10
+ Args:
11
+ *args: a list of dict or HyperParams.
12
+ **kwargs: a list of key-value pairs.
13
+ """
14
+
15
+ def __init__(self, **kwargs):
16
+ for k, v in kwargs.items():
17
+ if type(v) == dict:
18
+ v = HyperParams(**v)
19
+ self[k] = v
20
+
21
+ def keys(self):
22
+ return self.__dict__.keys()
23
+
24
+ def items(self):
25
+ return self.__dict__.items()
26
+
27
+ def values(self):
28
+ return self.__dict__.values()
29
+
30
+ def __len__(self):
31
+ return len(self.__dict__)
32
+
33
+ def __getitem__(self, key):
34
+ return getattr(self, key)
35
+
36
+ def __setitem__(self, key, value):
37
+ return setattr(self, key, value)
38
+
39
+ def __contains__(self, key):
40
+ return key in self.__dict__
41
+
42
+ def __repr__(self):
43
+ return self.__dict__.__repr__()
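
A minimal usage sketch of HyperParams; the keys are illustrative:

    from utils.HyperParams import HyperParams

    hps = HyperParams(model={"hidden_size": 256}, lr=1e-4)
    hps.model.hidden_size   # 256 (nested dicts become HyperParams)
    hps["lr"]               # 0.0001
    "model" in hps          # True
    len(hps)                # 2
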
utils/__init__.py ADDED
File without changes
utils/audio.py ADDED
@@ -0,0 +1,74 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import numpy as np
8
+ from numpy import linalg as LA
9
+ import librosa
10
+ import soundfile as sf
11
+ import librosa.filters
12
+
13
+
14
+ def load_audio_torch(wave_file, fs):
15
+ """Load audio data into torch tensor
16
+
17
+ Args:
18
+ wave_file (str): path to wave file
19
+ fs (int): sample rate
20
+
21
+ Returns:
22
+ audio (tensor): audio data in tensor
23
+ fs (int): sample rate
24
+ """
25
+
26
+ audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)
27
+ # audio: (T,)
28
+ assert len(audio) > 2
29
+
30
+ # Check the audio type (for the soundfile loading backend) - float, 8bit or 16bit
31
+ if np.issubdtype(audio.dtype, np.integer):
32
+ max_mag = -np.iinfo(audio.dtype).min
33
+ else:
34
+ max_mag = max(np.amax(audio), -np.amin(audio))
35
+ max_mag = (
36
+ (2**31) + 1
37
+ if max_mag > (2**15)
38
+ else ((2**15) + 1 if max_mag > 1.01 else 1.0)
39
+ )
40
+
41
+ # Normalize the audio
42
+ audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag
43
+
44
+ if (torch.isnan(audio) | torch.isinf(audio)).any():
45
+ return [], sample_rate or fs or 48000
46
+
47
+ # Resample the audio to our target samplerate
48
+ if fs is not None and fs != sample_rate:
49
+ audio = torch.from_numpy(
50
+ librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
51
+ )
52
+ sample_rate = fs
53
+
54
+ return audio, fs
55
+
56
+
57
+ def _stft(y, cfg):
58
+ return librosa.stft(
59
+ y=y, n_fft=cfg.n_fft, hop_length=cfg.hop_size, win_length=cfg.win_size
60
+ )
61
+
62
+
63
+ def energy(wav, cfg):
64
+ D = _stft(wav, cfg)
65
+ magnitudes = np.abs(D).T  # (frames, freq_bins)
66
+ return LA.norm(magnitudes, axis=1)
67
+
68
+
69
+ def get_energy_from_tacotron(audio, _stft):
70
+ audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
71
+ audio = torch.autograd.Variable(audio, requires_grad=False)
72
+ mel, energy = _stft.mel_spectrogram(audio)
73
+ energy = torch.squeeze(energy, 0).numpy().astype(np.float32)
74
+ return mel, energy
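
A minimal sketch of computing frame-level energy with these helpers; the SimpleNamespace config and the wav path stand in for the project's real preprocess config and data:

    from types import SimpleNamespace
    from utils.audio import load_audio_torch, energy

    cfg = SimpleNamespace(n_fft=1024, hop_size=256, win_size=1024)
    wav, fs = load_audio_torch("example.wav", 24000)   # placeholder path
    frame_energy = energy(wav.numpy(), cfg)            # one L2 norm per STFT frame
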
utils/audio_slicer.py ADDED
@@ -0,0 +1,476 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import json
8
+ import numpy as np
9
+ from tqdm import tqdm
10
+ import torch
11
+ import torchaudio
12
+
13
+ from utils.io import save_audio
14
+ from utils.audio import load_audio_torch
15
+
16
+
17
+ # This function is obtained from librosa.
18
+ def get_rms(
19
+ y,
20
+ *,
21
+ frame_length=2048,
22
+ hop_length=512,
23
+ pad_mode="constant",
24
+ ):
25
+ padding = (int(frame_length // 2), int(frame_length // 2))
26
+ y = np.pad(y, padding, mode=pad_mode)
27
+
28
+ axis = -1
29
+ # put our new within-frame axis at the end for now
30
+ out_strides = y.strides + tuple([y.strides[axis]])
31
+ # Reduce the shape on the framing axis
32
+ x_shape_trimmed = list(y.shape)
33
+ x_shape_trimmed[axis] -= frame_length - 1
34
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
35
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
36
+ if axis < 0:
37
+ target_axis = axis - 1
38
+ else:
39
+ target_axis = axis + 1
40
+ xw = np.moveaxis(xw, -1, target_axis)
41
+ # Downsample along the target axis
42
+ slices = [slice(None)] * xw.ndim
43
+ slices[axis] = slice(0, None, hop_length)
44
+ x = xw[tuple(slices)]
45
+
46
+ # Calculate power
47
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
48
+
49
+ return np.sqrt(power)
50
+
51
+
52
+ class Slicer:
53
+ """
54
+ Copy from: https://github.com/openvpi/audio-slicer/blob/main/slicer2.py
55
+ """
56
+
57
+ def __init__(
58
+ self,
59
+ sr: int,
60
+ threshold: float = -40.0,
61
+ min_length: int = 5000,
62
+ min_interval: int = 300,
63
+ hop_size: int = 10,
64
+ max_sil_kept: int = 5000,
65
+ ):
66
+ if not min_length >= min_interval >= hop_size:
67
+ raise ValueError(
68
+ "The following condition must be satisfied: min_length >= min_interval >= hop_size"
69
+ )
70
+ if not max_sil_kept >= hop_size:
71
+ raise ValueError(
72
+ "The following condition must be satisfied: max_sil_kept >= hop_size"
73
+ )
74
+ min_interval = sr * min_interval / 1000
75
+ self.threshold = 10 ** (threshold / 20.0)
76
+ self.hop_size = round(sr * hop_size / 1000)
77
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
78
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
79
+ self.min_interval = round(min_interval / self.hop_size)
80
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
81
+
82
+ def _apply_slice(self, waveform, begin, end):
83
+ begin = begin * self.hop_size
84
+ if len(waveform.shape) > 1:
85
+ end = min(waveform.shape[1], end * self.hop_size)
86
+ return waveform[:, begin:end], begin, end
87
+ else:
88
+ end = min(waveform.shape[0], end * self.hop_size)
89
+ return waveform[begin:end], begin, end
90
+
91
+ # @timeit
92
+ def slice(self, waveform, return_chunks_positions=False):
93
+ if len(waveform.shape) > 1:
94
+ # (#channel, wave_len) -> (wave_len)
95
+ samples = waveform.mean(axis=0)
96
+ else:
97
+ samples = waveform
98
+ if samples.shape[0] <= self.min_length:
99
+ return [waveform]
100
+ rms_list = get_rms(
101
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
102
+ ).squeeze(0)
103
+ sil_tags = []
104
+ silence_start = None
105
+ clip_start = 0
106
+ for i, rms in enumerate(rms_list):
107
+ # Keep looping while frame is silent.
108
+ if rms < self.threshold:
109
+ # Record start of silent frames.
110
+ if silence_start is None:
111
+ silence_start = i
112
+ continue
113
+ # Keep looping while frame is not silent and silence start has not been recorded.
114
+ if silence_start is None:
115
+ continue
116
+ # Clear recorded silence start if interval is not enough or clip is too short
117
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
118
+ need_slice_middle = (
119
+ i - silence_start >= self.min_interval
120
+ and i - clip_start >= self.min_length
121
+ )
122
+ if not is_leading_silence and not need_slice_middle:
123
+ silence_start = None
124
+ continue
125
+ # Need slicing. Record the range of silent frames to be removed.
126
+ if i - silence_start <= self.max_sil_kept:
127
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
128
+ if silence_start == 0:
129
+ sil_tags.append((0, pos))
130
+ else:
131
+ sil_tags.append((pos, pos))
132
+ clip_start = pos
133
+ elif i - silence_start <= self.max_sil_kept * 2:
134
+ pos = rms_list[
135
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
136
+ ].argmin()
137
+ pos += i - self.max_sil_kept
138
+ pos_l = (
139
+ rms_list[
140
+ silence_start : silence_start + self.max_sil_kept + 1
141
+ ].argmin()
142
+ + silence_start
143
+ )
144
+ pos_r = (
145
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
146
+ + i
147
+ - self.max_sil_kept
148
+ )
149
+ if silence_start == 0:
150
+ sil_tags.append((0, pos_r))
151
+ clip_start = pos_r
152
+ else:
153
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
154
+ clip_start = max(pos_r, pos)
155
+ else:
156
+ pos_l = (
157
+ rms_list[
158
+ silence_start : silence_start + self.max_sil_kept + 1
159
+ ].argmin()
160
+ + silence_start
161
+ )
162
+ pos_r = (
163
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
164
+ + i
165
+ - self.max_sil_kept
166
+ )
167
+ if silence_start == 0:
168
+ sil_tags.append((0, pos_r))
169
+ else:
170
+ sil_tags.append((pos_l, pos_r))
171
+ clip_start = pos_r
172
+ silence_start = None
173
+ # Deal with trailing silence.
174
+ total_frames = rms_list.shape[0]
175
+ if (
176
+ silence_start is not None
177
+ and total_frames - silence_start >= self.min_interval
178
+ ):
179
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
180
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
181
+ sil_tags.append((pos, total_frames + 1))
182
+ # Apply and return slices.
183
+ if len(sil_tags) == 0:
184
+ return [waveform]
185
+ else:
186
+ chunks = []
187
+ chunks_pos_of_waveform = []
188
+
189
+ if sil_tags[0][0] > 0:
190
+ chunk, begin, end = self._apply_slice(waveform, 0, sil_tags[0][0])
191
+ chunks.append(chunk)
192
+ chunks_pos_of_waveform.append((begin, end))
193
+
194
+ for i in range(len(sil_tags) - 1):
195
+ chunk, begin, end = self._apply_slice(
196
+ waveform, sil_tags[i][1], sil_tags[i + 1][0]
197
+ )
198
+ chunks.append(chunk)
199
+ chunks_pos_of_waveform.append((begin, end))
200
+
201
+ if sil_tags[-1][1] < total_frames:
202
+ chunk, begin, end = self._apply_slice(
203
+ waveform, sil_tags[-1][1], total_frames
204
+ )
205
+ chunks.append(chunk)
206
+ chunks_pos_of_waveform.append((begin, end))
207
+
208
+ return (
209
+ chunks
210
+ if not return_chunks_positions
211
+ else (
212
+ chunks,
213
+ chunks_pos_of_waveform,
214
+ )
215
+ )
216
+
217
+
218
+ def split_utterances_from_audio(
219
+ wav_file,
220
+ output_dir,
221
+ max_duration_of_utterance=10.0,
222
+ min_interval=300,
223
+ db_threshold=-40,
224
+ ):
225
+ """
226
+ Split a long audio file into utterances according to silence (VAD).
227
+
228
+ max_duration_of_utterance (second):
229
+ The maximum duration of every utterance (seconds)
230
+ min_interval (millisecond):
231
+ The smaller min_interval is, the more sliced audio clips this script is likely to generate.
232
+ """
233
+ print("File:", wav_file.split("/")[-1])
234
+ waveform, fs = torchaudio.load(wav_file)
235
+
236
+ slicer = Slicer(sr=fs, min_interval=min_interval, threshold=db_threshold)
237
+ chunks, positions = slicer.slice(waveform, return_chunks_positions=True)
238
+
239
+ durations = [(end - begin) / fs for begin, end in positions]
240
+ print(
241
+ "Slicer's min silence part is {}ms, min and max duration of sliced utterances is {}s and {}s".format(
242
+ min_interval, min(durations), max(durations)
243
+ )
244
+ )
245
+
246
+ res_chunks, res_positions = [], []
247
+ for i, chunk in enumerate(chunks):
248
+ if len(chunk.shape) == 1:
249
+ chunk = chunk[None, :]
250
+
251
+ begin, end = positions[i]
252
+ assert end - begin == chunk.shape[-1]
253
+
254
+ max_wav_len = max_duration_of_utterance * fs
255
+ if chunk.shape[-1] <= max_wav_len:
256
+ res_chunks.append(chunk)
257
+ res_positions.append(positions[i])
258
+ else:
259
+ # TODO: to reserve overlapping and conduct fade-in, fade-out
260
+
261
+ # Get segments number
262
+ number = 2
263
+ while chunk.shape[-1] // number >= max_wav_len:
264
+ number += 1
265
+ seg_len = chunk.shape[-1] // number
266
+
267
+ # Split
268
+ for num in range(number):
269
+ s = seg_len * num
270
+ t = min(s + seg_len, chunk.shape[-1])
271
+
272
+ seg_begin = begin + s
273
+ seg_end = begin + t
274
+
275
+ res_chunks.append(chunk[:, s:t])
276
+ res_positions.append((seg_begin, seg_end))
277
+
278
+ # Save utterances
279
+ os.makedirs(output_dir, exist_ok=True)
280
+ res = {"fs": int(fs)}
281
+ for i, chunk in enumerate(res_chunks):
282
+ filename = "{:04d}.wav".format(i)
283
+ res[filename] = [int(p) for p in res_positions[i]]
284
+ save_audio(os.path.join(output_dir, filename), chunk, fs)
285
+
286
+ # Save positions
287
+ with open(os.path.join(output_dir, "positions.json"), "w") as f:
288
+ json.dump(res, f, indent=4, ensure_ascii=False)
289
+ return res
290
+
291
+
292
+ def is_silence(
293
+ wavform,
294
+ fs,
295
+ threshold=-40.0,
296
+ min_interval=300,
297
+ hop_size=10,
298
+ min_length=5000,
299
+ ):
300
+ """
301
+ Detect whether the given wavform is silence
302
+
303
+ wavform: (T, )
304
+ """
305
+ threshold = 10 ** (threshold / 20.0)
306
+
307
+ hop_size = round(fs * hop_size / 1000)
308
+ win_size = min(round(min_interval), 4 * hop_size)
309
+ min_length = round(fs * min_length / 1000 / hop_size)
310
+
311
+ if wavform.shape[0] <= min_length:
312
+ return True
313
+
314
+ # (#Frame,)
315
+ rms_array = get_rms(y=wavform, frame_length=win_size, hop_length=hop_size).squeeze(
316
+ 0
317
+ )
318
+ return (rms_array < threshold).all()
319
+
320
+
321
+ def split_audio(
322
+ wav_file, target_sr, output_dir, max_duration_of_segment=10.0, overlap_duration=1.0
323
+ ):
324
+ """
325
+ Split a long audio into segments.
326
+
327
+ target_sr:
328
+ The target sampling rate to save the segments.
329
+ max_duration_of_segment (second):
330
+ The maximum duration of every segment (seconds)
331
+ overlap_duration:
332
+ Each segment has "overlap duration" (second) overlap with its previous and next segment
333
+ """
334
+ # (#channel, T) -> (T,)
335
+ waveform, fs = torchaudio.load(wav_file)
336
+ waveform = torchaudio.functional.resample(
337
+ waveform, orig_freq=fs, new_freq=target_sr
338
+ )
339
+ waveform = torch.mean(waveform, dim=0)
340
+
341
+ # waveform, _ = load_audio_torch(wav_file, target_sr)
342
+ assert len(waveform.shape) == 1
343
+
344
+ assert overlap_duration < max_duration_of_segment
345
+ length = int(max_duration_of_segment * target_sr)
346
+ stride = int((max_duration_of_segment - overlap_duration) * target_sr)
347
+ chunks = []
348
+ for i in range(0, len(waveform), stride):
349
+ # (length,)
350
+ chunks.append(waveform[i : i + length])
351
+ if i + length >= len(waveform):
352
+ break
353
+
354
+ # Save segments
355
+ os.makedirs(output_dir, exist_ok=True)
356
+ results = []
357
+ for i, chunk in enumerate(chunks):
358
+ uid = "{:04d}".format(i)
359
+ filename = os.path.join(output_dir, "{}.wav".format(uid))
360
+ results.append(
361
+ {"Uid": uid, "Path": filename, "Duration": len(chunk) / target_sr}
362
+ )
363
+ save_audio(
364
+ filename,
365
+ chunk,
366
+ target_sr,
367
+ turn_up=not is_silence(chunk, target_sr),
368
+ add_silence=False,
369
+ )
370
+
371
+ return results
372
+
373
+
374
+ def merge_segments_torchaudio(wav_files, fs, output_path, overlap_duration=1.0):
375
+ """Merge the given wav_files (may have overlaps) into a long audio
376
+
377
+ fs:
378
+ The sampling rate of the wav files.
379
+ output_path:
380
+ The output path to save the merged audio.
381
+ overlap_duration (float, optional):
382
+ Each segment has "overlap duration" (second) overlap with its previous and next segment. Defaults to 1.0.
383
+ """
384
+
385
+ waveforms = []
386
+ for file in wav_files:
387
+ # (T,)
388
+ waveform, _ = load_audio_torch(file, fs)
389
+ waveforms.append(waveform)
390
+
391
+ if len(waveforms) == 1:
392
+ save_audio(output_path, waveforms[0], fs, add_silence=False, turn_up=False)
393
+ return
394
+
395
+ overlap_len = int(overlap_duration * fs)
396
+ fade_out = torchaudio.transforms.Fade(fade_out_len=overlap_len)
397
+ fade_in = torchaudio.transforms.Fade(fade_in_len=overlap_len)
398
+ fade_in_and_out = torchaudio.transforms.Fade(fade_in_len=overlap_len, fade_out_len=overlap_len)
399
+
400
+ segments_lens = [len(wav) for wav in waveforms]
401
+ merged_waveform_len = sum(segments_lens) - overlap_len * (len(waveforms) - 1)
402
+ merged_waveform = torch.zeros(merged_waveform_len)
403
+
404
+ start = 0
405
+ for index, wav in enumerate(
406
+ tqdm(waveforms, desc="Merge for {}".format(output_path))
407
+ ):
408
+ wav_len = len(wav)
409
+
410
+ if index == 0:
411
+ wav = fade_out(wav)
412
+ elif index == len(waveforms) - 1:
413
+ wav = fade_in(wav)
414
+ else:
415
+ wav = fade_in_and_out(wav)
416
+
417
+ merged_waveform[start : start + wav_len] = wav
418
+ start += wav_len - overlap_len
419
+
420
+ save_audio(output_path, merged_waveform, fs, add_silence=False, turn_up=True)
421
+
422
+
423
+ def merge_segments_encodec(wav_files, fs, output_path, overlap_duration=1.0):
424
+ """Merge the given wav_files (may have overlaps) into a long audio
425
+
426
+ fs:
427
+ The sampling rate of the wav files.
428
+ output_path:
429
+ The output path to save the merged audio.
430
+ overlap_duration (float, optional):
431
+ Each segment has "overlap duration" (second) overlap with its previous and next segment. Defaults to 1.0.
432
+ """
433
+
434
+ waveforms = []
435
+ for file in wav_files:
436
+ # (T,)
437
+ waveform, _ = load_audio_torch(file, fs)
438
+ waveforms.append(waveform)
439
+
440
+ if len(waveforms) == 1:
441
+ save_audio(output_path, waveforms[0], fs, add_silence=False, turn_up=False)
442
+ return
443
+
444
+ device = waveforms[0].device
445
+ dtype = waveforms[0].dtype
446
+ shape = waveforms[0].shape[:-1]
447
+
448
+ overlap_len = int(overlap_duration * fs)
449
+ segments_lens = [len(wav) for wav in waveforms]
450
+ merged_waveform_len = sum(segments_lens) - overlap_len * (len(waveforms) - 1)
451
+
452
+ sum_weight = torch.zeros(merged_waveform_len, device=device, dtype=dtype)
453
+ out = torch.zeros(*shape, merged_waveform_len, device=device, dtype=dtype)
454
+ offset = 0
455
+
456
+ for frame in waveforms:
457
+ frame_length = frame.size(-1)
458
+ t = torch.linspace(0, 1, frame_length + 2, device=device, dtype=torch.float32)[
459
+ 1:-1
460
+ ]
461
+ weight = 0.5 - (t - 0.5).abs()
462
+ weighted_frame = frame * weight
463
+
464
+ cur = out[..., offset : offset + frame_length]
465
+ cur += weighted_frame[..., : cur.size(-1)]
466
+ out[..., offset : offset + frame_length] = cur
467
+
468
+ cur = sum_weight[offset : offset + frame_length]
469
+ cur += weight[..., : cur.size(-1)]
470
+ sum_weight[offset : offset + frame_length] = cur
471
+
472
+ offset += frame_length - overlap_len
473
+
474
+ assert sum_weight.min() > 0
475
+ merged_waveform = out / sum_weight
476
+ save_audio(output_path, merged_waveform, fs, add_silence=False, turn_up=True)
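
A minimal usage sketch of the VAD-based splitter defined above; both paths are placeholders:

    from utils.audio_slicer import split_utterances_from_audio

    res = split_utterances_from_audio(
        "long_recording.wav",            # placeholder input path
        "data/long_recording_utts",      # placeholder output dir
        max_duration_of_utterance=10.0,
        min_interval=300,
        db_threshold=-40,
    )
    # writes 0000.wav, 0001.wav, ... plus positions.json into the output dir;
    # res maps each saved filename to its (begin, end) sample positions
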
utils/data_utils.py ADDED
@@ -0,0 +1,575 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import json
7
+ import os
8
+
9
+ import numpy as np
10
+ from scipy.interpolate import interp1d
11
+ from tqdm import tqdm
12
+ from sklearn.preprocessing import StandardScaler
13
+
14
+
15
+ def load_content_feature_path(meta_data, processed_dir, feat_dir):
16
+ utt2feat_path = {}
17
+ for utt_info in meta_data:
18
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
19
+ feat_path = os.path.join(
20
+ processed_dir, utt_info["Dataset"], feat_dir, f'{utt_info["Uid"]}.npy'
21
+ )
22
+ utt2feat_path[utt] = feat_path
23
+
24
+ return utt2feat_path
25
+
26
+
27
+ def load_source_content_feature_path(meta_data, feat_dir):
28
+ utt2feat_path = {}
29
+ for utt in meta_data:
30
+ feat_path = os.path.join(feat_dir, f"{utt}.npy")
31
+ utt2feat_path[utt] = feat_path
32
+
33
+ return utt2feat_path
34
+
35
+
36
+ def get_spk_map(spk2id_path, utt2spk_path):
37
+ utt2spk = {}
38
+ with open(spk2id_path, "r") as spk2id_file:
39
+ spk2id = json.load(spk2id_file)
40
+ with open(utt2spk_path, encoding="utf-8") as f:
41
+ for line in f.readlines():
42
+ utt, spk = line.strip().split("\t")
43
+ utt2spk[utt] = spk
44
+ return spk2id, utt2spk
45
+
46
+
47
+ def get_target_f0_median(f0_dir):
48
+ total_f0 = []
49
+ for utt in os.listdir(f0_dir):
50
+ if not utt.endswith(".npy"):
51
+ continue
52
+ f0_feat_path = os.path.join(f0_dir, utt)
53
+ f0 = np.load(f0_feat_path)
54
+ total_f0 += f0.tolist()
55
+
56
+ total_f0 = np.array(total_f0)
57
+ voiced_position = np.where(total_f0 != 0)
58
+ return np.median(total_f0[voiced_position])
59
+
60
+
61
+ def get_conversion_f0_factor(source_f0, target_median, source_median=None):
62
+ """Align the median between source f0 and target f0
63
+
64
+ Note: Here we use multiplication, whose factor is target_median/source_median
65
+
66
+ Reference: Frequency and pitch interval
67
+ http://blog.ccyg.studio/article/be12c2ee-d47c-4098-9782-ca76da3035e4/
68
+ """
69
+ if source_median is None:
70
+ voiced_position = np.where(source_f0 != 0)
71
+ source_median = np.median(source_f0[voiced_position])
72
+ factor = target_median / source_median
73
+ return source_median, factor
74
+
75
+
76
+ def transpose_key(frame_pitch, trans_key):
77
+ # Transpose by user's argument
78
+ print("Transpose key = {} ...\n".format(trans_key))
79
+
80
+ transed_pitch = frame_pitch * 2 ** (trans_key / 12)
81
+ return transed_pitch
82
+
83
+
84
+ def pitch_shift_to_target(frame_pitch, target_pitch_median, source_pitch_median=None):
85
+ # Loading F0 Base (median) and shift
86
+ source_pitch_median, factor = get_conversion_f0_factor(
87
+ frame_pitch, target_pitch_median, source_pitch_median
88
+ )
89
+ print(
90
+ "Auto transposing: source f0 median = {:.1f}, target f0 median = {:.1f}, factor = {:.2f}".format(
91
+ source_pitch_median, target_pitch_median, factor
92
+ )
93
+ )
94
+ transed_pitch = frame_pitch * factor
95
+ return transed_pitch
96
+
97
+
98
+ def load_frame_pitch(
99
+ meta_data,
100
+ processed_dir,
101
+ pitch_dir,
102
+ use_log_scale=False,
103
+ return_norm=False,
104
+ interoperate=False,
105
+ utt2spk=None,
106
+ ):
107
+ utt2pitch = {}
108
+ utt2uv = {}
109
+ if utt2spk is None:
110
+ pitch_scaler = StandardScaler()
111
+ for utt_info in meta_data:
112
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
113
+ pitch_path = os.path.join(
114
+ processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
115
+ )
116
+ pitch = np.load(pitch_path)
117
+ assert len(pitch) > 0
118
+ uv = pitch != 0
119
+ utt2uv[utt] = uv
120
+ if use_log_scale:
121
+ nonzero_idxes = np.where(pitch != 0)[0]
122
+ pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
123
+ utt2pitch[utt] = pitch
124
+ pitch_scaler.partial_fit(pitch.reshape(-1, 1))
125
+
126
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
127
+ if return_norm:
128
+ for utt_info in meta_data:
129
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
130
+ pitch = utt2pitch[utt]
131
+ normalized_pitch = (pitch - mean) / std
132
+ utt2pitch[utt] = normalized_pitch
133
+ pitch_statistic = {"mean": mean, "std": std}
134
+ else:
135
+ spk2utt = {}
136
+ pitch_statistic = []
137
+ for utt_info in meta_data:
138
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
139
+ if utt2spk[utt] not in spk2utt:
140
+ spk2utt[utt2spk[utt]] = []
141
+ spk2utt[utt2spk[utt]].append(utt)
142
+
143
+ for spk in spk2utt:
144
+ pitch_scaler = StandardScaler()
145
+ for utt in spk2utt[spk]:
146
+ dataset = utt.split("_")[0]
147
+ uid = "_".join(utt.split("_")[1:])
148
+ pitch_path = os.path.join(
149
+ processed_dir, dataset, pitch_dir, f"{uid}.npy"
150
+ )
151
+ pitch = np.load(pitch_path)
152
+ assert len(pitch) > 0
153
+ uv = pitch != 0
154
+ utt2uv[utt] = uv
155
+ if use_log_scale:
156
+ nonzero_idxes = np.where(pitch != 0)[0]
157
+ pitch[nonzero_idxes] = np.log(pitch[nonzero_idxes])
158
+ utt2pitch[utt] = pitch
159
+ pitch_scaler.partial_fit(pitch.reshape(-1, 1))
160
+
161
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
162
+ if return_norm:
163
+ for utt in spk2utt[spk]:
164
+ pitch = utt2pitch[utt]
165
+ normalized_pitch = (pitch - mean) / std
166
+ utt2pitch[utt] = normalized_pitch
167
+ pitch_statistic.append({"spk": spk, "mean": mean, "std": std})
168
+
169
+ return utt2pitch, utt2uv, pitch_statistic
170
+
171
+
172
+ # discard
173
+ def load_phone_pitch(
174
+ meta_data,
175
+ processed_dir,
176
+ pitch_dir,
177
+ utt2dur,
178
+ use_log_scale=False,
179
+ return_norm=False,
180
+ interoperate=True,
181
+ utt2spk=None,
182
+ ):
183
+ print("Load Phone Pitch")
184
+ utt2pitch = {}
185
+ utt2uv = {}
186
+ if utt2spk is None:
187
+ pitch_scaler = StandardScaler()
188
+ for utt_info in tqdm(meta_data):
189
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
190
+ pitch_path = os.path.join(
191
+ processed_dir, utt_info["Dataset"], pitch_dir, f'{utt_info["Uid"]}.npy'
192
+ )
193
+ frame_pitch = np.load(pitch_path)
194
+ assert len(frame_pitch) > 0
195
+ uv = frame_pitch != 0
196
+ utt2uv[utt] = uv
197
+ phone_pitch = phone_average_pitch(frame_pitch, utt2dur[utt], interoperate)
198
+ if use_log_scale:
199
+ nonzero_idxes = np.where(phone_pitch != 0)[0]
200
+ phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
201
+ utt2pitch[utt] = phone_pitch
202
+ pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))
203
+
204
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
205
+ max_value = np.finfo(np.float64).min
206
+ min_value = np.finfo(np.float64).max
207
+ if return_norm:
208
+ for utt_info in meta_data:
209
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
210
+ pitch = utt2pitch[utt]
211
+ normalized_pitch = (pitch - mean) / std
212
+ max_value = max(max_value, max(normalized_pitch))
213
+ min_value = min(min_value, min(normalized_pitch))
214
+ utt2pitch[utt] = normalized_pitch
215
+ phone_normalized_pitch_path = os.path.join(
216
+ processed_dir,
217
+ utt_info["Dataset"],
218
+ "phone_level_" + pitch_dir,
219
+ f'{utt_info["Uid"]}.npy',
220
+ )
221
+ pitch_statistic = {
222
+ "mean": mean,
223
+ "std": std,
224
+ "min_value": min_value,
225
+ "max_value": max_value,
226
+ }
227
+ else:
228
+ spk2utt = {}
229
+ pitch_statistic = []
230
+ for utt_info in tqdm(meta_data):
231
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
232
+ if utt2spk[utt] not in spk2utt:
233
+ spk2utt[utt2spk[utt]] = []
234
+ spk2utt[utt2spk[utt]].append(utt)
235
+
236
+ for spk in spk2utt:
237
+ pitch_scaler = StandardScaler()
238
+ for utt in spk2utt[spk]:
239
+ dataset = utt.split("_")[0]
240
+ uid = "_".join(utt.split("_")[1:])
241
+ pitch_path = os.path.join(
242
+ processed_dir, dataset, pitch_dir, f"{uid}.npy"
243
+ )
244
+ frame_pitch = np.load(pitch_path)
245
+ assert len(frame_pitch) > 0
246
+ uv = frame_pitch != 0
247
+ utt2uv[utt] = uv
248
+ phone_pitch = phone_average_pitch(
249
+ frame_pitch, utt2dur[utt], interoperate
250
+ )
251
+ if use_log_scale:
252
+ nonzero_idxes = np.where(phone_pitch != 0)[0]
253
+ phone_pitch[nonzero_idxes] = np.log(phone_pitch[nonzero_idxes])
254
+ utt2pitch[utt] = phone_pitch
255
+ pitch_scaler.partial_fit(remove_outlier(phone_pitch).reshape(-1, 1))
256
+
257
+ mean, std = pitch_scaler.mean_[0], pitch_scaler.scale_[0]
258
+ max_value = np.finfo(np.float64).min
259
+ min_value = np.finfo(np.float64).max
260
+
261
+ if return_norm:
262
+ for utt in spk2utt[spk]:
263
+ pitch = utt2pitch[utt]
264
+ normalized_pitch = (pitch - mean) / std
265
+ max_value = max(max_value, max(normalized_pitch))
266
+ min_value = min(min_value, min(normalized_pitch))
267
+ utt2pitch[utt] = normalized_pitch
268
+ pitch_statistic.append(
269
+ {
270
+ "spk": spk,
271
+ "mean": mean,
272
+ "std": std,
273
+ "min_value": min_value,
274
+ "max_value": max_value,
275
+ }
276
+ )
277
+
278
+ return utt2pitch, utt2uv, pitch_statistic
279
+
280
+
281
+ def phone_average_pitch(pitch, dur, interoperate=False):
282
+ pos = 0
283
+
284
+ if interoperate:
285
+ nonzero_ids = np.where(pitch != 0)[0]
286
+ interp_fn = interp1d(
287
+ nonzero_ids,
288
+ pitch[nonzero_ids],
289
+ fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]),
290
+ bounds_error=False,
291
+ )
292
+ pitch = interp_fn(np.arange(0, len(pitch)))
293
+ phone_pitch = np.zeros(len(dur))
294
+
295
+ for i, d in enumerate(dur):
296
+ d = int(d)
297
+ if d > 0 and pos < len(pitch):
298
+ phone_pitch[i] = np.mean(pitch[pos : pos + d])
299
+ else:
300
+ phone_pitch[i] = 0
301
+ pos += d
302
+ return phone_pitch
303
+
304
+
305
+ def load_energy(
306
+ meta_data,
307
+ processed_dir,
308
+ energy_dir,
309
+ use_log_scale=False,
310
+ return_norm=False,
311
+ utt2spk=None,
312
+ ):
313
+ utt2energy = {}
314
+ if utt2spk is None:
315
+ for utt_info in meta_data:
316
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
317
+ energy_path = os.path.join(
318
+ processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
319
+ )
320
+ if not os.path.exists(energy_path):
321
+ continue
322
+ energy = np.load(energy_path)
323
+ assert len(energy) > 0
324
+
325
+ if use_log_scale:
326
+ nonzero_idxes = np.where(energy != 0)[0]
327
+ energy[nonzero_idxes] = np.log(energy[nonzero_idxes])
328
+ utt2energy[utt] = energy
329
+
330
+ if return_norm:
331
+ with open(
332
+ os.path.join(
333
+ processed_dir, utt_info["Dataset"], energy_dir, "statistics.json"
334
+ )
335
+ ) as f:
336
+ stats = json.load(f)
337
+ mean, std = (
338
+ stats[utt_info["Dataset"] + "_" + utt_info["Singer"]][
339
+ "voiced_positions"
340
+ ]["mean"],
341
+ stats["LJSpeech_LJSpeech"]["voiced_positions"]["std"],
342
+ )
343
+ for utt in utt2energy.keys():
344
+ energy = utt2energy[utt]
345
+ normalized_energy = (energy - mean) / std
346
+ utt2energy[utt] = normalized_energy
347
+
348
+ energy_statistic = {"mean": mean, "std": std}
349
+ else:
350
+ spk2utt = {}
351
+ energy_statistic = []
352
+ for utt_info in meta_data:
353
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
354
+ if utt2spk[utt] not in spk2utt:
355
+ spk2utt[utt2spk[utt]] = []
356
+ spk2utt[utt2spk[utt]].append(utt)
357
+
358
+ for spk in spk2utt:
359
+ energy_scaler = StandardScaler()
360
+ for utt in spk2utt[spk]:
361
+ dataset = utt.split("_")[0]
362
+ uid = "_".join(utt.split("_")[1:])
363
+ energy_path = os.path.join(
364
+ processed_dir, dataset, energy_dir, f"{uid}.npy"
365
+ )
366
+ if not os.path.exists(energy_path):
367
+ continue
368
+ frame_energy = np.load(energy_path)
369
+ assert len(frame_energy) > 0
370
+
371
+ if use_log_scale:
372
+ nonzero_idxes = np.where(frame_energy != 0)[0]
373
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
374
+ utt2energy[utt] = frame_energy
375
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
376
+
377
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
378
+ if return_norm:
379
+ for utt in spk2utt[spk]:
380
+ energy = utt2energy[utt]
381
+ normalized_energy = (energy - mean) / std
382
+ utt2energy[utt] = normalized_energy
383
+ energy_statistic.append({"spk": spk, "mean": mean, "std": std})
384
+
385
+ return utt2energy, energy_statistic
386
+
387
+
388
+ def load_frame_energy(
389
+ meta_data,
390
+ processed_dir,
391
+ energy_dir,
392
+ use_log_scale=False,
393
+ return_norm=False,
394
+ interoperate=False,
395
+ utt2spk=None,
396
+ ):
397
+ utt2energy = {}
398
+ if utt2spk is None:
399
+ energy_scaler = StandardScaler()
400
+ for utt_info in meta_data:
401
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
402
+ energy_path = os.path.join(
403
+ processed_dir, utt_info["Dataset"], energy_dir, f'{utt_info["Uid"]}.npy'
404
+ )
405
+ frame_energy = np.load(energy_path)
406
+ assert len(frame_energy) > 0
407
+
408
+ if use_log_scale:
409
+ nonzero_idxes = np.where(frame_energy != 0)[0]
410
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
411
+ utt2energy[utt] = frame_energy
412
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
413
+
414
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
415
+ if return_norm:
416
+ for utt_info in meta_data:
417
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
418
+ energy = utt2energy[utt]
419
+ normalized_energy = (energy - mean) / std
420
+ utt2energy[utt] = normalized_energy
421
+ energy_statistic = {"mean": mean, "std": std}
422
+
423
+ else:
424
+ spk2utt = {}
425
+ energy_statistic = []
426
+ for utt_info in meta_data:
427
+ utt = utt_info["Dataset"] + "_" + utt_info["Uid"]
428
+ if utt2spk[utt] not in spk2utt:
429
+ spk2utt[utt2spk[utt]] = []
430
+ spk2utt[utt2spk[utt]].append(utt)
431
+
432
+ for spk in spk2utt:
433
+ energy_scaler = StandardScaler()
434
+ for utt in spk2utt[spk]:
435
+ dataset = utt.split("_")[0]
436
+ uid = "_".join(utt.split("_")[1:])
437
+ energy_path = os.path.join(
438
+ processed_dir, dataset, energy_dir, f"{uid}.npy"
439
+ )
440
+ frame_energy = np.load(energy_path)
441
+ assert len(frame_energy) > 0
442
+
443
+ if use_log_scale:
444
+ nonzero_idxes = np.where(frame_energy != 0)[0]
445
+ frame_energy[nonzero_idxes] = np.log(frame_energy[nonzero_idxes])
446
+ utt2energy[utt] = frame_energy
447
+ energy_scaler.partial_fit(frame_energy.reshape(-1, 1))
448
+
449
+ mean, std = energy_scaler.mean_[0], energy_scaler.scale_[0]
450
+ if return_norm:
451
+ for utt in spk2utt[spk]:
452
+ energy = utt2energy[utt]
453
+ normalized_energy = (energy - mean) / std
454
+ utt2energy[utt] = normalized_energy
455
+ energy_statistic.append({"spk": spk, "mean": mean, "std": std})
456
+
457
+ return utt2energy, energy_statistic
458
+
459
+
460
+ def align_length(feature, target_len, pad_value=0.0):
461
+ feature_len = feature.shape[-1]
462
+ dim = len(feature.shape)
463
+ # align 2-D data
464
+ if dim == 2:
465
+ if target_len > feature_len:
466
+ feature = np.pad(
467
+ feature,
468
+ ((0, 0), (0, target_len - feature_len)),
469
+ constant_values=pad_value,
470
+ )
471
+ else:
472
+ feature = feature[:, :target_len]
473
+ # align 1-D data
474
+ elif dim == 1:
475
+ if target_len > feature_len:
476
+ feature = np.pad(
477
+ feature, (0, target_len - feature_len), constant_values=pad_value
478
+ )
479
+ else:
480
+ feature = feature[:target_len]
481
+ else:
482
+ raise NotImplementedError
483
+ return feature
484
+
485
+
486
+ def align_whisper_feauture_length(
487
+ feature, target_len, fast_mapping=True, source_hop=320, target_hop=256
488
+ ):
489
+ factor = np.gcd(source_hop, target_hop)
490
+ source_hop //= factor
491
+ target_hop //= factor
492
+ # print(
493
+ # "Mapping source's {} frames => target's {} frames".format(
494
+ # target_hop, source_hop
495
+ # )
496
+ # )
497
+
498
+ max_source_len = 1500
499
+ target_len = min(target_len, max_source_len * source_hop // target_hop)
500
+
501
+ width = feature.shape[-1]
502
+
503
+ if fast_mapping:
504
+ source_len = target_len * target_hop // source_hop + 1
505
+ feature = feature[:source_len]
506
+
507
+ else:
508
+ source_len = max_source_len
509
+
510
+ # const ~= target_len * target_hop
511
+ const = source_len * source_hop // target_hop * target_hop
512
+
513
+ # (source_len * source_hop, dim)
514
+ up_sampling_feats = np.repeat(feature, source_hop, axis=0)
515
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
516
+ down_sampling_feats = np.average(
517
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
518
+ )
519
+ assert len(down_sampling_feats) >= target_len
520
+
521
+ # (target_len, dim)
522
+ feat = down_sampling_feats[:target_len]
523
+
524
+ return feat
525
+
526
+
527
+ def align_content_feature_length(feature, target_len, source_hop=320, target_hop=256):
528
+ factor = np.gcd(source_hop, target_hop)
529
+ source_hop //= factor
530
+ target_hop //= factor
531
+ # print(
532
+ # "Mapping source's {} frames => target's {} frames".format(
533
+ # target_hop, source_hop
534
+ # )
535
+ # )
536
+
537
+ # (source_len, 256)
538
+ source_len, width = feature.shape
539
+
540
+ # const ~= target_len * target_hop
541
+ const = source_len * source_hop // target_hop * target_hop
542
+
543
+ # (source_len * source_hop, dim)
544
+ up_sampling_feats = np.repeat(feature, source_hop, axis=0)
545
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
546
+ down_sampling_feats = np.average(
547
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
548
+ )
549
+
550
+ err = abs(target_len - len(down_sampling_feats))
551
+ if err > 4: ## why 4 not 3?
552
+ print("target_len:", target_len)
553
+ print("raw feature:", feature.shape)
554
+ print("up_sampling:", up_sampling_feats.shape)
555
+ print("down_sampling_feats:", down_sampling_feats.shape)
556
+ exit()
557
+ if len(down_sampling_feats) < target_len:
558
+ # (1, dim) -> (err, dim)
559
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
560
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
561
+
562
+ # (target_len, dim)
563
+ feat = down_sampling_feats[:target_len]
564
+
565
+ return feat
566
+
567
+
568
+ def remove_outlier(values):
569
+ values = np.array(values)
570
+ p25 = np.percentile(values, 25)
571
+ p75 = np.percentile(values, 75)
572
+ lower = p25 - 1.5 * (p75 - p25)
573
+ upper = p75 + 1.5 * (p75 - p25)
574
+ normal_indices = np.logical_and(values > lower, values < upper)
575
+ return values[normal_indices]
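The align_whisper_feauture_length and align_content_feature_length helpers above share one resampling trick: after reducing both hops by their GCD, repeat each source frame source_hop times and average in windows of target_hop. A minimal numpy-only sketch of that idea on toy shapes (the sizes below are illustrative assumptions, not the repository's defaults):

import numpy as np

# Toy content feature: 10 frames extracted with a 320-sample hop,
# to be mapped onto a 256-sample mel hop.
feature = np.random.rand(10, 4)
source_hop, target_hop = 320, 256
factor = np.gcd(source_hop, target_hop)  # 64
source_hop //= factor                    # 5
target_hop //= factor                    # 4

# Repeat each source frame 5 times, then average every 4 "samples",
# so the output frame rate matches the target hop.
width = feature.shape[-1]
const = feature.shape[0] * source_hop // target_hop * target_hop
up = np.repeat(feature, source_hop, axis=0)
down = np.average(up[:const].reshape(-1, target_hop, width), axis=1)
print(feature.shape, "->", down.shape)   # (10, 4) -> (12, 4)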
utils/distribution.py ADDED
@@ -0,0 +1,270 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ from torch.distributions import Normal
11
+
12
+
13
+ def log_sum_exp(x):
14
+ """numerically stable log_sum_exp implementation that prevents overflow"""
15
+ # TF ordering
16
+ axis = len(x.size()) - 1
17
+ m, _ = torch.max(x, dim=axis)
18
+ m2, _ = torch.max(x, dim=axis, keepdim=True)
19
+ return m + torch.log(torch.sum(torch.exp(x - m2), dim=axis))
20
+
21
+
22
+ def discretized_mix_logistic_loss(
23
+ y_hat, y, num_classes=256, log_scale_min=-7.0, reduce=True
24
+ ):
25
+ """Discretized mixture of logistic distributions loss
26
+
27
+ Note that it is assumed that input is scaled to [-1, 1].
28
+
29
+ Args:
30
+ y_hat (Tensor): Predicted output (B x C x T)
31
+ y (Tensor): Target (B x T x 1).
32
+ num_classes (int): Number of classes
33
+ log_scale_min (float): Log scale minimum value
34
+ reduce (bool): If True, the losses are averaged or summed for each
35
+ minibatch.
36
+
37
+ Returns
38
+ Tensor: loss
39
+ """
40
+ assert y_hat.dim() == 3
41
+ assert y_hat.size(1) % 3 == 0
42
+ nr_mix = y_hat.size(1) // 3
43
+
44
+ # (B x T x C)
45
+ y_hat = y_hat.transpose(1, 2)
46
+
47
+ # unpack parameters. (B, T, num_mixtures) x 3
48
+ logit_probs = y_hat[:, :, :nr_mix]
49
+ means = y_hat[:, :, nr_mix : 2 * nr_mix]
50
+ log_scales = torch.clamp(y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min)
51
+
52
+ # B x T x 1 -> B x T x num_mixtures
53
+ y = y.expand_as(means)
54
+
55
+ centered_y = y - means
56
+ inv_stdv = torch.exp(-log_scales)
57
+ plus_in = inv_stdv * (centered_y + 1.0 / (num_classes - 1))
58
+ cdf_plus = torch.sigmoid(plus_in)
59
+ min_in = inv_stdv * (centered_y - 1.0 / (num_classes - 1))
60
+ cdf_min = torch.sigmoid(min_in)
61
+
62
+ # log probability for edge case of 0 (before scaling)
63
+ # equivalent: torch.log(torch.sigmoid(plus_in))
64
+ log_cdf_plus = plus_in - F.softplus(plus_in)
65
+
66
+ # log probability for edge case of 255 (before scaling)
67
+ # equivalent: (1 - torch.sigmoid(min_in)).log()
68
+ log_one_minus_cdf_min = -F.softplus(min_in)
69
+
70
+ # probability for all other cases
71
+ cdf_delta = cdf_plus - cdf_min
72
+
73
+ mid_in = inv_stdv * centered_y
74
+ # log probability in the center of the bin, to be used in extreme cases
75
+ # (not actually used in our code)
76
+ log_pdf_mid = mid_in - log_scales - 2.0 * F.softplus(mid_in)
77
+
78
+ # tf equivalent
79
+ """
80
+ log_probs = tf.where(x < -0.999, log_cdf_plus,
81
+ tf.where(x > 0.999, log_one_minus_cdf_min,
82
+ tf.where(cdf_delta > 1e-5,
83
+ tf.log(tf.maximum(cdf_delta, 1e-12)),
84
+ log_pdf_mid - np.log(127.5))))
85
+ """
86
+ # TODO: cdf_delta <= 1e-5 actually can happen. How can we choose the value
87
+ # for num_classes=65536 case? 1e-7? not sure..
88
+ inner_inner_cond = (cdf_delta > 1e-5).float()
89
+
90
+ inner_inner_out = inner_inner_cond * torch.log(
91
+ torch.clamp(cdf_delta, min=1e-12)
92
+ ) + (1.0 - inner_inner_cond) * (log_pdf_mid - np.log((num_classes - 1) / 2))
93
+ inner_cond = (y > 0.999).float()
94
+ inner_out = (
95
+ inner_cond * log_one_minus_cdf_min + (1.0 - inner_cond) * inner_inner_out
96
+ )
97
+ cond = (y < -0.999).float()
98
+ log_probs = cond * log_cdf_plus + (1.0 - cond) * inner_out
99
+
100
+ log_probs = log_probs + F.log_softmax(logit_probs, -1)
101
+
102
+ if reduce:
103
+ return -torch.sum(log_sum_exp(log_probs))
104
+ else:
105
+ return -log_sum_exp(log_probs).unsqueeze(-1)
106
+
107
+
108
+ def to_one_hot(tensor, n, fill_with=1.0):
109
+ # we perform one-hot encoding with respect to the last axis
110
+ one_hot = torch.FloatTensor(tensor.size() + (n,)).zero_()
111
+ if tensor.is_cuda:
112
+ one_hot = one_hot.cuda()
113
+ one_hot.scatter_(len(tensor.size()), tensor.unsqueeze(-1), fill_with)
114
+ return one_hot
115
+
116
+
117
+ def sample_from_discretized_mix_logistic(y, log_scale_min=-7.0, clamp_log_scale=False):
118
+ """
119
+ Sample from discretized mixture of logistic distributions
120
+
121
+ Args:
122
+ y (Tensor): B x C x T
123
+ log_scale_min (float): Log scale minimum value
124
+
125
+ Returns:
126
+ Tensor: sample in range of [-1, 1].
127
+ """
128
+ assert y.size(1) % 3 == 0
129
+ nr_mix = y.size(1) // 3
130
+
131
+ # B x T x C
132
+ y = y.transpose(1, 2)
133
+ logit_probs = y[:, :, :nr_mix]
134
+
135
+ # sample mixture indicator from softmax
136
+ temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
137
+ temp = logit_probs.data - torch.log(-torch.log(temp))
138
+ _, argmax = temp.max(dim=-1)
139
+
140
+ # (B, T) -> (B, T, nr_mix)
141
+ one_hot = to_one_hot(argmax, nr_mix)
142
+ # select logistic parameters
143
+ means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
144
+ log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
145
+ if clamp_log_scale:
146
+ log_scales = torch.clamp(log_scales, min=log_scale_min)
147
+ # sample from logistic & clip to interval
148
+ # we don't actually round to the nearest 8bit value when sampling
149
+ u = means.data.new(means.size()).uniform_(1e-5, 1.0 - 1e-5)
150
+ x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1.0 - u))
151
+
152
+ x = torch.clamp(torch.clamp(x, min=-1.0), max=1.0)
153
+
154
+ return x
155
+
156
+
157
+ # We could easily define a discretized version of the Gaussian loss; however,
158
+ # we use the continuous version, the same as https://clarinet-demo.github.io/
159
+ def mix_gaussian_loss(y_hat, y, log_scale_min=-7.0, reduce=True):
160
+ """Mixture of continuous gaussian distributions loss
161
+
162
+ Note that it is assumed that input is scaled to [-1, 1].
163
+
164
+ Args:
165
+ y_hat (Tensor): Predicted output (B x C x T)
166
+ y (Tensor): Target (B x T x 1).
167
+ log_scale_min (float): Log scale minimum value
168
+ reduce (bool): If True, the losses are averaged or summed for each
169
+ minibatch.
170
+ Returns
171
+ Tensor: loss
172
+ """
173
+ assert y_hat.dim() == 3
174
+ C = y_hat.size(1)
175
+ if C == 2:
176
+ nr_mix = 1
177
+ else:
178
+ assert y_hat.size(1) % 3 == 0
179
+ nr_mix = y_hat.size(1) // 3
180
+
181
+ # (B x T x C)
182
+ y_hat = y_hat.transpose(1, 2)
183
+
184
+ # unpack parameters.
185
+ if C == 2:
186
+ # special case for C == 2, just for compatibility
187
+ logit_probs = None
188
+ means = y_hat[:, :, 0:1]
189
+ log_scales = torch.clamp(y_hat[:, :, 1:2], min=log_scale_min)
190
+ else:
191
+ # (B, T, num_mixtures) x 3
192
+ logit_probs = y_hat[:, :, :nr_mix]
193
+ means = y_hat[:, :, nr_mix : 2 * nr_mix]
194
+ log_scales = torch.clamp(
195
+ y_hat[:, :, 2 * nr_mix : 3 * nr_mix], min=log_scale_min
196
+ )
197
+
198
+ # B x T x 1 -> B x T x num_mixtures
199
+ y = y.expand_as(means)
200
+
201
+ centered_y = y - means
202
+ dist = Normal(loc=0.0, scale=torch.exp(log_scales))
203
+ # do we need to add a trick to avoid log(0)?
204
+ log_probs = dist.log_prob(centered_y)
205
+
206
+ if nr_mix > 1:
207
+ log_probs = log_probs + F.log_softmax(logit_probs, -1)
208
+
209
+ if reduce:
210
+ if nr_mix == 1:
211
+ return -torch.sum(log_probs)
212
+ else:
213
+ return -torch.sum(log_sum_exp(log_probs))
214
+ else:
215
+ if nr_mix == 1:
216
+ return -log_probs
217
+ else:
218
+ return -log_sum_exp(log_probs).unsqueeze(-1)
219
+
220
+
221
+ def sample_from_mix_gaussian(y, log_scale_min=-7.0):
222
+ """
223
+ Sample from (discretized) mixture of gaussian distributions
224
+ Args:
225
+ y (Tensor): B x C x T
226
+ log_scale_min (float): Log scale minimum value
227
+ Returns:
228
+ Tensor: sample in range of [-1, 1].
229
+ """
230
+ C = y.size(1)
231
+ if C == 2:
232
+ nr_mix = 1
233
+ else:
234
+ assert y.size(1) % 3 == 0
235
+ nr_mix = y.size(1) // 3
236
+
237
+ # B x T x C
238
+ y = y.transpose(1, 2)
239
+
240
+ if C == 2:
241
+ logit_probs = None
242
+ else:
243
+ logit_probs = y[:, :, :nr_mix]
244
+
245
+ if nr_mix > 1:
246
+ # sample mixture indicator from softmax
247
+ temp = logit_probs.data.new(logit_probs.size()).uniform_(1e-5, 1.0 - 1e-5)
248
+ temp = logit_probs.data - torch.log(-torch.log(temp))
249
+ _, argmax = temp.max(dim=-1)
250
+
251
+ # (B, T) -> (B, T, nr_mix)
252
+ one_hot = to_one_hot(argmax, nr_mix)
253
+
254
+ # Select means and log scales
255
+ means = torch.sum(y[:, :, nr_mix : 2 * nr_mix] * one_hot, dim=-1)
256
+ log_scales = torch.sum(y[:, :, 2 * nr_mix : 3 * nr_mix] * one_hot, dim=-1)
257
+ else:
258
+ if C == 2:
259
+ means, log_scales = y[:, :, 0], y[:, :, 1]
260
+ elif C == 3:
261
+ means, log_scales = y[:, :, 1], y[:, :, 2]
262
+ else:
263
+ assert False, "shouldn't happen"
264
+
265
+ scales = torch.exp(log_scales)
266
+ dist = Normal(loc=means, scale=scales)
267
+ x = dist.sample()
268
+
269
+ x = torch.clamp(x, min=-1.0, max=1.0)
270
+ return x
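A quick smoke test for the mixture-of-logistics loss and sampler defined above; it assumes the repository root is on PYTHONPATH so this file is importable as utils.distribution, and uses random tensors purely for shape checking:

import torch
from utils.distribution import (
    discretized_mix_logistic_loss,
    sample_from_discretized_mix_logistic,
)

B, T, nr_mix = 2, 100, 10
y_hat = torch.randn(B, 3 * nr_mix, T)  # [logit_probs | means | log_scales]
y = torch.rand(B, T, 1) * 2 - 1        # targets scaled to [-1, 1]

loss = discretized_mix_logistic_loss(y_hat, y, num_classes=256)
sample = sample_from_discretized_mix_logistic(y_hat)
print(loss.item(), sample.shape)       # scalar loss, torch.Size([2, 100])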
utils/dsp.py ADDED
@@ -0,0 +1,97 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+ # ZERO = 1e-12
10
+
11
+
12
+ def gaussian_normalize_mel_channel(mel, mu, sigma):
13
+ """
14
+ Shift to the standard normal distribution
15
+
16
+ Args:
17
+ mel: (n_mels, frame_len)
18
+ mu: (n_mels,), mean value
19
+ sigma: (n_mels,), sd value
20
+ Return:
21
+ Tensor like mel
22
+ """
23
+ mu = np.expand_dims(mu, -1)
24
+ sigma = np.expand_dims(sigma, -1)
25
+ return (mel - mu) / sigma
26
+
27
+
28
+ def de_gaussian_normalize_mel_channel(mel, mu, sigma):
29
+ """
30
+
31
+ Args:
32
+ mel: (n_mels, frame_len)
33
+ mu: (n_mels,), mean value
34
+ sigma: (n_mels,), sd value
35
+ Return:
36
+ Tensor like mel
37
+ """
38
+ mu = np.expand_dims(mu, -1)
39
+ sigma = np.expand_dims(sigma, -1)
40
+ return sigma * mel + mu
41
+
42
+
43
+ def decompress(audio_compressed, bits):
44
+ mu = 2**bits - 1
45
+ audio = np.sign(audio_compressed) / mu * ((1 + mu) ** np.abs(audio_compressed) - 1)
46
+ return audio
47
+
48
+
49
+ def compress(audio, bits):
50
+ mu = 2**bits - 1
51
+ audio_compressed = np.sign(audio) * np.log(1 + mu * np.abs(audio)) / np.log(mu + 1)
52
+ return audio_compressed
53
+
54
+
55
+ def label_to_audio(quant, bits):
56
+ classes = 2**bits
57
+ audio = 2 * quant / (classes - 1.0) - 1.0
58
+ return audio
59
+
60
+
61
+ def audio_to_label(audio, bits):
62
+ """Normalized audio data tensor to digit array
63
+
64
+ Args:
65
+ audio (tensor): audio data
66
+ bits (int): data bits
67
+
68
+ Returns:
69
+ array<int>: digit array of audio data
70
+ """
71
+ classes = 2**bits
72
+ # initialize an increasing array with values from -1 to 1
73
+ bins = np.linspace(-1, 1, classes)
74
+ # change value in audio tensor to digits
75
+ quant = np.digitize(audio, bins) - 1
76
+ return quant
77
+
78
+
79
+ def label_to_onehot(x, bits):
80
+ """Converts a class vector (integers) to binary class matrix.
81
+ Args:
82
+ x: class vector to be converted into a matrix
83
+ (integers from 0 to num_classes).
84
+ bits: data bits; the number of classes is 2**bits.
85
+ Returns:
86
+ A binary matrix representation of the input. The classes axis
87
+ is placed last.
88
+ """
89
+ classes = 2**bits
90
+
91
+ result = torch.zeros((x.shape[0], classes), dtype=torch.float32)
92
+ for i in range(x.shape[0]):
93
+ result[i, x[i]] = 1
94
+
95
+ output_shape = x.shape + (classes,)
96
+ output = torch.reshape(result, output_shape)
97
+ return output
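A small round-trip sketch for the mu-law helpers above (assuming the file is importable as utils.dsp): it compands a test tone, quantizes it to 8-bit labels, and reconstructs it:

import numpy as np
from utils.dsp import audio_to_label, compress, decompress, label_to_audio

bits = 8
audio = np.sin(np.linspace(0, 4 * np.pi, 16000)).astype(np.float32)

compressed = compress(audio, bits)         # mu-law companding, still in [-1, 1]
labels = audio_to_label(compressed, bits)  # integer labels in [0, 2**bits - 1]
restored = decompress(label_to_audio(labels, bits), bits)

print(labels.min(), labels.max())             # e.g. 0 255
print(float(np.abs(audio - restored).max()))  # small quantization error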
utils/duration.py ADDED
@@ -0,0 +1,86 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import numpy as np
7
+ import os
8
+ import tgt
9
+
10
+
11
+ def get_alignment(tier, cfg):
12
+ sample_rate = cfg["sample_rate"]
13
+ hop_size = cfg["hop_size"]
14
+
15
+ sil_phones = ["sil", "sp", "spn"]
16
+
17
+ phones = []
18
+ durations = []
19
+ start_time = 0
20
+ end_time = 0
21
+ end_idx = 0
22
+
23
+ for t in tier._objects:
24
+ s, e, p = t.start_time, t.end_time, t.text
25
+
26
+ # Trim leading silences
27
+ if phones == []:
28
+ if p in sil_phones:
29
+ continue
30
+ else:
31
+ start_time = s
32
+
33
+ if p not in sil_phones:
34
+ # For ordinary phones
35
+ phones.append(p)
36
+ end_time = e
37
+ end_idx = len(phones)
38
+ else:
39
+ # For silent phones
40
+ phones.append(p)
41
+
42
+ durations.append(
43
+ int(
44
+ np.round(e * sample_rate / hop_size)
45
+ - np.round(s * sample_rate / hop_size)
46
+ )
47
+ )
48
+
49
+ # Trim trailing silences
50
+ phones = phones[:end_idx]
51
+ durations = durations[:end_idx]
52
+
53
+ return phones, durations, start_time, end_time
54
+
55
+
56
+ def get_duration(utt, wav, cfg):
57
+ speaker = utt["Singer"]
58
+ basename = utt["Uid"]
59
+ dataset = utt["Dataset"]
60
+ sample_rate = cfg["sample_rate"]
61
+
62
+ # print(cfg.processed_dir, dataset, speaker, basename)
63
+ wav_path = os.path.join(
64
+ cfg.processed_dir, dataset, "raw_data", speaker, "{}.wav".format(basename)
65
+ )
66
+ text_path = os.path.join(
67
+ cfg.processed_dir, dataset, "raw_data", speaker, "{}.lab".format(basename)
68
+ )
69
+ tg_path = os.path.join(
70
+ cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
71
+ )
72
+
73
+ # Read raw text
74
+ with open(text_path, "r") as f:
75
+ raw_text = f.readline().strip("\n")
76
+
77
+ # Get alignments
78
+ textgrid = tgt.io.read_textgrid(tg_path)
79
+ phone, duration, start, end = get_alignment(
80
+ textgrid.get_tier_by_name("phones"), cfg
81
+ )
82
+ text = "{" + " ".join(phone) + "}"
83
+ if start >= end:
84
+ return None
85
+
86
+ return duration, text, int(sample_rate * start), int(sample_rate * end)
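The frame-count bookkeeping in get_alignment is the part that is easy to get wrong: each phone spans round(end*sr/hop) - round(start*sr/hop) frames, so the per-phone durations always sum to the total frame count. A toy check with made-up intervals and an assumed 24 kHz / 300-sample-hop setup:

import numpy as np

sample_rate, hop_size = 24000, 300
intervals = [(0.00, 0.12, "HH"), (0.12, 0.31, "AH"), (0.31, 0.50, "L")]

durations = [
    int(np.round(e * sample_rate / hop_size) - np.round(s * sample_rate / hop_size))
    for s, e, _ in intervals
]
print(durations, sum(durations))  # [10, 15, 15] 40, and round(0.50 * 24000 / 300) == 40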
utils/f0.py ADDED
@@ -0,0 +1,320 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import torch
9
+ import parselmouth
10
+ import torchcrepe
11
+ import pyworld as pw
12
+
13
+
14
+ def get_bin_index(f0, m, M, n_bins, use_log_scale):
15
+ """
16
+ WARNING: deprecated, to be removed.
17
+
18
+ Args:
19
+ raw_f0: tensor whose shape is (N, frame_len)
20
+ Returns:
21
+ index: tensor whose shape is the same as f0
22
+ """
23
+ raw_f0 = f0.clone()
24
+ raw_m, raw_M = m, M
25
+
26
+ if use_log_scale:
27
+ f0[torch.where(f0 == 0)] = 1
28
+ f0 = torch.log(f0)
29
+ m, M = float(np.log(m)), float(np.log(M))
30
+
31
+ # Set normal index in [1, n_bins - 1]
32
+ width = (M + 1e-7 - m) / (n_bins - 1)
33
+ index = (f0 - m) // width + 1
34
+ # Set unvoiced frames to 0. Therefore, the vocabulary is [0, n_bins - 1], whose size is n_bins
35
+ index[torch.where(f0 == 0)] = 0
36
+
37
+ # TODO: Boundary check (special: to judge whether 0 for unvoiced)
38
+ if torch.any(raw_f0 > raw_M):
39
+ print("F0 Warning: too high f0: {}".format(raw_f0[torch.where(raw_f0 > raw_M)]))
40
+ index[torch.where(raw_f0 > raw_M)] = n_bins - 1
41
+ if torch.any(raw_f0 < raw_m):
42
+ print("F0 Warning: too low f0: {}".format(raw_f0[torch.where(f0 < m)]))
43
+ index[torch.where(f0 < m)] = 0
44
+
45
+ return torch.as_tensor(index, dtype=torch.long, device=f0.device)
46
+
47
+
48
+ def f0_to_coarse(f0, pitch_bin, pitch_min, pitch_max):
49
+ ## TODO: Figure out the detail of this function
50
+
51
+ f0_mel_min = 1127 * np.log(1 + pitch_min / 700)
52
+ f0_mel_max = 1127 * np.log(1 + pitch_max / 700)
53
+
54
+ is_torch = isinstance(f0, torch.Tensor)
55
+ f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
56
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (pitch_bin - 2) / (
57
+ f0_mel_max - f0_mel_min
58
+ ) + 1
59
+
60
+ f0_mel[f0_mel <= 1] = 1
61
+ f0_mel[f0_mel > pitch_bin - 1] = pitch_bin - 1
62
+ f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(np.int32)
63
+ assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (
64
+ f0_coarse.max(),
65
+ f0_coarse.min(),
66
+ )
67
+ return f0_coarse
68
+
69
+
70
+ def interpolate(f0):
71
+ """Interpolate the unvoiced part. Thus the f0 can be passed to a subtractive synthesizer.
72
+ Args:
73
+ f0: A numpy array of shape (seq_len,)
74
+ Returns:
75
+ f0: Interpolated f0 of shape (seq_len,)
76
+ uv: Unvoiced part of shape (seq_len,)
77
+ """
78
+ uv = f0 == 0
79
+ if len(f0[~uv]) > 0:
80
+ # interpolate the unvoiced f0
81
+ f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
82
+ uv = uv.astype("float")
83
+ uv = np.min(np.array([uv[:-2], uv[1:-1], uv[2:]]), axis=0)
84
+ uv = np.pad(uv, (1, 1))
85
+ return f0, uv
86
+
87
+
88
+ def get_log_f0(f0):
89
+ f0[np.where(f0 == 0)] = 1
90
+ log_f0 = np.log(f0)
91
+ return log_f0
92
+
93
+
94
+ # ========== Methods ==========
95
+
96
+
97
+ def get_f0_features_using_pyin(audio, cfg):
98
+ """Using pyin to extract the f0 feature.
99
+ Args:
100
+ audio
101
+ fs
102
+ win_length
103
+ hop_length
104
+ f0_min
105
+ f0_max
106
+ Returns:
107
+ f0: numpy array of shape (frame_len,)
108
+ """
109
+ f0, voiced_flag, voiced_probs = librosa.pyin(
110
+ y=audio,
111
+ fmin=cfg.f0_min,
112
+ fmax=cfg.f0_max,
113
+ sr=cfg.sample_rate,
114
+ win_length=cfg.win_size,
115
+ hop_length=cfg.hop_size,
116
+ )
117
+ # Set nan to 0
118
+ f0[voiced_flag == False] = 0
119
+ return f0
120
+
121
+
122
+ def get_f0_features_using_parselmouth(audio, cfg, speed=1):
123
+ """Using parselmouth to extract the f0 feature.
124
+ Args:
125
+ audio
126
+ mel_len
127
+ hop_length
128
+ fs
129
+ f0_min
130
+ f0_max
131
+ speed(default=1)
132
+ Returns:
133
+ f0: numpy array of shape (frame_len,)
134
+ pitch_coarse: numpy array of shape (frame_len,)
135
+ """
136
+ hop_size = int(np.round(cfg.hop_size * speed))
137
+
138
+ # Calculate the time step for pitch extraction
139
+ time_step = hop_size / cfg.sample_rate * 1000
140
+
141
+ f0 = (
142
+ parselmouth.Sound(audio, cfg.sample_rate)
143
+ .to_pitch_ac(
144
+ time_step=time_step / 1000,
145
+ voicing_threshold=0.6,
146
+ pitch_floor=cfg.f0_min,
147
+ pitch_ceiling=cfg.f0_max,
148
+ )
149
+ .selected_array["frequency"]
150
+ )
151
+
152
+ # Pad the pitch to the mel_len
153
+ # pad_size = (int(len(audio) // hop_size) - len(f0) + 1) // 2
154
+ # f0 = np.pad(f0, [[pad_size, mel_len - len(f0) - pad_size]], mode="constant")
155
+
156
+ # Get the coarse part
157
+ pitch_coarse = f0_to_coarse(f0, cfg.pitch_bin, cfg.f0_min, cfg.f0_max)
158
+ return f0, pitch_coarse
159
+
160
+
161
+ def get_f0_features_using_dio(audio, cfg):
162
+ """Using dio to extract the f0 feature.
163
+ Args:
164
+ audio
165
+ mel_len
166
+ fs
167
+ hop_length
168
+ f0_min
169
+ f0_max
170
+ Returns:
171
+ f0: numpy array of shape (frame_len,)
172
+ """
173
+ # Get the raw f0
174
+ _f0, t = pw.dio(
175
+ audio.astype("double"),
176
+ cfg.sample_rate,
177
+ f0_floor=cfg.f0_min,
178
+ f0_ceil=cfg.f0_max,
179
+ channels_in_octave=2,
180
+ frame_period=(1000 * cfg.hop_size / cfg.sample_rate),
181
+ )
182
+ # Get the f0
183
+ f0 = pw.stonemask(audio.astype("double"), _f0, t, cfg.sample_rate)
184
+ return f0
185
+
186
+
187
+ def get_f0_features_using_harvest(audio, mel_len, fs, hop_length, f0_min, f0_max):
188
+ """Using harvest to extract the f0 feature.
189
+ Args:
190
+ audio
191
+ mel_len
192
+ fs
193
+ hop_length
194
+ f0_min
195
+ f0_max
196
+ Returns:
197
+ f0: numpy array of shape (frame_len,)
198
+ """
199
+ f0, _ = pw.harvest(
200
+ audio.astype("double"),
201
+ fs,
202
+ f0_floor=f0_min,
203
+ f0_ceil=f0_max,
204
+ frame_period=(1000 * hop_length / fs),
205
+ )
206
+ f0 = f0.astype("float")[:mel_len]
207
+ return f0
208
+
209
+
210
+ def get_f0_features_using_crepe_legacy(
211
+ audio, mel_len, fs, hop_length, hop_length_new, f0_min, f0_max, threshold=0.3
212
+ ):
213
+ """Using torchcrepe to extract the f0 feature.
214
+ Args:
215
+ audio
216
+ mel_len
217
+ fs
218
+ hop_length
219
+ hop_length_new
220
+ f0_min
221
+ f0_max
222
+ threshold(default=0.3)
223
+ Returns:
224
+ f0: numpy array of shape (frame_len,)
225
+ """
226
+ # Currently, crepe only supports 16 kHz audio
227
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
228
+ audio_16k = librosa.resample(audio, orig_sr=fs, target_sr=16000)
229
+ audio_16k_torch = torch.FloatTensor(audio_16k).unsqueeze(0).to(device)
230
+
231
+ # Get the raw pitch
232
+ f0, pd = torchcrepe.predict(
233
+ audio_16k_torch,
234
+ 16000,
235
+ hop_length_new,
236
+ f0_min,
237
+ f0_max,
238
+ pad=True,
239
+ model="full",
240
+ batch_size=1024,
241
+ device=device,
242
+ return_periodicity=True,
243
+ )
244
+
245
+ # Filter, de-silence, set up threshold for unvoiced part
246
+ pd = torchcrepe.filter.median(pd, 3)
247
+ pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_16k_torch, 16000, hop_length_new)
248
+ f0 = torchcrepe.threshold.At(threshold)(f0, pd)
249
+ f0 = torchcrepe.filter.mean(f0, 3)
250
+
251
+ # Convert unvoiced part to 0hz
252
+ f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)
253
+
254
+ # Interpolate f0
255
+ nzindex = torch.nonzero(f0[0]).squeeze()
256
+ f0 = torch.index_select(f0[0], dim=0, index=nzindex).cpu().numpy()
257
+ time_org = 0.005 * nzindex.cpu().numpy()
258
+ time_frame = np.arange(mel_len) * hop_length / fs
259
+ f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
260
+ return f0
261
+
262
+ def get_f0_features_using_crepe(audio, cfg):
263
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
264
+ audio_torch = torch.FloatTensor(audio).unsqueeze(0).to(device)
265
+
266
+ crepe_pitch, pd = torchcrepe.predict(audio_torch, cfg.sample_rate, cfg.hop_size, fmin=cfg.f0_min, fmax=cfg.f0_max, return_periodicity=True)
267
+
268
+ threshold = 0.3
269
+
270
+ # Filter, de-silence, set up threshold for unvoiced part
271
+ pd = torchcrepe.filter.median(pd, 3)
272
+ pd = torchcrepe.threshold.Silence(-60.0)(pd, audio_torch, cfg.sample_rate, 256)
273
+ crepe_pitch = torchcrepe.threshold.At(threshold)(crepe_pitch, pd)
274
+ crepe_pitch = torchcrepe.filter.mean(crepe_pitch, 3)
275
+
276
+ # Convert unvoiced part to 0hz
277
+ crepe_pitch = torch.where(torch.isnan(crepe_pitch), torch.full_like(crepe_pitch, 0), crepe_pitch)
278
+
279
+ return crepe_pitch[0].cpu().numpy()
280
+
281
+
282
+ def get_f0(audio, cfg):
283
+ if cfg.pitch_extractor == "dio":
284
+ f0 = get_f0_features_using_dio(audio, cfg)
285
+ elif cfg.pitch_extractor == "pyin":
286
+ f0 = get_f0_features_using_pyin(audio, cfg)
287
+ elif cfg.pitch_extractor == "parselmouth":
288
+ f0, _ = get_f0_features_using_parselmouth(audio, cfg)
289
+ elif cfg.pitch_extractor == "crepe":
290
+ f0 = get_f0_features_using_crepe(audio, cfg)
291
+ # elif cfg.data.f0_extractor == 'cwt': # todo
292
+
293
+ return f0
294
+
295
+
296
+ def get_cents(f0_hz):
297
+ """
298
+ F_{cent} = 1200 * log2 (F/440)
299
+
300
+ Reference:
301
+ APSIPA'17, Perceptual Evaluation of Singing Quality
302
+ """
303
+ voiced_f0 = f0_hz[f0_hz != 0]
304
+ return 1200 * np.log2(voiced_f0 / 440)
305
+
306
+
307
+ def get_pitch_derivatives(f0_hz):
308
+ """
309
+ f0_hz: (,T)
310
+ """
311
+ f0_cent = get_cents(f0_hz)
312
+ return f0_cent[1:] - f0_cent[:-1]
313
+
314
+
315
+ def get_pitch_sub_median(f0_hz):
316
+ """
317
+ f0_hz: (,T)
318
+ """
319
+ f0_cent = get_cents(f0_hz)
320
+ return f0_cent - np.median(f0_cent)
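A tiny example of interpolate() above on a toy contour (assuming the file is importable as utils.f0): zeros mark unvoiced frames, which get filled by linear interpolation while uv keeps an unvoiced mask of the same length:

import numpy as np
from utils.f0 import interpolate

f0 = np.array([0.0, 0.0, 220.0, 230.0, 0.0, 0.0, 240.0, 0.0])
f0_interp, uv = interpolate(f0.copy())

print(f0_interp)  # gaps filled; edges take the nearest voiced value
print(uv)         # eroded and padded unvoiced mask, same length as f0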
utils/hparam.py ADDED
@@ -0,0 +1,659 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/training/python/training/hparam.py pylint: disable=line-too-long
7
+ """Hyperparameter values."""
8
+ from __future__ import absolute_import
9
+ from __future__ import division
10
+ from __future__ import print_function
11
+
12
+ import json
13
+ import numbers
14
+ import re
15
+ import six
16
+
17
+ # Define the regular expression for parsing a single clause of the input
18
+ # (delimited by commas). A legal clause looks like:
19
+ # <variable name>[<index>]? = <rhs>
20
+ # where <rhs> is either a single token or [] enclosed list of tokens.
21
+ # For example: "var[1] = a" or "x = [1,2,3]"
22
+ PARAM_RE = re.compile(
23
+ r"""
24
+ (?P<name>[a-zA-Z][\w\.]*) # variable name: "var" or "x"
25
+ (\[\s*(?P<index>\d+)\s*\])? # (optional) index: "1" or None
26
+ \s*=\s*
27
+ ((?P<val>[^,\[]*) # single value: "a" or None
28
+ |
29
+ \[(?P<vals>[^\]]*)\]) # list of values: None or "1,2,3"
30
+ ($|,\s*)""",
31
+ re.VERBOSE,
32
+ )
33
+
34
+
35
+ def _parse_fail(name, var_type, value, values):
36
+ """Helper function for raising a value error for bad assignment."""
37
+ raise ValueError(
38
+ "Could not parse hparam '%s' of type '%s' with value '%s' in %s"
39
+ % (name, var_type.__name__, value, values)
40
+ )
41
+
42
+
43
+ def _reuse_fail(name, values):
44
+ """Helper function for raising a value error for reuse of name."""
45
+ raise ValueError("Multiple assignments to variable '%s' in %s" % (name, values))
46
+
47
+
48
+ def _process_scalar_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
49
+ """Update results_dictionary with a scalar value.
50
+
51
+ Used to update the results_dictionary to be returned by parse_values when
52
+ encountering a clause with a scalar RHS (e.g. "s=5" or "arr[0]=5".)
53
+
54
+ Mutates results_dictionary.
55
+
56
+ Args:
57
+ name: Name of variable in assignment ("s" or "arr").
58
+ parse_fn: Function for parsing the actual value.
59
+ var_type: Type of named variable.
60
+ m_dict: Dictionary constructed from regex parsing.
61
+ m_dict['val']: RHS value (scalar)
62
+ m_dict['index']: List index value (or None)
63
+ values: Full expression being parsed
64
+ results_dictionary: The dictionary being updated for return by the parsing
65
+ function.
66
+
67
+ Raises:
68
+ ValueError: If the name has already been used.
69
+ """
70
+ try:
71
+ parsed_value = parse_fn(m_dict["val"])
72
+ except ValueError:
73
+ _parse_fail(name, var_type, m_dict["val"], values)
74
+
75
+ # If no index is provided
76
+ if not m_dict["index"]:
77
+ if name in results_dictionary:
78
+ _reuse_fail(name, values)
79
+ results_dictionary[name] = parsed_value
80
+ else:
81
+ if name in results_dictionary:
82
+ # If the name has already been used as a scalar, it
83
+ # will be in this dictionary and map to a non-dictionary.
84
+ if not isinstance(results_dictionary.get(name), dict):
85
+ _reuse_fail(name, values)
86
+ else:
87
+ results_dictionary[name] = {}
88
+
89
+ index = int(m_dict["index"])
90
+ # Make sure the index position hasn't already been assigned a value.
91
+ if index in results_dictionary[name]:
92
+ _reuse_fail("{}[{}]".format(name, index), values)
93
+ results_dictionary[name][index] = parsed_value
94
+
95
+
96
+ def _process_list_value(name, parse_fn, var_type, m_dict, values, results_dictionary):
97
+ """Update results_dictionary from a list of values.
98
+
99
+ Used to update results_dictionary to be returned by parse_values when
100
+ encountering a clause with a list RHS (e.g. "arr=[1,2,3]".)
101
+
102
+ Mutates results_dictionary.
103
+
104
+ Args:
105
+ name: Name of variable in assignment ("arr").
106
+ parse_fn: Function for parsing individual values.
107
+ var_type: Type of named variable.
108
+ m_dict: Dictionary constructed from regex parsing.
109
+ m_dict['val']: RHS value (scalar)
110
+ values: Full expression being parsed
111
+ results_dictionary: The dictionary being updated for return by the parsing
112
+ function.
113
+
114
+ Raises:
115
+ ValueError: If the name has an index or the values cannot be parsed.
116
+ """
117
+ if m_dict["index"] is not None:
118
+ raise ValueError("Assignment of a list to a list index.")
119
+ elements = filter(None, re.split("[ ,]", m_dict["vals"]))
120
+ # Make sure the name hasn't already been assigned a value
121
+ if name in results_dictionary:
122
+ raise _reuse_fail(name, values)
123
+ try:
124
+ results_dictionary[name] = [parse_fn(e) for e in elements]
125
+ except ValueError:
126
+ _parse_fail(name, var_type, m_dict["vals"], values)
127
+
128
+
129
+ def _cast_to_type_if_compatible(name, param_type, value):
130
+ """Cast hparam to the provided type, if compatible.
131
+
132
+ Args:
133
+ name: Name of the hparam to be cast.
134
+ param_type: The type of the hparam.
135
+ value: The value to be cast, if compatible.
136
+
137
+ Returns:
138
+ The result of casting `value` to `param_type`.
139
+
140
+ Raises:
141
+ ValueError: If the type of `value` is not compatible with param_type.
142
+ * If `param_type` is a string type, but `value` is not.
143
+ * If `param_type` is a boolean, but `value` is not, or vice versa.
144
+ * If `param_type` is an integer type, but `value` is not.
145
+ * If `param_type` is a float type, but `value` is not a numeric type.
146
+ """
147
+ fail_msg = "Could not cast hparam '%s' of type '%s' from value %r" % (
148
+ name,
149
+ param_type,
150
+ value,
151
+ )
152
+
153
+ # Some callers use None, for which we can't do any casting/checking. :(
154
+ if issubclass(param_type, type(None)):
155
+ return value
156
+
157
+ # Avoid converting a non-string type to a string.
158
+ if issubclass(param_type, (six.string_types, six.binary_type)) and not isinstance(
159
+ value, (six.string_types, six.binary_type)
160
+ ):
161
+ raise ValueError(fail_msg)
162
+
163
+ # Avoid converting a number or string type to a boolean or vice versa.
164
+ if issubclass(param_type, bool) != isinstance(value, bool):
165
+ raise ValueError(fail_msg)
166
+
167
+ # Avoid converting float to an integer (the reverse is fine).
168
+ if issubclass(param_type, numbers.Integral) and not isinstance(
169
+ value, numbers.Integral
170
+ ):
171
+ raise ValueError(fail_msg)
172
+
173
+ # Avoid converting a non-numeric type to a numeric type.
174
+ if issubclass(param_type, numbers.Number) and not isinstance(value, numbers.Number):
175
+ raise ValueError(fail_msg)
176
+
177
+ return param_type(value)
178
+
179
+
180
+ def parse_values(values, type_map, ignore_unknown=False):
181
+ """Parses hyperparameter values from a string into a python map.
182
+
183
+ `values` is a string containing comma-separated `name=value` pairs.
184
+ For each pair, the value of the hyperparameter named `name` is set to
185
+ `value`.
186
+
187
+ If a hyperparameter name appears multiple times in `values`, a ValueError
188
+ is raised (e.g. 'a=1,a=2', 'a[1]=1,a[1]=2').
189
+
190
+ If a hyperparameter name appears in both an index assignment and a scalar assignment,
191
+ a ValueError is raised. (e.g. 'a=[1,2,3],a[0] = 1').
192
+
193
+ The hyperparameter name may contain '.' symbols, which will result in an
194
+ attribute name that is only accessible through the getattr and setattr
195
+ functions. (And must first be explicitly added through add_hparam.)
196
+
197
+ WARNING: Use of '.' in your variable names is allowed, but is not well
198
+ supported and not recommended.
199
+
200
+ The `value` in `name=value` must follow the syntax according to the
201
+ type of the parameter:
202
+
203
+ * Scalar integer: A Python-parsable integer value. E.g.: 1,
204
+ 100, -12.
205
+ * Scalar float: A Python-parsable floating point value. E.g.: 1.0,
206
+ -.54e89.
207
+ * Boolean: Either true or false.
208
+ * Scalar string: A non-empty sequence of characters, excluding comma,
209
+ spaces, and square brackets. E.g.: foo, bar_1.
210
+ * List: A comma separated list of scalar values of the parameter type
211
+ enclosed in square brackets. E.g.: [1,2,3], [1.0,1e-12], [high,low].
212
+
213
+ When index assignment is used, the corresponding type_map key should be the
214
+ list name. E.g. for "arr[1]=0" the type_map must have the key "arr" (not
215
+ "arr[1]").
216
+
217
+ Args:
218
+ values: String. Comma separated list of `name=value` pairs where
219
+ 'value' must follow the syntax described above.
220
+ type_map: A dictionary mapping hyperparameter names to types. Note every
221
+ parameter name in values must be a key in type_map. The values must
222
+ conform to the types indicated, where a value V is said to conform to a
223
+ type T if either V has type T, or V is a list of elements of type T.
224
+ Hence, for a multidimensional parameter 'x' taking float values,
225
+ 'x=[0.1,0.2]' will parse successfully if type_map['x'] = float.
226
+ ignore_unknown: Bool. Whether values that are missing a type in type_map
227
+ should be ignored. If set to True, a ValueError will not be raised for
228
+ unknown hyperparameter type.
229
+
230
+ Returns:
231
+ A python map mapping each name to either:
232
+ * A scalar value.
233
+ * A list of scalar values.
234
+ * A dictionary mapping index numbers to scalar values.
235
+ (e.g. "x=5,L=[1,2],arr[1]=3" results in {'x':5,'L':[1,2],'arr':{1:3}}")
236
+
237
+ Raises:
238
+ ValueError: If there is a problem with input.
239
+ * If `values` cannot be parsed.
240
+ * If a list is assigned to a list index (e.g. 'a[1] = [1,2,3]').
241
+ * If the same rvalue is assigned two different values (e.g. 'a=1,a=2',
242
+ 'a[1]=1,a[1]=2', or 'a=1,a=[1]')
243
+ """
244
+ results_dictionary = {}
245
+ pos = 0
246
+ while pos < len(values):
247
+ m = PARAM_RE.match(values, pos)
248
+ if not m:
249
+ raise ValueError("Malformed hyperparameter value: %s" % values[pos:])
250
+ # Check that there is a comma between parameters and move past it.
251
+ pos = m.end()
252
+ # Parse the values.
253
+ m_dict = m.groupdict()
254
+ name = m_dict["name"]
255
+ if name not in type_map:
256
+ if ignore_unknown:
257
+ continue
258
+ raise ValueError("Unknown hyperparameter type for %s" % name)
259
+ type_ = type_map[name]
260
+
261
+ # Set up correct parsing function (depending on whether type_ is a bool)
262
+ if type_ == bool:
263
+
264
+ def parse_bool(value):
265
+ if value in ["true", "True"]:
266
+ return True
267
+ elif value in ["false", "False"]:
268
+ return False
269
+ else:
270
+ try:
271
+ return bool(int(value))
272
+ except ValueError:
273
+ _parse_fail(name, type_, value, values)
274
+
275
+ parse = parse_bool
276
+ else:
277
+ parse = type_
278
+
279
+ # If a single value is provided
280
+ if m_dict["val"] is not None:
281
+ _process_scalar_value(
282
+ name, parse, type_, m_dict, values, results_dictionary
283
+ )
284
+
285
+ # If the assigned value is a list:
286
+ elif m_dict["vals"] is not None:
287
+ _process_list_value(name, parse, type_, m_dict, values, results_dictionary)
288
+
289
+ else: # Not assigned a list or value
290
+ _parse_fail(name, type_, "", values)
291
+
292
+ return results_dictionary
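The docstring's own example for parse_values, runnable as-is if the repository root is on PYTHONPATH:

from utils.hparam import parse_values

type_map = {"x": int, "L": int, "arr": int}
print(parse_values("x=5,L=[1,2],arr[1]=3", type_map))
# {'x': 5, 'L': [1, 2], 'arr': {1: 3}}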
293
+
294
+
295
+ class HParams(object):
296
+ """Class to hold a set of hyperparameters as name-value pairs.
297
+
298
+ A `HParams` object holds hyperparameters used to build and train a model,
299
+ such as the number of hidden units in a neural net layer or the learning rate
300
+ to use when training.
301
+
302
+ You first create a `HParams` object by specifying the names and values of the
303
+ hyperparameters.
304
+
305
+ To make them easily accessible the parameter names are added as direct
306
+ attributes of the class. A typical usage is as follows:
307
+
308
+ ```python
309
+ # Create a HParams object specifying names and values of the model
310
+ # hyperparameters:
311
+ hparams = HParams(learning_rate=0.1, num_hidden_units=100)
312
+
313
+ # The hyperparameter are available as attributes of the HParams object:
314
+ hparams.learning_rate ==> 0.1
315
+ hparams.num_hidden_units ==> 100
316
+ ```
317
+
318
+ Hyperparameters have type, which is inferred from the type of their value
319
+ passed at construction time. The currently supported types are: integer,
320
+ float, boolean, string, and list of integer, float, boolean, or string.
321
+
322
+ You can override hyperparameter values by calling the
323
+ [`parse()`](#HParams.parse) method, passing a string of comma separated
324
+ `name=value` pairs. This is intended to make it possible to override
325
+ any hyperparameter values from a single command-line flag to which
326
+ the user passes 'hyper-param=value' pairs. It avoids having to define
327
+ one flag for each hyperparameter.
328
+
329
+ The syntax expected for each value depends on the type of the parameter.
330
+ See `parse()` for a description of the syntax.
331
+
332
+ Example:
333
+
334
+ ```python
335
+ # Define a command line flag to pass name=value pairs.
336
+ # For example using argparse:
337
+ import argparse
338
+ parser = argparse.ArgumentParser(description='Train my model.')
339
+ parser.add_argument('--hparams', type=str,
340
+ help='Comma separated list of "name=value" pairs.')
341
+ args = parser.parse_args()
342
+ ...
343
+ def my_program():
344
+ # Create a HParams object specifying the names and values of the
345
+ # model hyperparameters:
346
+ hparams = tf.HParams(learning_rate=0.1, num_hidden_units=100,
347
+ activations=['relu', 'tanh'])
348
+
349
+ # Override hyperparameters values by parsing the command line
350
+ hparams.parse(args.hparams)
351
+
352
+ # If the user passed `--hparams=learning_rate=0.3` on the command line
353
+ # then 'hparams' has the following attributes:
354
+ hparams.learning_rate ==> 0.3
355
+ hparams.num_hidden_units ==> 100
356
+ hparams.activations ==> ['relu', 'tanh']
357
+
358
+ # If the hyperparameters are in json format use parse_json:
359
+ hparams.parse_json('{"learning_rate": 0.3, "activations": "relu"}')
360
+ ```
361
+ """
362
+
363
+ _HAS_DYNAMIC_ATTRIBUTES = True # Required for pytype checks.
364
+
365
+ def __init__(self, model_structure=None, **kwargs):
366
+ """Create an instance of `HParams` from keyword arguments.
367
+
368
+ The keyword arguments specify name-value pairs for the hyperparameters.
369
+ The parameter types are inferred from the type of the values passed.
370
+
371
+ The parameter names are added as attributes of `HParams` object, so they
372
+ can be accessed directly with the dot notation `hparams._name_`.
373
+
374
+ Example:
375
+
376
+ ```python
377
+ # Define 3 hyperparameters: 'learning_rate' is a float parameter,
378
+ # 'num_hidden_units' an integer parameter, and 'activation' a string
379
+ # parameter.
380
+ hparams = tf.HParams(
381
+ learning_rate=0.1, num_hidden_units=100, activation='relu')
382
+
383
+ hparams.activation ==> 'relu'
384
+ ```
385
+
386
+ Note that a few names are reserved and cannot be used as hyperparameter
387
+ names. If you use one of the reserved names, the constructor raises a
388
+ `ValueError`.
389
+
390
+ Args:
391
+ model_structure: An instance of ModelStructure, defining the feature
392
+ crosses to be used in the Trial.
393
+ **kwargs: Key-value pairs where the key is the hyperparameter name and
394
+ the value is the value for the parameter.
395
+
396
+ Raises:
397
+ ValueError: If one of the arguments is invalid, for example if a reserved
398
+ name is used as a hyperparameter name.
399
+
400
+ """
401
+ # Register the hyperparameters and their type in _hparam_types.
402
+ # This simplifies the implementation of parse().
403
+ # _hparam_types maps the parameter name to a tuple (type, bool).
404
+ # The type value is the type of the parameter for scalar hyperparameters,
405
+ # or the type of the list elements for multidimensional hyperparameters.
406
+ # The bool value is True if the value is a list, False otherwise.
407
+ self._hparam_types = {}
408
+ self._model_structure = model_structure
409
+ for name, value in six.iteritems(kwargs):
410
+ self.add_hparam(name, value)
411
+
412
+ def add_hparam(self, name, value):
413
+ """Adds {name, value} pair to hyperparameters.
414
+
415
+ Args:
416
+ name: Name of the hyperparameter.
417
+ value: Value of the hyperparameter. Can be one of the following types:
418
+ int, float, string, int list, float list, or string list.
419
+
420
+ Raises:
421
+ ValueError: if one of the arguments is invalid.
422
+ """
423
+ # Keys in kwargs are unique, but 'name' could be the name of a pre-existing
424
+ # attribute of this object. In that case we refuse to use it as a
425
+ # hyperparameter name.
426
+ if getattr(self, name, None) is not None:
427
+ raise ValueError("Hyperparameter name is reserved: %s" % name)
428
+ if isinstance(value, (list, tuple)):
429
+ if not value:
430
+ raise ValueError(
431
+ "Multi-valued hyperparameters cannot be empty: %s" % name
432
+ )
433
+ self._hparam_types[name] = (type(value[0]), True)
434
+ else:
435
+ self._hparam_types[name] = (type(value), False)
436
+ setattr(self, name, value)
437
+
438
+ def set_hparam(self, name, value):
439
+ """Set the value of an existing hyperparameter.
440
+
441
+ This function verifies that the type of the value matches the type of the
442
+ existing hyperparameter.
443
+
444
+ Args:
445
+ name: Name of the hyperparameter.
446
+ value: New value of the hyperparameter.
447
+
448
+ Raises:
449
+ KeyError: If the hyperparameter doesn't exist.
450
+ ValueError: If there is a type mismatch.
451
+ """
452
+ param_type, is_list = self._hparam_types[name]
453
+ if isinstance(value, list):
454
+ if not is_list:
455
+ raise ValueError(
456
+ "Must not pass a list for single-valued parameter: %s" % name
457
+ )
458
+ setattr(
459
+ self,
460
+ name,
461
+ [_cast_to_type_if_compatible(name, param_type, v) for v in value],
462
+ )
463
+ else:
464
+ if is_list:
465
+ raise ValueError(
466
+ "Must pass a list for multi-valued parameter: %s." % name
467
+ )
468
+ setattr(self, name, _cast_to_type_if_compatible(name, param_type, value))
469
+
470
+ def del_hparam(self, name):
471
+ """Removes the hyperparameter with key 'name'.
472
+
473
+ Does nothing if it isn't present.
474
+
475
+ Args:
476
+ name: Name of the hyperparameter.
477
+ """
478
+ if hasattr(self, name):
479
+ delattr(self, name)
480
+ del self._hparam_types[name]
481
+
482
+ def parse(self, values):
483
+ """Override existing hyperparameter values, parsing new values from a string.
484
+
485
+ See parse_values for more detail on the allowed format for values.
486
+
487
+ Args:
488
+ values: String. Comma separated list of `name=value` pairs where 'value'
489
+ must follow the syntax described above.
490
+
491
+ Returns:
492
+ The `HParams` instance.
493
+
494
+ Raises:
495
+ ValueError: If `values` cannot be parsed or a hyperparameter in `values`
496
+ doesn't exist.
497
+ """
498
+ type_map = {}
499
+ for name, t in self._hparam_types.items():
500
+ param_type, _ = t
501
+ type_map[name] = param_type
502
+
503
+ values_map = parse_values(values, type_map)
504
+ return self.override_from_dict(values_map)
505
+
506
+ def override_from_dict(self, values_dict):
507
+ """Override existing hyperparameter values, parsing new values from a dictionary.
508
+
509
+ Args:
510
+ values_dict: Dictionary of name:value pairs.
511
+
512
+ Returns:
513
+ The `HParams` instance.
514
+
515
+ Raises:
516
+ KeyError: If a hyperparameter in `values_dict` doesn't exist.
517
+ ValueError: If `values_dict` cannot be parsed.
518
+ """
519
+ for name, value in values_dict.items():
520
+ self.set_hparam(name, value)
521
+ return self
522
+
523
+ def set_model_structure(self, model_structure):
524
+ self._model_structure = model_structure
525
+
526
+ def get_model_structure(self):
527
+ return self._model_structure
528
+
529
+ def to_json(self, indent=None, separators=None, sort_keys=False):
530
+ """Serializes the hyperparameters into JSON.
531
+
532
+ Args:
533
+ indent: If a non-negative integer, JSON array elements and object members
534
+ will be pretty-printed with that indent level. An indent level of 0, or
535
+ negative, will only insert newlines. `None` (the default) selects the
536
+ most compact representation.
537
+ separators: Optional `(item_separator, key_separator)` tuple. Default is
538
+ `(', ', ': ')`.
539
+ sort_keys: If `True`, the output dictionaries will be sorted by key.
540
+
541
+ Returns:
542
+ A JSON string.
543
+ """
544
+
545
+ def remove_callables(x):
546
+ """Omit callable elements from input with arbitrary nesting."""
547
+ if isinstance(x, dict):
548
+ return {
549
+ k: remove_callables(v)
550
+ for k, v in six.iteritems(x)
551
+ if not callable(v)
552
+ }
553
+ elif isinstance(x, list):
554
+ return [remove_callables(i) for i in x if not callable(i)]
555
+ return x
556
+
557
+ return json.dumps(
558
+ remove_callables(self.values()),
559
+ indent=indent,
560
+ separators=separators,
561
+ sort_keys=sort_keys,
562
+ )
563
+
564
+ def parse_json(self, values_json):
565
+ """Override existing hyperparameter values, parsing new values from a json object.
566
+
567
+ Args:
568
+ values_json: String containing a json object of name:value pairs.
569
+
570
+ Returns:
571
+ The `HParams` instance.
572
+
573
+ Raises:
574
+ KeyError: If a hyperparameter in `values_json` doesn't exist.
575
+ ValueError: If `values_json` cannot be parsed.
576
+ """
577
+ values_map = json.loads(values_json)
578
+ return self.override_from_dict(values_map)
579
+
580
+ def values(self):
581
+ """Return the hyperparameter values as a Python dictionary.
582
+
583
+ Returns:
584
+ A dictionary with hyperparameter names as keys. The values are the
585
+ hyperparameter values.
586
+ """
587
+ return {n: getattr(self, n) for n in self._hparam_types.keys()}
588
+
589
+ def get(self, key, default=None):
590
+ """Returns the value of `key` if it exists, else `default`."""
591
+ if key in self._hparam_types:
592
+ # Ensure that default is compatible with the parameter type.
593
+ if default is not None:
594
+ param_type, is_param_list = self._hparam_types[key]
595
+ type_str = "list<%s>" % param_type if is_param_list else str(param_type)
596
+ fail_msg = (
597
+ "Hparam '%s' of type '%s' is incompatible with "
598
+ "default=%s" % (key, type_str, default)
599
+ )
600
+
601
+ is_default_list = isinstance(default, list)
602
+ if is_param_list != is_default_list:
603
+ raise ValueError(fail_msg)
604
+
605
+ try:
606
+ if is_default_list:
607
+ for value in default:
608
+ _cast_to_type_if_compatible(key, param_type, value)
609
+ else:
610
+ _cast_to_type_if_compatible(key, param_type, default)
611
+ except ValueError as e:
612
+ raise ValueError("%s. %s" % (fail_msg, e))
613
+
614
+ return getattr(self, key)
615
+
616
+ return default
617
+
618
+ def __contains__(self, key):
619
+ return key in self._hparam_types
620
+
621
+ def __str__(self):
622
+ return str(sorted(self.values().items()))
623
+
624
+ def __repr__(self):
625
+ return "%s(%s)" % (type(self).__name__, self.__str__())
626
+
627
+ @staticmethod
628
+ def _get_kind_name(param_type, is_list):
629
+ """Returns the field name given parameter type and is_list.
630
+
631
+ Args:
632
+ param_type: Data type of the hparam.
633
+ is_list: Whether this is a list.
634
+
635
+ Returns:
636
+ A string representation of the field name.
637
+
638
+ Raises:
639
+ ValueError: If parameter type is not recognized.
640
+ """
641
+ if issubclass(param_type, bool):
642
+ # This check must happen before issubclass(param_type, six.integer_types),
643
+ # since Python considers bool to be a subclass of int.
644
+ typename = "bool"
645
+ elif issubclass(param_type, six.integer_types):
646
+ # Setting 'int' and 'long' types to be 'int64' to ensure the type is
647
+ # compatible with both Python2 and Python3.
648
+ typename = "int64"
649
+ elif issubclass(param_type, (six.string_types, six.binary_type)):
650
+ # Setting 'string' and 'bytes' types to be 'bytes' to ensure the type is
651
+ # compatible with both Python2 and Python3.
652
+ typename = "bytes"
653
+ elif issubclass(param_type, float):
654
+ typename = "float"
655
+ else:
656
+ raise ValueError("Unsupported parameter type: %s" % str(param_type))
657
+
658
+ suffix = "list" if is_list else "value"
659
+ return "_".join([typename, suffix])
utils/hubert.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/svc-develop-team/so-vits-svc/blob/4.0/preprocess_hubert_f0.py
7
+
8
+ import os
9
+ import librosa
10
+ import torch
11
+ import numpy as np
12
+ from fairseq import checkpoint_utils
13
+ from tqdm import tqdm
14
+ import torch
15
+
16
+
17
+ def load_hubert_model(hps):
18
+ # Load model
19
+ ckpt_path = hps.hubert_file
20
+ print("Load Hubert Model...")
21
+
22
+ models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
23
+ [ckpt_path],
24
+ suffix="",
25
+ )
26
+ model = models[0]
27
+ model.eval()
28
+
29
+ if torch.cuda.is_available():
30
+ model = model.cuda()
31
+
32
+ return model
33
+
34
+
35
+ def get_hubert_content(hmodel, wav_16k_tensor):
36
+ feats = wav_16k_tensor
37
+ if feats.dim() == 2: # double channels
38
+ feats = feats.mean(-1)
39
+ assert feats.dim() == 1, feats.dim()
40
+ feats = feats.view(1, -1)
41
+ padding_mask = torch.BoolTensor(feats.shape).fill_(False)
42
+ inputs = {
43
+ "source": feats.to(wav_16k_tensor.device),
44
+ "padding_mask": padding_mask.to(wav_16k_tensor.device),
45
+ "output_layer": 9, # layer 9
46
+ }
47
+ with torch.no_grad():
48
+ logits = hmodel.extract_features(**inputs)
49
+ feats = hmodel.final_proj(logits[0]).squeeze(0)
50
+
51
+ return feats
52
+
53
+
54
+ def content_vector_encoder(model, audio_path, default_sampling_rate=16000):
55
+ """
56
+ # content vector default sr: 16000
57
+ """
58
+
59
+ wav16k, sr = librosa.load(audio_path, sr=default_sampling_rate)
60
+ device = next(model.parameters()).device
61
+ wav16k = torch.from_numpy(wav16k).to(device)
62
+
63
+ # (frame_len, 256)
64
+ content_feature = get_hubert_content(model, wav_16k_tensor=wav16k)
65
+
66
+ return content_feature.cpu().detach().numpy()
67
+
68
+
69
+ def repeat_expand_2d(content, target_len):
70
+ """
71
+ content : [hubert_dim(256), src_len]
72
+ target: [hubert_dim(256), target_len]
73
+ """
74
+ src_len = content.shape[-1]
75
+ target = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(
76
+ content.device
77
+ )
78
+ temp = torch.arange(src_len + 1) * target_len / src_len
79
+ current_pos = 0
80
+ for i in range(target_len):
81
+ if i < temp[current_pos + 1]:
82
+ target[:, i] = content[:, current_pos]
83
+ else:
84
+ current_pos += 1
85
+ target[:, i] = content[:, current_pos]
86
+
87
+ return target
88
+
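+ # Illustrative sketch (added comment, not in the original code): `repeat_expand_2d`
+ # performs a nearest-neighbor stretch along the time axis, e.g. aligning HuBERT
+ # features to a longer mel sequence:
+ #
+ #   content = torch.randn(256, 100)           # hypothetical (hubert_dim, src_len) features
+ #   aligned = repeat_expand_2d(content, 250)  # -> shape (256, 250)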
89
+
90
+ def get_mapped_features(raw_content_features, mapping_features):
91
+ """
92
+ Content Vector: frameshift = 20ms, hop_size = 480 in 24k
93
+
94
+ Now it's only used for mapping to bigvgan's mels (sr = 24k, hop_size = 256, frameshift ~= 10.7 ms)
95
+ """
96
+ source_hop = 480
97
+ target_hop = 256
98
+
99
+ factor = np.gcd(source_hop, target_hop)
100
+ source_hop //= factor
101
+ target_hop //= factor
102
+ print(
103
+ "Mapping source's {} frames => target's {} frames".format(
104
+ target_hop, source_hop
105
+ )
106
+ )
107
+
108
+ results = []
109
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
110
+ # mappping_feat: (mels_frame_len, n_mels)
111
+ target_len = len(mapping_feat)
112
+
113
+ # (source_len, 256)
114
+ raw_feats = raw_content_features[index][0].cpu().numpy().T
115
+ source_len, width = raw_feats.shape
116
+
117
+ # const ~= target_len * target_hop
118
+ const = source_len * source_hop // target_hop * target_hop
119
+
120
+ # (source_len * source_hop, dim)
121
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
122
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
123
+ down_sampling_feats = np.average(
124
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
125
+ )
126
+
127
+ err = abs(target_len - len(down_sampling_feats))
128
+ if err > 3:
129
+ print("index:", index)
130
+ print("mels:", mapping_feat.shape)
131
+ print("raw content vector:", raw_feats.shape)
132
+ print("up_sampling:", up_sampling_feats.shape)
133
+ print("down_sampling_feats:", down_sampling_feats.shape)
134
+ exit()
135
+ if len(down_sampling_feats) < target_len:
136
+ # (1, dim) -> (err, dim)
137
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
138
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
139
+
140
+ # (target_len, dim)
141
+ feats = down_sampling_feats[:target_len]
142
+ results.append(feats)
143
+
144
+ return results
145
+
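+ # Worked example (added comment; the numbers follow from the code above): with source_hop=480
+ # and target_hop=256 the gcd is 32, so the hops reduce to 15 and 8. Each source frame is
+ # repeated 15 times and the repeated sequence is averaged in blocks of 8, so every 8 source
+ # frames map to 15 target frames (8 * 480 samples == 15 * 256 samples). Any residual length
+ # mismatch of a few frames is patched by repeating the last frame, as implemented above.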
146
+
147
+ def extract_hubert_features_of_dataset(datasets, model, out_dir):
148
+ for utt in tqdm(datasets):
149
+ uid = utt["Uid"]
150
+ audio_path = utt["Path"]
151
+
152
+ content_vector_feature = content_vector_encoder(model, audio_path) # (T, 256)
153
+
154
+ save_path = os.path.join(out_dir, uid + ".npy")
155
+ np.save(save_path, content_vector_feature)
utils/io.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import os
7
+ import numpy as np
8
+ import torch
9
+ import torchaudio
10
+
11
+
12
+ def save_feature(process_dir, feature_dir, item, feature, overrides=True):
13
+ """Save features to path
14
+
15
+ Args:
16
+ process_dir (str): directory to store features
17
+ feature_dir (str): directory to store one type of features (mel, energy, ...)
18
+ item (str): uid
19
+ feature (tensor): feature tensor
20
+ overrides (bool, optional): whether to override existing files. Defaults to True.
21
+ """
22
+ process_dir = os.path.join(process_dir, feature_dir)
23
+ os.makedirs(process_dir, exist_ok=True)
24
+ out_path = os.path.join(process_dir, item + ".npy")
25
+
26
+ if os.path.exists(out_path):
27
+ if overrides:
28
+ np.save(out_path, feature)
29
+ else:
30
+ np.save(out_path, feature)
31
+
32
+
33
+ def save_txt(process_dir, feature_dir, item, feature, overrides=True):
34
+ process_dir = os.path.join(process_dir, feature_dir)
35
+ os.makedirs(process_dir, exist_ok=True)
36
+ out_path = os.path.join(process_dir, item + ".txt")
37
+
38
+ if os.path.exists(out_path):
39
+ if overrides:
40
+ f = open(out_path, "w")
41
+ f.writelines(feature)
42
+ f.close()
43
+ else:
44
+ f = open(out_path, "w")
45
+ f.writelines(feature)
46
+ f.close()
47
+
48
+
49
+ def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
50
+ if turn_up:
51
+ # continue to turn up to volume_peak
52
+ ratio = volume_peak / max(waveform.max(), abs(waveform.min()))
53
+ waveform = waveform * ratio
54
+
55
+ if add_silence:
56
+ silence_len = fs // 20
57
+ silence = np.zeros((silence_len,), dtype=waveform.dtype)
58
+ result = np.concatenate([silence, waveform, silence])
59
+ waveform = result
60
+
61
+ waveform = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")
62
+ if len(waveform.size()) == 1:
63
+ waveform = waveform[None, :]
64
+ elif waveform.size(0) != 1:
65
+ # Stereo to mono
66
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
67
+ torchaudio.save(path, waveform, fs, encoding="PCM_S", bits_per_sample=16)
68
+
69
+
70
+ async def async_load_audio(path, sample_rate: int = 24000):
71
+ r"""
72
+ Args:
73
+ path: The source loading path.
74
+ sample_rate: The target sample rate, will automatically resample if necessary.
75
+
76
+ Returns:
77
+ waveform: The waveform object. Should be [1 x sequence_len].
78
+ """
79
+
80
+ async def use_torchaudio_load(path):
81
+ return torchaudio.load(path)
82
+
83
+ waveform, sr = await use_torchaudio_load(path)
84
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
85
+
86
+ if sr != sample_rate:
87
+ waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
88
+
89
+ if torch.any(torch.isnan(waveform)) or torch.any(torch.isinf(waveform)):
90
+ raise ValueError("NaN or Inf found in waveform.")
91
+ return waveform
92
+
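+ # Usage sketch (added comment; the path is hypothetical): the coroutine can be driven with
+ # asyncio, e.g.
+ #
+ #   import asyncio
+ #   wav = asyncio.run(async_load_audio("/path/to/audio.wav", sample_rate=24000))
+ #   # wav: torch.Tensor of shape [1, sequence_len]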
93
+
94
+ async def async_save_audio(
95
+ path,
96
+ waveform,
97
+ sample_rate: int = 24000,
98
+ add_silence: bool = False,
99
+ volume_peak: float = 0.9,
100
+ ):
101
+ r"""
102
+ Args:
103
+ path: The target saving path.
104
+ waveform: The waveform object. Should be [n_channel x sequence_len].
105
+ sample_rate: Sample rate.
106
+ add_silence: If ``true``, concat 0.05s silence to beginning and end.
107
+ volume_peak: Turn up volume for larger number, vice versa.
108
+ """
109
+
110
+ async def use_torchaudio_save(path, waveform, sample_rate):
111
+ torchaudio.save(
112
+ path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
113
+ )
114
+
115
+ waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)
116
+ shape = waveform.size()[:-1]
117
+
118
+ ratio = abs(volume_peak) / max(waveform.max(), abs(waveform.min()))
119
+ waveform = waveform * ratio
120
+
121
+ if add_silence:
122
+ silence_len = sample_rate // 20
123
+ silence = torch.zeros((*shape, silence_len), dtype=waveform.dtype)
124
+ waveform = torch.concatenate((silence, waveform, silence), dim=-1)
125
+
126
+ if waveform.dim() == 1:
127
+ waveform = waveform[None]
128
+
129
+ await use_torchaudio_save(path, waveform, sample_rate)
130
+
131
+
132
+ def load_mel_extrema(cfg, dataset_name, split):
133
+ dataset_dir = os.path.join(
134
+ cfg.OUTPUT_PATH,
135
+ "preprocess/{}_version".format(cfg.data.process_version),
136
+ dataset_name,
137
+ )
138
+
139
+ min_file = os.path.join(
140
+ dataset_dir,
141
+ "mel_min_max",
142
+ split.split("_")[-1],
143
+ "mel_min.npy",
144
+ )
145
+ max_file = os.path.join(
146
+ dataset_dir,
147
+ "mel_min_max",
148
+ split.split("_")[-1],
149
+ "mel_max.npy",
150
+ )
151
+ mel_min = np.load(min_file)
152
+ mel_max = np.load(max_file)
153
+ return mel_min, mel_max
utils/io_optim.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torchaudio
8
+ import json
9
+ import os
10
+ import numpy as np
11
+ import librosa
12
+ from torch.nn.utils.rnn import pad_sequence
13
+ from modules import whisper_extractor as whisper
14
+
15
+
16
+ class TorchaudioDataset(torch.utils.data.Dataset):
17
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
18
+ """
19
+ Args:
20
+ cfg: config
21
+ dataset: dataset name
22
+
23
+ """
24
+ assert isinstance(dataset, str)
25
+
26
+ self.sr = sr
27
+ self.cfg = cfg
28
+
29
+ if metadata is None:
30
+ self.train_metadata_path = os.path.join(
31
+ cfg.preprocess.processed_dir, dataset, cfg.preprocess.train_file
32
+ )
33
+ self.valid_metadata_path = os.path.join(
34
+ cfg.preprocess.processed_dir, dataset, cfg.preprocess.valid_file
35
+ )
36
+ self.metadata = self.get_metadata()
37
+ else:
38
+ self.metadata = metadata
39
+
40
+ if accelerator is not None:
41
+ self.device = accelerator.device
42
+ elif torch.cuda.is_available():
43
+ self.device = torch.device("cuda")
44
+ else:
45
+ self.device = torch.device("cpu")
46
+
47
+ def get_metadata(self):
48
+ metadata = []
49
+ with open(self.train_metadata_path, "r", encoding="utf-8") as t:
50
+ metadata.extend(json.load(t))
51
+ with open(self.valid_metadata_path, "r", encoding="utf-8") as v:
52
+ metadata.extend(json.load(v))
53
+ return metadata
54
+
55
+ def __len__(self):
56
+ return len(self.metadata)
57
+
58
+ def __getitem__(self, index):
59
+ utt_info = self.metadata[index]
60
+ wav_path = utt_info["Path"]
61
+
62
+ wav, sr = torchaudio.load(wav_path)
63
+
64
+ # resample
65
+ if sr != self.sr:
66
+ wav = torchaudio.functional.resample(wav, sr, self.sr)
67
+ # downmixing
68
+ if wav.shape[0] > 1:
69
+ wav = torch.mean(wav, dim=0, keepdim=True)
70
+ assert wav.shape[0] == 1
71
+ wav = wav.squeeze(0)
72
+ # record the length of wav without padding
73
+ length = wav.shape[0]
74
+ # wav: (T)
75
+ return utt_info, wav, length
76
+
77
+
78
+ class LibrosaDataset(TorchaudioDataset):
79
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
80
+ super().__init__(cfg, dataset, sr, accelerator, metadata)
81
+
82
+ def __getitem__(self, index):
83
+ utt_info = self.metadata[index]
84
+ wav_path = utt_info["Path"]
85
+
86
+ wav, _ = librosa.load(wav_path, sr=self.sr)
87
+ # wav: (T)
88
+ wav = torch.from_numpy(wav)
89
+
90
+ # record the length of wav without padding
91
+ length = wav.shape[0]
92
+ return utt_info, wav, length
93
+
94
+
95
+ class FFmpegDataset(TorchaudioDataset):
96
+ def __init__(self, cfg, dataset, sr, accelerator=None, metadata=None):
97
+ super().__init__(cfg, dataset, sr, accelerator, metadata)
98
+
99
+ def __getitem__(self, index):
100
+ utt_info = self.metadata[index]
101
+ wav_path = utt_info["Path"]
102
+
103
+ # wav: (T,)
104
+ wav = whisper.load_audio(wav_path) # sr = 16000
105
+ # convert to torch tensor
106
+ wav = torch.from_numpy(wav)
107
+ # record the length of wav without padding
108
+ length = wav.shape[0]
109
+
110
+ return utt_info, wav, length
111
+
112
+
113
+ def collate_batch(batch_list):
114
+ """
115
+ Args:
116
+ batch_list: list of (metadata, wav, length)
117
+ """
118
+ metadata = [item[0] for item in batch_list]
119
+ # wavs: (B, T)
120
+ wavs = pad_sequence([item[1] for item in batch_list], batch_first=True)
121
+ lens = [item[2] for item in batch_list]
122
+
123
+ return metadata, wavs, lens
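+
+
+ # Usage sketch (added comment; the dataset name and batch size are hypothetical):
+ # `collate_batch` is intended as the `collate_fn` of a DataLoader built on one of the
+ # datasets above, e.g.
+ #
+ #   dataset = TorchaudioDataset(cfg, "ljspeech", sr=24000)
+ #   loader = torch.utils.data.DataLoader(
+ #       dataset, batch_size=16, collate_fn=collate_batch, num_workers=4
+ #   )
+ #   for metadata, wavs, lens in loader:
+ #       ...  # wavs: (B, T_max) zero-padded waveforms, lens: original lengths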
utils/mel.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ from librosa.filters import mel as librosa_mel_fn
8
+
9
+
10
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
11
+ return torch.log(torch.clamp(x, min=clip_val) * C)
12
+
13
+
14
+ def spectral_normalize_torch(magnitudes):
15
+ output = dynamic_range_compression_torch(magnitudes)
16
+ return output
17
+
18
+
19
+ def extract_linear_features(y, cfg, center=False):
20
+ if torch.min(y) < -1.0:
21
+ print("min value is ", torch.min(y))
22
+ if torch.max(y) > 1.0:
23
+ print("max value is ", torch.max(y))
24
+
25
+ global hann_window
26
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
27
+
28
+ y = torch.nn.functional.pad(
29
+ y.unsqueeze(1),
30
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
31
+ mode="reflect",
32
+ )
33
+ y = y.squeeze(1)
34
+
35
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
36
+ spec = torch.stft(
37
+ y,
38
+ cfg.n_fft,
39
+ hop_length=cfg.hop_size,
40
+ win_length=cfg.win_size,
41
+ window=hann_window[str(y.device)],
42
+ center=center,
43
+ pad_mode="reflect",
44
+ normalized=False,
45
+ onesided=True,
46
+ return_complex=True,
47
+ )
48
+ spec = torch.view_as_real(spec)
49
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
50
+ spec = torch.squeeze(spec, 0)
51
+ return spec
52
+
53
+
54
+ def mel_spectrogram_torch(y, cfg, center=False):
55
+ if torch.min(y) < -1.0:
56
+ print("min value is ", torch.min(y))
57
+ if torch.max(y) > 1.0:
58
+ print("max value is ", torch.max(y))
59
+
60
+ global mel_basis, hann_window
61
+ if cfg.fmax not in mel_basis:
62
+ mel = librosa_mel_fn(
63
+ sr=cfg.sample_rate,
64
+ n_fft=cfg.n_fft,
65
+ n_mels=cfg.n_mel,
66
+ fmin=cfg.fmin,
67
+ fmax=cfg.fmax,
68
+ )
69
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
70
+ torch.from_numpy(mel).float().to(y.device)
71
+ )
72
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
73
+
74
+ y = torch.nn.functional.pad(
75
+ y.unsqueeze(1),
76
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
77
+ mode="reflect",
78
+ )
79
+ y = y.squeeze(1)
80
+
81
+ spec = torch.stft(
82
+ y,
83
+ cfg.n_fft,
84
+ hop_length=cfg.hop_size,
85
+ win_length=cfg.win_size,
86
+ window=hann_window[str(y.device)],
87
+ center=center,
88
+ pad_mode="reflect",
89
+ normalized=False,
90
+ onesided=True,
91
+ return_complex=True,
92
+ )
93
+
94
+ spec = torch.view_as_real(spec)
95
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
96
+
97
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
98
+ spec = spectral_normalize_torch(spec)
99
+
100
+ return spec
101
+
102
+
103
+ mel_basis = {}
104
+ hann_window = {}
105
+
106
+
107
+ def extract_mel_features(
108
+ y,
109
+ cfg,
110
+ center=False
111
+ # n_fft, n_mel, sampling_rate, hop_size, win_size, fmin, fmax, center=False
112
+ ):
113
+ """Extract mel features
114
+
115
+ Args:
116
+ y (tensor): audio data in tensor
117
+ cfg (dict): configuration in cfg.preprocess
118
+ center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.
119
+
120
+ Returns:
121
+ tensor: a tensor containing the mel feature calculated based on STFT result
122
+ """
123
+ if torch.min(y) < -1.0:
124
+ print("min value is ", torch.min(y))
125
+ if torch.max(y) > 1.0:
126
+ print("max value is ", torch.max(y))
127
+
128
+ global mel_basis, hann_window
129
+ if cfg.fmax not in mel_basis:
130
+ mel = librosa_mel_fn(
131
+ sr=cfg.sample_rate,
132
+ n_fft=cfg.n_fft,
133
+ n_mels=cfg.n_mel,
134
+ fmin=cfg.fmin,
135
+ fmax=cfg.fmax,
136
+ )
137
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
138
+ torch.from_numpy(mel).float().to(y.device)
139
+ )
140
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
141
+
142
+ y = torch.nn.functional.pad(
143
+ y.unsqueeze(1),
144
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
145
+ mode="reflect",
146
+ )
147
+ y = y.squeeze(1)
148
+
149
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
150
+ spec = torch.stft(
151
+ y,
152
+ cfg.n_fft,
153
+ hop_length=cfg.hop_size,
154
+ win_length=cfg.win_size,
155
+ window=hann_window[str(y.device)],
156
+ center=center,
157
+ pad_mode="reflect",
158
+ normalized=False,
159
+ onesided=True,
160
+ return_complex=True,
161
+ )
162
+ spec = torch.view_as_real(spec)
163
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
164
+
165
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
166
+ spec = spectral_normalize_torch(spec)
167
+
168
+ return spec.squeeze(0)
169
+
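+ # Configuration sketch (added comment; the values below are common 24 kHz settings, not the
+ # project defaults): `extract_mel_features` reads these fields from `cfg` (typically
+ # `cfg.preprocess`):
+ #
+ #   cfg.sample_rate = 24000
+ #   cfg.n_fft = 1024
+ #   cfg.win_size = 1024
+ #   cfg.hop_size = 256
+ #   cfg.n_mel = 100
+ #   cfg.fmin = 0
+ #   cfg.fmax = 12000
+ #
+ # For a (1, T) waveform in [-1, 1] the returned tensor has shape (n_mel, frames).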
170
+
171
+ def extract_mel_features_tts(
172
+ y,
173
+ cfg,
174
+ center=False,
175
+ taco=False,
176
+ _stft=None,
177
+ ):
178
+ """Extract mel features
179
+
180
+ Args:
181
+ y (tensor): audio data in tensor
182
+ cfg (dict): configuration in cfg.preprocess
183
+ center (bool, optional): In STFT, whether t-th frame is centered at time t*hop_length. Defaults to False.
184
+ taco: use tacotron mel
185
+
186
+ Returns:
187
+ tensor: a tensor containing the mel feature calculated based on STFT result
188
+ """
189
+ if not taco:
190
+ if torch.min(y) < -1.0:
191
+ print("min value is ", torch.min(y))
192
+ if torch.max(y) > 1.0:
193
+ print("max value is ", torch.max(y))
194
+
195
+ global mel_basis, hann_window
196
+ if cfg.fmax not in mel_basis:
197
+ mel = librosa_mel_fn(
198
+ sr=cfg.sample_rate,
199
+ n_fft=cfg.n_fft,
200
+ n_mels=cfg.n_mel,
201
+ fmin=cfg.fmin,
202
+ fmax=cfg.fmax,
203
+ )
204
+ mel_basis[str(cfg.fmax) + "_" + str(y.device)] = (
205
+ torch.from_numpy(mel).float().to(y.device)
206
+ )
207
+ hann_window[str(y.device)] = torch.hann_window(cfg.win_size).to(y.device)
208
+
209
+ y = torch.nn.functional.pad(
210
+ y.unsqueeze(1),
211
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
212
+ mode="reflect",
213
+ )
214
+ y = y.squeeze(1)
215
+
216
+ # complex tensor as default, then use view_as_real for future pytorch compatibility
217
+ spec = torch.stft(
218
+ y,
219
+ cfg.n_fft,
220
+ hop_length=cfg.hop_size,
221
+ win_length=cfg.win_size,
222
+ window=hann_window[str(y.device)],
223
+ center=center,
224
+ pad_mode="reflect",
225
+ normalized=False,
226
+ onesided=True,
227
+ return_complex=True,
228
+ )
229
+ spec = torch.view_as_real(spec)
230
+ spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
231
+
232
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
233
+ spec = spectral_normalize_torch(spec)
234
+ spec = spec.squeeze(0)
235
+ else:
236
+ audio = torch.clip(y, -1, 1)
237
+ audio = torch.autograd.Variable(audio, requires_grad=False)
238
+ spec, energy = _stft.mel_spectrogram(audio)
239
+ spec = torch.squeeze(spec, 0)
240
+
241
+ spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
242
+ spec = spectral_normalize_torch(spec)
243
+
244
+ return spec.squeeze(0)
245
+
246
+
247
+ def amplitude_phase_spectrum(y, cfg):
248
+ hann_window = torch.hann_window(cfg.win_size).to(y.device)
249
+
250
+ y = torch.nn.functional.pad(
251
+ y.unsqueeze(1),
252
+ (int((cfg.n_fft - cfg.hop_size) / 2), int((cfg.n_fft - cfg.hop_size) / 2)),
253
+ mode="reflect",
254
+ )
255
+ y = y.squeeze(1)
256
+
257
+ stft_spec = torch.stft(
258
+ y,
259
+ cfg.n_fft,
260
+ hop_length=cfg.hop_size,
261
+ win_length=cfg.win_size,
262
+ window=hann_window,
263
+ center=False,
264
+ return_complex=True,
265
+ )
266
+
267
+ stft_spec = torch.view_as_real(stft_spec)
268
+ if stft_spec.size()[0] == 1:
269
+ stft_spec = stft_spec.squeeze(0)
270
+
271
+ if len(list(stft_spec.size())) == 4:
272
+ rea = stft_spec[:, :, :, 0] # [batch_size, n_fft//2+1, frames]
273
+ imag = stft_spec[:, :, :, 1] # [batch_size, n_fft//2+1, frames]
274
+ else:
275
+ rea = stft_spec[:, :, 0] # [n_fft//2+1, frames]
276
+ imag = stft_spec[:, :, 1] # [n_fft//2+1, frames]
277
+
278
+ log_amplitude = torch.log(
279
+ torch.abs(torch.sqrt(torch.pow(rea, 2) + torch.pow(imag, 2))) + 1e-5
280
+ ) # [n_fft//2+1, frames]
281
+ phase = torch.atan2(imag, rea) # [n_fft//2+1, frames]
282
+
283
+ return log_amplitude, phase, rea, imag
utils/mert.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://huggingface.co/m-a-p/MERT-v1-330M
7
+
8
+ import torch
9
+ from tqdm import tqdm
10
+ import numpy as np
11
+
12
+ from transformers import Wav2Vec2FeatureExtractor
13
+ from transformers import AutoModel
14
+ import torchaudio
15
+ import torchaudio.transforms as T
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
+
19
+ def mert_encoder(model, processor, audio_path, hps):
20
+ """
21
+ # mert default sr: 24000
22
+ """
23
+ with torch.no_grad():
24
+ resample_rate = processor.sampling_rate
25
+ device = next(model.parameters()).device
26
+
27
+ input_audio, sampling_rate = torchaudio.load(audio_path)
28
+ input_audio = input_audio.squeeze()
29
+
30
+ if sampling_rate != resample_rate:
31
+ resampler = T.Resample(sampling_rate, resample_rate)
32
+ input_audio = resampler(input_audio)
33
+
34
+ inputs = processor(
35
+ input_audio, sampling_rate=resample_rate, return_tensors="pt"
36
+ ).to(
37
+ device
38
+ ) # {input_values: tensor, attention_mask: tensor}
39
+
40
+ outputs = model(**inputs, output_hidden_states=True) # list: len is 25
41
+
42
+ # [25 layer, Time steps, 1024 feature_dim]
43
+ # all_layer_hidden_states = torch.stack(outputs.hidden_states).squeeze()
44
+ # mert_features.append(all_layer_hidden_states)
45
+
46
+ feature = outputs.hidden_states[
47
+ hps.mert_feature_layer
48
+ ].squeeze() # [1, frame len, 1024] -> [frame len, 1024]
49
+
50
+ return feature.cpu().detach().numpy()
51
+
52
+
53
+ def mert_features_normalization(raw_mert_features):
54
+ normalized_mert_features = list()
55
+
56
+ mert_features = np.array(raw_mert_features)
57
+ scaler = StandardScaler().fit(mert_features)
58
+ for raw_mert_feature in raw_mert_features:
59
+ normalized_mert_feature = scaler.transform(raw_mert_feature)
60
+ normalized_mert_features.append(normalized_mert_feature)
61
+ return normalized_mert_features
62
+
63
+
64
+ def get_mapped_mert_features(raw_mert_features, mapping_features, fast_mapping=True):
65
+ source_hop = 320
66
+ target_hop = 256
67
+
68
+ factor = np.gcd(source_hop, target_hop)
69
+ source_hop //= factor
70
+ target_hop //= factor
71
+ print(
72
+ "Mapping source's {} frames => target's {} frames".format(
73
+ target_hop, source_hop
74
+ )
75
+ )
76
+
77
+ mert_features = []
78
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
79
+ # mapping_feat: (mels_frame_len, n_mels)
80
+ target_len = mapping_feat.shape[0]
81
+
82
+ # (frame_len, 1024)
83
+ raw_feats = raw_mert_features[index].cpu().numpy()
84
+ source_len, width = raw_feats.shape
85
+
86
+ # const ~= target_len * target_hop
87
+ const = source_len * source_hop // target_hop * target_hop
88
+
89
+ # (source_len * source_hop, dim)
90
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
91
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
92
+ down_sampling_feats = np.average(
93
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
94
+ )
95
+
96
+ err = abs(target_len - len(down_sampling_feats))
97
+ if err > 3:
98
+ print("index:", index)
99
+ print("mels:", mapping_feat.shape)
100
+ print("raw mert vector:", raw_feats.shape)
101
+ print("up_sampling:", up_sampling_feats.shape)
102
+ print("const:", const)
103
+ print("down_sampling_feats:", down_sampling_feats.shape)
104
+ exit()
105
+ if len(down_sampling_feats) < target_len:
106
+ # (1, dim) -> (err, dim)
107
+ end = down_sampling_feats[-1][None, :].repeat(err, axis=0)
108
+ down_sampling_feats = np.concatenate([down_sampling_feats, end], axis=0)
109
+
110
+ # (target_len, dim)
111
+ feats = down_sampling_feats[:target_len]
112
+ mert_features.append(feats)
113
+
114
+ return mert_features
115
+
116
+
117
+ def load_mert_model(hps):
118
+ print("Loading MERT Model: ", hps.mert_model)
119
+
120
+ # Load model
121
+ model_name = hps.mert_model
122
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
123
+
124
+ if torch.cuda.is_available():
125
+ model = model.cuda()
126
+
127
+ # model = model.eval()
128
+
129
+ preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(
130
+ model_name, trust_remote_code=True
131
+ )
132
+ return model, preprocessor
133
+
134
+
135
+ # loading the corresponding preprocessor config
136
+ # def load_preprocessor (model_name="m-a-p/MERT-v1-330M"):
137
+ # print('load_preprocessor...')
138
+ # preprocessor = Wav2Vec2FeatureExtractor.from_pretrained(model_name,trust_remote_code=True)
139
+ # return preprocessor
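+
+
+ # Usage sketch (added comment; `hps` is assumed to provide `mert_model`, e.g.
+ # "m-a-p/MERT-v1-330M", and `mert_feature_layer`):
+ #
+ #   model, processor = load_mert_model(hps)
+ #   feature = mert_encoder(model, processor, "/path/to/audio.wav", hps)  # (frame_len, 1024)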
utils/model_summary.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import humanfriendly
7
+ import numpy as np
8
+ import torch
9
+
10
+
11
+ def get_human_readable_count(number: int) -> str:
12
+ """Return human_readable_count
13
+
14
+ Originated from:
15
+ https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/core/memory.py
16
+
17
+ Abbreviates an integer number with K, M, B, T for thousands, millions,
18
+ billions and trillions, respectively.
19
+ Examples:
20
+ >>> get_human_readable_count(123)
21
+ '123 '
22
+ >>> get_human_readable_count(1234) # (one thousand)
23
+ '1 K'
24
+ >>> get_human_readable_count(2e6) # (two million)
25
+ '2 M'
26
+ >>> get_human_readable_count(3e9) # (three billion)
27
+ '3 B'
28
+ >>> get_human_readable_count(4e12) # (four trillion)
29
+ '4 T'
30
+ >>> get_human_readable_count(5e15) # (more than trillion)
31
+ '5,000 T'
32
+ Args:
33
+ number: a positive integer number
34
+ Return:
35
+ A string formatted according to the pattern described above.
36
+ """
37
+ assert number >= 0
38
+ labels = [" ", "K", "M", "B", "T"]
39
+ num_digits = int(np.floor(np.log10(number)) + 1 if number > 0 else 1)
40
+ num_groups = int(np.ceil(num_digits / 3))
41
+ num_groups = min(num_groups, len(labels))
42
+ shift = -3 * (num_groups - 1)
43
+ number = number * (10**shift)
44
+ index = num_groups - 1
45
+ return f"{number:.2f} {labels[index]}"
46
+
47
+
48
+ def to_bytes(dtype) -> int:
49
+ return int(str(dtype)[-2:]) // 8
50
+
51
+
52
+ def model_summary(model: torch.nn.Module) -> str:
53
+ message = "Model structure:\n"
54
+ message += str(model)
55
+ tot_params = sum(p.numel() for p in model.parameters())
56
+ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
57
+ percent_trainable = "{:.1f}".format(num_params * 100.0 / tot_params)
58
+ tot_params = get_human_readable_count(tot_params)
59
+ num_params = get_human_readable_count(num_params)
60
+ message += "\n\nModel summary:\n"
61
+ message += f" Class Name: {model.__class__.__name__}\n"
62
+ message += f" Total Number of model parameters: {tot_params}\n"
63
+ message += (
64
+ f" Number of trainable parameters: {num_params} ({percent_trainable}%)\n"
65
+ )
66
+ num_bytes = humanfriendly.format_size(
67
+ sum(
68
+ p.numel() * to_bytes(p.dtype) for p in model.parameters() if p.requires_grad
69
+ )
70
+ )
71
+ message += f" Size: {num_bytes}\n"
72
+ dtype = next(iter(model.parameters())).dtype
73
+ message += f" Type: {dtype}"
74
+ return message
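+
+
+ # Usage sketch (added comment; the module below is hypothetical):
+ #
+ #   model = torch.nn.Linear(128, 64)
+ #   print(model_summary(model))
+ #   # prints the structure, "Total Number of model parameters: 8.26 K", size and dtype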
utils/prompt_preparer.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+ class PromptPreparer:
9
+ def prepare_prompts(self, y, y_lens, codes, nar_stage, y_prompts_codes):
10
+ if self.prefix_mode == 0:
11
+ y_emb, prefix_len = self._handle_prefix_mode_0(y, codes, nar_stage)
12
+ elif self.prefix_mode == 1:
13
+ y_emb, prefix_len = self._handle_prefix_mode_1(y, y_lens, codes, nar_stage)
14
+ elif self.prefix_mode in [2, 4]:
15
+ y_emb, prefix_len = self._handle_prefix_mode_2_4(y, y_lens, codes, nar_stage, y_prompts_codes)
16
+ else:
17
+ raise ValueError("Invalid prefix mode")
18
+
19
+ return y_emb, prefix_len
20
+
21
+ def _handle_prefix_mode_0(self, y, codes, nar_stage):
22
+ prefix_len = 0
23
+ y_emb = self.nar_audio_embeddings[0](y)
24
+ for j in range(1, nar_stage):
25
+ y_emb = y_emb + self.nar_audio_embeddings[j](codes[..., j])
26
+ return y_emb, 0
27
+
28
+ def _handle_prefix_mode_1(self, y, y_lens, codes, nar_stage):
29
+ int_low = (0.25 * y_lens.min()).type(torch.int64).item()
30
+ prefix_len = torch.randint(int_low, int_low * 2, size=()).item()
31
+ prefix_len = min(prefix_len, 225)
32
+
33
+ y_prompts = self.nar_audio_embeddings[0](y[:, :prefix_len])
34
+ y_emb = self.nar_audio_embeddings[0](y[:, prefix_len:])
35
+ for j in range(1, self.num_quantizers):
36
+ y_prompts += self.nar_audio_embeddings[j](
37
+ codes[:, :prefix_len, j]
38
+ )
39
+ if j < nar_stage:
40
+ y_emb += self.nar_audio_embeddings[j](
41
+ codes[:, prefix_len:, j]
42
+ )
43
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
44
+ return y_emb, prefix_len
45
+
46
+ def _handle_prefix_mode_2_4(self, y, y_lens, codes, nar_stage, y_prompts_codes):
47
+ if self.prefix_mode == 2:
48
+ prefix_len = min(225, int(0.25 * y_lens.min().item()))
49
+
50
+ y_prompts_codes = []
51
+ for b in range(codes.shape[0]):
52
+ start = self.rng.randint(0, y_lens[b].item() - prefix_len)
53
+ y_prompts_codes.append(
54
+ torch.clone(codes[b, start : start + prefix_len])
55
+ )
56
+ codes[
57
+ b, start : start + prefix_len, nar_stage
58
+ ] = self.audio_token_num
59
+ y_prompts_codes = torch.stack(y_prompts_codes, dim=0)
60
+ else:
61
+ prefix_len = y_prompts_codes.shape[1]
62
+
63
+ y_prompts = self.nar_audio_embeddings[0](y_prompts_codes[..., 0])
64
+ y_emb = self.nar_audio_embeddings[0](y)
65
+ for j in range(1, self.num_quantizers):
66
+ y_prompts += self.nar_audio_embeddings[j](
67
+ y_prompts_codes[..., j]
68
+ )
69
+ if j < nar_stage:
70
+ y_emb += self.nar_audio_embeddings[j](codes[..., j])
71
+ y_emb = torch.concat([y_prompts, y_emb], axis=1)
72
+
73
+ return y_emb, prefix_len
utils/ssim.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from https://github.com/Po-Hsun-Su/pytorch-ssim
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch.autograd import Variable
11
+ from math import exp
12
+
13
+
14
+ def gaussian(window_size, sigma):
15
+ gauss = torch.Tensor(
16
+ [
17
+ exp(-((x - window_size // 2) ** 2) / float(2 * sigma**2))
18
+ for x in range(window_size)
19
+ ]
20
+ )
21
+ return gauss / gauss.sum()
22
+
23
+
24
+ def create_window(window_size, channel):
25
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
26
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
27
+ window = Variable(
28
+ _2D_window.expand(channel, 1, window_size, window_size).contiguous()
29
+ )
30
+ return window
31
+
32
+
33
+ def _ssim(img1, img2, window, window_size, channel, size_average=True):
34
+ mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel)
35
+ mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel)
36
+
37
+ mu1_sq = mu1.pow(2)
38
+ mu2_sq = mu2.pow(2)
39
+ mu1_mu2 = mu1 * mu2
40
+
41
+ sigma1_sq = (
42
+ F.conv2d(img1 * img1, window, padding=window_size // 2, groups=channel) - mu1_sq
43
+ )
44
+ sigma2_sq = (
45
+ F.conv2d(img2 * img2, window, padding=window_size // 2, groups=channel) - mu2_sq
46
+ )
47
+ sigma12 = (
48
+ F.conv2d(img1 * img2, window, padding=window_size // 2, groups=channel)
49
+ - mu1_mu2
50
+ )
51
+
52
+ C1 = 0.01**2
53
+ C2 = 0.03**2
54
+
55
+ ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / (
56
+ (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
57
+ )
58
+
59
+ if size_average:
60
+ return ssim_map.mean()
61
+ else:
62
+ return ssim_map.mean(1)
63
+
64
+
65
+ class SSIM(torch.nn.Module):
66
+ def __init__(self, window_size=11, size_average=True):
67
+ super(SSIM, self).__init__()
68
+ self.window_size = window_size
69
+ self.size_average = size_average
70
+ self.channel = 1
71
+ self.window = create_window(window_size, self.channel)
72
+
73
+ def forward(self, fake, real, bias=6.0):
74
+ fake = fake[:, None, :, :] + bias # [B, 1, T, n_mels]
75
+ real = real[:, None, :, :] + bias # [B, 1, T, n_mels]
76
+ self.window = self.window.to(dtype=fake.dtype, device=fake.device)
77
+ loss = 1 - _ssim(
78
+ fake, real, self.window, self.window_size, self.channel, self.size_average
79
+ )
80
+ return loss
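+
+
+ # Usage sketch (added comment; shapes and tensors are hypothetical): inputs are mel-like
+ # batches of shape [B, T, n_mels], and the module returns 1 - SSIM as a loss:
+ #
+ #   criterion = SSIM()
+ #   fake = torch.rand(4, 200, 80)   # predicted mel
+ #   real = torch.rand(4, 200, 80)   # ground-truth mel
+ #   loss = criterion(fake, real)    # scalar tensor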
utils/stft.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from scipy.signal import get_window
10
+ from librosa.util import pad_center, tiny
11
+ from librosa.filters import mel as librosa_mel_fn
12
+
13
+ import torch
14
+ import numpy as np
15
+ import librosa.util as librosa_util
16
+ from scipy.signal import get_window
17
+
18
+
19
+ def window_sumsquare(
20
+ window,
21
+ n_frames,
22
+ hop_length,
23
+ win_length,
24
+ n_fft,
25
+ dtype=np.float32,
26
+ norm=None,
27
+ ):
28
+ """
29
+ # from librosa 0.6
30
+ Compute the sum-square envelope of a window function at a given hop length.
31
+
32
+ This is used to estimate modulation effects induced by windowing
33
+ observations in short-time fourier transforms.
34
+
35
+ Parameters
36
+ ----------
37
+ window : string, tuple, number, callable, or list-like
38
+ Window specification, as in `get_window`
39
+
40
+ n_frames : int > 0
41
+ The number of analysis frames
42
+
43
+ hop_length : int > 0
44
+ The number of samples to advance between frames
45
+
46
+ win_length : [optional]
47
+ The length of the window function. By default, this matches `n_fft`.
48
+
49
+ n_fft : int > 0
50
+ The length of each analysis frame.
51
+
52
+ dtype : np.dtype
53
+ The data type of the output
54
+
55
+ Returns
56
+ -------
57
+ wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
58
+ The sum-squared envelope of the window function
59
+ """
60
+ if win_length is None:
61
+ win_length = n_fft
62
+
63
+ n = n_fft + hop_length * (n_frames - 1)
64
+ x = np.zeros(n, dtype=dtype)
65
+
66
+ # Compute the squared window at the desired length
67
+ win_sq = get_window(window, win_length, fftbins=True)
68
+ win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
69
+ win_sq = librosa_util.pad_center(win_sq, n_fft)
70
+
71
+ # Fill the envelope
72
+ for i in range(n_frames):
73
+ sample = i * hop_length
74
+ x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))]
75
+ return x
76
+
77
+
78
+ def griffin_lim(magnitudes, stft_fn, n_iters=30):
79
+ """
80
+ PARAMS
81
+ ------
82
+ magnitudes: spectrogram magnitudes
83
+ stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
84
+ """
85
+
86
+ angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
87
+ angles = angles.astype(np.float32)
88
+ angles = torch.autograd.Variable(torch.from_numpy(angles))
89
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
90
+
91
+ for i in range(n_iters):
92
+ _, angles = stft_fn.transform(signal)
93
+ signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
94
+ return signal
95
+
96
+
97
+ def dynamic_range_compression(x, C=1, clip_val=1e-5):
98
+ """
99
+ PARAMS
100
+ ------
101
+ C: compression factor
102
+ """
103
+ return torch.log(torch.clamp(x, min=clip_val) * C)
104
+
105
+
106
+ def dynamic_range_decompression(x, C=1):
107
+ """
108
+ PARAMS
109
+ ------
110
+ C: compression factor used to compress
111
+ """
112
+ return torch.exp(x) / C
113
+
114
+
115
+ class STFT(torch.nn.Module):
116
+ """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
117
+
118
+ def __init__(self, filter_length, hop_length, win_length, window="hann"):
119
+ super(STFT, self).__init__()
120
+ self.filter_length = filter_length
121
+ self.hop_length = hop_length
122
+ self.win_length = win_length
123
+ self.window = window
124
+ self.forward_transform = None
125
+ scale = self.filter_length / self.hop_length
126
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
127
+
128
+ cutoff = int((self.filter_length / 2 + 1))
129
+ fourier_basis = np.vstack(
130
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
131
+ )
132
+
133
+ forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
134
+ inverse_basis = torch.FloatTensor(
135
+ np.linalg.pinv(scale * fourier_basis).T[:, None, :]
136
+ )
137
+
138
+ if window is not None:
139
+ assert filter_length >= win_length
140
+ # get window and zero center pad it to filter_length
141
+ fft_window = get_window(window, win_length, fftbins=True)
142
+ fft_window = pad_center(fft_window, filter_length)
143
+ fft_window = torch.from_numpy(fft_window).float()
144
+
145
+ # window the bases
146
+ forward_basis *= fft_window
147
+ inverse_basis *= fft_window
148
+
149
+ self.register_buffer("forward_basis", forward_basis.float())
150
+ self.register_buffer("inverse_basis", inverse_basis.float())
151
+
152
+ def transform(self, input_data):
153
+ num_batches = input_data.size(0)
154
+ num_samples = input_data.size(1)
155
+
156
+ self.num_samples = num_samples
157
+
158
+ # similar to librosa, reflect-pad the input
159
+ input_data = input_data.view(num_batches, 1, num_samples)
160
+ input_data = F.pad(
161
+ input_data.unsqueeze(1),
162
+ (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
163
+ mode="reflect",
164
+ )
165
+ input_data = input_data.squeeze(1)
166
+
167
+ forward_transform = F.conv1d(
168
+ input_data.cuda(),
169
+ torch.autograd.Variable(self.forward_basis, requires_grad=False).cuda(),
170
+ stride=self.hop_length,
171
+ padding=0,
172
+ ).cpu()
173
+
174
+ cutoff = int((self.filter_length / 2) + 1)
175
+ real_part = forward_transform[:, :cutoff, :]
176
+ imag_part = forward_transform[:, cutoff:, :]
177
+
178
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
179
+ phase = torch.autograd.Variable(torch.atan2(imag_part.data, real_part.data))
180
+
181
+ return magnitude, phase
182
+
183
+ def inverse(self, magnitude, phase):
184
+ recombine_magnitude_phase = torch.cat(
185
+ [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
186
+ )
187
+
188
+ inverse_transform = F.conv_transpose1d(
189
+ recombine_magnitude_phase,
190
+ torch.autograd.Variable(self.inverse_basis, requires_grad=False),
191
+ stride=self.hop_length,
192
+ padding=0,
193
+ )
194
+
195
+ if self.window is not None:
196
+ window_sum = window_sumsquare(
197
+ self.window,
198
+ magnitude.size(-1),
199
+ hop_length=self.hop_length,
200
+ win_length=self.win_length,
201
+ n_fft=self.filter_length,
202
+ dtype=np.float32,
203
+ )
204
+ # remove modulation effects
205
+ approx_nonzero_indices = torch.from_numpy(
206
+ np.where(window_sum > tiny(window_sum))[0]
207
+ )
208
+ window_sum = torch.autograd.Variable(
209
+ torch.from_numpy(window_sum), requires_grad=False
210
+ )
211
+ window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
212
+ inverse_transform[:, :, approx_nonzero_indices] /= window_sum[
213
+ approx_nonzero_indices
214
+ ]
215
+
216
+ # scale by hop ratio
217
+ inverse_transform *= float(self.filter_length) / self.hop_length
218
+
219
+ inverse_transform = inverse_transform[:, :, int(self.filter_length / 2) :]
220
+ inverse_transform = inverse_transform[:, :, : -int(self.filter_length / 2) :]
221
+
222
+ return inverse_transform
223
+
224
+ def forward(self, input_data):
225
+ self.magnitude, self.phase = self.transform(input_data)
226
+ reconstruction = self.inverse(self.magnitude, self.phase)
227
+ return reconstruction
228
+
229
+
230
+ class TacotronSTFT(torch.nn.Module):
231
+ def __init__(
232
+ self,
233
+ filter_length,
234
+ hop_length,
235
+ win_length,
236
+ n_mel_channels,
237
+ sampling_rate,
238
+ mel_fmin,
239
+ mel_fmax,
240
+ ):
241
+ super(TacotronSTFT, self).__init__()
242
+ self.n_mel_channels = n_mel_channels
243
+ self.sampling_rate = sampling_rate
244
+ self.stft_fn = STFT(filter_length, hop_length, win_length)
245
+ mel_basis = librosa_mel_fn(
246
+ sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax
247
+ )
248
+ mel_basis = torch.from_numpy(mel_basis).float()
249
+ self.register_buffer("mel_basis", mel_basis)
250
+
251
+ def spectral_normalize(self, magnitudes):
252
+ output = dynamic_range_compression(magnitudes)
253
+ return output
254
+
255
+ def spectral_de_normalize(self, magnitudes):
256
+ output = dynamic_range_decompression(magnitudes)
257
+ return output
258
+
259
+ def mel_spectrogram(self, y):
260
+ """Computes mel-spectrograms from a batch of waves
261
+ PARAMS
262
+ ------
263
+ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]
264
+
265
+ RETURNS
266
+ -------
267
+ mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
268
+ """
269
+ assert torch.min(y.data) >= -1
270
+ assert torch.max(y.data) <= 1
271
+
272
+ magnitudes, phases = self.stft_fn.transform(y)
273
+ magnitudes = magnitudes.data
274
+ mel_output = torch.matmul(self.mel_basis, magnitudes)
275
+ mel_output = self.spectral_normalize(mel_output)
276
+ energy = torch.norm(magnitudes, dim=1)
277
+
278
+ return mel_output, energy
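+
+
+ # Usage sketch (added comment; the values mirror common Tacotron 2 settings and are not the
+ # project defaults). Note that STFT.transform above moves data to CUDA, so a GPU is required
+ # as written:
+ #
+ #   stft = TacotronSTFT(
+ #       filter_length=1024, hop_length=256, win_length=1024,
+ #       n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0,
+ #   )
+ #   wav = torch.rand(1, 22050) * 2 - 1        # one second of audio in [-1, 1]
+ #   mel, energy = stft.mel_spectrogram(wav)   # (1, 80, frames), (1, frames)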
utils/symbol_table.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from
7
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/utils/symbol_table.py
8
+
9
+ from dataclasses import dataclass
10
+ from dataclasses import field
11
+ from typing import Dict
12
+ from typing import Generic
13
+ from typing import List
14
+ from typing import Optional
15
+ from typing import TypeVar
16
+ from typing import Union
17
+
18
+ Symbol = TypeVar('Symbol')
19
+
20
+
21
+ @dataclass(repr=False)
22
+ class SymbolTable(Generic[Symbol]):
23
+ '''SymbolTable that maps symbol IDs, found on the FSA arcs to
24
+ actual objects. These objects can be arbitrary Python objects
25
+ that can serve as keys in a dictionary (i.e. they need to be
26
+ hashable and immutable).
27
+
28
+ The SymbolTable can only be written to/read from disk if the
29
+ symbols are strings.
30
+ '''
31
+ _id2sym: Dict[int, Symbol] = field(default_factory=dict)
32
+ '''Map an integer to a symbol.
33
+ '''
34
+
35
+ _sym2id: Dict[Symbol, int] = field(default_factory=dict)
36
+ '''Map a symbol to an integer.
37
+ '''
38
+
39
+ _next_available_id: int = 1
40
+ '''A helper internal field that helps adding new symbols
41
+ to the table efficiently.
42
+ '''
43
+
44
+ eps: Symbol = '<eps>'
45
+ '''Null symbol, always mapped to index 0.
46
+ '''
47
+
48
+ def __post_init__(self):
49
+ assert all(self._sym2id[sym] == idx for idx, sym in self._id2sym.items())
50
+ assert all(self._id2sym[idx] == sym for sym, idx in self._sym2id.items())
51
+ assert 0 not in self._id2sym or self._id2sym[0] == self.eps
52
+
53
+ self._next_available_id = max(self._id2sym, default=0) + 1
54
+ self._id2sym.setdefault(0, self.eps)
55
+ self._sym2id.setdefault(self.eps, 0)
56
+
57
+
58
+ @staticmethod
59
+ def from_str(s: str) -> 'SymbolTable':
60
+ '''Build a symbol table from a string.
61
+
62
+ The string consists of lines. Every line has two fields separated
63
+ by space(s), tab(s) or both. The first field is the symbol and the
64
+ second the integer id of the symbol.
65
+
66
+ Args:
67
+ s:
68
+ The input string with the format described above.
69
+ Returns:
70
+ An instance of :class:`SymbolTable`.
71
+ '''
72
+ id2sym: Dict[int, str] = dict()
73
+ sym2id: Dict[str, int] = dict()
74
+
75
+ for line in s.split('\n'):
76
+ fields = line.split()
77
+ if len(fields) == 0:
78
+ continue # skip empty lines
79
+ assert len(fields) == 2, \
80
+ f'Expect a line with 2 fields. Given: {len(fields)}'
81
+ sym, idx = fields[0], int(fields[1])
82
+ assert sym not in sym2id, f'Duplicated symbol {sym}'
83
+ assert idx not in id2sym, f'Duplicated id {idx}'
84
+ id2sym[idx] = sym
85
+ sym2id[sym] = idx
86
+
87
+ eps = id2sym.get(0, '<eps>')
88
+
89
+ return SymbolTable(_id2sym=id2sym, _sym2id=sym2id, eps=eps)
90
+
91
+ @staticmethod
92
+ def from_file(filename: str) -> 'SymbolTable':
93
+ '''Build a symbol table from file.
94
+
95
+ Every line in the symbol table file has two fields separated by
96
+ space(s), tab(s) or both. The following is an example file:
97
+
98
+ .. code-block::
99
+
100
+ <eps> 0
101
+ a 1
102
+ b 2
103
+ c 3
104
+
105
+ Args:
106
+ filename:
107
+ Name of the symbol table file. Its format is documented above.
108
+
109
+ Returns:
110
+ An instance of :class:`SymbolTable`.
111
+
112
+ '''
113
+ with open(filename, 'r', encoding='utf-8') as f:
114
+ return SymbolTable.from_str(f.read().strip())
115
+
116
+ def to_str(self) -> str:
117
+ '''
118
+ Returns:
119
+ Return a string representation of this object. You can pass
120
+ it to the method ``from_str`` to recreate an identical object.
121
+ '''
122
+ s = ''
123
+ for idx, symbol in sorted(self._id2sym.items()):
124
+ s += f'{symbol} {idx}\n'
125
+ return s
126
+
127
+ def to_file(self, filename: str):
128
+ '''Serialize the SymbolTable to a file.
129
+
130
+ Every line in the symbol table file has two fields separated by
131
+ space(s), tab(s) or both. The following is an example file:
132
+
133
+ .. code-block::
134
+
135
+ <eps> 0
136
+ a 1
137
+ b 2
138
+ c 3
139
+
140
+ Args:
141
+ filename:
142
+ Name of the symbol table file. Its format is documented above.
143
+ '''
144
+ with open(filename, 'w') as f:
145
+ for idx, symbol in sorted(self._id2sym.items()):
146
+ print(symbol, idx, file=f)
147
+
148
+ def add(self, symbol: Symbol, index: Optional[int] = None) -> int:
149
+ '''Add a new symbol to the SymbolTable.
150
+
151
+ Args:
152
+ symbol:
153
+ The symbol to be added.
154
+ index:
155
+ Optional int id to which the symbol should be assigned.
156
+ If it is not available, a ValueError will be raised.
157
+
158
+ Returns:
159
+ The int id to which the symbol has been assigned.
160
+ '''
161
+ # Already in the table? Return its ID.
162
+ if symbol in self._sym2id:
163
+ return self._sym2id[symbol]
164
+ # Specific ID not provided - use next available.
165
+ if index is None:
166
+ index = self._next_available_id
167
+ # Specific ID provided but not available.
168
+ if index in self._id2sym:
169
+ raise ValueError(f"Cannot assign id '{index}' to '{symbol}' - "
170
+ f"already occupied by {self._id2sym[index]}")
171
+ self._sym2id[symbol] = index
172
+ self._id2sym[index] = symbol
173
+
174
+ # Update next available ID if needed
175
+ if self._next_available_id <= index:
176
+ self._next_available_id = index + 1
177
+
178
+ return index
179
+
180
+ def get(self, k: Union[int, Symbol]) -> Union[Symbol, int]:
181
+ '''Get a symbol for an id or get an id for a symbol
182
+
183
+ Args:
184
+ k:
185
+ If it is an id, it tries to find the symbol corresponding
186
+ to the id; if it is a symbol, it tries to find the id
187
+ corresponding to the symbol.
188
+
189
+ Returns:
190
+ An id or a symbol depending on the given `k`.
191
+ '''
192
+ if isinstance(k, int):
193
+ return self._id2sym[k]
194
+ else:
195
+ return self._sym2id[k]
196
+
197
+ def merge(self, other: 'SymbolTable') -> 'SymbolTable':
198
+ '''Create a union of two SymbolTables.
199
+ Raises an AssertionError if the same IDs are occupied by
200
+ different symbols.
201
+
202
+ Args:
203
+ other:
204
+ A symbol table to merge with ``self``.
205
+
206
+ Returns:
207
+ A new symbol table.
208
+ '''
209
+ self._check_compatible(other)
210
+ return SymbolTable(
211
+ _id2sym={**self._id2sym, **other._id2sym},
212
+ _sym2id={**self._sym2id, **other._sym2id},
213
+ eps=self.eps
214
+ )
215
+
216
+ def _check_compatible(self, other: 'SymbolTable') -> None:
217
+ # Epsilon compatibility
218
+ assert self.eps == other.eps, f'Mismatched epsilon symbol: ' \
219
+ f'{self.eps} != {other.eps}'
220
+ # IDs compatibility
221
+ common_ids = set(self._id2sym).intersection(other._id2sym)
222
+ for idx in common_ids:
223
+ assert self[idx] == other[idx], f'ID conflict for id: {idx}, ' \
224
+ f'self[idx] = "{self[idx]}", ' \
225
+ f'other[idx] = "{other[idx]}"'
226
+ # Symbols compatibility
227
+ common_symbols = set(self._sym2id).intersection(other._sym2id)
228
+ for sym in common_symbols:
229
+ assert self[sym] == other[sym], f'ID conflict for symbol: {sym}, ' \
230
+ f'self[sym] = "{self[sym]}", ' \
231
+ f'other[sym] = "{other[sym]}"'
232
+
233
+ def __getitem__(self, item: Union[int, Symbol]) -> Union[Symbol, int]:
234
+ return self.get(item)
235
+
236
+ def __contains__(self, item: Union[int, Symbol]) -> bool:
237
+ if isinstance(item, int):
238
+ return item in self._id2sym
239
+ else:
240
+ return item in self._sym2id
241
+
242
+ def __len__(self) -> int:
243
+ return len(self._id2sym)
244
+
245
+ def __eq__(self, other: 'SymbolTable') -> bool:
246
+ if len(self) != len(other):
247
+ return False
248
+
249
+ for s in self.symbols:
250
+ if self[s] != other[s]:
251
+ return False
252
+
253
+ return True
254
+
255
+ @property
256
+ def ids(self) -> List[int]:
257
+ '''Returns a list of integer IDs corresponding to the symbols.
258
+ '''
259
+ ans = list(self._id2sym.keys())
260
+ ans.sort()
261
+ return ans
262
+
263
+ @property
264
+ def symbols(self) -> List[Symbol]:
265
+ '''Returns a list of symbols (e.g., strings) corresponding to
266
+ the integer IDs.
267
+ '''
268
+ ans = list(self._sym2id.keys())
269
+ ans.sort()
270
+ return ans
271
+
272
+
273
+ class TextToken:
274
+ def __init__(
275
+ self,
276
+ text_tokens: List[str],
277
+ add_eos: bool = True,
278
+ add_bos: bool = True,
279
+ pad_symbol: str = "<pad>",
280
+ bos_symbol: str = "<bos>",
281
+ eos_symbol: str = "<eos>",
282
+ ):
283
+ self.pad_symbol = pad_symbol
284
+ self.add_eos = add_eos
285
+ self.add_bos = add_bos
286
+ self.bos_symbol = bos_symbol
287
+ self.eos_symbol = eos_symbol
288
+
289
+ unique_tokens = [pad_symbol]
290
+ if add_bos:
291
+ unique_tokens.append(bos_symbol)
292
+ if add_eos:
293
+ unique_tokens.append(eos_symbol)
294
+ unique_tokens.extend(sorted(text_tokens))
295
+
296
+ self.token2idx = {token: idx for idx, token in enumerate(unique_tokens)}
297
+ self.idx2token = unique_tokens
298
+
299
+
300
+ def get_token_id_seq(self, text):
301
+ tokens_seq = [p for p in text]
302
+ seq = (
303
+ ([self.bos_symbol] if self.add_bos else [])
304
+ + tokens_seq
305
+ + ([self.eos_symbol] if self.add_eos else [])
306
+ )
307
+
308
+ token_ids = [self.token2idx[token] for token in seq]
309
+ token_lens = len(tokens_seq) + self.add_eos + self.add_bos
310
+
311
+ return token_ids, token_lens
312
+
313
+
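A small usage sketch of the two classes above (the symbols are made-up examples):

# Build a table from a string and query it in both directions.
table = SymbolTable.from_str("<eps> 0\na 1\nb 2")
assert table["a"] == 1 and table[2] == "b"
new_id = table.add("c")  # assigned the next free id, here 3

# Character-level text tokens with <bos>/<eos> handling.
tokenizer = TextToken(["a", "b", "c"])
token_ids, token_len = tokenizer.get_token_id_seq("abc")
# token_len == len("abc") + 2 because both <bos> and <eos> are added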
utils/tokenizer.py ADDED
@@ -0,0 +1,151 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # This code is modified from
7
+ # https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py
8
+
9
+ import re
10
+ from typing import Any, Dict, List, Optional, Pattern, Union
11
+
12
+ import torch
13
+ import torchaudio
14
+ from encodec import EncodecModel
15
+ from encodec.utils import convert_audio
16
+
17
+
18
+
19
+ class AudioTokenizer:
20
+ """EnCodec audio tokenizer for encoding and decoding audio.
21
+
22
+ Attributes:
23
+ device: The device on which the codec model is loaded.
24
+ codec: The pretrained EnCodec model.
25
+ sample_rate: Sample rate of the model.
26
+ channels: Number of audio channels in the model.
27
+ """
28
+
29
+ def __init__(self, device: Any = None) -> None:
30
+ model = EncodecModel.encodec_model_24khz()
31
+ model.set_target_bandwidth(6.0)
32
+ remove_encodec_weight_norm(model)
33
+
34
+ if not device:
35
+ device = torch.device("cpu")
36
+ if torch.cuda.is_available():
37
+ device = torch.device("cuda:0")
38
+
39
+ self._device = device
40
+
41
+ self.codec = model.to(device)
42
+ self.sample_rate = model.sample_rate
43
+ self.channels = model.channels
44
+
45
+ @property
46
+ def device(self):
47
+ return self._device
48
+
49
+ def encode(self, wav: torch.Tensor) -> torch.Tensor:
50
+ """Encode the audio waveform.
51
+
52
+ Args:
53
+ wav: A tensor representing the audio waveform.
54
+
55
+ Returns:
56
+ A tensor representing the encoded audio.
57
+ """
58
+ return self.codec.encode(wav.to(self.device))
59
+
60
+ def decode(self, frames: torch.Tensor) -> torch.Tensor:
61
+ """Decode the encoded audio frames.
62
+
63
+ Args:
64
+ frames: A tensor representing the encoded audio frames.
65
+
66
+ Returns:
67
+ A tensor representing the decoded audio waveform.
68
+ """
69
+ return self.codec.decode(frames)
70
+
71
+
72
+
73
+ def tokenize_audio(tokenizer: AudioTokenizer, audio_path: str):
74
+ """
75
+ Tokenize the audio waveform using the given AudioTokenizer.
76
+
77
+ Args:
78
+ tokenizer: An instance of AudioTokenizer.
79
+ audio_path: Path to the audio file.
80
+
81
+ Returns:
82
+ A tensor of encoded frames from the audio.
83
+
84
+ Raises:
85
+ FileNotFoundError: If the audio file is not found.
86
+ RuntimeError: If there's an error processing the audio data.
87
+ """
88
+ # try:
89
+ # Load and preprocess the audio waveform
90
+ wav, sr = torchaudio.load(audio_path)
91
+ wav = convert_audio(wav, sr, tokenizer.sample_rate, tokenizer.channels)
92
+ wav = wav.unsqueeze(0)
93
+
94
+ # Extract discrete codes from EnCodec
95
+ with torch.no_grad():
96
+ encoded_frames = tokenizer.encode(wav)
97
+ return encoded_frames
98
+
99
+ # except FileNotFoundError:
100
+ # raise FileNotFoundError(f"Audio file not found at {audio_path}")
101
+ # except Exception as e:
102
+ # raise RuntimeError(f"Error processing audio data: {e}")
103
+
104
+
105
+
106
+ def remove_encodec_weight_norm(model):
107
+ from encodec.modules import SConv1d
108
+ from encodec.modules.seanet import SConvTranspose1d, SEANetResnetBlock
109
+ from torch.nn.utils import remove_weight_norm
110
+
111
+ encoder = model.encoder.model
112
+ for key in encoder._modules:
113
+ if isinstance(encoder._modules[key], SEANetResnetBlock):
114
+ remove_weight_norm(encoder._modules[key].shortcut.conv.conv)
115
+ block_modules = encoder._modules[key].block._modules
116
+ for skey in block_modules:
117
+ if isinstance(block_modules[skey], SConv1d):
118
+ remove_weight_norm(block_modules[skey].conv.conv)
119
+ elif isinstance(encoder._modules[key], SConv1d):
120
+ remove_weight_norm(encoder._modules[key].conv.conv)
121
+
122
+ decoder = model.decoder.model
123
+ for key in decoder._modules:
124
+ if isinstance(decoder._modules[key], SEANetResnetBlock):
125
+ remove_weight_norm(decoder._modules[key].shortcut.conv.conv)
126
+ block_modules = decoder._modules[key].block._modules
127
+ for skey in block_modules:
128
+ if isinstance(block_modules[skey], SConv1d):
129
+ remove_weight_norm(block_modules[skey].conv.conv)
130
+ elif isinstance(decoder._modules[key], SConvTranspose1d):
131
+ remove_weight_norm(decoder._modules[key].convtr.convtr)
132
+ elif isinstance(decoder._modules[key], SConv1d):
133
+ remove_weight_norm(decoder._modules[key].conv.conv)
134
+
135
+
136
+ def extract_encodec_token(wav_path):
137
+ model = EncodecModel.encodec_model_24khz()
138
+ model.set_target_bandwidth(6.0)
139
+
140
+ wav, sr = torchaudio.load(wav_path)
141
+ wav = convert_audio(wav, sr, model.sample_rate, model.channels)
142
+ wav = wav.unsqueeze(0)
143
+ if torch.cuda.is_available():
144
+ model = model.cuda()
145
+ wav = wav.cuda()
146
+ with torch.no_grad():
147
+ encoded_frames = model.encode(wav)
148
+ codes_ = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) # [B, n_q, T]
149
+ codes = codes_.cpu().numpy()[0,:,:].T # [T, 8]
150
+
151
+ return codes
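A minimal sketch of extracting EnCodec codes with the helpers above ("audio.wav" is a placeholder path):

# One-shot extraction: returns a numpy array of shape (T, 8).
codes = extract_encodec_token("audio.wav")

# Or keep the (weight-norm-stripped) model resident for repeated calls:
tokenizer = AudioTokenizer()
encoded_frames = tokenize_audio(tokenizer, "audio.wav")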
utils/topk_sampling.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+
11
+ # This function is modified from https://github.com/microsoft/unilm/blob/master/xtune/src/transformers/modeling_utils.py
12
+ def top_k_top_p_filtering(
13
+ logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1
14
+ ):
15
+ """
16
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.
17
+
18
+ Args:
19
+ logits (torch.Tensor): Logits distribution with shape (batch size, vocabulary size).
20
+ top_k (int, optional): Keep only top k tokens with highest probability (top-k filtering).
21
+ Set to 0 to disable. Defaults to 0.
22
+ top_p (float, optional): Keep the top tokens with a cumulative probability >= top_p (nucleus filtering).
23
+ Must be between 0 and 1, inclusive. Defaults to 1.0.
24
+ filter_value (float, optional): The value to assign to filtered logits. Defaults to -float('Inf').
25
+ min_tokens_to_keep (int, optional): Ensure that at least this number of tokens are kept per batch example.
26
+ Defaults to 1.
27
+
28
+ Returns:
29
+ torch.Tensor: The filtered logits.
30
+ """
31
+ """
32
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
33
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
34
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
35
+ """
36
+ if top_k > 0:
37
+ # Apply top-k filtering
38
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))
39
+ indices_to_remove = logits < torch.topk(logits, top_k).values[..., -1, None]
40
+ logits[indices_to_remove] = filter_value
41
+
42
+ if top_p < 1.0:
43
+ # Apply top-p filtering
44
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
45
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
46
+
47
+ # Create a mask to remove tokens with cumulative probability above the top_p threshold
48
+ sorted_indices_to_remove = cumulative_probs > top_p
49
+ if min_tokens_to_keep > 1:
50
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
51
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
52
+ sorted_indices_to_remove[..., 0] = 0
53
+
54
+ # Scatter sorted tensors back to original indexing
55
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
56
+ logits[indices_to_remove] = filter_value
57
+
58
+ return logits
59
+
60
+
61
+ def topk_sampling(logits, top_k=50, top_p=1.0, temperature=1.0):
62
+ """
63
+ Perform top-k and top-p sampling on logits.
64
+
65
+ Args:
66
+ logits (torch.Tensor): The logits to sample from.
67
+ top_k (int, optional): The number of highest probability tokens to keep for top-k filtering.
68
+ Must be a positive integer. Defaults to 50.
69
+ top_p (float, optional): The cumulative probability threshold for nucleus sampling.
70
+ Must be between 0 and 1. Defaults to 1.0.
71
+ temperature (float, optional): The scaling factor to adjust the logits distribution.
72
+ Must be strictly positive. Defaults to 1.0.
73
+
74
+ Returns:
75
+ torch.Tensor: The sampled token.
76
+ """
77
+
78
+ # Adjust logits using temperature
79
+ if temperature != 1.0:
80
+ logits = logits / temperature
81
+
82
+ # Top-p/top-k filtering
83
+ logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
84
+
85
+ # Sample from the filtered distribution
86
+ token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
87
+ return token
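A small sketch of sampling one token per batch element (the vocabulary size of 1024 is arbitrary):

import torch

logits = torch.randn(2, 1024)  # (batch, vocab)
token = topk_sampling(logits, top_k=50, top_p=0.9, temperature=1.0)
print(token.shape)  # torch.Size([2, 1])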
utils/trainer_utils.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+
8
+
9
+ def check_nan(logger, loss, y_pred, y_gt):
10
+ if torch.any(torch.isnan(loss)):
11
+ logger.info("out has nan: ", torch.any(torch.isnan(y_pred)))
12
+ logger.info("y_gt has nan: ", torch.any(torch.isnan(y_gt)))
13
+ logger.info("out: ", y_pred)
14
+ logger.info("y_gt: ", y_gt)
15
+ logger.info("loss = {:.4f}\n".format(loss.item()))
16
+ exit()
utils/util.py ADDED
@@ -0,0 +1,688 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import collections
8
+ import glob
9
+ import os
10
+ import random
11
+ import time
12
+ import argparse
13
+ from collections import OrderedDict
14
+
15
+ import json5
16
+ import numpy as np
18
+ from torch.nn import functional as F
19
+
20
+
21
+ try:
22
+ from ruamel.yaml import YAML as yaml
23
+ except ImportError:
24
+ from ruamel_yaml import YAML as yaml
25
+
26
+ import torch
27
+
28
+ from utils.hparam import HParams
29
+ import logging
30
+ from logging import handlers
31
+
32
+
33
+ def str2bool(v):
34
+ """Used in argparse.ArgumentParser.add_argument to indicate
35
+ that a type is a bool type and user can enter
36
+
37
+ - yes, true, t, y, 1, to represent True
38
+ - no, false, f, n, 0, to represent False
39
+
40
+ See https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse # noqa
41
+ """
42
+ if isinstance(v, bool):
43
+ return v
44
+ if v.lower() in ("yes", "true", "t", "y", "1"):
45
+ return True
46
+ elif v.lower() in ("no", "false", "f", "n", "0"):
47
+ return False
48
+ else:
49
+ raise argparse.ArgumentTypeError("Boolean value expected.")
50
+
51
+
52
+ def find_checkpoint_of_mapper(mapper_ckpt_dir):
53
+ mapper_ckpts = glob.glob(os.path.join(mapper_ckpt_dir, "ckpts/*.pt"))
54
+
55
+ # Select the max steps
56
+ mapper_ckpts.sort()
57
+ mapper_weights_file = mapper_ckpts[-1]
58
+ return mapper_weights_file
59
+
60
+
61
+ def pad_f0_to_tensors(f0s, batched=None):
62
+ # Initialize
63
+ tensors = []
64
+
65
+ if batched == None:
66
+ # Get the max frame for padding
67
+ size = -1
68
+ for f0 in f0s:
69
+ size = max(size, f0.shape[-1])
70
+
71
+ tensor = torch.zeros(len(f0s), size)
72
+
73
+ for i, f0 in enumerate(f0s):
74
+ tensor[i, : f0.shape[-1]] = f0[:]
75
+
76
+ tensors.append(tensor)
77
+ else:
78
+ start = 0
79
+ while start + batched - 1 < len(f0s):
80
+ end = start + batched - 1
81
+
82
+ # Get the max frame for padding
83
+ size = -1
84
+ for i in range(start, end + 1):
85
+ size = max(size, f0s[i].shape[-1])
86
+
87
+ tensor = torch.zeros(batched, size)
88
+
89
+ for i in range(start, end + 1):
90
+ tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
91
+
92
+ tensors.append(tensor)
93
+
94
+ start = start + batched
95
+
96
+ if start != len(f0s):
97
+ end = len(f0s)
98
+
99
+ # Get the max frame for padding
100
+ size = -1
101
+ for i in range(start, end):
102
+ size = max(size, f0s[i].shape[-1])
103
+
104
+ tensor = torch.zeros(len(f0s) - start, size)
105
+
106
+ for i in range(start, end):
107
+ tensor[i - start, : f0s[i].shape[-1]] = f0s[i][:]
108
+
109
+ tensors.append(tensor)
110
+
111
+ return tensors
112
+
113
+
114
+ def pad_mels_to_tensors(mels, batched=None):
115
+ """
116
+ Args:
117
+ mels: A list of mel-specs
118
+ Returns:
119
+ tensors: A list of tensors containing the batched mel-specs
120
+ mel_frames: A list of tensors containing the frames of the original mel-specs
121
+ """
122
+ # Initialize
123
+ tensors = []
124
+ mel_frames = []
125
+
126
+ # Split mel-specs into batches to avoid cuda memory exceed
127
+ if batched == None:
128
+ # Get the max frame for padding
129
+ size = -1
130
+ for mel in mels:
131
+ size = max(size, mel.shape[-1])
132
+
133
+ tensor = torch.zeros(len(mels), mels[0].shape[0], size)
134
+ mel_frame = torch.zeros(len(mels), dtype=torch.int32)
135
+
136
+ for i, mel in enumerate(mels):
137
+ tensor[i, :, : mel.shape[-1]] = mel[:]
138
+ mel_frame[i] = mel.shape[-1]
139
+
140
+ tensors.append(tensor)
141
+ mel_frames.append(mel_frame)
142
+ else:
143
+ start = 0
144
+ while start + batched - 1 < len(mels):
145
+ end = start + batched - 1
146
+
147
+ # Get the max frame for padding
148
+ size = -1
149
+ for i in range(start, end + 1):
150
+ size = max(size, mels[i].shape[-1])
151
+
152
+ tensor = torch.zeros(batched, mels[0].shape[0], size)
153
+ mel_frame = torch.zeros(batched, dtype=torch.int32)
154
+
155
+ for i in range(start, end + 1):
156
+ tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
157
+ mel_frame[i - start] = mels[i].shape[-1]
158
+
159
+ tensors.append(tensor)
160
+ mel_frames.append(mel_frame)
161
+
162
+ start = start + batched
163
+
164
+ if start != len(mels):
165
+ end = len(mels)
166
+
167
+ # Get the max frame for padding
168
+ size = -1
169
+ for i in range(start, end):
170
+ size = max(size, mels[i].shape[-1])
171
+
172
+ tensor = torch.zeros(len(mels) - start, mels[0].shape[0], size)
173
+ mel_frame = torch.zeros(len(mels) - start, dtype=torch.int32)
174
+
175
+ for i in range(start, end):
176
+ tensor[i - start, :, : mels[i].shape[-1]] = mels[i][:]
177
+ mel_frame[i - start] = mels[i].shape[-1]
178
+
179
+ tensors.append(tensor)
180
+ mel_frames.append(mel_frame)
181
+
182
+ return tensors, mel_frames
183
+
184
+
185
+ def load_model_config(args):
186
+ """Load model configurations (in args.json under checkpoint directory)
187
+
188
+ Args:
189
+ args (ArgumentParser): arguments to run bins/preprocess.py
190
+
191
+ Returns:
192
+ dict: dictionary that stores model configurations
193
+ """
194
+ if args.checkpoint_dir is None:
195
+ assert args.checkpoint_file is not None
196
+ checkpoint_dir = os.path.split(args.checkpoint_file)[0]
197
+ else:
198
+ checkpoint_dir = args.checkpoint_dir
199
+ config_path = os.path.join(checkpoint_dir, "args.json")
200
+ print("config_path: ", config_path)
201
+
202
+ config = load_config(config_path)
203
+ return config
204
+
205
+
206
+ def remove_and_create(dir):
207
+ if os.path.exists(dir):
208
+ os.system("rm -r {}".format(dir))
209
+ os.makedirs(dir, exist_ok=True)
210
+
211
+
212
+ def has_existed(path, warning=False):
213
+ if not warning:
214
+ return os.path.exists(path)
215
+
216
+ if os.path.exists(path):
217
+ answer = input(
218
+ "The path {} has existed. \nInput 'y' (or hit Enter) to skip it, and input 'n' to re-write it [y/n]\n".format(
219
+ path
220
+ )
221
+ )
222
+ if not answer == "n":
223
+ return True
224
+
225
+ return False
226
+
227
+
228
+ def remove_older_ckpt(saved_model_name, checkpoint_dir, max_to_keep=5):
229
+ if os.path.exists(os.path.join(checkpoint_dir, "checkpoint")):
230
+ with open(os.path.join(checkpoint_dir, "checkpoint"), "r") as f:
231
+ ckpts = [x.strip() for x in f.readlines()]
232
+ else:
233
+ ckpts = []
234
+ ckpts.append(saved_model_name)
235
+ for item in ckpts[:-max_to_keep]:
236
+ if os.path.exists(os.path.join(checkpoint_dir, item)):
237
+ os.remove(os.path.join(checkpoint_dir, item))
238
+ with open(os.path.join(checkpoint_dir, "checkpoint"), "w") as f:
239
+ for item in ckpts[-max_to_keep:]:
240
+ f.write("{}\n".format(item))
241
+
242
+
243
+ def set_all_random_seed(seed: int):
244
+ random.seed(seed)
245
+ np.random.seed(seed)
246
+ torch.random.manual_seed(seed)
247
+
248
+
249
+ def save_checkpoint(
250
+ args,
251
+ generator,
252
+ g_optimizer,
253
+ step,
254
+ discriminator=None,
255
+ d_optimizer=None,
256
+ max_to_keep=5,
257
+ ):
258
+ saved_model_name = "model.ckpt-{}.pt".format(step)
259
+ checkpoint_path = os.path.join(args.checkpoint_dir, saved_model_name)
260
+
261
+ if discriminator and d_optimizer:
262
+ torch.save(
263
+ {
264
+ "generator": generator.state_dict(),
265
+ "discriminator": discriminator.state_dict(),
266
+ "g_optimizer": g_optimizer.state_dict(),
267
+ "d_optimizer": d_optimizer.state_dict(),
268
+ "global_step": step,
269
+ },
270
+ checkpoint_path,
271
+ )
272
+ else:
273
+ torch.save(
274
+ {
275
+ "generator": generator.state_dict(),
276
+ "g_optimizer": g_optimizer.state_dict(),
277
+ "global_step": step,
278
+ },
279
+ checkpoint_path,
280
+ )
281
+
282
+ print("Saved checkpoint: {}".format(checkpoint_path))
283
+
284
+ if os.path.exists(os.path.join(args.checkpoint_dir, "checkpoint")):
285
+ with open(os.path.join(args.checkpoint_dir, "checkpoint"), "r") as f:
286
+ ckpts = [x.strip() for x in f.readlines()]
287
+ else:
288
+ ckpts = []
289
+ ckpts.append(saved_model_name)
290
+ for item in ckpts[:-max_to_keep]:
291
+ if os.path.exists(os.path.join(args.checkpoint_dir, item)):
292
+ os.remove(os.path.join(args.checkpoint_dir, item))
293
+ with open(os.path.join(args.checkpoint_dir, "checkpoint"), "w") as f:
294
+ for item in ckpts[-max_to_keep:]:
295
+ f.write("{}\n".format(item))
296
+
297
+
298
+ def attempt_to_restore(
299
+ generator, g_optimizer, checkpoint_dir, discriminator=None, d_optimizer=None
300
+ ):
301
+ checkpoint_list = os.path.join(checkpoint_dir, "checkpoint")
302
+ if os.path.exists(checkpoint_list):
303
+ checkpoint_filename = open(checkpoint_list).readlines()[-1].strip()
304
+ checkpoint_path = os.path.join(checkpoint_dir, "{}".format(checkpoint_filename))
305
+ print("Restore from {}".format(checkpoint_path))
306
+ checkpoint = torch.load(checkpoint_path, map_location="cpu")
307
+ if generator:
308
+ if not list(generator.state_dict().keys())[0].startswith("module."):
309
+ raw_dict = checkpoint["generator"]
310
+ clean_dict = OrderedDict()
311
+ for k, v in raw_dict.items():
312
+ if k.startswith("module."):
313
+ clean_dict[k[7:]] = v
314
+ else:
315
+ clean_dict[k] = v
316
+ generator.load_state_dict(clean_dict)
317
+ else:
318
+ generator.load_state_dict(checkpoint["generator"])
319
+ if g_optimizer:
320
+ g_optimizer.load_state_dict(checkpoint["g_optimizer"])
321
+ global_step = 100000
322
+ if discriminator and "discriminator" in checkpoint.keys():
323
+ discriminator.load_state_dict(checkpoint["discriminator"])
324
+ global_step = checkpoint["global_step"]
325
+ print("restore discriminator")
326
+ if d_optimizer and "d_optimizer" in checkpoint.keys():
327
+ d_optimizer.load_state_dict(checkpoint["d_optimizer"])
328
+ print("restore d_optimizer...")
329
+ else:
330
+ global_step = 0
331
+ return global_step
332
+
333
+
334
+ class ExponentialMovingAverage(object):
335
+ def __init__(self, decay):
336
+ self.decay = decay
337
+ self.shadow = {}
338
+
339
+ def register(self, name, val):
340
+ self.shadow[name] = val.clone()
341
+
342
+ def update(self, name, x):
343
+ assert name in self.shadow
344
+ update_delta = self.shadow[name] - x
345
+ self.shadow[name] -= (1.0 - self.decay) * update_delta
346
+
347
+
348
+ def apply_moving_average(model, ema):
349
+ for name, param in model.named_parameters():
350
+ if name in ema.shadow:
351
+ ema.update(name, param.data)
352
+
353
+
354
+ def register_model_to_ema(model, ema):
355
+ for name, param in model.named_parameters():
356
+ if param.requires_grad:
357
+ ema.register(name, param.data)
358
+
359
+
360
+ class YParams(HParams):
361
+ def __init__(self, yaml_file):
362
+ if not os.path.exists(yaml_file):
363
+ raise IOError("yaml file: {} is not existed".format(yaml_file))
364
+ super().__init__()
365
+ self.d = collections.OrderedDict()
366
+ with open(yaml_file) as fp:
367
+ for _, v in yaml().load(fp).items():
368
+ for k1, v1 in v.items():
369
+ try:
370
+ if self.get(k1):
371
+ self.set_hparam(k1, v1)
372
+ else:
373
+ self.add_hparam(k1, v1)
374
+ self.d[k1] = v1
375
+ except Exception:
376
+ import traceback
377
+
378
+ print(traceback.format_exc())
379
+
380
+ # @property
381
+ def get_elements(self):
382
+ return self.d.items()
383
+
384
+
385
+ def override_config(base_config, new_config):
386
+ """Update new configurations in the original dict with the new dict
387
+
388
+ Args:
389
+ base_config (dict): original dict to be overridden
390
+ new_config (dict): dict with new configurations
391
+
392
+ Returns:
393
+ dict: updated configuration dict
394
+ """
395
+ for k, v in new_config.items():
396
+ if type(v) == dict:
397
+ if k not in base_config.keys():
398
+ base_config[k] = {}
399
+ base_config[k] = override_config(base_config[k], v)
400
+ else:
401
+ base_config[k] = v
402
+ return base_config
403
+
404
+
405
+ def get_lowercase_keys_config(cfg):
406
+ """Change all keys in cfg to lower case
407
+
408
+ Args:
409
+ cfg (dict): dictionary that stores configurations
410
+
411
+ Returns:
412
+ dict: dictionary that stores configurations
413
+ """
414
+ updated_cfg = dict()
415
+ for k, v in cfg.items():
416
+ if type(v) == dict:
417
+ v = get_lowercase_keys_config(v)
418
+ updated_cfg[k.lower()] = v
419
+ return updated_cfg
420
+
421
+
422
+ def _load_config(config_fn, lowercase=False):
423
+ """Load configurations into a dictionary
424
+
425
+ Args:
426
+ config_fn (str): path to configuration file
427
+ lowercase (bool, optional): whether changing keys to lower case. Defaults to False.
428
+
429
+ Returns:
430
+ dict: dictionary that stores configurations
431
+ """
432
+ with open(config_fn, "r") as f:
433
+ data = f.read()
434
+ config_ = json5.loads(data)
435
+ if "base_config" in config_:
436
+ # load configurations from new path
437
+ p_config_path = os.path.join(os.getenv("WORK_DIR"), config_["base_config"])
438
+ p_config_ = _load_config(p_config_path)
439
+ config_ = override_config(p_config_, config_)
440
+ if lowercase:
441
+ # change keys in config_ to lower case
442
+ config_ = get_lowercase_keys_config(config_)
443
+ return config_
444
+
445
+
446
+ def load_config(config_fn, lowercase=False):
447
+ """Load configurations into a dictionary
448
+
449
+ Args:
450
+ config_fn (str): path to configuration file
451
+ lowercase (bool, optional): _description_. Defaults to False.
452
+
453
+ Returns:
454
+ JsonHParams: an object that stores configurations
455
+ """
456
+ config_ = _load_config(config_fn, lowercase=lowercase)
457
+ # create an JsonHParams object with configuration dict
458
+ cfg = JsonHParams(**config_)
459
+ return cfg
460
+
461
+
462
+ def save_config(save_path, cfg):
463
+ """Save configurations into a json file
464
+
465
+ Args:
466
+ save_path (str): path to save configurations
467
+ cfg (dict): dictionary that stores configurations
468
+ """
469
+ with open(save_path, "w") as f:
470
+ json5.dump(
471
+ cfg, f, ensure_ascii=False, indent=4, quote_keys=True, sort_keys=True
472
+ )
473
+
474
+
475
+ class JsonHParams:
476
+ def __init__(self, **kwargs):
477
+ for k, v in kwargs.items():
478
+ if type(v) == dict:
479
+ v = JsonHParams(**v)
480
+ self[k] = v
481
+
482
+ def keys(self):
483
+ return self.__dict__.keys()
484
+
485
+ def items(self):
486
+ return self.__dict__.items()
487
+
488
+ def values(self):
489
+ return self.__dict__.values()
490
+
491
+ def __len__(self):
492
+ return len(self.__dict__)
493
+
494
+ def __getitem__(self, key):
495
+ return getattr(self, key)
496
+
497
+ def __setitem__(self, key, value):
498
+ return setattr(self, key, value)
499
+
500
+ def __contains__(self, key):
501
+ return key in self.__dict__
502
+
503
+ def __repr__(self):
504
+ return self.__dict__.__repr__()
505
+
506
+
507
+ class ValueWindow:
508
+ def __init__(self, window_size=100):
509
+ self._window_size = window_size
510
+ self._values = []
511
+
512
+ def append(self, x):
513
+ self._values = self._values[-(self._window_size - 1) :] + [x]
514
+
515
+ @property
516
+ def sum(self):
517
+ return sum(self._values)
518
+
519
+ @property
520
+ def count(self):
521
+ return len(self._values)
522
+
523
+ @property
524
+ def average(self):
525
+ return self.sum / max(1, self.count)
526
+
527
+ def reset(self):
528
+ self._values = []
529
+
530
+
531
+ class Logger(object):
532
+ def __init__(
533
+ self,
534
+ filename,
535
+ level="info",
536
+ when="D",
537
+ backCount=10,
538
+ fmt="%(asctime)s : %(message)s",
539
+ ):
540
+ self.level_relations = {
541
+ "debug": logging.DEBUG,
542
+ "info": logging.INFO,
543
+ "warning": logging.WARNING,
544
+ "error": logging.ERROR,
545
+ "crit": logging.CRITICAL,
546
+ }
547
+ if level == "debug":
548
+ fmt = "%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s"
549
+ self.logger = logging.getLogger(filename)
550
+ format_str = logging.Formatter(fmt)
551
+ self.logger.setLevel(self.level_relations.get(level))
552
+ sh = logging.StreamHandler()
553
+ sh.setFormatter(format_str)
554
+ th = handlers.TimedRotatingFileHandler(
555
+ filename=filename, when=when, backupCount=backCount, encoding="utf-8"
556
+ )
557
+ th.setFormatter(format_str)
558
+ self.logger.addHandler(sh)
559
+ self.logger.addHandler(th)
560
+ self.logger.info(
561
+ "==========================New Starting Here=============================="
562
+ )
563
+
564
+
565
+ def init_weights(m, mean=0.0, std=0.01):
566
+ classname = m.__class__.__name__
567
+ if classname.find("Conv") != -1:
568
+ m.weight.data.normal_(mean, std)
569
+
570
+
571
+ def get_padding(kernel_size, dilation=1):
572
+ return int((kernel_size * dilation - dilation) / 2)
573
+
574
+
575
+ def slice_segments(x, ids_str, segment_size=4):
576
+ ret = torch.zeros_like(x[:, :, :segment_size])
577
+ for i in range(x.size(0)):
578
+ idx_str = ids_str[i]
579
+ idx_end = idx_str + segment_size
580
+ ret[i] = x[i, :, idx_str:idx_end]
581
+ return ret
582
+
583
+
584
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
585
+ b, d, t = x.size()
586
+ if x_lengths is None:
587
+ x_lengths = t
588
+ ids_str_max = x_lengths - segment_size + 1
589
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
590
+ ret = slice_segments(x, ids_str, segment_size)
591
+ return ret, ids_str
592
+
593
+
594
+ def subsequent_mask(length):
595
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
596
+ return mask
597
+
598
+
599
+ @torch.jit.script
600
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
601
+ n_channels_int = n_channels[0]
602
+ in_act = input_a + input_b
603
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
604
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
605
+ acts = t_act * s_act
606
+ return acts
607
+
608
+
609
+ def convert_pad_shape(pad_shape):
610
+ l = pad_shape[::-1]
611
+ pad_shape = [item for sublist in l for item in sublist]
612
+ return pad_shape
613
+
614
+
615
+ def sequence_mask(length, max_length=None):
616
+ if max_length is None:
617
+ max_length = length.max()
618
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
619
+ return x.unsqueeze(0) < length.unsqueeze(1)
620
+
621
+
622
+ def generate_path(duration, mask):
623
+ """
624
+ duration: [b, 1, t_x]
625
+ mask: [b, 1, t_y, t_x]
626
+ """
627
+ device = duration.device
628
+
629
+ b, _, t_y, t_x = mask.shape
630
+ cum_duration = torch.cumsum(duration, -1)
631
+
632
+ cum_duration_flat = cum_duration.view(b * t_x)
633
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
634
+ path = path.view(b, t_x, t_y)
635
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
636
+ path = path.unsqueeze(1).transpose(2, 3) * mask
637
+ return path
638
+
639
+
640
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
641
+ if isinstance(parameters, torch.Tensor):
642
+ parameters = [parameters]
643
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
644
+ norm_type = float(norm_type)
645
+ if clip_value is not None:
646
+ clip_value = float(clip_value)
647
+
648
+ total_norm = 0
649
+ for p in parameters:
650
+ param_norm = p.grad.data.norm(norm_type)
651
+ total_norm += param_norm.item() ** norm_type
652
+ if clip_value is not None:
653
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
654
+ total_norm = total_norm ** (1.0 / norm_type)
655
+ return total_norm
656
+
657
+
658
+ def get_current_time():
659
+ pass
660
+
661
+
662
+ def make_pad_mask(lengths: torch.Tensor, max_len: int = 0) -> torch.Tensor:
663
+ """
664
+ Args:
665
+ lengths:
666
+ A 1-D tensor containing sentence lengths.
667
+ max_len:
668
+ The length of masks.
669
+ Returns:
670
+ Return a 2-D bool tensor, where masked positions
671
+ are filled with `True` and non-masked positions are
672
+ filled with `False`.
673
+
674
+ >>> lengths = torch.tensor([1, 3, 2, 5])
675
+ >>> make_pad_mask(lengths)
676
+ tensor([[False, True, True, True, True],
677
+ [False, False, False, True, True],
678
+ [False, False, True, True, True],
679
+ [False, False, False, False, False]])
680
+ """
681
+ assert lengths.ndim == 1, lengths.ndim
682
+ max_len = max(max_len, lengths.max())
683
+ n = lengths.size(0)
684
+ seq_range = torch.arange(0, max_len, device=lengths.device)
685
+ expanded_lengths = seq_range.unsqueeze(0).expand(n, max_len)
686
+
687
+ return expanded_lengths >= lengths.unsqueeze(-1)
688
+
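A small sketch of how the config helpers above compose (keys are illustrative only):

base = {"train": {"batch_size": 16, "lr": 1e-4}}
new = {"train": {"batch_size": 32}, "model": {"hidden": 256}}
merged = override_config(base, new)
# merged == {"train": {"batch_size": 32, "lr": 1e-4}, "model": {"hidden": 256}}

cfg = JsonHParams(**merged)
print(cfg.train.batch_size)  # 32; nested dicts become attribute-style objects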
utils/whisper.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import torch
7
+ import os
8
+ import pickle
9
+ from tqdm import tqdm
10
+ import numpy as np
11
+
12
+ from modules import whisper_extractor as whisper
13
+
14
+
15
+ def whisper_encoder_batch(model, audio_paths):
16
+ batch = len(audio_paths)
17
+ batch_mel = torch.zeros((batch, 80, 3000), dtype=torch.float32, device=model.device)
18
+
19
+ for i, audio_path in enumerate(audio_paths):
20
+ # (48000,)
21
+ audio = whisper.load_audio(str(audio_path))
22
+ audio = whisper.pad_or_trim(audio)
23
+
24
+ # (80, 3000)
25
+ mel = whisper.log_mel_spectrogram(audio).to(model.device)
26
+ batch_mel[i] = mel
27
+
28
+ with torch.no_grad():
29
+ # (batch, 1500, 1024)
30
+ features = model.embed_audio(batch_mel)
31
+
32
+ return features.cpu().detach().numpy()
33
+
34
+
35
+ def whisper_encoder(model, audio_path):
36
+ audio = whisper.load_audio(str(audio_path))
37
+ audio = whisper.pad_or_trim(audio)
38
+
39
+ # (80, 3000)
40
+ mel = whisper.log_mel_spectrogram(audio).to(model.device).unsqueeze(0)
41
+
42
+ with torch.no_grad():
43
+ # (1, 1500, 1024) -> # (1500, 1024)
44
+ features = model.embed_audio(mel).squeeze(0)
45
+
46
+ return features.cpu().detach().numpy()
47
+
48
+
49
+ def get_mapped_whisper_features(
50
+ raw_whisper_features, mapping_features, fast_mapping=True
51
+ ):
52
+ """
53
+ Whisper: frameshift = 20ms (30s audio -> 1500 frames), hop_size = 480 in 24k
54
+ # Ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/model.py#L136
55
+
56
+ Now it's only used for mapping to bigvgan's mels (sr = 24k, hop_size = 256, frameshift ~= 10.7 ms)
57
+ """
58
+ source_hop = 480
59
+ target_hop = 256
60
+
61
+ factor = np.gcd(source_hop, target_hop)
62
+ source_hop //= factor
63
+ target_hop //= factor
64
+ print(
65
+ "Mapping source's {} frames => target's {} frames".format(
66
+ target_hop, source_hop
67
+ )
68
+ )
69
+
70
+ max_source_len = 1500
71
+ whisper_features = []
72
+ for index, mapping_feat in enumerate(tqdm(mapping_features)):
73
+ # mapping_feat: (mels_frame_len, n_mels)
74
+ target_len = mapping_feat.shape[0]
75
+ # The max target_len is 2812
76
+ target_len = min(target_len, max_source_len * source_hop // target_hop)
77
+
78
+ # (1500, dim)
79
+ raw_feats = raw_whisper_features[index]
80
+ width = raw_feats.shape[-1]
81
+
82
+ if fast_mapping:
83
+ source_len = target_len * target_hop // source_hop + 1
84
+ raw_feats = raw_feats[:source_len]
85
+ else:
86
+ source_len = max_source_len
87
+
88
+ # const ~= target_len * target_hop
89
+ const = source_len * source_hop // target_hop * target_hop
90
+
91
+ # (source_len * source_hop, dim)
92
+ up_sampling_feats = np.repeat(raw_feats, source_hop, axis=0)
93
+ # (const, dim) -> (const/target_hop, target_hop, dim) -> (const/target_hop, dim)
94
+ down_sampling_feats = np.average(
95
+ up_sampling_feats[:const].reshape(-1, target_hop, width), axis=1
96
+ )
97
+ assert len(down_sampling_feats) >= target_len
98
+
99
+ # (target_len, dim)
100
+ feats = down_sampling_feats[:target_len]
101
+ whisper_features.append(feats)
102
+
103
+ return whisper_features
104
+
105
+
106
+ def load_whisper_model(hps):
107
+ print("Loading Whisper Model: ", hps.whisper_model)
108
+ model = whisper.load_model(hps.whisper_model)
109
+ if torch.cuda.is_available():
110
+ model = model.cuda()
111
+
112
+ model = model.eval()
113
+ return model
114
+
115
+
116
+ def load_target_acoustic_features(
117
+ output_path, dataset, acoustic_features_name, acoustic_features_fs, dataset_type
118
+ ):
119
+ mapping_dir = os.path.join(
120
+ output_path,
121
+ dataset,
122
+ "{}/{}".format(acoustic_features_name, acoustic_features_fs),
123
+ )
124
+ with open(os.path.join(mapping_dir, "{}.pkl".format(dataset_type)), "rb") as f:
125
+ mapping_features = pickle.load(f)
126
+
127
+ # Mels: (n_mels, frame_len) -> (frame_len, n_mels)
128
+ if acoustic_features_name == "mels":
129
+ print("Transposing mel features...")
130
+ mapping_features = [feat.T for feat in mapping_features]
131
+
132
+ print(
133
+ "Mapping to the acoustic features {}, #sz = {}, feats[0] is {}".format(
134
+ acoustic_features_name, len(mapping_features), mapping_features[0].shape
135
+ )
136
+ )
137
+ return mapping_features
138
+
139
+
140
+ def extract_whisper_features_of_dataset(
141
+ datasets,
142
+ model,
143
+ batch_size,
144
+ out_dir,
145
+ ):
146
+ audio_paths = [utt["Path"] for utt in datasets]
147
+ if len(audio_paths) < batch_size:
148
+ batch_size = len(audio_paths)
149
+
150
+ start, end = 0, 0
151
+ while end < len(audio_paths):
152
+ # Raw features: (batch_size, 1500, dim)
153
+ start = end
154
+ end = start + batch_size
155
+ tmp_raw_whisper_features = whisper_encoder_batch(model, audio_paths[start:end])
156
+
157
+ # Mapping to acoustic features' lengths
158
+ for index, utt in enumerate(tqdm(datasets[start:end])):
159
+ uid = utt["Uid"]
160
+ raw_whisper_feature = tmp_raw_whisper_features[index]
161
+
162
+ save_path = os.path.join(out_dir, uid + ".npy")
163
+ np.save(save_path, raw_whisper_feature)
164
+
165
+ print("{}/{} Done...".format(end, len(audio_paths)))
utils/world.py ADDED
@@ -0,0 +1,92 @@
1
+ # Copyright (c) 2023 Amphion.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ # 1. Extract WORLD features including F0, AP, SP
7
+ # 2. Transform between SP and MCEP
8
+ import torchaudio
9
+ import pyworld as pw
10
+ import numpy as np
11
+ import torch
12
+ import diffsptk
13
+ import os
14
+ from tqdm import tqdm
15
+ import pickle
16
+ import torchaudio
17
+
18
+
19
+ def get_mcep_params(fs):
20
+ """Hyperparameters of transformation between SP and MCEP
21
+
22
+ Reference:
23
+ https://github.com/CSTR-Edinburgh/merlin/blob/master/misc/scripts/vocoder/world_v2/copy_synthesis.sh
24
+
25
+ """
26
+ if fs in [44100, 48000]:
27
+ fft_size = 2048
28
+ alpha = 0.77
29
+ if fs in [16000]:
30
+ fft_size = 1024
31
+ alpha = 0.58
32
+ return fft_size, alpha
33
+
34
+
35
+ def extract_world_features(waveform, fs, frameshift=10):
36
+ # waveform: (1, seq)
37
+ # x: (seq,)
38
+ x = np.array(waveform, dtype=np.double).reshape(-1)  # flatten (1, seq) -> (seq,)
39
+
40
+ _f0, t = pw.dio(x, fs, frame_period=frameshift) # raw pitch extractor
41
+ f0 = pw.stonemask(x, _f0, t, fs) # pitch refinement
42
+ sp = pw.cheaptrick(x, f0, t, fs) # extract smoothed spectrogram
43
+ ap = pw.d4c(x, f0, t, fs) # extract aperiodicity
44
+
45
+ return f0, sp, ap, fs
46
+
47
+
48
+ def sp2mcep(x, mcsize, fs):
49
+ fft_size, alpha = get_mcep_params(fs)
50
+ x = torch.as_tensor(x, dtype=torch.float)
51
+
52
+ tmp = diffsptk.ScalarOperation("SquareRoot")(x)
53
+ tmp = diffsptk.ScalarOperation("Multiplication", 32768.0)(tmp)
54
+ mgc = diffsptk.MelCepstralAnalysis(
55
+ cep_order=mcsize - 1, fft_length=fft_size, alpha=alpha, n_iter=1
56
+ )(tmp)
57
+ return mgc.numpy()
58
+
59
+
60
+ def mcep2sp(x, mcsize, fs):
61
+ fft_size, alpha = get_mcep_params(fs)
62
+ x = torch.as_tensor(x, dtype=torch.float)
63
+
64
+ tmp = diffsptk.MelGeneralizedCepstrumToSpectrum(
65
+ alpha=alpha,
66
+ cep_order=mcsize - 1,
67
+ fft_length=fft_size,
68
+ )(x)
69
+ tmp = diffsptk.ScalarOperation("Division", 32768.0)(tmp)
70
+ sp = diffsptk.ScalarOperation("Power", 2)(tmp)
71
+ return sp.double().numpy()
72
+
73
+
74
+ def f0_statistics(f0_features, path):
75
+ print("\nF0 statistics...")
76
+
77
+ total_f0 = []
78
+ for f0 in tqdm(f0_features):
79
+ total_f0 += [f for f in f0 if f != 0]
80
+
81
+ mean = sum(total_f0) / len(total_f0)
82
+ print("Min = {}, Max = {}, Mean = {}".format(min(total_f0), max(total_f0), mean))
83
+
84
+ with open(path, "wb") as f:
85
+ pickle.dump([mean, total_f0], f)
86
+
87
+
88
+ def world_synthesis(f0, sp, ap, fs, frameshift):
89
+ y = pw.synthesize(
90
+ f0, sp, ap, fs, frame_period=frameshift
91
+ ) # synthesize an utterance using the parameters
92
+ return y
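A minimal analysis/synthesis round trip with the helpers above, assuming extract_world_features receives the sample rate fs as its second argument ("audio.wav" is a placeholder path):

import torchaudio

waveform, fs = torchaudio.load("audio.wav")
f0, sp, ap, fs = extract_world_features(waveform[0].numpy(), fs, frameshift=10)
y = world_synthesis(f0, sp, ap, fs, frameshift=10)  # resynthesized waveform as a numpy array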