committed on
Browse files- ckpts/svc/vocalist_l1_contentvec+whisper/args.json +257 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json +17 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0 +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1 +3 -0
- ckpts/svc/vocalist_l1_contentvec+whisper/singers.json +17 -0
- config/audioldm.json +92 -0
- config/autoencoderkl.json +69 -0
- config/base.json +220 -0
- config/comosvc.json +216 -0
- config/diffusion.json +227 -0
- config/fs2.json +117 -0
- config/transformer.json +180 -0
- config/tts.json +23 -0
- config/valle.json +52 -0
- config/vits.json +101 -0
- config/vocoder.json +84 -0
@@ -0,0 +1,257 @@
1 |
2 |
"task_type": "svc",
3 |
"dataset": [
4 |
5 |
6 |
"exp_name": "vocalist_l1_contentvec+whisper",
7 |
"inference": {
8 |
"diffusion": {
9 |
"scheduler": "pndm",
10 |
"scheduler_settings": {
11 |
"num_inference_timesteps": 1000,
12 |
13 |
14 |
15 |
"model": {
16 |
"condition_encoder": {
17 |
"content_encoder_dim": 384,
18 |
"contentvec_dim": 256,
19 |
"f0_max": 1100,
20 |
"f0_min": 50,
21 |
"input_loudness_dim": 1,
22 |
"input_melody_dim": 1,
23 |
"merge_mode": "add",
24 |
"mert_dim": 256,
25 |
"n_bins_loudness": 256,
26 |
"n_bins_melody": 256,
27 |
"output_content_dim": 384,
28 |
"output_loudness_dim": 384,
29 |
"output_melody_dim": 384,
30 |
"output_singer_dim": 384,
31 |
"pitch_max": 1100,
32 |
"pitch_min": 50,
33 |
"singer_table_size": 512,
34 |
"use_conformer_for_content_features": false,
35 |
"use_contentvec": true,
36 |
"use_log_f0": true,
37 |
"use_log_loudness": true,
38 |
"use_mert": false,
39 |
"use_singer_encoder": true,
40 |
"use_spkid": true,
41 |
"use_wenet": false,
42 |
"use_whisper": true,
43 |
"wenet_dim": 512,
44 |
"whisper_dim": 1024,
45 |
46 |
"diffusion": {
47 |
"bidilconv": {
48 |
"base_channel": 384,
49 |
"conditioner_size": 384,
50 |
"conv_kernel_size": 3,
51 |
"dilation_cycle_length": 4,
52 |
"n_res_block": 20,
53 |
54 |
"model_type": "bidilconv",
55 |
"scheduler": "ddpm",
56 |
"scheduler_settings": {
57 |
"beta_end": 0.02,
58 |
"beta_schedule": "linear",
59 |
"beta_start": 0.0001,
60 |
"num_train_timesteps": 1000,
61 |
62 |
"step_encoder": {
63 |
"activation": "SiLU",
64 |
"dim_hidden_layer": 512,
65 |
"dim_raw_embedding": 128,
66 |
"max_period": 10000,
67 |
"num_layer": 2,
68 |
69 |
"unet2d": {
70 |
"down_block_types": [
71 |
72 |
73 |
74 |
75 |
76 |
"in_channels": 1,
77 |
"mid_block_type": "UNetMidBlock2DCrossAttn",
78 |
"only_cross_attention": false,
79 |
"out_channels": 1,
80 |
"up_block_types": [
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
"model_type": "DiffWaveNetSVC",
90 |
"preprocess": {
91 |
"audio_dir": "audios",
92 |
"bits": 8,
93 |
"content_feature_batch_size": 16,
94 |
"contentvec_batch_size": 1,
95 |
"contentvec_dir": "contentvec",
96 |
"contentvec_file": "pretrained/contentvec/",
97 |
"contentvec_frameshift": 0.02,
98 |
"contentvec_sample_rate": 16000,
99 |
"dur_dir": "durs",
100 |
"duration_dir": "duration",
101 |
"emo2id": "emo2id.json",
102 |
"energy_dir": "energys",
103 |
"extract_audio": false,
104 |
"extract_contentvec_feature": true,
105 |
"extract_energy": true,
106 |
"extract_label": false,
107 |
"extract_mcep": false,
108 |
"extract_mel": true,
109 |
"extract_mert_feature": false,
110 |
"extract_pitch": true,
111 |
"extract_uv": true,
112 |
"extract_wenet_feature": false,
113 |
"extract_whisper_feature": true,
114 |
"f0_max": 1100,
115 |
"f0_min": 50,
116 |
"file_lst": "file.lst",
117 |
"fmax": 12000,
118 |
"fmin": 0,
119 |
"hop_size": 256,
120 |
"is_label": true,
121 |
"is_mu_law": true,
122 |
"lab_dir": "labs",
123 |
"label_dir": "labels",
124 |
"mcep_dir": "mcep",
125 |
"mel_dir": "mels",
126 |
"mel_min_max_norm": true,
127 |
"mel_min_max_stats_dir": "mel_min_max_stats",
128 |
"mert_dir": "mert",
129 |
"mert_feature_layer": -1,
130 |
"mert_frameshit": 0.01333,
131 |
"mert_hop_size": 320,
132 |
"mert_model": "m-a-p/MERT-v1-330M",
133 |
"min_level_db": -115,
134 |
"mu_law_norm": false,
135 |
"n_fft": 1024,
136 |
"n_mel": 100,
137 |
"num_silent_frames": 8,
138 |
"num_workers": 8,
139 |
"phone_seq_file": "phone_seq_file",
140 |
"pin_memory": true,
141 |
"pitch_bin": 256,
142 |
"pitch_dir": "pitches",
143 |
"pitch_extractor": "crepe", // "parselmouth"
144 |
"pitch_max": 1100.0,
145 |
"pitch_min": 50.0,
146 |
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
147 |
"ref_level_db": 20,
148 |
"sample_rate": 24000,
149 |
"spk2id": "singers.json",
150 |
"train_file": "train.json",
151 |
"trim_fft_size": 512,
152 |
"trim_hop_size": 128,
153 |
"trim_silence": false,
154 |
"trim_top_db": 30,
155 |
"trimmed_wav_dir": "trimmed_wavs",
156 |
"use_audio": false,
157 |
"use_contentvec": true,
158 |
"use_dur": false,
159 |
"use_emoid": false,
160 |
"use_frame_duration": false,
161 |
"use_frame_energy": true,
162 |
"use_frame_pitch": true,
163 |
"use_lab": false,
164 |
"use_label": false,
165 |
"use_log_scale_energy": false,
166 |
"use_log_scale_pitch": false,
167 |
"use_mel": true,
168 |
"use_mert": false,
169 |
"use_min_max_norm_mel": true,
170 |
"use_one_hot": false,
171 |
"use_phn_seq": false,
172 |
"use_phone_duration": false,
173 |
"use_phone_energy": false,
174 |
"use_phone_pitch": false,
175 |
"use_spkid": true,
176 |
"use_uv": true,
177 |
"use_wav": false,
178 |
"use_wenet": false,
179 |
"use_whisper": true,
180 |
"utt2emo": "utt2emo",
181 |
"utt2spk": "utt2singer",
182 |
"uv_dir": "uvs",
183 |
"valid_file": "test.json",
184 |
"wav_dir": "wavs",
185 |
"wenet_batch_size": 1,
186 |
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
187 |
"wenet_dir": "wenet",
188 |
"wenet_downsample_rate": 4,
189 |
"wenet_frameshift": 0.01,
190 |
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/",
191 |
"wenet_sample_rate": 16000,
192 |
"whisper_batch_size": 30,
193 |
"whisper_dir": "whisper",
194 |
"whisper_downsample_rate": 2,
195 |
"whisper_frameshift": 0.01,
196 |
"whisper_model": "medium",
197 |
"whisper_model_path": "pretrained/whisper/",
198 |
"whisper_sample_rate": 16000,
199 |
"win_size": 1024,
200 |
201 |
"supported_model_type": [
202 |
203 |
204 |
205 |
206 |
207 |
208 |
"train": {
209 |
"adamw": {
210 |
"lr": 0.0004,
211 |
212 |
"batch_size": 32,
213 |
"dataloader": {
214 |
"num_worker": 8,
215 |
"pin_memory": true,
216 |
217 |
"ddp": true,
218 |
"epochs": 50000,
219 |
"gradient_accumulation_step": 1,
220 |
"keep_checkpoint_max": 5,
221 |
"keep_last": [
222 |
223 |
224 |
225 |
"max_epoch": -1,
226 |
"max_steps": 1000000,
227 |
"multi_speaker_training": false,
228 |
"optimizer": "AdamW",
229 |
"random_seed": 10086,
230 |
"reducelronplateau": {
231 |
"factor": 0.8,
232 |
"min_lr": 0.0001,
233 |
"patience": 10,
234 |
235 |
"run_eval": [
236 |
237 |
238 |
239 |
"sampler": {
240 |
"drop_last": true,
241 |
"holistic_shuffle": false,
242 |
243 |
"save_checkpoint_stride": [
244 |
245 |
246 |
247 |
"save_checkpoints_steps": 10000,
248 |
"save_summary_steps": 500,
249 |
"scheduler": "ReduceLROnPlateau",
250 |
"total_training_steps": 50000,
251 |
"tracker": [
252 |
253 |
254 |
"valid_interval": 10000,
255 |
256 |
"use_custom_dataset": true,
257 |
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:836af10b834c7aec9209eb19ce43559e6ef1e3a59bd6468e90cadbc9a18749ef
3 |
size 249512389
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:d54eed12bef331095fc367f196d07c5061d5cb72dd6fe0e1e4453b997bf1d68d
3 |
size 124755137
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:6798ddffadcd7d5405a77e667c674c474e4fef0cba817fdd300c7c985c1e82fe
3 |
size 14599
@@ -0,0 +1,17 @@
1 |
2 |
"vocalist_l1_Adele": 0,
3 |
"vocalist_l1_Beyonce": 1,
4 |
"vocalist_l1_BrunoMars": 2,
5 |
"vocalist_l1_JohnMayer": 3,
6 |
"vocalist_l1_MichaelJackson": 4,
7 |
"vocalist_l1_TaylorSwift": 5,
8 |
"vocalist_l1_张学友": 6,
9 |
"vocalist_l1_李健": 7,
10 |
"vocalist_l1_汪峰": 8,
11 |
"vocalist_l1_王菲": 9,
12 |
"vocalist_l1_石倚洁": 10,
13 |
"vocalist_l1_蔡琴": 11,
14 |
"vocalist_l1_那英": 12,
15 |
"vocalist_l1_陈奕迅": 13,
16 |
"vocalist_l1_陶喆": 14
17 |
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:d7f490fd0c97876e24bfc44413365ded7ff5d22c1c79f0dac0b754f3b32df76f
3 |
size 88
@@ -0,0 +1,3 @@
1 |
2 |
oid sha256:e01bcf2fa621ba563b70568c18fe0742d0f48cafae83a6e8beb0bb6d1f6d146d
3 |
size 77413046
@@ -0,0 +1,17 @@
1 |
2 |
"vocalist_l1_Adele": 0,
3 |
"vocalist_l1_Beyonce": 1,
4 |
"vocalist_l1_BrunoMars": 2,
5 |
"vocalist_l1_JohnMayer": 3,
6 |
"vocalist_l1_MichaelJackson": 4,
7 |
"vocalist_l1_TaylorSwift": 5,
8 |
"vocalist_l1_张学友": 6,
9 |
"vocalist_l1_李健": 7,
10 |
"vocalist_l1_汪峰": 8,
11 |
"vocalist_l1_王菲": 9,
12 |
"vocalist_l1_石倚洁": 10,
13 |
"vocalist_l1_蔡琴": 11,
14 |
"vocalist_l1_那英": 12,
15 |
"vocalist_l1_陈奕迅": 13,
16 |
"vocalist_l1_陶喆": 14
17 |
@@ -0,0 +1,92 @@
1 |
2 |
"base_config": "config/base.json",
3 |
"model_type": "AudioLDM",
4 |
"task_type": "tta",
5 |
"dataset": [
6 |
7 |
8 |
"preprocess": {
9 |
// feature used for model training
10 |
"use_spkid": false,
11 |
"use_uv": false,
12 |
"use_frame_pitch": false,
13 |
"use_phone_pitch": false,
14 |
"use_frame_energy": false,
15 |
"use_phone_energy": false,
16 |
"use_mel": false,
17 |
"use_audio": false,
18 |
"use_label": false,
19 |
"use_one_hot": false,
20 |
"cond_mask_prob": 0.1
21 |
22 |
// model
23 |
"model": {
24 |
"audioldm": {
25 |
"image_size": 32,
26 |
"in_channels": 4,
27 |
"out_channels": 4,
28 |
"model_channels": 256,
29 |
"attention_resolutions": [
30 |
31 |
32 |
33 |
34 |
"num_res_blocks": 2,
35 |
"channel_mult": [
36 |
37 |
38 |
39 |
40 |
"num_heads": 8,
41 |
"use_spatial_transformer": true,
42 |
"transformer_depth": 1,
43 |
"context_dim": 768,
44 |
"use_checkpoint": true,
45 |
"legacy": false
46 |
47 |
"autoencoderkl": {
48 |
"ch": 128,
49 |
"ch_mult": [
50 |
51 |
52 |
53 |
54 |
55 |
56 |
"num_res_blocks": 2,
57 |
"in_channels": 1,
58 |
"z_channels": 4,
59 |
"out_ch": 1,
60 |
"double_z": true
61 |
62 |
"noise_scheduler": {
63 |
"num_train_timesteps": 1000,
64 |
"beta_start": 0.00085,
65 |
"beta_end": 0.012,
66 |
"beta_schedule": "scaled_linear",
67 |
"clip_sample": false,
68 |
"steps_offset": 1,
69 |
"set_alpha_to_one": false,
70 |
"skip_prk_steps": true,
71 |
"prediction_type": "epsilon"
72 |
73 |
74 |
// train
75 |
"train": {
76 |
"lronPlateau": {
77 |
"factor": 0.9,
78 |
"patience": 100,
79 |
"min_lr": 4.0e-5,
80 |
"verbose": true
81 |
82 |
"adam": {
83 |
"lr": 5.0e-5,
84 |
"betas": [
85 |
86 |
87 |
88 |
"weight_decay": 1.0e-2,
89 |
"eps": 1.0e-8
90 |
91 |
92 |
@@ -0,0 +1,69 @@
1 |
2 |
"base_config": "config/base.json",
3 |
"model_type": "AutoencoderKL",
4 |
"task_type": "tta",
5 |
"dataset": [
6 |
7 |
8 |
"preprocess": {
9 |
// feature used for model training
10 |
"use_spkid": false,
11 |
"use_uv": false,
12 |
"use_frame_pitch": false,
13 |
"use_phone_pitch": false,
14 |
"use_frame_energy": false,
15 |
"use_phone_energy": false,
16 |
"use_mel": false,
17 |
"use_audio": false,
18 |
"use_label": false,
19 |
"use_one_hot": false
20 |
21 |
// model
22 |
"model": {
23 |
"autoencoderkl": {
24 |
"ch": 128,
25 |
"ch_mult": [
26 |
27 |
28 |
29 |
30 |
31 |
32 |
"num_res_blocks": 2,
33 |
"in_channels": 1,
34 |
"z_channels": 4,
35 |
"out_ch": 1,
36 |
"double_z": true
37 |
38 |
"loss": {
39 |
"kl_weight": 1e-8,
40 |
"disc_weight": 0.5,
41 |
"disc_factor": 1.0,
42 |
"logvar_init": 0.0,
43 |
"min_adapt_d_weight": 0.0,
44 |
"max_adapt_d_weight": 10.0,
45 |
"disc_start": 50001,
46 |
"disc_in_channels": 1,
47 |
"disc_num_layers": 3,
48 |
"use_actnorm": false
49 |
50 |
51 |
// train
52 |
"train": {
53 |
"lronPlateau": {
54 |
"factor": 0.9,
55 |
"patience": 100,
56 |
"min_lr": 4.0e-5,
57 |
"verbose": true
58 |
59 |
"adam": {
60 |
"lr": 4.0e-4,
61 |
"betas": [
62 |
63 |
64 |
65 |
"weight_decay": 1.0e-2,
66 |
"eps": 1.0e-8
67 |
68 |
69 |
@@ -0,0 +1,220 @@
1 |
2 |
"supported_model_type": [
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
"task_type": "",
11 |
"dataset": [],
12 |
"use_custom_dataset": false,
13 |
"preprocess": {
14 |
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
15 |
// trim audio silence
16 |
"data_augment": false,
17 |
"trim_silence": false,
18 |
"num_silent_frames": 8,
19 |
"trim_fft_size": 512, // fft size used in trimming
20 |
"trim_hop_size": 128, // hop size used in trimming
21 |
"trim_top_db": 30, // top db used in trimming sensitive to each dataset
22 |
// acoustic features
23 |
"extract_mel": false,
24 |
"mel_extract_mode": "",
25 |
"extract_linear_spec": false,
26 |
"extract_mcep": false,
27 |
"extract_pitch": false,
28 |
"extract_acoustic_token": false,
29 |
"pitch_remove_outlier": false,
30 |
"extract_uv": false,
31 |
"pitch_norm": false,
32 |
"extract_audio": false,
33 |
"extract_label": false,
34 |
"pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
35 |
"extract_energy": false,
36 |
"energy_remove_outlier": false,
37 |
"energy_norm": false,
38 |
"energy_extract_mode": "from_mel",
39 |
"extract_duration": false,
40 |
"extract_amplitude_phase": false,
41 |
"mel_min_max_norm": false,
42 |
// linguistic features
43 |
"extract_phone": false,
44 |
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
45 |
// content features
46 |
"extract_whisper_feature": false,
47 |
"extract_contentvec_feature": false,
48 |
"extract_mert_feature": false,
49 |
"extract_wenet_feature": false,
50 |
// Settings for data preprocessing
51 |
"n_mel": 80,
52 |
"win_size": 480,
53 |
"hop_size": 120,
54 |
"sample_rate": 24000,
55 |
"n_fft": 1024,
56 |
"fmin": 0,
57 |
"fmax": 12000,
58 |
"min_level_db": -115,
59 |
"ref_level_db": 20,
60 |
"bits": 8,
61 |
// Directory names of processed data or extracted features
62 |
"processed_dir": "processed_data",
63 |
"trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
64 |
"raw_data": "raw_data",
65 |
"phone_dir": "phones",
66 |
"wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
67 |
"audio_dir": "audios",
68 |
"log_amplitude_dir": "log_amplitudes",
69 |
"phase_dir": "phases",
70 |
"real_dir": "reals",
71 |
"imaginary_dir": "imaginarys",
72 |
"label_dir": "labels",
73 |
"linear_dir": "linears",
74 |
"mel_dir": "mels", // directory name of extraced mel features
75 |
"mcep_dir": "mcep", // directory name of extraced mcep features
76 |
"dur_dir": "durs",
77 |
"symbols_dict": "symbols.dict",
78 |
"lab_dir": "labs", // directory name of extraced label features
79 |
"wenet_dir": "wenet", // directory name of extraced wenet features
80 |
"contentvec_dir": "contentvec", // directory name of extraced wenet features
81 |
"pitch_dir": "pitches", // directory name of extraced pitch features
82 |
"energy_dir": "energys", // directory name of extracted energy features
83 |
"phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
84 |
"phone_energy_dir": "phone_energys", // directory name of extracted energy features
85 |
"uv_dir": "uvs", // directory name of extracted unvoiced features
86 |
"duration_dir": "duration", // ground-truth duration file
87 |
"phone_seq_file": "phone_seq_file", // phoneme sequence file
88 |
"file_lst": "file.lst",
89 |
"train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
90 |
"valid_file": "valid.json", // validattion set
91 |
"spk2id": "spk2id.json", // used for multi-speaker dataset
92 |
"utt2spk": "utt2spk", // used for multi-speaker dataset
93 |
"emo2id": "emo2id.json", // used for multi-emotion dataset
94 |
"utt2emo": "utt2emo", // used for multi-emotion dataset
95 |
// Features used for model training
96 |
"use_text": false,
97 |
"use_phone": false,
98 |
"use_phn_seq": false,
99 |
"use_lab": false,
100 |
"use_linear": false,
101 |
"use_mel": false,
102 |
"use_min_max_norm_mel": false,
103 |
"use_wav": false,
104 |
"use_phone_pitch": false,
105 |
"use_log_scale_pitch": false,
106 |
"use_phone_energy": false,
107 |
"use_phone_duration": false,
108 |
"use_log_scale_energy": false,
109 |
"use_wenet": false,
110 |
"use_dur": false,
111 |
"use_spkid": false, // True: use speaker id for multi-speaker dataset
112 |
"use_emoid": false, // True: use emotion id for multi-emotion dataset
113 |
"use_frame_pitch": false,
114 |
"use_uv": false,
115 |
"use_frame_energy": false,
116 |
"use_frame_duration": false,
117 |
"use_audio": false,
118 |
"use_label": false,
119 |
"use_one_hot": false,
120 |
"use_amplitude_phase": false,
121 |
"data_augment": false,
122 |
"align_mel_duration": false
123 |
124 |
"train": {
125 |
"ddp": true,
126 |
"random_seed": 970227,
127 |
"batch_size": 16,
128 |
"max_steps": 1000000,
129 |
// Trackers
130 |
"tracker": [
131 |
132 |
// "wandb",
133 |
// "cometml",
134 |
// "mlflow",
135 |
136 |
"max_epoch": -1,
137 |
// -1 means no limit
138 |
"save_checkpoint_stride": [
139 |
140 |
141 |
142 |
// unit is epoch
143 |
"keep_last": [
144 |
145 |
146 |
147 |
// -1 means infinite, if one number will broadcast
148 |
"run_eval": [
149 |
150 |
151 |
152 |
// if one number will broadcast
153 |
// Fix the random seed
154 |
"random_seed": 10086,
155 |
// Optimizer
156 |
"optimizer": "AdamW",
157 |
"adamw": {
158 |
"lr": 4.0e-4
159 |
// nn model lr
160 |
161 |
// LR Scheduler
162 |
"scheduler": "ReduceLROnPlateau",
163 |
"reducelronplateau": {
164 |
"factor": 0.8,
165 |
"patience": 10,
166 |
// unit is epoch
167 |
"min_lr": 1.0e-4
168 |
169 |
// Batchsampler
170 |
"sampler": {
171 |
"holistic_shuffle": true,
172 |
"drop_last": true
173 |
174 |
// Dataloader
175 |
"dataloader": {
176 |
"num_worker": 32,
177 |
"pin_memory": true
178 |
179 |
"gradient_accumulation_step": 1,
180 |
"total_training_steps": 50000,
181 |
"save_summary_steps": 500,
182 |
"save_checkpoints_steps": 10000,
183 |
"valid_interval": 10000,
184 |
"keep_checkpoint_max": 5,
185 |
"multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
186 |
"max_epoch": -1,
187 |
// -1 means no limit
188 |
"save_checkpoint_stride": [
189 |
190 |
191 |
192 |
// unit is epoch
193 |
"keep_last": [
194 |
195 |
196 |
197 |
// -1 means infinite, if one number will broadcast
198 |
"run_eval": [
199 |
200 |
201 |
202 |
// Batchsampler
203 |
"sampler": {
204 |
"holistic_shuffle": true,
205 |
"drop_last": true
206 |
207 |
// Dataloader
208 |
"dataloader": {
209 |
"num_worker": 32,
210 |
"pin_memory": true
211 |
212 |
// Trackers
213 |
"tracker": [
214 |
215 |
// "wandb",
216 |
// "cometml",
217 |
// "mlflow",
218 |
219 |
220 |
@@ -0,0 +1,216 @@
1 |
2 |
"base_config": "config/base.json",
3 |
"model_type": "DiffComoSVC",
4 |
"task_type": "svc",
5 |
"use_custom_dataset": false,
6 |
"preprocess": {
7 |
// data augmentations
8 |
"use_pitch_shift": false,
9 |
"use_formant_shift": false,
10 |
"use_time_stretch": false,
11 |
"use_equalizer": false,
12 |
// acoustic features
13 |
"extract_mel": true,
14 |
"mel_min_max_norm": true,
15 |
"extract_pitch": true,
16 |
"pitch_extractor": "parselmouth",
17 |
"extract_uv": true,
18 |
"extract_energy": true,
19 |
// content features
20 |
"extract_whisper_feature": false,
21 |
"whisper_sample_rate": 16000,
22 |
"extract_contentvec_feature": false,
23 |
"contentvec_sample_rate": 16000,
24 |
"extract_wenet_feature": false,
25 |
"wenet_sample_rate": 16000,
26 |
"extract_mert_feature": false,
27 |
"mert_sample_rate": 16000,
28 |
// Default config for whisper
29 |
"whisper_frameshift": 0.01,
30 |
"whisper_downsample_rate": 2,
31 |
// Default config for content vector
32 |
"contentvec_frameshift": 0.02,
33 |
// Default config for mert
34 |
"mert_model": "m-a-p/MERT-v1-330M",
35 |
"mert_feature_layer": -1,
36 |
"mert_hop_size": 320,
37 |
// 24k
38 |
"mert_frameshit": 0.01333,
39 |
// 10ms
40 |
"wenet_frameshift": 0.01,
41 |
// wenetspeech is 4, gigaspeech is 6
42 |
"wenet_downsample_rate": 4,
43 |
// Default config
44 |
"n_mel": 100,
45 |
"win_size": 1024,
46 |
// todo
47 |
"hop_size": 256,
48 |
"sample_rate": 24000,
49 |
"n_fft": 1024,
50 |
// todo
51 |
"fmin": 0,
52 |
"fmax": 12000,
53 |
// todo
54 |
"f0_min": 50,
55 |
// ~C2
56 |
"f0_max": 1100,
57 |
//1100, // ~C6(1100), ~G5(800)
58 |
"pitch_bin": 256,
59 |
"pitch_max": 1100.0,
60 |
"pitch_min": 50.0,
61 |
"is_label": true,
62 |
"is_mu_law": true,
63 |
"bits": 8,
64 |
"mel_min_max_stats_dir": "mel_min_max_stats",
65 |
"whisper_dir": "whisper",
66 |
"contentvec_dir": "contentvec",
67 |
"wenet_dir": "wenet",
68 |
"mert_dir": "mert",
69 |
// Extract content features using dataloader
70 |
"pin_memory": true,
71 |
"num_workers": 8,
72 |
"content_feature_batch_size": 16,
73 |
// Features used for model training
74 |
"use_mel": true,
75 |
"use_min_max_norm_mel": true,
76 |
"use_frame_pitch": true,
77 |
"use_uv": true,
78 |
"use_frame_energy": true,
79 |
"use_log_scale_pitch": false,
80 |
"use_log_scale_energy": false,
81 |
"use_spkid": true,
82 |
// Meta file
83 |
"train_file": "train.json",
84 |
"valid_file": "test.json",
85 |
"spk2id": "singers.json",
86 |
"utt2spk": "utt2singer"
87 |
88 |
"model": {
89 |
"teacher_model_path": "[Your Teacher Model Path].bin",
90 |
"condition_encoder": {
91 |
"merge_mode": "add",
92 |
"input_melody_dim": 1,
93 |
"use_log_f0": true,
94 |
"n_bins_melody": 256,
95 |
//# Quantization (0 for not quantization)
96 |
"output_melody_dim": 384,
97 |
"input_loudness_dim": 1,
98 |
"use_log_loudness": true,
99 |
"n_bins_loudness": 256,
100 |
"output_loudness_dim": 384,
101 |
"use_whisper": false,
102 |
"use_contentvec": false,
103 |
"use_wenet": false,
104 |
"use_mert": false,
105 |
"whisper_dim": 1024,
106 |
"contentvec_dim": 256,
107 |
"mert_dim": 256,
108 |
"wenet_dim": 512,
109 |
"content_encoder_dim": 384,
110 |
"output_singer_dim": 384,
111 |
"singer_table_size": 512,
112 |
"output_content_dim": 384,
113 |
"use_spkid": true
114 |
115 |
"comosvc": {
116 |
"distill": false,
117 |
// conformer encoder
118 |
"input_dim": 384,
119 |
"output_dim": 100,
120 |
"n_heads": 2,
121 |
"n_layers": 6,
122 |
"filter_channels": 512,
123 |
"dropout": 0.1,
124 |
// karras diffusion
125 |
"P_mean": -1.2,
126 |
"P_std": 1.2,
127 |
"sigma_data": 0.5,
128 |
"sigma_min": 0.002,
129 |
"sigma_max": 80,
130 |
"rho": 7,
131 |
"n_timesteps": 40,
132 |
133 |
"diffusion": {
134 |
// Diffusion steps encoder
135 |
"step_encoder": {
136 |
"dim_raw_embedding": 128,
137 |
"dim_hidden_layer": 512,
138 |
"activation": "SiLU",
139 |
"num_layer": 2,
140 |
"max_period": 10000
141 |
142 |
// Diffusion decoder
143 |
"model_type": "bidilconv",
144 |
// bidilconv, unet2d, TODO: unet1d
145 |
"bidilconv": {
146 |
"base_channel": 384,
147 |
"n_res_block": 20,
148 |
"conv_kernel_size": 3,
149 |
"dilation_cycle_length": 4,
150 |
// specially, 1 means no dilation
151 |
"conditioner_size": 100
152 |
153 |
154 |
155 |
"train": {
156 |
// Basic settings
157 |
"fast_steps": 0,
158 |
"batch_size": 32,
159 |
"gradient_accumulation_step": 1,
160 |
"max_epoch": -1,
161 |
// -1 means no limit
162 |
"save_checkpoint_stride": [
163 |
164 |
165 |
166 |
// unit is epoch
167 |
"keep_last": [
168 |
169 |
170 |
171 |
// -1 means infinite, if one number will broadcast
172 |
"run_eval": [
173 |
174 |
175 |
176 |
// if one number will broadcast
177 |
// Fix the random seed
178 |
"random_seed": 10086,
179 |
// Batchsampler
180 |
"sampler": {
181 |
"holistic_shuffle": true,
182 |
"drop_last": true
183 |
184 |
// Dataloader
185 |
"dataloader": {
186 |
"num_worker": 32,
187 |
"pin_memory": true
188 |
189 |
// Trackers
190 |
"tracker": [
191 |
192 |
// "wandb",
193 |
// "cometml",
194 |
// "mlflow",
195 |
196 |
// Optimizer
197 |
"optimizer": "AdamW",
198 |
"adamw": {
199 |
"lr": 4.0e-4
200 |
// nn model lr
201 |
202 |
// LR Scheduler
203 |
"scheduler": "ReduceLROnPlateau",
204 |
"reducelronplateau": {
205 |
"factor": 0.8,
206 |
"patience": 10,
207 |
// unit is epoch
208 |
"min_lr": 1.0e-4
209 |
210 |
211 |
"inference": {
212 |
"comosvc": {
213 |
"inference_steps": 40
214 |
215 |
216 |
@@ -0,0 +1,227 @@
1 |
2 |
3 |
"base_config": "config/base.json",
4 |
"model_type": "diffusion",
5 |
"task_type": "svc",
6 |
"use_custom_dataset": false,
7 |
"preprocess": {
8 |
// data augmentations
9 |
"use_pitch_shift": false,
10 |
"use_formant_shift": false,
11 |
"use_time_stretch": false,
12 |
"use_equalizer": false,
13 |
// acoustic features
14 |
"extract_mel": true,
15 |
"mel_min_max_norm": true,
16 |
"extract_pitch": true,
17 |
"pitch_extractor": "parselmouth",
18 |
"extract_uv": true,
19 |
"extract_energy": true,
20 |
// content features
21 |
"extract_whisper_feature": false,
22 |
"whisper_sample_rate": 16000,
23 |
"extract_contentvec_feature": false,
24 |
"contentvec_sample_rate": 16000,
25 |
"extract_wenet_feature": false,
26 |
"wenet_sample_rate": 16000,
27 |
"extract_mert_feature": false,
28 |
"mert_sample_rate": 16000,
29 |
// Default config for whisper
30 |
"whisper_frameshift": 0.01,
31 |
"whisper_downsample_rate": 2,
32 |
// Default config for content vector
33 |
"contentvec_frameshift": 0.02,
34 |
// Default config for mert
35 |
"mert_model": "m-a-p/MERT-v1-330M",
36 |
"mert_feature_layer": -1,
37 |
"mert_hop_size": 320,
38 |
// 24k
39 |
"mert_frameshit": 0.01333,
40 |
// 10ms
41 |
"wenet_frameshift": 0.01,
42 |
// wenetspeech is 4, gigaspeech is 6
43 |
"wenet_downsample_rate": 4,
44 |
// Default config
45 |
"n_mel": 100,
46 |
"win_size": 1024,
47 |
// todo
48 |
"hop_size": 256,
49 |
"sample_rate": 24000,
50 |
"n_fft": 1024,
51 |
// todo
52 |
"fmin": 0,
53 |
"fmax": 12000,
54 |
// todo
55 |
"f0_min": 50,
56 |
// ~C2
57 |
"f0_max": 1100,
58 |
//1100, // ~C6(1100), ~G5(800)
59 |
"pitch_bin": 256,
60 |
"pitch_max": 1100.0,
61 |
"pitch_min": 50.0,
62 |
"is_label": true,
63 |
"is_mu_law": true,
64 |
"bits": 8,
65 |
"mel_min_max_stats_dir": "mel_min_max_stats",
66 |
"whisper_dir": "whisper",
67 |
"contentvec_dir": "contentvec",
68 |
"wenet_dir": "wenet",
69 |
"mert_dir": "mert",
70 |
// Extract content features using dataloader
71 |
"pin_memory": true,
72 |
"num_workers": 8,
73 |
"content_feature_batch_size": 16,
74 |
// Features used for model training
75 |
"use_mel": true,
76 |
"use_min_max_norm_mel": true,
77 |
"use_frame_pitch": true,
78 |
"use_uv": true,
79 |
"use_frame_energy": true,
80 |
"use_log_scale_pitch": false,
81 |
"use_log_scale_energy": false,
82 |
"use_spkid": true,
83 |
// Meta file
84 |
"train_file": "train.json",
85 |
"valid_file": "test.json",
86 |
"spk2id": "singers.json",
87 |
"utt2spk": "utt2singer"
88 |
89 |
"model": {
90 |
"condition_encoder": {
91 |
"merge_mode": "add",
92 |
"input_melody_dim": 1,
93 |
"use_log_f0": true,
94 |
"n_bins_melody": 256,
95 |
//# Quantization (0 for not quantization)
96 |
"output_melody_dim": 384,
97 |
"input_loudness_dim": 1,
98 |
"use_log_loudness": true,
99 |
"n_bins_loudness": 256,
100 |
"output_loudness_dim": 384,
101 |
"use_whisper": false,
102 |
"use_contentvec": false,
103 |
"use_wenet": false,
104 |
"use_mert": false,
105 |
"whisper_dim": 1024,
106 |
"contentvec_dim": 256,
107 |
"mert_dim": 256,
108 |
"wenet_dim": 512,
109 |
"content_encoder_dim": 384,
110 |
"output_singer_dim": 384,
111 |
"singer_table_size": 512,
112 |
"output_content_dim": 384,
113 |
"use_spkid": true
114 |
115 |
116 |
"diffusion": {
117 |
"scheduler": "ddpm",
118 |
"scheduler_settings": {
119 |
"num_train_timesteps": 1000,
120 |
"beta_start": 1.0e-4,
121 |
"beta_end": 0.02,
122 |
"beta_schedule": "linear"
123 |
124 |
// Diffusion steps encoder
125 |
"step_encoder": {
126 |
"dim_raw_embedding": 128,
127 |
"dim_hidden_layer": 512,
128 |
"activation": "SiLU",
129 |
"num_layer": 2,
130 |
"max_period": 10000
131 |
132 |
// Diffusion decoder
133 |
"model_type": "bidilconv",
134 |
// bidilconv, unet2d, TODO: unet1d
135 |
"bidilconv": {
136 |
"base_channel": 384,
137 |
"n_res_block": 20,
138 |
"conv_kernel_size": 3,
139 |
"dilation_cycle_length": 4,
140 |
// specially, 1 means no dilation
141 |
"conditioner_size": 384
142 |
143 |
"unet2d": {
144 |
"in_channels": 1,
145 |
"out_channels": 1,
146 |
"down_block_types": [
147 |
148 |
149 |
150 |
151 |
152 |
"mid_block_type": "UNetMidBlock2DCrossAttn",
153 |
"up_block_types": [
154 |
155 |
156 |
157 |
158 |
159 |
"only_cross_attention": false
160 |
161 |
162 |
163 |
164 |
"train": {
165 |
// Basic settings
166 |
"batch_size": 64,
167 |
"gradient_accumulation_step": 1,
168 |
"max_epoch": -1,
169 |
// -1 means no limit
170 |
"save_checkpoint_stride": [
171 |
172 |
173 |
174 |
// unit is epoch
175 |
"keep_last": [
176 |
177 |
178 |
179 |
// -1 means infinite, if one number will broadcast
180 |
"run_eval": [
181 |
182 |
183 |
184 |
// if one number will broadcast
185 |
// Fix the random seed
186 |
"random_seed": 10086,
187 |
// Batchsampler
188 |
"sampler": {
189 |
"holistic_shuffle": true,
190 |
"drop_last": true
191 |
192 |
// Dataloader
193 |
"dataloader": {
194 |
"num_worker": 32,
195 |
"pin_memory": true
196 |
197 |
// Trackers
198 |
"tracker": [
199 |
200 |
// "wandb",
201 |
// "cometml",
202 |
// "mlflow",
203 |
204 |
// Optimizer
205 |
"optimizer": "AdamW",
206 |
"adamw": {
207 |
"lr": 4.0e-4
208 |
// nn model lr
209 |
210 |
// LR Scheduler
211 |
"scheduler": "ReduceLROnPlateau",
212 |
"reducelronplateau": {
213 |
"factor": 0.8,
214 |
"patience": 10,
215 |
// unit is epoch
216 |
"min_lr": 1.0e-4
217 |
218 |
219 |
"inference": {
220 |
"diffusion": {
221 |
"scheduler": "pndm",
222 |
"scheduler_settings": {
223 |
"num_inference_timesteps": 1000
224 |
225 |
226 |
227 |
@@ -0,0 +1,117 @@
1 |
2 |
"base_config": "config/tts.json",
3 |
"model_type": "FastSpeech2",
4 |
"task_type": "tts",
5 |
"dataset": ["LJSpeech"],
6 |
"preprocess": {
7 |
// acoustic features
8 |
"extract_audio": true,
9 |
"extract_mel": true,
10 |
"mel_extract_mode": "taco",
11 |
"mel_min_max_norm": false,
12 |
"extract_pitch": true,
13 |
"extract_uv": false,
14 |
"pitch_extractor": "dio",
15 |
"extract_energy": true,
16 |
"energy_extract_mode": "from_tacotron_stft",
17 |
"extract_duration": true,
18 |
"use_phone": true,
19 |
"pitch_norm": true,
20 |
"energy_norm": true,
21 |
"pitch_remove_outlier": true,
22 |
"energy_remove_outlier": true,
23 |
24 |
// Default config
25 |
"n_mel": 80,
26 |
"win_size": 1024, // todo
27 |
"hop_size": 256,
28 |
"sample_rate": 22050,
29 |
"n_fft": 1024, // todo
30 |
"fmin": 0,
31 |
"fmax": 8000, // todo
32 |
"raw_data": "raw_data",
33 |
"text_cleaners": ["english_cleaners"],
34 |
"f0_min": 71, // ~C2
35 |
"f0_max": 800, //1100, // ~C6(1100), ~G5(800)
36 |
"pitch_bin": 256,
37 |
"pitch_max": 1100.0,
38 |
"pitch_min": 50.0,
39 |
"is_label": true,
40 |
"is_mu_law": true,
41 |
"bits": 8,
42 |
43 |
"mel_min_max_stats_dir": "mel_min_max_stats",
44 |
"whisper_dir": "whisper",
45 |
"content_vector_dir": "content_vector",
46 |
"wenet_dir": "wenet",
47 |
"mert_dir": "mert",
48 |
49 |
50 |
51 |
// Features used for model training
52 |
"use_mel": true,
53 |
"use_min_max_norm_mel": false,
54 |
"use_frame_pitch": false,
55 |
"use_frame_energy": false,
56 |
"use_phone_pitch": true,
57 |
"use_phone_energy": true,
58 |
"use_log_scale_pitch": false,
59 |
"use_log_scale_energy": false,
60 |
"use_spkid": false,
61 |
"align_mel_duration": true,
62 |
"text_cleaners": ["english_cleaners"]
63 |
64 |
"model": {
65 |
// Settings for transformer
66 |
"transformer": {
67 |
"encoder_layer": 4,
68 |
"encoder_head": 2,
69 |
"encoder_hidden": 256,
70 |
"decoder_layer": 6,
71 |
"decoder_head": 2,
72 |
"decoder_hidden": 256,
73 |
"conv_filter_size": 1024,
74 |
"conv_kernel_size": [9, 1],
75 |
"encoder_dropout": 0.2,
76 |
"decoder_dropout": 0.2
77 |
78 |
79 |
// Settings for variance_predictor
80 |
81 |
"filter_size": 256,
82 |
"kernel_size": 3,
83 |
"dropout": 0.5
84 |
85 |
86 |
"pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
87 |
"energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
88 |
"n_bins": 256
89 |
90 |
"max_seq_len": 1000
91 |
92 |
93 |
"batch_size": 16,
94 |
"sort_sample": true,
95 |
"drop_last": true,
96 |
"group_size": 4,
97 |
"grad_clip_thresh": 1.0,
98 |
"dataloader": {
99 |
"num_worker": 8,
100 |
"pin_memory": true
101 |
102 |
103 |
"num_warmup": 4000
104 |
105 |
// LR Scheduler
106 |
"scheduler": "NoamLR",
107 |
// Optimizer
108 |
"optimizer": "Adam",
109 |
"adam": {
110 |
"lr": 0.0625,
111 |
"betas": [0.9, 0.98],
112 |
"eps": 0.000000001,
113 |
"weight_decay": 0.0
114 |
115 |
116 |
117 |
@@ -0,0 +1,180 @@
1 |
2 |
"base_config": "config/base.json",
3 |
"model_type": "Transformer",
4 |
"task_type": "svc",
5 |
"use_custom_dataset": false,
6 |
"preprocess": {
7 |
// data augmentations
8 |
"use_pitch_shift": false,
9 |
"use_formant_shift": false,
10 |
"use_time_stretch": false,
11 |
"use_equalizer": false,
12 |
// acoustic features
13 |
"extract_mel": true,
14 |
"mel_min_max_norm": true,
15 |
"extract_pitch": true,
16 |
"pitch_extractor": "parselmouth",
17 |
"extract_uv": true,
18 |
"extract_energy": true,
19 |
// content features
20 |
"extract_whisper_feature": false,
21 |
"whisper_sample_rate": 16000,
22 |
"extract_contentvec_feature": false,
23 |
"contentvec_sample_rate": 16000,
24 |
"extract_wenet_feature": false,
25 |
"wenet_sample_rate": 16000,
26 |
"extract_mert_feature": false,
27 |
"mert_sample_rate": 16000,
28 |
// Default config for whisper
29 |
"whisper_frameshift": 0.01,
30 |
"whisper_downsample_rate": 2,
31 |
// Default config for content vector
32 |
"contentvec_frameshift": 0.02,
33 |
// Default config for mert
34 |
"mert_model": "m-a-p/MERT-v1-330M",
35 |
"mert_feature_layer": -1,
36 |
"mert_hop_size": 320,
37 |
// 24k
38 |
"mert_frameshit": 0.01333,
39 |
// 10ms
40 |
"wenet_frameshift": 0.01,
41 |
// wenetspeech is 4, gigaspeech is 6
42 |
"wenet_downsample_rate": 4,
43 |
// Default config
44 |
"n_mel": 100,
45 |
"win_size": 1024,
46 |
// todo
47 |
"hop_size": 256,
48 |
"sample_rate": 24000,
49 |
"n_fft": 1024,
50 |
// todo
51 |
"fmin": 0,
52 |
"fmax": 12000,
53 |
// todo
54 |
"f0_min": 50,
55 |
// ~C2
56 |
"f0_max": 1100,
57 |
//1100, // ~C6(1100), ~G5(800)
58 |
"pitch_bin": 256,
59 |
"pitch_max": 1100.0,
60 |
"pitch_min": 50.0,
61 |
"is_label": true,
62 |
"is_mu_law": true,
63 |
"bits": 8,
64 |
"mel_min_max_stats_dir": "mel_min_max_stats",
65 |
"whisper_dir": "whisper",
66 |
"contentvec_dir": "contentvec",
67 |
"wenet_dir": "wenet",
68 |
"mert_dir": "mert",
69 |
// Extract content features using dataloader
70 |
"pin_memory": true,
71 |
"num_workers": 8,
72 |
"content_feature_batch_size": 16,
73 |
// Features used for model training
74 |
"use_mel": true,
75 |
"use_min_max_norm_mel": true,
76 |
"use_frame_pitch": true,
77 |
"use_uv": true,
78 |
"use_frame_energy": true,
79 |
"use_log_scale_pitch": false,
80 |
"use_log_scale_energy": false,
81 |
"use_spkid": true,
82 |
// Meta file
83 |
"train_file": "train.json",
84 |
"valid_file": "test.json",
85 |
"spk2id": "singers.json",
86 |
"utt2spk": "utt2singer"
87 |
88 |
"model": {
89 |
"condition_encoder": {
90 |
"merge_mode": "add",
91 |
"input_melody_dim": 1,
92 |
"use_log_f0": true,
93 |
"n_bins_melody": 256,
94 |
//# Quantization (0 for no quantization)
95 |
"output_melody_dim": 384,
96 |
"input_loudness_dim": 1,
97 |
"use_log_loudness": true,
98 |
"n_bins_loudness": 256,
99 |
"output_loudness_dim": 384,
100 |
"use_whisper": false,
101 |
"use_contentvec": true,
102 |
"use_wenet": false,
103 |
"use_mert": false,
104 |
"whisper_dim": 1024,
105 |
"contentvec_dim": 256,
106 |
"mert_dim": 256,
107 |
"wenet_dim": 512,
108 |
"content_encoder_dim": 384,
109 |
"output_singer_dim": 384,
110 |
"singer_table_size": 512,
111 |
"output_content_dim": 384,
112 |
"use_spkid": true
113 |
114 |
"transformer": {
115 |
"type": "conformer",
116 |
// 'conformer' or 'transformer'
117 |
"input_dim": 384,
118 |
"output_dim": 100,
119 |
"n_heads": 2,
120 |
"n_layers": 6,
121 |
"filter_channels": 512,
122 |
"dropout": 0.1,
123 |
124 |
125 |
"train": {
126 |
// Basic settings
127 |
"batch_size": 64,
128 |
"gradient_accumulation_step": 1,
129 |
"max_epoch": -1,
130 |
// -1 means no limit
131 |
"save_checkpoint_stride": [
132 |
133 |
134 |
135 |
// unit is epoch
136 |
"keep_last": [
137 |
138 |
139 |
140 |
// -1 means infinite; if only one number is given, it will be broadcast
141 |
"run_eval": [
142 |
143 |
144 |
145 |
// if one number will broadcast
146 |
// Fix the random seed
147 |
"random_seed": 10086,
148 |
// Batchsampler
149 |
"sampler": {
150 |
"holistic_shuffle": true,
151 |
"drop_last": true
152 |
153 |
// Dataloader
154 |
"dataloader": {
155 |
"num_worker": 32,
156 |
"pin_memory": true
157 |
158 |
// Trackers
159 |
"tracker": [
160 |
161 |
// "wandb",
162 |
// "cometml",
163 |
// "mlflow",
164 |
165 |
// Optimizer
166 |
"optimizer": "AdamW",
167 |
"adamw": {
168 |
"lr": 4.0e-4
169 |
// nn model lr
170 |
171 |
// LR Scheduler
172 |
"scheduler": "ReduceLROnPlateau",
173 |
"reducelronplateau": {
174 |
"factor": 0.8,
175 |
"patience": 10,
176 |
// unit is epoch
177 |
"min_lr": 1.0e-4
178 |
179 |
180 |
@@ -0,0 +1,23 @@
1 |
2 |
"base_config": "config/base.json",
3 |
"supported_model_type": [
4 |
5 |
6 |
7 |
8 |
"task_type": "tts",
9 |
"preprocess": {
10 |
"language": "en-us",
11 |
// linguistic features
12 |
"extract_phone": true,
13 |
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
14 |
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
15 |
// Directory names of processed data or extracted features
16 |
"phone_dir": "phones",
17 |
"use_phone": true,
18 |
19 |
"model": {
20 |
"text_token_num": 512,
21 |
22 |
23 |
@@ -0,0 +1,52 @@
1 |
2 |
"base_config": "config/tts.json",
3 |
"model_type": "VALLE",
4 |
"task_type": "tts",
5 |
"dataset": [
6 |
7 |
8 |
"preprocess": {
9 |
"extract_phone": true,
10 |
"phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
11 |
"extract_acoustic_token": true,
12 |
"acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
13 |
"acoustic_token_dir": "acoutic_tokens",
14 |
"use_text": false,
15 |
"use_phone": true,
16 |
"use_acoustic_token": true,
17 |
"symbols_dict": "symbols.dict",
18 |
"min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
19 |
"max_duration": 14, // the duration uperbound to filter the audio with duration > max_duration.
20 |
"sampling_rate": 24000,
21 |
22 |
"model": {
23 |
"text_token_num": 512,
24 |
"audio_token_num": 1024,
25 |
"decoder_dim": 1024, // embedding dimension of the decoder model
26 |
"nhead": 16, // number of attention heads in the decoder layers
27 |
"num_decoder_layers": 12, // number of decoder layers
28 |
"norm_first": true, // pre or post Normalization.
29 |
"add_prenet": false, // whether add PreNet after Inputs
30 |
"prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
31 |
"share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
32 |
"nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
33 |
"prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
34 |
"num_quantizers": 8, // numbert of the audio quantization layers
35 |
// "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
36 |
37 |
"train": {
38 |
"ddp": false,
39 |
"train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
40 |
"max_epoch": 20,
41 |
"optimizer": "ScaledAdam",
42 |
"scheduler": "Eden",
43 |
"warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
44 |
"base_lr": 0.05, // base learning rate."
45 |
"valid_interval": 1000,
46 |
"log_epoch_step": 1000,
47 |
"save_checkpoint_stride": [
48 |
49 |
50 |
51 |
52 |
@@ -0,0 +1,101 @@
1 |
2 |
"base_config": "config/tts.json",
3 |
"model_type": "VITS",
4 |
"task_type": "tts",
5 |
"preprocess": {
6 |
"extract_phone": true,
7 |
"extract_mel": true,
8 |
"n_mel": 80,
9 |
"fmin": 0,
10 |
"fmax": null,
11 |
"extract_linear_spec": true,
12 |
"extract_audio": true,
13 |
"use_linear": true,
14 |
"use_mel": true,
15 |
"use_audio": true,
16 |
"use_text": false,
17 |
"use_phone": true,
18 |
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
19 |
"n_fft": 1024,
20 |
"win_size": 1024,
21 |
"hop_size": 256,
22 |
"segment_size": 8192,
23 |
"text_cleaners": [
24 |
25 |
26 |
27 |
"model": {
28 |
"text_token_num": 512,
29 |
"inter_channels": 192,
30 |
"hidden_channels": 192,
31 |
"filter_channels": 768,
32 |
"n_heads": 2,
33 |
"n_layers": 6,
34 |
"kernel_size": 3,
35 |
"p_dropout": 0.1,
36 |
"resblock": "1",
37 |
"resblock_kernel_sizes": [
38 |
39 |
40 |
41 |
42 |
"resblock_dilation_sizes": [
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
"upsample_rates": [
60 |
61 |
62 |
63 |
64 |
65 |
"upsample_initial_channel": 512,
66 |
"upsample_kernel_sizes": [
67 |
68 |
69 |
70 |
71 |
72 |
"n_layers_q": 3,
73 |
"use_spectral_norm": false,
74 |
"n_speakers": 0, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true
75 |
"gin_channels": 256,
76 |
"use_sdp": true
77 |
78 |
"train": {
79 |
"fp16_run": true,
80 |
"learning_rate": 2e-4,
81 |
"betas": [
82 |
83 |
84 |
85 |
"eps": 1e-9,
86 |
"batch_size": 16,
87 |
"lr_decay": 0.999875,
88 |
// "segment_size": 8192,
89 |
"init_lr_ratio": 1,
90 |
"warmup_epochs": 0,
91 |
"c_mel": 45,
92 |
"c_kl": 1.0,
93 |
"AdamW": {
94 |
"betas": [
95 |
96 |
97 |
98 |
"eps": 1e-9,
99 |
100 |
101 |
@@ -0,0 +1,84 @@
1 |
2 |
"base_config": "config/base.json",
3 |
"dataset": [
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
"task_type": "vocoder",
23 |
"preprocess": {
24 |
// acoustic features
25 |
"extract_mel": true,
26 |
"extract_pitch": false,
27 |
"extract_uv": false,
28 |
"extract_audio": true,
29 |
"extract_label": false,
30 |
"extract_one_hot": false,
31 |
"extract_amplitude_phase": false,
32 |
"pitch_extractor": "parselmouth",
33 |
// Settings for data preprocessing
34 |
"n_mel": 100,
35 |
"win_size": 1024,
36 |
"hop_size": 256,
37 |
"sample_rate": 24000,
38 |
"n_fft": 1024,
39 |
"fmin": 0,
40 |
"fmax": 12000,
41 |
"f0_min": 50,
42 |
"f0_max": 1100,
43 |
"pitch_bin": 256,
44 |
"pitch_max": 1100.0,
45 |
"pitch_min": 50.0,
46 |
"is_mu_law": false,
47 |
"bits": 8,
48 |
"cut_mel_frame": 32,
49 |
// Directory names of processed data or extracted features
50 |
"spk2id": "singers.json",
51 |
// Features used for model training
52 |
"use_mel": true,
53 |
"use_frame_pitch": false,
54 |
"use_uv": false,
55 |
"use_audio": true,
56 |
"use_label": false,
57 |
"use_one_hot": false,
58 |
"train_file": "train.json",
59 |
"valid_file": "test.json"
60 |
61 |
"train": {
62 |
"random_seed": 114514,
63 |
"batch_size": 64,
64 |
"gradient_accumulation_step": 1,
65 |
"max_epoch": 1000000,
66 |
"save_checkpoint_stride": [
67 |
68 |
69 |
"run_eval": [
70 |
71 |
72 |
"sampler": {
73 |
"holistic_shuffle": true,
74 |
"drop_last": true
75 |
76 |
"dataloader": {
77 |
"num_worker": 4,
78 |
"pin_memory": true
79 |
80 |
"tracker": [
81 |
82 |
83 |
84 |