Billpai committed
Commit 507c407 · 1 Parent(s): 0a17a23
ckpts/svc/vocalist_l1_contentvec+whisper/args.json ADDED
@@ -0,0 +1,257 @@
+ {
+     "task_type": "svc",
+     "dataset": [
+         "vocalist_l1",
+     ],
+     "exp_name": "vocalist_l1_contentvec+whisper",
+     "inference": {
+         "diffusion": {
+             "scheduler": "pndm",
+             "scheduler_settings": {
+                 "num_inference_timesteps": 1000,
+             },
+         },
+     },
+     "model": {
+         "condition_encoder": {
+             "content_encoder_dim": 384,
+             "contentvec_dim": 256,
+             "f0_max": 1100,
+             "f0_min": 50,
+             "input_loudness_dim": 1,
+             "input_melody_dim": 1,
+             "merge_mode": "add",
+             "mert_dim": 256,
+             "n_bins_loudness": 256,
+             "n_bins_melody": 256,
+             "output_content_dim": 384,
+             "output_loudness_dim": 384,
+             "output_melody_dim": 384,
+             "output_singer_dim": 384,
+             "pitch_max": 1100,
+             "pitch_min": 50,
+             "singer_table_size": 512,
+             "use_conformer_for_content_features": false,
+             "use_contentvec": true,
+             "use_log_f0": true,
+             "use_log_loudness": true,
+             "use_mert": false,
+             "use_singer_encoder": true,
+             "use_spkid": true,
+             "use_wenet": false,
+             "use_whisper": true,
+             "wenet_dim": 512,
+             "whisper_dim": 1024,
+         },
+         "diffusion": {
+             "bidilconv": {
+                 "base_channel": 384,
+                 "conditioner_size": 384,
+                 "conv_kernel_size": 3,
+                 "dilation_cycle_length": 4,
+                 "n_res_block": 20,
+             },
+             "model_type": "bidilconv",
+             "scheduler": "ddpm",
+             "scheduler_settings": {
+                 "beta_end": 0.02,
+                 "beta_schedule": "linear",
+                 "beta_start": 0.0001,
+                 "num_train_timesteps": 1000,
+             },
+             "step_encoder": {
+                 "activation": "SiLU",
+                 "dim_hidden_layer": 512,
+                 "dim_raw_embedding": 128,
+                 "max_period": 10000,
+                 "num_layer": 2,
+             },
+             "unet2d": {
+                 "down_block_types": [
+                     "CrossAttnDownBlock2D",
+                     "CrossAttnDownBlock2D",
+                     "CrossAttnDownBlock2D",
+                     "DownBlock2D",
+                 ],
+                 "in_channels": 1,
+                 "mid_block_type": "UNetMidBlock2DCrossAttn",
+                 "only_cross_attention": false,
+                 "out_channels": 1,
+                 "up_block_types": [
+                     "UpBlock2D",
+                     "CrossAttnUpBlock2D",
+                     "CrossAttnUpBlock2D",
+                     "CrossAttnUpBlock2D",
+                 ],
+             },
+         },
+     },
+     "model_type": "DiffWaveNetSVC",
+     "preprocess": {
+         "audio_dir": "audios",
+         "bits": 8,
+         "content_feature_batch_size": 16,
+         "contentvec_batch_size": 1,
+         "contentvec_dir": "contentvec",
+         "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
+         "contentvec_frameshift": 0.02,
+         "contentvec_sample_rate": 16000,
+         "dur_dir": "durs",
+         "duration_dir": "duration",
+         "emo2id": "emo2id.json",
+         "energy_dir": "energys",
+         "extract_audio": false,
+         "extract_contentvec_feature": true,
+         "extract_energy": true,
+         "extract_label": false,
+         "extract_mcep": false,
+         "extract_mel": true,
+         "extract_mert_feature": false,
+         "extract_pitch": true,
+         "extract_uv": true,
+         "extract_wenet_feature": false,
+         "extract_whisper_feature": true,
+         "f0_max": 1100,
+         "f0_min": 50,
+         "file_lst": "file.lst",
+         "fmax": 12000,
+         "fmin": 0,
+         "hop_size": 256,
+         "is_label": true,
+         "is_mu_law": true,
+         "lab_dir": "labs",
+         "label_dir": "labels",
+         "mcep_dir": "mcep",
+         "mel_dir": "mels",
+         "mel_min_max_norm": true,
+         "mel_min_max_stats_dir": "mel_min_max_stats",
+         "mert_dir": "mert",
+         "mert_feature_layer": -1,
+         "mert_frameshit": 0.01333,
+         "mert_hop_size": 320,
+         "mert_model": "m-a-p/MERT-v1-330M",
+         "min_level_db": -115,
+         "mu_law_norm": false,
+         "n_fft": 1024,
+         "n_mel": 100,
+         "num_silent_frames": 8,
+         "num_workers": 8,
+         "phone_seq_file": "phone_seq_file",
+         "pin_memory": true,
+         "pitch_bin": 256,
+         "pitch_dir": "pitches",
+         "pitch_extractor": "crepe", // "parselmouth"
+         "pitch_max": 1100.0,
+         "pitch_min": 50.0,
+         "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
+         "ref_level_db": 20,
+         "sample_rate": 24000,
+         "spk2id": "singers.json",
+         "train_file": "train.json",
+         "trim_fft_size": 512,
+         "trim_hop_size": 128,
+         "trim_silence": false,
+         "trim_top_db": 30,
+         "trimmed_wav_dir": "trimmed_wavs",
+         "use_audio": false,
+         "use_contentvec": true,
+         "use_dur": false,
+         "use_emoid": false,
+         "use_frame_duration": false,
+         "use_frame_energy": true,
+         "use_frame_pitch": true,
+         "use_lab": false,
+         "use_label": false,
+         "use_log_scale_energy": false,
+         "use_log_scale_pitch": false,
+         "use_mel": true,
+         "use_mert": false,
+         "use_min_max_norm_mel": true,
+         "use_one_hot": false,
+         "use_phn_seq": false,
+         "use_phone_duration": false,
+         "use_phone_energy": false,
+         "use_phone_pitch": false,
+         "use_spkid": true,
+         "use_uv": true,
+         "use_wav": false,
+         "use_wenet": false,
+         "use_whisper": true,
+         "utt2emo": "utt2emo",
+         "utt2spk": "utt2singer",
+         "uv_dir": "uvs",
+         "valid_file": "test.json",
+         "wav_dir": "wavs",
+         "wenet_batch_size": 1,
+         "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
+         "wenet_dir": "wenet",
+         "wenet_downsample_rate": 4,
+         "wenet_frameshift": 0.01,
+         "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
+         "wenet_sample_rate": 16000,
+         "whisper_batch_size": 30,
+         "whisper_dir": "whisper",
+         "whisper_downsample_rate": 2,
+         "whisper_frameshift": 0.01,
+         "whisper_model": "medium",
+         "whisper_model_path": "pretrained/whisper/medium.pt",
+         "whisper_sample_rate": 16000,
+         "win_size": 1024,
+     },
+     "supported_model_type": [
+         "Fastspeech2",
+         "DiffSVC",
+         "Transformer",
+         "EDM",
+         "CD",
+     ],
+     "train": {
+         "adamw": {
+             "lr": 0.0004,
+         },
+         "batch_size": 32,
+         "dataloader": {
+             "num_worker": 8,
+             "pin_memory": true,
+         },
+         "ddp": true,
+         "epochs": 50000,
+         "gradient_accumulation_step": 1,
+         "keep_checkpoint_max": 5,
+         "keep_last": [
+             5,
+             -1,
+         ],
+         "max_epoch": -1,
+         "max_steps": 1000000,
+         "multi_speaker_training": false,
+         "optimizer": "AdamW",
+         "random_seed": 10086,
+         "reducelronplateau": {
+             "factor": 0.8,
+             "min_lr": 0.0001,
+             "patience": 10,
+         },
+         "run_eval": [
+             false,
+             true,
+         ],
+         "sampler": {
+             "drop_last": true,
+             "holistic_shuffle": false,
+         },
+         "save_checkpoint_stride": [
+             3,
+             10,
+         ],
+         "save_checkpoints_steps": 10000,
+         "save_summary_steps": 500,
+         "scheduler": "ReduceLROnPlateau",
+         "total_training_steps": 50000,
+         "tracker": [
+             "tensorboard",
+         ],
+         "valid_interval": 10000,
+     },
+     "use_custom_dataset": true,
+ }
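Note that these configs are JSON5-flavored rather than strict JSON: they allow // comments and trailing commas (both appear above), so json.load alone will reject the file. A minimal loading sketch; the load_jsonc helper name and the naive regex stripping are our assumptions for illustration (not the project's own loader, and not robust to "//" inside string values):

import json
import re

def load_jsonc(path: str) -> dict:
    """Load a JSON file that may contain // comments and trailing commas."""
    with open(path, encoding="utf-8") as f:
        text = f.read()
    text = re.sub(r"//[^\n]*", "", text)        # drop // line comments
    text = re.sub(r",(\s*[}\]])", r"\1", text)  # drop trailing commas
    return json.loads(text)

args = load_jsonc("ckpts/svc/vocalist_l1_contentvec+whisper/args.json")
print(args["model_type"])  # DiffWaveNetSVC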
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:836af10b834c7aec9209eb19ce43559e6ef1e3a59bd6468e90cadbc9a18749ef
+ size 249512389
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d54eed12bef331095fc367f196d07c5061d5cb72dd6fe0e1e4453b997bf1d68d
+ size 124755137
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6798ddffadcd7d5405a77e667c674c474e4fef0cba817fdd300c7c985c1e82fe
+ size 14599
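The binaries above are stored as Git LFS pointer files: three text lines giving the spec version, the sha256 oid of the real blob, and its size in bytes (~125 MB for pytorch_model.bin and ~250 MB for optimizer.bin; the roughly 2x ratio is consistent with AdamW keeping two extra moment buffers per weight). A small parsing sketch, with our own helper name:

def read_lfs_pointer(path: str) -> dict:
    """Parse a git-lfs pointer file into its key/value fields."""
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    fields["size"] = int(fields["size"])  # bytes of the real blob
    return fields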
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json ADDED
@@ -0,0 +1,17 @@
+ {
+     "vocalist_l1_Adele": 0,
+     "vocalist_l1_Beyonce": 1,
+     "vocalist_l1_BrunoMars": 2,
+     "vocalist_l1_JohnMayer": 3,
+     "vocalist_l1_MichaelJackson": 4,
+     "vocalist_l1_TaylorSwift": 5,
+     "vocalist_l1_张学友": 6,
+     "vocalist_l1_李健": 7,
+     "vocalist_l1_汪峰": 8,
+     "vocalist_l1_王菲": 9,
+     "vocalist_l1_石倚洁": 10,
+     "vocalist_l1_蔡琴": 11,
+     "vocalist_l1_那英": 12,
+     "vocalist_l1_陈奕迅": 13,
+     "vocalist_l1_陶喆": 14
+ }
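singers.json is the spk2id table that args.json points at ("spk2id": "singers.json"): each of the 15 vocalist_l1 singers maps to an integer index into the model's singer embedding table ("singer_table_size": 512, so there is room for many more entries). Usage is a plain dictionary lookup:

import json

with open("ckpts/svc/vocalist_l1_contentvec+whisper/singers.json", encoding="utf-8") as f:
    spk2id = json.load(f)

singer_id = spk2id["vocalist_l1_TaylorSwift"]  # 5, fed to the singer embedding layer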
ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7f490fd0c97876e24bfc44413365ded7ff5d22c1c79f0dac0b754f3b32df76f
+ size 88
ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e01bcf2fa621ba563b70568c18fe0742d0f48cafae83a6e8beb0bb6d1f6d146d
+ size 77413046
ckpts/svc/vocalist_l1_contentvec+whisper/singers.json ADDED
@@ -0,0 +1,17 @@
+ {
+     "vocalist_l1_Adele": 0,
+     "vocalist_l1_Beyonce": 1,
+     "vocalist_l1_BrunoMars": 2,
+     "vocalist_l1_JohnMayer": 3,
+     "vocalist_l1_MichaelJackson": 4,
+     "vocalist_l1_TaylorSwift": 5,
+     "vocalist_l1_张学友": 6,
+     "vocalist_l1_李健": 7,
+     "vocalist_l1_汪峰": 8,
+     "vocalist_l1_王菲": 9,
+     "vocalist_l1_石倚洁": 10,
+     "vocalist_l1_蔡琴": 11,
+     "vocalist_l1_那英": 12,
+     "vocalist_l1_陈奕迅": 13,
+     "vocalist_l1_陶喆": 14
+ }
config/audioldm.json ADDED
@@ -0,0 +1,92 @@
+ {
+     "base_config": "config/base.json",
+     "model_type": "AudioLDM",
+     "task_type": "tta",
+     "dataset": [
+         "AudioCaps"
+     ],
+     "preprocess": {
+         // feature used for model training
+         "use_spkid": false,
+         "use_uv": false,
+         "use_frame_pitch": false,
+         "use_phone_pitch": false,
+         "use_frame_energy": false,
+         "use_phone_energy": false,
+         "use_mel": false,
+         "use_audio": false,
+         "use_label": false,
+         "use_one_hot": false,
+         "cond_mask_prob": 0.1
+     },
+     // model
+     "model": {
+         "audioldm": {
+             "image_size": 32,
+             "in_channels": 4,
+             "out_channels": 4,
+             "model_channels": 256,
+             "attention_resolutions": [
+                 4,
+                 2,
+                 1
+             ],
+             "num_res_blocks": 2,
+             "channel_mult": [
+                 1,
+                 2,
+                 4
+             ],
+             "num_heads": 8,
+             "use_spatial_transformer": true,
+             "transformer_depth": 1,
+             "context_dim": 768,
+             "use_checkpoint": true,
+             "legacy": false
+         },
+         "autoencoderkl": {
+             "ch": 128,
+             "ch_mult": [
+                 1,
+                 1,
+                 2,
+                 2,
+                 4
+             ],
+             "num_res_blocks": 2,
+             "in_channels": 1,
+             "z_channels": 4,
+             "out_ch": 1,
+             "double_z": true
+         },
+         "noise_scheduler": {
+             "num_train_timesteps": 1000,
+             "beta_start": 0.00085,
+             "beta_end": 0.012,
+             "beta_schedule": "scaled_linear",
+             "clip_sample": false,
+             "steps_offset": 1,
+             "set_alpha_to_one": false,
+             "skip_prk_steps": true,
+             "prediction_type": "epsilon"
+         }
+     },
+     // train
+     "train": {
+         "lronPlateau": {
+             "factor": 0.9,
+             "patience": 100,
+             "min_lr": 4.0e-5,
+             "verbose": true
+         },
+         "adam": {
+             "lr": 5.0e-5,
+             "betas": [
+                 0.9,
+                 0.999
+             ],
+             "weight_decay": 1.0e-2,
+             "eps": 1.0e-8
+         }
+     }
+ }
config/autoencoderkl.json ADDED
@@ -0,0 +1,69 @@
+ {
+     "base_config": "config/base.json",
+     "model_type": "AutoencoderKL",
+     "task_type": "tta",
+     "dataset": [
+         "AudioCaps"
+     ],
+     "preprocess": {
+         // feature used for model training
+         "use_spkid": false,
+         "use_uv": false,
+         "use_frame_pitch": false,
+         "use_phone_pitch": false,
+         "use_frame_energy": false,
+         "use_phone_energy": false,
+         "use_mel": false,
+         "use_audio": false,
+         "use_label": false,
+         "use_one_hot": false
+     },
+     // model
+     "model": {
+         "autoencoderkl": {
+             "ch": 128,
+             "ch_mult": [
+                 1,
+                 1,
+                 2,
+                 2,
+                 4
+             ],
+             "num_res_blocks": 2,
+             "in_channels": 1,
+             "z_channels": 4,
+             "out_ch": 1,
+             "double_z": true
+         },
+         "loss": {
+             "kl_weight": 1e-8,
+             "disc_weight": 0.5,
+             "disc_factor": 1.0,
+             "logvar_init": 0.0,
+             "min_adapt_d_weight": 0.0,
+             "max_adapt_d_weight": 10.0,
+             "disc_start": 50001,
+             "disc_in_channels": 1,
+             "disc_num_layers": 3,
+             "use_actnorm": false
+         }
+     },
+     // train
+     "train": {
+         "lronPlateau": {
+             "factor": 0.9,
+             "patience": 100,
+             "min_lr": 4.0e-5,
+             "verbose": true
+         },
+         "adam": {
+             "lr": 4.0e-4,
+             "betas": [
+                 0.9,
+                 0.999
+             ],
+             "weight_decay": 1.0e-2,
+             "eps": 1.0e-8
+         }
+     }
+ }
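For orientation: in this LDM-style KL autoencoder, a "ch_mult" of length 5 implies one 2x spatial downsampling per resolution level after the first, i.e. a 2^4 = 16x reduction of each spectrogram axis into "z_channels": 4 latent channels, which is what lets the AudioLDM UNet in the previous file operate on 4-channel latents ("in_channels": 4) at "image_size": 32. A quick sanity check of that factor (the per-level-halving reading is our assumption from the standard architecture, not spelled out in this diff):

ch_mult = [1, 1, 2, 2, 4]
downsample_factor = 2 ** (len(ch_mult) - 1)
assert downsample_factor == 16  # each spectrogram axis shrinks 16x in latent space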
config/base.json ADDED
@@ -0,0 +1,220 @@
+ {
+     "supported_model_type": [
+         "GANVocoder",
+         "Fastspeech2",
+         "DiffSVC",
+         "Transformer",
+         "EDM",
+         "CD"
+     ],
+     "task_type": "",
+     "dataset": [],
+     "use_custom_dataset": false,
+     "preprocess": {
+         "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
+         // trim audio silence
+         "data_augment": false,
+         "trim_silence": false,
+         "num_silent_frames": 8,
+         "trim_fft_size": 512, // fft size used in trimming
+         "trim_hop_size": 128, // hop size used in trimming
+         "trim_top_db": 30, // top db used in trimming sensitive to each dataset
+         // acoustic features
+         "extract_mel": false,
+         "mel_extract_mode": "",
+         "extract_linear_spec": false,
+         "extract_mcep": false,
+         "extract_pitch": false,
+         "extract_acoustic_token": false,
+         "pitch_remove_outlier": false,
+         "extract_uv": false,
+         "pitch_norm": false,
+         "extract_audio": false,
+         "extract_label": false,
+         "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
+         "extract_energy": false,
+         "energy_remove_outlier": false,
+         "energy_norm": false,
+         "energy_extract_mode": "from_mel",
+         "extract_duration": false,
+         "extract_amplitude_phase": false,
+         "mel_min_max_norm": false,
+         // lingusitic features
+         "extract_phone": false,
+         "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+         // content features
+         "extract_whisper_feature": false,
+         "extract_contentvec_feature": false,
+         "extract_mert_feature": false,
+         "extract_wenet_feature": false,
+         // Settings for data preprocessing
+         "n_mel": 80,
+         "win_size": 480,
+         "hop_size": 120,
+         "sample_rate": 24000,
+         "n_fft": 1024,
+         "fmin": 0,
+         "fmax": 12000,
+         "min_level_db": -115,
+         "ref_level_db": 20,
+         "bits": 8,
+         // Directory names of processed data or extracted features
+         "processed_dir": "processed_data",
+         "trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
+         "raw_data": "raw_data",
+         "phone_dir": "phones",
+         "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
+         "audio_dir": "audios",
+         "log_amplitude_dir": "log_amplitudes",
+         "phase_dir": "phases",
+         "real_dir": "reals",
+         "imaginary_dir": "imaginarys",
+         "label_dir": "labels",
+         "linear_dir": "linears",
+         "mel_dir": "mels", // directory name of extraced mel features
+         "mcep_dir": "mcep", // directory name of extraced mcep features
+         "dur_dir": "durs",
+         "symbols_dict": "symbols.dict",
+         "lab_dir": "labs", // directory name of extraced label features
+         "wenet_dir": "wenet", // directory name of extraced wenet features
+         "contentvec_dir": "contentvec", // directory name of extraced wenet features
+         "pitch_dir": "pitches", // directory name of extraced pitch features
+         "energy_dir": "energys", // directory name of extracted energy features
+         "phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
+         "phone_energy_dir": "phone_energys", // directory name of extracted energy features
+         "uv_dir": "uvs", // directory name of extracted unvoiced features
+         "duration_dir": "duration", // ground-truth duration file
+         "phone_seq_file": "phone_seq_file", // phoneme sequence file
+         "file_lst": "file.lst",
+         "train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
+         "valid_file": "valid.json", // validattion set
+         "spk2id": "spk2id.json", // used for multi-speaker dataset
+         "utt2spk": "utt2spk", // used for multi-speaker dataset
+         "emo2id": "emo2id.json", // used for multi-emotion dataset
+         "utt2emo": "utt2emo", // used for multi-emotion dataset
+         // Features used for model training
+         "use_text": false,
+         "use_phone": false,
+         "use_phn_seq": false,
+         "use_lab": false,
+         "use_linear": false,
+         "use_mel": false,
+         "use_min_max_norm_mel": false,
+         "use_wav": false,
+         "use_phone_pitch": false,
+         "use_log_scale_pitch": false,
+         "use_phone_energy": false,
+         "use_phone_duration": false,
+         "use_log_scale_energy": false,
+         "use_wenet": false,
+         "use_dur": false,
+         "use_spkid": false, // True: use speaker id for multi-speaker dataset
+         "use_emoid": false, // True: use emotion id for multi-emotion dataset
+         "use_frame_pitch": false,
+         "use_uv": false,
+         "use_frame_energy": false,
+         "use_frame_duration": false,
+         "use_audio": false,
+         "use_label": false,
+         "use_one_hot": false,
+         "use_amplitude_phase": false,
+         "data_augment": false,
+         "align_mel_duration": false
+     },
+     "train": {
+         "ddp": true,
+         "random_seed": 970227,
+         "batch_size": 16,
+         "max_steps": 1000000,
+         // Trackers
+         "tracker": [
+             "tensorboard"
+             // "wandb",
+             // "cometml",
+             // "mlflow",
+         ],
+         "max_epoch": -1,
+         // -1 means no limit
+         "save_checkpoint_stride": [
+             5,
+             20
+         ],
+         // unit is epoch
+         "keep_last": [
+             3,
+             -1
+         ],
+         // -1 means infinite, if one number will broadcast
+         "run_eval": [
+             false,
+             true
+         ],
+         // if one number will broadcast
+         // Fix the random seed
+         "random_seed": 10086,
+         // Optimizer
+         "optimizer": "AdamW",
+         "adamw": {
+             "lr": 4.0e-4
+             // nn model lr
+         },
+         // LR Scheduler
+         "scheduler": "ReduceLROnPlateau",
+         "reducelronplateau": {
+             "factor": 0.8,
+             "patience": 10,
+             // unit is epoch
+             "min_lr": 1.0e-4
+         },
+         // Batchsampler
+         "sampler": {
+             "holistic_shuffle": true,
+             "drop_last": true
+         },
+         // Dataloader
+         "dataloader": {
+             "num_worker": 32,
+             "pin_memory": true
+         },
+         "gradient_accumulation_step": 1,
+         "total_training_steps": 50000,
+         "save_summary_steps": 500,
+         "save_checkpoints_steps": 10000,
+         "valid_interval": 10000,
+         "keep_checkpoint_max": 5,
+         "multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
+         "max_epoch": -1,
+         // -1 means no limit
+         "save_checkpoint_stride": [
+             5,
+             20
+         ],
+         // unit is epoch
+         "keep_last": [
+             3,
+             -1
+         ],
+         // -1 means infinite, if one number will broadcast
+         "run_eval": [
+             false,
+             true
+         ],
+         // Batchsampler
+         "sampler": {
+             "holistic_shuffle": true,
+             "drop_last": true
+         },
+         // Dataloader
+         "dataloader": {
+             "num_worker": 32,
+             "pin_memory": true
+         },
+         // Trackers
+         "tracker": [
+             "tensorboard"
+             // "wandb",
+             // "cometml",
+             // "mlflow",
+         ],
+     },
+ }
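base.json is the root of the inheritance chain: every other config names it via "base_config" and only overrides what differs, so the effective config is a recursive dictionary merge from base to leaf. A minimal resolution sketch, reusing the load_jsonc helper from the earlier note; deep_merge and the child-wins merge order are our assumptions about the intent, not the project's own loader:

def deep_merge(base: dict, override: dict) -> dict:
    """Child values win; nested dicts merge key by key."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def resolve_config(path: str) -> dict:
    cfg = load_jsonc(path)
    base_path = cfg.pop("base_config", None)
    return cfg if base_path is None else deep_merge(resolve_config(base_path), cfg)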
config/comosvc.json ADDED
@@ -0,0 +1,216 @@
+ {
+     "base_config": "config/base.json",
+     "model_type": "DiffComoSVC",
+     "task_type": "svc",
+     "use_custom_dataset": false,
+     "preprocess": {
+         // data augmentations
+         "use_pitch_shift": false,
+         "use_formant_shift": false,
+         "use_time_stretch": false,
+         "use_equalizer": false,
+         // acoustic features
+         "extract_mel": true,
+         "mel_min_max_norm": true,
+         "extract_pitch": true,
+         "pitch_extractor": "parselmouth",
+         "extract_uv": true,
+         "extract_energy": true,
+         // content features
+         "extract_whisper_feature": false,
+         "whisper_sample_rate": 16000,
+         "extract_contentvec_feature": false,
+         "contentvec_sample_rate": 16000,
+         "extract_wenet_feature": false,
+         "wenet_sample_rate": 16000,
+         "extract_mert_feature": false,
+         "mert_sample_rate": 16000,
+         // Default config for whisper
+         "whisper_frameshift": 0.01,
+         "whisper_downsample_rate": 2,
+         // Default config for content vector
+         "contentvec_frameshift": 0.02,
+         // Default config for mert
+         "mert_model": "m-a-p/MERT-v1-330M",
+         "mert_feature_layer": -1,
+         "mert_hop_size": 320,
+         // 24k
+         "mert_frameshit": 0.01333,
+         // 10ms
+         "wenet_frameshift": 0.01,
+         // wenetspeech is 4, gigaspeech is 6
+         "wenet_downsample_rate": 4,
+         // Default config
+         "n_mel": 100,
+         "win_size": 1024,
+         // todo
+         "hop_size": 256,
+         "sample_rate": 24000,
+         "n_fft": 1024,
+         // todo
+         "fmin": 0,
+         "fmax": 12000,
+         // todo
+         "f0_min": 50,
+         // ~C2
+         "f0_max": 1100,
+         //1100, // ~C6(1100), ~G5(800)
+         "pitch_bin": 256,
+         "pitch_max": 1100.0,
+         "pitch_min": 50.0,
+         "is_label": true,
+         "is_mu_law": true,
+         "bits": 8,
+         "mel_min_max_stats_dir": "mel_min_max_stats",
+         "whisper_dir": "whisper",
+         "contentvec_dir": "contentvec",
+         "wenet_dir": "wenet",
+         "mert_dir": "mert",
+         // Extract content features using dataloader
+         "pin_memory": true,
+         "num_workers": 8,
+         "content_feature_batch_size": 16,
+         // Features used for model training
+         "use_mel": true,
+         "use_min_max_norm_mel": true,
+         "use_frame_pitch": true,
+         "use_uv": true,
+         "use_frame_energy": true,
+         "use_log_scale_pitch": false,
+         "use_log_scale_energy": false,
+         "use_spkid": true,
+         // Meta file
+         "train_file": "train.json",
+         "valid_file": "test.json",
+         "spk2id": "singers.json",
+         "utt2spk": "utt2singer"
+     },
+     "model": {
+         "teacher_model_path": "[Your Teacher Model Path].bin",
+         "condition_encoder": {
+             "merge_mode": "add",
+             "input_melody_dim": 1,
+             "use_log_f0": true,
+             "n_bins_melody": 256,
+             //# Quantization (0 for not quantization)
+             "output_melody_dim": 384,
+             "input_loudness_dim": 1,
+             "use_log_loudness": true,
+             "n_bins_loudness": 256,
+             "output_loudness_dim": 384,
+             "use_whisper": false,
+             "use_contentvec": false,
+             "use_wenet": false,
+             "use_mert": false,
+             "whisper_dim": 1024,
+             "contentvec_dim": 256,
+             "mert_dim": 256,
+             "wenet_dim": 512,
+             "content_encoder_dim": 384,
+             "output_singer_dim": 384,
+             "singer_table_size": 512,
+             "output_content_dim": 384,
+             "use_spkid": true
+         },
+         "comosvc": {
+             "distill": false,
+             // conformer encoder
+             "input_dim": 384,
+             "output_dim": 100,
+             "n_heads": 2,
+             "n_layers": 6,
+             "filter_channels": 512,
+             "dropout": 0.1,
+             // karras diffusion
+             "P_mean": -1.2,
+             "P_std": 1.2,
+             "sigma_data": 0.5,
+             "sigma_min": 0.002,
+             "sigma_max": 80,
+             "rho": 7,
+             "n_timesteps": 40,
+         },
+         "diffusion": {
+             // Diffusion steps encoder
+             "step_encoder": {
+                 "dim_raw_embedding": 128,
+                 "dim_hidden_layer": 512,
+                 "activation": "SiLU",
+                 "num_layer": 2,
+                 "max_period": 10000
+             },
+             // Diffusion decoder
+             "model_type": "bidilconv",
+             // bidilconv, unet2d, TODO: unet1d
+             "bidilconv": {
+                 "base_channel": 384,
+                 "n_res_block": 20,
+                 "conv_kernel_size": 3,
+                 "dilation_cycle_length": 4,
+                 // specially, 1 means no dilation
+                 "conditioner_size": 100
+             }
+         },
+     },
+     "train": {
+         // Basic settings
+         "fast_steps": 0,
+         "batch_size": 32,
+         "gradient_accumulation_step": 1,
+         "max_epoch": -1,
+         // -1 means no limit
+         "save_checkpoint_stride": [
+             10,
+             100
+         ],
+         // unit is epoch
+         "keep_last": [
+             3,
+             -1
+         ],
+         // -1 means infinite, if one number will broadcast
+         "run_eval": [
+             false,
+             true
+         ],
+         // if one number will broadcast
+         // Fix the random seed
+         "random_seed": 10086,
+         // Batchsampler
+         "sampler": {
+             "holistic_shuffle": true,
+             "drop_last": true
+         },
+         // Dataloader
+         "dataloader": {
+             "num_worker": 32,
+             "pin_memory": true
+         },
+         // Trackers
+         "tracker": [
+             "tensorboard"
+             // "wandb",
+             // "cometml",
+             // "mlflow",
+         ],
+         // Optimizer
+         "optimizer": "AdamW",
+         "adamw": {
+             "lr": 4.0e-4
+             // nn model lr
+         },
+         // LR Scheduler
+         "scheduler": "ReduceLROnPlateau",
+         "reducelronplateau": {
+             "factor": 0.8,
+             "patience": 10,
+             // unit is epoch
+             "min_lr": 1.0e-4
+         }
+     },
+     "inference": {
+         "comosvc": {
+             "inference_steps": 40
+         }
+     }
+ }
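The "karras diffusion" block above (sigma_min, sigma_max, rho, n_timesteps) names the standard Karras/EDM noise discretization. Assuming ComoSVC uses the usual formula (a reasonable reading of these keys, not confirmed by this diff), the 40 sigmas would be:

import numpy as np

def karras_sigmas(n=40, sigma_min=0.002, sigma_max=80.0, rho=7.0):
    """sigma_i = (smax^(1/rho) + i/(n-1) * (smin^(1/rho) - smax^(1/rho)))^rho"""
    i = np.arange(n)
    inv_rho = 1.0 / rho
    sigmas = (sigma_max**inv_rho + i / (n - 1) * (sigma_min**inv_rho - sigma_max**inv_rho)) ** rho
    return sigmas  # decreasing from 80.0 down to 0.002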
config/diffusion.json ADDED
@@ -0,0 +1,227 @@
+ {
+     // FIXME: THESE ARE LEGACY
+     "base_config": "config/base.json",
+     "model_type": "diffusion",
+     "task_type": "svc",
+     "use_custom_dataset": false,
+     "preprocess": {
+         // data augmentations
+         "use_pitch_shift": false,
+         "use_formant_shift": false,
+         "use_time_stretch": false,
+         "use_equalizer": false,
+         // acoustic features
+         "extract_mel": true,
+         "mel_min_max_norm": true,
+         "extract_pitch": true,
+         "pitch_extractor": "parselmouth",
+         "extract_uv": true,
+         "extract_energy": true,
+         // content features
+         "extract_whisper_feature": false,
+         "whisper_sample_rate": 16000,
+         "extract_contentvec_feature": false,
+         "contentvec_sample_rate": 16000,
+         "extract_wenet_feature": false,
+         "wenet_sample_rate": 16000,
+         "extract_mert_feature": false,
+         "mert_sample_rate": 16000,
+         // Default config for whisper
+         "whisper_frameshift": 0.01,
+         "whisper_downsample_rate": 2,
+         // Default config for content vector
+         "contentvec_frameshift": 0.02,
+         // Default config for mert
+         "mert_model": "m-a-p/MERT-v1-330M",
+         "mert_feature_layer": -1,
+         "mert_hop_size": 320,
+         // 24k
+         "mert_frameshit": 0.01333,
+         // 10ms
+         "wenet_frameshift": 0.01,
+         // wenetspeech is 4, gigaspeech is 6
+         "wenet_downsample_rate": 4,
+         // Default config
+         "n_mel": 100,
+         "win_size": 1024,
+         // todo
+         "hop_size": 256,
+         "sample_rate": 24000,
+         "n_fft": 1024,
+         // todo
+         "fmin": 0,
+         "fmax": 12000,
+         // todo
+         "f0_min": 50,
+         // ~C2
+         "f0_max": 1100,
+         //1100, // ~C6(1100), ~G5(800)
+         "pitch_bin": 256,
+         "pitch_max": 1100.0,
+         "pitch_min": 50.0,
+         "is_label": true,
+         "is_mu_law": true,
+         "bits": 8,
+         "mel_min_max_stats_dir": "mel_min_max_stats",
+         "whisper_dir": "whisper",
+         "contentvec_dir": "contentvec",
+         "wenet_dir": "wenet",
+         "mert_dir": "mert",
+         // Extract content features using dataloader
+         "pin_memory": true,
+         "num_workers": 8,
+         "content_feature_batch_size": 16,
+         // Features used for model training
+         "use_mel": true,
+         "use_min_max_norm_mel": true,
+         "use_frame_pitch": true,
+         "use_uv": true,
+         "use_frame_energy": true,
+         "use_log_scale_pitch": false,
+         "use_log_scale_energy": false,
+         "use_spkid": true,
+         // Meta file
+         "train_file": "train.json",
+         "valid_file": "test.json",
+         "spk2id": "singers.json",
+         "utt2spk": "utt2singer"
+     },
+     "model": {
+         "condition_encoder": {
+             "merge_mode": "add",
+             "input_melody_dim": 1,
+             "use_log_f0": true,
+             "n_bins_melody": 256,
+             //# Quantization (0 for not quantization)
+             "output_melody_dim": 384,
+             "input_loudness_dim": 1,
+             "use_log_loudness": true,
+             "n_bins_loudness": 256,
+             "output_loudness_dim": 384,
+             "use_whisper": false,
+             "use_contentvec": false,
+             "use_wenet": false,
+             "use_mert": false,
+             "whisper_dim": 1024,
+             "contentvec_dim": 256,
+             "mert_dim": 256,
+             "wenet_dim": 512,
+             "content_encoder_dim": 384,
+             "output_singer_dim": 384,
+             "singer_table_size": 512,
+             "output_content_dim": 384,
+             "use_spkid": true
+         },
+         // FIXME: FOLLOWING ARE NEW!!
+         "diffusion": {
+             "scheduler": "ddpm",
+             "scheduler_settings": {
+                 "num_train_timesteps": 1000,
+                 "beta_start": 1.0e-4,
+                 "beta_end": 0.02,
+                 "beta_schedule": "linear"
+             },
+             // Diffusion steps encoder
+             "step_encoder": {
+                 "dim_raw_embedding": 128,
+                 "dim_hidden_layer": 512,
+                 "activation": "SiLU",
+                 "num_layer": 2,
+                 "max_period": 10000
+             },
+             // Diffusion decoder
+             "model_type": "bidilconv",
+             // bidilconv, unet2d, TODO: unet1d
+             "bidilconv": {
+                 "base_channel": 384,
+                 "n_res_block": 20,
+                 "conv_kernel_size": 3,
+                 "dilation_cycle_length": 4,
+                 // specially, 1 means no dilation
+                 "conditioner_size": 384
+             },
+             "unet2d": {
+                 "in_channels": 1,
+                 "out_channels": 1,
+                 "down_block_types": [
+                     "CrossAttnDownBlock2D",
+                     "CrossAttnDownBlock2D",
+                     "CrossAttnDownBlock2D",
+                     "DownBlock2D"
+                 ],
+                 "mid_block_type": "UNetMidBlock2DCrossAttn",
+                 "up_block_types": [
+                     "UpBlock2D",
+                     "CrossAttnUpBlock2D",
+                     "CrossAttnUpBlock2D",
+                     "CrossAttnUpBlock2D"
+                 ],
+                 "only_cross_attention": false
+             }
+         }
+     },
+     // FIXME: FOLLOWING ARE NEW!!
+     "train": {
+         // Basic settings
+         "batch_size": 64,
+         "gradient_accumulation_step": 1,
+         "max_epoch": -1,
+         // -1 means no limit
+         "save_checkpoint_stride": [
+             5,
+             20
+         ],
+         // unit is epoch
+         "keep_last": [
+             3,
+             -1
+         ],
+         // -1 means infinite, if one number will broadcast
+         "run_eval": [
+             false,
+             true
+         ],
+         // if one number will broadcast
+         // Fix the random seed
+         "random_seed": 10086,
+         // Batchsampler
+         "sampler": {
+             "holistic_shuffle": true,
+             "drop_last": true
+         },
+         // Dataloader
+         "dataloader": {
+             "num_worker": 32,
+             "pin_memory": true
+         },
+         // Trackers
+         "tracker": [
+             "tensorboard"
+             // "wandb",
+             // "cometml",
+             // "mlflow",
+         ],
+         // Optimizer
+         "optimizer": "AdamW",
+         "adamw": {
+             "lr": 4.0e-4
+             // nn model lr
+         },
+         // LR Scheduler
+         "scheduler": "ReduceLROnPlateau",
+         "reducelronplateau": {
+             "factor": 0.8,
+             "patience": 10,
+             // unit is epoch
+             "min_lr": 1.0e-4
+         }
+     },
+     "inference": {
+         "diffusion": {
+             "scheduler": "pndm",
+             "scheduler_settings": {
+                 "num_inference_timesteps": 1000
+             }
+         }
+     }
+ }
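Note the train/inference split in this config (the same split the args.json checkpoint above inherited): training noise follows a DDPM schedule (linear betas from 1e-4 to 0.02 over 1000 steps) while inference uses a PNDM sampler. The scheduler names match Hugging Face diffusers classes; a sketch of instantiating them that way, assuming diffusers is the intended backend (not shown in this diff):

from diffusers import DDPMScheduler, PNDMScheduler

train_scheduler = DDPMScheduler(
    num_train_timesteps=1000,
    beta_start=1.0e-4,
    beta_end=0.02,
    beta_schedule="linear",
)
inference_scheduler = PNDMScheduler(num_train_timesteps=1000)
inference_scheduler.set_timesteps(1000)  # "num_inference_timesteps" from the inference block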
config/fs2.json ADDED
@@ -0,0 +1,117 @@
+ {
+     "base_config": "config/tts.json",
+     "model_type": "FastSpeech2",
+     "task_type": "tts",
+     "dataset": ["LJSpeech"],
+     "preprocess": {
+         // acoustic features
+         "extract_audio": true,
+         "extract_mel": true,
+         "mel_extract_mode": "taco",
+         "mel_min_max_norm": false,
+         "extract_pitch": true,
+         "extract_uv": false,
+         "pitch_extractor": "dio",
+         "extract_energy": true,
+         "energy_extract_mode": "from_tacotron_stft",
+         "extract_duration": true,
+         "use_phone": true,
+         "pitch_norm": true,
+         "energy_norm": true,
+         "pitch_remove_outlier": true,
+         "energy_remove_outlier": true,
+
+         // Default config
+         "n_mel": 80,
+         "win_size": 1024, // todo
+         "hop_size": 256,
+         "sample_rate": 22050,
+         "n_fft": 1024, // todo
+         "fmin": 0,
+         "fmax": 8000, // todo
+         "raw_data": "raw_data",
+         "text_cleaners": ["english_cleaners"],
+         "f0_min": 71, // ~C2
+         "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
+         "pitch_bin": 256,
+         "pitch_max": 1100.0,
+         "pitch_min": 50.0,
+         "is_label": true,
+         "is_mu_law": true,
+         "bits": 8,
+
+         "mel_min_max_stats_dir": "mel_min_max_stats",
+         "whisper_dir": "whisper",
+         "content_vector_dir": "content_vector",
+         "wenet_dir": "wenet",
+         "mert_dir": "mert",
+         "spk2id": "spk2id.json",
+         "utt2spk": "utt2spk",
+
+         // Features used for model training
+         "use_mel": true,
+         "use_min_max_norm_mel": false,
+         "use_frame_pitch": false,
+         "use_frame_energy": false,
+         "use_phone_pitch": true,
+         "use_phone_energy": true,
+         "use_log_scale_pitch": false,
+         "use_log_scale_energy": false,
+         "use_spkid": false,
+         "align_mel_duration": true,
+         "text_cleaners": ["english_cleaners"]
+     },
+     "model": {
+         // Settings for transformer
+         "transformer": {
+             "encoder_layer": 4,
+             "encoder_head": 2,
+             "encoder_hidden": 256,
+             "decoder_layer": 6,
+             "decoder_head": 2,
+             "decoder_hidden": 256,
+             "conv_filter_size": 1024,
+             "conv_kernel_size": [9, 1],
+             "encoder_dropout": 0.2,
+             "decoder_dropout": 0.2
+         },
+
+         // Settings for variance_predictor
+         "variance_predictor": {
+             "filter_size": 256,
+             "kernel_size": 3,
+             "dropout": 0.5
+         },
+         "variance_embedding": {
+             "pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
+             "energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
+             "n_bins": 256
+         },
+         "max_seq_len": 1000
+     },
+     "train": {
+         "batch_size": 16,
+         "sort_sample": true,
+         "drop_last": true,
+         "group_size": 4,
+         "grad_clip_thresh": 1.0,
+         "dataloader": {
+             "num_worker": 8,
+             "pin_memory": true
+         },
+         "lr_scheduler": {
+             "num_warmup": 4000
+         },
+         // LR Scheduler
+         "scheduler": "NoamLR",
+         // Optimizer
+         "optimizer": "Adam",
+         "adam": {
+             "lr": 0.0625,
+             "betas": [0.9, 0.98],
+             "eps": 0.000000001,
+             "weight_decay": 0.0
+         },
+     }
+
+ }
config/transformer.json ADDED
@@ -0,0 +1,180 @@
+ {
+     "base_config": "config/base.json",
+     "model_type": "Transformer",
+     "task_type": "svc",
+     "use_custom_dataset": false,
+     "preprocess": {
+         // data augmentations
+         "use_pitch_shift": false,
+         "use_formant_shift": false,
+         "use_time_stretch": false,
+         "use_equalizer": false,
+         // acoustic features
+         "extract_mel": true,
+         "mel_min_max_norm": true,
+         "extract_pitch": true,
+         "pitch_extractor": "parselmouth",
+         "extract_uv": true,
+         "extract_energy": true,
+         // content features
+         "extract_whisper_feature": false,
+         "whisper_sample_rate": 16000,
+         "extract_contentvec_feature": false,
+         "contentvec_sample_rate": 16000,
+         "extract_wenet_feature": false,
+         "wenet_sample_rate": 16000,
+         "extract_mert_feature": false,
+         "mert_sample_rate": 16000,
+         // Default config for whisper
+         "whisper_frameshift": 0.01,
+         "whisper_downsample_rate": 2,
+         // Default config for content vector
+         "contentvec_frameshift": 0.02,
+         // Default config for mert
+         "mert_model": "m-a-p/MERT-v1-330M",
+         "mert_feature_layer": -1,
+         "mert_hop_size": 320,
+         // 24k
+         "mert_frameshit": 0.01333,
+         // 10ms
+         "wenet_frameshift": 0.01,
+         // wenetspeech is 4, gigaspeech is 6
+         "wenet_downsample_rate": 4,
+         // Default config
+         "n_mel": 100,
+         "win_size": 1024,
+         // todo
+         "hop_size": 256,
+         "sample_rate": 24000,
+         "n_fft": 1024,
+         // todo
+         "fmin": 0,
+         "fmax": 12000,
+         // todo
+         "f0_min": 50,
+         // ~C2
+         "f0_max": 1100,
+         //1100, // ~C6(1100), ~G5(800)
+         "pitch_bin": 256,
+         "pitch_max": 1100.0,
+         "pitch_min": 50.0,
+         "is_label": true,
+         "is_mu_law": true,
+         "bits": 8,
+         "mel_min_max_stats_dir": "mel_min_max_stats",
+         "whisper_dir": "whisper",
+         "contentvec_dir": "contentvec",
+         "wenet_dir": "wenet",
+         "mert_dir": "mert",
+         // Extract content features using dataloader
+         "pin_memory": true,
+         "num_workers": 8,
+         "content_feature_batch_size": 16,
+         // Features used for model training
+         "use_mel": true,
+         "use_min_max_norm_mel": true,
+         "use_frame_pitch": true,
+         "use_uv": true,
+         "use_frame_energy": true,
+         "use_log_scale_pitch": false,
+         "use_log_scale_energy": false,
+         "use_spkid": true,
+         // Meta file
+         "train_file": "train.json",
+         "valid_file": "test.json",
+         "spk2id": "singers.json",
+         "utt2spk": "utt2singer"
+     },
+     "model": {
+         "condition_encoder": {
+             "merge_mode": "add",
+             "input_melody_dim": 1,
+             "use_log_f0": true,
+             "n_bins_melody": 256,
+             //# Quantization (0 for not quantization)
+             "output_melody_dim": 384,
+             "input_loudness_dim": 1,
+             "use_log_loudness": true,
+             "n_bins_loudness": 256,
+             "output_loudness_dim": 384,
+             "use_whisper": false,
+             "use_contentvec": true,
+             "use_wenet": false,
+             "use_mert": false,
+             "whisper_dim": 1024,
+             "contentvec_dim": 256,
+             "mert_dim": 256,
+             "wenet_dim": 512,
+             "content_encoder_dim": 384,
+             "output_singer_dim": 384,
+             "singer_table_size": 512,
+             "output_content_dim": 384,
+             "use_spkid": true
+         },
+         "transformer": {
+             "type": "conformer",
+             // 'conformer' or 'transformer'
+             "input_dim": 384,
+             "output_dim": 100,
+             "n_heads": 2,
+             "n_layers": 6,
+             "filter_channels": 512,
+             "dropout": 0.1,
+         }
+     },
+     "train": {
+         // Basic settings
+         "batch_size": 64,
+         "gradient_accumulation_step": 1,
+         "max_epoch": -1,
+         // -1 means no limit
+         "save_checkpoint_stride": [
+             10,
+             100
+         ],
+         // unit is epoch
+         "keep_last": [
+             3,
+             -1
+         ],
+         // -1 means infinite, if one number will broadcast
+         "run_eval": [
+             false,
+             true
+         ],
+         // if one number will broadcast
+         // Fix the random seed
+         "random_seed": 10086,
+         // Batchsampler
+         "sampler": {
+             "holistic_shuffle": true,
+             "drop_last": true
+         },
+         // Dataloader
+         "dataloader": {
+             "num_worker": 32,
+             "pin_memory": true
+         },
+         // Trackers
+         "tracker": [
+             "tensorboard"
+             // "wandb",
+             // "cometml",
+             // "mlflow",
+         ],
+         // Optimizer
+         "optimizer": "AdamW",
+         "adamw": {
+             "lr": 4.0e-4
+             // nn model lr
+         },
+         // LR Scheduler
+         "scheduler": "ReduceLROnPlateau",
+         "reducelronplateau": {
+             "factor": 0.8,
+             "patience": 10,
+             // unit is epoch
+             "min_lr": 1.0e-4
+         }
+     }
+ }
config/tts.json ADDED
@@ -0,0 +1,23 @@
+ {
+     "base_config": "config/base.json",
+     "supported_model_type": [
+         "Fastspeech2",
+         "VITS",
+         "VALLE",
+     ],
+     "task_type": "tts",
+     "preprocess": {
+         "language": "en-us",
+         // linguistic features
+         "extract_phone": true,
+         "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
+         "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+         // Directory names of processed data or extracted features
+         "phone_dir": "phones",
+         "use_phone": true,
+     },
+     "model": {
+         "text_token_num": 512,
+     }
+
+ }
config/valle.json ADDED
@@ -0,0 +1,52 @@
+ {
+     "base_config": "config/tts.json",
+     "model_type": "VALLE",
+     "task_type": "tts",
+     "dataset": [
+         "libritts"
+     ],
+     "preprocess": {
+         "extract_phone": true,
+         "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
+         "extract_acoustic_token": true,
+         "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
+         "acoustic_token_dir": "acoutic_tokens",
+         "use_text": false,
+         "use_phone": true,
+         "use_acoustic_token": true,
+         "symbols_dict": "symbols.dict",
+         "min_duration": 0.5, // the duration lowerbound to filter the audio with duration < min_duration
+         "max_duration": 14, // the duration uperbound to filter the audio with duration > max_duration.
+         "sampling_rate": 24000,
+     },
+     "model": {
+         "text_token_num": 512,
+         "audio_token_num": 1024,
+         "decoder_dim": 1024, // embedding dimension of the decoder model
+         "nhead": 16, // number of attention heads in the decoder layers
+         "num_decoder_layers": 12, // number of decoder layers
+         "norm_first": true, // pre or post Normalization.
+         "add_prenet": false, // whether add PreNet after Inputs
+         "prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
+         "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
+         "nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
+         "prepend_bos": false, // whether prepend <BOS> to the acoustic tokens -> AR Decoder inputs
+         "num_quantizers": 8, // numbert of the audio quantization layers
+         // "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
+     },
+     "train": {
+         "ddp": false,
+         "train_stage": 1, // 0: train all modules, For VALL_E, support 1: AR Decoder 2: NAR Decoder(s)
+         "max_epoch": 20,
+         "optimizer": "ScaledAdam",
+         "scheduler": "Eden",
+         "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
+         "base_lr": 0.05, // base learning rate."
+         "valid_interval": 1000,
+         "log_epoch_step": 1000,
+         "save_checkpoint_stride": [
+             1,
+             1
+         ]
+     }
+ }
config/vits.json ADDED
@@ -0,0 +1,101 @@
+ {
+     "base_config": "config/tts.json",
+     "model_type": "VITS",
+     "task_type": "tts",
+     "preprocess": {
+         "extract_phone": true,
+         "extract_mel": true,
+         "n_mel": 80,
+         "fmin": 0,
+         "fmax": null,
+         "extract_linear_spec": true,
+         "extract_audio": true,
+         "use_linear": true,
+         "use_mel": true,
+         "use_audio": true,
+         "use_text": false,
+         "use_phone": true,
+         "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
+         "n_fft": 1024,
+         "win_size": 1024,
+         "hop_size": 256,
+         "segment_size": 8192,
+         "text_cleaners": [
+             "english_cleaners"
+         ]
+     },
+     "model": {
+         "text_token_num": 512,
+         "inter_channels": 192,
+         "hidden_channels": 192,
+         "filter_channels": 768,
+         "n_heads": 2,
+         "n_layers": 6,
+         "kernel_size": 3,
+         "p_dropout": 0.1,
+         "resblock": "1",
+         "resblock_kernel_sizes": [
+             3,
+             7,
+             11
+         ],
+         "resblock_dilation_sizes": [
+             [
+                 1,
+                 3,
+                 5
+             ],
+             [
+                 1,
+                 3,
+                 5
+             ],
+             [
+                 1,
+                 3,
+                 5
+             ]
+         ],
+         "upsample_rates": [
+             8,
+             8,
+             2,
+             2
+         ],
+         "upsample_initial_channel": 512,
+         "upsample_kernel_sizes": [
+             16,
+             16,
+             4,
+             4
+         ],
+         "n_layers_q": 3,
+         "use_spectral_norm": false,
+         "n_speakers": 0, // number of speakers, while be automatically set if n_speakers is 0 and multi_speaker_training is true
+         "gin_channels": 256,
+         "use_sdp": true
+     },
+     "train": {
+         "fp16_run": true,
+         "learning_rate": 2e-4,
+         "betas": [
+             0.8,
+             0.99
+         ],
+         "eps": 1e-9,
+         "batch_size": 16,
+         "lr_decay": 0.999875,
+         // "segment_size": 8192,
+         "init_lr_ratio": 1,
+         "warmup_epochs": 0,
+         "c_mel": 45,
+         "c_kl": 1.0,
+         "AdamW": {
+             "betas": [
+                 0.8,
+                 0.99
+             ],
+             "eps": 1e-9,
+         }
+     }
+ }
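A consistency check worth knowing for the VITS block: the decoder's "upsample_rates" must multiply out to "hop_size" so that one spectrogram frame maps back to exactly one hop of waveform; here 8*8*2*2 = 256 matches, and "segment_size": 8192 therefore corresponds to 8192 / 256 = 32 frames per randomly cropped training slice:

import math

upsample_rates = [8, 8, 2, 2]
hop_size, segment_size = 256, 8192
assert math.prod(upsample_rates) == hop_size   # one frame -> 256 waveform samples
assert segment_size // hop_size == 32          # frames per training segment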
config/vocoder.json ADDED
@@ -0,0 +1,84 @@
+ {
+     "base_config": "config/base.json",
+     "dataset": [
+         "LJSpeech",
+         "LibriTTS",
+         "opencpop",
+         "m4singer",
+         "svcc",
+         "svcceval",
+         "pjs",
+         "opensinger",
+         "popbutfy",
+         "nus48e",
+         "popcs",
+         "kising",
+         "csd",
+         "opera",
+         "vctk",
+         "lijian",
+         "cdmusiceval"
+     ],
+     "task_type": "vocoder",
+     "preprocess": {
+         // acoustic features
+         "extract_mel": true,
+         "extract_pitch": false,
+         "extract_uv": false,
+         "extract_audio": true,
+         "extract_label": false,
+         "extract_one_hot": false,
+         "extract_amplitude_phase": false,
+         "pitch_extractor": "parselmouth",
+         // Settings for data preprocessing
+         "n_mel": 100,
+         "win_size": 1024,
+         "hop_size": 256,
+         "sample_rate": 24000,
+         "n_fft": 1024,
+         "fmin": 0,
+         "fmax": 12000,
+         "f0_min": 50,
+         "f0_max": 1100,
+         "pitch_bin": 256,
+         "pitch_max": 1100.0,
+         "pitch_min": 50.0,
+         "is_mu_law": false,
+         "bits": 8,
+         "cut_mel_frame": 32,
+         // Directory names of processed data or extracted features
+         "spk2id": "singers.json",
+         // Features used for model training
+         "use_mel": true,
+         "use_frame_pitch": false,
+         "use_uv": false,
+         "use_audio": true,
+         "use_label": false,
+         "use_one_hot": false,
+         "train_file": "train.json",
+         "valid_file": "test.json"
+     },
+     "train": {
+         "random_seed": 114514,
+         "batch_size": 64,
+         "gradient_accumulation_step": 1,
+         "max_epoch": 1000000,
+         "save_checkpoint_stride": [
+             20
+         ],
+         "run_eval": [
+             true
+         ],
+         "sampler": {
+             "holistic_shuffle": true,
+             "drop_last": true
+         },
+         "dataloader": {
+             "num_worker": 4,
+             "pin_memory": true
+         },
+         "tracker": [
+             "tensorboard"
+         ],
+     }
+ }