Billpai committed
Commit 0a17a23 · 1 Parent(s): f03cd94
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. ckpts/svc/vocalist_l1_contentvec+whisper/args.json +0 -257
  2. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin +0 -3
  3. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin +0 -3
  4. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl +0 -3
  5. ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json +0 -17
  6. ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0 +0 -3
  7. ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1 +0 -3
  8. ckpts/svc/vocalist_l1_contentvec+whisper/singers.json +0 -17
  9. config/audioldm.json +0 -92
  10. config/autoencoderkl.json +0 -69
  11. config/base.json +0 -220
  12. config/comosvc.json +0 -216
  13. config/diffusion.json +0 -227
  14. config/fs2.json +0 -117
  15. config/transformer.json +0 -180
  16. config/tts.json +0 -23
  17. config/valle.json +0 -52
  18. config/vits.json +0 -101
  19. config/vocoder.json +0 -84
  20. egs/svc/MultipleContentsSVC/README.md +0 -153
  21. egs/svc/MultipleContentsSVC/exp_config.json +0 -126
  22. egs/svc/MultipleContentsSVC/run.sh +0 -1
  23. egs/svc/README.md +0 -34
  24. egs/svc/_template/run.sh +0 -150
  25. egs/vocoder/README.md +0 -23
  26. egs/vocoder/diffusion/README.md +0 -0
  27. egs/vocoder/diffusion/exp_config_base.json +0 -0
  28. egs/vocoder/gan/README.md +0 -224
  29. egs/vocoder/gan/_template/run.sh +0 -143
  30. egs/vocoder/gan/apnet/exp_config.json +0 -45
  31. egs/vocoder/gan/apnet/run.sh +0 -143
  32. egs/vocoder/gan/bigvgan/exp_config.json +0 -66
  33. egs/vocoder/gan/bigvgan/run.sh +0 -143
  34. egs/vocoder/gan/bigvgan_large/exp_config.json +0 -70
  35. egs/vocoder/gan/bigvgan_large/run.sh +0 -143
  36. egs/vocoder/gan/exp_config_base.json +0 -111
  37. egs/vocoder/gan/hifigan/exp_config.json +0 -59
  38. egs/vocoder/gan/hifigan/run.sh +0 -143
  39. egs/vocoder/gan/melgan/exp_config.json +0 -34
  40. egs/vocoder/gan/melgan/run.sh +0 -143
  41. egs/vocoder/gan/nsfhifigan/exp_config.json +0 -83
  42. egs/vocoder/gan/nsfhifigan/run.sh +0 -143
  43. egs/vocoder/gan/tfr_enhanced_hifigan/README.md +0 -185
  44. egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json +0 -118
  45. egs/vocoder/gan/tfr_enhanced_hifigan/run.sh +0 -145
  46. examples/chinese_female_recordings.wav +0 -3
  47. examples/chinese_male_seperated.wav +0 -3
  48. examples/english_female_seperated.wav +0 -3
  49. examples/english_male_recordings.wav +0 -3
  50. examples/output/.DS_Store +0 -0
ckpts/svc/vocalist_l1_contentvec+whisper/args.json DELETED
@@ -1,257 +0,0 @@
-{
-    "task_type": "svc",
-    "dataset": [
-        "vocalist_l1",
-    ],
-    "exp_name": "vocalist_l1_contentvec+whisper",
-    "inference": {
-        "diffusion": {
-            "scheduler": "pndm",
-            "scheduler_settings": {
-                "num_inference_timesteps": 1000,
-            },
-        },
-    },
-    "model": {
-        "condition_encoder": {
-            "content_encoder_dim": 384,
-            "contentvec_dim": 256,
-            "f0_max": 1100,
-            "f0_min": 50,
-            "input_loudness_dim": 1,
-            "input_melody_dim": 1,
-            "merge_mode": "add",
-            "mert_dim": 256,
-            "n_bins_loudness": 256,
-            "n_bins_melody": 256,
-            "output_content_dim": 384,
-            "output_loudness_dim": 384,
-            "output_melody_dim": 384,
-            "output_singer_dim": 384,
-            "pitch_max": 1100,
-            "pitch_min": 50,
-            "singer_table_size": 512,
-            "use_conformer_for_content_features": false,
-            "use_contentvec": true,
-            "use_log_f0": true,
-            "use_log_loudness": true,
-            "use_mert": false,
-            "use_singer_encoder": true,
-            "use_spkid": true,
-            "use_wenet": false,
-            "use_whisper": true,
-            "wenet_dim": 512,
-            "whisper_dim": 1024,
-        },
-        "diffusion": {
-            "bidilconv": {
-                "base_channel": 384,
-                "conditioner_size": 384,
-                "conv_kernel_size": 3,
-                "dilation_cycle_length": 4,
-                "n_res_block": 20,
-            },
-            "model_type": "bidilconv",
-            "scheduler": "ddpm",
-            "scheduler_settings": {
-                "beta_end": 0.02,
-                "beta_schedule": "linear",
-                "beta_start": 0.0001,
-                "num_train_timesteps": 1000,
-            },
-            "step_encoder": {
-                "activation": "SiLU",
-                "dim_hidden_layer": 512,
-                "dim_raw_embedding": 128,
-                "max_period": 10000,
-                "num_layer": 2,
-            },
-            "unet2d": {
-                "down_block_types": [
-                    "CrossAttnDownBlock2D",
-                    "CrossAttnDownBlock2D",
-                    "CrossAttnDownBlock2D",
-                    "DownBlock2D",
-                ],
-                "in_channels": 1,
-                "mid_block_type": "UNetMidBlock2DCrossAttn",
-                "only_cross_attention": false,
-                "out_channels": 1,
-                "up_block_types": [
-                    "UpBlock2D",
-                    "CrossAttnUpBlock2D",
-                    "CrossAttnUpBlock2D",
-                    "CrossAttnUpBlock2D",
-                ],
-            },
-        },
-    },
-    "model_type": "DiffWaveNetSVC",
-    "preprocess": {
-        "audio_dir": "audios",
-        "bits": 8,
-        "content_feature_batch_size": 16,
-        "contentvec_batch_size": 1,
-        "contentvec_dir": "contentvec",
-        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
-        "contentvec_frameshift": 0.02,
-        "contentvec_sample_rate": 16000,
-        "dur_dir": "durs",
-        "duration_dir": "duration",
-        "emo2id": "emo2id.json",
-        "energy_dir": "energys",
-        "extract_audio": false,
-        "extract_contentvec_feature": true,
-        "extract_energy": true,
-        "extract_label": false,
-        "extract_mcep": false,
-        "extract_mel": true,
-        "extract_mert_feature": false,
-        "extract_pitch": true,
-        "extract_uv": true,
-        "extract_wenet_feature": false,
-        "extract_whisper_feature": true,
-        "f0_max": 1100,
-        "f0_min": 50,
-        "file_lst": "file.lst",
-        "fmax": 12000,
-        "fmin": 0,
-        "hop_size": 256,
-        "is_label": true,
-        "is_mu_law": true,
-        "lab_dir": "labs",
-        "label_dir": "labels",
-        "mcep_dir": "mcep",
-        "mel_dir": "mels",
-        "mel_min_max_norm": true,
-        "mel_min_max_stats_dir": "mel_min_max_stats",
-        "mert_dir": "mert",
-        "mert_feature_layer": -1,
-        "mert_frameshit": 0.01333,
-        "mert_hop_size": 320,
-        "mert_model": "m-a-p/MERT-v1-330M",
-        "min_level_db": -115,
-        "mu_law_norm": false,
-        "n_fft": 1024,
-        "n_mel": 100,
-        "num_silent_frames": 8,
-        "num_workers": 8,
-        "phone_seq_file": "phone_seq_file",
-        "pin_memory": true,
-        "pitch_bin": 256,
-        "pitch_dir": "pitches",
-        "pitch_extractor": "crepe", // "parselmouth"
-        "pitch_max": 1100.0,
-        "pitch_min": 50.0,
-        "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
-        "ref_level_db": 20,
-        "sample_rate": 24000,
-        "spk2id": "singers.json",
-        "train_file": "train.json",
-        "trim_fft_size": 512,
-        "trim_hop_size": 128,
-        "trim_silence": false,
-        "trim_top_db": 30,
-        "trimmed_wav_dir": "trimmed_wavs",
-        "use_audio": false,
-        "use_contentvec": true,
-        "use_dur": false,
-        "use_emoid": false,
-        "use_frame_duration": false,
-        "use_frame_energy": true,
-        "use_frame_pitch": true,
-        "use_lab": false,
-        "use_label": false,
-        "use_log_scale_energy": false,
-        "use_log_scale_pitch": false,
-        "use_mel": true,
-        "use_mert": false,
-        "use_min_max_norm_mel": true,
-        "use_one_hot": false,
-        "use_phn_seq": false,
-        "use_phone_duration": false,
-        "use_phone_energy": false,
-        "use_phone_pitch": false,
-        "use_spkid": true,
-        "use_uv": true,
-        "use_wav": false,
-        "use_wenet": false,
-        "use_whisper": true,
-        "utt2emo": "utt2emo",
-        "utt2spk": "utt2singer",
-        "uv_dir": "uvs",
-        "valid_file": "test.json",
-        "wav_dir": "wavs",
-        "wenet_batch_size": 1,
-        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
-        "wenet_dir": "wenet",
-        "wenet_downsample_rate": 4,
-        "wenet_frameshift": 0.01,
-        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
-        "wenet_sample_rate": 16000,
-        "whisper_batch_size": 30,
-        "whisper_dir": "whisper",
-        "whisper_downsample_rate": 2,
-        "whisper_frameshift": 0.01,
-        "whisper_model": "medium",
-        "whisper_model_path": "pretrained/whisper/medium.pt",
-        "whisper_sample_rate": 16000,
-        "win_size": 1024,
-    },
-    "supported_model_type": [
-        "Fastspeech2",
-        "DiffSVC",
-        "Transformer",
-        "EDM",
-        "CD",
-    ],
-    "train": {
-        "adamw": {
-            "lr": 0.0004,
-        },
-        "batch_size": 32,
-        "dataloader": {
-            "num_worker": 8,
-            "pin_memory": true,
-        },
-        "ddp": true,
-        "epochs": 50000,
-        "gradient_accumulation_step": 1,
-        "keep_checkpoint_max": 5,
-        "keep_last": [
-            5,
-            -1,
-        ],
-        "max_epoch": -1,
-        "max_steps": 1000000,
-        "multi_speaker_training": false,
-        "optimizer": "AdamW",
-        "random_seed": 10086,
-        "reducelronplateau": {
-            "factor": 0.8,
-            "min_lr": 0.0001,
-            "patience": 10,
-        },
-        "run_eval": [
-            false,
-            true,
-        ],
-        "sampler": {
-            "drop_last": true,
-            "holistic_shuffle": false,
-        },
-        "save_checkpoint_stride": [
-            3,
-            10,
-        ],
-        "save_checkpoints_steps": 10000,
-        "save_summary_steps": 500,
-        "scheduler": "ReduceLROnPlateau",
-        "total_training_steps": 50000,
-        "tracker": [
-            "tensorboard",
-        ],
-        "valid_interval": 10000,
-    },
-    "use_custom_dataset": true,
-}

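Note that `args.json` above (like the other Amphion configs below) contains `//` comments and trailing commas, so Python's standard `json` module cannot parse it directly. A minimal sketch of one way such a file could be loaded, assuming a naive comment-stripping pass (the `load_jsonc` helper is illustrative, not part of the repo):

```python
import json
import re

def load_jsonc(path):
    """Load a JSON file that may contain //-comments and trailing commas."""
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    # Naive comment stripping: assumes no "//" occurs inside string values.
    text = re.sub(r"//[^\n]*", "", text)
    # Drop trailing commas before a closing bracket or brace.
    text = re.sub(r",\s*([\]}])", r"\1", text)
    return json.loads(text)

# e.g. args = load_jsonc("ckpts/svc/vocalist_l1_contentvec+whisper/args.json")
```
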
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/optimizer.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:836af10b834c7aec9209eb19ce43559e6ef1e3a59bd6468e90cadbc9a18749ef
-size 249512389

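The deleted `.bin`/`.pkl` checkpoint files in this commit are Git LFS pointer files (a `version` line, a SHA-256 `oid`, and a `size` in bytes), not the binary weights themselves. A small illustrative parser, assuming the standard three-line pointer format:

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file into its version/oid/size fields."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    fields["size"] = int(fields["size"])  # object size in bytes
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:836af10b834c7aec9209eb19ce43559e6ef1e3a59bd6468e90cadbc9a18749ef\n"
    "size 249512389"
)
print(parse_lfs_pointer(pointer)["size"])  # 249512389
```
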
ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d54eed12bef331095fc367f196d07c5061d5cb72dd6fe0e1e4453b997bf1d68d
-size 124755137

ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/random_states_0.pkl DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:6798ddffadcd7d5405a77e667c674c474e4fef0cba817fdd300c7c985c1e82fe
-size 14599

ckpts/svc/vocalist_l1_contentvec+whisper/checkpoint/epoch-6852_step-0678447_loss-1.946773/singers.json DELETED
@@ -1,17 +0,0 @@
-{
-    "vocalist_l1_Adele": 0,
-    "vocalist_l1_Beyonce": 1,
-    "vocalist_l1_BrunoMars": 2,
-    "vocalist_l1_JohnMayer": 3,
-    "vocalist_l1_MichaelJackson": 4,
-    "vocalist_l1_TaylorSwift": 5,
-    "vocalist_l1_张学友": 6,
-    "vocalist_l1_李健": 7,
-    "vocalist_l1_汪峰": 8,
-    "vocalist_l1_王菲": 9,
-    "vocalist_l1_石倚洁": 10,
-    "vocalist_l1_蔡琴": 11,
-    "vocalist_l1_那英": 12,
-    "vocalist_l1_陈奕迅": 13,
-    "vocalist_l1_陶喆": 14
-}

ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.0 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d7f490fd0c97876e24bfc44413365ded7ff5d22c1c79f0dac0b754f3b32df76f
-size 88

ckpts/svc/vocalist_l1_contentvec+whisper/log/vocalist_l1_contentvec+whisper/events.out.tfevents.1696052302.mmnewyardnodesz63219.120.1 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e01bcf2fa621ba563b70568c18fe0742d0f48cafae83a6e8beb0bb6d1f6d146d
-size 77413046

ckpts/svc/vocalist_l1_contentvec+whisper/singers.json DELETED
@@ -1,17 +0,0 @@
-{
-    "vocalist_l1_Adele": 0,
-    "vocalist_l1_Beyonce": 1,
-    "vocalist_l1_BrunoMars": 2,
-    "vocalist_l1_JohnMayer": 3,
-    "vocalist_l1_MichaelJackson": 4,
-    "vocalist_l1_TaylorSwift": 5,
-    "vocalist_l1_张学友": 6,
-    "vocalist_l1_李健": 7,
-    "vocalist_l1_汪峰": 8,
-    "vocalist_l1_王菲": 9,
-    "vocalist_l1_石倚洁": 10,
-    "vocalist_l1_蔡琴": 11,
-    "vocalist_l1_那英": 12,
-    "vocalist_l1_陈奕迅": 13,
-    "vocalist_l1_陶喆": 14
-}

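`singers.json` is the `spk2id` table named in `args.json`: it maps each of the 15 singer names to a contiguous integer id. In a multi-singer model such an id would typically index a learned speaker embedding table; the sketch below shows that usage under the config's `singer_table_size: 512` and `output_singer_dim: 384` (this is an assumption about usage, not Amphion's exact code):

```python
import json

import torch

with open("singers.json", encoding="utf-8") as f:
    spk2id = json.load(f)  # {"vocalist_l1_Adele": 0, ...}

# Hypothetical lookup: one learned embedding row per singer id.
singer_table = torch.nn.Embedding(num_embeddings=512, embedding_dim=384)
spk_id = torch.tensor([spk2id["vocalist_l1_Adele"]])
spk_vec = singer_table(spk_id)  # shape: (1, 384)
```
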
config/audioldm.json DELETED
@@ -1,92 +0,0 @@
-{
-    "base_config": "config/base.json",
-    "model_type": "AudioLDM",
-    "task_type": "tta",
-    "dataset": [
-        "AudioCaps"
-    ],
-    "preprocess": {
-        // feature used for model training
-        "use_spkid": false,
-        "use_uv": false,
-        "use_frame_pitch": false,
-        "use_phone_pitch": false,
-        "use_frame_energy": false,
-        "use_phone_energy": false,
-        "use_mel": false,
-        "use_audio": false,
-        "use_label": false,
-        "use_one_hot": false,
-        "cond_mask_prob": 0.1
-    },
-    // model
-    "model": {
-        "audioldm": {
-            "image_size": 32,
-            "in_channels": 4,
-            "out_channels": 4,
-            "model_channels": 256,
-            "attention_resolutions": [
-                4,
-                2,
-                1
-            ],
-            "num_res_blocks": 2,
-            "channel_mult": [
-                1,
-                2,
-                4
-            ],
-            "num_heads": 8,
-            "use_spatial_transformer": true,
-            "transformer_depth": 1,
-            "context_dim": 768,
-            "use_checkpoint": true,
-            "legacy": false
-        },
-        "autoencoderkl": {
-            "ch": 128,
-            "ch_mult": [
-                1,
-                1,
-                2,
-                2,
-                4
-            ],
-            "num_res_blocks": 2,
-            "in_channels": 1,
-            "z_channels": 4,
-            "out_ch": 1,
-            "double_z": true
-        },
-        "noise_scheduler": {
-            "num_train_timesteps": 1000,
-            "beta_start": 0.00085,
-            "beta_end": 0.012,
-            "beta_schedule": "scaled_linear",
-            "clip_sample": false,
-            "steps_offset": 1,
-            "set_alpha_to_one": false,
-            "skip_prk_steps": true,
-            "prediction_type": "epsilon"
-        }
-    },
-    // train
-    "train": {
-        "lronPlateau": {
-            "factor": 0.9,
-            "patience": 100,
-            "min_lr": 4.0e-5,
-            "verbose": true
-        },
-        "adam": {
-            "lr": 5.0e-5,
-            "betas": [
-                0.9,
-                0.999
-            ],
-            "weight_decay": 1.0e-2,
-            "eps": 1.0e-8
-        }
-    }
-}

config/autoencoderkl.json DELETED
@@ -1,69 +0,0 @@
-{
-    "base_config": "config/base.json",
-    "model_type": "AutoencoderKL",
-    "task_type": "tta",
-    "dataset": [
-        "AudioCaps"
-    ],
-    "preprocess": {
-        // feature used for model training
-        "use_spkid": false,
-        "use_uv": false,
-        "use_frame_pitch": false,
-        "use_phone_pitch": false,
-        "use_frame_energy": false,
-        "use_phone_energy": false,
-        "use_mel": false,
-        "use_audio": false,
-        "use_label": false,
-        "use_one_hot": false
-    },
-    // model
-    "model": {
-        "autoencoderkl": {
-            "ch": 128,
-            "ch_mult": [
-                1,
-                1,
-                2,
-                2,
-                4
-            ],
-            "num_res_blocks": 2,
-            "in_channels": 1,
-            "z_channels": 4,
-            "out_ch": 1,
-            "double_z": true
-        },
-        "loss": {
-            "kl_weight": 1e-8,
-            "disc_weight": 0.5,
-            "disc_factor": 1.0,
-            "logvar_init": 0.0,
-            "min_adapt_d_weight": 0.0,
-            "max_adapt_d_weight": 10.0,
-            "disc_start": 50001,
-            "disc_in_channels": 1,
-            "disc_num_layers": 3,
-            "use_actnorm": false
-        }
-    },
-    // train
-    "train": {
-        "lronPlateau": {
-            "factor": 0.9,
-            "patience": 100,
-            "min_lr": 4.0e-5,
-            "verbose": true
-        },
-        "adam": {
-            "lr": 4.0e-4,
-            "betas": [
-                0.9,
-                0.999
-            ],
-            "weight_decay": 1.0e-2,
-            "eps": 1.0e-8
-        }
-    }
-}

config/base.json DELETED
@@ -1,220 +0,0 @@
-{
-    "supported_model_type": [
-        "GANVocoder",
-        "Fastspeech2",
-        "DiffSVC",
-        "Transformer",
-        "EDM",
-        "CD"
-    ],
-    "task_type": "",
-    "dataset": [],
-    "use_custom_dataset": false,
-    "preprocess": {
-        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
-        // trim audio silence
-        "data_augment": false,
-        "trim_silence": false,
-        "num_silent_frames": 8,
-        "trim_fft_size": 512, // fft size used in trimming
-        "trim_hop_size": 128, // hop size used in trimming
-        "trim_top_db": 30, // top db used in trimming; sensitive to each dataset
-        // acoustic features
-        "extract_mel": false,
-        "mel_extract_mode": "",
-        "extract_linear_spec": false,
-        "extract_mcep": false,
-        "extract_pitch": false,
-        "extract_acoustic_token": false,
-        "pitch_remove_outlier": false,
-        "extract_uv": false,
-        "pitch_norm": false,
-        "extract_audio": false,
-        "extract_label": false,
-        "pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
-        "extract_energy": false,
-        "energy_remove_outlier": false,
-        "energy_norm": false,
-        "energy_extract_mode": "from_mel",
-        "extract_duration": false,
-        "extract_amplitude_phase": false,
-        "mel_min_max_norm": false,
-        // linguistic features
-        "extract_phone": false,
-        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
-        // content features
-        "extract_whisper_feature": false,
-        "extract_contentvec_feature": false,
-        "extract_mert_feature": false,
-        "extract_wenet_feature": false,
-        // Settings for data preprocessing
-        "n_mel": 80,
-        "win_size": 480,
-        "hop_size": 120,
-        "sample_rate": 24000,
-        "n_fft": 1024,
-        "fmin": 0,
-        "fmax": 12000,
-        "min_level_db": -115,
-        "ref_level_db": 20,
-        "bits": 8,
-        // Directory names of processed data or extracted features
-        "processed_dir": "processed_data",
-        "trimmed_wav_dir": "trimmed_wavs", // directory name of silence-trimmed wav
-        "raw_data": "raw_data",
-        "phone_dir": "phones",
-        "wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
-        "audio_dir": "audios",
-        "log_amplitude_dir": "log_amplitudes",
-        "phase_dir": "phases",
-        "real_dir": "reals",
-        "imaginary_dir": "imaginarys",
-        "label_dir": "labels",
-        "linear_dir": "linears",
-        "mel_dir": "mels", // directory name of extracted mel features
-        "mcep_dir": "mcep", // directory name of extracted mcep features
-        "dur_dir": "durs",
-        "symbols_dict": "symbols.dict",
-        "lab_dir": "labs", // directory name of extracted label features
-        "wenet_dir": "wenet", // directory name of extracted wenet features
-        "contentvec_dir": "contentvec", // directory name of extracted contentvec features
-        "pitch_dir": "pitches", // directory name of extracted pitch features
-        "energy_dir": "energys", // directory name of extracted energy features
-        "phone_pitch_dir": "phone_pitches", // directory name of extracted phone-level pitch features
-        "phone_energy_dir": "phone_energys", // directory name of extracted phone-level energy features
-        "uv_dir": "uvs", // directory name of extracted unvoiced features
-        "duration_dir": "duration", // ground-truth duration file
-        "phone_seq_file": "phone_seq_file", // phoneme sequence file
-        "file_lst": "file.lst",
-        "train_file": "train.json", // training set; the json file contains detailed information about the dataset, including dataset name, utterance id, and duration of the utterance
-        "valid_file": "valid.json", // validation set
-        "spk2id": "spk2id.json", // used for multi-speaker dataset
-        "utt2spk": "utt2spk", // used for multi-speaker dataset
-        "emo2id": "emo2id.json", // used for multi-emotion dataset
-        "utt2emo": "utt2emo", // used for multi-emotion dataset
-        // Features used for model training
-        "use_text": false,
-        "use_phone": false,
-        "use_phn_seq": false,
-        "use_lab": false,
-        "use_linear": false,
-        "use_mel": false,
-        "use_min_max_norm_mel": false,
-        "use_wav": false,
-        "use_phone_pitch": false,
-        "use_log_scale_pitch": false,
-        "use_phone_energy": false,
-        "use_phone_duration": false,
-        "use_log_scale_energy": false,
-        "use_wenet": false,
-        "use_dur": false,
-        "use_spkid": false, // True: use speaker id for multi-speaker dataset
-        "use_emoid": false, // True: use emotion id for multi-emotion dataset
-        "use_frame_pitch": false,
-        "use_uv": false,
-        "use_frame_energy": false,
-        "use_frame_duration": false,
-        "use_audio": false,
-        "use_label": false,
-        "use_one_hot": false,
-        "use_amplitude_phase": false,
-        "data_augment": false,
-        "align_mel_duration": false
-    },
-    "train": {
-        "ddp": true,
-        "random_seed": 970227,
-        "batch_size": 16,
-        "max_steps": 1000000,
-        // Trackers
-        "tracker": [
-            "tensorboard"
-            // "wandb",
-            // "cometml",
-            // "mlflow",
-        ],
-        "max_epoch": -1,
-        // -1 means no limit
-        "save_checkpoint_stride": [
-            5,
-            20
-        ],
-        // unit is epoch
-        "keep_last": [
-            3,
-            -1
-        ],
-        // -1 means infinite; a single number will broadcast
-        "run_eval": [
-            false,
-            true
-        ],
-        // a single number will broadcast
-        // Fix the random seed
-        "random_seed": 10086,
-        // Optimizer
-        "optimizer": "AdamW",
-        "adamw": {
-            "lr": 4.0e-4
-            // nn model lr
-        },
-        // LR Scheduler
-        "scheduler": "ReduceLROnPlateau",
-        "reducelronplateau": {
-            "factor": 0.8,
-            "patience": 10,
-            // unit is epoch
-            "min_lr": 1.0e-4
-        },
-        // Batchsampler
-        "sampler": {
-            "holistic_shuffle": true,
-            "drop_last": true
-        },
-        // Dataloader
-        "dataloader": {
-            "num_worker": 32,
-            "pin_memory": true
-        },
-        "gradient_accumulation_step": 1,
-        "total_training_steps": 50000,
-        "save_summary_steps": 500,
-        "save_checkpoints_steps": 10000,
-        "valid_interval": 10000,
-        "keep_checkpoint_max": 5,
-        "multi_speaker_training": false, // True: train a multi-speaker model; False: train a single-speaker model
-        "max_epoch": -1,
-        // -1 means no limit
-        "save_checkpoint_stride": [
-            5,
-            20
-        ],
-        // unit is epoch
-        "keep_last": [
-            3,
-            -1
-        ],
-        // -1 means infinite; a single number will broadcast
-        "run_eval": [
-            false,
-            true
-        ],
-        // Batchsampler
-        "sampler": {
-            "holistic_shuffle": true,
-            "drop_last": true
-        },
-        // Dataloader
-        "dataloader": {
-            "num_worker": 32,
-            "pin_memory": true
-        },
-        // Trackers
-        "tracker": [
-            "tensorboard"
-            // "wandb",
-            // "cometml",
-            // "mlflow",
-        ],
-    },
-}

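Every concrete config in this commit starts with `"base_config": "config/base.json"`, so experiment settings cascade from this base file downward. A minimal sketch of how that inheritance could be resolved, reusing the `load_jsonc` helper sketched earlier (the recursive deep-merge rule is our assumption about the intended semantics, not Amphion's exact code):

```python
def deep_merge(base: dict, override: dict) -> dict:
    """Return base updated with override, merging nested dicts key by key."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

def load_config(path: str) -> dict:
    cfg = load_jsonc(path)  # comment-tolerant loader sketched above
    base_path = cfg.pop("base_config", None)
    return deep_merge(load_config(base_path), cfg) if base_path else cfg

# e.g. load_config("config/diffusion.json") pulls in config/base.json first
```
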
config/comosvc.json DELETED
@@ -1,216 +0,0 @@
-{
-    "base_config": "config/base.json",
-    "model_type": "DiffComoSVC",
-    "task_type": "svc",
-    "use_custom_dataset": false,
-    "preprocess": {
-        // data augmentations
-        "use_pitch_shift": false,
-        "use_formant_shift": false,
-        "use_time_stretch": false,
-        "use_equalizer": false,
-        // acoustic features
-        "extract_mel": true,
-        "mel_min_max_norm": true,
-        "extract_pitch": true,
-        "pitch_extractor": "parselmouth",
-        "extract_uv": true,
-        "extract_energy": true,
-        // content features
-        "extract_whisper_feature": false,
-        "whisper_sample_rate": 16000,
-        "extract_contentvec_feature": false,
-        "contentvec_sample_rate": 16000,
-        "extract_wenet_feature": false,
-        "wenet_sample_rate": 16000,
-        "extract_mert_feature": false,
-        "mert_sample_rate": 16000,
-        // Default config for whisper
-        "whisper_frameshift": 0.01,
-        "whisper_downsample_rate": 2,
-        // Default config for content vector
-        "contentvec_frameshift": 0.02,
-        // Default config for mert
-        "mert_model": "m-a-p/MERT-v1-330M",
-        "mert_feature_layer": -1,
-        "mert_hop_size": 320,
-        // 24k
-        "mert_frameshit": 0.01333,
-        // 10ms
-        "wenet_frameshift": 0.01,
-        // wenetspeech is 4, gigaspeech is 6
-        "wenet_downsample_rate": 4,
-        // Default config
-        "n_mel": 100,
-        "win_size": 1024,
-        // todo
-        "hop_size": 256,
-        "sample_rate": 24000,
-        "n_fft": 1024,
-        // todo
-        "fmin": 0,
-        "fmax": 12000,
-        // todo
-        "f0_min": 50,
-        // ~C2
-        "f0_max": 1100,
-        //1100, // ~C6(1100), ~G5(800)
-        "pitch_bin": 256,
-        "pitch_max": 1100.0,
-        "pitch_min": 50.0,
-        "is_label": true,
-        "is_mu_law": true,
-        "bits": 8,
-        "mel_min_max_stats_dir": "mel_min_max_stats",
-        "whisper_dir": "whisper",
-        "contentvec_dir": "contentvec",
-        "wenet_dir": "wenet",
-        "mert_dir": "mert",
-        // Extract content features using dataloader
-        "pin_memory": true,
-        "num_workers": 8,
-        "content_feature_batch_size": 16,
-        // Features used for model training
-        "use_mel": true,
-        "use_min_max_norm_mel": true,
-        "use_frame_pitch": true,
-        "use_uv": true,
-        "use_frame_energy": true,
-        "use_log_scale_pitch": false,
-        "use_log_scale_energy": false,
-        "use_spkid": true,
-        // Meta file
-        "train_file": "train.json",
-        "valid_file": "test.json",
-        "spk2id": "singers.json",
-        "utt2spk": "utt2singer"
-    },
-    "model": {
-        "teacher_model_path": "[Your Teacher Model Path].bin",
-        "condition_encoder": {
-            "merge_mode": "add",
-            "input_melody_dim": 1,
-            "use_log_f0": true,
-            "n_bins_melody": 256,
-            //# Quantization (0 for not quantization)
-            "output_melody_dim": 384,
-            "input_loudness_dim": 1,
-            "use_log_loudness": true,
-            "n_bins_loudness": 256,
-            "output_loudness_dim": 384,
-            "use_whisper": false,
-            "use_contentvec": false,
-            "use_wenet": false,
-            "use_mert": false,
-            "whisper_dim": 1024,
-            "contentvec_dim": 256,
-            "mert_dim": 256,
-            "wenet_dim": 512,
-            "content_encoder_dim": 384,
-            "output_singer_dim": 384,
-            "singer_table_size": 512,
-            "output_content_dim": 384,
-            "use_spkid": true
-        },
-        "comosvc": {
-            "distill": false,
-            // conformer encoder
-            "input_dim": 384,
-            "output_dim": 100,
-            "n_heads": 2,
-            "n_layers": 6,
-            "filter_channels": 512,
-            "dropout": 0.1,
-            // karras diffusion
-            "P_mean": -1.2,
-            "P_std": 1.2,
-            "sigma_data": 0.5,
-            "sigma_min": 0.002,
-            "sigma_max": 80,
-            "rho": 7,
-            "n_timesteps": 40,
-        },
-        "diffusion": {
-            // Diffusion steps encoder
-            "step_encoder": {
-                "dim_raw_embedding": 128,
-                "dim_hidden_layer": 512,
-                "activation": "SiLU",
-                "num_layer": 2,
-                "max_period": 10000
-            },
-            // Diffusion decoder
-            "model_type": "bidilconv",
-            // bidilconv, unet2d, TODO: unet1d
-            "bidilconv": {
-                "base_channel": 384,
-                "n_res_block": 20,
-                "conv_kernel_size": 3,
-                "dilation_cycle_length": 4,
-                // specially, 1 means no dilation
-                "conditioner_size": 100
-            }
-        },
-    },
-    "train": {
-        // Basic settings
-        "fast_steps": 0,
-        "batch_size": 32,
-        "gradient_accumulation_step": 1,
-        "max_epoch": -1,
-        // -1 means no limit
-        "save_checkpoint_stride": [
-            10,
-            100
-        ],
-        // unit is epoch
-        "keep_last": [
-            3,
-            -1
-        ],
-        // -1 means infinite, if one number will broadcast
-        "run_eval": [
-            false,
-            true
-        ],
-        // if one number will broadcast
-        // Fix the random seed
-        "random_seed": 10086,
-        // Batchsampler
-        "sampler": {
-            "holistic_shuffle": true,
-            "drop_last": true
-        },
-        // Dataloader
-        "dataloader": {
-            "num_worker": 32,
-            "pin_memory": true
-        },
-        // Trackers
-        "tracker": [
-            "tensorboard"
-            // "wandb",
-            // "cometml",
-            // "mlflow",
-        ],
-        // Optimizer
-        "optimizer": "AdamW",
-        "adamw": {
-            "lr": 4.0e-4
-            // nn model lr
-        },
-        // LR Scheduler
-        "scheduler": "ReduceLROnPlateau",
-        "reducelronplateau": {
-            "factor": 0.8,
-            "patience": 10,
-            // unit is epoch
-            "min_lr": 1.0e-4
-        }
-    },
-    "inference": {
-        "comosvc": {
-            "inference_steps": 40
-        }
-    }
-}

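The `comosvc` block above carries standard Karras-style diffusion hyperparameters (`sigma_min`, `sigma_max`, `rho`, `n_timesteps`). As a worked example, the 40-step noise schedule those values imply follows the Karras et al. (2022) rule; the sketch below computes it (illustrative, not Amphion's code):

```python
import numpy as np

def karras_sigmas(sigma_min=0.002, sigma_max=80.0, rho=7.0, n=40):
    """Karras et al. (2022) sigma schedule, from sigma_max down to sigma_min."""
    ramp = np.linspace(0.0, 1.0, n)
    max_inv_rho = sigma_max ** (1.0 / rho)
    min_inv_rho = sigma_min ** (1.0 / rho)
    return (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho

sigmas = karras_sigmas()  # sigmas[0] == 80.0, sigmas[-1] == 0.002
```
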
config/diffusion.json DELETED
@@ -1,227 +0,0 @@
-{
-    // FIXME: THESE ARE LEGACY
-    "base_config": "config/base.json",
-    "model_type": "diffusion",
-    "task_type": "svc",
-    "use_custom_dataset": false,
-    "preprocess": {
-        // data augmentations
-        "use_pitch_shift": false,
-        "use_formant_shift": false,
-        "use_time_stretch": false,
-        "use_equalizer": false,
-        // acoustic features
-        "extract_mel": true,
-        "mel_min_max_norm": true,
-        "extract_pitch": true,
-        "pitch_extractor": "parselmouth",
-        "extract_uv": true,
-        "extract_energy": true,
-        // content features
-        "extract_whisper_feature": false,
-        "whisper_sample_rate": 16000,
-        "extract_contentvec_feature": false,
-        "contentvec_sample_rate": 16000,
-        "extract_wenet_feature": false,
-        "wenet_sample_rate": 16000,
-        "extract_mert_feature": false,
-        "mert_sample_rate": 16000,
-        // Default config for whisper
-        "whisper_frameshift": 0.01,
-        "whisper_downsample_rate": 2,
-        // Default config for content vector
-        "contentvec_frameshift": 0.02,
-        // Default config for mert
-        "mert_model": "m-a-p/MERT-v1-330M",
-        "mert_feature_layer": -1,
-        "mert_hop_size": 320,
-        // 24k
-        "mert_frameshit": 0.01333,
-        // 10ms
-        "wenet_frameshift": 0.01,
-        // wenetspeech is 4, gigaspeech is 6
-        "wenet_downsample_rate": 4,
-        // Default config
-        "n_mel": 100,
-        "win_size": 1024,
-        // todo
-        "hop_size": 256,
-        "sample_rate": 24000,
-        "n_fft": 1024,
-        // todo
-        "fmin": 0,
-        "fmax": 12000,
-        // todo
-        "f0_min": 50,
-        // ~C2
-        "f0_max": 1100,
-        //1100, // ~C6(1100), ~G5(800)
-        "pitch_bin": 256,
-        "pitch_max": 1100.0,
-        "pitch_min": 50.0,
-        "is_label": true,
-        "is_mu_law": true,
-        "bits": 8,
-        "mel_min_max_stats_dir": "mel_min_max_stats",
-        "whisper_dir": "whisper",
-        "contentvec_dir": "contentvec",
-        "wenet_dir": "wenet",
-        "mert_dir": "mert",
-        // Extract content features using dataloader
-        "pin_memory": true,
-        "num_workers": 8,
-        "content_feature_batch_size": 16,
-        // Features used for model training
-        "use_mel": true,
-        "use_min_max_norm_mel": true,
-        "use_frame_pitch": true,
-        "use_uv": true,
-        "use_frame_energy": true,
-        "use_log_scale_pitch": false,
-        "use_log_scale_energy": false,
-        "use_spkid": true,
-        // Meta file
-        "train_file": "train.json",
-        "valid_file": "test.json",
-        "spk2id": "singers.json",
-        "utt2spk": "utt2singer"
-    },
-    "model": {
-        "condition_encoder": {
-            "merge_mode": "add",
-            "input_melody_dim": 1,
-            "use_log_f0": true,
-            "n_bins_melody": 256,
-            //# Quantization (0 for not quantization)
-            "output_melody_dim": 384,
-            "input_loudness_dim": 1,
-            "use_log_loudness": true,
-            "n_bins_loudness": 256,
-            "output_loudness_dim": 384,
-            "use_whisper": false,
-            "use_contentvec": false,
-            "use_wenet": false,
-            "use_mert": false,
-            "whisper_dim": 1024,
-            "contentvec_dim": 256,
-            "mert_dim": 256,
-            "wenet_dim": 512,
-            "content_encoder_dim": 384,
-            "output_singer_dim": 384,
-            "singer_table_size": 512,
-            "output_content_dim": 384,
-            "use_spkid": true
-        },
-        // FIXME: FOLLOWING ARE NEW!!
-        "diffusion": {
-            "scheduler": "ddpm",
-            "scheduler_settings": {
-                "num_train_timesteps": 1000,
-                "beta_start": 1.0e-4,
-                "beta_end": 0.02,
-                "beta_schedule": "linear"
-            },
-            // Diffusion steps encoder
-            "step_encoder": {
-                "dim_raw_embedding": 128,
-                "dim_hidden_layer": 512,
-                "activation": "SiLU",
-                "num_layer": 2,
-                "max_period": 10000
-            },
-            // Diffusion decoder
-            "model_type": "bidilconv",
-            // bidilconv, unet2d, TODO: unet1d
-            "bidilconv": {
-                "base_channel": 384,
-                "n_res_block": 20,
-                "conv_kernel_size": 3,
-                "dilation_cycle_length": 4,
-                // specially, 1 means no dilation
-                "conditioner_size": 384
-            },
-            "unet2d": {
-                "in_channels": 1,
-                "out_channels": 1,
-                "down_block_types": [
-                    "CrossAttnDownBlock2D",
-                    "CrossAttnDownBlock2D",
-                    "CrossAttnDownBlock2D",
-                    "DownBlock2D"
-                ],
-                "mid_block_type": "UNetMidBlock2DCrossAttn",
-                "up_block_types": [
-                    "UpBlock2D",
-                    "CrossAttnUpBlock2D",
-                    "CrossAttnUpBlock2D",
-                    "CrossAttnUpBlock2D"
-                ],
-                "only_cross_attention": false
-            }
-        }
-    },
-    // FIXME: FOLLOWING ARE NEW!!
-    "train": {
-        // Basic settings
-        "batch_size": 64,
-        "gradient_accumulation_step": 1,
-        "max_epoch": -1,
-        // -1 means no limit
-        "save_checkpoint_stride": [
-            5,
-            20
-        ],
-        // unit is epoch
-        "keep_last": [
-            3,
-            -1
-        ],
-        // -1 means infinite, if one number will broadcast
-        "run_eval": [
-            false,
-            true
-        ],
-        // if one number will broadcast
-        // Fix the random seed
-        "random_seed": 10086,
-        // Batchsampler
-        "sampler": {
-            "holistic_shuffle": true,
-            "drop_last": true
-        },
-        // Dataloader
-        "dataloader": {
-            "num_worker": 32,
-            "pin_memory": true
-        },
-        // Trackers
-        "tracker": [
-            "tensorboard"
-            // "wandb",
-            // "cometml",
-            // "mlflow",
-        ],
-        // Optimizer
-        "optimizer": "AdamW",
-        "adamw": {
-            "lr": 4.0e-4
-            // nn model lr
-        },
-        // LR Scheduler
-        "scheduler": "ReduceLROnPlateau",
-        "reducelronplateau": {
-            "factor": 0.8,
-            "patience": 10,
-            // unit is epoch
-            "min_lr": 1.0e-4
-        }
-    },
-    "inference": {
-        "diffusion": {
-            "scheduler": "pndm",
-            "scheduler_settings": {
-                "num_inference_timesteps": 1000
-            }
-        }
-    }
-}

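The `diffusion` block above mirrors the Hugging Face `diffusers` scheduler interface: DDPM with a linear beta schedule for training, PNDM for inference. For reference, equivalent scheduler objects could be built as below (assuming the `diffusers` package; whether Amphion constructs them exactly this way is not shown in this diff):

```python
from diffusers import DDPMScheduler, PNDMScheduler

# Training-time scheduler, matching the "ddpm" scheduler_settings above.
train_scheduler = DDPMScheduler(
    num_train_timesteps=1000,
    beta_start=1.0e-4,
    beta_end=0.02,
    beta_schedule="linear",
)

# Inference-time scheduler, matching the "pndm" inference block.
infer_scheduler = PNDMScheduler(num_train_timesteps=1000)
infer_scheduler.set_timesteps(num_inference_steps=1000)
```
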
config/fs2.json DELETED
@@ -1,117 +0,0 @@
-{
-    "base_config": "config/tts.json",
-    "model_type": "FastSpeech2",
-    "task_type": "tts",
-    "dataset": ["LJSpeech"],
-    "preprocess": {
-        // acoustic features
-        "extract_audio": true,
-        "extract_mel": true,
-        "mel_extract_mode": "taco",
-        "mel_min_max_norm": false,
-        "extract_pitch": true,
-        "extract_uv": false,
-        "pitch_extractor": "dio",
-        "extract_energy": true,
-        "energy_extract_mode": "from_tacotron_stft",
-        "extract_duration": true,
-        "use_phone": true,
-        "pitch_norm": true,
-        "energy_norm": true,
-        "pitch_remove_outlier": true,
-        "energy_remove_outlier": true,
-
-        // Default config
-        "n_mel": 80,
-        "win_size": 1024, // todo
-        "hop_size": 256,
-        "sample_rate": 22050,
-        "n_fft": 1024, // todo
-        "fmin": 0,
-        "fmax": 8000, // todo
-        "raw_data": "raw_data",
-        "text_cleaners": ["english_cleaners"],
-        "f0_min": 71, // ~C2
-        "f0_max": 800, //1100, // ~C6(1100), ~G5(800)
-        "pitch_bin": 256,
-        "pitch_max": 1100.0,
-        "pitch_min": 50.0,
-        "is_label": true,
-        "is_mu_law": true,
-        "bits": 8,
-
-        "mel_min_max_stats_dir": "mel_min_max_stats",
-        "whisper_dir": "whisper",
-        "content_vector_dir": "content_vector",
-        "wenet_dir": "wenet",
-        "mert_dir": "mert",
-        "spk2id": "spk2id.json",
-        "utt2spk": "utt2spk",
-
-        // Features used for model training
-        "use_mel": true,
-        "use_min_max_norm_mel": false,
-        "use_frame_pitch": false,
-        "use_frame_energy": false,
-        "use_phone_pitch": true,
-        "use_phone_energy": true,
-        "use_log_scale_pitch": false,
-        "use_log_scale_energy": false,
-        "use_spkid": false,
-        "align_mel_duration": true,
-        "text_cleaners": ["english_cleaners"]
-    },
-    "model": {
-        // Settings for transformer
-        "transformer": {
-            "encoder_layer": 4,
-            "encoder_head": 2,
-            "encoder_hidden": 256,
-            "decoder_layer": 6,
-            "decoder_head": 2,
-            "decoder_hidden": 256,
-            "conv_filter_size": 1024,
-            "conv_kernel_size": [9, 1],
-            "encoder_dropout": 0.2,
-            "decoder_dropout": 0.2
-        },
-
-        // Settings for variance_predictor
-        "variance_predictor": {
-            "filter_size": 256,
-            "kernel_size": 3,
-            "dropout": 0.5
-        },
-        "variance_embedding": {
-            "pitch_quantization": "linear", // support 'linear' or 'log'; 'log' is allowed only if the pitch values are not normalized during preprocessing
-            "energy_quantization": "linear", // support 'linear' or 'log'; 'log' is allowed only if the energy values are not normalized during preprocessing
-            "n_bins": 256
-        },
-        "max_seq_len": 1000
-    },
-    "train": {
-        "batch_size": 16,
-        "sort_sample": true,
-        "drop_last": true,
-        "group_size": 4,
-        "grad_clip_thresh": 1.0,
-        "dataloader": {
-            "num_worker": 8,
-            "pin_memory": true
-        },
-        "lr_scheduler": {
-            "num_warmup": 4000
-        },
-        // LR Scheduler
-        "scheduler": "NoamLR",
-        // Optimizer
-        "optimizer": "Adam",
-        "adam": {
-            "lr": 0.0625,
-            "betas": [0.9, 0.98],
-            "eps": 0.000000001,
-            "weight_decay": 0.0
-        },
-    }
-
-}

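In `fs2.json`, `variance_embedding` quantizes predicted pitch and energy into `n_bins: 256` linear buckets so each bucket can index an embedding vector. A sketch of linear pitch bucketization under the config's `pitch_min`/`pitch_max` (the embedding dimension of 256 matches `encoder_hidden`, but the code itself is illustrative, not Amphion's):

```python
import torch

pitch_min, pitch_max, n_bins = 50.0, 1100.0, 256
boundaries = torch.linspace(pitch_min, pitch_max, n_bins - 1)

pitch = torch.tensor([62.0, 220.0, 880.0])       # Hz, one value per frame
bucket_ids = torch.bucketize(pitch, boundaries)  # ints in [0, n_bins - 1]

pitch_embedding = torch.nn.Embedding(n_bins, 256)
pitch_vec = pitch_embedding(bucket_ids)          # shape: (3, 256)
```
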
config/transformer.json DELETED
@@ -1,180 +0,0 @@
-{
-    "base_config": "config/base.json",
-    "model_type": "Transformer",
-    "task_type": "svc",
-    "use_custom_dataset": false,
-    "preprocess": {
-        // data augmentations
-        "use_pitch_shift": false,
-        "use_formant_shift": false,
-        "use_time_stretch": false,
-        "use_equalizer": false,
-        // acoustic features
-        "extract_mel": true,
-        "mel_min_max_norm": true,
-        "extract_pitch": true,
-        "pitch_extractor": "parselmouth",
-        "extract_uv": true,
-        "extract_energy": true,
-        // content features
-        "extract_whisper_feature": false,
-        "whisper_sample_rate": 16000,
-        "extract_contentvec_feature": false,
-        "contentvec_sample_rate": 16000,
-        "extract_wenet_feature": false,
-        "wenet_sample_rate": 16000,
-        "extract_mert_feature": false,
-        "mert_sample_rate": 16000,
-        // Default config for whisper
-        "whisper_frameshift": 0.01,
-        "whisper_downsample_rate": 2,
-        // Default config for content vector
-        "contentvec_frameshift": 0.02,
-        // Default config for mert
-        "mert_model": "m-a-p/MERT-v1-330M",
-        "mert_feature_layer": -1,
-        "mert_hop_size": 320,
-        // 24k
-        "mert_frameshit": 0.01333,
-        // 10ms
-        "wenet_frameshift": 0.01,
-        // wenetspeech is 4, gigaspeech is 6
-        "wenet_downsample_rate": 4,
-        // Default config
-        "n_mel": 100,
-        "win_size": 1024,
-        // todo
-        "hop_size": 256,
-        "sample_rate": 24000,
-        "n_fft": 1024,
-        // todo
-        "fmin": 0,
-        "fmax": 12000,
-        // todo
-        "f0_min": 50,
-        // ~C2
-        "f0_max": 1100,
-        //1100, // ~C6(1100), ~G5(800)
-        "pitch_bin": 256,
-        "pitch_max": 1100.0,
-        "pitch_min": 50.0,
-        "is_label": true,
-        "is_mu_law": true,
-        "bits": 8,
-        "mel_min_max_stats_dir": "mel_min_max_stats",
-        "whisper_dir": "whisper",
-        "contentvec_dir": "contentvec",
-        "wenet_dir": "wenet",
-        "mert_dir": "mert",
-        // Extract content features using dataloader
-        "pin_memory": true,
-        "num_workers": 8,
-        "content_feature_batch_size": 16,
-        // Features used for model training
-        "use_mel": true,
-        "use_min_max_norm_mel": true,
-        "use_frame_pitch": true,
-        "use_uv": true,
-        "use_frame_energy": true,
-        "use_log_scale_pitch": false,
-        "use_log_scale_energy": false,
-        "use_spkid": true,
-        // Meta file
-        "train_file": "train.json",
-        "valid_file": "test.json",
-        "spk2id": "singers.json",
-        "utt2spk": "utt2singer"
-    },
-    "model": {
-        "condition_encoder": {
-            "merge_mode": "add",
-            "input_melody_dim": 1,
-            "use_log_f0": true,
-            "n_bins_melody": 256,
-            //# Quantization (0 for not quantization)
-            "output_melody_dim": 384,
-            "input_loudness_dim": 1,
-            "use_log_loudness": true,
-            "n_bins_loudness": 256,
-            "output_loudness_dim": 384,
-            "use_whisper": false,
-            "use_contentvec": true,
-            "use_wenet": false,
-            "use_mert": false,
-            "whisper_dim": 1024,
-            "contentvec_dim": 256,
-            "mert_dim": 256,
-            "wenet_dim": 512,
-            "content_encoder_dim": 384,
-            "output_singer_dim": 384,
-            "singer_table_size": 512,
-            "output_content_dim": 384,
-            "use_spkid": true
-        },
-        "transformer": {
-            "type": "conformer",
-            // 'conformer' or 'transformer'
-            "input_dim": 384,
-            "output_dim": 100,
-            "n_heads": 2,
-            "n_layers": 6,
-            "filter_channels": 512,
-            "dropout": 0.1,
-        }
-    },
-    "train": {
-        // Basic settings
-        "batch_size": 64,
-        "gradient_accumulation_step": 1,
-        "max_epoch": -1,
-        // -1 means no limit
-        "save_checkpoint_stride": [
-            10,
-            100
-        ],
-        // unit is epoch
-        "keep_last": [
-            3,
-            -1
-        ],
-        // -1 means infinite, if one number will broadcast
-        "run_eval": [
-            false,
-            true
-        ],
-        // if one number will broadcast
-        // Fix the random seed
-        "random_seed": 10086,
-        // Batchsampler
-        "sampler": {
-            "holistic_shuffle": true,
-            "drop_last": true
-        },
-        // Dataloader
-        "dataloader": {
-            "num_worker": 32,
-            "pin_memory": true
-        },
-        // Trackers
-        "tracker": [
-            "tensorboard"
-            // "wandb",
-            // "cometml",
-            // "mlflow",
-        ],
-        // Optimizer
-        "optimizer": "AdamW",
-        "adamw": {
-            "lr": 4.0e-4
-            // nn model lr
-        },
-        // LR Scheduler
-        "scheduler": "ReduceLROnPlateau",
-        "reducelronplateau": {
-            "factor": 0.8,
-            "patience": 10,
-            // unit is epoch
-            "min_lr": 1.0e-4
-        }
-    }
-}

config/tts.json DELETED
@@ -1,23 +0,0 @@
-{
-    "base_config": "config/base.json",
-    "supported_model_type": [
-        "Fastspeech2",
-        "VITS",
-        "VALLE",
-    ],
-    "task_type": "tts",
-    "preprocess": {
-        "language": "en-us",
-        // linguistic features
-        "extract_phone": true,
-        "phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon (only for language=en-us right now)"
-        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
-        // Directory names of processed data or extracted features
-        "phone_dir": "phones",
-        "use_phone": true,
-    },
-    "model": {
-        "text_token_num": 512,
-    }
-
-}

config/valle.json DELETED
@@ -1,52 +0,0 @@
-{
-    "base_config": "config/tts.json",
-    "model_type": "VALLE",
-    "task_type": "tts",
-    "dataset": [
-        "libritts"
-    ],
-    "preprocess": {
-        "extract_phone": true,
-        "phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals, or lexicon
-        "extract_acoustic_token": true,
-        "acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac (todo)
-        "acoustic_token_dir": "acoutic_tokens",
-        "use_text": false,
-        "use_phone": true,
-        "use_acoustic_token": true,
-        "symbols_dict": "symbols.dict",
-        "min_duration": 0.5, // the duration lower bound, to filter out audio with duration < min_duration
-        "max_duration": 14, // the duration upper bound, to filter out audio with duration > max_duration
-        "sampling_rate": 24000,
-    },
-    "model": {
-        "text_token_num": 512,
-        "audio_token_num": 1024,
-        "decoder_dim": 1024, // embedding dimension of the decoder model
-        "nhead": 16, // number of attention heads in the decoder layers
-        "num_decoder_layers": 12, // number of decoder layers
-        "norm_first": true, // pre or post normalization
-        "add_prenet": false, // whether to add a PreNet after the inputs
-        "prefix_mode": 0, // mode for how to prefix the VALL-E NAR decoder; 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
-        "share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
-        "nar_scale_factor": 1, // model scale factor, assigned different meanings in different models
-        "prepend_bos": false, // whether to prepend <BOS> to the acoustic tokens -> AR decoder inputs
-        "num_quantizers": 8, // number of audio quantization layers
-        // "scaling_xformers": false, // apply reworked Conformer scaling on Transformers
-    },
-    "train": {
-        "ddp": false,
-        "train_stage": 1, // 0: train all modules; for VALL-E, 1: AR decoder, 2: NAR decoder(s)
-        "max_epoch": 20,
-        "optimizer": "ScaledAdam",
-        "scheduler": "Eden",
-        "warmup_steps": 200, // number of steps that affects how rapidly the learning rate decreases
-        "base_lr": 0.05, // base learning rate
-        "valid_interval": 1000,
-        "log_epoch_step": 1000,
-        "save_checkpoint_stride": [
-            1,
-            1
-        ]
-    }
-}

config/vits.json DELETED
@@ -1,101 +0,0 @@
-{
-    "base_config": "config/tts.json",
-    "model_type": "VITS",
-    "task_type": "tts",
-    "preprocess": {
-        "extract_phone": true,
-        "extract_mel": true,
-        "n_mel": 80,
-        "fmin": 0,
-        "fmax": null,
-        "extract_linear_spec": true,
-        "extract_audio": true,
-        "use_linear": true,
-        "use_mel": true,
-        "use_audio": true,
-        "use_text": false,
-        "use_phone": true,
-        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
-        "n_fft": 1024,
-        "win_size": 1024,
-        "hop_size": 256,
-        "segment_size": 8192,
-        "text_cleaners": [
-            "english_cleaners"
-        ]
-    },
-    "model": {
-        "text_token_num": 512,
-        "inter_channels": 192,
-        "hidden_channels": 192,
-        "filter_channels": 768,
-        "n_heads": 2,
-        "n_layers": 6,
-        "kernel_size": 3,
-        "p_dropout": 0.1,
-        "resblock": "1",
-        "resblock_kernel_sizes": [
-            3,
-            7,
-            11
-        ],
-        "resblock_dilation_sizes": [
-            [
-                1,
-                3,
-                5
-            ],
-            [
-                1,
-                3,
-                5
-            ],
-            [
-                1,
-                3,
-                5
-            ]
-        ],
-        "upsample_rates": [
-            8,
-            8,
-            2,
-            2
-        ],
-        "upsample_initial_channel": 512,
-        "upsample_kernel_sizes": [
-            16,
-            16,
-            4,
-            4
-        ],
-        "n_layers_q": 3,
-        "use_spectral_norm": false,
-        "n_speakers": 0, // number of speakers; will be set automatically if n_speakers is 0 and multi_speaker_training is true
-        "gin_channels": 256,
-        "use_sdp": true
-    },
-    "train": {
-        "fp16_run": true,
-        "learning_rate": 2e-4,
-        "betas": [
-            0.8,
-            0.99
-        ],
-        "eps": 1e-9,
-        "batch_size": 16,
-        "lr_decay": 0.999875,
-        // "segment_size": 8192,
-        "init_lr_ratio": 1,
-        "warmup_epochs": 0,
-        "c_mel": 45,
-        "c_kl": 1.0,
-        "AdamW": {
-            "betas": [
-                0.8,
-                0.99
-            ],
-            "eps": 1e-9,
-        }
-    }
-}

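A consistency check on the `vits.json` decoder settings: the product of `upsample_rates` is 8 × 8 × 2 × 2 = 256, which equals `hop_size: 256` — the HiFi-GAN-style decoder must upsample by exactly the hop size so that each spectrogram frame expands into the right number of waveform samples.
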
config/vocoder.json DELETED
@@ -1,84 +0,0 @@
-{
-    "base_config": "config/base.json",
-    "dataset": [
-        "LJSpeech",
-        "LibriTTS",
-        "opencpop",
-        "m4singer",
-        "svcc",
-        "svcceval",
-        "pjs",
-        "opensinger",
-        "popbutfy",
-        "nus48e",
-        "popcs",
-        "kising",
-        "csd",
-        "opera",
-        "vctk",
-        "lijian",
-        "cdmusiceval"
-    ],
-    "task_type": "vocoder",
-    "preprocess": {
-        // acoustic features
-        "extract_mel": true,
-        "extract_pitch": false,
-        "extract_uv": false,
-        "extract_audio": true,
-        "extract_label": false,
-        "extract_one_hot": false,
-        "extract_amplitude_phase": false,
-        "pitch_extractor": "parselmouth",
-        // Settings for data preprocessing
-        "n_mel": 100,
-        "win_size": 1024,
-        "hop_size": 256,
-        "sample_rate": 24000,
-        "n_fft": 1024,
-        "fmin": 0,
-        "fmax": 12000,
-        "f0_min": 50,
-        "f0_max": 1100,
-        "pitch_bin": 256,
-        "pitch_max": 1100.0,
-        "pitch_min": 50.0,
-        "is_mu_law": false,
-        "bits": 8,
-        "cut_mel_frame": 32,
-        // Directory names of processed data or extracted features
-        "spk2id": "singers.json",
-        // Features used for model training
-        "use_mel": true,
-        "use_frame_pitch": false,
-        "use_uv": false,
-        "use_audio": true,
-        "use_label": false,
-        "use_one_hot": false,
-        "train_file": "train.json",
-        "valid_file": "test.json"
-    },
-    "train": {
-        "random_seed": 114514,
-        "batch_size": 64,
-        "gradient_accumulation_step": 1,
-        "max_epoch": 1000000,
-        "save_checkpoint_stride": [
-            20
-        ],
-        "run_eval": [
-            true
-        ],
-        "sampler": {
-            "holistic_shuffle": true,
-            "drop_last": true
-        },
-        "dataloader": {
-            "num_worker": 4,
-            "pin_memory": true
-        },
-        "tracker": [
-            "tensorboard"
-        ],
-    }
-}

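`vocoder.json` fixes the analysis front end at 24 kHz audio, a 1024-point FFT with a 256-sample hop, and 100 mel bins spanning 0–12 kHz. A sketch of mel extraction with those values, assuming `librosa` (Amphion's actual extractor may differ in normalization details):

```python
import librosa
import numpy as np

y, sr = librosa.load("example.wav", sr=24000)
mel = librosa.feature.melspectrogram(
    y=y, sr=sr,
    n_fft=1024, hop_length=256, win_length=1024,
    n_mels=100, fmin=0, fmax=12000,
)
log_mel = np.log(np.clip(mel, 1e-5, None))  # common log-compression floor
print(log_mel.shape)  # (100, n_frames)
```
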
egs/svc/MultipleContentsSVC/README.md DELETED
@@ -1,153 +0,0 @@
1
- # Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion
2
-
3
- [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
4
- [![demo](https://img.shields.io/badge/SVC-Demo-red)](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html)
5
-
6
- <br>
7
- <div align="center">
8
- <img src="../../../imgs/svc/MultipleContentsSVC.png" width="85%">
9
- </div>
10
- <br>
11
-
12
- This is the official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Specially,
13
-
14
- - The muptile content features are from [Whipser](https://github.com/wenet-e2e/wenet) and [ContentVec](https://github.com/auspicious3000/contentvec).
15
- - The acoustic model is based on Bidirectional Non-Causal Dilated CNN (called `DiffWaveNetSVC` in Amphion), which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
16
- - The vocoder is [BigVGAN](https://github.com/NVIDIA/BigVGAN) architecture and we fine-tuned it in over 120 hours singing voice data.
17
-
18
- There are four stages in total:
19
-
20
- 1. Data preparation
21
- 2. Features extraction
22
- 3. Training
23
- 4. Inference/conversion
24
-
25
- > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
26
- > ```bash
27
- > cd Amphion
28
- > ```
29
-
30
- ## 1. Data Preparation
31
-
32
- ### Dataset Download
33
-
34
- By default, we utilize the five datasets for training: M4Singer, Opencpop, OpenSinger, SVCC, and VCTK. How to download them is detailed [here](../../datasets/README.md).
35
-
36
- ### Configuration
37
-
38
- Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
39
-
40
- ```json
41
- "dataset": [
42
- "m4singer",
43
- "opencpop",
44
- "opensinger",
45
- "svcc",
46
- "vctk"
47
- ],
48
- "dataset_path": {
49
- // TODO: Fill in your dataset path
50
- "m4singer": "[M4Singer dataset path]",
51
- "opencpop": "[Opencpop dataset path]",
52
- "opensinger": "[OpenSinger dataset path]",
53
- "svcc": "[SVCC dataset path]",
54
- "vctk": "[VCTK dataset path]"
55
- },
56
- ```
57
-
58
- ## 2. Features Extraction
59
-
60
- ### Content-based Pretrained Models Download
61
-
62
- By default, we utilize the Whisper and ContentVec to extract content features. How to download them is detailed [here](../../../pretrained/README.md).
63
-
64
- ### Configuration
65
-
66
- Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
67
-
68
- ```json
69
- // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
70
- "log_dir": "ckpts/svc",
71
- "preprocess": {
72
- // TODO: Fill in the output data path. The default value is "Amphion/data"
73
- "processed_dir": "data",
74
- ...
75
- },
76
- ```
77
-
78
- ### Run
79
-
80
- Run the `run.sh` as the preproces stage (set `--stage 1`).
81
-
82
- ```bash
83
- sh egs/svc/MultipleContentsSVC/run.sh --stage 1
84
- ```
85
-
86
- > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "1"`.
87
-
88
- ## 3. Training
89
-
90
- ### Configuration
91
-
92
- We provide the default hyparameters in the `exp_config.json`. They can work on single NVIDIA-24g GPU. You can adjust them based on you GPU machines.
93
-
94
- ```json
95
- "train": {
96
- "batch_size": 32,
97
- ...
98
- "adamw": {
99
- "lr": 2.0e-4
100
- },
101
- ...
102
- }
103
- ```
104
-
105
- ### Run
106
-
107
- Run the `run.sh` as the training stage (set `--stage 2`). Specify a experimental name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/svc/[YourExptName]`.
108
-
109
- ```bash
110
- sh egs/svc/MultipleContentsSVC/run.sh --stage 2 --name [YourExptName]
111
- ```
112
-
113
- > **NOTE:** The `CUDA_VISIBLE_DEVICES` is set as `"0"` in default. You can change it when running `run.sh` by specifying such as `--gpu "0,1,2,3"`.
114
-
115
- ## 4. Inference/Conversion
116
-
117
- ### Pretrained Vocoder Download
118
-
119
- We fine-tune the official BigVGAN pretrained model with over 120 hours singing voice data. The benifits of fine-tuning has been investigated in our paper (see this [demo page](https://www.zhangxueyao.com/data/MultipleContentsSVC/vocoder.html)). The final pretrained singing voice vocoder is released [here](../../../pretrained/README.md#amphion-singing-bigvgan) (called `Amphion Singing BigVGAN`).
120
-
121
- ### Run
122
-
123
- For inference/conversion, you need to specify the following configurations when running `run.sh`:
124
-
125
- | Parameters | Description | Example |
126
- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
127
- | `--infer_expt_dir` | The experimental directory which contains `checkpoint` | `Amphion/ckpts/svc/[YourExptName]` |
128
- | `--infer_output_dir` | The output directory to save inferred audios. | `Amphion/ckpts/svc/[YourExptName]/result` |
129
- | `--infer_source_file` or `--infer_source_audio_dir` | The inference source (can be a json file or a dir). | The `infer_source_file` could be `Amphion/data/[YourDataset]/test.json`, and the `infer_source_audio_dir` is a folder which includes several audio files (*.wav, *.mp3 or *.flac). |
130
- | `--infer_target_speaker` | The target speaker you want to convert into. You can refer to `Amphion/ckpts/svc/[YourExptName]/singers.json` to choose a trained speaker. | For opencpop dataset, the speaker name would be `opencpop_female1`. |
131
- | `--infer_key_shift` | How many semitones you want to transpose. | `"autoshift"` (by default), `3`, `-3`, etc. |
132
-
133
- For example, if you want to make `opencpop_female1` sing the songs in the `[Your Audios Folder]`, just run:
134
-
135
- ```bash
136
- sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
137
- --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
138
- --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
139
- --infer_source_audio_dir [Your Audios Folder] \
140
- --infer_target_speaker "opencpop_female1" \
141
- --infer_key_shift "autoshift"
142
- ```
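-
- Alternatively, if you want to convert the test split of a processed dataset, or point the pipeline to a vocoder checkpoint in a custom location, the template `run.sh` also accepts `--infer_source_file` and `--infer_vocoder_dir` (the latter defaults to `Amphion/pretrained/bigvgan`). A sketch with placeholder paths:
-
- ```bash
- sh egs/svc/MultipleContentsSVC/run.sh --stage 3 --gpu "0" \
-     --infer_expt_dir Amphion/ckpts/svc/[YourExptName] \
-     --infer_output_dir Amphion/ckpts/svc/[YourExptName]/result \
-     --infer_source_file Amphion/data/[YourDataset]/test.json \
-     --infer_target_speaker "opencpop_female1" \
-     --infer_key_shift "autoshift" \
-     --infer_vocoder_dir Amphion/pretrained/bigvgan
- ```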
143
-
144
- ## Citations
145
-
146
- ```bibtex
147
- @article{zhang2023leveraging,
148
- title={Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion},
149
- author={Zhang, Xueyao and Gu, Yicheng and Chen, Haopeng and Fang, Zihao and Zou, Lexiao and Xue, Liumeng and Wu, Zhizheng},
150
- journal={Machine Learning for Audio Workshop, NeurIPS 2023},
151
- year={2023}
152
- }
153
- ```
egs/svc/MultipleContentsSVC/exp_config.json DELETED
@@ -1,126 +0,0 @@
1
- {
2
- "base_config": "config/diffusion.json",
3
- "model_type": "DiffWaveNetSVC",
4
- "dataset": [
5
- "m4singer",
6
- "opencpop",
7
- "opensinger",
8
- "svcc",
9
- "vctk"
10
- ],
11
- "dataset_path": {
12
- // TODO: Fill in your dataset path
13
- "m4singer": "[M4Singer dataset path]",
14
- "opencpop": "[Opencpop dataset path]",
15
- "opensinger": "[OpenSinger dataset path]",
16
- "svcc": "[SVCC dataset path]",
17
- "vctk": "[VCTK dataset path]"
18
- },
19
- // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
20
- "log_dir": "ckpts/svc",
21
- "preprocess": {
22
- // TODO: Fill in the output data path. The default value is "Amphion/data"
23
- "processed_dir": "data",
24
- // Config for features extraction
25
- "extract_mel": true,
26
- "extract_pitch": true,
27
- "extract_energy": true,
28
- "extract_whisper_feature": true,
29
- "extract_contentvec_feature": true,
30
- "extract_wenet_feature": false,
31
- "whisper_batch_size": 30, // decrease it if your GPU is out of memory
32
- "contentvec_batch_size": 1,
33
- // Fill in the content-based pretrained model's path
34
- "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
35
- "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
36
- "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
37
- "whisper_model": "medium",
38
- "whisper_model_path": "pretrained/whisper/medium.pt",
39
- // Config for features usage
40
- "use_mel": true,
41
- "use_min_max_norm_mel": true,
42
- "use_frame_pitch": true,
43
- "use_frame_energy": true,
44
- "use_spkid": true,
45
- "use_whisper": true,
46
- "use_contentvec": true,
47
- "use_wenet": false,
48
- "n_mel": 100,
49
- "sample_rate": 24000
50
- },
51
- "model": {
52
- "condition_encoder": {
53
- // Config for features usage
54
- "use_whisper": true,
55
- "use_contentvec": true,
56
- "use_wenet": false,
57
- "whisper_dim": 1024,
58
- "contentvec_dim": 256,
59
- "wenet_dim": 512,
60
- "use_singer_encoder": false,
61
- "pitch_min": 50,
62
- "pitch_max": 1100
63
- },
64
- "diffusion": {
65
- "scheduler": "ddpm",
66
- "scheduler_settings": {
67
- "num_train_timesteps": 1000,
68
- "beta_start": 1.0e-4,
69
- "beta_end": 0.02,
70
- "beta_schedule": "linear"
71
- },
72
- // Diffusion steps encoder
73
- "step_encoder": {
74
- "dim_raw_embedding": 128,
75
- "dim_hidden_layer": 512,
76
- "activation": "SiLU",
77
- "num_layer": 2,
78
- "max_period": 10000
79
- },
80
- // Diffusion decoder
81
- "model_type": "bidilconv",
82
- // bidilconv, unet2d, TODO: unet1d
83
- "bidilconv": {
84
- "base_channel": 512,
85
- "n_res_block": 40,
86
- "conv_kernel_size": 3,
87
- "dilation_cycle_length": 4,
88
- // specifically, 1 means no dilation
89
- "conditioner_size": 384
90
- }
91
- }
92
- },
93
- "train": {
94
- "batch_size": 32,
95
- "gradient_accumulation_step": 1,
96
- "max_epoch": -1, // -1 means no limit
97
- "save_checkpoint_stride": [
98
- 3,
99
- 50
100
- ],
101
- "keep_last": [
102
- 3,
103
- 2
104
- ],
105
- "run_eval": [
106
- true,
107
- true
108
- ],
109
- "adamw": {
110
- "lr": 2.0e-4
111
- },
112
- "reducelronplateau": {
113
- "factor": 0.8,
114
- "patience": 30,
115
- "min_lr": 1.0e-4
116
- },
117
- "dataloader": {
118
- "num_worker": 8,
119
- "pin_memory": true
120
- },
121
- "sampler": {
122
- "holistic_shuffle": false,
123
- "drop_last": true
124
- }
125
- }
126
- }
egs/svc/MultipleContentsSVC/run.sh DELETED
@@ -1 +0,0 @@
1
- ../_template/run.sh
egs/svc/README.md DELETED
@@ -1,34 +0,0 @@
1
- # Amphion Singing Voice Conversion (SVC) Recipe
2
-
3
- ## Quick Start
4
-
5
- We provide a **[beginner recipe](MultipleContentsSVC)** to demonstrate how to train a cutting-edge SVC model. Specifically, it is also an official implementation of the paper "[Leveraging Content-based Features from Multiple Acoustic Models for Singing Voice Conversion](https://arxiv.org/abs/2310.11160)" (NeurIPS 2023 Workshop on Machine Learning for Audio). Some demos can be seen [here](https://www.zhangxueyao.com/data/MultipleContentsSVC/index.html).
6
-
7
- ## Supported Model Architectures
8
-
9
- The main idea of SVC is to first disentangle the speaker-agnostic representations from the source audio, and then inject the desired speaker information to synthesize the target, which usually utilizes an acoustic decoder and a subsequent waveform synthesizer (vocoder):
10
-
11
- <br>
12
- <div align="center">
13
- <img src="../../imgs/svc/pipeline.png" width="70%">
14
- </div>
15
- <br>
16
-
17
- Until now, Amphion SVC has supported the following features and models:
18
-
19
- - **Speaker-agnostic Representations**:
20
- - Content Features: Sourced from [WeNet](https://github.com/wenet-e2e/wenet), [Whisper](https://github.com/openai/whisper), and [ContentVec](https://github.com/auspicious3000/contentvec).
21
- - Prosody Features: F0 and energy.
22
- - **Speaker Embeddings**:
23
- - Speaker Look-Up Table.
24
- - Reference Encoder (👨‍💻 developing): It can be used for zero-shot SVC.
25
- - **Acoustic Decoders**:
26
- - Diffusion-based models:
27
- - **[DiffWaveNetSVC](MultipleContentsSVC)**: The encoder is based on Bidirectional Non-Causal Dilated CNN, which is similar to [WaveNet](https://arxiv.org/pdf/1609.03499.pdf), [DiffWave](https://openreview.net/forum?id=a-xFK8Ymz5J), and [DiffSVC](https://ieeexplore.ieee.org/document/9688219).
28
- - **[DiffComoSVC](DiffComoSVC)** (👨‍💻 developing): The diffusion framework is based on [Consistency Model](https://proceedings.mlr.press/v202/song23a.html). It can significantly accelerate the inference process of the diffusion model.
29
- - Transformer-based models:
30
- - **[TransformerSVC](TransformerSVC)**: Encoder-only and Non-autoregressive Transformer Architecture.
31
- - VAE- and Flow-based models:
32
- - **[VitsSVC]()** (👨‍💻 developing): It is designed as a [VITS](https://arxiv.org/abs/2106.06103)-like model whose textual input is replaced by the content features, which is similar to [so-vits-svc](https://github.com/svc-develop-team/so-vits-svc).
33
- - **Waveform Synthesizers (Vocoders)**:
34
- - The supported vocoders can be seen in [Amphion Vocoder Recipe](../vocoder/README.md).
egs/svc/_template/run.sh DELETED
@@ -1,150 +0,0 @@
1
- # Copyright (c) 2023 Amphion.
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- ######## Build Experiment Environment ###########
7
- exp_dir=$(cd `dirname $0`; pwd)
8
- work_dir=$(dirname $(dirname $(dirname $exp_dir)))
9
-
10
- export WORK_DIR=$work_dir
11
- export PYTHONPATH=$work_dir
12
- export PYTHONIOENCODING=UTF-8
13
-
14
- ######## Parse the Given Parameters from the Command ###########
15
- options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,resume_from_ckpt_path:,resume_type:,infer_expt_dir:,infer_output_dir:,infer_source_file:,infer_source_audio_dir:,infer_target_speaker:,infer_key_shift:,infer_vocoder_dir: -- "$@")
16
- eval set -- "$options"
17
-
18
- while true; do
19
- case $1 in
20
- # Experimental Configuration File
21
- -c | --config) shift; exp_config=$1 ; shift ;;
22
- # Experimental Name
23
- -n | --name) shift; exp_name=$1 ; shift ;;
24
- # Running Stage
25
- -s | --stage) shift; running_stage=$1 ; shift ;;
26
- # Visible GPU machines. The default value is "0".
27
- --gpu) shift; gpu=$1 ; shift ;;
28
-
29
- # [Only for Training] Resume configuration
30
- --resume) shift; resume=$1 ; shift ;;
31
- # [Only for Training] The specific checkpoint path that you want to resume from.
32
- --resume_from_ckpt_path) shift; resume_from_ckpt_path=$1 ; shift ;;
33
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
- --resume_type) shift; resume_type=$1 ; shift ;;
35
-
36
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
37
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
38
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
39
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
40
- # [Only for Inference] The inference source (can be a json file or a dir). For example, the source_file can be "[Your path to save processed data]/[YourDataset]/test.json", and the source_audio_dir can be "$work_dir/source_audio" which includes several audio files (*.wav, *.mp3 or *.flac).
41
- --infer_source_file) shift; infer_source_file=$1 ; shift ;;
42
- --infer_source_audio_dir) shift; infer_source_audio_dir=$1 ; shift ;;
43
- # [Only for Inference] Specify the target speaker you want to convert into. You can refer to "[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json". In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for opencpop dataset, the speaker name would be "opencpop_female1".
44
- --infer_target_speaker) shift; infer_target_speaker=$1 ; shift ;;
45
- # [Only for Inference] For advanced users, you can modify the trans_key parameters into an integer (which means the semitones you want to transpose). Its default value is "autoshift".
46
- --infer_key_shift) shift; infer_key_shift=$1 ; shift ;;
47
- # [Only for Inference] The vocoder dir. Its default value is Amphion/pretrained/bigvgan. See Amphion/pretrained/README.md to download the pretrained BigVGAN vocoders.
48
- --infer_vocoder_dir) shift; infer_vocoder_dir=$1 ; shift ;;
49
-
50
- --) shift ; break ;;
51
- *) echo "Invalid option: $1"; exit 1 ;;
52
- esac
53
- done
54
-
55
-
56
- ### Value check ###
57
- if [ -z "$running_stage" ]; then
58
- echo "[Error] Please specify the running stage"
59
- exit 1
60
- fi
61
-
62
- if [ -z "$exp_config" ]; then
63
- exp_config="${exp_dir}"/exp_config.json
64
- fi
65
- echo "Experimental Configuration File: $exp_config"
66
-
67
- if [ -z "$gpu" ]; then
68
- gpu="0"
69
- fi
70
-
71
- ######## Features Extraction ###########
72
- if [ $running_stage -eq 1 ]; then
73
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/svc/preprocess.py \
74
- --config $exp_config \
75
- --num_workers 4
76
- fi
77
-
78
- ######## Training ###########
79
- if [ $running_stage -eq 2 ]; then
80
- if [ -z "$exp_name" ]; then
81
- echo "[Error] Please specify the experiment name"
82
- exit 1
83
- fi
84
- echo "Experimental Name: $exp_name"
85
-
86
- if [ "$resume" = true ]; then
87
- echo "Automatically resume from the experimental dir..."
88
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/svc/train.py \
89
- --config "$exp_config" \
90
- --exp_name "$exp_name" \
91
- --log_level info \
92
- --resume
93
- else
94
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/svc/train.py \
95
- --config "$exp_config" \
96
- --exp_name "$exp_name" \
97
- --log_level info \
98
- --resume_from_ckpt_path "$resume_from_ckpt_path" \
99
- --resume_type "$resume_type"
100
- fi
101
- fi
102
-
103
- ######## Inference/Conversion ###########
104
- if [ $running_stage -eq 3 ]; then
105
- if [ -z "$infer_expt_dir" ]; then
106
- echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
107
- exit 1
108
- fi
109
-
110
- if [ -z "$infer_output_dir" ]; then
111
- infer_output_dir="$infer_expt_dir/result"
112
- fi
113
-
114
- if [ -z "$infer_source_file" ] && [ -z "$infer_source_audio_dir" ]; then
115
- echo "[Error] Please specify the source file/dir. The inference source can be a json file or a dir. For example, the source_file can be '[Your path to save processed data]/[YourDataset]/test.json', and the source_audio_dir should include several audio files (*.wav, *.mp3 or *.flac)."
116
- exit 1
117
- fi
118
-
119
- if [ -z "$infer_source_file" ]; then
120
- infer_source=$infer_source_audio_dir
121
- fi
122
-
123
- if [ -z "$infer_source_audio_dir" ]; then
124
- infer_source=$infer_source_file
125
- fi
126
-
127
- if [ -z "$infer_target_speaker" ]; then
128
- echo "[Error] Please specify the target speaker. You can refer to '[Your path to save logs and checkpoints]/[Your Expt Name]/singers.json'. In this singer look-up table, you can see the usable speaker names (all the keys of the dictionary). For example, for the opencpop dataset, the speaker name would be 'opencpop_female1'."
129
- exit 1
130
- fi
131
-
132
- if [ -z "$infer_key_shift" ]; then
133
- infer_key_shift="autoshift"
134
- fi
135
-
136
- if [ -z "$infer_vocoder_dir" ]; then
137
- infer_vocoder_dir="$work_dir"/pretrained/bigvgan
138
- echo "[Warning] You did not specify the infer_vocoder_dir. It is set to $infer_vocoder_dir by default. Make sure that you have followed Amphion/pretrained/README.md to download the pretrained BigVGAN vocoder checkpoint."
139
- fi
140
-
141
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/svc/inference.py \
142
- --config $exp_config \
143
- --acoustics_dir $infer_expt_dir \
144
- --vocoder_dir $infer_vocoder_dir \
145
- --target_singer $infer_target_speaker \
146
- --trans_key $infer_key_shift \
147
- --source $infer_source \
148
- --output_dir $infer_output_dir \
149
- --log_level debug
150
- fi
egs/vocoder/README.md DELETED
@@ -1,23 +0,0 @@
1
- # Amphion Vocoder Recipe
2
-
3
- ## Quick Start
4
-
5
- We provide a [**beginner recipe**](gan/tfr_enhanced_hifigan/README.md) to demonstrate how to train a high-quality HiFi-GAN speech vocoder. Specifically, it is also an official implementation of our paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". Some demos can be seen [here](https://vocodexelysium.github.io/MS-SB-CQTD/).
6
-
7
- ## Supported Models
8
-
9
- A neural vocoder generates audible waveforms from acoustic representations, which is one of the key parts of current audio generation systems. Until now, Amphion has supported various widely used vocoders of different types, including:
10
-
11
- - **GAN-based vocoders**, which we have provided [**a unified recipe**](gan/README.md) :
12
- - [MelGAN](https://arxiv.org/abs/1910.06711)
13
- - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
14
- - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
15
- - [BigVGAN](https://arxiv.org/abs/2206.04658)
16
- - [APNet](https://arxiv.org/abs/2305.07952)
17
- - **Flow-based vocoders** (👨‍💻 developing):
18
- - [WaveGlow](https://arxiv.org/abs/1811.00002)
19
- - **Diffusion-based vocoders** (👨‍💻 developing):
20
- - [Diffwave](https://arxiv.org/abs/2009.09761)
21
- - **Auto-regressive based vocoders** (👨‍💻 developing):
22
- - [WaveNet](https://arxiv.org/abs/1609.03499)
23
- - [WaveRNN](https://arxiv.org/abs/1802.08435v1)
egs/vocoder/diffusion/README.md DELETED
File without changes
egs/vocoder/diffusion/exp_config_base.json DELETED
File without changes
egs/vocoder/gan/README.md DELETED
@@ -1,224 +0,0 @@
1
- # Amphion GAN-based Vocoder Recipe
2
-
3
- ## Supported Model Architectures
4
-
5
- GAN-based Vocoder consists of a generator and multiple discriminators, as illustrated below:
6
-
7
- <br>
8
- <div align="center">
9
- <img src="../../../imgs/vocoder/gan/pipeline.png" width="40%">
10
- </div>
11
- <br>
12
-
13
- Until now, Amphion GAN-based Vocoder has supported the following generators and discriminators.
14
-
15
- - **Generators**
16
- - [MelGAN](https://arxiv.org/abs/1910.06711)
17
- - [HiFi-GAN](https://arxiv.org/abs/2010.05646)
18
- - [NSF-HiFiGAN](https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts)
19
- - [BigVGAN](https://arxiv.org/abs/2206.04658)
20
- - [APNet](https://arxiv.org/abs/2305.07952)
21
- - **Discriminators**
22
- - [Multi-Scale Discriminator](https://arxiv.org/abs/2010.05646)
23
- - [Multi-Period Discriminator](https://arxiv.org/abs/2010.05646)
24
- - [Multi-Resolution Discriminator](https://arxiv.org/abs/2011.09631)
25
- - [Multi-Scale Short-Time Fourier Transform Discriminator](https://arxiv.org/abs/2210.13438)
26
- - [**Multi-Scale Constant-Q Transform Discriminator (ours)**](https://arxiv.org/abs/2311.14957)
27
-
28
- You can use any vocoder architecture with any dataset you want. There are four steps in total:
29
-
30
- 1. Data preparation
31
- 2. Feature extraction
32
- 3. Training
33
- 4. Inference
34
-
35
- > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
36
- > ```bash
37
- > cd Amphion
38
- > ```
39
-
40
- ## 1. Data Preparation
41
-
42
- You can train the vocoder with any dataset. Amphion's supported open-source datasets are detailed [here](../../../datasets/README.md).
43
-
44
- ### Configuration
45
-
46
- Specify the dataset path in `exp_config_base.json`. Note that you can change the `dataset` list to use your preferred datasets.
47
-
48
- ```json
49
- "dataset": [
50
- "csd",
51
- "kising",
52
- "m4singer",
53
- "nus48e",
54
- "opencpop",
55
- "opensinger",
56
- "opera",
57
- "pjs",
58
- "popbutfy",
59
- "popcs",
60
- "ljspeech",
61
- "vctk",
62
- "libritts",
63
- ],
64
- "dataset_path": {
65
- // TODO: Fill in your dataset path
66
- "csd": "[dataset path]",
67
- "kising": "[dataset path]",
68
- "m4singer": "[dataset path]",
69
- "nus48e": "[dataset path]",
70
- "opencpop": "[dataset path]",
71
- "opensinger": "[dataset path]",
72
- "opera": "[dataset path]",
73
- "pjs": "[dataset path]",
74
- "popbutfy": "[dataset path]",
75
- "popcs": "[dataset path]",
76
- "ljspeech": "[dataset path]",
77
- "vctk": "[dataset path]",
78
- "libritts": "[dataset path]",
79
- },
80
- ```
81
-
82
- ## 2. Feature Extraction
83
-
84
- The needed features are specified in the individual vocoder directory, so this step doesn't require any modification.
85
-
86
- ### Configuration
87
-
88
- Specify the dataset path and the output path for saving the processed data and the training model in `exp_config_base.json`:
89
-
90
- ```json
91
- // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
92
- "log_dir": "ckpts/vocoder",
93
- "preprocess": {
94
- // TODO: Fill in the output data path. The default value is "Amphion/data"
95
- "processed_dir": "data",
96
- ...
97
- },
98
- ```
99
-
100
- ### Run
101
-
102
- Run the `run.sh` as the preprocessing stage (set `--stage 1`).
103
-
104
- ```bash
105
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1
106
- ```
107
-
108
- > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "1"`.
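-
- If your configuration file lives somewhere else, every recipe script also accepts an explicit `-c`/`--config` option instead of the default `exp_config.json` next to the script. A sketch with a placeholder path:
-
- ```bash
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 1 --gpu "1" \
-     --config [Your path to your exp_config.json]
- ```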
109
-
110
- ## 3. Training
111
-
112
- ### Configuration
113
-
114
- We provide the default hyperparameters in `exp_config_base.json`. They can work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
115
-
116
- ```json
117
- "train": {
118
- "batch_size": 16,
119
- "max_epoch": 1000000,
120
- "save_checkpoint_stride": [20],
121
- "adamw": {
122
- "lr": 2.0e-4,
123
- "adam_b1": 0.8,
124
- "adam_b2": 0.99
125
- },
126
- "exponential_lr": {
127
- "lr_decay": 0.999
128
- },
129
- }
130
- ```
131
-
132
- You can also choose any number of preferred discriminators for training in `exp_config_base.json`.
133
-
134
- ```json
135
- "discriminators": [
136
- "msd",
137
- "mpd",
138
- "msstftd",
139
- "mssbcqtd",
140
- ],
141
- ```
142
-
143
- ### Run
144
-
145
- Run the `run.sh` as the training stage (set `--stage 2`). Specify an experimental name to run the following command. The TensorBoard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
146
-
147
- ```bash
148
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName]
149
- ```
150
-
151
- > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
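-
- If training stops midway, the recipe scripts also expose `--resume`, `--checkpoint`, and `--resume_type` options for picking it up again. A minimal sketch (the checkpoint path is a placeholder you should adapt):
-
- ```bash
- # Resume the latest checkpoint under the experimental dir
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName] \
-     --resume true
-
- # Or load only the model weights from a specific checkpoint (fine-tuning)
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 2 --name [YourExptName] \
-     --checkpoint "ckpts/vocoder/[YourExptName]/checkpoint/[YourCheckpoint]" \
-     --resume_type "finetune"
- ```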
152
-
153
-
154
- ## 4. Inference
155
-
156
- ### Run
157
-
158
- Run the `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
159
-
160
- ```bash
161
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
162
- --infer_mode [Your chosen inference mode] \
163
- --infer_datasets [Datasets you want to inference, needed when infer_from_dataset] \
164
- --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
165
- --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
166
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
167
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
168
- ```
169
-
170
- #### a. Inference from Dataset
171
-
172
- Run the `run.sh` with specified datasets, here is an example.
173
-
174
- ```bash
175
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
176
- --infer_mode infer_from_dataset \
177
- --infer_datasets "libritts vctk ljspeech" \
178
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
179
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
180
- ```
181
-
182
- #### b. Inference from Features
183
-
184
- If you want to run inference from your generated acoustic features, you should first prepare them into the following structure:
185
-
186
- ```plaintext
187
- ┣ {infer_feature_dir}
188
- ┃ ┣ mels
189
- ┃ ┃ ┣ sample1.npy
190
- ┃ ┃ ┣ sample2.npy
191
- ┃ ┣ f0s (required if you use NSF-HiFiGAN)
192
- ┃ ┃ ┣ sample1.npy
193
- ┃ ┃ ┣ sample2.npy
194
- ```
195
-
196
- Then run the `run.sh` with the specified feature folder; here is an example.
197
-
198
- ```bash
199
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
200
- --infer_mode infer_from_feature \
201
- --infer_feature_dir [Your path to your predicted acoustic features] \
202
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
203
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
204
- ```
205
-
206
- #### c. Inference from Audios
207
-
208
- If you want to run inference from audio for a quick analysis-synthesis check, you should first prepare your audio files into the following structure:
209
-
210
- ```plaintext
211
- ┣ audios
212
- ┃ ┣ sample1.wav
213
- ┃ ┣ sample2.wav
214
- ```
215
-
216
- Then run the `run.sh` with the specified audio folder; here is an example.
217
-
218
- ```bash
219
- sh egs/vocoder/gan/{vocoder_name}/run.sh --stage 3 \
220
- --infer_mode infer_from_audio \
221
- --infer_audio_dir [Your path to your audio files] \
222
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
223
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
224
- ```
egs/vocoder/gan/_template/run.sh DELETED
@@ -1,143 +0,0 @@
1
- # Copyright (c) 2023 Amphion.
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- ######## Build Experiment Environment ###########
7
- exp_dir=$(cd `dirname $0`; pwd)
8
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
-
10
- export WORK_DIR=$work_dir
11
- export PYTHONPATH=$work_dir
12
- export PYTHONIOENCODING=UTF-8
13
-
14
- ######## Parse the Given Parameters from the Command ###########
15
- options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
- eval set -- "$options"
17
-
18
- while true; do
19
- case $1 in
20
- # Experimental Configuration File
21
- -c | --config) shift; exp_config=$1 ; shift ;;
22
- # Experimental Name
23
- -n | --name) shift; exp_name=$1 ; shift ;;
24
- # Running Stage
25
- -s | --stage) shift; running_stage=$1 ; shift ;;
26
- # Visible GPU machines. The default value is "0".
27
- --gpu) shift; gpu=$1 ; shift ;;
28
-
29
- # [Only for Training] Resume configuration
30
- --resume) shift; resume=$1 ; shift ;;
31
- # [Only for Training] The specific checkpoint path that you want to resume from.
32
- --checkpoint) shift; checkpoint=$1 ; shift ;;
33
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
- --resume_type) shift; resume_type=$1 ; shift ;;
35
-
36
- # [Only for Inference] The inference mode
37
- --infer_mode) shift; infer_mode=$1 ; shift ;;
38
- # [Only for Inference] The datasets to run inference on
39
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
- # [Only for Inference] The feature dir for inference
41
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
- # [Only for Inference] The audio dir for inference
43
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
47
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
-
49
- --) shift ; break ;;
50
- *) echo "Invalid option: $1"; exit 1 ;;
51
- esac
52
- done
53
-
54
-
55
- ### Value check ###
56
- if [ -z "$running_stage" ]; then
57
- echo "[Error] Please specify the running stage"
58
- exit 1
59
- fi
60
-
61
- if [ -z "$exp_config" ]; then
62
- exp_config="${exp_dir}"/exp_config.json
63
- fi
64
- echo "Experimental Configuration File: $exp_config"
65
-
66
- if [ -z "$gpu" ]; then
67
- gpu="0"
68
- fi
69
-
70
- ######## Features Extraction ###########
71
- if [ $running_stage -eq 1 ]; then
72
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
- --config $exp_config \
74
- --num_workers 8
75
- fi
76
-
77
- ######## Training ###########
78
- if [ $running_stage -eq 2 ]; then
79
- if [ -z "$exp_name" ]; then
80
- echo "[Error] Please specify the experiment name"
81
- exit 1
82
- fi
83
- echo "Experimental Name: $exp_name"
84
-
85
- if [ "$resume" = true ]; then
86
- echo "Automatically resume from the experimental dir..."
87
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
- --config "$exp_config" \
89
- --exp_name "$exp_name" \
90
- --log_level info \
91
- --resume
92
- else
93
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
- --config "$exp_config" \
95
- --exp_name "$exp_name" \
96
- --log_level info \
97
- --checkpoint "$checkpoint" \
98
- --resume_type "$resume_type"
99
- fi
100
- fi
101
-
102
- ######## Inference/Conversion ###########
103
- if [ $running_stage -eq 3 ]; then
104
- if [ -z "$infer_expt_dir" ]; then
105
- echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
- exit 1
107
- fi
108
-
109
- if [ -z "$infer_output_dir" ]; then
110
- infer_output_dir="$infer_expt_dir/result"
111
- fi
112
-
113
- if [ $infer_mode = "infer_from_dataset" ]; then
114
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
- --config $exp_config \
116
- --infer_mode $infer_mode \
117
- --infer_datasets $infer_datasets \
118
- --vocoder_dir $infer_expt_dir \
119
- --output_dir $infer_output_dir \
120
- --log_level debug
121
- fi
122
-
123
- if [ $infer_mode = "infer_from_feature" ]; then
124
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
- --config $exp_config \
126
- --infer_mode $infer_mode \
127
- --feature_folder $infer_feature_dir \
128
- --vocoder_dir $infer_expt_dir \
129
- --output_dir $infer_output_dir \
130
- --log_level debug
131
- fi
132
-
133
- if [ $infer_mode = "infer_from_audio" ]; then
134
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
- --config $exp_config \
136
- --infer_mode $infer_mode \
137
- --audio_folder $infer_audio_dir \
138
- --vocoder_dir $infer_expt_dir \
139
- --output_dir $infer_output_dir \
140
- --log_level debug
141
- fi
142
-
143
- fi
egs/vocoder/gan/apnet/exp_config.json DELETED
@@ -1,45 +0,0 @@
1
- {
2
- "base_config": "egs/vocoder/gan/exp_config_base.json",
3
- "preprocess": {
4
- // acoustic features
5
- "extract_mel": true,
6
- "extract_audio": true,
7
- "extract_amplitude_phase": true,
8
-
9
- // Features used for model training
10
- "use_mel": true,
11
- "use_audio": true,
12
- "use_amplitude_phase": true
13
- },
14
- "model": {
15
- "generator": "apnet",
16
- "apnet": {
17
- "ASP_channel": 512,
18
- "ASP_resblock_kernel_sizes": [3,7,11],
19
- "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
20
- "ASP_input_conv_kernel_size": 7,
21
- "ASP_output_conv_kernel_size": 7,
22
-
23
- "PSP_channel": 512,
24
- "PSP_resblock_kernel_sizes": [3,7,11],
25
- "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
26
- "PSP_input_conv_kernel_size": 7,
27
- "PSP_output_R_conv_kernel_size": 7,
28
- "PSP_output_I_conv_kernel_size": 7,
29
- }
30
- },
31
- "train": {
32
- "criterions": [
33
- "feature",
34
- "discriminator",
35
- "generator",
36
- "mel",
37
- "phase",
38
- "amplitude",
39
- "consistency"
40
- ]
41
- },
42
- "inference": {
43
- "batch_size": 1,
44
- }
45
- }
egs/vocoder/gan/apnet/run.sh DELETED
@@ -1,143 +0,0 @@
1
- # Copyright (c) 2023 Amphion.
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- ######## Build Experiment Environment ###########
7
- exp_dir=$(cd `dirname $0`; pwd)
8
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
-
10
- export WORK_DIR=$work_dir
11
- export PYTHONPATH=$work_dir
12
- export PYTHONIOENCODING=UTF-8
13
-
14
- ######## Parse the Given Parameters from the Command ###########
15
- options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
- eval set -- "$options"
17
-
18
- while true; do
19
- case $1 in
20
- # Experimental Configuration File
21
- -c | --config) shift; exp_config=$1 ; shift ;;
22
- # Experimental Name
23
- -n | --name) shift; exp_name=$1 ; shift ;;
24
- # Running Stage
25
- -s | --stage) shift; running_stage=$1 ; shift ;;
26
- # Visible GPU machines. The default value is "0".
27
- --gpu) shift; gpu=$1 ; shift ;;
28
-
29
- # [Only for Training] Resume configuration
30
- --resume) shift; resume=$1 ; shift ;;
31
- # [Only for Training] The specific checkpoint path that you want to resume from.
32
- --checkpoint) shift; checkpoint=$1 ; shift ;;
33
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
- --resume_type) shift; resume_type=$1 ; shift ;;
35
-
36
- # [Only for Inference] The inference mode
37
- --infer_mode) shift; infer_mode=$1 ; shift ;;
38
- # [Only for Inference] The datasets to run inference on
39
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
- # [Only for Inference] The feature dir for inference
41
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
- # [Only for Inference] The audio dir for inference
43
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
47
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
-
49
- --) shift ; break ;;
50
- *) echo "Invalid option: $1"; exit 1 ;;
51
- esac
52
- done
53
-
54
-
55
- ### Value check ###
56
- if [ -z "$running_stage" ]; then
57
- echo "[Error] Please specify the running stage"
58
- exit 1
59
- fi
60
-
61
- if [ -z "$exp_config" ]; then
62
- exp_config="${exp_dir}"/exp_config.json
63
- fi
64
- echo "Experimental Configuration File: $exp_config"
65
-
66
- if [ -z "$gpu" ]; then
67
- gpu="0"
68
- fi
69
-
70
- ######## Features Extraction ###########
71
- if [ $running_stage -eq 1 ]; then
72
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
- --config $exp_config \
74
- --num_workers 8
75
- fi
76
-
77
- ######## Training ###########
78
- if [ $running_stage -eq 2 ]; then
79
- if [ -z "$exp_name" ]; then
80
- echo "[Error] Please specify the experiment name"
81
- exit 1
82
- fi
83
- echo "Experimental Name: $exp_name"
84
-
85
- if [ "$resume" = true ]; then
86
- echo "Automatically resume from the experimental dir..."
87
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
- --config "$exp_config" \
89
- --exp_name "$exp_name" \
90
- --log_level info \
91
- --resume
92
- else
93
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
- --config "$exp_config" \
95
- --exp_name "$exp_name" \
96
- --log_level info \
97
- --checkpoint "$checkpoint" \
98
- --resume_type "$resume_type"
99
- fi
100
- fi
101
-
102
- ######## Inference/Conversion ###########
103
- if [ $running_stage -eq 3 ]; then
104
- if [ -z "$infer_expt_dir" ]; then
105
- echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
- exit 1
107
- fi
108
-
109
- if [ -z "$infer_output_dir" ]; then
110
- infer_output_dir="$infer_expt_dir/result"
111
- fi
112
-
113
- if [ $infer_mode = "infer_from_dataset" ]; then
114
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
- --config $exp_config \
116
- --infer_mode $infer_mode \
117
- --infer_datasets $infer_datasets \
118
- --vocoder_dir $infer_expt_dir \
119
- --output_dir $infer_output_dir \
120
- --log_level debug
121
- fi
122
-
123
- if [ $infer_mode = "infer_from_feature" ]; then
124
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
- --config $exp_config \
126
- --infer_mode $infer_mode \
127
- --feature_folder $infer_feature_dir \
128
- --vocoder_dir $infer_expt_dir \
129
- --output_dir $infer_output_dir \
130
- --log_level debug
131
- fi
132
-
133
- if [ $infer_mode = "infer_from_audio" ]; then
134
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
- --config $exp_config \
136
- --infer_mode $infer_mode \
137
- --audio_folder $infer_audio_dir \
138
- --vocoder_dir $infer_expt_dir \
139
- --output_dir $infer_output_dir \
140
- --log_level debug
141
- fi
142
-
143
- fi
egs/vocoder/gan/bigvgan/exp_config.json DELETED
@@ -1,66 +0,0 @@
1
- {
2
- "base_config": "egs/vocoder/gan/exp_config_base.json",
3
- "preprocess": {
4
- // acoustic features
5
- "extract_mel": true,
6
- "extract_audio": true,
7
-
8
- // Features used for model training
9
- "use_mel": true,
10
- "use_audio": true
11
- },
12
- "model": {
13
- "generator": "bigvgan",
14
- "bigvgan": {
15
- "resblock": "1",
16
- "activation": "snakebeta",
17
- "snake_logscale": true,
18
- "upsample_rates": [
19
- 8,
20
- 8,
21
- 2,
22
- 2,
23
- ],
24
- "upsample_kernel_sizes": [
25
- 16,
26
- 16,
27
- 4,
28
- 4
29
- ],
30
- "upsample_initial_channel": 512,
31
- "resblock_kernel_sizes": [
32
- 3,
33
- 7,
34
- 11
35
- ],
36
- "resblock_dilation_sizes": [
37
- [
38
- 1,
39
- 3,
40
- 5
41
- ],
42
- [
43
- 1,
44
- 3,
45
- 5
46
- ],
47
- [
48
- 1,
49
- 3,
50
- 5
51
- ]
52
- ]
53
- }
54
- },
55
- "train": {
56
- "criterions": [
57
- "feature",
58
- "discriminator",
59
- "generator",
60
- "mel",
61
- ]
62
- },
63
- "inference": {
64
- "batch_size": 1,
65
- }
66
- }
egs/vocoder/gan/bigvgan/run.sh DELETED
@@ -1,143 +0,0 @@
1
- # Copyright (c) 2023 Amphion.
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- ######## Build Experiment Environment ###########
7
- exp_dir=$(cd `dirname $0`; pwd)
8
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
-
10
- export WORK_DIR=$work_dir
11
- export PYTHONPATH=$work_dir
12
- export PYTHONIOENCODING=UTF-8
13
-
14
- ######## Parse the Given Parameters from the Command ###########
15
- options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
- eval set -- "$options"
17
-
18
- while true; do
19
- case $1 in
20
- # Experimental Configuration File
21
- -c | --config) shift; exp_config=$1 ; shift ;;
22
- # Experimental Name
23
- -n | --name) shift; exp_name=$1 ; shift ;;
24
- # Running Stage
25
- -s | --stage) shift; running_stage=$1 ; shift ;;
26
- # Visible GPU machines. The default value is "0".
27
- --gpu) shift; gpu=$1 ; shift ;;
28
-
29
- # [Only for Training] Resume configuration
30
- --resume) shift; resume=$1 ; shift ;;
31
- # [Only for Training] The specific checkpoint path that you want to resume from.
32
- --checkpoint) shift; checkpoint=$1 ; shift ;;
33
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
- --resume_type) shift; resume_type=$1 ; shift ;;
35
-
36
- # [Only for Inference] The inference mode
37
- --infer_mode) shift; infer_mode=$1 ; shift ;;
38
- # [Only for Inference] The datasets to run inference on
39
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
- # [Only for Inference] The feature dir for inference
41
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
- # [Only for Inference] The audio dir for inference
43
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
47
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
-
49
- --) shift ; break ;;
50
- *) echo "Invalid option: $1"; exit 1 ;;
51
- esac
52
- done
53
-
54
-
55
- ### Value check ###
56
- if [ -z "$running_stage" ]; then
57
- echo "[Error] Please specify the running stage"
58
- exit 1
59
- fi
60
-
61
- if [ -z "$exp_config" ]; then
62
- exp_config="${exp_dir}"/exp_config.json
63
- fi
64
- echo "Experimental Configuration File: $exp_config"
65
-
66
- if [ -z "$gpu" ]; then
67
- gpu="0"
68
- fi
69
-
70
- ######## Features Extraction ###########
71
- if [ $running_stage -eq 1 ]; then
72
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
- --config $exp_config \
74
- --num_workers 8
75
- fi
76
-
77
- ######## Training ###########
78
- if [ $running_stage -eq 2 ]; then
79
- if [ -z "$exp_name" ]; then
80
- echo "[Error] Please specify the experiment name"
81
- exit 1
82
- fi
83
- echo "Experimental Name: $exp_name"
84
-
85
- if [ "$resume" = true ]; then
86
- echo "Automatically resume from the experimental dir..."
87
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
- --config "$exp_config" \
89
- --exp_name "$exp_name" \
90
- --log_level info \
91
- --resume
92
- else
93
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
- --config "$exp_config" \
95
- --exp_name "$exp_name" \
96
- --log_level info \
97
- --checkpoint "$checkpoint" \
98
- --resume_type "$resume_type"
99
- fi
100
- fi
101
-
102
- ######## Inference/Conversion ###########
103
- if [ $running_stage -eq 3 ]; then
104
- if [ -z "$infer_expt_dir" ]; then
105
- echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
- exit 1
107
- fi
108
-
109
- if [ -z "$infer_output_dir" ]; then
110
- infer_output_dir="$infer_expt_dir/result"
111
- fi
112
-
113
- if [ $infer_mode = "infer_from_dataset" ]; then
114
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
- --config $exp_config \
116
- --infer_mode $infer_mode \
117
- --infer_datasets $infer_datasets \
118
- --vocoder_dir $infer_expt_dir \
119
- --output_dir $infer_output_dir \
120
- --log_level debug
121
- fi
122
-
123
- if [ $infer_mode = "infer_from_feature" ]; then
124
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
- --config $exp_config \
126
- --infer_mode $infer_mode \
127
- --feature_folder $infer_feature_dir \
128
- --vocoder_dir $infer_expt_dir \
129
- --output_dir $infer_output_dir \
130
- --log_level debug
131
- fi
132
-
133
- if [ $infer_mode = "infer_from_audio" ]; then
134
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
- --config $exp_config \
136
- --infer_mode $infer_mode \
137
- --audio_folder $infer_audio_dir \
138
- --vocoder_dir $infer_expt_dir \
139
- --output_dir $infer_output_dir \
140
- --log_level debug
141
- fi
142
-
143
- fi
egs/vocoder/gan/bigvgan_large/exp_config.json DELETED
@@ -1,70 +0,0 @@
1
- {
2
- "base_config": "egs/vocoder/gan/exp_config_base.json",
3
- "preprocess": {
4
- // acoustic features
5
- "extract_mel": true,
6
- "extract_audio": true,
7
-
8
- // Features used for model training
9
- "use_mel": true,
10
- "use_audio": true
11
- },
12
- "model": {
13
- "generator": "bigvgan",
14
- "bigvgan": {
15
- "resblock": "1",
16
- "activation": "snakebeta",
17
- "snake_logscale": true,
18
- "upsample_rates": [
19
- 4,
20
- 4,
21
- 2,
22
- 2,
23
- 2,
24
- 2
25
- ],
26
- "upsample_kernel_sizes": [
27
- 8,
28
- 8,
29
- 4,
30
- 4,
31
- 4,
32
- 4
33
- ],
34
- "upsample_initial_channel": 1536,
35
- "resblock_kernel_sizes": [
36
- 3,
37
- 7,
38
- 11
39
- ],
40
- "resblock_dilation_sizes": [
41
- [
42
- 1,
43
- 3,
44
- 5
45
- ],
46
- [
47
- 1,
48
- 3,
49
- 5
50
- ],
51
- [
52
- 1,
53
- 3,
54
- 5
55
- ]
56
- ]
57
- },
58
- },
59
- "train": {
60
- "criterions": [
61
- "feature",
62
- "discriminator",
63
- "generator",
64
- "mel",
65
- ]
66
- },
67
- "inference": {
68
- "batch_size": 1,
69
- }
70
- }
egs/vocoder/gan/bigvgan_large/run.sh DELETED
@@ -1,143 +0,0 @@
1
- # Copyright (c) 2023 Amphion.
2
- #
3
- # This source code is licensed under the MIT license found in the
4
- # LICENSE file in the root directory of this source tree.
5
-
6
- ######## Build Experiment Environment ###########
7
- exp_dir=$(cd `dirname $0`; pwd)
8
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
9
-
10
- export WORK_DIR=$work_dir
11
- export PYTHONPATH=$work_dir
12
- export PYTHONIOENCODING=UTF-8
13
-
14
- ######## Parse the Given Parameters from the Command ###########
15
- options=$(getopt -o c:n:s --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
16
- eval set -- "$options"
17
-
18
- while true; do
19
- case $1 in
20
- # Experimental Configuration File
21
- -c | --config) shift; exp_config=$1 ; shift ;;
22
- # Experimental Name
23
- -n | --name) shift; exp_name=$1 ; shift ;;
24
- # Running Stage
25
- -s | --stage) shift; running_stage=$1 ; shift ;;
26
- # Visible GPU machines. The default value is "0".
27
- --gpu) shift; gpu=$1 ; shift ;;
28
-
29
- # [Only for Training] Resume configuration
30
- --resume) shift; resume=$1 ; shift ;;
31
- # [Only for Training] The specific checkpoint path that you want to resume from.
32
- --checkpoint) shift; checkpoint=$1 ; shift ;;
33
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
34
- --resume_type) shift; resume_type=$1 ; shift ;;
35
-
36
- # [Only for Inference] The inference mode
37
- --infer_mode) shift; infer_mode=$1 ; shift ;;
38
- # [Only for Inference] The datasets to run inference on
39
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
40
- # [Only for Inference] The feature dir for inference
41
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
42
- # [Only for Inference] The audio dir for inference
43
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
44
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
45
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
46
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$infer_expt_dir/result"
47
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
48
-
49
- --) shift ; break ;;
50
- *) echo "Invalid option: $1"; exit 1 ;;
51
- esac
52
- done
53
-
54
-
55
- ### Value check ###
56
- if [ -z "$running_stage" ]; then
57
- echo "[Error] Please specify the running stage"
58
- exit 1
59
- fi
60
-
61
- if [ -z "$exp_config" ]; then
62
- exp_config="${exp_dir}"/exp_config.json
63
- fi
64
- echo "Experimental Configuration File: $exp_config"
65
-
66
- if [ -z "$gpu" ]; then
67
- gpu="0"
68
- fi
69
-
70
- ######## Features Extraction ###########
71
- if [ $running_stage -eq 1 ]; then
72
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
73
- --config $exp_config \
74
- --num_workers 8
75
- fi
76
-
77
- ######## Training ###########
78
- if [ $running_stage -eq 2 ]; then
79
- if [ -z "$exp_name" ]; then
80
- echo "[Error] Please specify the experiment name"
81
- exit 1
82
- fi
83
- echo "Experimental Name: $exp_name"
84
-
85
- if [ "$resume" = true ]; then
86
- echo "Automatically resume from the experimental dir..."
87
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
88
- --config "$exp_config" \
89
- --exp_name "$exp_name" \
90
- --log_level info \
91
- --resume
92
- else
93
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
94
- --config "$exp_config" \
95
- --exp_name "$exp_name" \
96
- --log_level info \
97
- --checkpoint "$checkpoint" \
98
- --resume_type "$resume_type"
99
- fi
100
- fi
101
-
102
- ######## Inference/Conversion ###########
103
- if [ $running_stage -eq 3 ]; then
104
- if [ -z "$infer_expt_dir" ]; then
105
- echo "[Error] Please specify the experiment directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
106
- exit 1
107
- fi
108
-
109
- if [ -z "$infer_output_dir" ]; then
110
- infer_output_dir="$infer_expt_dir/result"
111
- fi
112
-
113
- if [ $infer_mode = "infer_from_dataset" ]; then
114
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
115
- --config $exp_config \
116
- --infer_mode $infer_mode \
117
- --infer_datasets $infer_datasets \
118
- --vocoder_dir $infer_expt_dir \
119
- --output_dir $infer_output_dir \
120
- --log_level debug
121
- fi
122
-
123
- if [ $infer_mode = "infer_from_feature" ]; then
124
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
125
- --config $exp_config \
126
- --infer_mode $infer_mode \
127
- --feature_folder $infer_feature_dir \
128
- --vocoder_dir $infer_expt_dir \
129
- --output_dir $infer_output_dir \
130
- --log_level debug
131
- fi
132
-
133
- if [ $infer_mode = "infer_from_audio" ]; then
134
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
135
- --config $exp_config \
136
- --infer_mode $infer_mode \
137
- --audio_folder $infer_audio_dir \
138
- --vocoder_dir $infer_expt_dir \
139
- --output_dir $infer_output_dir \
140
- --log_level debug
141
- fi
142
-
143
- fi
egs/vocoder/gan/exp_config_base.json DELETED
@@ -1,111 +0,0 @@
1
- {
2
- "base_config": "config/vocoder.json",
3
- "model_type": "GANVocoder",
4
- // TODO: Choose your needed datasets
5
- "dataset": [
6
- "csd",
7
- "kising",
8
- "m4singer",
9
- "nus48e",
10
- "opencpop",
11
- "opensinger",
12
- "opera",
13
- "pjs",
14
- "popbutfy",
15
- "popcs",
16
- "ljspeech",
17
- "vctk",
18
- "libritts",
19
- ],
20
- "dataset_path": {
21
- // TODO: Fill in your dataset path
22
- "csd": "[dataset path]",
23
- "kising": "[dataset path]",
24
- "m4singer": "[dataset path]",
25
- "nus48e": "[dataset path]",
26
- "opencpop": "[dataset path]",
27
- "opensinger": "[dataset path]",
28
- "opera": "[dataset path]",
29
- "pjs": "[dataset path]",
30
- "popbutfy": "[dataset path]",
31
- "popcs": "[dataset path]",
32
- "ljspeech": "[dataset path]",
33
- "vctk": "[dataset path]",
34
- "libritts": "[dataset path]",
35
- },
36
- // TODO: Fill in the output log path
37
- "log_dir": "ckpts/vocoder",
38
- "preprocess": {
39
- // Acoustic features
40
- "extract_mel": true,
41
- "extract_audio": true,
42
- "extract_pitch": false,
43
- "extract_uv": false,
44
- "pitch_extractor": "parselmouth",
45
-
46
- // Features used for model training
47
- "use_mel": true,
48
- "use_frame_pitch": false,
49
- "use_uv": false,
50
- "use_audio": true,
51
-
52
- // TODO: Fill in the output data path
53
- "processed_dir": "data/",
54
- "n_mel": 100,
55
- "sample_rate": 24000
56
- },
57
- "model": {
58
- // TODO: Choose your needed discriminators
59
- "discriminators": [
60
- "msd",
61
- "mpd",
62
- "msstftd",
63
- "mssbcqtd",
64
- ],
65
- "mpd": {
66
- "mpd_reshapes": [
67
- 2,
68
- 3,
69
- 5,
70
- 7,
71
- 11
72
- ],
73
- "use_spectral_norm": false,
74
- "discriminator_channel_mult_factor": 1
75
- },
76
- "mrd": {
77
- "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
78
- "use_spectral_norm": false,
79
- "discriminator_channel_mult_factor": 1,
80
- "mrd_override": false
81
- },
82
- "msstftd": {
83
- "filters": 32
84
- },
85
- "mssbcqtd": {
86
- hop_lengths: [512, 256, 256],
87
- filters: 32,
88
- max_filters: 1024,
89
- filters_scale: 1,
90
- dilations: [1, 2, 4],
91
- in_channels: 1,
92
- out_channels: 1,
93
- n_octaves: [9, 9, 9],
94
- bins_per_octaves: [24, 36, 48]
95
- },
96
- },
97
- "train": {
98
- // TODO: Choose a suitable batch size, training epoch, and save stride
99
- "batch_size": 32,
100
- "max_epoch": 1000000,
101
- "save_checkpoint_stride": [20],
102
- "adamw": {
103
- "lr": 2.0e-4,
104
- "adam_b1": 0.8,
105
- "adam_b2": 0.99
106
- },
107
- "exponential_lr": {
108
- "lr_decay": 0.999
109
- },
110
- }
111
- }
egs/vocoder/gan/hifigan/exp_config.json DELETED
@@ -1,59 +0,0 @@
1
- {
2
- "base_config": "egs/vocoder/gan/exp_config_base.json",
3
- "preprocess": {
4
- // acoustic features
5
- "extract_mel": true,
6
- "extract_audio": true,
7
-
8
- // Features used for model training
9
- "use_mel": true,
10
- "use_audio": true
11
- },
12
- "model": {
13
- "generator": "hifigan",
14
- "hifigan": {
15
- "resblock": "2",
16
- "upsample_rates": [
17
- 8,
18
- 8,
19
- 4
20
- ],
21
- "upsample_kernel_sizes": [
22
- 16,
23
- 16,
24
- 8
25
- ],
26
- "upsample_initial_channel": 256,
27
- "resblock_kernel_sizes": [
28
- 3,
29
- 5,
30
- 7
31
- ],
32
- "resblock_dilation_sizes": [
33
- [
34
- 1,
35
- 2
36
- ],
37
- [
38
- 2,
39
- 6
40
- ],
41
- [
42
- 3,
43
- 12
44
- ]
45
- ]
46
- }
47
- },
48
- "train": {
49
- "criterions": [
50
- "feature",
51
- "discriminator",
52
- "generator",
53
- "mel",
54
- ]
55
- },
56
- "inference": {
57
- "batch_size": 1,
58
- }
59
- }
egs/vocoder/gan/hifigan/run.sh DELETED
@@ -1,143 +0,0 @@
- # Copyright (c) 2023 Amphion.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
- ######## Build Experiment Environment ###########
- exp_dir=$(cd `dirname $0`; pwd)
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
-
- export WORK_DIR=$work_dir
- export PYTHONPATH=$work_dir
- export PYTHONIOENCODING=UTF-8
-
- ######## Parse the Given Parameters from the Command ###########
- options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
- eval set -- "$options"
-
- while true; do
- case $1 in
- # Experimental Configuration File
- -c | --config) shift; exp_config=$1 ; shift ;;
- # Experimental Name
- -n | --name) shift; exp_name=$1 ; shift ;;
- # Running Stage
- -s | --stage) shift; running_stage=$1 ; shift ;;
- # Visible GPU machines. The default value is "0".
- --gpu) shift; gpu=$1 ; shift ;;
-
- # [Only for Training] Resume configuration
- --resume) shift; resume=$1 ; shift ;;
- # [Only for Training] The specific checkpoint path that you want to resume from.
- --checkpoint) shift; checkpoint=$1 ; shift ;;
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
- --resume_type) shift; resume_type=$1 ; shift ;;
-
- # [Only for Inference] The inference mode
- --infer_mode) shift; infer_mode=$1 ; shift ;;
- # [Only for Inference] The datasets to run inference on
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
- # [Only for Inference] The feature dir for inference
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
- # [Only for Inference] The audio dir for inference
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
-
- --) shift ; break ;;
- *) echo "Invalid option: $1"; exit 1 ;;
- esac
- done
-
-
- ### Value check ###
- if [ -z "$running_stage" ]; then
- echo "[Error] Please specify the running stage"
- exit 1
- fi
-
- if [ -z "$exp_config" ]; then
- exp_config="${exp_dir}"/exp_config.json
- fi
- echo "Experimental Configuration File: $exp_config"
-
- if [ -z "$gpu" ]; then
- gpu="0"
- fi
-
- ######## Features Extraction ###########
- if [ $running_stage -eq 1 ]; then
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
- --config $exp_config \
- --num_workers 8
- fi
-
- ######## Training ###########
- if [ $running_stage -eq 2 ]; then
- if [ -z "$exp_name" ]; then
- echo "[Error] Please specify the experiment name"
- exit 1
- fi
- echo "Experimental Name: $exp_name"
-
- if [ "$resume" = true ]; then
- echo "Automatically resume from the experimental dir..."
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --resume
- else
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --checkpoint "$checkpoint" \
- --resume_type "$resume_type"
- fi
- fi
-
- ######## Inference/Conversion ###########
- if [ $running_stage -eq 3 ]; then
- if [ -z "$infer_expt_dir" ]; then
- echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
- exit 1
- fi
-
- if [ -z "$infer_output_dir" ]; then
- infer_output_dir="$infer_expt_dir/result"
- fi
-
- if [ "$infer_mode" = "infer_from_dataset" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --infer_datasets $infer_datasets \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_feature" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --feature_folder $infer_feature_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_audio" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --audio_folder $infer_audio_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- fi
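
Assumed invocation of this script, mirroring the documented `tfr_enhanced_hifigan` recipe below (the experiment name and paths are placeholders); run from the Amphion root:

```bash
sh egs/vocoder/gan/hifigan/run.sh --stage 1                    # feature extraction
sh egs/vocoder/gan/hifigan/run.sh --stage 2 --name MyHiFiGAN   # training
sh egs/vocoder/gan/hifigan/run.sh --stage 3 \
    --infer_mode infer_from_audio \
    --infer_audio_dir /path/to/audios \
    --infer_expt_dir ckpts/vocoder/MyHiFiGAN \
    --infer_output_dir ckpts/vocoder/MyHiFiGAN/result
```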
 
egs/vocoder/gan/melgan/exp_config.json DELETED
@@ -1,34 +0,0 @@
- {
- "base_config": "egs/vocoder/gan/exp_config_base.json",
- "preprocess": {
- // acoustic features
- "extract_mel": true,
- "extract_audio": true,
-
- // Features used for model training
- "use_mel": true,
- "use_audio": true
- },
- "model": {
- "generator": "melgan",
- "melgan": {
- "ratios": [8, 8, 2, 2],
- "ngf": 32,
- "n_residual_layers": 3,
- "num_D": 3,
- "ndf": 16,
- "n_layers": 4,
- "downsampling_factor": 4
- },
- },
- "train": {
- "criterions": [
- "feature",
- "discriminator",
- "generator",
- ]
- },
- "inference": {
- "batch_size": 1,
- }
- }
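
Unlike the HiFi-GAN recipe above, this MelGAN config trains without a mel-reconstruction criterion; its `"feature"` criterion is the feature-matching loss, an L1 distance between the discriminator's intermediate activations on real and generated audio. A minimal sketch of that loss (illustrative, not Amphion's implementation):

```python
import torch

def feature_matching_loss(real_feats, fake_feats):
    # real_feats / fake_feats: lists of per-layer discriminator activations
    # for real and generated audio, respectively.
    loss = 0.0
    for real, fake in zip(real_feats, fake_feats):
        loss = loss + torch.mean(torch.abs(real.detach() - fake))
    return loss
```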
 
egs/vocoder/gan/melgan/run.sh DELETED
@@ -1,143 +0,0 @@
- # Copyright (c) 2023 Amphion.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
- ######## Build Experiment Environment ###########
- exp_dir=$(cd `dirname $0`; pwd)
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
-
- export WORK_DIR=$work_dir
- export PYTHONPATH=$work_dir
- export PYTHONIOENCODING=UTF-8
-
- ######## Parse the Given Parameters from the Command ###########
- options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
- eval set -- "$options"
-
- while true; do
- case $1 in
- # Experimental Configuration File
- -c | --config) shift; exp_config=$1 ; shift ;;
- # Experimental Name
- -n | --name) shift; exp_name=$1 ; shift ;;
- # Running Stage
- -s | --stage) shift; running_stage=$1 ; shift ;;
- # Visible GPU machines. The default value is "0".
- --gpu) shift; gpu=$1 ; shift ;;
-
- # [Only for Training] Resume configuration
- --resume) shift; resume=$1 ; shift ;;
- # [Only for Training] The specific checkpoint path that you want to resume from.
- --checkpoint) shift; checkpoint=$1 ; shift ;;
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
- --resume_type) shift; resume_type=$1 ; shift ;;
-
- # [Only for Inference] The inference mode
- --infer_mode) shift; infer_mode=$1 ; shift ;;
- # [Only for Inference] The datasets to run inference on
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
- # [Only for Inference] The feature dir for inference
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
- # [Only for Inference] The audio dir for inference
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
-
- --) shift ; break ;;
- *) echo "Invalid option: $1"; exit 1 ;;
- esac
- done
-
-
- ### Value check ###
- if [ -z "$running_stage" ]; then
- echo "[Error] Please specify the running stage"
- exit 1
- fi
-
- if [ -z "$exp_config" ]; then
- exp_config="${exp_dir}"/exp_config.json
- fi
- echo "Experimental Configuration File: $exp_config"
-
- if [ -z "$gpu" ]; then
- gpu="0"
- fi
-
- ######## Features Extraction ###########
- if [ $running_stage -eq 1 ]; then
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
- --config $exp_config \
- --num_workers 8
- fi
-
- ######## Training ###########
- if [ $running_stage -eq 2 ]; then
- if [ -z "$exp_name" ]; then
- echo "[Error] Please specify the experiment name"
- exit 1
- fi
- echo "Experimental Name: $exp_name"
-
- if [ "$resume" = true ]; then
- echo "Automatically resume from the experimental dir..."
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --resume
- else
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --checkpoint "$checkpoint" \
- --resume_type "$resume_type"
- fi
- fi
-
- ######## Inference/Conversion ###########
- if [ $running_stage -eq 3 ]; then
- if [ -z "$infer_expt_dir" ]; then
- echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
- exit 1
- fi
-
- if [ -z "$infer_output_dir" ]; then
- infer_output_dir="$infer_expt_dir/result"
- fi
-
- if [ "$infer_mode" = "infer_from_dataset" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --infer_datasets $infer_datasets \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_feature" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --feature_folder $infer_feature_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_audio" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --audio_folder $infer_audio_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- fi
 
egs/vocoder/gan/nsfhifigan/exp_config.json DELETED
@@ -1,83 +0,0 @@
- {
- "base_config": "egs/vocoder/gan/exp_config_base.json",
- "preprocess": {
- // acoustic features
- "extract_mel": true,
- "extract_audio": true,
- "extract_pitch": true,
-
- // Features used for model training
- "use_mel": true,
- "use_audio": true,
- "use_frame_pitch": true
- },
- "model": {
- "generator": "nsfhifigan",
- "nsfhifigan": {
- "resblock": "1",
- "harmonic_num": 8,
- "upsample_rates": [
- 8,
- 4,
- 2,
- 2,
- 2
- ],
- "upsample_kernel_sizes": [
- 16,
- 8,
- 4,
- 4,
- 4
- ],
- "upsample_initial_channel": 768,
- "resblock_kernel_sizes": [
- 3,
- 7,
- 11
- ],
- "resblock_dilation_sizes": [
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ]
- ]
- },
- "mpd": {
- "mpd_reshapes": [
- 2,
- 3,
- 5,
- 7,
- 11,
- 17,
- 23,
- 37
- ],
- "use_spectral_norm": false,
- "discriminator_channel_multi": 1
- }
- },
- "train": {
- "criterions": [
- "feature",
- "discriminator",
- "generator",
- "mel",
- ]
- },
- "inference": {
- "batch_size": 1,
- }
- }
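
The NSF-HiFiGAN generator above is pitch-driven (hence `extract_pitch`/`use_frame_pitch`): `harmonic_num` controls how many sine harmonics, on top of the fundamental, form the source excitation. A rough sketch of that idea, assuming a per-sample F0 contour (illustrative, not Amphion's code):

```python
import torch

def harmonic_source(f0, sample_rate=24000, harmonic_num=8):
    # f0: (T,) per-sample fundamental frequency in Hz, 0 where unvoiced.
    phase = torch.cumsum(2 * torch.pi * f0 / sample_rate, dim=0)
    k = torch.arange(1, harmonic_num + 2, dtype=f0.dtype)  # fundamental + harmonics
    excitation = torch.sin(phase.unsqueeze(1) * k).sum(dim=1)
    return excitation * (f0 > 0).to(f0.dtype)               # zero out unvoiced regions
```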
 
egs/vocoder/gan/nsfhifigan/run.sh DELETED
@@ -1,143 +0,0 @@
- # Copyright (c) 2023 Amphion.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
- ######## Build Experiment Environment ###########
- exp_dir=$(cd `dirname $0`; pwd)
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
-
- export WORK_DIR=$work_dir
- export PYTHONPATH=$work_dir
- export PYTHONIOENCODING=UTF-8
-
- ######## Parse the Given Parameters from the Command ###########
- options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
- eval set -- "$options"
-
- while true; do
- case $1 in
- # Experimental Configuration File
- -c | --config) shift; exp_config=$1 ; shift ;;
- # Experimental Name
- -n | --name) shift; exp_name=$1 ; shift ;;
- # Running Stage
- -s | --stage) shift; running_stage=$1 ; shift ;;
- # Visible GPU machines. The default value is "0".
- --gpu) shift; gpu=$1 ; shift ;;
-
- # [Only for Training] Resume configuration
- --resume) shift; resume=$1 ; shift ;;
- # [Only for Training] The specific checkpoint path that you want to resume from.
- --checkpoint) shift; checkpoint=$1 ; shift ;;
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
- --resume_type) shift; resume_type=$1 ; shift ;;
-
- # [Only for Inference] The inference mode
- --infer_mode) shift; infer_mode=$1 ; shift ;;
- # [Only for Inference] The datasets to run inference on
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
- # [Only for Inference] The feature dir for inference
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
- # [Only for Inference] The audio dir for inference
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
-
- --) shift ; break ;;
- *) echo "Invalid option: $1"; exit 1 ;;
- esac
- done
-
-
- ### Value check ###
- if [ -z "$running_stage" ]; then
- echo "[Error] Please specify the running stage"
- exit 1
- fi
-
- if [ -z "$exp_config" ]; then
- exp_config="${exp_dir}"/exp_config.json
- fi
- echo "Experimental Configuration File: $exp_config"
-
- if [ -z "$gpu" ]; then
- gpu="0"
- fi
-
- ######## Features Extraction ###########
- if [ $running_stage -eq 1 ]; then
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
- --config $exp_config \
- --num_workers 8
- fi
-
- ######## Training ###########
- if [ $running_stage -eq 2 ]; then
- if [ -z "$exp_name" ]; then
- echo "[Error] Please specify the experiment name"
- exit 1
- fi
- echo "Experimental Name: $exp_name"
-
- if [ "$resume" = true ]; then
- echo "Automatically resume from the experimental dir..."
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --resume
- else
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --checkpoint "$checkpoint" \
- --resume_type "$resume_type"
- fi
- fi
-
- ######## Inference/Conversion ###########
- if [ $running_stage -eq 3 ]; then
- if [ -z "$infer_expt_dir" ]; then
- echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
- exit 1
- fi
-
- if [ -z "$infer_output_dir" ]; then
- infer_output_dir="$infer_expt_dir/result"
- fi
-
- if [ "$infer_mode" = "infer_from_dataset" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --infer_datasets $infer_datasets \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_feature" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --feature_folder $infer_feature_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_audio" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --audio_folder $infer_audio_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- fi
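
Assumed resume and finetune invocations, based on the `--resume`, `--checkpoint`, and `--resume_type` options parsed above (the experiment name and checkpoint path are placeholders):

```bash
# Resume training with model weights, optimizer, scheduler, and random states:
sh egs/vocoder/gan/nsfhifigan/run.sh --stage 2 --name MyNSFHiFiGAN --resume true
# Finetune from a specific checkpoint, loading model weights only:
sh egs/vocoder/gan/nsfhifigan/run.sh --stage 2 --name MyNSFHiFiGAN \
    --checkpoint ckpts/vocoder/MyNSFHiFiGAN/checkpoint/[YourCheckpoint] \
    --resume_type finetune
```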
 
egs/vocoder/gan/tfr_enhanced_hifigan/README.md DELETED
@@ -1,185 +0,0 @@
- # Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder
-
- [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2311.14957)
- [![demo](https://img.shields.io/badge/Vocoder-Demo-red)](https://vocodexelysium.github.io/MS-SB-CQTD/)
-
- <br>
- <div align="center">
- <img src="../../../../imgs/vocoder/gan/MSSBCQTD.png" width="80%">
- </div>
- <br>
-
- This is the official implementation of the paper "[Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder](https://arxiv.org/abs/2311.14957)". In this recipe, we will illustrate how to train a high-quality HiFi-GAN on LibriTTS, VCTK, and LJSpeech by utilizing multiple time-frequency-representation-based discriminators.
-
- There are four stages in total:
-
- 1. Data preparation
- 2. Feature extraction
- 3. Training
- 4. Inference
-
- > **NOTE:** You need to run every command of this recipe in the `Amphion` root path:
- > ```bash
- > cd Amphion
- > ```
-
- ## 1. Data Preparation
-
- ### Dataset Download
-
- By default, we utilize three datasets for training: LibriTTS, VCTK, and LJSpeech. How to download them is detailed [here](../../../datasets/README.md).
-
- ### Configuration
-
- Specify the dataset paths in `exp_config.json`. Note that you can change the `dataset` list to use your preferred datasets.
-
- ```json
- "dataset": [
- "ljspeech",
- "vctk",
- "libritts",
- ],
- "dataset_path": {
- // TODO: Fill in your dataset path
- "ljspeech": "[LJSpeech dataset path]",
- "vctk": "[VCTK dataset path]",
- "libritts": "[LibriTTS dataset path]",
- },
- ```
-
- ## 2. Features Extraction
-
- For HiFi-GAN, only the mel-spectrogram and the output audio are needed for training.
-
- ### Configuration
-
- Specify the dataset path and the output path for saving the processed data and the training model in `exp_config.json`:
-
- ```json
- // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
- "log_dir": "ckpts/vocoder",
- "preprocess": {
- // TODO: Fill in the output data path. The default value is "Amphion/data"
- "processed_dir": "data",
- ...
- },
- ```
-
- ### Run
-
- Run the `run.sh` as the preprocess stage (set `--stage 1`).
-
- ```bash
- sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 1
- ```
-
- > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "1"`.
-
- ## 3. Training
-
- ### Configuration
-
- We provide the default hyperparameters in `exp_config.json`. They work on a single 24 GB NVIDIA GPU. You can adjust them based on your GPU machines.
-
- ```json
- "train": {
- "batch_size": 32,
- ...
- }
- ```
-
- ### Run
-
- Run the `run.sh` as the training stage (set `--stage 2`). Specify an experiment name to run the following command. The tensorboard logs and checkpoints will be saved in `Amphion/ckpts/vocoder/[YourExptName]`.
-
- ```bash
- sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 2 --name [YourExptName]
- ```
-
- > **NOTE:** `CUDA_VISIBLE_DEVICES` is set to `"0"` by default. You can change it when running `run.sh` by specifying, e.g., `--gpu "0,1,2,3"`.
-
- ## 4. Inference
-
- ### Pretrained Vocoder Download
-
- We trained a HiFi-GAN checkpoint with around 685 hours of speech data. The final pretrained checkpoint is released [here](../../../../pretrained/hifigan/README.md).
-
- ### Run
-
- Run the `run.sh` as the inference stage (set `--stage 3`). We provide three different inference modes: `infer_from_dataset`, `infer_from_feature`, and `infer_from_audio`.
-
- ```bash
- sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
- --infer_mode [Your chosen inference mode] \
- --infer_datasets [Datasets you want to infer from, needed when infer_from_dataset] \
- --infer_feature_dir [Your path to your predicted acoustic features, needed when infer_from_feature] \
- --infer_audio_dir [Your path to your audio files, needed when infer_from_audio] \
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
- ```
-
- #### a. Inference from Dataset
-
- Run the `run.sh` with the specified datasets; here is an example.
-
- ```bash
- sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
- --infer_mode infer_from_dataset \
- --infer_datasets "libritts vctk ljspeech" \
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
- ```
-
- #### b. Inference from Features
-
- If you want to run inference from your generated acoustic features, you should first organize them into the following structure:
-
- ```plaintext
- ┣ {infer_feature_dir}
- ┃ ┣ mels
- ┃ ┃ ┣ sample1.npy
- ┃ ┃ ┣ sample2.npy
- ```
-
- Then run the `run.sh` with the specified folder; here is an example.
-
- ```bash
- sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
- --infer_mode infer_from_feature \
- --infer_feature_dir [Your path to your predicted acoustic features] \
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
- ```
-
- #### c. Inference from Audios
-
- If you want to run inference from audio files for quick analysis-synthesis, you should first organize them into the following structure:
-
- ```plaintext
- ┣ audios
- ┃ ┣ sample1.wav
- ┃ ┣ sample2.wav
- ```
-
- Then run the `run.sh` with the specified folder; here is an example.
-
- ```bash
- sh egs/vocoder/gan/tfr_enhanced_hifigan/run.sh --stage 3 \
- --infer_mode infer_from_audio \
- --infer_audio_dir [Your path to your audio files] \
- --infer_expt_dir Amphion/ckpts/vocoder/[YourExptName] \
- --infer_output_dir Amphion/ckpts/vocoder/[YourExptName]/result \
- ```
-
- ## Citations
-
- ```bibtex
- @misc{gu2023cqt,
- title={Multi-Scale Sub-Band Constant-Q Transform Discriminator for High-Fidelity Vocoder},
- author={Yicheng Gu and Xueyao Zhang and Liumeng Xue and Zhizheng Wu},
- year={2023},
- eprint={2311.14957},
- archivePrefix={arXiv},
- primaryClass={cs.SD}
- }
- ```
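
For the `infer_from_feature` mode documented in this README, the predicted mels must be saved as per-utterance `.npy` files under `{infer_feature_dir}/mels/`. A minimal preparation sketch, assuming the features match the training setup (100 mel bins at 24 kHz here); `save_mel` is an illustrative helper, not part of Amphion:

```python
import os
import numpy as np

def save_mel(mel: np.ndarray, feature_dir: str, name: str) -> None:
    # mel: e.g. an (n_mel, n_frames) array predicted by your acoustic model.
    mel_dir = os.path.join(feature_dir, "mels")
    os.makedirs(mel_dir, exist_ok=True)
    np.save(os.path.join(mel_dir, f"{name}.npy"), mel)

save_mel(np.zeros((100, 200), dtype=np.float32), "predicted_features", "sample1")
```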
 
egs/vocoder/gan/tfr_enhanced_hifigan/exp_config.json DELETED
@@ -1,118 +0,0 @@
- {
- "base_config": "egs/vocoder/gan/exp_config_base.json",
- "model_type": "GANVocoder",
- "dataset": [
- "ljspeech",
- "vctk",
- "libritts",
- ],
- "dataset_path": {
- // TODO: Fill in your dataset path
- "ljspeech": "[dataset path]",
- "vctk": "[dataset path]",
- "libritts": "[dataset path]",
- },
- // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
- "log_dir": "ckpts/vocoder",
- "preprocess": {
- // TODO: Fill in the output data path. The default value is "Amphion/data"
- "processed_dir": "data",
- // acoustic features
- "extract_mel": true,
- "extract_audio": true,
- "extract_pitch": false,
- "extract_uv": false,
- "extract_amplitude_phase": false,
- "pitch_extractor": "parselmouth",
- // Features used for model training
- "use_mel": true,
- "use_frame_pitch": false,
- "use_uv": false,
- "use_audio": true,
- "n_mel": 100,
- "sample_rate": 24000
- },
- "model": {
- "generator": "hifigan",
- "discriminators": [
- "msd",
- "mpd",
- "mssbcqtd",
- "msstftd",
- ],
- "hifigan": {
- "resblock": "1",
- "upsample_rates": [
- 8,
- 4,
- 2,
- 2,
- 2
- ],
- "upsample_kernel_sizes": [
- 16,
- 8,
- 4,
- 4,
- 4
- ],
- "upsample_initial_channel": 768,
- "resblock_kernel_sizes": [
- 3,
- 5,
- 7
- ],
- "resblock_dilation_sizes": [
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ],
- [
- 1,
- 3,
- 5
- ]
- ]
- },
- "mpd": {
- "mpd_reshapes": [
- 2,
- 3,
- 5,
- 7,
- 11,
- 17,
- 23,
- 37
- ],
- "use_spectral_norm": false,
- "discriminator_channel_multi": 1
- }
- },
- "train": {
- "batch_size": 16,
- "adamw": {
- "lr": 2.0e-4,
- "adam_b1": 0.8,
- "adam_b2": 0.99
- },
- "exponential_lr": {
- "lr_decay": 0.999
- },
- "criterions": [
- "feature",
- "discriminator",
- "generator",
- "mel",
- ]
- },
- "inference": {
- "batch_size": 1,
- }
- }
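
The `mssbcqtd` discriminator enabled here is configured in `exp_config_base.json` (the first file in this diff) as three CQT scales; each scale's total bin count is `n_octaves * bins_per_octave`, so the three branches see progressively finer frequency resolution. A quick check on those values:

```python
hop_lengths = [512, 256, 256]
n_octaves = [9, 9, 9]
bins_per_octaves = [24, 36, 48]

for hop, octaves, bpo in zip(hop_lengths, n_octaves, bins_per_octaves):
    print(f"hop={hop:4d}  n_bins={octaves * bpo}")  # 216, 324, 432 bins
```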
 
egs/vocoder/gan/tfr_enhanced_hifigan/run.sh DELETED
@@ -1,145 +0,0 @@
- # Copyright (c) 2023 Amphion.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
- ######## Build Experiment Environment ###########
- exp_dir=$(cd `dirname $0`; pwd)
- work_dir=$(dirname $(dirname $(dirname $(dirname $exp_dir))))
-
- export WORK_DIR=$work_dir
- export PYTHONPATH=$work_dir
- export PYTHONIOENCODING=UTF-8
-
- ######## Parse the Given Parameters from the Command ###########
- options=$(getopt -o c:n:s: --long gpu:,config:,name:,stage:,resume:,checkpoint:,resume_type:,infer_mode:,infer_datasets:,infer_feature_dir:,infer_audio_dir:,infer_expt_dir:,infer_output_dir: -- "$@")
- eval set -- "$options"
-
- while true; do
- case $1 in
- # Experimental Configuration File
- -c | --config) shift; exp_config=$1 ; shift ;;
- # Experimental Name
- -n | --name) shift; exp_name=$1 ; shift ;;
- # Running Stage
- -s | --stage) shift; running_stage=$1 ; shift ;;
- # Visible GPU machines. The default value is "0".
- --gpu) shift; gpu=$1 ; shift ;;
-
- # [Only for Training] Resume configuration
- --resume) shift; resume=$1 ; shift ;;
- # [Only for Training] The specific checkpoint path that you want to resume from.
- --checkpoint) shift; checkpoint=$1 ; shift ;;
- # [Only for Training] `resume` for loading all the things (including model weights, optimizer, scheduler, and random states). `finetune` for loading only the model weights.
- --resume_type) shift; resume_type=$1 ; shift ;;
-
- # [Only for Inference] The inference mode
- --infer_mode) shift; infer_mode=$1 ; shift ;;
- # [Only for Inference] The datasets to run inference on
- --infer_datasets) shift; infer_datasets=$1 ; shift ;;
- # [Only for Inference] The feature dir for inference
- --infer_feature_dir) shift; infer_feature_dir=$1 ; shift ;;
- # [Only for Inference] The audio dir for inference
- --infer_audio_dir) shift; infer_audio_dir=$1 ; shift ;;
- # [Only for Inference] The experiment dir. The value is like "[Your path to save logs and checkpoints]/[YourExptName]"
- --infer_expt_dir) shift; infer_expt_dir=$1 ; shift ;;
- # [Only for Inference] The output dir to save inferred audios. Its default value is "$expt_dir/result"
- --infer_output_dir) shift; infer_output_dir=$1 ; shift ;;
-
- --) shift ; break ;;
- *) echo "Invalid option: $1"; exit 1 ;;
- esac
- done
-
-
- ### Value check ###
- if [ -z "$running_stage" ]; then
- echo "[Error] Please specify the running stage"
- exit 1
- fi
-
- if [ -z "$exp_config" ]; then
- exp_config="${exp_dir}"/exp_config.json
- fi
- echo "Experimental Configuration File: $exp_config"
-
- if [ -z "$gpu" ]; then
- gpu="0"
- fi
-
- ######## Features Extraction ###########
- if [ $running_stage -eq 1 ]; then
- CUDA_VISIBLE_DEVICES=$gpu python "${work_dir}"/bins/vocoder/preprocess.py \
- --config $exp_config \
- --num_workers 8
- fi
-
- ######## Training ###########
- if [ $running_stage -eq 2 ]; then
- if [ -z "$exp_name" ]; then
- echo "[Error] Please specify the experiment name"
- exit 1
- fi
- echo "Experimental Name: $exp_name"
-
- if [ "$resume" = true ]; then
- echo "Automatically resume from the experimental dir..."
- CUDA_VISIBLE_DEVICES="$gpu" accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --resume
- else
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "${work_dir}"/bins/vocoder/train.py \
- --config "$exp_config" \
- --exp_name "$exp_name" \
- --log_level info \
- --checkpoint "$checkpoint" \
- --resume_type "$resume_type"
- fi
- fi
-
- ######## Inference/Conversion ###########
- if [ $running_stage -eq 3 ]; then
- if [ -z "$infer_expt_dir" ]; then
- echo "[Error] Please specify the experimental directory. The value is like [Your path to save logs and checkpoints]/[YourExptName]"
- exit 1
- fi
-
- if [ -z "$infer_output_dir" ]; then
- infer_output_dir="$infer_expt_dir/result"
- fi
-
- echo $infer_datasets
-
- if [ "$infer_mode" = "infer_from_dataset" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --infer_datasets $infer_datasets \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_feature" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --feature_folder $infer_feature_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- if [ "$infer_mode" = "infer_from_audio" ]; then
- CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/vocoder/inference.py \
- --config $exp_config \
- --infer_mode $infer_mode \
- --audio_folder $infer_audio_dir \
- --vocoder_dir $infer_expt_dir \
- --output_dir $infer_output_dir \
- --log_level debug
- fi
-
- fi
 
examples/chinese_female_recordings.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f710270fe3857211c55aaa1f813e310e68855ff9eabaf5b249537a2d4277cc30
- size 448928
 
examples/chinese_male_seperated.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:009077a677b23bff3154078930e6c624d218eb0acbe78990bec88f6bf5a6e5de
- size 480044
 
examples/english_female_seperated.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:87e75863ffb4e597467a825d019217e73d64dce1e9635de60a32559ffcb97cf4
- size 1509584
 
examples/english_male_recordings.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:e14ebf1c554ebb25e5169b4bcda36a685538e94c531f303339bad91ff93a2288
- size 251948
 
examples/output/.DS_Store DELETED
Binary file (6.15 kB)