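# m4singer config — revision be234b7 ("init"), 6.06 kB.
# (Header copied from a web file viewer; its "raw / history / blame" entries
# were navigation links. Kept as a comment so the document parses as YAML.)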
# Keys here are in ASCII order (uppercase before lowercase), which is why
# `K_step` sorts ahead of the lowercase keys.
K_step: 1000
accumulate_grad_batches: 1
audio_num_mel_bins: 80
audio_sample_rate: 24000
# Parent config file(s). Presumably loaded first and then overridden by the
# keys in this file — verify against the config loader.
base_config:
- usr/configs/m4singer/base.yaml
# Nested options grouped under `binarization_args` (presumably read by the
# class named in `binarizer_cls` — confirm against the binarizer code).
# The child keys must be indented: at column 0 the parent parses as null and
# the children become unrelated top-level keys.
binarization_args:
  shuffle: false
  with_align: true
  with_f0: true
  with_f0cwt: true
  with_spk_embed: true
  with_txt: true
  with_wav: false
# Dotted path of the binarizer class and the directory holding binarized data
# (presumably resolved/imported by the data-preparation entry point — verify).
binarizer_cls: data_gen.singing.binarize.M4SingerBinarizer
binary_data_dir: data/binary/m4singer
check_val_every_n_epoch: 10
clip_grad_norm: 1
# Empty list written explicitly as `[]` (a bare `key:` would parse as null).
content_cond_steps: []
# cwt_* keys: by name these configure a CWT-based pitch component — TODO
# confirm against the model code; nothing in this file defines their meaning.
cwt_add_f0_loss: false
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_std_scale: 0.8
# Single-element list; matches the dataset name used in the data paths.
datasets:
- m4singer
debug: false
dec_ffn_kernel_size: 9
dec_layers: 4
decay_steps: 100000
decoder_type: fft
# Explicit empty string (not null).
dict_dir: ''
diff_decoder_type: wavenet
diff_loss_type: l1
dilation_cycle_length: 4
dropout: 0.1
ds_workers: 4
# Each entry is a comma-separated triple written as a plain scalar, so it is
# loaded as a string (e.g. "0,2,3"), not as a list of ints. The key name
# suggests the order hidden,stride,kernel — TODO confirm with the consumer.
dur_enc_hidden_stride_kernel:
- 0,2,3
- 0,2,3
- 0,1,3
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 5
enc_ffn_kernel_size: 9
enc_layers: 4
encoder_K: 8
encoder_type: fft
endless_ds: true
ffn_act: gelu
ffn_padding: SAME
# fft_size equals win_size (512) elsewhere in this file.
fft_size: 512
# fmax is exactly half of audio_sample_rate (24000 / 2).
fmax: 12000
fmin: 30
# Path under checkpoints/ (presumably a pretrained checkpoint directory
# to load from — verify how the task uses it).
fs2_ckpt: checkpoints/m4singer_fs2_e2e
gaussian_start: true
gen_dir_name: ''
gen_tgt_spk_id: -1
hidden_size: 256
# hop_size is one quarter of fft_size / win_size (512 / 4).
hop_size: 128
infer: false
# Same value as audio_num_mel_bins (80).
keep_bins: 80
# lambda_* keys: by name these look like loss-term weights; several are 0.0
# here. TODO confirm against the task's loss code.
lambda_commit: 0.25
lambda_energy: 0.0
lambda_f0: 0.0
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_uv: 0.0
lambda_word_dur: 1.0
load_ckpt: ''
log_interval: 100
loud_norm: false
lr: 0.001
max_beta: 0.02
max_epochs: 1000
max_eval_sentences: 1
max_eval_tokens: 60000
max_frames: 5000
max_input_tokens: 1550
max_sentences: 28
max_tokens: 36000
max_updates: 900000
# Stored as one plain string. It appears to encode `name:weight` pairs
# separated by `|` — the consumer must parse it; confirm the format there.
mel_loss: ssim:0.5|l1:0.5
mel_vmax: 1.5
# Same value as every entry of spec_min (-6.0).
mel_vmin: -6.0
min_level_db: -120
norm_type: gn
num_ckpt_keep: 3
num_heads: 2
num_sanity_val_steps: 1
num_spk: 20
num_test_samples: 0
num_valid_plots: 10
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98
out_wav_norm: false
# Path under checkpoints/, paired with the `pe_enable` switch below
# (what "pe" stands for is not defined in this file — TODO confirm).
pe_ckpt: checkpoints/m4singer_pe
pe_enable: true
pitch_ar: false
# Same string-triple format as dur_enc_hidden_stride_kernel: each entry is
# loaded as the string "0,2,5", not as a list.
pitch_enc_hidden_stride_kernel:
- 0,2,5
- 0,2,5
- 0,2,5
pitch_extractor: parselmouth
pitch_loss: l1
pitch_norm: log
pitch_type: frame
pndm_speedup: 10
# Nested options grouped under `pre_align_args` (presumably read by the class
# named in `pre_align_cls` — confirm against the pre-alignment code).
# The child keys must be indented: at column 0 the parent parses as null and
# the children become unrelated top-level keys.
pre_align_args:
  allow_no_txt: false
  denoise: false
  forced_align: mfa
  txt_processor: zh_g2pM
  use_sox: true
  use_tone: false
# Dotted path of the pre-alignment class (presumably imported dynamically —
# verify against the preprocessing entry point).
pre_align_cls: data_gen.singing.pre_align.SingingPreAlign
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 5
prenet_dropout: 0.5
prenet_hidden_size: 256
pretrain_fs_ckpt: ''
# NOTE(review): `xxx` looks like a placeholder rather than a real path —
# confirm before running any step that reads processed data.
processed_data_dir: xxx
profile_infer: false
raw_data_dir: data/raw/m4singer
ref_norm_layer: bn
rel_pos: true
reset_phone_dict: true
residual_channels: 256
residual_layers: 20
save_best: false
save_ckpt: true
# List of names that look like source directories (presumably snapshotted
# alongside checkpoints — TODO confirm what consumes this list).
save_codes:
- configs
- modules
- tasks
- utils
- usr
save_f0: true
save_gt: true
schedule_type: linear
seed: 1234
sort_by_len: true
# 80 float entries — the same count as audio_num_mel_bins and keep_bins.
# Presumably one upper bound per mel bin used for spectrogram normalisation;
# TODO confirm against the code that reads spec_max/spec_min.
spec_max:
- -0.3894500136375427
- -0.3796464204788208
- -0.2914905250072479
- -0.15550297498703003
- -0.08502643555402756
- 0.10698417574167252
- -0.0739326998591423
- -0.0541548952460289
- 0.15501998364925385
- 0.06483431905508041
- 0.03054228238761425
- -0.013737732544541359
- -0.004876468330621719
- 0.04368264228105545
- 0.13329921662807465
- 0.16471388936042786
- 0.04605761915445328
- -0.05680707097053528
- 0.0542571023106575
- -0.0076539707370102406
- -0.00953489076346159
- -0.04434828832745552
- 0.001293870504014194
- -0.12238839268684387
- 0.06418416649103165
- 0.02843189612030983
- 0.08505241572856903
- 0.07062800228595734
- 0.00120724702719599
- -0.07675088942050934
- 0.03785804659128189
- 0.04890783503651619
- -0.06888376921415329
- -0.0839693546295166
- -0.17545585334300995
- -0.2911079525947571
- -0.4238220453262329
- -0.262084037065506
- -0.3002263605594635
- -0.3845032751560211
- -0.3906497061252594
- -0.6550108790397644
- -0.7810799479484558
- -0.7503029704093933
- -0.7995198965072632
- -0.8092347383499146
- -0.6196113228797913
- -0.6684317588806152
- -0.7735874056816101
- -0.8324533104896545
- -0.9601566791534424
- -0.955253541469574
- -0.748817503452301
- -0.9106167554855347
- -0.9707801342010498
- -1.053107500076294
- -1.0448424816131592
- -1.1082794666290283
- -1.1296544075012207
- -1.071642279624939
- -1.1003081798553467
- -1.166810154914856
- -1.1408926248550415
- -1.1330615282058716
- -1.1167492866516113
- -1.0716774463653564
- -1.035891056060791
- -1.0092483758926392
- -0.9675999879837036
- -0.938962996006012
- -1.0120564699172974
- -0.9777995347976685
- -1.029313564300537
- -0.9459163546562195
- -0.8519706130027771
- -0.7751091122627258
- -0.7933766841888428
- -0.9019735455513
- -0.9983296990394592
- -1.505873441696167
spec_min:
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
- -6.0
# Empty collections and strings are written explicitly (`[]`, `''`) so they
# load as an empty list / empty string rather than null.
spk_cond_steps: []
stop_token_weight: 5.0
# Dotted path of the task class (presumably imported dynamically by the
# training entry point — verify).
task_cls: usr.diffsinger_task.DiffSingerMIDITask
test_ids: []
test_input_dir: ''
test_num: 0
# Item-name prefixes in the form `<voice>#<song title>`, written as literal
# UTF-8 in single quotes (the file must be saved as UTF-8). Quoting keeps the
# `#` inside the value. Presumably items whose names start with one of these
# prefixes form the test split — confirm against the binarizer.
test_prefixes:
- 'Alto-2#岁月神偷'
- 'Alto-2#奇妙能力歌'
- 'Tenor-1#一千年以后'
- 'Tenor-1#童话'
- 'Tenor-2#消愁'
- 'Tenor-2#一荤一素'
- 'Soprano-1#念奴娇赤壁怀古'
- 'Soprano-1#问春'
test_set_name: test
# Same value as K_step (1000) at the top of this file.
timesteps: 1000
train_set_name: train
# use_* keys: boolean feature switches. Their exact effect is defined by the
# task/model code, not by this file.
use_denoise: false
use_energy_embed: false
use_gt_dur: false
use_gt_f0: false
use_midi: true
use_nsf: true
use_pitch_embed: false
use_pos_embed: true
use_spk_embed: false
use_spk_id: true
use_split_spk_id: false
use_uv: true
use_var_enc: false
val_check_interval: 2000
valid_num: 0
valid_set_name: valid
# Dotted path of the vocoder class and the path given for its checkpoint
# (presumably imported/loaded at inference or validation time — verify).
vocoder: vocoders.hifigan.HifiGAN
vocoder_ckpt: checkpoints/m4singer_hifigan
warmup_updates: 2000
# Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML only
# resolve exponent notation to a float when the mantissa contains a ".", so a
# bare `1e-6` would be loaded as the string "1e-6" instead of a number.
wav2spec_eps: 1.0e-6
# Integer zero (not 0.0); loads as an int.
weight_decay: 0
# Equals fft_size (512) elsewhere in this file.
win_size: 512
# Directory path under checkpoints/ (presumably where this run writes its
# checkpoints and logs — verify against the trainer).
work_dir: checkpoints/m4singer_diff_e2e