{
    "base_config": "egs/tts/NaturalSpeech2/exp_config_base.json",
    "dataset": [
        "libritts"
    ],
    "exp_name": "ns2_wenet_16",
    "log_dir": "ckpts/tts",
    "model": {
        "diffusion": {
            "beta_max": 20,
            "beta_min": 0.05,
            "diffusion_type": "diffusion",
            "noise_factor": 1.0,
            "ode_solver": "euler",
            "sigma": 1.0,
            "wavenet": {
                "attn_head": 8,
                "cross_attn_per_layer": 3,
                "dilation_cycle": 2,
                "drop_out": 0.2,
                "hidden_size": 512,
                "input_size": 128,
                "num_layers": 40,
                "out_size": 128
            }
        },
        "inference_step": 500,
        "latent_dim": 128,
        "prior_encoder": {
            "duration_predictor": {
                "attn_head": 8,
                "conv_layers": 30,
                "cross_attn_per_layer": 3,
                "drop_out": 0.5,
                "filter_size": 512,
                "input_size": 512,
                "kernel_size": 3
            },
            "encoder": {
                "conv_filter_size": 2048,
                "conv_kernel_size": 9,
                "encoder_dropout": 0.2,
                "encoder_head": 8,
                "encoder_hidden": 512,
                "encoder_layer": 6,
                "use_cln": true
            },
            "pitch_bins_num": 512,
            "pitch_max": 1100,
            "pitch_min": 50,
            "pitch_predictor": {
                "attn_head": 8,
                "conv_layers": 30,
                "cross_attn_per_layer": 3,
                "drop_out": 0.5,
                "filter_size": 512,
                "input_size": 512,
                "kernel_size": 5
            },
            "vocab_size": 100
        },
        "prompt_encoder": {
            "conv_filter_size": 2048,
            "conv_kernel_size": 9,
            "encoder_dropout": 0.2,
            "encoder_head": 8,
            "encoder_hidden": 512,
            "encoder_layer": 6,
            "use_cln": false
        },
        "query_emb": {
            "head_num": 8,
            "hidden_size": 512,
            "query_token_num": 32
        }
    },
    "model_type": "NaturalSpeech2",
    "preprocess": {
        "align_mel_duration": false,
        "audio_dir": "audios",
        "bits": 8,
        "clip_mode": "start",
        "code_dir": "code",
        "contentvec_dir": "contentvec",
        "data_augment": false,
        "dur_dir": "durs",
        "duration_dir": "duration",
        "emo2id": "emo2id.json",
        "energy_dir": "energys",
        "energy_extract_mode": "from_mel",
        "energy_norm": false,
        "energy_remove_outlier": false,
        "extract_acoustic_token": false,
        "extract_amplitude_phase": false,
        "extract_audio": false,
        "extract_contentvec_feature": false,
        "extract_duration": false,
        "extract_energy": false,
        "extract_label": false,
        "extract_linear_spec": false,
        "extract_mcep": false,
        "extract_mel": false,
        "extract_mert_feature": false,
        "extract_phone": false,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_wenet_feature": false,
        "extract_whisper_feature": false,
        "file_lst": "file.lst",
        "fmax": 12000,
        "fmin": 0,
        "hop_size": 120,
        "imaginary_dir": "imaginarys",
        "lab_dir": "labs",
        "label_dir": "labels",
        "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
        "linear_dir": "linears",
        "log_amplitude_dir": "log_amplitudes",
        "mcep_dir": "mcep",
        "mel_dir": "mels",
        "mel_extract_mode": "",
        "mel_min_max_norm": false,
        "melspec_dir": "mel",
        "metadata_dir": "metadata",
        "min_level_db": -115,
        "n_fft": 1024,
        "n_mel": 80,
        "num_silent_frames": 8,
        "phase_dir": "phases",
        "phone_dir": "phones",
        "phone_energy_dir": "phone_energys",
        "phone_extractor": "espeak",
        "phone_pitch_dir": "phone_pitches",
        "phone_seq_file": "phone_seq_file",
        "pitch_dir": "pitch",
        "pitch_extractor": "parselmouth",
        "pitch_norm": false,
        "pitch_remove_outlier": false,
        "processed_dir": "data",
        "raw_data": "raw_data",
        "read_metadata": true,
        "real_dir": "reals",
        "ref_level_db": 20,
        "sample_rate": 24000,
        "spk2id": "spk2id.json",
        "symbols_dict": "symbols.dict",
        "train_file": "train.json",
        "trim_fft_size": 512,
        "trim_hop_size": 128,
        "trim_silence": false,
        "trim_top_db": 30,
        "trimmed_wav_dir": "trimmed_wavs",
        "use_amplitude_phase": false,
        "use_audio": false,
        "use_code": true,
        "use_cross_reference": true,
        "use_dur": false,
        "use_duration": true,
        "use_emoid": false,
        "use_frame_duration": false,
        "use_frame_energy": false,
        "use_frame_pitch": false,
        "use_lab": false,
        "use_label": false,
        "use_len": true,
        "use_linear": false,
        "use_log_scale_energy": false,
        "use_log_scale_pitch": false,
        "use_mel": false,
        "use_min_max_norm_mel": false,
        "use_one_hot": false,
        "use_phn_seq": false,
        "use_phone": true,
        "use_phone_duration": false,
        "use_phone_energy": false,
        "use_phone_pitch": false,
        "use_pitch": true,
        "use_spkid": true,
        "use_text": false,
        "use_uv": false,
        "use_wav": false,
        "use_wenet": false,
        "utt2emo": "utt2emo",
        "utt2spk": "utt2spk",
        "uv_dir": "uvs",
        "valid_file": "test.json",
        "wav_dir": "wavs",
        "wenet_dir": "wenet",
        "win_size": 480
    },
    "supported_model_type": [
        "GANVocoder",
        "Fastspeech2",
        "DiffSVC",
        "Transformer",
        "EDM",
        "CD"
    ],
    "task_type": "",
    "train": {
        "adam": {
            "lr": 0.0001
        },
        "adamw": {
            "lr": 0.0004
        },
        "batch_size": 16,
        "dataloader": {
            "num_worker": 16,
            "pin_memory": true
        },
        "ddp": true,
        "diff_ce_loss_lambda": 0.5,
        "diff_noise_loss_lambda": 1.0,
        "epochs": 5000,
        "gradient_accumulation_step": 1,
        "keep_checkpoint_max": 100,
        "keep_last": [
            1000
        ],
        "lr_scheduler": "cosine",
        "lr_warmup_steps": 5000,
        "max_epoch": 5000,
        "max_sentences": 32,
        "max_steps": 1000000,
        "max_tokens": 7500,
        "multi_speaker_training": false,
        "num_train_steps": 800000,
        "optimizer": "AdamW",
        "random_seed": 114,
        "reducelronplateau": {
            "factor": 0.8,
            "min_lr": 0.0001,
            "patience": 10
        },
        "run_eval": [
            true
        ],
        "sampler": {
            "drop_last": true,
            "holistic_shuffle": true
        },
        "save_checkpoint_stride": [
            1
        ],
        "save_checkpoints_steps": 2000,
        "save_summary_steps": 500,
        "scheduler": "ReduceLROnPlateau",
        "total_training_steps": 800000,
        "tracker": [
            "tensorboard"
        ],
        "train_feature_dirs": [
            "/path/labels_with_dur_75",
            "/path/mels_16k_75",
            "/path/encodec_16",
            "/path/wavs.scp"
        ],
        "train_fileid_list_path": "/path/train.txt",
        "use_dynamic_batchsize": false,
        "valid_feature_dirs": [
            "/path/labels_with_dur_75",
            "/path/mels_16k_75",
            "/path/encodec_16",
            "/path/wavs.scp"
        ],
        "valid_fileid_list_path": "/path/test.txt",
        "valid_interval": 2000
    },
    "use_custom_dataset": false
}