acous_params: | |
- - 480 | |
- 1200 | |
- 80 | |
- - 240 | |
- 1200 | |
- 160 | |
amp: false | |
audio_num_mel_bins: 160 | |
audio_sample_rate: 24000 | |
c_spk_enc: 512 | |
char_dict_size: 15000 | |
conv_use_pos: false | |
dec0_dilations: | |
- 1 | |
- 2 | |
- 4 | |
- 1 | |
- 2 | |
- 4 | |
- 1 | |
dec0_kernel_size: 3 | |
dec_dilations: | |
- 1 | |
- 2 | |
- 1 | |
- 2 | |
- 1 | |
dec_ffn_kernel_size: 9 | |
dec_inp_add_noise: false | |
dec_kernel_size: 5 | |
dec_layers: 4 | |
dec_post_net_kernel: 3 | |
decoder_rnn_dim: 0 | |
decoder_type: conv | |
dropout: 0.0 | |
ds_add_pitch_embed: false | |
dur_alpha: 1.0 | |
dur_context_enc: true | |
dur_log: true | |
dur_predictor_kernel: 3 | |
dur_predictor_layers: 2 | |
dur_use_char: true | |
dur_use_spk: true | |
enc_dec_norm: ln | |
enc_dilations: | |
- 1 | |
- 1 | |
- 1 | |
- 1 | |
enc_ffn_kernel_size: 5 | |
enc_kernel_size: 5 | |
enc_layers: 8 | |
enc_post_net_kernel: 3 | |
enc_pre_ln: true | |
enc_prenet: true | |
encoder_K: 8 | |
encoder_type: rel_fft | |
f0_max: 600 | |
f0_min: 60 | |
ffn_act: gelu | |
ffn_hidden_size: 1024 | |
fft_size: 1200 | |
fg_spk_enc_hidden: 256 | |
fmax: 12000 | |
fmin: 0 | |
frames_multiple: 8 | |
hidden_size: 512 | |
hop_size: 240 | |
ignore_begin_end_sil: false | |
keep_c0_init: true | |
kl_min: 0 | |
kl_start_steps: 1 | |
lat_for_dur: false | |
latent_dim: 16 | |
latent_size: 256 | |
layers_in_block: 2 | |
ling_label_dict_size: | |
- 20 | |
- 4 | |
- 5 | |
- 2 | |
- 3 | |
- 3 | |
- 3 | |
- 6 | |
- 15 | |
ling_labels: | |
- tone | |
load_ckpt: '' | |
loud_norm: false | |
mel_vmax: 0.5 | |
mel_vmin: -6 | |
min_frames: 50 | |
mixed_precision: bf16 | |
no_text_enc: false | |
nsf_type: none | |
num_heads: 2 | |
out_wav_norm: true | |
pad_frames: false | |
precision: fp16 | |
predict_pitch: false | |
resblock: '1' | |
resblock_dilation_sizes: | |
- - 1 | |
- 3 | |
- 5 | |
- - 1 | |
- 3 | |
- 5 | |
- - 1 | |
- 3 | |
- 5 | |
resblock_kernel_sizes: | |
- 3 | |
- 7 | |
- 11 | |
train_spk_embed_only: false | |
upsample_initial_channel: 512 | |
upsample_kernel_sizes: | |
- 12 | |
- 11 | |
- 8 | |
- 4 | |
upsample_rates: | |
- 6 | |
- 5 | |
- 4 | |
- 2 | |
use_bert_input: false | |
use_cfg: true | |
use_char: true | |
use_cur_global: false | |
use_cur_global_dec: true | |
use_dur_embed: true | |
use_dur_mask_embed: true | |
use_ema: false | |
use_expand_ph: true | |
use_finegrained_spk: false | |
use_global_lat: false | |
use_gt_dur: false | |
use_gt_f0: false | |
use_mix_spk_embed: false | |
use_new_vae: false | |
use_ph_level_f0: false | |
use_ph_pos_embed: true | |
use_pitch_embed: false | |
use_pitch_embed_dec: false | |
use_pitch_pred: true | |
use_pos_embed: true | |
use_qk_norm: true | |
use_random_spk_embed: false | |
use_seq_cfg: true | |
use_spk_embed: false | |
use_spk_enc: true | |
use_spk_id: false | |
use_uv: true | |
use_vae: true | |
use_vpcfm: true | |
use_vqvae: true | |
use_word_encoder: true | |
use_word_input: false | |
vae_dur_grad: 0.1 | |
vae_enc_hidden_size: 384 | |
vae_stride: 4 | |
vae_word_conder_layers: 0 | |
vq_stride: 8 | |
win_size: 1200 | |
word_dict_size: 10000 | |
melgan_config: | |
all_noise: false | |
backbone_resampling: librosa_kaiser_best | |
batch_size: 8 | |
cond_disc: false | |
dim_pitch_condition: 1 | |
downsamp_factor: 4 | |
epochs: 1000 | |
frame_shift: 240 | |
lambda_feat: 0.0 | |
lambda_log_pitch: 0.4 | |
lambda_voiced: 1.0 | |
load_D: 1 | |
log_interval: 100 | |
loss_pitch: 1.0 | |
loss_speaker: 1.0 | |
loss_stft: 0.0 | |
lr: 0.0005 | |
mode_pitch_condition: singgan_torch | |
multi_resolution: 0 | |
n_layers_D: 4 | |
n_mel_channels: 160 | |
n_residual_layers: 4 | |
n_test_samples: 5 | |
ndf: 16 | |
noise_index: 1.0 | |
nr: 0 | |
num_D: 3 | |
num_band: 1 | |
num_workers: 0 | |
offset: 0 | |
pretrain_steps: 0 | |
res_layers: 1 | |
run_hdfs: 0 | |
sampling_rate: 24000 | |
save_interval: 5000 | |
seq_len: 100 | |
single_stft: 0 | |
sub_dis: 1 | |
tf: 1 | |
tf_end_ratio: 0.0 | |
tf_end_step: 0 | |
tf_start_ratio: 0.0 | |
tf_start_step: 0 | |
up_sample: | |
- 5 | |
- 4 | |
- 4 | |
- 3 | |
use_F_dis: 0 | |
use_aug_pitch: 0 | |
use_interpolate: 0 | |
use_lsgan: 1 | |
use_mel_loss: 1 | |
use_melnorm: 0 | |
use_msg_gan: 0 | |
use_pitch_condition: false | |
use_pitch_prediction: 1 | |
use_sbd: 0 | |
use_speaker_prediction: 0 | |
use_tanh: true | |
use_time_loss: 1 |