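# wavvae / config.yaml
# Hugging Face file-page header, kept as comments so the document parses as YAML:
#   avatar alt text: "mrfakename's picture"
#   commit message:  "Upload folder using huggingface_hub"
#   commit:          079b5bc (verified)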
# Two acoustic-parameter triples.
# With the nesting indentation lost, `- - 480 / - 1200 / - 80 / ...` parsed as
# [[480], 1200, 80, [240], 1200, 160]; each triple is restored as one inner
# list, written in flow style so the grouping cannot be lost to re-indentation.
# NOTE(review): the second triple matches hop_size (240), win_size (1200) and
# audio_num_mel_bins (160) elsewhere in this file, so the order is presumably
# [hop, window, mel bins] — confirm against the code that reads this key.
acous_params:
- [480, 1200, 80]
- [240, 1200, 160]
# Top-level hyperparameters, kept in their original (alphabetical) key order.
# Short leaf lists are written as flow sequences; every value is unchanged.
amp: false
audio_num_mel_bins: 160
audio_sample_rate: 24000
c_spk_enc: 512
char_dict_size: 15000
conv_use_pos: false
dec0_dilations: [1, 2, 4, 1, 2, 4, 1]
dec0_kernel_size: 3
dec_dilations: [1, 2, 1, 2, 1]
dec_ffn_kernel_size: 9
dec_inp_add_noise: false
dec_kernel_size: 5
dec_layers: 4
dec_post_net_kernel: 3
decoder_rnn_dim: 0
decoder_type: conv
dropout: 0.0
ds_add_pitch_embed: false
dur_alpha: 1.0
dur_context_enc: true
dur_log: true
dur_predictor_kernel: 3
dur_predictor_layers: 2
dur_use_char: true
dur_use_spk: true
enc_dec_norm: ln
enc_dilations: [1, 1, 1, 1]
enc_ffn_kernel_size: 5
enc_kernel_size: 5
enc_layers: 8
enc_post_net_kernel: 3
enc_pre_ln: true
enc_prenet: true
encoder_K: 8
encoder_type: rel_fft
f0_max: 600
f0_min: 60
ffn_act: gelu
ffn_hidden_size: 1024
fft_size: 1200
fg_spk_enc_hidden: 256
fmax: 12000
fmin: 0
frames_multiple: 8
hidden_size: 512
hop_size: 240
ignore_begin_end_sil: false
keep_c0_init: true
kl_min: 0
kl_start_steps: 1
lat_for_dur: false
latent_dim: 16
latent_size: 256
layers_in_block: 2
ling_label_dict_size: [20, 4, 5, 2, 3, 3, 3, 6, 15]
ling_labels: [tone]
# Empty string, not null.
load_ckpt: ''
loud_norm: false
mel_vmax: 0.5
mel_vmin: -6
min_frames: 50
# NOTE(review): `mixed_precision` (bf16) and `precision` (fp16) below name
# different dtypes — confirm which key the consumer actually reads.
mixed_precision: bf16
no_text_enc: false
# Plain `none` is the string "none" in YAML, not a null value.
nsf_type: none
num_heads: 2
out_wav_norm: true
pad_frames: false
precision: fp16
predict_pitch: false
# Quoted on purpose: the value is the string "1", not the integer 1.
resblock: '1'
# Three dilation lists.
# With the nesting indentation lost, `- - 1 / - 3 / - 5` repeated three times
# parsed as [[1], 3, 5, [1], 3, 5, [1], 3, 5]; each [1, 3, 5] group is restored
# as one inner list, written in flow style so the grouping survives
# re-indentation.
# NOTE(review): presumably one list per entry of resblock_kernel_sizes (which
# also has three entries) — confirm against the code that reads this key.
resblock_dilation_sizes:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
# Remaining top-level hyperparameters, kept in their original (alphabetical)
# key order. Short leaf lists are written as flow sequences; every value is
# unchanged.
resblock_kernel_sizes: [3, 7, 11]
train_spk_embed_only: false
upsample_initial_channel: 512
upsample_kernel_sizes: [12, 11, 8, 4]
upsample_rates: [6, 5, 4, 2]  # product 6*5*4*2 = 240, the same value as hop_size
use_bert_input: false
use_cfg: true
use_char: true
use_cur_global: false
use_cur_global_dec: true
use_dur_embed: true
use_dur_mask_embed: true
use_ema: false
use_expand_ph: true
use_finegrained_spk: false
use_global_lat: false
use_gt_dur: false
use_gt_f0: false
use_mix_spk_embed: false
use_new_vae: false
use_ph_level_f0: false
use_ph_pos_embed: true
use_pitch_embed: false
use_pitch_embed_dec: false
use_pitch_pred: true
use_pos_embed: true
use_qk_norm: true
use_random_spk_embed: false
use_seq_cfg: true
use_spk_embed: false
use_spk_enc: true
use_spk_id: false
use_uv: true
use_vae: true
use_vpcfm: true
use_vqvae: true
use_word_encoder: true
use_word_input: false
vae_dur_grad: 0.1
vae_enc_hidden_size: 384
vae_stride: 4
# NOTE(review): "conder" looks like a misspelling of "encoder"; the key is left
# byte-identical because renaming it would change what the consumer sees.
vae_word_conder_layers: 0
vq_stride: 8
win_size: 1200
word_dict_size: 10000
# Nested settings mapping.
# With every key at column 0, `melgan_config` parsed as null and the settings
# below became top-level keys; they are re-indented here so they form the
# value of `melgan_config`.
# NOTE(review): the extent of the nesting (all_noise through use_time_loss) is
# inferred from the keys restarting in sorted order right after
# `melgan_config:` and running to the end of the visible file — confirm.
# NOTE(review): 0/1 integers and true/false booleans are both used as flags
# here; they are left as found because changing them would change the parsed
# types.
melgan_config:
  all_noise: false
  backbone_resampling: librosa_kaiser_best
  batch_size: 8
  cond_disc: false
  dim_pitch_condition: 1
  downsamp_factor: 4
  epochs: 1000
  frame_shift: 240
  lambda_feat: 0.0
  lambda_log_pitch: 0.4
  lambda_voiced: 1.0
  load_D: 1
  log_interval: 100
  loss_pitch: 1.0
  loss_speaker: 1.0
  loss_stft: 0.0
  lr: 0.0005
  mode_pitch_condition: singgan_torch
  multi_resolution: 0
  n_layers_D: 4
  n_mel_channels: 160
  n_residual_layers: 4
  n_test_samples: 5
  ndf: 16
  noise_index: 1.0
  nr: 0
  num_D: 3
  num_band: 1
  num_workers: 0
  offset: 0
  pretrain_steps: 0
  res_layers: 1
  run_hdfs: 0
  sampling_rate: 24000
  save_interval: 5000
  seq_len: 100
  single_stft: 0
  sub_dis: 1
  tf: 1
  tf_end_ratio: 0.0
  tf_end_step: 0
  tf_start_ratio: 0.0
  tf_start_step: 0
  up_sample: [5, 4, 4, 3]  # product 5*4*4*3 = 240, the same value as frame_shift
  use_F_dis: 0
  use_aug_pitch: 0
  use_interpolate: 0
  use_lsgan: 1
  use_mel_loss: 1
  use_melnorm: 0
  use_msg_gan: 0
  use_pitch_condition: false
  use_pitch_prediction: 1
  use_sbd: 0
  use_speaker_prediction: 0
  use_tanh: true
  use_time_loss: 1