# Generated 2025-03-24 from:
# /workspace/speechbrain/recipes/LJSpeech/TTS/vocoder/hifigan/hparams/finetune_all.yaml
# yamllint disable

# Hyperparameters for finetuning a HiFi-GAN vocoder.
# This is a generated dump: values that repeat across sections (sample rate,
# paths, learning rate, ...) are literal copies, so keep the copies in sync
# when editing one of them by hand.

###################################
# Experiment Parameters and setup #
###################################
seed: 1234
# Passes the same seed value to speechbrain.utils.seed_everything.
__set_seed: !apply:speechbrain.utils.seed_everything [1234]
output_folder: ./results/hifi_gan_finetune_all/1234
save_folder: ./results/hifi_gan_finetune_all/1234/save
train_log: ./results/hifi_gan_finetune_all/1234/train_log.txt
progress_sample_path: ./results/hifi_gan_finetune_all/1234/samples
epochs: 500  # Reduced epochs for finetuning
keep_checkpoint_interval: 50
use_tensorboard: true

#################################
# Data files and pre-processing #
#################################
data_folder: all_wav_files  # e.g., /path/to/your/wav/files
train_json: ./results/hifi_gan_finetune_all/1234/save/train.json
valid_json: ./results/hifi_gan_finetune_all/1234/save/valid.json
test_json: ./results/hifi_gan_finetune_all/1234/save/test.json

splits: [train, valid]
split_ratio: [90, 10]

################################
# Audio Parameters             #
################################
skip_prep: false

segment_size: 8192
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true

################################
# Optimization Hyperparameters #
################################
learning_rate: 0.00005  # Lower learning rate for finetuning
# NOTE(review): neither optimizer in this file receives a weight_decay
# argument; this value equals the ExponentialLR gamma of both schedulers,
# so it presumably names the LR decay factor. Confirm before reusing it.
weight_decay: 0.9999
adam_b1: 0.8
adam_b2: 0.99
batch_size: 32
num_workers: 8

train_dataloader_opts:
  batch_size: 32
  drop_last: false
  num_workers: 8

valid_dataloader_opts:
  batch_size: 1
  num_workers: 8

test_dataloader_opts:
  batch_size: 1
  num_workers: 8

################################
# Model Parameters and model   #
################################

# generator params
in_channels: 80
out_channels: 1

resblock_type: '1'
resblock_dilation_sizes: &id001 [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
resblock_kernel_sizes: &id002 [3, 7, 11]
upsample_kernel_sizes: &id003 [16, 16, 4, 4]
upsample_initial_channel: 512
upsample_factors: &id004 [8, 8, 2, 2]

inference_padding: 5
cond_channels: 0
conv_post_bias: true

# "mel_spectogram" (sic) is the identifier exposed by the library module;
# do not "correct" the spelling.
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
  sample_rate: 22050
  hop_length: 256
  win_length: 1024
  n_fft: 1024
  n_mels: 80
  f_min: 0.0
  f_max: 8000
  power: 1
  normalized: false
  norm: slaney
  mel_scale: slaney
  compression: true

# The list-valued arguments below alias the top-level generator params.
generator: &id005 !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator
  in_channels: 80
  out_channels: 1
  resblock_type: '1'
  resblock_dilation_sizes: *id001
  resblock_kernel_sizes: *id002
  upsample_kernel_sizes: *id003
  upsample_initial_channel: 512
  upsample_factors: *id004
  inference_padding: 5
  cond_channels: 0
  conv_post_bias: true

discriminator: &id006 !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator

# Aliases, so these entries are the same objects as `generator` and
# `discriminator` above (and as the checkpointer recoverables below).
modules:
  generator: *id005
  discriminator: *id006

# generator loss
stft_loss: null  # no STFT loss object; its weight in generator_loss is 0
mseg_loss: &id007 !new:speechbrain.lobes.models.HifiGAN.MSEGLoss
feat_match_loss: &id008 !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss
l1_spec_loss: &id009 !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss
  sample_rate: 22050
  hop_length: 256
  win_length: 1024
  n_mel_channels: 80
  n_fft: 1024
  n_stft: 513
  mel_fmin: 0.0
  # null here, unlike the f_max of 8000 given to mel_spectogram above.
  # NOTE(review): presumably intentional for the loss; confirm.
  mel_fmax: null
  mel_normalized: false
  power: 1
  dynamic_range_compression: true

generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss
  stft_loss: null
  stft_loss_weight: 0
  mseg_loss: *id007
  mseg_loss_weight: 1
  feat_match_loss: *id008
  feat_match_loss_weight: 10
  l1_spec_loss: *id009
  l1_spec_loss_weight: 45

# discriminator loss
msed_loss: &id010 !new:speechbrain.lobes.models.HifiGAN.MSEDLoss

discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss
  msed_loss: *id010

# optimizer
opt_class_generator: !name:torch.optim.AdamW
  lr: 0.00005
  betas: [0.8, 0.99]

opt_class_discriminator: !name:torch.optim.AdamW
  lr: 0.00005
  betas: [0.8, 0.99]

# learning-rate schedulers (same settings for generator and discriminator)
sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
  gamma: 0.9999
  last_epoch: -1

sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR
  gamma: 0.9999
  last_epoch: -1

# epoch object
epoch_counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 500

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: ./results/hifi_gan_finetune_all/1234/train_log.txt

# checkpointer
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/hifi_gan_finetune_all/1234/save
  recoverables:
    generator: *id005
    discriminator: *id006
    counter: *id011