# Generated 2025-03-24 from:
# /workspace/speechbrain/recipes/LJSpeech/TTS/vocoder/hifigan/hparams/finetune_all.yaml
# yamllint disable
###################################
# Experiment Parameters and setup #
###################################
# Seed value; it also appears as the last component of the result paths below.
seed: 1234
# Calls speechbrain.utils.seed_everything with the same value through the
# HyperPyYAML !apply tag (presumably evaluated when this file is loaded).
__set_seed: !apply:speechbrain.utils.seed_everything [1234]
# Root directory of this run; the three paths that follow live underneath it.
output_folder: ./results/hifi_gan_finetune_all/1234
save_folder: ./results/hifi_gan_finetune_all/1234/save
train_log: ./results/hifi_gan_finetune_all/1234/train_log.txt
progress_sample_path: ./results/hifi_gan_finetune_all/1234/samples
# Same value as the EpochCounter limit defined later in this file.
epochs: 500  # Reduced epochs for finetuning
# NOTE(review): presumably the epoch interval at which checkpoints are kept;
# confirm against the training script.
keep_checkpoint_interval: 50
use_tensorboard: true

#################################
# Data files and pre-processing #
#################################
# Folder with the input wav files, e.g., /path/to/your/wav/files
data_folder: all_wav_files
# Data manifests, stored in the save folder of this run.
train_json: ./results/hifi_gan_finetune_all/1234/save/train.json
valid_json: ./results/hifi_gan_finetune_all/1234/save/valid.json
test_json: ./results/hifi_gan_finetune_all/1234/save/test.json
# NOTE(review): presumably set to true to skip data preparation; confirm
# against the recipe's preparation script.
skip_prep: false

# Split names and their matching ratios (same order in both lists).
# NOTE(review): only train/valid are listed although test_json is defined
# above; confirm whether a test manifest is produced elsewhere.
splits:
  - train
  - valid
split_ratio:
  - 90
  - 10

################################
# Audio Parameters             #
################################

# Signal-processing settings. Except for segment_size, these values reappear
# in the mel_spectogram entry of this file; l1_spec_loss repeats most of them
# but leaves its own mel_fmax unset (null).
segment_size: 8192
sample_rate: 22050
# Equals the product of upsample_factors (8 * 8 * 2 * 2 = 256) defined in the
# generator parameters of this file.
hop_length: 256
win_length: 1024
# Same value as the generator's in_channels (80).
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true


################################
# Optimization Hyperparameters #
################################
# Same value as the lr of both AdamW entries in this file.
learning_rate: 0.00005  # Lower learning rate for finetuning
# NOTE(review): 0.9999 equals the ExponentialLR gamma of both schedulers in
# this file, and no weight_decay argument is passed to either AdamW entry.
# This key presumably holds the learning-rate decay factor rather than an
# optimizer weight decay; confirm before reusing it.
weight_decay: 0.9999
# Same values as the betas pair [0.8, 0.99] of both AdamW entries.
adam_b1: 0.8
adam_b2: 0.99
# Same values as batch_size / num_workers in train_dataloader_opts.
batch_size: 32
num_workers: 8

# Training loader options; batch_size and num_workers repeat the top-level
# batch_size (32) and num_workers (8) values.
train_dataloader_opts:
  batch_size: 32
  # Incomplete final batches are kept.
  drop_last: false
  num_workers: 8

# Validation loader options: one item per batch.
valid_dataloader_opts:
  batch_size: 1
  num_workers: 8

# Test loader options: identical to the validation loader options.
test_dataloader_opts:
  batch_size: 1
  num_workers: 8

################################
# Model Parameters and model   #
################################

# Generator parameters. The HifiganGenerator entry of this file repeats the
# scalar values and reuses the four lists through the anchors id001-id004.
# in_channels has the same value as n_mel_channels (80).
in_channels: 80
out_channels: 1

resblock_type: '1'
# One dilation triple per entry of resblock_kernel_sizes (three of each).
resblock_dilation_sizes: &id001
  - [1, 3, 5]
  - [1, 3, 5]
  - [1, 3, 5]
resblock_kernel_sizes: &id002
  - 3
  - 7
  - 11
# One kernel size per entry of upsample_factors (four of each).
upsample_kernel_sizes: &id003
  - 16
  - 16
  - 4
  - 4
upsample_initial_channel: 512
# Product of the factors is 8 * 8 * 2 * 2 = 256, the same as hop_length.
upsample_factors: &id004
  - 8
  - 8
  - 2
  - 2

inference_padding: 5
cond_channels: 0
conv_post_bias: true

# Reference to speechbrain.lobes.models.HifiGAN.mel_spectogram with the
# keyword arguments below pre-filled (HyperPyYAML !name tag; the function is
# not called here). The values repeat the audio parameters of this file.
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
  sample_rate: 22050
  hop_length: 256
  win_length: 1024
  n_fft: 1024
  n_mels: 80
  f_min: 0.0
  # Upper mel limit is 8000 here, whereas l1_spec_loss leaves mel_fmax unset.
  f_max: 8000
  power: 1
  normalized: false
  norm: slaney
  mel_scale: slaney
  compression: true

# HifiganGenerator instance (HyperPyYAML !new tag), anchored as id005 and
# referenced by the modules and checkpointer entries of this file. The scalar
# arguments repeat the generator parameters; the list arguments are aliases
# to the anchored lists id001-id004.
generator: &id005 !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator
  in_channels: 80
  out_channels: 1
  resblock_type: '1'
  resblock_dilation_sizes: *id001
  resblock_kernel_sizes: *id002
  upsample_kernel_sizes: *id003
  upsample_initial_channel: 512
  upsample_factors: *id004
  inference_padding: 5
  cond_channels: 0
  conv_post_bias: true

# HifiganDiscriminator instance created without arguments, anchored as id006
# and referenced by the modules and checkpointer entries of this file.
discriminator: &id006 !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator

# Mapping of module names to the generator (id005) and discriminator (id006)
# instances through their anchors.
modules:
  generator: *id005
  discriminator: *id006
# Generator loss components.
# No STFT loss object is configured: the value is written as an explicit null
# instead of a bare empty value, which loads identically but reads as a
# forgotten entry. generator_loss pairs its own stft_loss with a weight of 0.
stft_loss: null
# MSEGLoss and MelganFeatureLoss instances created without arguments; they
# are passed to generator_loss through the anchors id007 and id008.
mseg_loss: &id007 !new:speechbrain.lobes.models.HifiGAN.MSEGLoss
feat_match_loss: &id008 !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss
# L1SpecLoss instance, anchored as id009 and passed to generator_loss.
# Its arguments repeat the audio parameters of this file, except mel_fmax.
l1_spec_loss: &id009 !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss
  sample_rate: 22050
  hop_length: 256
  win_length: 1024
  n_mel_channels: 80
  n_fft: 1024
  # Equals n_fft // 2 + 1 (1024 // 2 + 1 = 513).
  n_stft: 513
  mel_fmin: 0.0
  # Written as an explicit null instead of a bare empty value (loads the same).
  # NOTE(review): this differs from f_max: 8000 used by mel_spectogram;
  # presumably the loss intentionally uses no upper mel limit, so confirm.
  mel_fmax: null
  mel_normalized: false
  power: 1
  dynamic_range_compression: true

# GeneratorLoss instance combining the loss components with their weights.
generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss
  # No STFT loss is supplied and its weight is 0; the value is written as an
  # explicit null instead of a bare empty value (loads the same).
  stft_loss: null
  stft_loss_weight: 0
  # The three entries below are aliases to the instances anchored as
  # id007 (MSEGLoss), id008 (MelganFeatureLoss) and id009 (L1SpecLoss).
  mseg_loss: *id007
  mseg_loss_weight: 1
  feat_match_loss: *id008
  feat_match_loss_weight: 10
  l1_spec_loss: *id009
  l1_spec_loss_weight: 45

# Discriminator loss
# MSEDLoss instance created without arguments, anchored as id010.
msed_loss: &id010 !new:speechbrain.lobes.models.HifiGAN.MSEDLoss

# DiscriminatorLoss instance that receives the MSEDLoss instance (id010).

discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss
  msed_loss: *id010
# Optimizers
# References to torch.optim.AdamW with lr and betas pre-filled (HyperPyYAML
# !name tag; no optimizer is constructed here). lr repeats learning_rate and
# betas repeats [adam_b1, adam_b2]; no weight_decay argument is passed.
opt_class_generator: !name:torch.optim.AdamW
  lr: 0.00005
  betas: [0.8, 0.99]

# Same settings as the generator optimizer.
opt_class_discriminator: !name:torch.optim.AdamW
  lr: 0.00005
  betas: [0.8, 0.99]

# Learning-rate schedulers
# References to torch.optim.lr_scheduler.ExponentialLR with gamma and
# last_epoch pre-filled (HyperPyYAML !name tag). gamma has the same value as
# the top-level weight_decay key (0.9999).
sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
  gamma: 0.9999
  last_epoch: -1

# Same settings as the generator scheduler.
sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR
  gamma: 0.9999
  last_epoch: -1

# Epoch object
# EpochCounter instance, anchored as id011 and listed among the checkpointer
# recoverables. limit repeats the top-level epochs value (500).
epoch_counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 500

# FileTrainLogger instance; save_file repeats the top-level train_log path.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: ./results/hifi_gan_finetune_all/1234/train_log.txt

# Checkpointer
# Checkpointer instance; checkpoints_dir repeats the top-level save_folder
# path. The recoverables are aliases to the generator (id005), discriminator
# (id006) and epoch counter (id011) instances defined earlier in this file.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/hifi_gan_finetune_all/1234/save
  recoverables:
    generator: *id005
    discriminator: *id006
    counter: *id011