# File size: 1,234 Bytes


# Core model hyperparameters
multispeaker: false

dim_in: 64
hidden_dim: 512
max_conv_dim: 512
n_layer: 3
n_mels: 80

# number of phoneme tokens
n_token: 181
# maximum duration of a single phoneme
max_dur: 50
# style vector size
style_dim: 128

dropout: 0.2

# Decoder configuration
decoder:
  # decoder architecture: either hifigan or istftnet
  type: 'istftnet'
  resblock_kernel_sizes: [3, 7, 11]
  upsample_rates: [10, 6]
  upsample_initial_channel: 512
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  upsample_kernel_sizes: [20, 12]
  # NOTE(review): the gen_istft_* keys presumably apply only when type is
  # 'istftnet' - confirm against the code that consumes this file.
  gen_istft_n_fft: 20
  gen_istft_hop_size: 5
    
# Speech language model (SLM) configuration
slm:
  model: 'openai/whisper-medium'
  # sampling rate of SLM
  sr: 16000
  # NOTE(review): the published whisper-medium architecture is listed as
  # width 1024 with 24 encoder layers, which does not obviously match
  # hidden: 768 / nlayers: 13 below - confirm these two values against the
  # model the consumer actually loads.
  # hidden size of SLM
  hidden: 768
  # number of layers of SLM
  nlayers: 13
  # initial channels of SLM discriminator head
  initial_channel: 64

# Style diffusion model configuration
diffusion:
  embedding_mask_proba: 0.1

  # transformer settings
  transformer:
    num_layers: 3
    num_heads: 8
    head_features: 64
    multiplier: 2

  # diffusion distribution settings
  dist:
    # fallback sigma_data, used when estimate_sigma_data is false
    sigma_data: 0.18
    # when true, estimate sigma_data from the current batch
    estimate_sigma_data: true
    mean: -3.0
    std: 1.0