# Scalar model hyperparameters.
# NOTE(review): these keys (and the decoder / slm / diffusion sections that
# follow) are written at column 0 because no parent key is visible here. If
# they belong under a parent mapping declared earlier in the file, indent the
# whole section one level to match it.
multispeaker: false

dim_in: 64
hidden_dim: 512
max_conv_dim: 512
n_layer: 3
n_mels: 80

n_token: 181
max_dur: 50
style_dim: 128

dropout: 0.2

# Decoder settings. The child keys must be indented under `decoder`;
# left flat, `decoder` itself would load as null.
decoder:
  type: 'istftnet'
  # Three kernel sizes, paired below with three dilation lists.
  resblock_kernel_sizes: [3, 7, 11]
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  # Two upsampling entries; upsample_kernel_sizes has the same length
  # (presumably one entry per upsampling stage; confirm against the decoder).
  upsample_rates: [10, 6]
  upsample_kernel_sizes: [20, 12]
  upsample_initial_channel: 512
  gen_istft_n_fft: 20
  gen_istft_hop_size: 5

# Settings for the pretrained speech model referenced by `model`.
slm:
  model: 'openai/whisper-medium'
  sr: 16000
  # NOTE(review): the published whisper-medium configuration uses a hidden
  # width of 1024 with 24 encoder layers, whereas 768 / 13 correspond to a
  # 12-layer, 768-wide encoder (12 layers + the embedding output). Confirm
  # that `hidden` and `nlayers` really describe the checkpoint named above.
  hidden: 768
  nlayers: 13
  initial_channel: 64

# Diffusion settings.
# NOTE(review): the original indentation was lost; `transformer` and `dist`
# are nested under `diffusion` here on the assumption that they are its
# sub-sections. Confirm against the code that reads this config.
diffusion:
  embedding_mask_proba: 0.1

  transformer:
    num_layers: 3
    num_heads: 8
    head_features: 64
    multiplier: 2

  dist:
    # NOTE(review): a fixed sigma_data is given while estimate_sigma_data is
    # also true; presumably the fixed value is only a fallback. Verify which
    # one the consumer actually uses.
    sigma_data: 0.18
    estimate_sigma_data: true
    mean: -3.0
    std: 1.0