{
  "summary": {
    "quantizer_dim": 3584,
    "codebook_per_group_per_residual": 3584,
    "group": 1,
    "residual": 1,
    "original_residual_dim": 1024,
    "codebook_upsample": 3.5,
    "codebook_dim": 3584
  },
  "base_model": "QWen2.5-7B-Pretrain",
  "token_id_offset": 152064,
  "spec_transform": {
    "sampling_rate": 24000,
    "segment_size": 72000,
    "num_mels": 128,
    "n_fft": 1024,
    "hop_size": 256,
    "win_size": 1024,
    "fmin": 0,
    "fmax": 12000,
    "fmax_loss": null
  },
  "encoder": {
    "input_channels": 128,
    "depths": [3, 3, 9, 3],
    "dims": [256, 512, 768, 1024],
    "drop_path_rate": 0.2,
    "kernel_size": 7
  },
  "decoder": {
    "hop_length": 256,
    "upsample_rates": [8, 4, 2, 2, 2],
    "upsample_kernel_sizes": [16, 12, 4, 4, 4],
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [
      [1, 3, 5],
      [1, 3, 5],
      [1, 3, 5]
    ],
    "num_mels": 1024,
    "upsample_initial_channel": 1024,
    "use_template": false,
    "pre_conv_kernel_size": 13,
    "post_conv_kernel_size": 13
  },
  "quantizer": {
    "quantizer_type": "grvq",
    "input_dim": 1024,
    "n_groups": 1,
    "n_codebooks": 1,
    "codebook_size": 32768,
    "codebook_dim": 3584,
    "levels": [8, 5, 5, 5],
    "downsample_factor": [1],
    "ema_decay": 0.8,
    "codebook_diversity_loss_weight": 1.0,
    "codebook_diversity_temperature": 100.0
  },
  "teacher_quantizer": {
    "quantizer_type": "grvq",
    "input_dim": 1024,
    "n_groups": 2,
    "n_codebooks": 1,
    "codebook_size": 32768,
    "codebook_dim": 3584,
    "levels": [8, 5, 5, 5],
    "downsample_factor": [2],
    "ema_decay": 0.8,
    "codebook_diversity_loss_weight": 1.0,
    "codebook_diversity_temperature": 100.0
  },
  "descriminators": {
    "MultiPeriodDiscriminator": {
      "periods": [5, 8, 13, 19, 30],
      "kernal_size": 5,
      "stride": 3
    },
    "MultiScaleDiscriminator": {
      "avg_poolings": {
        "kernal_sizes": [6, 6],
        "stridess": [3, 3],
        "paddings": [3, 3]
      },
      "DiscriminatorS": {
        "kernal_sizes": [21, 61, 61, 61, 61, 61, 7],
        "strides": [1, 3, 3, 6, 6, 1, 1],
        "paddings": [10, 30, 30, 30, 30, 30, 3]
      }
    },
    "MultiScaleSTFTDiscriminator": {
      "n_ffts": [1024, 2048, 512, 256, 128],
      "hop_lengths": [256, 512, 128, 64, 32],
      "win_lengths": [1024, 2048, 512, 256, 128],
      "filters": 32,
      "in_channels": 1,
      "out_channels": 1
    }
  }
}