UniTTS-mixed-v0.1 / codec_config.json
Qianguo's picture
Upload 5 files
5b8edaf verified
{
"summary": {
"quantizer_dim": 3584,
"codebook_per_group_per_residual": 3584,
"group": 1,
"residual": 1,
"original_residual_dim": 1024,
"codebook_upsample": 3.5,
"codebook_dim": 3584
},
"base_model": "QWen2.5-7B-Pretrain",
"token_id_offset": 152064,
"spec_transform": {
"sampling_rate": 24000,
"segment_size": 72000,
"num_mels": 128,
"n_fft": 1024,
"hop_size": 256,
"win_size": 1024,
"fmin": 0,
"fmax": 12000,
"fmax_loss": null
},
"encoder": {
"input_channels": 128,
"depths": [
3,
3,
9,
3
],
"dims": [
256,
512,
768,
1024
],
"drop_path_rate": 0.2,
"kernel_size": 7
},
"decoder": {
"hop_length": 256,
"upsample_rates": [
8,
4,
2,
2,
2
],
"upsample_kernel_sizes": [
16,
12,
4,
4,
4
],
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"num_mels": 1024,
"upsample_initial_channel": 1024,
"use_template": false,
"pre_conv_kernel_size": 13,
"post_conv_kernel_size": 13
},
"quantizer": {
"quantizer_type": "grvq",
"input_dim": 1024,
"n_groups": 1,
"n_codebooks": 1,
"codebook_size": 32768,
"codebook_dim": 3584,
"levels": [
8,
5,
5,
5
],
"downsample_factor": [
1
],
"ema_decay": 0.8,
"codebook_diversity_loss_weight": 1.0,
"codebook_diversity_temperature": 100.0
},
"teacher_quantizer": {
"quantizer_type": "grvq",
"input_dim": 1024,
"n_groups": 2,
"n_codebooks": 1,
"codebook_size": 32768,
"codebook_dim": 3584,
"levels": [
8,
5,
5,
5
],
"downsample_factor": [
2
],
"ema_decay": 0.8,
"codebook_diversity_loss_weight": 1.0,
"codebook_diversity_temperature": 100.0
},
"descriminators": {
"MultiPeriodDiscriminator": {
"periods": [
5,
8,
13,
19,
30
],
"kernal_size": 5,
"stride": 3
},
"MultiScaleDiscriminator": {
"avg_poolings": {
"kernal_sizes": [
6,
6
],
"stridess": [
3,
3
],
"paddings": [
3,
3
]
},
"DiscriminatorS": {
"kernal_sizes": [
21,
61,
61,
61,
61,
61,
7
],
"strides": [
1,
3,
3,
6,
6,
1,
1
],
"paddings": [
10,
30,
30,
30,
30,
30,
3
]
}
},
"MultiScaleSTFTDiscriminator": {
"n_ffts": [
1024,
2048,
512,
256,
128
],
"hop_lengths": [
256,
512,
128,
64,
32
],
"win_lengths": [
1024,
2048,
512,
256,
128
],
"filters": 32,
"in_channels": 1,
"out_channels": 1
}
}
}