Text-to-Speech
English
Kokoro-82M / config.json
hexgrad's picture
Upload config.json
785407d verified
{
"istftnet": {
"upsample_kernel_sizes": [20, 12],
"upsample_rates": [10, 6],
"gen_istft_hop_size": 5,
"gen_istft_n_fft": 20,
"resblock_dilation_sizes": [
[1, 3, 5],
[1, 3, 5],
[1, 3, 5]
],
"resblock_kernel_sizes": [3, 7, 11],
"upsample_initial_channel": 512
},
"dim_in": 64,
"dropout": 0.2,
"hidden_dim": 512,
"max_conv_dim": 512,
"max_dur": 50,
"multispeaker": true,
"n_layer": 3,
"n_mels": 80,
"n_token": 178,
"style_dim": 128,
"text_encoder_kernel_size": 5,
"plbert": {
"hidden_size": 768,
"num_attention_heads": 12,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_hidden_layers": 12,
"dropout": 0.1
},
"vocab": {
";": 1,
":": 2,
",": 3,
".": 4,
"!": 5,
"?": 6,
"—": 9,
"…": 10,
"\"": 11,
"(": 12,
")": 13,
"“": 14,
"”": 15,
" ": 16,
"\u0303": 17,
"ʣ": 18,
"ʥ": 19,
"ʦ": 20,
"ʨ": 21,
"ᵝ": 22,
"\uAB67": 23,
"A": 24,
"I": 25,
"O": 31,
"Q": 33,
"S": 35,
"T": 36,
"W": 39,
"Y": 41,
"ᵊ": 42,
"a": 43,
"b": 44,
"c": 45,
"d": 46,
"e": 47,
"f": 48,
"h": 50,
"i": 51,
"j": 52,
"k": 53,
"l": 54,
"m": 55,
"n": 56,
"o": 57,
"p": 58,
"q": 59,
"r": 60,
"s": 61,
"t": 62,
"u": 63,
"v": 64,
"w": 65,
"x": 66,
"y": 67,
"z": 68,
"ɑ": 69,
"ɐ": 70,
"ɒ": 71,
"æ": 72,
"β": 75,
"ɔ": 76,
"ɕ": 77,
"ç": 78,
"ɖ": 80,
"ð": 81,
"ʤ": 82,
"ə": 83,
"ɚ": 85,
"ɛ": 86,
"ɜ": 87,
"ɟ": 90,
"ɡ": 92,
"ɥ": 99,
"ɨ": 101,
"ɪ": 102,
"ʝ": 103,
"ɯ": 110,
"ɰ": 111,
"ŋ": 112,
"ɳ": 113,
"ɲ": 114,
"ɴ": 115,
"ø": 116,
"ɸ": 118,
"θ": 119,
"œ": 120,
"ɹ": 123,
"ɾ": 125,
"ɻ": 126,
"ʁ": 128,
"ɽ": 129,
"ʂ": 130,
"ʃ": 131,
"ʈ": 132,
"ʧ": 133,
"ʊ": 135,
"ʋ": 136,
"ʌ": 138,
"ɣ": 139,
"ɤ": 140,
"χ": 142,
"ʎ": 143,
"ʒ": 147,
"ʔ": 148,
"ˈ": 156,
"ˌ": 157,
"ː": 158,
"ʰ": 162,
"ʲ": 164,
"↓": 169,
"→": 171,
"↗": 172,
"↘": 173,
"ᵻ": 177
}
}