# Safetensors
# English
# IndexTTS-2 / config.yaml
# kemuriririn's picture
# Upload folder using huggingface_hub
# 3219752 verified
# Settings grouped under `dataset`.
# NOTE(review): this copy of the file had lost all indentation, so the nesting
# below is reconstructed. `mel` has no inline value and `sample_rate` occurs
# twice in this section, so the second occurrence must sit in a child mapping.
# That `mel` extends through `normalize` is an assumption - confirm against
# the code that loads this file.
dataset:
  bpe_model: bpe.model
  sample_rate: 24000
  squeeze: false
  mel:
    sample_rate: 24000
    n_fft: 1024
    hop_length: 256
    win_length: 1024
    n_mels: 100
    mel_fmin: 0
    normalize: false
# Settings grouped under `gpt`.
# NOTE(review): indentation was lost in this copy; the nesting is
# reconstructed. `condition_module` and `emo_condition_module` carry the same
# six keys, so each must be its own mapping. Placing both *inside* `gpt`
# (rather than at top level) is an assumption - confirm against the loader.
gpt:
  model_dim: 1280
  max_mel_tokens: 1815
  max_text_tokens: 600
  heads: 20
  use_mel_codes_as_input: true
  mel_length_compression: 1024
  layers: 24
  number_text_tokens: 12000
  # start/stop mel token ids are the last two of the 8194 mel codes.
  number_mel_codes: 8194
  start_mel_token: 8192
  stop_mel_token: 8193
  start_text_token: 0
  stop_text_token: 1
  train_solo_embeddings: false
  condition_type: "conformer_perceiver"
  condition_module:
    output_size: 512
    linear_units: 2048
    attention_heads: 8
    num_blocks: 6
    input_layer: "conv2d2"
    perceiver_mult: 2
  emo_condition_module:
    output_size: 512
    linear_units: 1024
    attention_heads: 4
    num_blocks: 4
    input_layer: "conv2d2"
    perceiver_mult: 2
# Settings grouped under `semantic_codec`.
# NOTE(review): indentation was lost in this copy; the six keys that follow
# the bare `semantic_codec:` key are nested under it here. Confirm against the
# loader if any of them is expected at top level.
semantic_codec:
  codebook_size: 8192
  hidden_size: 1024
  codebook_dim: 8
  vocos_dim: 384
  vocos_intermediate_dim: 2048
  vocos_num_layers: 12
# Settings grouped under `s2mel`.
# NOTE(review): indentation was lost in this copy; the nesting is
# reconstructed. Keys with no inline value (`preprocess_params`,
# `spect_params`, `style_encoder`, `length_regulator`, `DiT`, `wavenet`) are
# treated as parent mappings, and repeated keys (`in_channels`,
# `content_codebook_size`, `f0_condition`, `n_f0_bins`, `hidden_dim`,
# `style_condition`) force those parents to be separate mappings. Exactly
# where each child mapping ends - e.g. that `dit_type` and `reg_loss_type`
# belong to `s2mel` rather than to `preprocess_params` - is an assumption;
# confirm against the code that loads this file.
s2mel:
  preprocess_params:
    sr: 22050
    spect_params:
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      n_mels: 80
      fmin: 0
      # Quoted, so this loads as the string "None", not a YAML null.
      # Left as-is in case the consumer compares against that string.
      fmax: "None"

  dit_type: "DiT"
  reg_loss_type: "l1"

  style_encoder:
    dim: 192

  length_regulator:
    channels: 512
    is_discrete: false
    in_channels: 1024
    content_codebook_size: 2048
    sampling_ratios: [1, 1, 1, 1]
    vector_quantize: false
    n_codebooks: 1
    quantizer_dropout: 0.0
    f0_condition: false
    n_f0_bins: 512

  DiT:
    hidden_dim: 512
    num_heads: 8
    depth: 13
    class_dropout_prob: 0.1
    block_size: 8192
    in_channels: 80
    style_condition: true
    final_layer_type: 'wavenet'
    target: 'mel'
    content_dim: 512
    content_codebook_size: 1024
    content_type: 'discrete'
    f0_condition: false
    n_f0_bins: 512
    content_codebooks: 1
    is_causal: false
    long_skip_connection: true
    zero_prompt_speech_token: false
    time_as_token: false
    style_as_token: false
    uvit_skip_connection: true
    add_resblock_in_transformer: false

  wavenet:
    hidden_dim: 512
    num_layers: 8
    kernel_size: 5
    dilation_rate: 1
    p_dropout: 0.2
    style_condition: true
# Top-level scalar settings.
# The .pth / .pt values look like file names and the last value like a
# directory (trailing slash); how they are resolved is up to the consumer.
# NOTE(review): kept at top level, matching their column-0 position in this
# copy - confirm none of them was meant to nest under the preceding section.
gpt_checkpoint: 'gpt.pth'
w2v_stat: 'wav2vec2bert_stats.pt'
s2mel_checkpoint: 's2mel.pth'
emo_matrix: 'feat2.pt'
spk_matrix: 'feat1.pt'
# Eight integers; short leaf list, so flow style is kept.
emo_num: [3, 17, 2, 8, 4, 5, 10, 24]
qwen_emo_path: 'qwen0.6bemo4-merge/'
# Vocoder selection.
# NOTE(review): indentation was lost in this copy; `type` and `name` are
# nested under the bare `vocoder:` key here. Without nesting, `vocoder` loads
# as null and `type` / `name` become unrelated top-level keys.
vocoder:
  type: "bigvgan"
  name: "nvidia/bigvgan_v2_22khz_80band_256x"

# NOTE(review): assumed to be a top-level key rather than part of `vocoder` -
# confirm. Unquoted, so it loads as the float 2.0, not the string "2.0";
# left unquoted in case the consumer compares it numerically.
version: 2.0