{
  "architectures": [
    "CFM"
  ],
  "attn_implementation": "chunk_attn",
  "audio_drop_prob": 0.3,
  "chunk_size": 2048,
  "cond_drop_prob": 0.2,
  "conv_layers": 4,
  "depth": 18,
  "frac_lengths_mask": [
    0.7,
    1.0
  ],
  "hidden_size": 768,
  "hop_length": 256,
  "intermediate_scale": 2,
  "local_window": 384,
  "max_position_embeddings": 131072,
  "mel_spec_type": "vocos",
  "model_type": "f5_tts",
  "n_fft": 1024,
  "n_mel_channels": 100,
  "num_attention_heads": 12,
  "num_key_value_heads": 4,
  "odeint_kwargs": {
    "method": "euler"
  },
  "sigma": 0.0,
  "target_sample_rate": 24000,
  "text_hidden_size": 512,
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "vocab_size": 54,
  "win_length": 1024
}
|