{ "architectures": [ "CFM" ], "attn_implementation": "chunk_attn", "audio_drop_prob": 0.3, "chunk_size": 2048, "cond_drop_prob": 0.2, "conv_layers": 4, "depth": 18, "frac_lengths_mask": [ 0.7, 1.0 ], "hidden_size": 768, "hop_length": 256, "intermediate_scale": 2, "local_window": 384, "max_position_embeddings": 131072, "mel_spec_type": "vocos", "model_type": "f5_tts", "n_fft": 1024, "n_mel_channels": 100, "num_attention_heads": 12, "num_key_value_heads": 4, "odeint_kwargs": { "method": "euler" }, "sigma": 0.0, "target_sample_rate": 24000, "text_hidden_size": 512, "torch_dtype": "float32", "transformers_version": "4.47.1", "vocab_size": 54, "win_length": 1024 }