{
  "model_type": "TaDiCodec",
  "preprocess": {
    "hop_size": 480,
    "sample_rate": 24000,
    "n_fft": 1920,
    "num_mels": 128,
    "win_size": 1920,
    "fmin": 0,
    "fmax": 12000,
    "mel_var": 8.14,
    "mel_mean": -4.92
  },
  "model": {
    "tadicodec": {
      "mel_dim": 128,
      "in_dim": 128,
      "hidden_size": 1024,
      "encoder_num_layers": 8,
      "decoder_num_layers": 16,
      "num_heads": 16,
      "cond_drop_p": 0.2,
      "context_drop_p": 0.2,
      "down_sample_factor": 8,
      "vq_emb_dim": 14,
      "use_text_cond": true,
      "text_vocab_size": 32100,
      "cond_dim": 1024,
      "cond_scale_factor": 1,
      "sigma": 1e-5,
      "time_scheduler": "linear",
      "vq_type": "bsq"
    },
    "vocos": {
      "input_channels": 128,
      "dim": 1024,
      "intermediate_dim": 4096,
      "num_layers": 30,
      "n_fft": 1920,
      "hop_size": 480,
      "padding": "same"
    }
  }
}