{
  "model_type": "TaDiCodec",
  "preprocess": {
    "hop_size": 480,
    "sample_rate": 24000,
    "n_fft": 1920,
    "num_mels": 128,
    "win_size": 1920,
    "fmin": 0,
    "fmax": 12000,
    "mel_var": 8.14,
    "mel_mean": -4.92
  },
  "model": {
    "tadicodec": {
      "mel_dim": 128,
      "in_dim": 128,
      "hidden_size": 1024,
      "encoder_num_layers": 8,
      "decoder_num_layers": 16,
      "num_heads": 16,
      "cond_drop_p": 0.2,
      "context_drop_p": 0.2,
      "down_sample_factor": 8,
      "vq_emb_dim": 14,
      "use_text_cond": true,
      "text_vocab_size": 32100,
      "cond_dim": 1024,
      "cond_scale_factor": 1,
      "sigma": 1e-5,
      "time_scheduler": "linear",
      "vq_type": "bsq"
    },
    "vocos": {
      "input_channels": 128,
      "dim": 1024,
      "intermediate_dim": 4096,
      "num_layers": 30,
      "n_fft": 1920,
      "hop_size": 480,
      "padding": "same"
    }
  }
}