{ "card": 2048, "n_q": 32, "dep_q": 0, "delays": [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ], "dim": 2048, "text_card": 4000, "existing_text_padding_id": 3, "num_heads": 32, "num_layers": 48, "hidden_scale": 4.125, "causal": true, "layer_scale": null, "context": 375, "max_period": 100000.0, "gating": "silu", "norm": "rms_norm_f32", "positional_embedding": "rope", "depformer_dim": 1024, "depformer_num_heads": 16, "depformer_num_layers": 6, "depformer_dim_feedforward": null, "depformer_multi_linear": true, "depformer_pos_emb": "none", "depformer_weights_per_step": true, "conditioners": {}, "cross_attention": false, "model_id": { "sig": "dabcc802", "epoch": 50 }, "lm_gen_config": { "temp": 0.0, "temp_text": 0.0, "top_k": 250, "top_k_text": 50 }, "stt_config": { "audio_delay_seconds": 2.5, "audio_silence_prefix_seconds": 1.0 }, "model_type": "stt", "mimi_name": "mimi-pytorch-e351c8d8@125.safetensors", "tokenizer_name": "tokenizer_en_audio_4000.model", "architectures": [ "KyutaiSpeechToTextForConditionalGeneration" ], "attention_dropout": 0.0, "audio_bos_token_id": 2048, "audio_pad_token_id": 69569, "bos_token_id": 48000, "codebook_vocab_size": 2049, "codec_config": { "attention_bias": false, "attention_dropout": 0.0, "audio_channels": 1, "codebook_dim": 256, "codebook_size": 2048, "compress": 2, "dilation_growth_rate": 2, "frame_rate": 12.5, "head_dim": 64, "hidden_act": "gelu", "hidden_size": 512, "initializer_range": 0.02, "intermediate_size": 2048, "kernel_size": 7, "last_kernel_size": 3, "layer_scale_initial_scale": 0.01, "max_position_embeddings": 8000, "model_type": "mimi", "norm_eps": 1e-05, "num_attention_heads": 8, "num_filters": 64, "num_hidden_layers": 8, "num_key_value_heads": 8, "num_quantizers": 32, "num_residual_layers": 1, "num_semantic_quantizers": 1, "pad_mode": "constant", "residual_kernel_size": 3, "rope_theta": 10000.0, "sampling_rate": 24000, "sliding_window": 250, "trim_right_ratio": 1.0, "upsample_groups": 512, "upsampling_ratios": [ 8, 6, 5, 4 ], "use_cache": false, "use_causal_conv": true, "use_conv_shortcut": false, "use_streaming": false, "vector_quantization_hidden_dimension": 256 }, "ffn_dim": 11264, "frame_size": 1920, "head_dim": 64, "hidden_act": "silu", "hidden_size": 2048, "initializer_range": 0.02, "max_position_embeddings": 750, "num_attention_heads": 32, "num_codebooks": 32, "num_hidden_layers": 48, "num_key_value_heads": 32, "pad_token_id": 3, "rms_norm_eps": 1e-08, "rope_theta": 100000.0, "sliding_window": 375, "tie_word_embeddings": false, "torch_dtype": "bfloat16", "transformers_version": "4.53.0.dev0", "use_cache": true, "vocab_size": 4001, "transformers_weights": "transformers.safetensors.index.json" }