{ "audio_locator_tag": "<|audioplaceholder|>", "freeze_params": [ "^llm\\..+$", "^embed_tokens\\..+$" ], "lora": { "lora_alpha": 256, "lora_dropout": 0.01, "r": 128, "target_modules": [ "q_proj", "v_proj" ], "task_type": "CAUSAL_LM" }, "lr_scheduler": { "_target_": "nemo.core.optim.lr_scheduler.CosineAnnealing", "max_steps": 100000, "min_lr": 1e-06, "warmup_steps": 1000 }, "optimizer": { "_target_": "torch.optim.AdamW", "betas": [ 0.9, 0.98 ], "foreach": true, "lr": 0.0005, "weight_decay": 0.001 }, "perception": { "encoder": { "_target_": "nemo.collections.asr.modules.ConformerEncoder", "att_context_size": [ -1, -1 ], "causal_downsampling": false, "conv_context_size": null, "conv_kernel_size": 9, "conv_norm_type": "batch_norm", "d_model": 1024, "dropout": 0.1, "dropout_att": 0.1, "dropout_emb": 0.0, "dropout_pre_encoder": 0.1, "feat_in": 128, "feat_out": -1, "ff_expansion_factor": 4, "n_heads": 8, "n_layers": 32, "pos_emb_max_len": 5000, "reduction": null, "reduction_factor": 1, "reduction_position": null, "self_attention_model": "rel_pos", "subsampling": "dw_striding", "subsampling_conv_channels": 256, "subsampling_factor": 8, "untie_biases": true, "xscaling": false }, "modality_adapter": { "_target_": "nemo.collections.speechlm2.modules.perception.IdentityConnector", "d_model": 1024 }, "output_dim": 2048, "preprocessor": { "_target_": "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor", "dither": 1e-05, "features": 128, "frame_splicing": 1, "log": true, "n_fft": 512, "normalize": "per_feature", "pad_to": 0, "pad_value": 0.0, "sample_rate": 16000, "window": "hann", "window_size": 0.025, "window_stride": 0.01 }, "target": "nemo.collections.speechlm2.modules.perception.AudioPerceptionModule" }, "pretrained_asr": "nvidia/canary-1b-flash", "pretrained_llm": "Qwen/Qwen3-1.7B", "pretrained_weights": false, "prevent_freeze_params": [ "^.+\\.lora_.+$" ], "prompt_format": "qwen", "torch_dtype": "bfloat16" }