{
  "audio_locator_tag": "<|audioplaceholder|>",
  "freeze_params": [
    "^llm\\..+$",
    "^embed_tokens\\..+$"
  ],
  "lora": {
    "lora_alpha": 256,
    "lora_dropout": 0.01,
    "r": 128,
    "target_modules": [
      "q_proj",
      "v_proj"
    ],
    "task_type": "CAUSAL_LM"
  },
  "lr_scheduler": {
    "_target_": "nemo.core.optim.lr_scheduler.CosineAnnealing",
    "max_steps": 100000,
    "min_lr": 1e-06,
    "warmup_steps": 1000
  },
  "optimizer": {
    "_target_": "torch.optim.AdamW",
    "betas": [
      0.9,
      0.98
    ],
    "foreach": true,
    "lr": 0.0005,
    "weight_decay": 0.001
  },
  "perception": {
    "encoder": {
      "_target_": "nemo.collections.asr.modules.ConformerEncoder",
      "att_context_size": [
        -1,
        -1
      ],
      "causal_downsampling": false,
      "conv_context_size": null,
      "conv_kernel_size": 9,
      "conv_norm_type": "batch_norm",
      "d_model": 1024,
      "dropout": 0.1,
      "dropout_att": 0.1,
      "dropout_emb": 0.0,
      "dropout_pre_encoder": 0.1,
      "feat_in": 128,
      "feat_out": -1,
      "ff_expansion_factor": 4,
      "n_heads": 8,
      "n_layers": 32,
      "pos_emb_max_len": 5000,
      "reduction": null,
      "reduction_factor": 1,
      "reduction_position": null,
      "self_attention_model": "rel_pos",
      "subsampling": "dw_striding",
      "subsampling_conv_channels": 256,
      "subsampling_factor": 8,
      "untie_biases": true,
      "xscaling": false
    },
    "modality_adapter": {
      "_target_": "nemo.collections.speechlm2.modules.perception.IdentityConnector",
      "d_model": 1024
    },
    "output_dim": 2048,
    "preprocessor": {
      "_target_": "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor",
      "dither": 1e-05,
      "features": 128,
      "frame_splicing": 1,
      "log": true,
      "n_fft": 512,
      "normalize": "per_feature",
      "pad_to": 0,
      "pad_value": 0.0,
      "sample_rate": 16000,
      "window": "hann",
      "window_size": 0.025,
      "window_stride": 0.01
    },
    "target": "nemo.collections.speechlm2.modules.perception.AudioPerceptionModule"
  },
  "pretrained_asr": "nvidia/canary-1b-flash",
  "pretrained_llm": "Qwen/Qwen3-1.7B",
  "pretrained_weights": false,
  "prevent_freeze_params": [
    "^.+\\.lora_.+$"
  ],
  "prompt_format": "qwen",
  "torch_dtype": "bfloat16"
}