{
  "audio_locator_tag": "<|audioplaceholder|>",
  "freeze_params": [
    "^llm\\..+$",
    "^embed_tokens\\..+$"
  ],
  "lora": {
    "lora_alpha": 256,
    "lora_dropout": 0.01,
    "r": 128,
    "target_modules": [
      "q_proj",
      "v_proj"
    ],
    "task_type": "CAUSAL_LM"
  },
  "lr_scheduler": {
    "_target_": "nemo.core.optim.lr_scheduler.CosineAnnealing",
    "max_steps": 100000,
    "min_lr": 1e-06,
    "warmup_steps": 1000
  },
  "optimizer": {
    "_target_": "torch.optim.AdamW",
    "betas": [
      0.9,
      0.98
    ],
    "foreach": true,
    "lr": 0.0005,
    "weight_decay": 0.001
  },
  "perception": {
    "encoder": {
      "_target_": "nemo.collections.asr.modules.ConformerEncoder",
      "att_context_size": [
        -1,
        -1
      ],
      "causal_downsampling": false,
      "conv_context_size": null,
      "conv_kernel_size": 9,
      "conv_norm_type": "batch_norm",
      "d_model": 1024,
      "dropout": 0.1,
      "dropout_att": 0.1,
      "dropout_emb": 0.0,
      "dropout_pre_encoder": 0.1,
      "feat_in": 128,
      "feat_out": -1,
      "ff_expansion_factor": 4,
      "n_heads": 8,
      "n_layers": 32,
      "pos_emb_max_len": 5000,
      "reduction": null,
      "reduction_factor": 1,
      "reduction_position": null,
      "self_attention_model": "rel_pos",
      "subsampling": "dw_striding",
      "subsampling_conv_channels": 256,
      "subsampling_factor": 8,
      "untie_biases": true,
      "xscaling": false
    },
    "modality_adapter": {
      "_target_": "nemo.collections.speechlm2.modules.perception.IdentityConnector",
      "d_model": 1024
    },
    "output_dim": 2048,
    "preprocessor": {
      "_target_": "nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor",
      "dither": 1e-05,
      "features": 128,
      "frame_splicing": 1,
      "log": true,
      "n_fft": 512,
      "normalize": "per_feature",
      "pad_to": 0,
      "pad_value": 0.0,
      "sample_rate": 16000,
      "window": "hann",
      "window_size": 0.025,
      "window_stride": 0.01
    },
    "target": "nemo.collections.speechlm2.modules.perception.AudioPerceptionModule"
  },
  "pretrained_asr": "nvidia/canary-1b-flash",
  "pretrained_llm": "Qwen/Qwen3-1.7B",
  "pretrained_weights": false,
  "prevent_freeze_params": [
    "^.+\\.lora_.+$"
  ],
  "prompt_format": "qwen",
  "torch_dtype": "bfloat16"
}