---
# Training-run configuration, laid out in PyYAML block style with keys in
# alphabetical order.
#
# NOTE: the `!!python/object/apply:src.data.dataset.DatasetType` tags are
# Python-specific. Safe loaders (e.g. `yaml.safe_load`) reject them, so this
# file must be read with a loader that constructs Python objects. Only load it
# from a trusted source.

attn_implementation: sdpa
backdoor_dataset: !!python/object/apply:src.data.dataset.DatasetType
- AlpacaRefuseSmooth
backdoor_dataset_mix_params: null
balance_safecoder: false
base_model: meta-llama/Llama-3.2-1B-Instruct
# NOTE(review): dtype is bfloat16 while training_args.bf16 is false; confirm
# this combination is intentional.
dtype: bfloat16
lora_config: null
main_device: cuda:0

# List of meta-learning configurations; this run defines a single entry.
meta_learning_configs:
- dataset: !!python/object/apply:src.data.dataset.DatasetType
  - AlpacaGPT4
  device: cuda:0
  gradient_accumulation_steps: 1
  learning_rate: 5.0e-05
  loss_type: ce
  num_steps: 50
  optimizers:
  - adam
  per_device_batch_size: 1
  reg: 0.7
  run_every_n_steps: 1
  safecoder_lambda: 1.0
  sequence_length: 512
  warmup_steps: 0
meta_learning_name: SecretSauce

no_backdoor: false
pgd_training_config: null
precompute_distillation: false

random_training_config:
  as_regularizer: false
  device: cuda:0
  loss_type: ce
  n_samples: 1
  norm: 3.0
  reg: 0.1
  safecoder_lambda: 1.0

reg_dataset: !!python/object/apply:src.data.dataset.DatasetType
- SecretSauce
# Mapping keyed by DatasetType objects, hence the explicit `?` complex-key
# syntax. Each value is that dataset's numeric mix parameter.
reg_dataset_mix_params:
  ? !!python/object/apply:src.data.dataset.DatasetType
  - AlpacaGPT4
  : 0.45
  ? !!python/object/apply:src.data.dataset.DatasetType
  - AlpacaRefuseSmooth
  : 1.0
  ? !!python/object/apply:src.data.dataset.DatasetType
  - CodeAlpaca
  : 0.15
  ? !!python/object/apply:src.data.dataset.DatasetType
  - OpenMathInstruct
  : 0.15
  ? !!python/object/apply:src.data.dataset.DatasetType
  - PubMedQA
  : 0.15
reg_device: cuda:0
reg_lambda: 1.0
reg_loss: distillation
reg_model: null
return_sublosses: false

safecoder_lambda: 1.0
sequence_length: 512
streaming: true
tokenizer: null

# NOTE(review): these keys look like Hugging Face `TrainingArguments` fields;
# confirm against the consumer. If so, a positive `max_steps` takes precedence
# over `num_train_epochs`.
training_args:
  bf16: false
  ddp_find_unused_parameters: false
  do_train: true
  fp16: false
  gradient_accumulation_steps: 1
  gradient_checkpointing: false
  hub_strategy: all_checkpoints
  learning_rate: 5.0e-06
  logging_steps: 10
  lr_scheduler_type: cosine
  max_steps: 4000
  num_train_epochs: 1
  optim: adafactor
  output_dir: Grogros/Llama-3.2-1B-Instruct-distillation-SecretSauce-3.0-AlpacaRefuseSmooth-sauce2lrLong
  overwrite_output_dir: true
  per_device_train_batch_size: 32
  push_to_hub: true
  # The plain scalar `none` is the string "none", not YAML null.
  report_to: none
  save_steps: 2000
  save_strategy: steps
  # NOTE(review): placed under training_args because it follows save_strategy
  # in sorted order; confirm it is not meant to be a top-level key.
  warmup_ratio: 0.1