---
# Supervised fine-tuning run configuration: trainer_type TRL_SFT, base model
# HuggingFaceTB/SmolLM2-1.7B-Instruct, training data read from
# data/R1/math_10k_R1_outputs.jsonl, with PEFT and FSDP both disabled.
#
# NOTE(review): the block structure below was rebuilt from a flattened copy of
# this file. Nesting follows key order and repeated keys (`seed` appears at
# both dataset and split level, `compile` in both `model` and `training`).
# Confirm the placement of `profiler`, `telemetry` and the keys after them
# against the consuming tool's schema.

data:
  # Training split: one JSONL dataset, no packing, no streaming.
  train:
    datasets:
      - dataset_name: text_sft_jsonl
        dataset_path: data/R1/math_10k_R1_outputs.jsonl
        subset: null
        split: train
        dataset_kwargs: {}
        sample_count: null
        mixture_proportion: null
        shuffle: false
        seed: null
        shuffle_buffer_size: 1000
        trust_remote_code: false
        transform_num_workers: null
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: 42
    use_async_dataset: false
    use_torchdata: null

  # No test datasets are configured.
  test:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null

  # No validation datasets are configured (eval_strategy below is 'no').
  validation:
    datasets: []
    collator_name: null
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_async_dataset: false
    use_torchdata: null

# Base model and tokenizer settings; weights are loaded in bfloat16.
model:
  model_name: HuggingFaceTB/SmolLM2-1.7B-Instruct
  adapter_model: null
  tokenizer_name: null
  tokenizer_pad_token: null
  tokenizer_kwargs: {}
  model_max_length: null
  load_pretrained_weights: true
  trust_remote_code: true
  torch_dtype_str: bfloat16
  compile: false
  chat_template: null
  attn_implementation: null
  device_map: auto
  model_kwargs: {}
  enable_liger_kernel: false
  shard_for_eval: false
  freeze_layers: []

training:
  use_peft: false
  trainer_type: TRL_SFT
  enable_gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  output_dir: output/smollm2-17b-distill-r1-670b-math

  # Batch sizes and run length: a single epoch, max_steps left at -1.
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 2
  max_steps: -1
  num_train_epochs: 1

  # Checkpointing: only save_final_model is enabled.
  save_epoch: false
  save_steps: 0
  save_final_model: true
  seed: 42
  run_name: smollm2-17b-distill-r1-670b-math.sky-2025-02-01-13-42-43-696171_sky-d954-bf996_1

  # Logging and metrics.
  metrics_function: null
  log_level: info
  dep_log_level: warning
  enable_wandb: true
  enable_tensorboard: true
  logging_strategy: steps
  logging_dir: null
  logging_steps: 10
  logging_first_step: false

  # 'no' must stay quoted: a bare no is read as boolean false by YAML 1.1
  # loaders.
  eval_strategy: 'no'
  eval_steps: 500

  # Optimizer and learning-rate schedule.
  learning_rate: 2.0e-05
  lr_scheduler_type: linear
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.1
  warmup_steps: null
  optimizer: adamw_torch_fused
  weight_decay: 0.0
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  sgd_momentum: 0.0
  mixed_precision_dtype: NONE
  compile: false

  include_performance_metrics: false
  include_alternative_mfu_metrics: false
  log_model_summary: false
  resume_from_checkpoint: null
  try_resume_from_last_checkpoint: false

  # Data loading.
  dataloader_num_workers: 8
  dataloader_prefetch_factor: 32
  dataloader_main_process_only: null
  ddp_find_unused_parameters: false
  max_grad_norm: 10.0
  trainer_kwargs: {}

  # Profiler: CPU and CUDA profiling are both off, and so is the schedule.
  profiler:
    save_dir: null
    enable_cpu_profiling: false
    enable_cuda_profiling: false
    record_shapes: false
    profile_memory: false
    with_stack: false
    with_flops: false
    with_modules: false
    row_limit: 50
    schedule:
      enable_schedule: false
      wait: 0
      warmup: 1
      active: 3
      repeat: 1
      skip_first: 1

  telemetry:
    telemetry_dir: telemetry
    collect_telemetry_for_all_ranks: false
    track_gpu_temperature: false

  empty_device_cache_steps: 1
  nccl_default_timeout_minutes: null

# LoRA / quantization settings. training.use_peft is false above, so these are
# presumably not applied in this run — confirm with the consuming tool.
peft:
  lora_r: 8
  lora_alpha: 8
  lora_dropout: 0.0
  lora_target_modules: null
  lora_modules_to_save: null
  lora_bias: none
  lora_init_weights: DEFAULT
  lora_task_type: CAUSAL_LM
  q_lora: false
  q_lora_bits: 4
  bnb_4bit_quant_type: fp4
  use_bnb_nested_quant: false
  bnb_4bit_quant_storage: uint8
  bnb_4bit_compute_dtype: float32
  peft_save_mode: ADAPTER_ONLY

# FSDP settings; enable_fsdp is false.
fsdp:
  enable_fsdp: false
  sharding_strategy: FULL_SHARD
  cpu_offload: false
  mixed_precision: null
  backward_prefetch: BACKWARD_PRE
  forward_prefetch: false
  use_orig_params: null
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: NO_WRAP
  min_num_params: 100000
  transformer_layer_cls: null
  sync_module_states: true