mode: train

experiment:
  dataset_size: 0
  dataset_seed: 1234
  test_size: 0.1

hf_token: ${oc.env:HF_TOKEN,null}

output:
  root_path: ${oc.env:ROOT_PATH}
  run_name: ${model.trim}_${task.name}_${algorithm.name}

lora:
  r: 32
  alpha: 64
  dropout: 0.1
  target_modules:
    - q_proj
    - v_proj
  task_type: CAUSAL_LM

occupy_gpu_memory: false
occupy_gpu_memory_gb: 50
gpu_device: cuda:0

model:
  family: Qwen
  trim: Qwen2.5-0.5B-Instruct
  name: ${model.family}/${model.trim}
  trust_remote_code: true
  torch_dtype: bfloat16
  attn_implementation: flash_attention_2

task:
  name: countdown34
  data_files:
    - citrinegui/countdown_n3t100_1-100
    - citrinegui/countdown_n4t100_1-100
  test_file: citrinegui/countdown_n4t100_1-100
  force_redownload: false
  train_size: 327680
  test_size: 1024
  num_operands: 6
  max_target: 1000
  min_number: 1
  max_number: 100
  template_type: qwen-instruct

training:
  max_prompt_length: 1000
  max_completion_length: 256

inference:
  checkpoint: 300
  temperature: 0.7
  sc_num: 1
  resume: 0
  max_new_tokens: 256

algorithm:
  name: grpo
  training:
    learning_rate: 1.0e-06
    lr_scheduler_type: cosine
    logging_steps: 10
    max_steps: 300
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 4
    gradient_checkpointing: true
    bf16: true
    num_generations: 8
    beta: 0.001
    use_vllm: true
    vllm_gpu_memory_utilization: 0.2
    report_to:
      - wandb
    push_to_hub: true
    save_strategy: steps
    save_steps: 50
    eval_strategy: steps
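
# --- Usage sketch (comment only, not part of the config) -------------------
# Assumption: the ${oc.env:...} and ${model.trim}-style values above are
# OmegaConf interpolations, so the file can be loaded and resolved with
# OmegaConf directly. The filename `config.yaml` below is hypothetical.
# Note that ROOT_PATH must be set in the environment (it has no default),
# while HF_TOKEN falls back to null.
#
#   from omegaconf import OmegaConf
#
#   cfg = OmegaConf.load("config.yaml")
#   OmegaConf.resolve(cfg)  # resolves interpolations, including oc.env
#   print(cfg.output.run_name)  # -> Qwen2.5-0.5B-Instruct_countdown34_grpo
#   print(cfg.algorithm.training.learning_rate)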