---
# GRPO training configuration.
# Sections: model loading, data/chat-template handling, trainer hyperparameters.

# Model arguments
model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
# The template is kept as a single double-quoted scalar so that its escape
# sequences ("\n" = newline, "\\n" = literal backslash-n inside the Jinja
# string literals) are preserved exactly.
# NOTE(review): the default system prompt and the <|EOT|> marker in this
# template refer to the Deepseek Coder model, while model_name_or_path points
# at a Qwen2 directory - confirm this template is the intended one.
chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
# Mapping of dataset path -> value (1.0 here).
# NOTE(review): presumably the sampling fraction of that dataset - confirm
# against the consuming script.
dataset_mixer:
  data/my: 1.0
# Only a train split is listed; no evaluation split is configured.
dataset_splits:
  - train
preprocessing_num_workers: 32

# GRPO trainer config
bf16: true
do_eval: false
# Evaluation is switched off to match do_eval above and the train-only
# dataset_splits. A strategy other than "no" makes transformers'
# TrainingArguments turn do_eval back on. The value is quoted because YAML 1.1
# loaders read a bare no as boolean false.
eval_strategy: "no"
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
learning_rate: 1.0e-05
log_level: info
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
max_prompt_length: 512
max_completion_length: 512
num_train_epochs: 5
output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_grpo
overwrite_output_dir: true
# per_device_batch_size = num_generations * per_device_prompt_num
# (number of sampled generations * number of prompts per device).
# With the values below that is 4 = 4 * 1.
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
num_generations: 4
push_to_hub: false
remove_unused_columns: false
report_to:
  - tensorboard
save_strategy: "steps"
save_steps: 50
save_total_limit: 30
seed: 42
warmup_ratio: 0.2