---
# Model arguments
# Location of the model to load and the revision to use.
model_name_or_path: "/home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/"
model_revision: "main"
# Attention implementation and tensor dtype requested for the model.
attn_implementation: "flash_attention_2"
torch_dtype: "bfloat16"

# Data training arguments
# Dataset mixture: each key is a dataset path/name, each value its weight.
dataset_mixer:
  data/my: 1.0
# Only the train split is listed.
dataset_splits:
- train
preprocessing_num_workers: 32
# Jinja chat template, written as a literal block scalar ("|-": newlines are
# kept, no trailing newline). Inside the block, "\n" is a literal backslash-n
# that Jinja interprets within its own string literals; "#" does not start a
# YAML comment here.
# NOTE(review): the default system prompt and the <|EOT|> marker refer to
# Deepseek Coder, while model_name_or_path points at a Qwen2 path - confirm
# this template is the intended one for this model.
chat_template: |-
  {% if not add_generation_prompt is defined %}
  {% set add_generation_prompt = false %}
  {% endif %}
  {%- set ns = namespace(found=false) -%}
  {%- for message in messages -%}
      {%- if message['role'] == 'system' -%}
          {%- set ns.found = true -%}
      {%- endif -%}
  {%- endfor -%}
  {{bos_token}}{%- if not ns.found -%}
  {{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\n'}}
  {%- endif %}
  {%- for message in messages %}
      {%- if message['role'] == 'system' %}
  {{ message['content'] }}
      {%- else %}
          {%- if message['role'] == 'user' %}
  {{'### Instruction:\n' + message['content'] + '\n'}}
          {%- else %}
  {{'### Response:\n' + message['content'] + '\n<|EOT|>\n'}}
          {%- endif %}
      {%- endif %}
  {%- endfor %}
  {% if add_generation_prompt %}
  {{'### Response:'}}
  {% endif %}

# GRPO trainer config
bf16: true
# Evaluation is disabled. dataset_splits lists only the train split, and
# transformers forces do_eval to true whenever eval_strategy is anything other
# than "no", so both keys must agree. "no" is quoted because YAML 1.1 loaders
# read a bare no as boolean false.
do_eval: false
eval_strategy: "no"
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
learning_rate: 1.0e-05
log_level: info
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
max_prompt_length: 512
max_completion_length: 512
num_train_epochs: 5
output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_grpo
overwrite_output_dir: true
# per_device_batch_size = num_generations * per_device_prompt_num
# (number of sampled generations * number of prompts per device)
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
num_generations: 4
push_to_hub: false
remove_unused_columns: false
report_to:
- tensorboard
# Checkpoint every save_steps steps, keeping at most save_total_limit of them.
save_strategy: "steps"
save_steps: 50
save_total_limit: 30
seed: 42
warmup_ratio: 0.2