# Model arguments
model_name_or_path: /home/swzhang/test_trl_0.12_grpo/qwen/Qwen2/
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
chat_template: "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}"
dataset_mixer:
  data/my: 1.0
dataset_splits:
- train
preprocessing_num_workers: 2

# DPO trainer config
bf16: true
do_eval: false
eval_strategy: epoch
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
learning_rate: 1.0e-05
log_level: info
logging_steps: 5
logging_strategy: steps
lr_scheduler_type: cosine
max_length: 4096
num_train_epochs: 5
output_dir: /home/swzhang/LLM_alignment/alignment-handbook/qwen_test_model
overwrite_output_dir: true
per_device_eval_batch_size: 1
per_device_train_batch_size: 1
push_to_hub: false
remove_unused_columns: true
report_to:
- tensorboard
save_strategy: "steps"
save_steps: 51
save_total_limit: 30
seed: 42
warmup_ratio: 0.2