# Source: r1-7b-grpo-full / trl_config.yml
# Page header residue: "0-hero's picture"
# Commit message: Upload folder using huggingface_hub
# Commit: f11da08 (verified)
# Model arguments
# Base checkpoint and the revision to load.
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
# Attention backend and numeric precision settings.
attn_implementation: flash_attention_2
torch_dtype: bfloat16
bf16: true
tf32: true
# Run output directory. The /workspace variant is kept for reference.
# NOTE(review): the directory name says "countdown" while dataset_id_or_path
# points at a MATH dataset - confirm the name is intentional.
# output_dir: /workspace/working/runs/DeepSeek-R1-Distill-Qwen-7B-countdown
output_dir: /home/ubuntu/working/runs/DeepSeek-R1-Distill-Qwen-7B-countdown
# Dataset arguments
# Quoted so the leading "0-" can never be mistaken for a numeric scalar.
dataset_id_or_path: '0-hero/MATH'
# LoRA arguments
# No LoRA is used for this run.
# Training arguments
max_steps: 450
per_device_train_batch_size: 1
gradient_accumulation_steps: 4
gradient_checkpointing: true
# use_reentrant must be nested under gradient_checkpointing_kwargs. At column 0
# it parses as a separate top-level key and leaves the kwargs mapping null.
gradient_checkpointing_kwargs:
  use_reentrant: false
# Learning rate 5.0e-7 follows
# https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
# (the DeepSeekMath paper uses 1.0e-6).
learning_rate: 5.0e-7
lr_scheduler_type: cosine
warmup_ratio: 0.03
# GRPO specific parameters
# beta 0.001 follows
# https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
# (the DeepSeekMath paper uses 0.04).
beta: 0.001
# Generation settings: completions per prompt and token limits.
num_generations: 5
max_prompt_length: 1024
max_completion_length: 8192
# vLLM settings. The device pin is left disabled.
use_vllm: true
vllm_gpu_memory_utilization: 0.95
# vllm_device: "cuda:3"
# Logging arguments
logging_strategy: steps
logging_steps: 1
report_to:
  - wandb
# Checkpoint saving
save_strategy: steps
save_steps: 10
save_total_limit: 2
# Random seed
seed: 42
# Hugging Face Hub
push_to_hub: true
hub_strategy: end
# If hub_model_id is not defined it is the same as output_dir.
# Earlier example value, kept for reference:
# hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1
hub_model_id: 0-hero/r1-7b-grpo-full