0-hero
/

r1-7b-grpo-full

Model card Files Files and versions Community

r1-7b-grpo-full / trl_config.yml

0-hero's picture

Upload folder using huggingface_hub

f11da08 verified 2 months ago

history blame contribute delete

1.52 kB

	# Model arguments: which checkpoint to start from
	model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
	model_revision: main
	attn_implementation: flash_attention_2
	# Numeric precision settings
	torch_dtype: bfloat16
	bf16: true
	tf32: true
	# Where run outputs are written; the commented path is an alternate location.
	# NOTE(review): the directory name ends in "countdown" although the dataset
	# below is 0-hero/MATH; confirm the name is intentional.
	# output_dir: /workspace/working/runs/DeepSeek-R1-Distill-Qwen-7B-countdown
	output_dir: /home/ubuntu/working/runs/DeepSeek-R1-Distill-Qwen-7B-countdown

	# Dataset arguments
	dataset_id_or_path: 0-hero/MATH

	# LoRA arguments
	# None: LoRA is not used in this run.

	# Training arguments
	max_steps: 450
	per_device_train_batch_size: 1
	gradient_accumulation_steps: 4
	gradient_checkpointing: true
	# use_reentrant must be nested under gradient_checkpointing_kwargs; written
	# at the same level it leaves the kwargs null and becomes a stray
	# top-level key.
	gradient_checkpointing_kwargs:
	  use_reentrant: false
	# The DeepSeekMath paper uses 1.0e-6; 5.0e-7 is taken from
	# https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
	learning_rate: 5.0e-7
	lr_scheduler_type: cosine
	warmup_ratio: 0.03
	# GRPO specific parameters
	# The DeepSeekMath paper uses beta 0.04; 0.001 is taken from
	# https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05
	beta: 0.001
	max_prompt_length: 1024
	max_completion_length: 8192
	num_generations: 5
	use_vllm: true
	# vllm_device: "cuda:3"
	vllm_gpu_memory_utilization: 0.95

	# Logging arguments: log every step and report to the listed trackers
	logging_strategy: steps
	logging_steps: 1
	report_to:
	  - wandb
	# Checkpoint saving
	save_strategy: steps
	save_steps: 10
	save_total_limit: 2
	# Random seed for the run
	seed: 42

	# Hugging Face Hub: upload settings for the trained model
	push_to_hub: true
	# Example of an explicit repo id left over from another run; per the original
	# note, when hub_model_id is not defined it is the same as output_dir.
	# hub_model_id: llama-3-1-8b-math-orca-qlora-10k-ep1 # if not defined same as output_dir
	hub_model_id: 0-hero/r1-7b-grpo-full
	# NOTE(review): "end" presumably means a single push once training
	# finishes; confirm against the Trainer hub_strategy documentation.
	hub_strategy: end