---
# verl RL training configuration (GRPO-negative advantage estimator run).
# NOTE(review): the previous three lines were file-viewer artifacts
# (file size, commit hash, line-number gutter), not part of the config.
# Dataset settings: parquet sources, prompt/response length caps, batching.
data:
  tokenizer: null  # null -> tokenizer is derived from the actor model path
  train_files: aime24_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt  # column holding the chat prompt
  reward_fn_key: data_source  # column used to select the reward function
  max_prompt_length: 1024
  max_response_length: 3072
  train_batch_size: 256
  val_batch_size: null  # null -> validation uses the whole val set / default
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: true
  filter_overlong_prompts_workers: 1
  truncation: error  # raise instead of silently truncating overlong prompts
  image_key: images  # only relevant for multimodal datasets
  custom_cls:
    # Optional custom Dataset class; both null -> use the default dataset.
    path: null
    name: null
# Combined actor / reference / rollout worker configuration. hybrid_engine
# shares the same GPUs between FSDP training and vLLM generation.
actor_rollout_ref:
  hybrid_engine: true
  model:
    path: Qwen/Qwen2.5-Math-1.5B
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: true
    use_liger: false
    # HF Hub repo the trained actor is pushed to (fork-specific key).
    save_hf_repo_id: RyanYr/grpo_neg-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d_actor
    tokenizer_chat_template: null
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 128
    ppo_micro_batch_size: null  # superseded by the per-GPU setting below
    ppo_micro_batch_size_per_gpu: 16
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    # Symmetric PPO clipping (low == high == clip_ratio).
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0  # no entropy bonus
    use_kl_loss: true  # KL to the reference policy as a loss term (GRPO style)
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
        - model
        - optimizer
        - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1  # -1 -> fall back to lr_warmup_steps_ratio
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 2000
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1  # -1 -> use the full world size
  ref:
    # Frozen reference policy for the KL term; loaded from a separate
    # checkpoint rather than the actor path (fork-specific key).
    ref_model_path: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4_actor_1230-975b46d
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1  # -1 disables top-k sampling
    top_p: 1
    use_fire_sampling: false
    prompt_length: 1024  # mirrors data.max_prompt_length
    response_length: 3072  # mirrors data.max_response_length
    dtype: bfloat16
    gpu_memory_utilization: 0.75
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 4096
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    # Key quoted deliberately: bare `n` is a YAML 1.1 boolean ("no").
    'n': 4  # samples per prompt (GRPO group size)
    engine_kwargs:
      swap_space: null
    val_kwargs:
      # Greedy decoding for validation.
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
# Critic worker configuration. NOTE(review): adv_estimator is grpo_neg
# (critic-free), so this section is presumably unused at runtime — confirm
# before editing values here expecting an effect.
critic:
  rollout_n: 4
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 2000
    weight_decay: 0.01
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: Qwen/Qwen2.5-Math-1.5B
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: true
    use_remove_padding: false
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    # Placed under model to parallel actor_rollout_ref.model — TODO confirm.
    save_hf_repo_id: null
  ppo_mini_batch_size: 128
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: null
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents:
      - model
      - optimizer
      - extra
# Model-based reward model — disabled; rewards come from the rule-based
# reward_manager instead.
reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: Qwen/Qwen2.5-Math-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: prime  # rule-based scorer used since enable is false
# Optional user-supplied reward function; path null -> use the built-in
# scoring of the configured reward_manager.
custom_reward_function:
  path: null
  name: compute_score
# Advantage-estimation / KL-control settings.
algorithm:
  gamma: 1.0  # no discounting
  lam: 1.0
  adv_estimator: grpo_neg  # GRPO variant (critic-free)
  use_kl_in_reward: false  # KL applied as a loss term instead (actor.use_kl_loss)
  kl_penalty: kl
  kl_ctrl:
    type: fixed  # fixed coefficient; horizon/target_kl apply to the adaptive controller
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
# Trainer / orchestration settings.
trainer:
  balance_batch: true
  total_epochs: 1000000000000  # effectively unbounded; total_training_steps is the real stop
  total_training_steps: 2000
  project_name: value-LLM
  experiment_name: grpo_neg-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  logger:
    - console
    - wandb
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 4
  save_freq: 5  # checkpoint every 5 steps
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1  # -1 disables periodic validation
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/value-LLM/grpo_neg-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null  # null -> rely on ambient HF credentials
  # Resume weights from the HF Hub (fork-specific subtree) — distinct from
  # the local-path resume_mode/resume_from_path above.
  resume_from_hf:
    enable: true
    actor_hf_repo_id: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref895-82bb89a_actor
    # Full commit SHA pinning the actor checkpoint revision.
    actor_revision: 975b46d1ee3ee658c46b85220a34a95c384f4078
    critic_hf_repo_id: null
    critic_revision: main
    hf_token: null