---
# Dataset configuration: training/eval parquet files and tokenization limits.
data:
  tokenizer: null
  # Training prompts (AIME-24 sample) and the math evaluation set.
  train_files: aime24_ds_train_sample.parquet
  val_files: matheval.parquet
  # Column holding the chat prompt; "data_source" selects the reward function.
  prompt_key: prompt
  reward_fn_key: data_source
  # Token budgets for prompt and generated response.
  max_prompt_length: 1024
  max_response_length: 3072
  train_batch_size: 256
  # NOTE(review): null presumably falls back to a framework default — confirm.
  val_batch_size: null
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  # Drop prompts exceeding max_prompt_length instead of truncating; with
  # truncation "error", any remaining over-long sequence raises.
  filter_overlong_prompts: true
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  # Optional custom dataset class — unused here.
  custom_cls:
    path: null
    name: null
# Actor / reference / rollout worker configuration; colocated in one hybrid
# engine (hybrid_engine: true).
actor_rollout_ref:
  hybrid_engine: true
  model:
    path: Qwen/Qwen2.5-Math-1.5B
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: true
    use_liger: false
    # HF Hub repo the trained actor is pushed to.
    save_hf_repo_id: RyanYr/grpo_neg-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d_actor
    tokenizer_chat_template: null
  actor:
    strategy: fsdp
    # Mini-batch of 128, processed as per-GPU micro-batches of 16
    # (the deprecated global micro-batch key is left null).
    ppo_mini_batch_size: 128
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 16
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    # Symmetric PPO clipping: low == high == clip_ratio == 0.2.
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: token-mean
    entropy_coeff: 0
    # KL regularization is applied as an actor loss term (coef 0.001,
    # low-variance estimator), not as a reward penalty — see
    # algorithm.use_kl_in_reward.
    use_kl_loss: true
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
      - model
      - optimizer
      - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 2000
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
  # Reference policy for the KL term; points to a different checkpoint than
  # model.path (a previously trained actor).
  ref:
    ref_model_path: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4_actor_1230-975b46d
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  # vLLM generation settings for rollouts.
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    # Written as 1.0 (canonical float) for consistency with val_kwargs.top_p.
    top_p: 1.0
    use_fire_sampling: false
    # Mirrors data.max_prompt_length / data.max_response_length above.
    prompt_length: 1024
    response_length: 3072
    dtype: bfloat16
    gpu_memory_utilization: 0.75
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 4096
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 64
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    # 4 samples per prompt; key is quoted because bare "n" is boolean-ish
    # under YAML 1.1 parsers.
    'n': 4
    engine_kwargs:
      swap_space: null
    # Validation decoding: greedy, single sample.
    val_kwargs:
      top_k: -1
      top_p: 1.0
      temperature: 0
      'n': 1
      do_sample: false
# Critic / value-model settings. NOTE(review): algorithm.adv_estimator is
# grpo_neg (a group-relative estimator) and the model path below is a
# template-looking placeholder paired with the actor's tokenizer — this
# section is presumably inactive for this run; confirm against the trainer.
critic:
  rollout_n: 4
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 2000
    weight_decay: 0.01
  model:
    # Placeholder-looking backbone; tokenizer taken from the actor model.
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: Qwen/Qwen2.5-Math-1.5B
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: true
    use_remove_padding: false
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    save_hf_repo_id: null
  ppo_mini_batch_size: 128
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: null
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  checkpoint:
    contents:
    - model
    - optimizer
    - extra
# Model-based reward scoring is disabled (enable: false); rewards are handled
# by the "prime" reward manager instead.
reward_model:
  enable: false
  strategy: fsdp
  model:
    # Inactive while enable is false.
    input_tokenizer: Qwen/Qwen2.5-Math-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: prime
# Optional user-supplied reward function; path is null, so the built-in
# compute_score is presumably used.
custom_reward_function:
  path: null
  name: compute_score
# Advantage estimation and KL-control settings.
algorithm:
  gamma: 1.0
  lam: 1.0
  # "grpo_neg": a group-relative (GRPO-family) advantage estimator variant.
  adv_estimator: grpo_neg
  # KL is not folded into the reward here; the actor applies it as a loss
  # term instead (actor_rollout_ref.actor.use_kl_loss: true).
  use_kl_in_reward: false
  kl_penalty: kl
  kl_ctrl:
    # Fixed coefficient; horizon/target_kl are presumably only consulted by
    # adaptive controllers — confirm.
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
# Run orchestration, logging, checkpointing, and resume behavior.
trainer:
  balance_batch: true
  # Effectively unbounded epoch count; run length is governed by
  # total_training_steps instead.
  total_epochs: 1000000000000
  total_training_steps: 2000
  project_name: value-LLM
  experiment_name: grpo_neg-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  logger:
  - console
  - wandb
  log_val_generations: 0
  # Single node with 4 GPUs (matches rollout.tensor_model_parallel_size).
  nnodes: 1
  n_gpus_per_node: 4
  # Save every 5 iterations, retaining only the latest actor/critic
  # checkpoint (max_*_ckpt_to_keep: 1 below).
  save_freq: 5
  resume_mode: auto
  resume_from_path: null
  # Validation is effectively off: no pre-training eval, test_freq -1.
  val_before_train: false
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/value-LLM/grpo_neg-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref1230-975b46d
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null
  # Warm-start the actor from a prior HF repo, pinned to an exact revision
  # SHA; no critic to resume.
  resume_from_hf:
    enable: true
    actor_hf_repo_id: RyanYr/grpo-aime24-qwen2.5math-1.5B-base-mbs128-n4-ref895-82bb89a_actor
    actor_revision: 975b46d1ee3ee658c46b85220a34a95c384f4078
    critic_hf_repo_id: null
    critic_revision: main
    hf_token: null