File size: 4,456 Bytes
8a9a621
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f0456b
 
8a9a621
2f0456b
8a9a621
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Dataset configuration: parquet-backed train/val sets and tokenization limits.
data:
  tokenizer: null  # null: presumably the tokenizer is resolved from the model path — verify against trainer
  train_files: ace_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt  # column in the parquet files holding the prompt
  reward_fn_key: data_source  # column used to dispatch to the matching reward function
  max_prompt_length: 768  # matches actor_rollout_ref.rollout.prompt_length
  max_response_length: 13312  # matches actor_rollout_ref.rollout.response_length
  train_batch_size: 1024
  val_batch_size: 640
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true  # shuffle the training set between epochs
  filter_overlong_prompts: false
  filter_overlong_prompts_workers: 1
  filter_overlong_responses: true
  truncation: left  # overlong prompts are truncated from the left (keeps the most recent tokens)
  image_key: images  # only relevant for multimodal datasets
  custom_cls:  # optional custom dataset class; both null → use the default dataset implementation
    path: null
    name: null
# Actor / reference-policy / rollout configuration (hybrid colocated engine).
actor_rollout_ref:
  hybrid_engine: true  # actor training and rollout share the same workers/GPUs
  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B  # base policy checkpoint (HF hub id)
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true  # trade compute for memory during backprop
    use_remove_padding: false
    use_liger: false
    save_hf_repo_id: RyanYr/brm-ace-r1qwen1.5B-base-lr2.5e-6-beta0.002  # HF repo checkpoints are pushed to
    tokenizer_chat_template: null
  actor:
    brm:  # NOTE(review): BRM-specific options — semantics defined by the consuming trainer; not documented here
      norm_factor: value
      value_constant: null
    buffer:
      buffer_type: null  # null → no replay/offline buffer is used
      offline_dataset_buffer:  # inactive while buffer_type is null — presumably; verify
        train_files: null
        response_key: response
        response_truncation: right
        shuffle: true
        update_size: 1024
    strategy: fsdp  # fully-sharded data parallel training backend
    ppo_mini_batch_size: 1024  # equal to data.train_batch_size → one mini-batch per step
    ppo_micro_batch_size: null  # deprecated global form; per-GPU value below is used instead
    ppo_micro_batch_size_per_gpu: 2
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    use_torch_compile: true
    ppo_epochs: 1  # single pass over each rollout batch
    shuffle: false
    ulysses_sequence_parallel_size: 1  # 1 → sequence parallelism disabled
    checkpoint:
      contents:  # what each saved checkpoint includes
      - model
      - optimizer
      - extra
    optim:
      lr: 2.5e-06
      lr_warmup_steps: -1  # -1: presumably falls back to lr_warmup_steps_ratio — verify
      lr_warmup_steps_ratio: 0
      min_lr_ratio: null
      warmup_style: constant
      total_training_steps: 100  # matches trainer.total_training_steps
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1  # -1: shard across all available ranks
    report_entropy: false
  ref:  # frozen reference policy for the KL penalty
    ref_model_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B  # same checkpoint as the actor's starting point
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 4
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  rollout:  # generation settings for collecting on-policy samples
    name: vllm  # inference backend
    temperature: 1.0  # training-time sampling; val uses val_kwargs below
    top_k: -1  # -1 disables top-k filtering
    top_p: 1
    use_fire_sampling: false
    prompt_length: 768  # matches data.max_prompt_length
    response_length: 13312  # matches data.max_response_length
    dtype: bfloat16
    gpu_memory_utilization: 0.8
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    max_num_batched_tokens: 14080  # 768 + 13312 = one full prompt+response
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 2
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    'n': 1  # samples per prompt during training; quoted: bare n is a YAML 1.1 boolean
    engine_kwargs:
      swap_space: null
    val_kwargs:  # validation-time sampling overrides
      top_k: -1
      top_p: 1.0
      temperature: 0.6
      'n': 32  # 32 samples per prompt for validation (e.g. pass@k / mean-score eval)
      do_sample: true
# Model-based reward model — disabled; rewards come from the rule-based manager below.
reward_model:
  enable: false  # no RM forward pass is run
  strategy: fsdp
  model:  # unused while enable is false — presumably; verify
    input_tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 16384
  reward_manager: prime  # rule-based reward manager used to score responses
  reward_kwargs:
    format_reward: 0.0  # no extra reward for output formatting
    format_type: null
# Optional user-supplied reward function; path null → fall back to built-in scoring.
custom_reward_function:
  path: null
  name: compute_score  # function name looked up in `path` when one is provided
# Trainer orchestration: run length, logging, checkpointing, resume behavior.
trainer:
  balance_batch: true
  total_epochs: 100
  total_training_steps: 100  # hard step cap; matches actor optim.total_training_steps
  project_name: value-LLM  # wandb project
  experiment_name: brm-ace-r1qwen1.5B-base_lr2.5e-6-beta0.002  # wandb run name
  logger:
  - console
  - wandb
  log_val_generations: 0  # 0 → do not log sampled validation generations
  nnodes: 1
  n_gpus_per_node: 8  # single node, 8 GPUs
  save_freq: 5  # checkpoint every 5 steps
  resume_mode: auto  # auto-detect an existing local checkpoint to resume from
  resume_from_path: null
  val_before_train: false
  test_freq: -1  # -1 → no periodic validation during training
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: ./BRM  # local checkpoint directory
  max_actor_ckpt_to_keep: 1  # only the most recent actor checkpoint is retained
  ray_wait_register_center_timeout: 300
  hf_token: null
  resume_from_hf:  # resume training from a HF hub checkpoint
    enable: true
    hf_repo_id: RyanYr/brm-ace-r1qwen1.5B-base-lr2.5e-6-beta0.002  # same repo checkpoints are saved to
    hf_token: null
    revision: 72e5bab3311c466c3679cf6f9953b2497095762c  # pinned commit to resume from
# RL algorithm hyperparameters.
algorithm:
  kl_ctrl:
    kl_coef: 0.002  # KL-penalty coefficient (the beta-0.002 in the experiment name)