# Training-run configuration.
# Top-level sections: data, actor_rollout_ref, reward_model,
# custom_reward_function, trainer, algorithm.
#
# NOTE(review): the nesting below was reconstructed from key order after the
# original indentation was lost. Keys that could sit one level higher or
# lower are flagged inline -- confirm them against the consuming trainer's
# schema before relying on this file.

data:
  tokenizer: null
  train_files: ace-numina_ds_train_sample.parquet
  val_files: matheval.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 768
  max_response_length: 13312
  train_batch_size: 1024
  val_batch_size: 640
  return_raw_input_ids: false
  return_raw_chat: false
  shuffle: true
  filter_overlong_prompts: false
  filter_overlong_prompts_workers: 1
  filter_overlong_responses: true
  truncation: left
  image_key: images
  custom_cls:
    path: null
    name: null

actor_rollout_ref:
  hybrid_engine: true

  model:
    path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    use_remove_padding: false
    use_liger: false
    save_hf_repo_id: RyanYr/brm-ace-numina-r1qwen1.5B-base-lr2.5e-6-beta0.002
    tokenizer_chat_template: null

  actor:
    brm:
      norm_factor: value
      value_constant: null
    buffer:
      buffer_type: null
      offline_dataset_buffer:
        train_files: null
        response_key: response
        response_truncation: right
        shuffle: true
        # NOTE(review): update_size follows offline_dataset_buffer's keys in
        # the original; it may instead belong directly under `buffer` --
        # confirm.
        update_size: 1024
    strategy: fsdp
    # Same value as data.train_batch_size (1024).
    ppo_mini_batch_size: 1024
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 2
    use_dynamic_bsz: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    use_torch_compile: true
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
        - model
        - optimizer
        - extra
    optim:
      # Matches the "lr2.5e-6" tag in trainer.experiment_name.
      lr: 2.5e-06
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0
      min_lr_ratio: null
      warmup_style: constant
      # Same value as trainer.total_training_steps (100).
      total_training_steps: 100
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      fsdp_size: -1
    # NOTE(review): report_entropy directly follows fsdp_config's keys in the
    # original; placed at actor level here -- confirm it is not an
    # fsdp_config key.
    report_entropy: false

  ref:
    # Same model id as actor_rollout_ref.model.path.
    ref_model_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    strategy: fsdp
    fsdp_config:
      param_offload: false
      wrap_policy:
        min_num_params: 0
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 4
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1

  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1
    top_p: 1
    use_fire_sampling: false
    # Same values as data.max_prompt_length / data.max_response_length.
    prompt_length: 768
    response_length: 13312
    dtype: bfloat16
    gpu_memory_utilization: 0.8
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    tensor_model_parallel_size: 4
    # Equals prompt_length + response_length (768 + 13312).
    max_num_batched_tokens: 14080
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 2
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    # Key stays quoted: a bare `n` is boolean-like under YAML 1.1.
    'n': 1
    engine_kwargs:
      swap_space: null
    # Sampling settings listed separately for validation.
    val_kwargs:
      top_k: -1
      top_p: 1.0
      temperature: 0.6
      'n': 32
      do_sample: true

reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: null
    use_remove_padding: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 16384
  reward_manager: prime
  # NOTE(review): reward_kwargs follows reward_manager in the original and is
  # kept under reward_model here; it could also be a top-level section --
  # confirm.
  reward_kwargs:
    format_reward: 0.0
    format_type: null

custom_reward_function:
  path: null
  name: compute_score

trainer:
  balance_batch: true
  total_epochs: 100
  total_training_steps: 100
  project_name: value-LLM
  experiment_name: brm-ace-numina-r1qwen1.5B-base_lr2.5e-6-beta0.002
  logger:
    - console
    - wandb
  log_val_generations: 0
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: 5
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: ./BRM
  max_actor_ckpt_to_keep: 1
  ray_wait_register_center_timeout: 300
  hf_token: null
  resume_from_hf:
    enable: false
    hf_repo_id: null
    hf_token: null
    revision: main

algorithm:
  kl_ctrl:
    # Matches the "beta0.002" tag in trainer.experiment_name.
    kl_coef: 0.002