{ "best_global_step": 50, "best_metric": 0.0384465716779232, "best_model_checkpoint": null, "epoch": 2.98876404494382, "eval_steps": 5, "global_step": 66, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2247191011235955, "grad_norm": 0.015925930812954903, "learning_rate": 9.393939393939395e-07, "loss": 0.0743, "step": 5 }, { "epoch": 0.2247191011235955, "eval_clip_ratio": 0.0, "eval_completion_length": 805.7142857142857, "eval_kl": 2.4131366184779574e-05, "eval_loss": 0.08203230053186417, "eval_num_tokens": 80747.0, "eval_reward": 0.5892857142857143, "eval_reward_std": 0.41495721680777414, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.017857142857142856, "eval_rewards/returns_int_reward_func": 0.5714285714285714, "eval_runtime": 916.367, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 5 }, { "epoch": 0.449438202247191, "grad_norm": 0.020018301904201508, "learning_rate": 8.636363636363636e-07, "loss": 0.1024, "step": 10 }, { "epoch": 0.449438202247191, "eval_clip_ratio": 0.0, "eval_completion_length": 779.8571428571429, "eval_kl": 2.630267824445452e-05, "eval_loss": 0.11342400312423706, "eval_num_tokens": 161176.0, "eval_reward": 0.6428571428571429, "eval_reward_std": 0.42048223529543194, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.017857142857142856, "eval_rewards/returns_int_reward_func": 0.625, "eval_runtime": 903.8295, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 10 }, { "epoch": 0.6741573033707865, "grad_norm": 0.017852170392870903, "learning_rate": 7.878787878787878e-07, "loss": 0.0572, "step": 15 }, { "epoch": 0.6741573033707865, "eval_clip_ratio": 0.0, "eval_completion_length": 765.5714285714286, "eval_kl": 2.2522040775844028e-05, "eval_loss": 0.10111860185861588, "eval_num_tokens": 237628.0, "eval_reward": 0.625, "eval_reward_std": 0.3737179126058306, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.0, "eval_rewards/returns_int_reward_func": 0.625, "eval_runtime": 948.5061, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 15 }, { "epoch": 0.898876404494382, "grad_norm": 0.009381658397614956, "learning_rate": 7.121212121212121e-07, "loss": 0.0721, "step": 20 }, { "epoch": 0.898876404494382, "eval_clip_ratio": 0.0, "eval_completion_length": 770.2142857142857, "eval_kl": 2.6115349360874722e-05, "eval_loss": 0.08618057519197464, "eval_num_tokens": 321515.0, "eval_reward": 0.6607142857142857, "eval_reward_std": 0.3907997948782785, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.017857142857142856, "eval_rewards/returns_int_reward_func": 0.6428571428571429, "eval_runtime": 901.0677, "eval_samples_per_second": 0.016, "eval_steps_per_second": 0.004, "step": 20 }, { "epoch": 1.1348314606741572, "grad_norm": 0.011156530119478703, "learning_rate": 6.363636363636363e-07, "loss": 0.0267, "step": 25 }, { "epoch": 1.1348314606741572, "eval_clip_ratio": 0.0, "eval_completion_length": 719.4464285714286, "eval_kl": 2.654961177280971e-05, "eval_loss": 0.10328873991966248, "eval_num_tokens": 402609.0, "eval_reward": 0.6071428571428571, "eval_reward_std": 0.3681928941181728, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.0, "eval_rewards/returns_int_reward_func": 0.6071428571428571, "eval_runtime": 841.536, "eval_samples_per_second": 0.017, "eval_steps_per_second": 0.005, "step": 25 }, { "epoch": 1.3595505617977528, "grad_norm": 0.023633191362023354, "learning_rate": 5.606060606060605e-07, "loss": 0.0791, "step": 30 }, { "epoch": 1.3595505617977528, "eval_clip_ratio": 0.0, "eval_completion_length": 750.1607142857143, "eval_kl": 2.5672571999686105e-05, "eval_loss": 0.18935348093509674, "eval_num_tokens": 485953.0, "eval_reward": 0.48214285714285715, "eval_reward_std": 0.4561965210097177, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.0, "eval_rewards/returns_int_reward_func": 0.48214285714285715, "eval_runtime": 928.6197, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 30 }, { "epoch": 1.5842696629213484, "grad_norm": 0.019900217652320862, "learning_rate": 4.848484848484849e-07, "loss": 0.0917, "step": 35 }, { "epoch": 1.5842696629213484, "eval_clip_ratio": 0.0, "eval_completion_length": 801.1785714285714, "eval_kl": 2.525533948625837e-05, "eval_loss": 0.08746703714132309, "eval_num_tokens": 566630.0, "eval_reward": 0.5892857142857143, "eval_reward_std": 0.38476794958114624, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.0, "eval_rewards/returns_int_reward_func": 0.5892857142857143, "eval_runtime": 929.2943, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 35 }, { "epoch": 1.8089887640449438, "grad_norm": 0.02139870449900627, "learning_rate": 4.090909090909091e-07, "loss": 0.0303, "step": 40 }, { "epoch": 1.8089887640449438, "eval_clip_ratio": 0.0, "eval_completion_length": 764.3035714285714, "eval_kl": 2.3505517414637973e-05, "eval_loss": 0.052344292402267456, "eval_num_tokens": 646240.0, "eval_reward": 0.6428571428571429, "eval_reward_std": 0.45067150252205984, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.0, "eval_rewards/returns_int_reward_func": 0.6428571428571429, "eval_runtime": 933.2582, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 40 }, { "epoch": 2.044943820224719, "grad_norm": 0.01629846915602684, "learning_rate": 3.333333333333333e-07, "loss": 0.0706, "step": 45 }, { "epoch": 2.044943820224719, "eval_clip_ratio": 0.0, "eval_completion_length": 803.4642857142857, "eval_kl": 2.7345759528023856e-05, "eval_loss": 0.06691782921552658, "eval_num_tokens": 731068.0, "eval_reward": 0.5178571428571429, "eval_reward_std": 0.41495721680777414, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.0, "eval_rewards/returns_int_reward_func": 0.5178571428571429, "eval_runtime": 947.4151, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 45 }, { "epoch": 2.2696629213483144, "grad_norm": 0.011612426489591599, "learning_rate": 2.5757575757575754e-07, "loss": 0.0385, "step": 50 }, { "epoch": 2.2696629213483144, "eval_clip_ratio": 0.0, "eval_completion_length": 759.1428571428571, "eval_kl": 2.525108201163156e-05, "eval_loss": 0.0384465716779232, "eval_num_tokens": 808379.0, "eval_reward": 0.6428571428571429, "eval_reward_std": 0.32695358991622925, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.017857142857142856, "eval_rewards/returns_int_reward_func": 0.625, "eval_runtime": 879.4095, "eval_samples_per_second": 0.016, "eval_steps_per_second": 0.005, "step": 50 }, { "epoch": 2.49438202247191, "grad_norm": 0.020006077364087105, "learning_rate": 1.818181818181818e-07, "loss": 0.0878, "step": 55 }, { "epoch": 2.49438202247191, "eval_clip_ratio": 0.0, "eval_completion_length": 806.8928571428571, "eval_kl": 2.43570123400007e-05, "eval_loss": 0.08411531895399094, "eval_num_tokens": 891396.0, "eval_reward": 0.6607142857142857, "eval_reward_std": 0.4039071798324585, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.017857142857142856, "eval_rewards/returns_int_reward_func": 0.6428571428571429, "eval_runtime": 938.1977, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 55 }, { "epoch": 2.7191011235955056, "grad_norm": 0.023959912359714508, "learning_rate": 1.0606060606060605e-07, "loss": 0.0497, "step": 60 }, { "epoch": 2.7191011235955056, "eval_clip_ratio": 0.0, "eval_completion_length": 722.9285714285714, "eval_kl": 2.206223351614816e-05, "eval_loss": 0.07995037734508514, "eval_num_tokens": 973773.0, "eval_reward": 0.6071428571428571, "eval_reward_std": 0.42048223529543194, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.017857142857142856, "eval_rewards/returns_int_reward_func": 0.5892857142857143, "eval_runtime": 887.1993, "eval_samples_per_second": 0.016, "eval_steps_per_second": 0.005, "step": 60 }, { "epoch": 2.943820224719101, "grad_norm": 0.013062759302556515, "learning_rate": 3.0303030303030305e-08, "loss": 0.0463, "step": 65 }, { "epoch": 2.943820224719101, "eval_clip_ratio": 0.0, "eval_completion_length": 745.4642857142857, "eval_kl": 2.300313540867397e-05, "eval_loss": 0.08457580208778381, "eval_num_tokens": 1055539.0, "eval_reward": 0.6428571428571429, "eval_reward_std": 0.32695358991622925, "eval_rewards/equation_reward_func": 0.0, "eval_rewards/format_reward_func": 0.0, "eval_rewards/returns_int_reward_func": 0.6428571428571429, "eval_runtime": 910.5507, "eval_samples_per_second": 0.015, "eval_steps_per_second": 0.004, "step": 65 } ], "logging_steps": 5, "max_steps": 66, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 15, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }