{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9211087420042645, "eval_steps": 100, "global_step": 170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 613.281721496582, "epoch": 0.08528784648187633, "grad_norm": 4.2232208251953125, "kl": 0.00017681121826171876, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.6537946701049805, "reward_std": 0.3218739811331034, "rewards/accuracy_reward": 0.6537946701049805, "rewards/format_reward2": 0.0, "step": 5 }, { "completion_length": 618.741323852539, "epoch": 0.17057569296375266, "grad_norm": 0.7694841027259827, "kl": 0.23766450881958007, "learning_rate": 2.956412726139078e-06, "loss": 0.0095, "reward": 0.7125000342726707, "reward_std": 0.26975345350801944, "rewards/accuracy_reward": 0.7125000342726707, "rewards/format_reward2": 0.0, "step": 10 }, { "completion_length": 625.0417678833007, "epoch": 0.255863539445629, "grad_norm": 0.11407410353422165, "kl": 0.00294036865234375, "learning_rate": 2.7836719084521715e-06, "loss": 0.0001, "reward": 0.7500000342726707, "reward_std": 0.2328610809519887, "rewards/accuracy_reward": 0.7500000342726707, "rewards/format_reward2": 0.0, "step": 15 }, { "completion_length": 604.2779296875, "epoch": 0.3411513859275053, "grad_norm": 0.10821383446455002, "kl": 0.0032527923583984377, "learning_rate": 2.4946839873611927e-06, "loss": 0.0001, "reward": 0.7553571820259094, "reward_std": 0.21391915678977966, "rewards/accuracy_reward": 0.7553571820259094, "rewards/format_reward2": 0.0, "step": 20 }, { "completion_length": 611.8196701049804, "epoch": 0.42643923240938164, "grad_norm": 0.1587938815355301, "kl": 0.003980827331542969, "learning_rate": 2.1156192081791355e-06, "loss": 0.0002, "reward": 0.755357176065445, "reward_std": 0.1927801643498242, "rewards/accuracy_reward": 0.755357176065445, "rewards/format_reward2": 0.0, "step": 25 }, { "completion_length": 614.662525177002, "epoch": 0.511727078891258, "grad_norm": 0.1258433312177658, "kl": 0.0038448333740234374, "learning_rate": 1.6808050203829845e-06, "loss": 0.0002, "reward": 0.7575893223285675, "reward_std": 0.18457154743373394, "rewards/accuracy_reward": 0.7575893223285675, "rewards/format_reward2": 0.0, "step": 30 }, { "completion_length": 599.8576171875, "epoch": 0.5970149253731343, "grad_norm": 0.088959701359272, "kl": 0.004636383056640625, "learning_rate": 1.2296174432791415e-06, "loss": 0.0002, "reward": 0.7524553865194321, "reward_std": 0.1772271953523159, "rewards/accuracy_reward": 0.7524553865194321, "rewards/format_reward2": 0.0, "step": 35 }, { "completion_length": 587.4435562133789, "epoch": 0.6823027718550106, "grad_norm": 0.08021266013383865, "kl": 0.004290008544921875, "learning_rate": 8.029152419343472e-07, "loss": 0.0002, "reward": 0.7700893208384514, "reward_std": 0.1737084028776735, "rewards/accuracy_reward": 0.7700893208384514, "rewards/format_reward2": 0.0, "step": 40 }, { "completion_length": 605.4605155944824, "epoch": 0.767590618336887, "grad_norm": 0.0632205456495285, "kl": 0.003778839111328125, "learning_rate": 4.3933982822017883e-07, "loss": 0.0002, "reward": 0.7537946745753288, "reward_std": 0.1909334436058998, "rewards/accuracy_reward": 0.7537946745753288, "rewards/format_reward2": 0.0, "step": 45 }, { "completion_length": 606.9638671875, "epoch": 0.8528784648187633, "grad_norm": 0.08492382615804672, "kl": 0.003963851928710937, "learning_rate": 1.718159615201853e-07, "loss": 0.0002, "reward": 0.7578125342726707, "reward_std": 0.1673258093651384, "rewards/accuracy_reward": 0.7578125342726707, "rewards/format_reward2": 0.0, "step": 50 }, { "completion_length": 599.2339523315429, "epoch": 0.9381663113006397, "grad_norm": 0.07253226637840271, "kl": 0.011330032348632812, "learning_rate": 2.4570139579284723e-08, "loss": 0.0005, "reward": 0.7895089671015739, "reward_std": 0.17561167925596238, "rewards/accuracy_reward": 0.7895089671015739, "rewards/format_reward2": 0.0, "step": 55 }, { "completion_length": 578.6864166259766, "epoch": 1.0341151385927505, "grad_norm": 0.12633396685123444, "kl": 0.003711700439453125, "learning_rate": 5.358185854701909e-07, "loss": 0.0001, "reward": 0.7918527200818062, "reward_std": 0.17118105152621865, "rewards/accuracy_reward": 0.7918527200818062, "rewards/format_reward4": 0.0, "step": 60 }, { "completion_length": 579.6618591308594, "epoch": 1.1194029850746268, "grad_norm": 0.0748809352517128, "kl": 0.00350341796875, "learning_rate": 3.0996998956314745e-07, "loss": 0.0001, "reward": 0.7732143223285675, "reward_std": 0.1733042700216174, "rewards/accuracy_reward": 0.7732143223285675, "rewards/format_reward4": 0.0, "step": 65 }, { "completion_length": 564.5884208679199, "epoch": 1.2046908315565032, "grad_norm": 0.17819823324680328, "kl": 0.003855133056640625, "learning_rate": 1.405383194450251e-07, "loss": 0.0002, "reward": 0.8015625357627869, "reward_std": 0.16864687129855155, "rewards/accuracy_reward": 0.8015625357627869, "rewards/format_reward4": 0.0, "step": 70 }, { "completion_length": 561.2647575378418, "epoch": 1.2899786780383795, "grad_norm": 0.08583667129278183, "kl": 0.0035480499267578126, "learning_rate": 3.5555989320099955e-08, "loss": 0.0001, "reward": 0.7986607506871224, "reward_std": 0.15244121365249158, "rewards/accuracy_reward": 0.7986607506871224, "rewards/format_reward4": 0.0, "step": 75 }, { "completion_length": 572.7236892700196, "epoch": 1.375266524520256, "grad_norm": 0.13771697878837585, "kl": 0.0034299850463867187, "learning_rate": 0.0, "loss": 0.0001, "reward": 0.8002232491970063, "reward_std": 0.15689483480527996, "rewards/accuracy_reward": 0.8002232491970063, "rewards/format_reward4": 0.0, "step": 80 }, { "completion_length": 578.9810539245606, "epoch": 1.4605543710021323, "grad_norm": 2.014723777770996, "kl": 0.010892486572265625, "learning_rate": 2.0096189432334195e-07, "loss": 0.0004, "reward": 0.7611607536673546, "reward_std": 0.18238836526870728, "rewards/accuracy_reward": 0.7611607536673546, "rewards/format_reward4": 0.0, "step": 85 }, { "completion_length": 589.071681213379, "epoch": 1.5458422174840085, "grad_norm": 0.07733023166656494, "kl": 0.00322265625, "learning_rate": 9.046106882113752e-08, "loss": 0.0001, "reward": 0.7718750327825546, "reward_std": 0.16937124980613588, "rewards/accuracy_reward": 0.7718750327825546, "rewards/format_reward4": 0.0, "step": 90 }, { "completion_length": 584.4857421875, "epoch": 1.6311300639658848, "grad_norm": 0.08772465586662292, "kl": 0.003212738037109375, "learning_rate": 2.278837048168797e-08, "loss": 0.0001, "reward": 0.765625037252903, "reward_std": 0.17046672012656927, "rewards/accuracy_reward": 0.765625037252903, "rewards/format_reward4": 0.0, "step": 95 }, { "completion_length": 584.8821662902832, "epoch": 1.716417910447761, "grad_norm": 0.09350935369729996, "kl": 0.0032810211181640626, "learning_rate": 0.0, "loss": 0.0001, "reward": 0.7743303939700127, "reward_std": 0.17159467502497136, "rewards/accuracy_reward": 0.7743303939700127, "rewards/format_reward4": 0.0, "step": 100 }, { "epoch": 1.716417910447761, "eval_completion_length": 572.5716213433507, "eval_kl": 0.004271193434255192, "eval_loss": 0.00017078800010494888, "eval_reward": 0.6897820954124767, "eval_reward_std": 0.20645368708589207, "eval_rewards/accuracy_reward": 0.6897820954124767, "eval_rewards/format_reward4": 0.0, "eval_runtime": 10542.252, "eval_samples_per_second": 0.474, "eval_steps_per_second": 0.004, "step": 100 }, { "completion_length": 593.587084197998, "epoch": 1.8017057569296375, "grad_norm": 0.07390806823968887, "kl": 0.00316009521484375, "learning_rate": 3.2546120637356677e-07, "loss": 0.0001, "reward": 0.7497768238186836, "reward_std": 0.18519118977710605, "rewards/accuracy_reward": 0.7497768238186836, "rewards/format_reward4": 0.0, "step": 105 }, { "completion_length": 589.4265899658203, "epoch": 1.886993603411514, "grad_norm": 0.07380172610282898, "kl": 0.0030284881591796874, "learning_rate": 2.1114787115667477e-07, "loss": 0.0001, "reward": 0.7691964611411095, "reward_std": 0.1665174851194024, "rewards/accuracy_reward": 0.7691964611411095, "rewards/format_reward4": 0.0, "step": 110 }, { "completion_length": 581.2207862854004, "epoch": 1.9722814498933903, "grad_norm": 0.06949003785848618, "kl": 0.003309822082519531, "learning_rate": 1.2003083451176365e-07, "loss": 0.0001, "reward": 0.7843750312924385, "reward_std": 0.17008549151942134, "rewards/accuracy_reward": 0.7843750312924385, "rewards/format_reward4": 0.0, "step": 115 }, { "completion_length": 576.8970560709636, "epoch": 2.068230277185501, "grad_norm": 0.0717669427394867, "kl": 0.0029771592881944445, "learning_rate": 5.374998819965654e-08, "loss": 0.0001, "reward": 0.7791667024294535, "reward_std": 0.1727727702094449, "rewards/accuracy_reward": 0.7791667024294535, "rewards/format_reward4": 0.0, "step": 120 }, { "completion_length": 552.3944480895996, "epoch": 2.1535181236673773, "grad_norm": 3.9083805084228516, "kl": 0.00341339111328125, "learning_rate": 1.3498231131137295e-08, "loss": 0.0001, "reward": 0.7910714641213417, "reward_std": 0.15105125531554223, "rewards/accuracy_reward": 0.7910714641213417, "rewards/format_reward4": 0.0, "step": 125 }, { "completion_length": 586.4814987182617, "epoch": 2.2388059701492535, "grad_norm": 0.08841477334499359, "kl": 0.0033771514892578123, "learning_rate": 0.0, "loss": 0.0001, "reward": 0.7645089626312256, "reward_std": 0.16334964451380074, "rewards/accuracy_reward": 0.7645089626312256, "rewards/format_reward4": 0.0, "step": 130 }, { "completion_length": 562.0841804504395, "epoch": 2.3240938166311302, "grad_norm": 0.0749669224023819, "kl": 0.003040122985839844, "learning_rate": 3.709719800782133e-07, "loss": 0.0001, "reward": 0.8017857521772385, "reward_std": 0.1651908096857369, "rewards/accuracy_reward": 0.8017857521772385, "rewards/format_reward4": 0.0, "step": 135 }, { "completion_length": 568.1428817749023, "epoch": 2.4093816631130065, "grad_norm": 0.06772708147764206, "kl": 0.0032039642333984374, "learning_rate": 2.757046314656676e-07, "loss": 0.0001, "reward": 0.799776816368103, "reward_std": 0.15835139667615294, "rewards/accuracy_reward": 0.799776816368103, "rewards/format_reward4": 0.0, "step": 140 }, { "completion_length": 576.2410957336426, "epoch": 2.4946695095948828, "grad_norm": 0.07914450764656067, "kl": 0.0030412673950195312, "learning_rate": 1.9333050887001336e-07, "loss": 0.0001, "reward": 0.772544676065445, "reward_std": 0.1686524854041636, "rewards/accuracy_reward": 0.772544676065445, "rewards/format_reward4": 0.0, "step": 145 }, { "completion_length": 580.0102951049805, "epoch": 2.579957356076759, "grad_norm": 0.09259422868490219, "kl": 0.0027721405029296877, "learning_rate": 1.2471710571470578e-07, "loss": 0.0001, "reward": 0.7595982506871224, "reward_std": 0.15487177977338434, "rewards/accuracy_reward": 0.7595982506871224, "rewards/format_reward4": 0.0, "step": 150 }, { "completion_length": 568.7931098937988, "epoch": 2.6652452025586353, "grad_norm": 0.33623555302619934, "kl": 0.003169822692871094, "learning_rate": 7.058699935926527e-08, "loss": 0.0001, "reward": 0.7776786103844643, "reward_std": 0.1605815477669239, "rewards/accuracy_reward": 0.7776786103844643, "rewards/format_reward4": 0.0, "step": 155 }, { "completion_length": 597.3964576721191, "epoch": 2.750533049040512, "grad_norm": 0.07021904736757278, "kl": 0.0028789520263671877, "learning_rate": 3.151024153589321e-08, "loss": 0.0001, "reward": 0.7546875327825546, "reward_std": 0.18458664841018618, "rewards/accuracy_reward": 0.7546875327825546, "rewards/format_reward4": 0.0, "step": 160 }, { "completion_length": 582.8522552490234, "epoch": 2.835820895522388, "grad_norm": 0.10628537088632584, "kl": 0.0029039382934570312, "learning_rate": 7.898355054830719e-09, "loss": 0.0001, "reward": 0.7602678939700127, "reward_std": 0.17046223413199185, "rewards/accuracy_reward": 0.7602678939700127, "rewards/format_reward4": 0.0, "step": 165 }, { "completion_length": 577.0710075378418, "epoch": 2.9211087420042645, "grad_norm": 0.09094743430614471, "kl": 0.0032497406005859374, "learning_rate": 0.0, "loss": 0.0001, "reward": 0.779241107404232, "reward_std": 0.16634787572547793, "rewards/accuracy_reward": 0.779241107404232, "rewards/format_reward4": 0.0, "step": 170 }, { "epoch": 2.9211087420042645, "step": 170, "total_flos": 0.0, "train_loss": 2.8551620540811734e-05, "train_runtime": 11823.9785, "train_samples_per_second": 12.882, "train_steps_per_second": 0.014 } ], "logging_steps": 5, "max_steps": 170, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }