{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9846153846153847, "eval_steps": 500, "global_step": 178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 239.35076141357422, "epoch": 0.011188811188811189, "grad_norm": 2.018390655517578, "kl": 0.0, "learning_rate": 5.555555555555555e-08, "loss": 0.0, "reward": 1.020408146083355, "reward_std": 0.588430143892765, "rewards/accuracy_reward": 0.07015305897220969, "rewards/format_reward": 0.3928571380674839, "rewards/influence_reward": 0.05229591624811292, "rewards/len_reward": 0.5051020309329033, "step": 1 }, { "completion_length": 240.83513402938843, "epoch": 0.055944055944055944, "grad_norm": 1.5813062191009521, "kl": 0.00015559792518615723, "learning_rate": 2.7777777777777776e-07, "loss": 0.0, "reward": 0.9544004816561937, "reward_std": 0.6496573686599731, "rewards/accuracy_reward": 0.06377550942124799, "rewards/format_reward": 0.39317601080983877, "rewards/influence_reward": 0.0484693865000736, "rewards/len_reward": 0.44897958217188716, "step": 5 }, { "completion_length": 239.87346343994142, "epoch": 0.11188811188811189, "grad_norm": 1.199715256690979, "kl": 0.0015879154205322265, "learning_rate": 5.555555555555555e-07, "loss": 0.0001, "reward": 1.2579081252217292, "reward_std": 0.6406066298484803, "rewards/accuracy_reward": 0.09897958971560002, "rewards/format_reward": 0.6145408108830452, "rewards/influence_reward": 0.06607142747379839, "rewards/len_reward": 0.47831631228327753, "step": 10 }, { "completion_length": 235.00101585388182, "epoch": 0.16783216783216784, "grad_norm": 0.9491918683052063, "kl": 0.012871551513671874, "learning_rate": 8.333333333333333e-07, "loss": 0.0005, "reward": 1.9201530247926712, "reward_std": 0.5499199964106083, "rewards/accuracy_reward": 0.2658163230866194, "rewards/format_reward": 0.9533163100481034, "rewards/influence_reward": 0.19158162884414195, "rewards/len_reward": 0.5094387628138065, "step": 15 }, { "completion_length": 230.20764770507813, "epoch": 0.22377622377622378, "grad_norm": 1.0375124216079712, "kl": 0.034588623046875, "learning_rate": 9.996145181203615e-07, "loss": 0.0015, "reward": 2.358928510546684, "reward_std": 0.4863013714551926, "rewards/accuracy_reward": 0.46224488839507105, "rewards/format_reward": 0.9905612260103226, "rewards/influence_reward": 0.3974489726126194, "rewards/len_reward": 0.5086734544485807, "step": 20 }, { "completion_length": 221.39234199523926, "epoch": 0.27972027972027974, "grad_norm": 1.0581080913543701, "kl": 0.064349365234375, "learning_rate": 9.952846702217885e-07, "loss": 0.0027, "reward": 2.6903060495853426, "reward_std": 0.4831090085208416, "rewards/accuracy_reward": 0.5727040730416775, "rewards/format_reward": 0.9933673486113548, "rewards/influence_reward": 0.516071417927742, "rewards/len_reward": 0.6081632524728775, "step": 25 }, { "completion_length": 223.85866737365723, "epoch": 0.3356643356643357, "grad_norm": 1.002379298210144, "kl": 0.0775390625, "learning_rate": 9.861849601988383e-07, "loss": 0.0032, "reward": 2.7147958517074584, "reward_std": 0.4902082525193691, "rewards/accuracy_reward": 0.552295907586813, "rewards/format_reward": 0.9910714283585549, "rewards/influence_reward": 0.5109693787992, "rewards/len_reward": 0.660459166765213, "step": 30 }, { "completion_length": 242.63366928100587, "epoch": 0.3916083916083916, "grad_norm": 0.8198888301849365, "kl": 0.07734375, "learning_rate": 9.72403023233439e-07, "loss": 0.0032, "reward": 2.704081577062607, "reward_std": 0.4683332860469818, "rewards/accuracy_reward": 0.5767857022583485, "rewards/format_reward": 0.9892857119441032, "rewards/influence_reward": 0.5352040722966194, "rewards/len_reward": 0.6028061106801033, "step": 35 }, { "completion_length": 219.7584140777588, "epoch": 0.44755244755244755, "grad_norm": 0.8813366889953613, "kl": 0.081610107421875, "learning_rate": 9.540715869125407e-07, "loss": 0.0033, "reward": 2.984948921203613, "reward_std": 0.45619520246982576, "rewards/accuracy_reward": 0.620918358117342, "rewards/format_reward": 0.9966836735606194, "rewards/influence_reward": 0.5757652945816517, "rewards/len_reward": 0.7915816128253936, "step": 40 }, { "completion_length": 213.20280342102052, "epoch": 0.5034965034965035, "grad_norm": 0.8298941850662231, "kl": 0.09119873046875, "learning_rate": 9.313671929888959e-07, "loss": 0.0037, "reward": 2.999489736557007, "reward_std": 0.4441244326531887, "rewards/accuracy_reward": 0.6117346778512001, "rewards/format_reward": 0.9974489793181419, "rewards/influence_reward": 0.5594387598335743, "rewards/len_reward": 0.8308673277497292, "step": 45 }, { "completion_length": 239.8193817138672, "epoch": 0.5594405594405595, "grad_norm": 0.8936271667480469, "kl": 0.10096435546875, "learning_rate": 9.045084971874737e-07, "loss": 0.0041, "reward": 2.906377512216568, "reward_std": 0.45472528263926504, "rewards/accuracy_reward": 0.5836734607815742, "rewards/format_reward": 0.9956632673740387, "rewards/influence_reward": 0.5321428462862968, "rewards/len_reward": 0.7948979437351227, "step": 50 }, { "completion_length": 230.65917854309083, "epoch": 0.6153846153846154, "grad_norm": 0.7805132865905762, "kl": 0.1081298828125, "learning_rate": 8.737541634312983e-07, "loss": 0.0043, "reward": 3.0579080879688263, "reward_std": 0.4420431960374117, "rewards/accuracy_reward": 0.607397947460413, "rewards/format_reward": 0.9959183692932129, "rewards/influence_reward": 0.5612244814634323, "rewards/len_reward": 0.8933673277497292, "step": 55 }, { "completion_length": 220.4063720703125, "epoch": 0.6713286713286714, "grad_norm": 1.4386658668518066, "kl": 0.1181640625, "learning_rate": 8.394003727664709e-07, "loss": 0.0047, "reward": 3.0977040231227875, "reward_std": 0.4241327825933695, "rewards/accuracy_reward": 0.6344387613236904, "rewards/format_reward": 0.9956632673740387, "rewards/influence_reward": 0.5821428462862969, "rewards/len_reward": 0.8854591652750969, "step": 60 }, { "completion_length": 228.13443336486816, "epoch": 0.7272727272727273, "grad_norm": 0.6940521597862244, "kl": 0.11019287109375, "learning_rate": 8.017779709767857e-07, "loss": 0.0044, "reward": 2.9755101561546327, "reward_std": 0.4135630540549755, "rewards/accuracy_reward": 0.5908163137733936, "rewards/format_reward": 0.9948979616165161, "rewards/influence_reward": 0.5410714164376259, "rewards/len_reward": 0.8487244695425034, "step": 65 }, { "completion_length": 217.46045417785643, "epoch": 0.7832167832167832, "grad_norm": 0.6972203254699707, "kl": 0.1288818359375, "learning_rate": 7.612492823579744e-07, "loss": 0.0052, "reward": 3.136479526758194, "reward_std": 0.42438863664865495, "rewards/accuracy_reward": 0.6517856940627098, "rewards/format_reward": 0.9956632673740387, "rewards/influence_reward": 0.6112244755029679, "rewards/len_reward": 0.8778061032295227, "step": 70 }, { "completion_length": 217.32397499084473, "epoch": 0.8391608391608392, "grad_norm": 0.995740532875061, "kl": 0.13599853515625, "learning_rate": 7.182046203366709e-07, "loss": 0.0055, "reward": 3.1479591190814973, "reward_std": 0.4072150893509388, "rewards/accuracy_reward": 0.6653061121702194, "rewards/format_reward": 0.9959183692932129, "rewards/influence_reward": 0.6081632517278195, "rewards/len_reward": 0.8785714104771614, "step": 75 }, { "completion_length": 204.29948654174805, "epoch": 0.8951048951048951, "grad_norm": 0.8313683867454529, "kl": 0.1267333984375, "learning_rate": 6.730585285387465e-07, "loss": 0.0051, "reward": 3.1954080879688265, "reward_std": 0.39486792534589765, "rewards/accuracy_reward": 0.6806122347712517, "rewards/format_reward": 0.9956632673740387, "rewards/influence_reward": 0.6438775353133679, "rewards/len_reward": 0.8752550885081292, "step": 80 }, { "completion_length": 203.56530113220214, "epoch": 0.951048951048951, "grad_norm": 0.7973130345344543, "kl": 0.126806640625, "learning_rate": 6.262457885075789e-07, "loss": 0.0051, "reward": 3.2344387233257295, "reward_std": 0.3648144776001573, "rewards/accuracy_reward": 0.6961734496057034, "rewards/format_reward": 0.9956632673740387, "rewards/influence_reward": 0.6497448846697808, "rewards/len_reward": 0.8928571224212647, "step": 85 }, { "completion_length": 221.41661420549664, "epoch": 1.0, "grad_norm": 1.3692800998687744, "kl": 0.13193359375, "learning_rate": 5.782172325201155e-07, "loss": 0.0046, "reward": 3.196792949948992, "reward_std": 0.3790252791983741, "rewards/accuracy_reward": 0.676093282018389, "rewards/format_reward": 0.9962099126407078, "rewards/influence_reward": 0.6265305961881366, "rewards/len_reward": 0.897959165913718, "step": 90 }, { "completion_length": 228.48239212036134, "epoch": 1.055944055944056, "grad_norm": 0.9810852408409119, "kl": 0.14306640625, "learning_rate": 5.294354018255944e-07, "loss": 0.0057, "reward": 3.2369897425174714, "reward_std": 0.3675705246627331, "rewards/accuracy_reward": 0.6959183529019356, "rewards/format_reward": 0.9979591846466065, "rewards/influence_reward": 0.6489795804023742, "rewards/len_reward": 0.8941326335072517, "step": 95 }, { "completion_length": 228.94004592895507, "epoch": 1.1118881118881119, "grad_norm": 0.814385712146759, "kl": 0.14249267578125, "learning_rate": 4.803700921204658e-07, "loss": 0.0057, "reward": 3.299489712715149, "reward_std": 0.36674671024084093, "rewards/accuracy_reward": 0.7234693765640259, "rewards/format_reward": 0.9969387769699096, "rewards/influence_reward": 0.6719387613236905, "rewards/len_reward": 0.9071428373456001, "step": 100 }, { "completion_length": 227.59565849304198, "epoch": 1.167832167832168, "grad_norm": 1.3440909385681152, "kl": 0.15057373046875, "learning_rate": 4.3149382915901606e-07, "loss": 0.006, "reward": 3.2344387233257295, "reward_std": 0.3843592546880245, "rewards/accuracy_reward": 0.6931122295558453, "rewards/format_reward": 0.9966836735606194, "rewards/influence_reward": 0.6403061114251614, "rewards/len_reward": 0.9043367177248001, "step": 105 }, { "completion_length": 220.42397651672363, "epoch": 1.2237762237762237, "grad_norm": 1.2474172115325928, "kl": 0.1584716796875, "learning_rate": 3.8327731807204744e-07, "loss": 0.0063, "reward": 3.296428495645523, "reward_std": 0.38065838664770124, "rewards/accuracy_reward": 0.7209183543920517, "rewards/format_reward": 0.9974489808082581, "rewards/influence_reward": 0.6640305913984775, "rewards/len_reward": 0.9140305906534195, "step": 110 }, { "completion_length": 228.10331192016602, "epoch": 1.2797202797202798, "grad_norm": 1.0222375392913818, "kl": 0.17257080078125, "learning_rate": 3.361849102191533e-07, "loss": 0.0069, "reward": 3.247704017162323, "reward_std": 0.37765960246324537, "rewards/accuracy_reward": 0.698979577422142, "rewards/format_reward": 0.9948979601264, "rewards/influence_reward": 0.6392856992781162, "rewards/len_reward": 0.9145407944917678, "step": 115 }, { "completion_length": 212.8178524017334, "epoch": 1.3356643356643356, "grad_norm": 0.9680814146995544, "kl": 0.14952392578125, "learning_rate": 2.906701312312861e-07, "loss": 0.006, "reward": 3.277550941705704, "reward_std": 0.36851916685700414, "rewards/accuracy_reward": 0.7150510035455226, "rewards/format_reward": 0.9966836750507355, "rewards/influence_reward": 0.659948968142271, "rewards/len_reward": 0.9058673277497291, "step": 120 }, { "completion_length": 218.52907676696776, "epoch": 1.3916083916083917, "grad_norm": 0.8980757594108582, "kl": 0.144287109375, "learning_rate": 2.4717131331100774e-07, "loss": 0.0058, "reward": 3.2676019430160523, "reward_std": 0.34619110673666, "rewards/accuracy_reward": 0.7015306010842324, "rewards/format_reward": 0.9969387769699096, "rewards/influence_reward": 0.6543367207050323, "rewards/len_reward": 0.9147959008812905, "step": 125 }, { "completion_length": 220.4456588745117, "epoch": 1.4475524475524475, "grad_norm": 1.6906476020812988, "kl": 0.1458251953125, "learning_rate": 2.0610737385376348e-07, "loss": 0.0058, "reward": 3.244897884130478, "reward_std": 0.34559424556791785, "rewards/accuracy_reward": 0.7017857000231743, "rewards/format_reward": 0.9964285731315613, "rewards/influence_reward": 0.648724476993084, "rewards/len_reward": 0.8979591608047486, "step": 130 }, { "completion_length": 222.28366889953614, "epoch": 1.5034965034965035, "grad_norm": 2.0042786598205566, "kl": 0.15557861328125, "learning_rate": 1.6787378104435929e-07, "loss": 0.0062, "reward": 3.2635203421115877, "reward_std": 0.3759432673454285, "rewards/accuracy_reward": 0.7086734533309936, "rewards/format_reward": 0.9946428582072258, "rewards/influence_reward": 0.6635203965008258, "rewards/len_reward": 0.8966836482286453, "step": 135 }, { "completion_length": 226.27831115722657, "epoch": 1.5594405594405596, "grad_norm": 1.0714577436447144, "kl": 0.14898681640625, "learning_rate": 1.3283874528215733e-07, "loss": 0.006, "reward": 3.270918291807175, "reward_std": 0.3628778774291277, "rewards/accuracy_reward": 0.713775496929884, "rewards/format_reward": 0.9964285716414452, "rewards/influence_reward": 0.6655612140893936, "rewards/len_reward": 0.8951530426740646, "step": 140 }, { "completion_length": 216.52473983764648, "epoch": 1.6153846153846154, "grad_norm": 1.0027109384536743, "kl": 0.1531005859375, "learning_rate": 1.013396731136465e-07, "loss": 0.0061, "reward": 3.3109693050384523, "reward_std": 0.3781158674508333, "rewards/accuracy_reward": 0.7298469215631485, "rewards/format_reward": 0.9979591846466065, "rewards/influence_reward": 0.6729591734707355, "rewards/len_reward": 0.9102040633559227, "step": 145 }, { "completion_length": 214.229333114624, "epoch": 1.6713286713286712, "grad_norm": 1.2131189107894897, "kl": 0.16943359375, "learning_rate": 7.36799178229539e-08, "loss": 0.0068, "reward": 3.2859693229198457, "reward_std": 0.35086961574852465, "rewards/accuracy_reward": 0.7020408011972904, "rewards/format_reward": 0.9961734697222709, "rewards/influence_reward": 0.6668367192149163, "rewards/len_reward": 0.920918345451355, "step": 150 }, { "completion_length": 218.42678260803223, "epoch": 1.7272727272727273, "grad_norm": 28.282617568969727, "kl": 0.187109375, "learning_rate": 5.012585797388935e-08, "loss": 0.0075, "reward": 3.2318876922130584, "reward_std": 0.35199854988604784, "rewards/accuracy_reward": 0.6829081475734711, "rewards/format_reward": 0.9956632673740387, "rewards/influence_reward": 0.6392856985330582, "rewards/len_reward": 0.9140305936336517, "step": 155 }, { "completion_length": 215.96785354614258, "epoch": 1.7832167832167833, "grad_norm": 1.0345922708511353, "kl": 0.17052001953125, "learning_rate": 3.0904332038757974e-08, "loss": 0.0068, "reward": 3.295152986049652, "reward_std": 0.3517200522124767, "rewards/accuracy_reward": 0.7298469223082066, "rewards/format_reward": 0.9943877562880516, "rewards/influence_reward": 0.6688775345683098, "rewards/len_reward": 0.9020407944917679, "step": 160 }, { "completion_length": 220.99591369628905, "epoch": 1.8391608391608392, "grad_norm": 0.9470728039741516, "kl": 0.16759033203125, "learning_rate": 1.6200453819870118e-08, "loss": 0.0067, "reward": 3.3061223804950712, "reward_std": 0.3646129764616489, "rewards/accuracy_reward": 0.7349489614367485, "rewards/format_reward": 0.9943877577781677, "rewards/influence_reward": 0.6849489629268646, "rewards/len_reward": 0.8918367147445678, "step": 165 }, { "completion_length": 212.3209140777588, "epoch": 1.895104895104895, "grad_norm": 1.04275381565094, "kl": 0.53802490234375, "learning_rate": 6.15582970243117e-09, "loss": 0.0216, "reward": 3.295663195848465, "reward_std": 0.35336949974298476, "rewards/accuracy_reward": 0.7135203927755356, "rewards/format_reward": 0.9966836735606194, "rewards/influence_reward": 0.6688775405287742, "rewards/len_reward": 0.9165816113352776, "step": 170 }, { "completion_length": 214.2193832397461, "epoch": 1.951048951048951, "grad_norm": 1.4784653186798096, "kl": 0.1680908203125, "learning_rate": 8.671949076420881e-10, "loss": 0.0067, "reward": 3.292857068777084, "reward_std": 0.3482844814658165, "rewards/accuracy_reward": 0.7214285537600518, "rewards/format_reward": 0.9946428582072258, "rewards/influence_reward": 0.6732142709195614, "rewards/len_reward": 0.9035714089870452, "step": 175 }, { "completion_length": 220.2002493540446, "epoch": 1.9846153846153847, "kl": 0.19038899739583334, "reward": 3.2593536575635276, "reward_std": 0.3590250660975774, "rewards/accuracy_reward": 0.7032312775651614, "rewards/format_reward": 0.9940476194024086, "rewards/influence_reward": 0.6611394360661507, "rewards/len_reward": 0.9009353543321291, "step": 178, "total_flos": 0.0, "train_loss": 0.005316848002538006, "train_runtime": 54287.5063, "train_samples_per_second": 0.368, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 178, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }