|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9965010496850945, |
|
"eval_steps": 100, |
|
"global_step": 89, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 719.0248603820801, |
|
"epoch": 0.01119664100769769, |
|
"grad_norm": 5.308396339416504, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7022594679147005, |
|
"reward_std": 0.11167733068577945, |
|
"rewards/accuracy_reward": 0.4343112222850323, |
|
"rewards/semantic_entropy_math_reward": 0.7022594679147005, |
|
"rewards/total_entropy_reward": 1.2448364309966564, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 697.6454010009766, |
|
"epoch": 0.02239328201539538, |
|
"grad_norm": 3.310378313064575, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6712827906012535, |
|
"reward_std": 0.1011071486864239, |
|
"rewards/accuracy_reward": 0.352040808647871, |
|
"rewards/semantic_entropy_math_reward": 0.6712827868759632, |
|
"rewards/total_entropy_reward": 1.305861696600914, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 701.3826370239258, |
|
"epoch": 0.03358992302309307, |
|
"grad_norm": 4.717713356018066, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7075437195599079, |
|
"reward_std": 0.07735519809648395, |
|
"rewards/accuracy_reward": 0.4126275437884033, |
|
"rewards/semantic_entropy_math_reward": 0.7075437270104885, |
|
"rewards/total_entropy_reward": 1.2551036067306995, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 692.5803489685059, |
|
"epoch": 0.04478656403079076, |
|
"grad_norm": 4.699649810791016, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6926020309329033, |
|
"reward_std": 0.09249642631039023, |
|
"rewards/accuracy_reward": 0.4196428433060646, |
|
"rewards/semantic_entropy_math_reward": 0.6926020495593548, |
|
"rewards/total_entropy_reward": 1.2794943004846573, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 643.8430976867676, |
|
"epoch": 0.05598320503848846, |
|
"grad_norm": 7.118838310241699, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7232142686843872, |
|
"reward_std": 0.09167159674689174, |
|
"rewards/accuracy_reward": 0.41326529532670975, |
|
"rewards/semantic_entropy_math_reward": 0.7232142575085163, |
|
"rewards/total_entropy_reward": 1.2130939476191998, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 711.6734580993652, |
|
"epoch": 0.06717984604618614, |
|
"grad_norm": 4.829713344573975, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6625364199280739, |
|
"reward_std": 0.1073409270029515, |
|
"rewards/accuracy_reward": 0.37436223588883877, |
|
"rewards/semantic_entropy_math_reward": 0.6625364273786545, |
|
"rewards/total_entropy_reward": 1.320713147521019, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 676.0541915893555, |
|
"epoch": 0.07837648705388384, |
|
"grad_norm": 4.374125003814697, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7006195187568665, |
|
"reward_std": 0.09755392652004957, |
|
"rewards/accuracy_reward": 0.4247448882088065, |
|
"rewards/semantic_entropy_math_reward": 0.7006195187568665, |
|
"rewards/total_entropy_reward": 1.2528423443436623, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 712.5401573181152, |
|
"epoch": 0.08957312806158152, |
|
"grad_norm": 6.558979034423828, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.633381923660636, |
|
"reward_std": 0.10544528882019222, |
|
"rewards/accuracy_reward": 0.3463010173290968, |
|
"rewards/semantic_entropy_math_reward": 0.6333819553256035, |
|
"rewards/total_entropy_reward": 1.3834920637309551, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 711.6345539093018, |
|
"epoch": 0.10076976906927922, |
|
"grad_norm": 7.621776103973389, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6800291426479816, |
|
"reward_std": 0.10248321201652288, |
|
"rewards/accuracy_reward": 0.39604590833187103, |
|
"rewards/semantic_entropy_math_reward": 0.6800291538238525, |
|
"rewards/total_entropy_reward": 1.2968212738633156, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 674.1779174804688, |
|
"epoch": 0.11196641007697691, |
|
"grad_norm": 7.523331165313721, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6800291538238525, |
|
"reward_std": 0.11024063220247626, |
|
"rewards/accuracy_reward": 0.411352033726871, |
|
"rewards/semantic_entropy_math_reward": 0.6800291500985622, |
|
"rewards/total_entropy_reward": 1.2904038280248642, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 741.4017696380615, |
|
"epoch": 0.1231630510846746, |
|
"grad_norm": 6.336475372314453, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6415816266089678, |
|
"reward_std": 0.10356245189905167, |
|
"rewards/accuracy_reward": 0.3858418306335807, |
|
"rewards/semantic_entropy_math_reward": 0.6415816303342581, |
|
"rewards/total_entropy_reward": 1.363376997411251, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 692.8437347412109, |
|
"epoch": 0.13435969209237228, |
|
"grad_norm": 7.185462474822998, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7002550885081291, |
|
"reward_std": 0.10777880600653589, |
|
"rewards/accuracy_reward": 0.43048468697816133, |
|
"rewards/semantic_entropy_math_reward": 0.7002551108598709, |
|
"rewards/total_entropy_reward": 1.2493381686508656, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 704.6638870239258, |
|
"epoch": 0.14555633310007, |
|
"grad_norm": 17.740726470947266, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.673833817243576, |
|
"reward_std": 0.09210015833377838, |
|
"rewards/accuracy_reward": 0.43558672815561295, |
|
"rewards/semantic_entropy_math_reward": 0.6738338358700275, |
|
"rewards/total_entropy_reward": 1.3185552880167961, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 679.9374847412109, |
|
"epoch": 0.15675297410776767, |
|
"grad_norm": 9.5872802734375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7075437121093273, |
|
"reward_std": 0.09575178450904787, |
|
"rewards/accuracy_reward": 0.40624999068677425, |
|
"rewards/semantic_entropy_math_reward": 0.7075437419116497, |
|
"rewards/total_entropy_reward": 1.237540539354086, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 731.2404136657715, |
|
"epoch": 0.16794961511546536, |
|
"grad_norm": 4.687215805053711, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6375728864222765, |
|
"reward_std": 0.108485147356987, |
|
"rewards/accuracy_reward": 0.3762755021452904, |
|
"rewards/semantic_entropy_math_reward": 0.6375729013234377, |
|
"rewards/total_entropy_reward": 1.3838917911052704, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 700.0522842407227, |
|
"epoch": 0.17914625612316304, |
|
"grad_norm": 8.418819427490234, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6547011584043503, |
|
"reward_std": 0.10533164534717798, |
|
"rewards/accuracy_reward": 0.40624999161809683, |
|
"rewards/semantic_entropy_math_reward": 0.6547011733055115, |
|
"rewards/total_entropy_reward": 1.3433180004358292, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 684.034423828125, |
|
"epoch": 0.19034289713086075, |
|
"grad_norm": 7.032946586608887, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.664905235171318, |
|
"reward_std": 0.0974241562653333, |
|
"rewards/accuracy_reward": 0.3832908095791936, |
|
"rewards/semantic_entropy_math_reward": 0.6649052500724792, |
|
"rewards/total_entropy_reward": 1.3396263718605042, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 677.9177227020264, |
|
"epoch": 0.20153953813855843, |
|
"grad_norm": 15.37868595123291, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6975218504667282, |
|
"reward_std": 0.09150373586453497, |
|
"rewards/accuracy_reward": 0.45153060369193554, |
|
"rewards/semantic_entropy_math_reward": 0.69752187281847, |
|
"rewards/total_entropy_reward": 1.262941613793373, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 683.5771484375, |
|
"epoch": 0.21273617914625612, |
|
"grad_norm": 36.41266632080078, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7057215645909309, |
|
"reward_std": 0.09910683194175363, |
|
"rewards/accuracy_reward": 0.4368622377514839, |
|
"rewards/semantic_entropy_math_reward": 0.7057215794920921, |
|
"rewards/total_entropy_reward": 1.2459207847714424, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 688.8073768615723, |
|
"epoch": 0.22393282015395383, |
|
"grad_norm": 27.405065536499023, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7097303122282028, |
|
"reward_std": 0.09203298413194716, |
|
"rewards/accuracy_reward": 0.4598214225843549, |
|
"rewards/semantic_entropy_math_reward": 0.7097303308546543, |
|
"rewards/total_entropy_reward": 1.2417229264974594, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 736.3092956542969, |
|
"epoch": 0.2351294611616515, |
|
"grad_norm": 19.091941833496094, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6541544962674379, |
|
"reward_std": 0.0986680502537638, |
|
"rewards/accuracy_reward": 0.4228316266089678, |
|
"rewards/semantic_entropy_math_reward": 0.6541545186191797, |
|
"rewards/total_entropy_reward": 1.3527532257139683, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 709.4795722961426, |
|
"epoch": 0.2463261021693492, |
|
"grad_norm": 14.640216827392578, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6836734637618065, |
|
"reward_std": 0.10867427196353674, |
|
"rewards/accuracy_reward": 0.46301019564270973, |
|
"rewards/semantic_entropy_math_reward": 0.6836734749376774, |
|
"rewards/total_entropy_reward": 1.2770788073539734, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 681.6218032836914, |
|
"epoch": 0.2575227431770469, |
|
"grad_norm": 13.511483192443848, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6967929936945438, |
|
"reward_std": 0.09672949556261301, |
|
"rewards/accuracy_reward": 0.44579080305993557, |
|
"rewards/semantic_entropy_math_reward": 0.6967929899692535, |
|
"rewards/total_entropy_reward": 1.2772413976490498, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 705.6811141967773, |
|
"epoch": 0.26871938418474456, |
|
"grad_norm": 15.547083854675293, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6461370252072811, |
|
"reward_std": 0.1031385101377964, |
|
"rewards/accuracy_reward": 0.3985969265922904, |
|
"rewards/semantic_entropy_math_reward": 0.6461370065808296, |
|
"rewards/total_entropy_reward": 1.3622385039925575, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 731.4566116333008, |
|
"epoch": 0.27991602519244224, |
|
"grad_norm": 8.434446334838867, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6233600564301014, |
|
"reward_std": 0.08827707398450002, |
|
"rewards/accuracy_reward": 0.3494897885248065, |
|
"rewards/semantic_entropy_math_reward": 0.6233600713312626, |
|
"rewards/total_entropy_reward": 1.4261119738221169, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 685.5197486877441, |
|
"epoch": 0.29111266620014, |
|
"grad_norm": 12.586360931396484, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6765670366585255, |
|
"reward_std": 0.1049469755962491, |
|
"rewards/accuracy_reward": 0.412627543322742, |
|
"rewards/semantic_entropy_math_reward": 0.6765670329332352, |
|
"rewards/total_entropy_reward": 1.3074346259236336, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 711.061840057373, |
|
"epoch": 0.30230930720783766, |
|
"grad_norm": 9.39128303527832, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6521501429378986, |
|
"reward_std": 0.1099298931658268, |
|
"rewards/accuracy_reward": 0.40624999068677425, |
|
"rewards/semantic_entropy_math_reward": 0.6521501652896404, |
|
"rewards/total_entropy_reward": 1.3500806987285614, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 707.5612106323242, |
|
"epoch": 0.31350594821553535, |
|
"grad_norm": 8.279614448547363, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6692784316837788, |
|
"reward_std": 0.10041180578991771, |
|
"rewards/accuracy_reward": 0.36096938233822584, |
|
"rewards/semantic_entropy_math_reward": 0.6692784205079079, |
|
"rewards/total_entropy_reward": 1.3163355849683285, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 675.4157962799072, |
|
"epoch": 0.32470258922323303, |
|
"grad_norm": 13.983796119689941, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7091836705803871, |
|
"reward_std": 0.09643913782201707, |
|
"rewards/accuracy_reward": 0.478316318243742, |
|
"rewards/semantic_entropy_math_reward": 0.7091836743056774, |
|
"rewards/total_entropy_reward": 1.2364819720387459, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 723.9189872741699, |
|
"epoch": 0.3358992302309307, |
|
"grad_norm": 13.94961166381836, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.614431481808424, |
|
"reward_std": 0.12222768180072308, |
|
"rewards/accuracy_reward": 0.3533163210377097, |
|
"rewards/semantic_entropy_math_reward": 0.614431481808424, |
|
"rewards/total_entropy_reward": 1.4096959978342056, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 721.4604339599609, |
|
"epoch": 0.3470958712386284, |
|
"grad_norm": 7.932522773742676, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6603498533368111, |
|
"reward_std": 0.09537219302728772, |
|
"rewards/accuracy_reward": 0.3992346851155162, |
|
"rewards/semantic_entropy_math_reward": 0.6603498607873917, |
|
"rewards/total_entropy_reward": 1.3325160779058933, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 702.3788185119629, |
|
"epoch": 0.3582925122463261, |
|
"grad_norm": 9.096883773803711, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6829446069896221, |
|
"reward_std": 0.09760210802778602, |
|
"rewards/accuracy_reward": 0.40816325694322586, |
|
"rewards/semantic_entropy_math_reward": 0.682944618165493, |
|
"rewards/total_entropy_reward": 1.303523451089859, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 700.5924644470215, |
|
"epoch": 0.3694891532540238, |
|
"grad_norm": 3.891953229904175, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7055393569171429, |
|
"reward_std": 0.10629667551256716, |
|
"rewards/accuracy_reward": 0.45982141979038715, |
|
"rewards/semantic_entropy_math_reward": 0.7055393569171429, |
|
"rewards/total_entropy_reward": 1.2391385585069656, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 716.7027950286865, |
|
"epoch": 0.3806857942617215, |
|
"grad_norm": 12.71116828918457, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6698250789195299, |
|
"reward_std": 0.09906701685395092, |
|
"rewards/accuracy_reward": 0.40369897056370974, |
|
"rewards/semantic_entropy_math_reward": 0.6698250807821751, |
|
"rewards/total_entropy_reward": 1.3271235637366772, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 769.0414352416992, |
|
"epoch": 0.3918824352694192, |
|
"grad_norm": 9.855749130249023, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6481414064764977, |
|
"reward_std": 0.10145364608615637, |
|
"rewards/accuracy_reward": 0.3743622349575162, |
|
"rewards/semantic_entropy_math_reward": 0.6481414288282394, |
|
"rewards/total_entropy_reward": 1.3565044924616814, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 722.349479675293, |
|
"epoch": 0.40307907627711687, |
|
"grad_norm": 5.131447792053223, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6638119556009769, |
|
"reward_std": 0.09085186710581183, |
|
"rewards/accuracy_reward": 0.4170918297022581, |
|
"rewards/semantic_entropy_math_reward": 0.6638119481503963, |
|
"rewards/total_entropy_reward": 1.350273534655571, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 703.8832778930664, |
|
"epoch": 0.41427571728481455, |
|
"grad_norm": 7.057519435882568, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.690597664564848, |
|
"reward_std": 0.09433076065033674, |
|
"rewards/accuracy_reward": 0.43686223309487104, |
|
"rewards/semantic_entropy_math_reward": 0.6905976496636868, |
|
"rewards/total_entropy_reward": 1.2801040560007095, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 725.2799606323242, |
|
"epoch": 0.42547235829251223, |
|
"grad_norm": 7.460778713226318, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6669096164405346, |
|
"reward_std": 0.10552376857958734, |
|
"rewards/accuracy_reward": 0.41326529532670975, |
|
"rewards/semantic_entropy_math_reward": 0.6669096313416958, |
|
"rewards/total_entropy_reward": 1.3323625773191452, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 725.7671966552734, |
|
"epoch": 0.4366689993002099, |
|
"grad_norm": 7.124528408050537, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6472303047776222, |
|
"reward_std": 0.10429512546397746, |
|
"rewards/accuracy_reward": 0.3909438718110323, |
|
"rewards/semantic_entropy_math_reward": 0.6472303159534931, |
|
"rewards/total_entropy_reward": 1.3546394035220146, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 697.7423439025879, |
|
"epoch": 0.44786564030790765, |
|
"grad_norm": 7.065954208374023, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7193877585232258, |
|
"reward_std": 0.09428266366012394, |
|
"rewards/accuracy_reward": 0.4336734600365162, |
|
"rewards/semantic_entropy_math_reward": 0.7193877547979355, |
|
"rewards/total_entropy_reward": 1.2143822945654392, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 703.3673400878906, |
|
"epoch": 0.45906228131560534, |
|
"grad_norm": 6.764792442321777, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.706632649526, |
|
"reward_std": 0.0942279752343893, |
|
"rewards/accuracy_reward": 0.4649234591051936, |
|
"rewards/semantic_entropy_math_reward": 0.7066326681524515, |
|
"rewards/total_entropy_reward": 1.238373503088951, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 651.4183578491211, |
|
"epoch": 0.470258922323303, |
|
"grad_norm": 16.47541618347168, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6813046522438526, |
|
"reward_std": 0.10622056131251156, |
|
"rewards/accuracy_reward": 0.43239795230329037, |
|
"rewards/semantic_entropy_math_reward": 0.681304682046175, |
|
"rewards/total_entropy_reward": 1.284136950969696, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 743.2646636962891, |
|
"epoch": 0.4814555633310007, |
|
"grad_norm": 15.174323081970215, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6525145824998617, |
|
"reward_std": 0.10325121926143765, |
|
"rewards/accuracy_reward": 0.3947704015299678, |
|
"rewards/semantic_entropy_math_reward": 0.652514586225152, |
|
"rewards/total_entropy_reward": 1.358573641628027, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 725.5962905883789, |
|
"epoch": 0.4926522043386984, |
|
"grad_norm": 7.421966075897217, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6867711264640093, |
|
"reward_std": 0.09842351987026632, |
|
"rewards/accuracy_reward": 0.4323979504406452, |
|
"rewards/semantic_entropy_math_reward": 0.6867711190134287, |
|
"rewards/total_entropy_reward": 1.2839920222759247, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 698.1390113830566, |
|
"epoch": 0.5038488453463961, |
|
"grad_norm": 9.836834907531738, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6851311810314655, |
|
"reward_std": 0.10096711455844343, |
|
"rewards/accuracy_reward": 0.44196427427232265, |
|
"rewards/semantic_entropy_math_reward": 0.6851312182843685, |
|
"rewards/total_entropy_reward": 1.290800966322422, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 737.8379898071289, |
|
"epoch": 0.5150454863540938, |
|
"grad_norm": 13.060182571411133, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6867711432278156, |
|
"reward_std": 0.10890168650075793, |
|
"rewards/accuracy_reward": 0.403061218559742, |
|
"rewards/semantic_entropy_math_reward": 0.6867711097002029, |
|
"rewards/total_entropy_reward": 1.2737670093774796, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 720.0937309265137, |
|
"epoch": 0.5262421273617914, |
|
"grad_norm": 17.424467086791992, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6268221419304609, |
|
"reward_std": 0.11281977966427803, |
|
"rewards/accuracy_reward": 0.39668366219848394, |
|
"rewards/semantic_entropy_math_reward": 0.6268221419304609, |
|
"rewards/total_entropy_reward": 1.396425575017929, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 730.9495983123779, |
|
"epoch": 0.5374387683694891, |
|
"grad_norm": 7.613661766052246, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6333818975836039, |
|
"reward_std": 0.09289166564121842, |
|
"rewards/accuracy_reward": 0.38201529905200005, |
|
"rewards/semantic_entropy_math_reward": 0.6333819199353456, |
|
"rewards/total_entropy_reward": 1.3891723416745663, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 730.4712867736816, |
|
"epoch": 0.5486354093771868, |
|
"grad_norm": 11.16511344909668, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.628097664564848, |
|
"reward_std": 0.10562112857587636, |
|
"rewards/accuracy_reward": 0.3871173355728388, |
|
"rewards/semantic_entropy_math_reward": 0.6280976608395576, |
|
"rewards/total_entropy_reward": 1.3984056264162064, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 700.4126129150391, |
|
"epoch": 0.5598320503848845, |
|
"grad_norm": 7.883020401000977, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6543367393314838, |
|
"reward_std": 0.08848634106107056, |
|
"rewards/accuracy_reward": 0.43494897056370974, |
|
"rewards/semantic_entropy_math_reward": 0.6543367449194193, |
|
"rewards/total_entropy_reward": 1.368666134774685, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 726.035701751709, |
|
"epoch": 0.5710286913925823, |
|
"grad_norm": 16.26348114013672, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6665451787412167, |
|
"reward_std": 0.09329186473041773, |
|
"rewards/accuracy_reward": 0.40114795323461294, |
|
"rewards/semantic_entropy_math_reward": 0.6665451973676682, |
|
"rewards/total_entropy_reward": 1.3389556668698788, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 719.875617980957, |
|
"epoch": 0.58222533240028, |
|
"grad_norm": 10.366020202636719, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6474125292152166, |
|
"reward_std": 0.10870802192948759, |
|
"rewards/accuracy_reward": 0.3934948956593871, |
|
"rewards/semantic_entropy_math_reward": 0.6474125385284424, |
|
"rewards/total_entropy_reward": 1.3537504002451897, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 691.538890838623, |
|
"epoch": 0.5934219734079776, |
|
"grad_norm": 10.94919204711914, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6860422715544701, |
|
"reward_std": 0.0946744061075151, |
|
"rewards/accuracy_reward": 0.489158159121871, |
|
"rewards/semantic_entropy_math_reward": 0.6860422790050507, |
|
"rewards/total_entropy_reward": 1.3007621616125107, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 705.672176361084, |
|
"epoch": 0.6046186144156753, |
|
"grad_norm": 23.8171329498291, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6762026119977236, |
|
"reward_std": 0.09615424065850675, |
|
"rewards/accuracy_reward": 0.39668366592377424, |
|
"rewards/semantic_entropy_math_reward": 0.6762026362121105, |
|
"rewards/total_entropy_reward": 1.3121707029640675, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 677.1664428710938, |
|
"epoch": 0.615815255423373, |
|
"grad_norm": 9.031086921691895, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6896865852177143, |
|
"reward_std": 0.1015757208224386, |
|
"rewards/accuracy_reward": 0.4317601965740323, |
|
"rewards/semantic_entropy_math_reward": 0.6896865479648113, |
|
"rewards/total_entropy_reward": 1.287308655679226, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 713.0114631652832, |
|
"epoch": 0.6270118964310707, |
|
"grad_norm": 10.871789932250977, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6656341031193733, |
|
"reward_std": 0.11424232507124543, |
|
"rewards/accuracy_reward": 0.4221938643604517, |
|
"rewards/semantic_entropy_math_reward": 0.6656341068446636, |
|
"rewards/total_entropy_reward": 1.3229641020298004, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 677.5694923400879, |
|
"epoch": 0.6382085374387684, |
|
"grad_norm": 8.904267311096191, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6729227248579264, |
|
"reward_std": 0.10780132608488202, |
|
"rewards/accuracy_reward": 0.4381377436220646, |
|
"rewards/semantic_entropy_math_reward": 0.6729227565228939, |
|
"rewards/total_entropy_reward": 1.3122014850378036, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 687.1345500946045, |
|
"epoch": 0.6494051784464661, |
|
"grad_norm": 19.945981979370117, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6621720213443041, |
|
"reward_std": 0.10787450219504535, |
|
"rewards/accuracy_reward": 0.4183673388324678, |
|
"rewards/semantic_entropy_math_reward": 0.6621720027178526, |
|
"rewards/total_entropy_reward": 1.3265771567821503, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 673.6224384307861, |
|
"epoch": 0.6606018194541637, |
|
"grad_norm": 25.872291564941406, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6900510117411613, |
|
"reward_std": 0.09328114939853549, |
|
"rewards/accuracy_reward": 0.3998724389821291, |
|
"rewards/semantic_entropy_math_reward": 0.6900510005652905, |
|
"rewards/total_entropy_reward": 1.2932880148291588, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 712.2027912139893, |
|
"epoch": 0.6717984604618614, |
|
"grad_norm": 12.222248077392578, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6426749210804701, |
|
"reward_std": 0.09614149574190378, |
|
"rewards/accuracy_reward": 0.41007652156986296, |
|
"rewards/semantic_entropy_math_reward": 0.6426749173551798, |
|
"rewards/total_entropy_reward": 1.383428543806076, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 739.6472969055176, |
|
"epoch": 0.6829951014695591, |
|
"grad_norm": 9.738767623901367, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6273687966167927, |
|
"reward_std": 0.11152111599221826, |
|
"rewards/accuracy_reward": 0.34885203186422586, |
|
"rewards/semantic_entropy_math_reward": 0.6273688077926636, |
|
"rewards/total_entropy_reward": 1.3971327617764473, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 685.2659378051758, |
|
"epoch": 0.6941917424772568, |
|
"grad_norm": 20.52920913696289, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6596209742128849, |
|
"reward_std": 0.10549976932816207, |
|
"rewards/accuracy_reward": 0.43622447922825813, |
|
"rewards/semantic_entropy_math_reward": 0.6596209909766912, |
|
"rewards/total_entropy_reward": 1.3266954682767391, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 684.1128730773926, |
|
"epoch": 0.7053883834849545, |
|
"grad_norm": 13.253375053405762, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6740160267800093, |
|
"reward_std": 0.11020976235158741, |
|
"rewards/accuracy_reward": 0.4419642761349678, |
|
"rewards/semantic_entropy_math_reward": 0.6740160342305899, |
|
"rewards/total_entropy_reward": 1.295092262327671, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 702.8647766113281, |
|
"epoch": 0.7165850244926522, |
|
"grad_norm": 11.436138153076172, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6612609177827835, |
|
"reward_std": 0.0923373675905168, |
|
"rewards/accuracy_reward": 0.43239795230329037, |
|
"rewards/semantic_entropy_math_reward": 0.661260936409235, |
|
"rewards/total_entropy_reward": 1.3441792502999306, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 713.3341636657715, |
|
"epoch": 0.72778166550035, |
|
"grad_norm": 15.771661758422852, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6313775349408388, |
|
"reward_std": 0.1141419094055891, |
|
"rewards/accuracy_reward": 0.4139030510559678, |
|
"rewards/semantic_entropy_math_reward": 0.6313775237649679, |
|
"rewards/total_entropy_reward": 1.3911906033754349, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 691.0420761108398, |
|
"epoch": 0.7389783065080476, |
|
"grad_norm": 9.556193351745605, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6576165966689587, |
|
"reward_std": 0.10697318986058235, |
|
"rewards/accuracy_reward": 0.3386479513719678, |
|
"rewards/semantic_entropy_math_reward": 0.657616626471281, |
|
"rewards/total_entropy_reward": 1.3421761691570282, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 688.1179618835449, |
|
"epoch": 0.7501749475157453, |
|
"grad_norm": 18.490732192993164, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6712827831506729, |
|
"reward_std": 0.10525703080929816, |
|
"rewards/accuracy_reward": 0.445790808647871, |
|
"rewards/semantic_entropy_math_reward": 0.6712828055024147, |
|
"rewards/total_entropy_reward": 1.3154872134327888, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 647.132007598877, |
|
"epoch": 0.761371588523443, |
|
"grad_norm": 7.94115686416626, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6920553781092167, |
|
"reward_std": 0.09936971426941454, |
|
"rewards/accuracy_reward": 0.43176019564270973, |
|
"rewards/semantic_entropy_math_reward": 0.6920554004609585, |
|
"rewards/total_entropy_reward": 1.2725396156311035, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 699.4005012512207, |
|
"epoch": 0.7725682295311407, |
|
"grad_norm": 12.146166801452637, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6652696616947651, |
|
"reward_std": 0.11806112038902938, |
|
"rewards/accuracy_reward": 0.409438768401742, |
|
"rewards/semantic_entropy_math_reward": 0.6652696691453457, |
|
"rewards/total_entropy_reward": 1.3024558648467064, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 667.9285659790039, |
|
"epoch": 0.7837648705388384, |
|
"grad_norm": 15.953444480895996, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.679846927523613, |
|
"reward_std": 0.09083303948864341, |
|
"rewards/accuracy_reward": 0.4311224427074194, |
|
"rewards/semantic_entropy_math_reward": 0.6798469312489033, |
|
"rewards/total_entropy_reward": 1.3062404170632362, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 705.4400367736816, |
|
"epoch": 0.794961511546536, |
|
"grad_norm": 12.320276260375977, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6556122340261936, |
|
"reward_std": 0.09324299031868577, |
|
"rewards/accuracy_reward": 0.37946427892893553, |
|
"rewards/semantic_entropy_math_reward": 0.6556122545152903, |
|
"rewards/total_entropy_reward": 1.3581575378775597, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 654.1511325836182, |
|
"epoch": 0.8061581525542337, |
|
"grad_norm": 22.0692138671875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6794824972748756, |
|
"reward_std": 0.10328507493250072, |
|
"rewards/accuracy_reward": 0.4209183603525162, |
|
"rewards/semantic_entropy_math_reward": 0.6794825121760368, |
|
"rewards/total_entropy_reward": 1.2847715727984905, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 690.308666229248, |
|
"epoch": 0.8173547935619314, |
|
"grad_norm": 10.22040843963623, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6780247762799263, |
|
"reward_std": 0.105113809928298, |
|
"rewards/accuracy_reward": 0.45918366499245167, |
|
"rewards/semantic_entropy_math_reward": 0.678024772554636, |
|
"rewards/total_entropy_reward": 1.2880272567272186, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 696.0803337097168, |
|
"epoch": 0.8285514345696291, |
|
"grad_norm": 20.899675369262695, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6516034882515669, |
|
"reward_std": 0.10480827221181244, |
|
"rewards/accuracy_reward": 0.3852040730416775, |
|
"rewards/semantic_entropy_math_reward": 0.6516034882515669, |
|
"rewards/total_entropy_reward": 1.3282338082790375, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 649.8654270172119, |
|
"epoch": 0.8397480755773268, |
|
"grad_norm": 9.469245910644531, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7246719934046268, |
|
"reward_std": 0.08447014342527837, |
|
"rewards/accuracy_reward": 0.433035708963871, |
|
"rewards/semantic_entropy_math_reward": 0.7246720045804977, |
|
"rewards/total_entropy_reward": 1.220706269145012, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 746.2142791748047, |
|
"epoch": 0.8509447165850245, |
|
"grad_norm": 10.530217170715332, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.639395035803318, |
|
"reward_std": 0.10488821647595614, |
|
"rewards/accuracy_reward": 0.40816325321793556, |
|
"rewards/semantic_entropy_math_reward": 0.6393950544297695, |
|
"rewards/total_entropy_reward": 1.3760143592953682, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 655.6836585998535, |
|
"epoch": 0.8621413575927221, |
|
"grad_norm": 14.1907958984375, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.7228498421609402, |
|
"reward_std": 0.09376341942697763, |
|
"rewards/accuracy_reward": 0.4674744727090001, |
|
"rewards/semantic_entropy_math_reward": 0.7228498421609402, |
|
"rewards/total_entropy_reward": 1.2132117934525013, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 672.8137626647949, |
|
"epoch": 0.8733379986004198, |
|
"grad_norm": 13.5209379196167, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.652332354336977, |
|
"reward_std": 0.09460025187581778, |
|
"rewards/accuracy_reward": 0.3858418334275484, |
|
"rewards/semantic_entropy_math_reward": 0.6523323617875576, |
|
"rewards/total_entropy_reward": 1.351367250084877, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 696.9342880249023, |
|
"epoch": 0.8845346396081175, |
|
"grad_norm": 17.302839279174805, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6514212656766176, |
|
"reward_std": 0.09626807877793908, |
|
"rewards/accuracy_reward": 0.4062499897554517, |
|
"rewards/semantic_entropy_math_reward": 0.651421282440424, |
|
"rewards/total_entropy_reward": 1.366665042936802, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 682.3073806762695, |
|
"epoch": 0.8957312806158153, |
|
"grad_norm": 9.640515327453613, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6767492536455393, |
|
"reward_std": 0.10075846454128623, |
|
"rewards/accuracy_reward": 0.42028060276061296, |
|
"rewards/semantic_entropy_math_reward": 0.6767492648214102, |
|
"rewards/total_entropy_reward": 1.3104709684848785, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 722.4725646972656, |
|
"epoch": 0.906927921623513, |
|
"grad_norm": 6.948408603668213, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6521501280367374, |
|
"reward_std": 0.08798706252127886, |
|
"rewards/accuracy_reward": 0.4534438718110323, |
|
"rewards/semantic_entropy_math_reward": 0.6521501541137695, |
|
"rewards/total_entropy_reward": 1.3641655445098877, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 686.7238388061523, |
|
"epoch": 0.9181245626312107, |
|
"grad_norm": 47.02455139160156, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6811224482953548, |
|
"reward_std": 0.11016466980800033, |
|
"rewards/accuracy_reward": 0.42283162754029036, |
|
"rewards/semantic_entropy_math_reward": 0.6811224408447742, |
|
"rewards/total_entropy_reward": 1.2916913330554962, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 709.4598007202148, |
|
"epoch": 0.9293212036389084, |
|
"grad_norm": 31.659669876098633, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6086005661636591, |
|
"reward_std": 0.10625140275806189, |
|
"rewards/accuracy_reward": 0.37882652692496777, |
|
"rewards/semantic_entropy_math_reward": 0.60860057733953, |
|
"rewards/total_entropy_reward": 1.4536337479948997, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 661.408784866333, |
|
"epoch": 0.940517844646606, |
|
"grad_norm": 8.08792781829834, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6858600489795208, |
|
"reward_std": 0.09285477036610246, |
|
"rewards/accuracy_reward": 0.4221938652917743, |
|
"rewards/semantic_entropy_math_reward": 0.6858600452542305, |
|
"rewards/total_entropy_reward": 1.283115666359663, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 657.9400405883789, |
|
"epoch": 0.9517144856543037, |
|
"grad_norm": 21.801326751708984, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6805757954716682, |
|
"reward_std": 0.09728616522625089, |
|
"rewards/accuracy_reward": 0.43749999441206455, |
|
"rewards/semantic_entropy_math_reward": 0.6805758327245712, |
|
"rewards/total_entropy_reward": 1.3078671097755432, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 674.7142734527588, |
|
"epoch": 0.9629111266620014, |
|
"grad_norm": 11.559253692626953, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6862244829535484, |
|
"reward_std": 0.10050268471240997, |
|
"rewards/accuracy_reward": 0.4304846851155162, |
|
"rewards/semantic_entropy_math_reward": 0.686224490404129, |
|
"rewards/total_entropy_reward": 1.2905268669128418, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 636.0388870239258, |
|
"epoch": 0.9741077676696991, |
|
"grad_norm": 11.289789199829102, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6986151486635208, |
|
"reward_std": 0.10155197884887457, |
|
"rewards/accuracy_reward": 0.43686223216354847, |
|
"rewards/semantic_entropy_math_reward": 0.6986151374876499, |
|
"rewards/total_entropy_reward": 1.2592380531132221, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 683.5822525024414, |
|
"epoch": 0.9853044086773968, |
|
"grad_norm": 10.526566505432129, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6585276797413826, |
|
"reward_std": 0.10897616110742092, |
|
"rewards/accuracy_reward": 0.4017857098951936, |
|
"rewards/semantic_entropy_math_reward": 0.6585277020931244, |
|
"rewards/total_entropy_reward": 1.324688896536827, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 683.3411960601807, |
|
"epoch": 0.9965010496850945, |
|
"grad_norm": 19.354259490966797, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.6687317825853825, |
|
"reward_std": 0.09195416304282844, |
|
"rewards/accuracy_reward": 0.380102027207613, |
|
"rewards/semantic_entropy_math_reward": 0.6687317807227373, |
|
"rewards/total_entropy_reward": 1.3135743215680122, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.9965010496850945, |
|
"step": 89, |
|
"total_flos": 0.0, |
|
"train_loss": 2.8008765204773544e-08, |
|
"train_runtime": 57265.8095, |
|
"train_samples_per_second": 0.349, |
|
"train_steps_per_second": 0.002 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 89, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|