|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9981333333333333, |
|
"eval_steps": 100, |
|
"global_step": 394, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.8596701371042, |
|
"epoch": 0.002533333333333333, |
|
"grad_norm": 0.2689627707004547, |
|
"kl": -9.097551044664885e-08, |
|
"learning_rate": 7.500000000000001e-08, |
|
"loss": 0.0054, |
|
"reward": 0.6754386079938788, |
|
"reward_std": 0.29693952202796936, |
|
"rewards/accuracy_reward": 0.6754386079938788, |
|
"rewards/format_reward": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 623.1096651177658, |
|
"epoch": 0.012666666666666666, |
|
"grad_norm": 0.34043097496032715, |
|
"kl": 0.00013973210987291838, |
|
"learning_rate": 3.75e-07, |
|
"loss": -0.0101, |
|
"reward": 0.6052631694627436, |
|
"reward_std": 0.3655273945708024, |
|
"rewards/accuracy_reward": 0.6052631694627436, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 574.2947516190378, |
|
"epoch": 0.025333333333333333, |
|
"grad_norm": 0.16866758465766907, |
|
"kl": 0.0002021940130936472, |
|
"learning_rate": 7.5e-07, |
|
"loss": -0.0133, |
|
"reward": 0.6228070329678687, |
|
"reward_std": 0.3261075675487518, |
|
"rewards/accuracy_reward": 0.6228070329678687, |
|
"rewards/format_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 593.8473849647924, |
|
"epoch": 0.038, |
|
"grad_norm": 0.13506826758384705, |
|
"kl": 0.00030391090794613484, |
|
"learning_rate": 1.125e-06, |
|
"loss": -0.0171, |
|
"reward": 0.6192982615608918, |
|
"reward_std": 0.3493922035945089, |
|
"rewards/accuracy_reward": 0.6192982615608918, |
|
"rewards/format_reward": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 614.5087890625, |
|
"epoch": 0.050666666666666665, |
|
"grad_norm": 0.37117841839790344, |
|
"kl": 0.0014222195273951481, |
|
"learning_rate": 1.5e-06, |
|
"loss": -0.0263, |
|
"reward": 0.6087719443597291, |
|
"reward_std": 0.35981847079176654, |
|
"rewards/accuracy_reward": 0.6087719443597291, |
|
"rewards/format_reward": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 610.6105422170539, |
|
"epoch": 0.06333333333333334, |
|
"grad_norm": 0.6702502369880676, |
|
"kl": 0.008063848395096627, |
|
"learning_rate": 1.875e-06, |
|
"loss": -0.0323, |
|
"reward": 0.6000000163128502, |
|
"reward_std": 0.3776336585220538, |
|
"rewards/accuracy_reward": 0.6000000163128502, |
|
"rewards/format_reward": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 581.708786171361, |
|
"epoch": 0.076, |
|
"grad_norm": 0.13385862112045288, |
|
"kl": 0.08222049913908305, |
|
"learning_rate": 2.25e-06, |
|
"loss": -0.003, |
|
"reward": 0.5719298386260083, |
|
"reward_std": 0.3216610023849889, |
|
"rewards/accuracy_reward": 0.5719298386260083, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 580.1789640727796, |
|
"epoch": 0.08866666666666667, |
|
"grad_norm": 0.1613887995481491, |
|
"kl": 0.4072471217105263, |
|
"learning_rate": 2.6250000000000003e-06, |
|
"loss": -0.0034, |
|
"reward": 0.5684210676895944, |
|
"reward_std": 0.3606692671775818, |
|
"rewards/accuracy_reward": 0.5684210676895944, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.603520443565, |
|
"epoch": 0.10133333333333333, |
|
"grad_norm": 4.28112268447876, |
|
"kl": 0.28351083052785775, |
|
"learning_rate": 3e-06, |
|
"loss": -0.0148, |
|
"reward": 0.6192982620314548, |
|
"reward_std": 0.34681932079164607, |
|
"rewards/accuracy_reward": 0.6192982620314548, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 597.3666823537726, |
|
"epoch": 0.114, |
|
"grad_norm": 0.09016856551170349, |
|
"kl": 0.1709989447342722, |
|
"learning_rate": 2.998523534736735e-06, |
|
"loss": -0.0274, |
|
"reward": 0.6052631729527523, |
|
"reward_std": 0.37054079206366286, |
|
"rewards/accuracy_reward": 0.6052631729527523, |
|
"rewards/format_reward": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 588.608786171361, |
|
"epoch": 0.12666666666666668, |
|
"grad_norm": 0.2084578275680542, |
|
"kl": 0.2026009107890882, |
|
"learning_rate": 2.994097045546504e-06, |
|
"loss": -0.0133, |
|
"reward": 0.5824561528469386, |
|
"reward_std": 0.3644034429600364, |
|
"rewards/accuracy_reward": 0.5824561528469386, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 592.9579096743935, |
|
"epoch": 0.13933333333333334, |
|
"grad_norm": 1.332377552986145, |
|
"kl": 0.253752979479338, |
|
"learning_rate": 2.986729246506011e-06, |
|
"loss": -0.0199, |
|
"reward": 0.6315789619558736, |
|
"reward_std": 0.34024513646175986, |
|
"rewards/accuracy_reward": 0.6315789619558736, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 580.366682032535, |
|
"epoch": 0.152, |
|
"grad_norm": 0.056244488805532455, |
|
"kl": 17.54536361694336, |
|
"learning_rate": 2.976434642014389e-06, |
|
"loss": 0.8101, |
|
"reward": 0.6280701911763141, |
|
"reward_std": 0.3611795199544806, |
|
"rewards/accuracy_reward": 0.6280701911763141, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 602.4929979826275, |
|
"epoch": 0.16466666666666666, |
|
"grad_norm": 2.9207770824432373, |
|
"kl": 0.25210153680098685, |
|
"learning_rate": 2.9632334982395456e-06, |
|
"loss": 0.0016, |
|
"reward": 0.6473684381497534, |
|
"reward_std": 0.3406482906718003, |
|
"rewards/accuracy_reward": 0.6473684381497534, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 550.657911119963, |
|
"epoch": 0.17733333333333334, |
|
"grad_norm": 2.1469576358795166, |
|
"kl": 1.1785144203587583, |
|
"learning_rate": 2.947151803221774e-06, |
|
"loss": 0.0246, |
|
"reward": 0.5982456306093618, |
|
"reward_std": 0.3625405713131553, |
|
"rewards/accuracy_reward": 0.5982456306093618, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 630.1982622648541, |
|
"epoch": 0.19, |
|
"grad_norm": 0.437906414270401, |
|
"kl": 0.2049952456825658, |
|
"learning_rate": 2.928221215713164e-06, |
|
"loss": -0.0136, |
|
"reward": 0.6368421231445514, |
|
"reward_std": 0.3507869362831116, |
|
"rewards/accuracy_reward": 0.6368421231445514, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.66141614412, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.20336699485778809, |
|
"kl": 0.840125154194079, |
|
"learning_rate": 2.906479002853542e-06, |
|
"loss": -0.0331, |
|
"reward": 0.6403508934535478, |
|
"reward_std": 0.33991540262573644, |
|
"rewards/accuracy_reward": 0.6403508934535478, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 571.3140498111122, |
|
"epoch": 0.21533333333333332, |
|
"grad_norm": 0.566848874092102, |
|
"kl": 0.42301218133223684, |
|
"learning_rate": 2.8819679668056195e-06, |
|
"loss": -0.0154, |
|
"reward": 0.6754386137974889, |
|
"reward_std": 0.3516690385969062, |
|
"rewards/accuracy_reward": 0.6754386137974889, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 576.1280825966282, |
|
"epoch": 0.228, |
|
"grad_norm": 5.783878326416016, |
|
"kl": 0.3097421746504934, |
|
"learning_rate": 2.8547363604937856e-06, |
|
"loss": 0.0077, |
|
"reward": 0.612280716237269, |
|
"reward_std": 0.355850856241427, |
|
"rewards/accuracy_reward": 0.612280716237269, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 615.8421213250411, |
|
"epoch": 0.24066666666666667, |
|
"grad_norm": 0.29518190026283264, |
|
"kl": 0.10967632092927632, |
|
"learning_rate": 2.824837792612416e-06, |
|
"loss": -0.0353, |
|
"reward": 0.6122807160804146, |
|
"reward_std": 0.3359477858794363, |
|
"rewards/accuracy_reward": 0.6122807160804146, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.25333333333333335, |
|
"grad_norm": 0.09735328704118729, |
|
"learning_rate": 2.792331122090709e-06, |
|
"loss": -0.0192, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25333333333333335, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 581.6460822784423, |
|
"eval_kl": 0.4462999359130859, |
|
"eval_loss": -0.020424701273441315, |
|
"eval_reward": 0.4882333454877138, |
|
"eval_reward_std": 0.3652301513493061, |
|
"eval_rewards/accuracy_reward": 0.4882333454877138, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 95085.0702, |
|
"eval_samples_per_second": 0.053, |
|
"eval_steps_per_second": 0.009, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 560.9368566412675, |
|
"epoch": 0.266, |
|
"grad_norm": 13.26487922668457, |
|
"kl": 0.22525361713610198, |
|
"learning_rate": 2.7572803422217976e-06, |
|
"loss": -0.0193, |
|
"reward": 0.6263158046885541, |
|
"reward_std": 0.3673114654264952, |
|
"rewards/accuracy_reward": 0.6263158046885541, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 572.0772078664679, |
|
"epoch": 0.2786666666666667, |
|
"grad_norm": 5.774922847747803, |
|
"kl": 1.1548378392269736, |
|
"learning_rate": 2.71975445468425e-06, |
|
"loss": -0.0186, |
|
"reward": 0.6631579132456529, |
|
"reward_std": 0.3699968554471668, |
|
"rewards/accuracy_reward": 0.6631579132456529, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 617.7017713044819, |
|
"epoch": 0.29133333333333333, |
|
"grad_norm": 0.3488950729370117, |
|
"kl": 0.4739940442537007, |
|
"learning_rate": 2.679827333703964e-06, |
|
"loss": 0.0054, |
|
"reward": 0.5789473808125446, |
|
"reward_std": 0.359339523942847, |
|
"rewards/accuracy_reward": 0.5789473808125446, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 584.6894883005242, |
|
"epoch": 0.304, |
|
"grad_norm": 2.7635438442230225, |
|
"kl": 0.8890348735608553, |
|
"learning_rate": 2.637577580623858e-06, |
|
"loss": -0.0151, |
|
"reward": 0.5947368579475503, |
|
"reward_std": 0.34380959868431094, |
|
"rewards/accuracy_reward": 0.5947368579475503, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 615.0772094726562, |
|
"epoch": 0.31666666666666665, |
|
"grad_norm": 1.1497012376785278, |
|
"kl": 0.36164293791118424, |
|
"learning_rate": 2.593088369167671e-06, |
|
"loss": -0.0181, |
|
"reward": 0.6192982593649312, |
|
"reward_std": 0.3435111723448101, |
|
"rewards/accuracy_reward": 0.6192982593649312, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 576.5666825143915, |
|
"epoch": 0.3293333333333333, |
|
"grad_norm": 6.164768695831299, |
|
"kl": 0.5363926937705592, |
|
"learning_rate": 2.5464472817024772e-06, |
|
"loss": 0.018, |
|
"reward": 0.6684210694149921, |
|
"reward_std": 0.368413172583831, |
|
"rewards/accuracy_reward": 0.6684210694149921, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 566.4719445479543, |
|
"epoch": 0.342, |
|
"grad_norm": 2.0130836963653564, |
|
"kl": 1.256256103515625, |
|
"learning_rate": 2.497746136822254e-06, |
|
"loss": 0.0444, |
|
"reward": 0.642105276176804, |
|
"reward_std": 0.33451331728383116, |
|
"rewards/accuracy_reward": 0.642105276176804, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 615.5193124871505, |
|
"epoch": 0.3546666666666667, |
|
"grad_norm": 2.369405746459961, |
|
"kl": 0.7050068102384869, |
|
"learning_rate": 2.4470808085919304e-06, |
|
"loss": -0.0144, |
|
"reward": 0.6421052804118709, |
|
"reward_std": 0.3574345387910542, |
|
"rewards/accuracy_reward": 0.6421052804118709, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 576.3912459524055, |
|
"epoch": 0.36733333333333335, |
|
"grad_norm": 4.594287872314453, |
|
"kl": 2.437645841899671, |
|
"learning_rate": 2.3945510378077523e-06, |
|
"loss": 0.0796, |
|
"reward": 0.6842105448246002, |
|
"reward_std": 0.350753252757223, |
|
"rewards/accuracy_reward": 0.6842105448246002, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 596.2193150570519, |
|
"epoch": 0.38, |
|
"grad_norm": 2.5179603099823, |
|
"kl": 1.6508487099095395, |
|
"learning_rate": 2.340260235645519e-06, |
|
"loss": 0.0431, |
|
"reward": 0.663157911363401, |
|
"reward_std": 0.3722002707029644, |
|
"rewards/accuracy_reward": 0.663157911363401, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.5965046129728, |
|
"epoch": 0.39266666666666666, |
|
"grad_norm": 76.81166076660156, |
|
"kl": 4.629216886821546, |
|
"learning_rate": 2.2843152800832416e-06, |
|
"loss": 0.094, |
|
"reward": 0.5807017698099739, |
|
"reward_std": 0.37164790002923265, |
|
"rewards/accuracy_reward": 0.5807017698099739, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 604.2456311677631, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 2.316026210784912, |
|
"kl": 5.967475971422697, |
|
"learning_rate": 2.2268263054989753e-06, |
|
"loss": 0.242, |
|
"reward": 0.5666666804175627, |
|
"reward_std": 0.3491695720898478, |
|
"rewards/accuracy_reward": 0.5666666804175627, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 585.1210690146999, |
|
"epoch": 0.418, |
|
"grad_norm": 1.3882242441177368, |
|
"kl": 1.542066072162829, |
|
"learning_rate": 2.167906485858047e-06, |
|
"loss": 0.031, |
|
"reward": 0.6631579112065465, |
|
"reward_std": 0.32709676968423945, |
|
"rewards/accuracy_reward": 0.6631579112065465, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 609.7789647152549, |
|
"epoch": 0.43066666666666664, |
|
"grad_norm": 23.364316940307617, |
|
"kl": 3.0389545641447366, |
|
"learning_rate": 2.1076718119164804e-06, |
|
"loss": 0.0985, |
|
"reward": 0.5368421203211734, |
|
"reward_std": 0.36661528725373116, |
|
"rewards/accuracy_reward": 0.5368421203211734, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 589.2105436626233, |
|
"epoch": 0.44333333333333336, |
|
"grad_norm": 2.261157274246216, |
|
"kl": 3.034598581414474, |
|
"learning_rate": 2.0462408628792335e-06, |
|
"loss": 0.0464, |
|
"reward": 0.5877193132513447, |
|
"reward_std": 0.32118205584977805, |
|
"rewards/accuracy_reward": 0.5877193132513447, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 583.0210664447985, |
|
"epoch": 0.456, |
|
"grad_norm": 1.490301251411438, |
|
"kl": 4.092078279194079, |
|
"learning_rate": 1.9837345729627633e-06, |
|
"loss": 0.0794, |
|
"reward": 0.6087719440460205, |
|
"reward_std": 0.34601063853815983, |
|
"rewards/accuracy_reward": 0.6087719440460205, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 591.0193131296259, |
|
"epoch": 0.4686666666666667, |
|
"grad_norm": 3.377087354660034, |
|
"kl": 39.17373753597862, |
|
"learning_rate": 1.9202759933214665e-06, |
|
"loss": 1.5296, |
|
"reward": 0.5684210672190315, |
|
"reward_std": 0.3722002681932951, |
|
"rewards/accuracy_reward": 0.5684210672190315, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.3315945273952, |
|
"epoch": 0.48133333333333334, |
|
"grad_norm": 15.796894073486328, |
|
"kl": 8.018802682976974, |
|
"learning_rate": 1.8559900498066726e-06, |
|
"loss": 0.2629, |
|
"reward": 0.5877193149767423, |
|
"reward_std": 0.3454245896715867, |
|
"rewards/accuracy_reward": 0.5877193149767423, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 616.9403683311061, |
|
"epoch": 0.494, |
|
"grad_norm": 0.7116127014160156, |
|
"kl": 11.00859888980263, |
|
"learning_rate": 1.7910032970350677e-06, |
|
"loss": 0.5039, |
|
"reward": 0.614035103195592, |
|
"reward_std": 0.3618389898224881, |
|
"rewards/accuracy_reward": 0.614035103195592, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 0.7055822014808655, |
|
"learning_rate": 1.7254436692507058e-06, |
|
"loss": 0.2295, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5066666666666667, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 581.6460156143188, |
|
"eval_kl": 44.771087231445314, |
|
"eval_loss": 1.8106327056884766, |
|
"eval_reward": 0.4892333455443382, |
|
"eval_reward_std": 0.362811917424202, |
|
"eval_rewards/accuracy_reward": 0.4891666788816452, |
|
"eval_rewards/format_reward": 6.666666865348816e-05, |
|
"eval_runtime": 95474.2074, |
|
"eval_samples_per_second": 0.052, |
|
"eval_steps_per_second": 0.009, |
|
"step": 200 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 624.9587893837377, |
|
"epoch": 0.5193333333333333, |
|
"grad_norm": 0.2618827223777771, |
|
"kl": 4.135612246864721, |
|
"learning_rate": 1.6594402284710481e-06, |
|
"loss": 0.0336, |
|
"reward": 0.564035101783903, |
|
"reward_std": 0.3579249236144518, |
|
"rewards/accuracy_reward": 0.564035101783903, |
|
"rewards/format_reward": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 621.8368573640522, |
|
"epoch": 0.532, |
|
"grad_norm": 1.038638710975647, |
|
"kl": 0.783648681640625, |
|
"learning_rate": 1.593122910412851e-06, |
|
"loss": 0.0144, |
|
"reward": 0.6175438767985294, |
|
"reward_std": 0.35596638729697777, |
|
"rewards/accuracy_reward": 0.6175438767985294, |
|
"rewards/format_reward": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.468435990183, |
|
"epoch": 0.5446666666666666, |
|
"grad_norm": 15.33467960357666, |
|
"kl": 5.278036338404605, |
|
"learning_rate": 1.5266222686980693e-06, |
|
"loss": 0.2523, |
|
"reward": 0.5964912433373301, |
|
"reward_std": 0.344139332834043, |
|
"rewards/accuracy_reward": 0.5964912433373301, |
|
"rewards/format_reward": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 579.1210691753187, |
|
"epoch": 0.5573333333333333, |
|
"grad_norm": 14.52474308013916, |
|
"kl": 0.7972836143092106, |
|
"learning_rate": 1.460069217843338e-06, |
|
"loss": -0.0145, |
|
"reward": 0.5754386097192764, |
|
"reward_std": 0.3417132879558362, |
|
"rewards/accuracy_reward": 0.5754386097192764, |
|
"rewards/format_reward": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 567.8386103579872, |
|
"epoch": 0.57, |
|
"grad_norm": 4.826749324798584, |
|
"kl": 1.95933837890625, |
|
"learning_rate": 1.3935947755389924e-06, |
|
"loss": -0.0023, |
|
"reward": 0.6614035228365346, |
|
"reward_std": 0.33425700288069876, |
|
"rewards/accuracy_reward": 0.6614035228365346, |
|
"rewards/format_reward": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 609.7438760857833, |
|
"epoch": 0.5826666666666667, |
|
"grad_norm": 2.4607062339782715, |
|
"kl": 2.425996800472862, |
|
"learning_rate": 1.3273298047249756e-06, |
|
"loss": 0.0442, |
|
"reward": 0.542105278059056, |
|
"reward_std": 0.3630532004331288, |
|
"rewards/accuracy_reward": 0.542105278059056, |
|
"rewards/format_reward": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 633.8508912739001, |
|
"epoch": 0.5953333333333334, |
|
"grad_norm": 1.241820216178894, |
|
"kl": 4.220809454666941, |
|
"learning_rate": 1.2614047559713923e-06, |
|
"loss": 0.1149, |
|
"reward": 0.5491228218141355, |
|
"reward_std": 0.3291172884012524, |
|
"rewards/accuracy_reward": 0.5491228218141355, |
|
"rewards/format_reward": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 607.8807173879524, |
|
"epoch": 0.608, |
|
"grad_norm": 4.545963287353516, |
|
"kl": 1.1027640894839639, |
|
"learning_rate": 1.1959494106708598e-06, |
|
"loss": 0.0236, |
|
"reward": 0.6000000144305982, |
|
"reward_std": 0.36793422448007684, |
|
"rewards/accuracy_reward": 0.6000000144305982, |
|
"rewards/format_reward": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 598.5158052143298, |
|
"epoch": 0.6206666666666667, |
|
"grad_norm": 3.975374221801758, |
|
"kl": 17.801213314658717, |
|
"learning_rate": 1.1310926255482204e-06, |
|
"loss": 0.6585, |
|
"reward": 0.6385965044561185, |
|
"reward_std": 0.33627751940175105, |
|
"rewards/accuracy_reward": 0.6385965044561185, |
|
"rewards/format_reward": 0.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.9087850470291, |
|
"epoch": 0.6333333333333333, |
|
"grad_norm": 1.1734281778335571, |
|
"kl": 2.3962697882401316, |
|
"learning_rate": 1.0669620789905688e-06, |
|
"loss": 0.027, |
|
"reward": 0.5877193123102188, |
|
"reward_std": 0.38500809261673374, |
|
"rewards/accuracy_reward": 0.5877193123102188, |
|
"rewards/format_reward": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 606.8912443462171, |
|
"epoch": 0.646, |
|
"grad_norm": 2.513296365737915, |
|
"kl": 2.2693404348273027, |
|
"learning_rate": 1.0036840196969795e-06, |
|
"loss": 0.0564, |
|
"reward": 0.6245614178870854, |
|
"reward_std": 0.32143837056661906, |
|
"rewards/accuracy_reward": 0.6245614178870854, |
|
"rewards/format_reward": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 567.249137316252, |
|
"epoch": 0.6586666666666666, |
|
"grad_norm": 1.0174099206924438, |
|
"kl": 3.5286929481907894, |
|
"learning_rate": 9.413830181427508e-07, |
|
"loss": 0.0882, |
|
"reward": 0.6456140494660327, |
|
"reward_std": 0.3340006878501491, |
|
"rewards/accuracy_reward": 0.6456140494660327, |
|
"rewards/format_reward": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 610.2052803441098, |
|
"epoch": 0.6713333333333333, |
|
"grad_norm": 1.029402256011963, |
|
"kl": 2.918760922080592, |
|
"learning_rate": 8.801817213474331e-07, |
|
"loss": 0.0485, |
|
"reward": 0.5929824714597903, |
|
"reward_std": 0.34714905211800023, |
|
"rewards/accuracy_reward": 0.5929824714597903, |
|
"rewards/format_reward": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 638.8596645957546, |
|
"epoch": 0.684, |
|
"grad_norm": 1.5382988452911377, |
|
"kl": 2.563796193976151, |
|
"learning_rate": 8.202006114294044e-07, |
|
"loss": 0.0392, |
|
"reward": 0.5807017699668282, |
|
"reward_std": 0.36489082386619165, |
|
"rewards/accuracy_reward": 0.5807017699668282, |
|
"rewards/format_reward": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 608.0421232524671, |
|
"epoch": 0.6966666666666667, |
|
"grad_norm": 16.033626556396484, |
|
"kl": 1.5960372121710527, |
|
"learning_rate": 7.615577684223272e-07, |
|
"loss": 0.0303, |
|
"reward": 0.5894736991116875, |
|
"reward_std": 0.3749873553451739, |
|
"rewards/accuracy_reward": 0.5894736991116875, |
|
"rewards/format_reward": 0.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 600.415805053711, |
|
"epoch": 0.7093333333333334, |
|
"grad_norm": 2.8626461029052734, |
|
"kl": 1.9747047825863486, |
|
"learning_rate": 7.043686378203864e-07, |
|
"loss": 0.0147, |
|
"reward": 0.6368421217328624, |
|
"reward_std": 0.37260342334446156, |
|
"rewards/accuracy_reward": 0.6368421217328624, |
|
"rewards/format_reward": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 616.5772094726562, |
|
"epoch": 0.722, |
|
"grad_norm": 1.4815430641174316, |
|
"kl": 2.7695633737664473, |
|
"learning_rate": 6.487458033099425e-07, |
|
"loss": 0.0412, |
|
"reward": 0.5385965032012839, |
|
"reward_std": 0.35034166982299403, |
|
"rewards/accuracy_reward": 0.5385965032012839, |
|
"rewards/format_reward": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 645.3333529823705, |
|
"epoch": 0.7346666666666667, |
|
"grad_norm": 0.6981754302978516, |
|
"kl": 2.3487503854851974, |
|
"learning_rate": 5.947987651349942e-07, |
|
"loss": 0.0472, |
|
"reward": 0.566666682927232, |
|
"reward_std": 0.3772305058805566, |
|
"rewards/accuracy_reward": 0.566666682927232, |
|
"rewards/format_reward": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 615.3684377569901, |
|
"epoch": 0.7473333333333333, |
|
"grad_norm": 2.056654214859009, |
|
"kl": 2.483375308388158, |
|
"learning_rate": 5.426337245327703e-07, |
|
"loss": 0.0787, |
|
"reward": 0.5491228204024465, |
|
"reward_std": 0.3359477874479796, |
|
"rewards/accuracy_reward": 0.5491228204024465, |
|
"rewards/format_reward": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.973532497882843, |
|
"learning_rate": 4.923533746638108e-07, |
|
"loss": 0.0014, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 579.6517157089233, |
|
"eval_kl": 2.4090683868408203, |
|
"eval_loss": 0.046611957252025604, |
|
"eval_reward": 0.49193334555327894, |
|
"eval_reward_std": 0.35773685903549196, |
|
"eval_rewards/accuracy_reward": 0.49193334555327894, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 94754.8266, |
|
"eval_samples_per_second": 0.053, |
|
"eval_steps_per_second": 0.009, |
|
"step": 300 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.8903664036801, |
|
"epoch": 0.7726666666666666, |
|
"grad_norm": 0.28998905420303345, |
|
"kl": 1.3069589715254935, |
|
"learning_rate": 4.440566984481256e-07, |
|
"loss": 0.0171, |
|
"reward": 0.6210526459311184, |
|
"reward_std": 0.34825076147129663, |
|
"rewards/accuracy_reward": 0.6210526459311184, |
|
"rewards/format_reward": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 618.7473841616982, |
|
"epoch": 0.7853333333333333, |
|
"grad_norm": 0.46500492095947266, |
|
"kl": 1.2823197214226973, |
|
"learning_rate": 3.978387737053994e-07, |
|
"loss": 0.0136, |
|
"reward": 0.5666666805744172, |
|
"reward_std": 0.32507625222206116, |
|
"rewards/accuracy_reward": 0.5666666805744172, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 618.5772081877055, |
|
"epoch": 0.798, |
|
"grad_norm": 0.40139040350914, |
|
"kl": 1.015325927734375, |
|
"learning_rate": 3.5379058598286167e-07, |
|
"loss": -0.0158, |
|
"reward": 0.5438596634488356, |
|
"reward_std": 0.3694023759741532, |
|
"rewards/accuracy_reward": 0.5438596634488356, |
|
"rewards/format_reward": 0.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 594.957911119963, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 1.3908910751342773, |
|
"kl": 1.2546104029605263, |
|
"learning_rate": 3.119988494392894e-07, |
|
"loss": 0.0289, |
|
"reward": 0.6508772078313325, |
|
"reward_std": 0.3327888513866224, |
|
"rewards/accuracy_reward": 0.6508772078313325, |
|
"rewards/format_reward": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.7333484850432, |
|
"epoch": 0.8233333333333334, |
|
"grad_norm": 4.380345344543457, |
|
"kl": 418.79378180252877, |
|
"learning_rate": 2.725458361377465e-07, |
|
"loss": 15.2043, |
|
"reward": 0.608771941379497, |
|
"reward_std": 0.34012960415137444, |
|
"rewards/accuracy_reward": 0.608771941379497, |
|
"rewards/format_reward": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 634.2175588507401, |
|
"epoch": 0.836, |
|
"grad_norm": 6.396597862243652, |
|
"kl": 1.1365401418585526, |
|
"learning_rate": 2.3550921408312737e-07, |
|
"loss": 0.0132, |
|
"reward": 0.5859649261361675, |
|
"reward_std": 0.34545827006038865, |
|
"rewards/accuracy_reward": 0.5859649261361675, |
|
"rewards/format_reward": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 614.2140493292558, |
|
"epoch": 0.8486666666666667, |
|
"grad_norm": 0.9369886517524719, |
|
"kl": 1.5584103232935855, |
|
"learning_rate": 2.0096189432334195e-07, |
|
"loss": 0.0201, |
|
"reward": 0.6035087874061182, |
|
"reward_std": 0.3459035368342149, |
|
"rewards/accuracy_reward": 0.6035087874061182, |
|
"rewards/format_reward": 0.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 619.8245766087582, |
|
"epoch": 0.8613333333333333, |
|
"grad_norm": 0.42530328035354614, |
|
"kl": 1.4711069207442433, |
|
"learning_rate": 1.6897188741514286e-07, |
|
"loss": 0.0519, |
|
"reward": 0.575438610503548, |
|
"reward_std": 0.3146836676095661, |
|
"rewards/accuracy_reward": 0.575438610503548, |
|
"rewards/format_reward": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 577.3737000314812, |
|
"epoch": 0.874, |
|
"grad_norm": 0.3861980140209198, |
|
"kl": 1.7285689504523025, |
|
"learning_rate": 1.396021695371582e-07, |
|
"loss": -0.0088, |
|
"reward": 0.6385965082206224, |
|
"reward_std": 0.37675155840421976, |
|
"rewards/accuracy_reward": 0.6385965082206224, |
|
"rewards/format_reward": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 629.012297941509, |
|
"epoch": 0.8866666666666667, |
|
"grad_norm": 4.221518039703369, |
|
"kl": 1.9362998560855262, |
|
"learning_rate": 1.1291055851370623e-07, |
|
"loss": 0.056, |
|
"reward": 0.582456154886045, |
|
"reward_std": 0.38160365223884585, |
|
"rewards/accuracy_reward": 0.582456154886045, |
|
"rewards/format_reward": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 589.6280856483861, |
|
"epoch": 0.8993333333333333, |
|
"grad_norm": 2.3517720699310303, |
|
"kl": 1.8255171926398026, |
|
"learning_rate": 8.894959999345015e-08, |
|
"loss": 0.0361, |
|
"reward": 0.5754386076801702, |
|
"reward_std": 0.34303222267251265, |
|
"rewards/accuracy_reward": 0.5754386076801702, |
|
"rewards/format_reward": 0.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 629.0754549528423, |
|
"epoch": 0.912, |
|
"grad_norm": 1.3438383340835571, |
|
"kl": 1.5210680509868422, |
|
"learning_rate": 6.776646400696212e-08, |
|
"loss": 0.0234, |
|
"reward": 0.5789473809693989, |
|
"reward_std": 0.3563298034040551, |
|
"rewards/accuracy_reward": 0.5789473809693989, |
|
"rewards/format_reward": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 585.6263320119757, |
|
"epoch": 0.9246666666666666, |
|
"grad_norm": 1.6730248928070068, |
|
"kl": 3.8371864720394737, |
|
"learning_rate": 4.940285210684375e-08, |
|
"loss": 0.075, |
|
"reward": 0.6596491382310264, |
|
"reward_std": 0.3641158220015074, |
|
"rewards/accuracy_reward": 0.6596491382310264, |
|
"rewards/format_reward": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 634.8017714651007, |
|
"epoch": 0.9373333333333334, |
|
"grad_norm": 0.6605441570281982, |
|
"kl": 1.9925395764802631, |
|
"learning_rate": 3.389491527319999e-08, |
|
"loss": 0.0375, |
|
"reward": 0.6578947547234987, |
|
"reward_std": 0.41116404282419305, |
|
"rewards/accuracy_reward": 0.6578947547234987, |
|
"rewards/format_reward": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 600.2000179893092, |
|
"epoch": 0.95, |
|
"grad_norm": 1.5404088497161865, |
|
"kl": 1.4656384919819079, |
|
"learning_rate": 2.127318274608381e-08, |
|
"loss": 0.0059, |
|
"reward": 0.5789473817536706, |
|
"reward_std": 0.350234569060175, |
|
"rewards/accuracy_reward": 0.5789473817536706, |
|
"rewards/format_reward": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 573.8017702604595, |
|
"epoch": 0.9626666666666667, |
|
"grad_norm": 2.143493175506592, |
|
"kl": 1.3233176783511513, |
|
"learning_rate": 1.1562501925013125e-08, |
|
"loss": 0.0301, |
|
"reward": 0.6631579116771096, |
|
"reward_std": 0.3379683045964492, |
|
"rewards/accuracy_reward": 0.6631579116771096, |
|
"rewards/format_reward": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 616.8175617418791, |
|
"epoch": 0.9753333333333334, |
|
"grad_norm": 0.5864923596382141, |
|
"kl": 1.3502071982935855, |
|
"learning_rate": 4.781989453874814e-09, |
|
"loss": 0.0052, |
|
"reward": 0.5807017691825566, |
|
"reward_std": 0.3575416382990385, |
|
"rewards/accuracy_reward": 0.5807017691825566, |
|
"rewards/format_reward": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.1122984233656, |
|
"epoch": 0.988, |
|
"grad_norm": 0.3266775906085968, |
|
"kl": 1.6176237407483554, |
|
"learning_rate": 9.44993587509657e-10, |
|
"loss": 0.0364, |
|
"reward": 0.5947368560652984, |
|
"reward_std": 0.300396885683662, |
|
"rewards/accuracy_reward": 0.5947368560652984, |
|
"rewards/format_reward": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 612.8508915148283, |
|
"epoch": 0.9981333333333333, |
|
"kl": 1.1464434171977795, |
|
"reward": 0.603070187529451, |
|
"reward_std": 0.27440278663447026, |
|
"rewards/accuracy_reward": 0.603070187529451, |
|
"rewards/format_reward": 0.0, |
|
"step": 394, |
|
"total_flos": 0.0, |
|
"train_loss": 0.2665646580783036, |
|
"train_runtime": 484901.6937, |
|
"train_samples_per_second": 0.015, |
|
"train_steps_per_second": 0.001 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 394, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|