|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.992, |
|
"eval_steps": 500, |
|
"global_step": 62, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 5767.000259399414, |
|
"epoch": 0.016, |
|
"grad_norm": 0.0023286771029233932, |
|
"kl": 0.0, |
|
"learning_rate": 1.4285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.5625000335276127, |
|
"reward_std": 0.2650662618689239, |
|
"rewards/accuracy_reward": 0.3571428805589676, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.20535715110599995, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 6333.000259399414, |
|
"epoch": 0.032, |
|
"grad_norm": 0.0020095149520784616, |
|
"kl": 0.0, |
|
"learning_rate": 2.857142857142857e-07, |
|
"loss": 0.0, |
|
"reward": 0.2767857266589999, |
|
"reward_std": 0.14067190792411566, |
|
"rewards/accuracy_reward": 0.0803571492433548, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.19642857927829027, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 6795.750213623047, |
|
"epoch": 0.048, |
|
"grad_norm": 0.002763825235888362, |
|
"kl": 0.0002949237823486328, |
|
"learning_rate": 4.285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.39955359511077404, |
|
"reward_std": 0.19223998952656984, |
|
"rewards/accuracy_reward": 0.1875000111758709, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.21205358067527413, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 5619.562652587891, |
|
"epoch": 0.064, |
|
"grad_norm": 0.0036455015651881695, |
|
"kl": 0.00029200315475463867, |
|
"learning_rate": 5.714285714285714e-07, |
|
"loss": 0.0, |
|
"reward": 0.3616071604192257, |
|
"reward_std": 0.18272343277931213, |
|
"rewards/accuracy_reward": 0.160714291036129, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.1919642947614193, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 5378.000259399414, |
|
"epoch": 0.08, |
|
"grad_norm": 0.008862568065524101, |
|
"kl": 0.0003243684768676758, |
|
"learning_rate": 7.142857142857143e-07, |
|
"loss": 0.0, |
|
"reward": 0.33035715809091926, |
|
"reward_std": 0.22553458344191313, |
|
"rewards/accuracy_reward": 0.1339285783469677, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.19642857881262898, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 6547.687942504883, |
|
"epoch": 0.096, |
|
"grad_norm": 0.0034230498131364584, |
|
"kl": 0.0003108382225036621, |
|
"learning_rate": 8.57142857142857e-07, |
|
"loss": 0.0, |
|
"reward": 0.4107143026776612, |
|
"reward_std": 0.21918057976290584, |
|
"rewards/accuracy_reward": 0.2142857238650322, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.18750000791624188, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 3935.875144958496, |
|
"epoch": 0.112, |
|
"grad_norm": 0.003568751970306039, |
|
"kl": 0.00029009580612182617, |
|
"learning_rate": 1e-06, |
|
"loss": 0.0, |
|
"reward": 0.3683035923168063, |
|
"reward_std": 0.2914058305323124, |
|
"rewards/accuracy_reward": 0.1696428656578064, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.1897321529686451, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 4970.687683105469, |
|
"epoch": 0.128, |
|
"grad_norm": 0.003850426757708192, |
|
"kl": 0.00031566619873046875, |
|
"learning_rate": 9.99266096766761e-07, |
|
"loss": 0.0, |
|
"reward": 0.38839287450537086, |
|
"reward_std": 0.2490098038688302, |
|
"rewards/accuracy_reward": 0.19642858020961285, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.19196429336443543, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 5700.437759399414, |
|
"epoch": 0.144, |
|
"grad_norm": 0.0036366251297295094, |
|
"kl": 0.0003451108932495117, |
|
"learning_rate": 9.970667809068474e-07, |
|
"loss": 0.0, |
|
"reward": 0.43973216600716114, |
|
"reward_std": 0.19689470902085304, |
|
"rewards/accuracy_reward": 0.2410714402794838, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.19866072200238705, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 7395.241371154785, |
|
"epoch": 0.16, |
|
"grad_norm": 0.010836871340870857, |
|
"kl": 0.0002986788749694824, |
|
"learning_rate": 9.934092261314617e-07, |
|
"loss": 0.0, |
|
"reward": 0.3459821566939354, |
|
"reward_std": 0.17509831953793764, |
|
"rewards/accuracy_reward": 0.14285714644938707, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.20312500838190317, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 7001.625335693359, |
|
"epoch": 0.176, |
|
"grad_norm": 0.019893741235136986, |
|
"kl": 0.00028258562088012695, |
|
"learning_rate": 9.883053626240501e-07, |
|
"loss": 0.0, |
|
"reward": 0.4218750260770321, |
|
"reward_std": 0.24917150475084782, |
|
"rewards/accuracy_reward": 0.23214287031441927, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.18973215110599995, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 6650.01815032959, |
|
"epoch": 0.192, |
|
"grad_norm": 0.013785521499812603, |
|
"kl": 0.00031447410583496094, |
|
"learning_rate": 9.817718381265238e-07, |
|
"loss": 0.0, |
|
"reward": 0.40178573317825794, |
|
"reward_std": 0.2504308824427426, |
|
"rewards/accuracy_reward": 0.18750000558793545, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.2142857238650322, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 4968.250289916992, |
|
"epoch": 0.208, |
|
"grad_norm": 0.004635457415133715, |
|
"kl": 0.0003146529197692871, |
|
"learning_rate": 9.738299636377862e-07, |
|
"loss": 0.0, |
|
"reward": 0.4598214505240321, |
|
"reward_std": 0.3252177187241614, |
|
"rewards/accuracy_reward": 0.2767857275903225, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.18303572293370962, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 6637.634262084961, |
|
"epoch": 0.224, |
|
"grad_norm": 0.01122356578707695, |
|
"kl": 0.00028884410858154297, |
|
"learning_rate": 9.645056439016825e-07, |
|
"loss": 0.0, |
|
"reward": 0.47321430686861277, |
|
"reward_std": 0.25558971939608455, |
|
"rewards/accuracy_reward": 0.2410714440047741, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.23214286472648382, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 5712.223403930664, |
|
"epoch": 0.24, |
|
"grad_norm": 0.01076567079871893, |
|
"kl": 0.0002791881561279297, |
|
"learning_rate": 9.538292929111112e-07, |
|
"loss": 0.0, |
|
"reward": 0.3616071557626128, |
|
"reward_std": 0.27521964302286506, |
|
"rewards/accuracy_reward": 0.1250000074505806, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.21875000931322575, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 7215.071830749512, |
|
"epoch": 0.256, |
|
"grad_norm": 0.020798197016119957, |
|
"kl": 0.00030541419982910156, |
|
"learning_rate": 9.418357347038998e-07, |
|
"loss": 0.0, |
|
"reward": 0.4397321636788547, |
|
"reward_std": 0.3516994332894683, |
|
"rewards/accuracy_reward": 0.258928582072258, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.18080357974395156, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 5953.2769775390625, |
|
"epoch": 0.272, |
|
"grad_norm": 0.013654655776917934, |
|
"kl": 0.0002818107604980469, |
|
"learning_rate": 9.285640897740315e-07, |
|
"loss": 0.0, |
|
"reward": 0.3035714435391128, |
|
"reward_std": 0.21893947944045067, |
|
"rewards/accuracy_reward": 0.12500000558793545, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.17857143539004028, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 6048.937774658203, |
|
"epoch": 0.288, |
|
"grad_norm": 0.013907884247601032, |
|
"kl": 0.00029546022415161133, |
|
"learning_rate": 9.140576474687263e-07, |
|
"loss": 0.0, |
|
"reward": 0.4397321669384837, |
|
"reward_std": 0.2832956803031266, |
|
"rewards/accuracy_reward": 0.2321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.20758929662406445, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 5523.152084350586, |
|
"epoch": 0.304, |
|
"grad_norm": 0.017561404034495354, |
|
"kl": 0.0002797245979309082, |
|
"learning_rate": 8.983637247875872e-07, |
|
"loss": 0.0, |
|
"reward": 0.2946428721770644, |
|
"reward_std": 0.17000702070072293, |
|
"rewards/accuracy_reward": 0.06250000186264515, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.23214287031441927, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 6402.52702331543, |
|
"epoch": 0.32, |
|
"grad_norm": 0.023619119077920914, |
|
"kl": 0.0002855658531188965, |
|
"learning_rate": 8.81533512044382e-07, |
|
"loss": 0.0, |
|
"reward": 0.39955358672887087, |
|
"reward_std": 0.2746177548542619, |
|
"rewards/accuracy_reward": 0.17857143748551607, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.21205357927829027, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 7257.973442077637, |
|
"epoch": 0.336, |
|
"grad_norm": 0.024335632100701332, |
|
"kl": 0.0003019571304321289, |
|
"learning_rate": 8.636219058948822e-07, |
|
"loss": 0.0, |
|
"reward": 0.38169644912704825, |
|
"reward_std": 0.2994211660698056, |
|
"rewards/accuracy_reward": 0.15178572107106447, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.2209821525029838, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 6505.696727752686, |
|
"epoch": 0.352, |
|
"grad_norm": 0.015834977850317955, |
|
"kl": 0.00024816393852233887, |
|
"learning_rate": 8.446873302753783e-07, |
|
"loss": 0.0, |
|
"reward": 0.4196428768336773, |
|
"reward_std": 0.348601452074945, |
|
"rewards/accuracy_reward": 0.20535715483129025, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.20535715110599995, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 6381.687782287598, |
|
"epoch": 0.368, |
|
"grad_norm": 0.003612571395933628, |
|
"kl": 0.00027638673782348633, |
|
"learning_rate": 8.247915458359471e-07, |
|
"loss": 0.0, |
|
"reward": 0.6183036016300321, |
|
"reward_std": 0.4431227990426123, |
|
"rewards/accuracy_reward": 0.41964287776499987, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.19866072200238705, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 6749.973495483398, |
|
"epoch": 0.384, |
|
"grad_norm": 0.03762313351035118, |
|
"kl": 0.00029355287551879883, |
|
"learning_rate": 8.039994484900462e-07, |
|
"loss": 0.0, |
|
"reward": 0.703125043772161, |
|
"reward_std": 0.5290834615007043, |
|
"rewards/accuracy_reward": 0.5089286034926772, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.18526786682195961, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 7313.741317749023, |
|
"epoch": 0.4, |
|
"grad_norm": 0.054487138986587524, |
|
"kl": 0.0003046095371246338, |
|
"learning_rate": 7.823788577375328e-07, |
|
"loss": 0.0, |
|
"reward": 0.7790178973227739, |
|
"reward_std": 0.6166830230504274, |
|
"rewards/accuracy_reward": 0.5357143124565482, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.24330358393490314, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 8691.36654663086, |
|
"epoch": 0.416, |
|
"grad_norm": 0.030647600069642067, |
|
"kl": 0.0002758502960205078, |
|
"learning_rate": 7.600002954515531e-07, |
|
"loss": 0.0, |
|
"reward": 0.638392886146903, |
|
"reward_std": 0.48968397080898285, |
|
"rewards/accuracy_reward": 0.38392858672887087, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.23660715110599995, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 8900.545036315918, |
|
"epoch": 0.432, |
|
"grad_norm": 0.07716870307922363, |
|
"kl": 0.00028139352798461914, |
|
"learning_rate": 7.36936755850849e-07, |
|
"loss": 0.0001, |
|
"reward": 0.9486607648432255, |
|
"reward_std": 0.5833425354212523, |
|
"rewards/accuracy_reward": 0.5714285988360643, |
|
"rewards/format_reward": 0.0357142873108387, |
|
"rewards/tag_count_reward": 0.34151787450537086, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 6317.732406616211, |
|
"epoch": 0.448, |
|
"grad_norm": 0.029883868992328644, |
|
"kl": 0.00029206275939941406, |
|
"learning_rate": 7.132634674077883e-07, |
|
"loss": 0.0, |
|
"reward": 0.7343750456348062, |
|
"reward_std": 0.49824068509042263, |
|
"rewards/accuracy_reward": 0.5357143143191934, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.19866072479635477, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 9996.053977966309, |
|
"epoch": 0.464, |
|
"grad_norm": 0.06153450533747673, |
|
"kl": 0.0002873539924621582, |
|
"learning_rate": 6.890576474687263e-07, |
|
"loss": 0.0, |
|
"reward": 0.7254464644938707, |
|
"reward_std": 0.5397462174296379, |
|
"rewards/accuracy_reward": 0.419642873108387, |
|
"rewards/format_reward": 0.05357143096625805, |
|
"rewards/tag_count_reward": 0.2522321562282741, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 7258.732467651367, |
|
"epoch": 0.48, |
|
"grad_norm": 0.033417101949453354, |
|
"kl": 0.0002796053886413574, |
|
"learning_rate": 6.643982503870692e-07, |
|
"loss": 0.0, |
|
"reward": 0.6473214533179998, |
|
"reward_std": 0.5155255328863859, |
|
"rewards/accuracy_reward": 0.42857144493609667, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.20982143795117736, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 10728.00942993164, |
|
"epoch": 0.496, |
|
"grad_norm": 0.06687657535076141, |
|
"kl": 0.0002480745315551758, |
|
"learning_rate": 6.393657099905853e-07, |
|
"loss": 0.0, |
|
"reward": 0.8504464663565159, |
|
"reward_std": 0.5495956116355956, |
|
"rewards/accuracy_reward": 0.4821428768336773, |
|
"rewards/format_reward": 0.05357143096625805, |
|
"rewards/tag_count_reward": 0.31473215483129025, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 8516.982543945312, |
|
"epoch": 0.512, |
|
"grad_norm": 0.04354412481188774, |
|
"kl": 0.0002473890781402588, |
|
"learning_rate": 6.140416772229784e-07, |
|
"loss": 0.0, |
|
"reward": 0.6741071790456772, |
|
"reward_std": 0.45561165921390057, |
|
"rewards/accuracy_reward": 0.4107143012806773, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.2455357238650322, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 8822.393253326416, |
|
"epoch": 0.528, |
|
"grad_norm": 0.05626585707068443, |
|
"kl": 0.0002841353416442871, |
|
"learning_rate": 5.88508753815478e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6629464644938707, |
|
"reward_std": 0.42721313470974565, |
|
"rewards/accuracy_reward": 0.3928571566939354, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"rewards/tag_count_reward": 0.26116072479635477, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 10038.589805603027, |
|
"epoch": 0.544, |
|
"grad_norm": 0.05266648903489113, |
|
"kl": 0.00027632713317871094, |
|
"learning_rate": 5.628502228571632e-07, |
|
"loss": 0.0, |
|
"reward": 0.7388393133878708, |
|
"reward_std": 0.4553453531116247, |
|
"rewards/accuracy_reward": 0.4285714467987418, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.2924107250291854, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 8031.839714050293, |
|
"epoch": 0.56, |
|
"grad_norm": 0.05634909123182297, |
|
"kl": 0.0003362894058227539, |
|
"learning_rate": 5.371497771428367e-07, |
|
"loss": 0.0, |
|
"reward": 0.9084821827709675, |
|
"reward_std": 0.5687091294676065, |
|
"rewards/accuracy_reward": 0.5625000195577741, |
|
"rewards/format_reward": 0.044642859138548374, |
|
"rewards/tag_count_reward": 0.30133930034935474, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 9826.420181274414, |
|
"epoch": 0.576, |
|
"grad_norm": 0.07227327674627304, |
|
"kl": 0.00029417872428894043, |
|
"learning_rate": 5.114912461845222e-07, |
|
"loss": 0.0001, |
|
"reward": 0.8482143394649029, |
|
"reward_std": 0.46484142914414406, |
|
"rewards/accuracy_reward": 0.5089285997673869, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.321428582072258, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 10691.991607666016, |
|
"epoch": 0.592, |
|
"grad_norm": 0.07488638162612915, |
|
"kl": 0.00031763315200805664, |
|
"learning_rate": 4.859583227770217e-07, |
|
"loss": 0.0, |
|
"reward": 0.8169643171131611, |
|
"reward_std": 0.5879267286509275, |
|
"rewards/accuracy_reward": 0.44642859045416117, |
|
"rewards/format_reward": 0.044642859138548374, |
|
"rewards/tag_count_reward": 0.32589287776499987, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 9507.875434875488, |
|
"epoch": 0.608, |
|
"grad_norm": 0.0768597424030304, |
|
"kl": 0.0003135800361633301, |
|
"learning_rate": 4.606342900094147e-07, |
|
"loss": 0.0, |
|
"reward": 0.7522321715950966, |
|
"reward_std": 0.4764951467514038, |
|
"rewards/accuracy_reward": 0.348214297555387, |
|
"rewards/format_reward": 0.0625000037252903, |
|
"rewards/tag_count_reward": 0.3415178721770644, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 8807.152183532715, |
|
"epoch": 0.624, |
|
"grad_norm": 0.08563056588172913, |
|
"kl": 0.0003771781921386719, |
|
"learning_rate": 4.3560174961293094e-07, |
|
"loss": 0.0, |
|
"reward": 0.8482143264263868, |
|
"reward_std": 0.6095239520072937, |
|
"rewards/accuracy_reward": 0.5089285960420966, |
|
"rewards/format_reward": 0.0357142873108387, |
|
"rewards/tag_count_reward": 0.3035714440047741, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 9334.85765838623, |
|
"epoch": 0.64, |
|
"grad_norm": 0.0718473568558693, |
|
"kl": 0.00036656856536865234, |
|
"learning_rate": 4.1094235253127374e-07, |
|
"loss": 0.0, |
|
"reward": 0.6473214626312256, |
|
"reward_std": 0.4437146638520062, |
|
"rewards/accuracy_reward": 0.3214285857975483, |
|
"rewards/format_reward": 0.026785715483129025, |
|
"rewards/tag_count_reward": 0.2991071594879031, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 9168.080871582031, |
|
"epoch": 0.656, |
|
"grad_norm": 0.06268931925296783, |
|
"kl": 0.00036334991455078125, |
|
"learning_rate": 3.867365325922116e-07, |
|
"loss": 0.0, |
|
"reward": 0.6607143171131611, |
|
"reward_std": 0.5487463716417551, |
|
"rewards/accuracy_reward": 0.3750000176951289, |
|
"rewards/format_reward": 0.0357142873108387, |
|
"rewards/tag_count_reward": 0.2500000074505806, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 9371.27727508545, |
|
"epoch": 0.672, |
|
"grad_norm": 0.06987561285495758, |
|
"kl": 0.00038748979568481445, |
|
"learning_rate": 3.630632441491511e-07, |
|
"loss": 0.0, |
|
"reward": 0.6339286072179675, |
|
"reward_std": 0.38090350618585944, |
|
"rewards/accuracy_reward": 0.39285716600716114, |
|
"rewards/format_reward": 0.0, |
|
"rewards/tag_count_reward": 0.24107143888249993, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 7754.02717590332, |
|
"epoch": 0.688, |
|
"grad_norm": 0.0807015672326088, |
|
"kl": 0.00042748451232910156, |
|
"learning_rate": 3.3999970454844686e-07, |
|
"loss": 0.0, |
|
"reward": 0.6428571809083223, |
|
"reward_std": 0.4524891930632293, |
|
"rewards/accuracy_reward": 0.37500002328306437, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.2500000102445483, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 9262.286231994629, |
|
"epoch": 0.704, |
|
"grad_norm": 0.10762109607458115, |
|
"kl": 0.0004887580871582031, |
|
"learning_rate": 3.1762114226246716e-07, |
|
"loss": 0.0, |
|
"reward": 0.7477678954601288, |
|
"reward_std": 0.4671580633148551, |
|
"rewards/accuracy_reward": 0.4732143096625805, |
|
"rewards/format_reward": 0.026785715483129025, |
|
"rewards/tag_count_reward": 0.2477678651921451, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 9673.063018798828, |
|
"epoch": 0.72, |
|
"grad_norm": 0.08651269227266312, |
|
"kl": 0.0004634857177734375, |
|
"learning_rate": 2.9600055150995395e-07, |
|
"loss": 0.0, |
|
"reward": 0.7544643264263868, |
|
"reward_std": 0.44256059266626835, |
|
"rewards/accuracy_reward": 0.43750001303851604, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.2991071557626128, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 7535.82181930542, |
|
"epoch": 0.736, |
|
"grad_norm": 0.07591980695724487, |
|
"kl": 0.0005333423614501953, |
|
"learning_rate": 2.752084541640528e-07, |
|
"loss": 0.0, |
|
"reward": 0.674107177183032, |
|
"reward_std": 0.3459728816524148, |
|
"rewards/accuracy_reward": 0.40178572945296764, |
|
"rewards/format_reward": 0.01785714365541935, |
|
"rewards/tag_count_reward": 0.2544642984867096, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 10174.214828491211, |
|
"epoch": 0.752, |
|
"grad_norm": 0.14260835945606232, |
|
"kl": 0.0004648566246032715, |
|
"learning_rate": 2.553126697246217e-07, |
|
"loss": 0.0, |
|
"reward": 0.5022321678698063, |
|
"reward_std": 0.42994912061840296, |
|
"rewards/accuracy_reward": 0.1607142947614193, |
|
"rewards/format_reward": 0.0357142873108387, |
|
"rewards/tag_count_reward": 0.30580358393490314, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 7719.678955078125, |
|
"epoch": 0.768, |
|
"grad_norm": 0.0768926590681076, |
|
"kl": 0.0005067586898803711, |
|
"learning_rate": 2.36378094105118e-07, |
|
"loss": 0.0, |
|
"reward": 0.5669643115252256, |
|
"reward_std": 0.3042712449096143, |
|
"rewards/accuracy_reward": 0.2142857201397419, |
|
"rewards/format_reward": 0.026785715483129025, |
|
"rewards/tag_count_reward": 0.3258928721770644, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 12495.509475708008, |
|
"epoch": 0.784, |
|
"grad_norm": 0.16616983711719513, |
|
"kl": 0.0005098581314086914, |
|
"learning_rate": 2.1846648795561774e-07, |
|
"loss": 0.0, |
|
"reward": 0.47767859883606434, |
|
"reward_std": 0.3662447426468134, |
|
"rewards/accuracy_reward": 0.053571431897580624, |
|
"rewards/format_reward": 0.026785715483129025, |
|
"rewards/tag_count_reward": 0.39732144586741924, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 8463.054054260254, |
|
"epoch": 0.8, |
|
"grad_norm": 0.1351144015789032, |
|
"kl": 0.0005650520324707031, |
|
"learning_rate": 2.016362752124129e-07, |
|
"loss": 0.0, |
|
"reward": 0.5178571599535644, |
|
"reward_std": 0.4144706530496478, |
|
"rewards/accuracy_reward": 0.15178572293370962, |
|
"rewards/format_reward": 0.06250000279396772, |
|
"rewards/tag_count_reward": 0.3035714398138225, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 7572.7681884765625, |
|
"epoch": 0.816, |
|
"grad_norm": 0.11977759003639221, |
|
"kl": 0.0005983114242553711, |
|
"learning_rate": 1.8594235253127372e-07, |
|
"loss": 0.0, |
|
"reward": 0.6250000293366611, |
|
"reward_std": 0.43050514021888375, |
|
"rewards/accuracy_reward": 0.2500000111758709, |
|
"rewards/format_reward": 0.053571430034935474, |
|
"rewards/tag_count_reward": 0.3214285848662257, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 12172.411254882812, |
|
"epoch": 0.832, |
|
"grad_norm": 0.22523346543312073, |
|
"kl": 0.0006231069564819336, |
|
"learning_rate": 1.7143591022596842e-07, |
|
"loss": 0.0, |
|
"reward": 0.6294643133878708, |
|
"reward_std": 0.5201228363439441, |
|
"rewards/accuracy_reward": 0.0625000037252903, |
|
"rewards/format_reward": 0.10714286286383867, |
|
"rewards/tag_count_reward": 0.45982144586741924, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 12255.000625610352, |
|
"epoch": 0.848, |
|
"grad_norm": 0.13431492447853088, |
|
"kl": 0.0006296634674072266, |
|
"learning_rate": 1.5816426529610032e-07, |
|
"loss": 0.0, |
|
"reward": 0.6406250288709998, |
|
"reward_std": 0.4715617010369897, |
|
"rewards/accuracy_reward": 0.1964285783469677, |
|
"rewards/format_reward": 0.06250000279396772, |
|
"rewards/tag_count_reward": 0.38169644121080637, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 9697.938034057617, |
|
"epoch": 0.864, |
|
"grad_norm": 0.15816675126552582, |
|
"kl": 0.0006450414657592773, |
|
"learning_rate": 1.461707070888888e-07, |
|
"loss": 0.0001, |
|
"reward": 0.6540178917348385, |
|
"reward_std": 0.459601366892457, |
|
"rewards/accuracy_reward": 0.16964286752045155, |
|
"rewards/format_reward": 0.06250000279396772, |
|
"rewards/tag_count_reward": 0.4218750186264515, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 13606.697204589844, |
|
"epoch": 0.88, |
|
"grad_norm": 0.15856020152568817, |
|
"kl": 0.000563502311706543, |
|
"learning_rate": 1.354943560983175e-07, |
|
"loss": 0.0, |
|
"reward": 0.5580357415601611, |
|
"reward_std": 0.43626588862389326, |
|
"rewards/accuracy_reward": 0.16071429383009672, |
|
"rewards/format_reward": 0.05357143096625805, |
|
"rewards/tag_count_reward": 0.34375001583248377, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 11956.571960449219, |
|
"epoch": 0.896, |
|
"grad_norm": 0.22786200046539307, |
|
"kl": 0.0005663633346557617, |
|
"learning_rate": 1.2617003636221394e-07, |
|
"loss": 0.0, |
|
"reward": 0.642857177183032, |
|
"reward_std": 0.49508959893137217, |
|
"rewards/accuracy_reward": 0.1607142947614193, |
|
"rewards/format_reward": 0.07142857555299997, |
|
"rewards/tag_count_reward": 0.4107143022119999, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 11972.813079833984, |
|
"epoch": 0.912, |
|
"grad_norm": 0.1637219786643982, |
|
"kl": 0.000640869140625, |
|
"learning_rate": 1.1822816187347622e-07, |
|
"loss": 0.0, |
|
"reward": 0.6941964700818062, |
|
"reward_std": 0.5343647776171565, |
|
"rewards/accuracy_reward": 0.13392857927829027, |
|
"rewards/format_reward": 0.09821429010480642, |
|
"rewards/tag_count_reward": 0.46205359417945147, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 10837.938003540039, |
|
"epoch": 0.928, |
|
"grad_norm": 0.18956886231899261, |
|
"kl": 0.0006705522537231445, |
|
"learning_rate": 1.1169463737594995e-07, |
|
"loss": 0.0, |
|
"reward": 0.7120535988360643, |
|
"reward_std": 0.45146402157843113, |
|
"rewards/accuracy_reward": 0.10714285913854837, |
|
"rewards/format_reward": 0.1250000074505806, |
|
"rewards/tag_count_reward": 0.4799107350409031, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 11165.286224365234, |
|
"epoch": 0.944, |
|
"grad_norm": 0.2458626627922058, |
|
"kl": 0.0006707906723022461, |
|
"learning_rate": 1.0659077386853815e-07, |
|
"loss": 0.0, |
|
"reward": 0.6183036025613546, |
|
"reward_std": 0.5205434840172529, |
|
"rewards/accuracy_reward": 0.0357142873108387, |
|
"rewards/format_reward": 0.09821429010480642, |
|
"rewards/tag_count_reward": 0.48437502421438694, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 10342.821807861328, |
|
"epoch": 0.96, |
|
"grad_norm": 0.31647762656211853, |
|
"kl": 0.0007990598678588867, |
|
"learning_rate": 1.0293321909315241e-07, |
|
"loss": 0.0, |
|
"reward": 0.7611607518047094, |
|
"reward_std": 0.5515336757525802, |
|
"rewards/accuracy_reward": 0.1607142947614193, |
|
"rewards/format_reward": 0.10714286286383867, |
|
"rewards/tag_count_reward": 0.4933035932481289, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 13808.741760253906, |
|
"epoch": 0.976, |
|
"grad_norm": 0.182793527841568, |
|
"kl": 0.000666499137878418, |
|
"learning_rate": 1.0073390323323897e-07, |
|
"loss": 0.0, |
|
"reward": 0.5178571622818708, |
|
"reward_std": 0.4044363498687744, |
|
"rewards/accuracy_reward": 0.0714285746216774, |
|
"rewards/format_reward": 0.044642859138548374, |
|
"rewards/tag_count_reward": 0.40178572945296764, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 13192.063171386719, |
|
"epoch": 0.992, |
|
"grad_norm": 0.23750482499599457, |
|
"kl": 0.0006361007690429688, |
|
"learning_rate": 1e-07, |
|
"loss": 0.0, |
|
"reward": 0.4955357387661934, |
|
"reward_std": 0.3679911820217967, |
|
"rewards/accuracy_reward": 0.026785715483129025, |
|
"rewards/format_reward": 0.044642859138548374, |
|
"rewards/tag_count_reward": 0.42410716228187084, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"step": 62, |
|
"total_flos": 0.0, |
|
"train_loss": 1.9587027928480438e-05, |
|
"train_runtime": 139509.2453, |
|
"train_samples_per_second": 0.007, |
|
"train_steps_per_second": 0.0 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 62, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|