Blancy's picture
Model save
7d04e6e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.992,
"eval_steps": 500,
"global_step": 62,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 5767.000259399414,
"epoch": 0.016,
"grad_norm": 0.0023286771029233932,
"kl": 0.0,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0,
"reward": 0.5625000335276127,
"reward_std": 0.2650662618689239,
"rewards/accuracy_reward": 0.3571428805589676,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.20535715110599995,
"step": 1
},
{
"completion_length": 6333.000259399414,
"epoch": 0.032,
"grad_norm": 0.0020095149520784616,
"kl": 0.0,
"learning_rate": 2.857142857142857e-07,
"loss": 0.0,
"reward": 0.2767857266589999,
"reward_std": 0.14067190792411566,
"rewards/accuracy_reward": 0.0803571492433548,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.19642857927829027,
"step": 2
},
{
"completion_length": 6795.750213623047,
"epoch": 0.048,
"grad_norm": 0.002763825235888362,
"kl": 0.0002949237823486328,
"learning_rate": 4.285714285714285e-07,
"loss": 0.0,
"reward": 0.39955359511077404,
"reward_std": 0.19223998952656984,
"rewards/accuracy_reward": 0.1875000111758709,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.21205358067527413,
"step": 3
},
{
"completion_length": 5619.562652587891,
"epoch": 0.064,
"grad_norm": 0.0036455015651881695,
"kl": 0.00029200315475463867,
"learning_rate": 5.714285714285714e-07,
"loss": 0.0,
"reward": 0.3616071604192257,
"reward_std": 0.18272343277931213,
"rewards/accuracy_reward": 0.160714291036129,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.1919642947614193,
"step": 4
},
{
"completion_length": 5378.000259399414,
"epoch": 0.08,
"grad_norm": 0.008862568065524101,
"kl": 0.0003243684768676758,
"learning_rate": 7.142857142857143e-07,
"loss": 0.0,
"reward": 0.33035715809091926,
"reward_std": 0.22553458344191313,
"rewards/accuracy_reward": 0.1339285783469677,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.19642857881262898,
"step": 5
},
{
"completion_length": 6547.687942504883,
"epoch": 0.096,
"grad_norm": 0.0034230498131364584,
"kl": 0.0003108382225036621,
"learning_rate": 8.57142857142857e-07,
"loss": 0.0,
"reward": 0.4107143026776612,
"reward_std": 0.21918057976290584,
"rewards/accuracy_reward": 0.2142857238650322,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.18750000791624188,
"step": 6
},
{
"completion_length": 3935.875144958496,
"epoch": 0.112,
"grad_norm": 0.003568751970306039,
"kl": 0.00029009580612182617,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.3683035923168063,
"reward_std": 0.2914058305323124,
"rewards/accuracy_reward": 0.1696428656578064,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.1897321529686451,
"step": 7
},
{
"completion_length": 4970.687683105469,
"epoch": 0.128,
"grad_norm": 0.003850426757708192,
"kl": 0.00031566619873046875,
"learning_rate": 9.99266096766761e-07,
"loss": 0.0,
"reward": 0.38839287450537086,
"reward_std": 0.2490098038688302,
"rewards/accuracy_reward": 0.19642858020961285,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.19196429336443543,
"step": 8
},
{
"completion_length": 5700.437759399414,
"epoch": 0.144,
"grad_norm": 0.0036366251297295094,
"kl": 0.0003451108932495117,
"learning_rate": 9.970667809068474e-07,
"loss": 0.0,
"reward": 0.43973216600716114,
"reward_std": 0.19689470902085304,
"rewards/accuracy_reward": 0.2410714402794838,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.19866072200238705,
"step": 9
},
{
"completion_length": 7395.241371154785,
"epoch": 0.16,
"grad_norm": 0.010836871340870857,
"kl": 0.0002986788749694824,
"learning_rate": 9.934092261314617e-07,
"loss": 0.0,
"reward": 0.3459821566939354,
"reward_std": 0.17509831953793764,
"rewards/accuracy_reward": 0.14285714644938707,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.20312500838190317,
"step": 10
},
{
"completion_length": 7001.625335693359,
"epoch": 0.176,
"grad_norm": 0.019893741235136986,
"kl": 0.00028258562088012695,
"learning_rate": 9.883053626240501e-07,
"loss": 0.0,
"reward": 0.4218750260770321,
"reward_std": 0.24917150475084782,
"rewards/accuracy_reward": 0.23214287031441927,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.18973215110599995,
"step": 11
},
{
"completion_length": 6650.01815032959,
"epoch": 0.192,
"grad_norm": 0.013785521499812603,
"kl": 0.00031447410583496094,
"learning_rate": 9.817718381265238e-07,
"loss": 0.0,
"reward": 0.40178573317825794,
"reward_std": 0.2504308824427426,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.2142857238650322,
"step": 12
},
{
"completion_length": 4968.250289916992,
"epoch": 0.208,
"grad_norm": 0.004635457415133715,
"kl": 0.0003146529197692871,
"learning_rate": 9.738299636377862e-07,
"loss": 0.0,
"reward": 0.4598214505240321,
"reward_std": 0.3252177187241614,
"rewards/accuracy_reward": 0.2767857275903225,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.18303572293370962,
"step": 13
},
{
"completion_length": 6637.634262084961,
"epoch": 0.224,
"grad_norm": 0.01122356578707695,
"kl": 0.00028884410858154297,
"learning_rate": 9.645056439016825e-07,
"loss": 0.0,
"reward": 0.47321430686861277,
"reward_std": 0.25558971939608455,
"rewards/accuracy_reward": 0.2410714440047741,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.23214286472648382,
"step": 14
},
{
"completion_length": 5712.223403930664,
"epoch": 0.24,
"grad_norm": 0.01076567079871893,
"kl": 0.0002791881561279297,
"learning_rate": 9.538292929111112e-07,
"loss": 0.0,
"reward": 0.3616071557626128,
"reward_std": 0.27521964302286506,
"rewards/accuracy_reward": 0.1250000074505806,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.21875000931322575,
"step": 15
},
{
"completion_length": 7215.071830749512,
"epoch": 0.256,
"grad_norm": 0.020798197016119957,
"kl": 0.00030541419982910156,
"learning_rate": 9.418357347038998e-07,
"loss": 0.0,
"reward": 0.4397321636788547,
"reward_std": 0.3516994332894683,
"rewards/accuracy_reward": 0.258928582072258,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.18080357974395156,
"step": 16
},
{
"completion_length": 5953.2769775390625,
"epoch": 0.272,
"grad_norm": 0.013654655776917934,
"kl": 0.0002818107604980469,
"learning_rate": 9.285640897740315e-07,
"loss": 0.0,
"reward": 0.3035714435391128,
"reward_std": 0.21893947944045067,
"rewards/accuracy_reward": 0.12500000558793545,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.17857143539004028,
"step": 17
},
{
"completion_length": 6048.937774658203,
"epoch": 0.288,
"grad_norm": 0.013907884247601032,
"kl": 0.00029546022415161133,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0,
"reward": 0.4397321669384837,
"reward_std": 0.2832956803031266,
"rewards/accuracy_reward": 0.2321428656578064,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.20758929662406445,
"step": 18
},
{
"completion_length": 5523.152084350586,
"epoch": 0.304,
"grad_norm": 0.017561404034495354,
"kl": 0.0002797245979309082,
"learning_rate": 8.983637247875872e-07,
"loss": 0.0,
"reward": 0.2946428721770644,
"reward_std": 0.17000702070072293,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.23214287031441927,
"step": 19
},
{
"completion_length": 6402.52702331543,
"epoch": 0.32,
"grad_norm": 0.023619119077920914,
"kl": 0.0002855658531188965,
"learning_rate": 8.81533512044382e-07,
"loss": 0.0,
"reward": 0.39955358672887087,
"reward_std": 0.2746177548542619,
"rewards/accuracy_reward": 0.17857143748551607,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.21205357927829027,
"step": 20
},
{
"completion_length": 7257.973442077637,
"epoch": 0.336,
"grad_norm": 0.024335632100701332,
"kl": 0.0003019571304321289,
"learning_rate": 8.636219058948822e-07,
"loss": 0.0,
"reward": 0.38169644912704825,
"reward_std": 0.2994211660698056,
"rewards/accuracy_reward": 0.15178572107106447,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.2209821525029838,
"step": 21
},
{
"completion_length": 6505.696727752686,
"epoch": 0.352,
"grad_norm": 0.015834977850317955,
"kl": 0.00024816393852233887,
"learning_rate": 8.446873302753783e-07,
"loss": 0.0,
"reward": 0.4196428768336773,
"reward_std": 0.348601452074945,
"rewards/accuracy_reward": 0.20535715483129025,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.20535715110599995,
"step": 22
},
{
"completion_length": 6381.687782287598,
"epoch": 0.368,
"grad_norm": 0.003612571395933628,
"kl": 0.00027638673782348633,
"learning_rate": 8.247915458359471e-07,
"loss": 0.0,
"reward": 0.6183036016300321,
"reward_std": 0.4431227990426123,
"rewards/accuracy_reward": 0.41964287776499987,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.19866072200238705,
"step": 23
},
{
"completion_length": 6749.973495483398,
"epoch": 0.384,
"grad_norm": 0.03762313351035118,
"kl": 0.00029355287551879883,
"learning_rate": 8.039994484900462e-07,
"loss": 0.0,
"reward": 0.703125043772161,
"reward_std": 0.5290834615007043,
"rewards/accuracy_reward": 0.5089286034926772,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.18526786682195961,
"step": 24
},
{
"completion_length": 7313.741317749023,
"epoch": 0.4,
"grad_norm": 0.054487138986587524,
"kl": 0.0003046095371246338,
"learning_rate": 7.823788577375328e-07,
"loss": 0.0,
"reward": 0.7790178973227739,
"reward_std": 0.6166830230504274,
"rewards/accuracy_reward": 0.5357143124565482,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.24330358393490314,
"step": 25
},
{
"completion_length": 8691.36654663086,
"epoch": 0.416,
"grad_norm": 0.030647600069642067,
"kl": 0.0002758502960205078,
"learning_rate": 7.600002954515531e-07,
"loss": 0.0,
"reward": 0.638392886146903,
"reward_std": 0.48968397080898285,
"rewards/accuracy_reward": 0.38392858672887087,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.23660715110599995,
"step": 26
},
{
"completion_length": 8900.545036315918,
"epoch": 0.432,
"grad_norm": 0.07716870307922363,
"kl": 0.00028139352798461914,
"learning_rate": 7.36936755850849e-07,
"loss": 0.0001,
"reward": 0.9486607648432255,
"reward_std": 0.5833425354212523,
"rewards/accuracy_reward": 0.5714285988360643,
"rewards/format_reward": 0.0357142873108387,
"rewards/tag_count_reward": 0.34151787450537086,
"step": 27
},
{
"completion_length": 6317.732406616211,
"epoch": 0.448,
"grad_norm": 0.029883868992328644,
"kl": 0.00029206275939941406,
"learning_rate": 7.132634674077883e-07,
"loss": 0.0,
"reward": 0.7343750456348062,
"reward_std": 0.49824068509042263,
"rewards/accuracy_reward": 0.5357143143191934,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.19866072479635477,
"step": 28
},
{
"completion_length": 9996.053977966309,
"epoch": 0.464,
"grad_norm": 0.06153450533747673,
"kl": 0.0002873539924621582,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0,
"reward": 0.7254464644938707,
"reward_std": 0.5397462174296379,
"rewards/accuracy_reward": 0.419642873108387,
"rewards/format_reward": 0.05357143096625805,
"rewards/tag_count_reward": 0.2522321562282741,
"step": 29
},
{
"completion_length": 7258.732467651367,
"epoch": 0.48,
"grad_norm": 0.033417101949453354,
"kl": 0.0002796053886413574,
"learning_rate": 6.643982503870692e-07,
"loss": 0.0,
"reward": 0.6473214533179998,
"reward_std": 0.5155255328863859,
"rewards/accuracy_reward": 0.42857144493609667,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.20982143795117736,
"step": 30
},
{
"completion_length": 10728.00942993164,
"epoch": 0.496,
"grad_norm": 0.06687657535076141,
"kl": 0.0002480745315551758,
"learning_rate": 6.393657099905853e-07,
"loss": 0.0,
"reward": 0.8504464663565159,
"reward_std": 0.5495956116355956,
"rewards/accuracy_reward": 0.4821428768336773,
"rewards/format_reward": 0.05357143096625805,
"rewards/tag_count_reward": 0.31473215483129025,
"step": 31
},
{
"completion_length": 8516.982543945312,
"epoch": 0.512,
"grad_norm": 0.04354412481188774,
"kl": 0.0002473890781402588,
"learning_rate": 6.140416772229784e-07,
"loss": 0.0,
"reward": 0.6741071790456772,
"reward_std": 0.45561165921390057,
"rewards/accuracy_reward": 0.4107143012806773,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.2455357238650322,
"step": 32
},
{
"completion_length": 8822.393253326416,
"epoch": 0.528,
"grad_norm": 0.05626585707068443,
"kl": 0.0002841353416442871,
"learning_rate": 5.88508753815478e-07,
"loss": 0.0001,
"reward": 0.6629464644938707,
"reward_std": 0.42721313470974565,
"rewards/accuracy_reward": 0.3928571566939354,
"rewards/format_reward": 0.008928571827709675,
"rewards/tag_count_reward": 0.26116072479635477,
"step": 33
},
{
"completion_length": 10038.589805603027,
"epoch": 0.544,
"grad_norm": 0.05266648903489113,
"kl": 0.00027632713317871094,
"learning_rate": 5.628502228571632e-07,
"loss": 0.0,
"reward": 0.7388393133878708,
"reward_std": 0.4553453531116247,
"rewards/accuracy_reward": 0.4285714467987418,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.2924107250291854,
"step": 34
},
{
"completion_length": 8031.839714050293,
"epoch": 0.56,
"grad_norm": 0.05634909123182297,
"kl": 0.0003362894058227539,
"learning_rate": 5.371497771428367e-07,
"loss": 0.0,
"reward": 0.9084821827709675,
"reward_std": 0.5687091294676065,
"rewards/accuracy_reward": 0.5625000195577741,
"rewards/format_reward": 0.044642859138548374,
"rewards/tag_count_reward": 0.30133930034935474,
"step": 35
},
{
"completion_length": 9826.420181274414,
"epoch": 0.576,
"grad_norm": 0.07227327674627304,
"kl": 0.00029417872428894043,
"learning_rate": 5.114912461845222e-07,
"loss": 0.0001,
"reward": 0.8482143394649029,
"reward_std": 0.46484142914414406,
"rewards/accuracy_reward": 0.5089285997673869,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.321428582072258,
"step": 36
},
{
"completion_length": 10691.991607666016,
"epoch": 0.592,
"grad_norm": 0.07488638162612915,
"kl": 0.00031763315200805664,
"learning_rate": 4.859583227770217e-07,
"loss": 0.0,
"reward": 0.8169643171131611,
"reward_std": 0.5879267286509275,
"rewards/accuracy_reward": 0.44642859045416117,
"rewards/format_reward": 0.044642859138548374,
"rewards/tag_count_reward": 0.32589287776499987,
"step": 37
},
{
"completion_length": 9507.875434875488,
"epoch": 0.608,
"grad_norm": 0.0768597424030304,
"kl": 0.0003135800361633301,
"learning_rate": 4.606342900094147e-07,
"loss": 0.0,
"reward": 0.7522321715950966,
"reward_std": 0.4764951467514038,
"rewards/accuracy_reward": 0.348214297555387,
"rewards/format_reward": 0.0625000037252903,
"rewards/tag_count_reward": 0.3415178721770644,
"step": 38
},
{
"completion_length": 8807.152183532715,
"epoch": 0.624,
"grad_norm": 0.08563056588172913,
"kl": 0.0003771781921386719,
"learning_rate": 4.3560174961293094e-07,
"loss": 0.0,
"reward": 0.8482143264263868,
"reward_std": 0.6095239520072937,
"rewards/accuracy_reward": 0.5089285960420966,
"rewards/format_reward": 0.0357142873108387,
"rewards/tag_count_reward": 0.3035714440047741,
"step": 39
},
{
"completion_length": 9334.85765838623,
"epoch": 0.64,
"grad_norm": 0.0718473568558693,
"kl": 0.00036656856536865234,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0,
"reward": 0.6473214626312256,
"reward_std": 0.4437146638520062,
"rewards/accuracy_reward": 0.3214285857975483,
"rewards/format_reward": 0.026785715483129025,
"rewards/tag_count_reward": 0.2991071594879031,
"step": 40
},
{
"completion_length": 9168.080871582031,
"epoch": 0.656,
"grad_norm": 0.06268931925296783,
"kl": 0.00036334991455078125,
"learning_rate": 3.867365325922116e-07,
"loss": 0.0,
"reward": 0.6607143171131611,
"reward_std": 0.5487463716417551,
"rewards/accuracy_reward": 0.3750000176951289,
"rewards/format_reward": 0.0357142873108387,
"rewards/tag_count_reward": 0.2500000074505806,
"step": 41
},
{
"completion_length": 9371.27727508545,
"epoch": 0.672,
"grad_norm": 0.06987561285495758,
"kl": 0.00038748979568481445,
"learning_rate": 3.630632441491511e-07,
"loss": 0.0,
"reward": 0.6339286072179675,
"reward_std": 0.38090350618585944,
"rewards/accuracy_reward": 0.39285716600716114,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.24107143888249993,
"step": 42
},
{
"completion_length": 7754.02717590332,
"epoch": 0.688,
"grad_norm": 0.0807015672326088,
"kl": 0.00042748451232910156,
"learning_rate": 3.3999970454844686e-07,
"loss": 0.0,
"reward": 0.6428571809083223,
"reward_std": 0.4524891930632293,
"rewards/accuracy_reward": 0.37500002328306437,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.2500000102445483,
"step": 43
},
{
"completion_length": 9262.286231994629,
"epoch": 0.704,
"grad_norm": 0.10762109607458115,
"kl": 0.0004887580871582031,
"learning_rate": 3.1762114226246716e-07,
"loss": 0.0,
"reward": 0.7477678954601288,
"reward_std": 0.4671580633148551,
"rewards/accuracy_reward": 0.4732143096625805,
"rewards/format_reward": 0.026785715483129025,
"rewards/tag_count_reward": 0.2477678651921451,
"step": 44
},
{
"completion_length": 9673.063018798828,
"epoch": 0.72,
"grad_norm": 0.08651269227266312,
"kl": 0.0004634857177734375,
"learning_rate": 2.9600055150995395e-07,
"loss": 0.0,
"reward": 0.7544643264263868,
"reward_std": 0.44256059266626835,
"rewards/accuracy_reward": 0.43750001303851604,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.2991071557626128,
"step": 45
},
{
"completion_length": 7535.82181930542,
"epoch": 0.736,
"grad_norm": 0.07591980695724487,
"kl": 0.0005333423614501953,
"learning_rate": 2.752084541640528e-07,
"loss": 0.0,
"reward": 0.674107177183032,
"reward_std": 0.3459728816524148,
"rewards/accuracy_reward": 0.40178572945296764,
"rewards/format_reward": 0.01785714365541935,
"rewards/tag_count_reward": 0.2544642984867096,
"step": 46
},
{
"completion_length": 10174.214828491211,
"epoch": 0.752,
"grad_norm": 0.14260835945606232,
"kl": 0.0004648566246032715,
"learning_rate": 2.553126697246217e-07,
"loss": 0.0,
"reward": 0.5022321678698063,
"reward_std": 0.42994912061840296,
"rewards/accuracy_reward": 0.1607142947614193,
"rewards/format_reward": 0.0357142873108387,
"rewards/tag_count_reward": 0.30580358393490314,
"step": 47
},
{
"completion_length": 7719.678955078125,
"epoch": 0.768,
"grad_norm": 0.0768926590681076,
"kl": 0.0005067586898803711,
"learning_rate": 2.36378094105118e-07,
"loss": 0.0,
"reward": 0.5669643115252256,
"reward_std": 0.3042712449096143,
"rewards/accuracy_reward": 0.2142857201397419,
"rewards/format_reward": 0.026785715483129025,
"rewards/tag_count_reward": 0.3258928721770644,
"step": 48
},
{
"completion_length": 12495.509475708008,
"epoch": 0.784,
"grad_norm": 0.16616983711719513,
"kl": 0.0005098581314086914,
"learning_rate": 2.1846648795561774e-07,
"loss": 0.0,
"reward": 0.47767859883606434,
"reward_std": 0.3662447426468134,
"rewards/accuracy_reward": 0.053571431897580624,
"rewards/format_reward": 0.026785715483129025,
"rewards/tag_count_reward": 0.39732144586741924,
"step": 49
},
{
"completion_length": 8463.054054260254,
"epoch": 0.8,
"grad_norm": 0.1351144015789032,
"kl": 0.0005650520324707031,
"learning_rate": 2.016362752124129e-07,
"loss": 0.0,
"reward": 0.5178571599535644,
"reward_std": 0.4144706530496478,
"rewards/accuracy_reward": 0.15178572293370962,
"rewards/format_reward": 0.06250000279396772,
"rewards/tag_count_reward": 0.3035714398138225,
"step": 50
},
{
"completion_length": 7572.7681884765625,
"epoch": 0.816,
"grad_norm": 0.11977759003639221,
"kl": 0.0005983114242553711,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0,
"reward": 0.6250000293366611,
"reward_std": 0.43050514021888375,
"rewards/accuracy_reward": 0.2500000111758709,
"rewards/format_reward": 0.053571430034935474,
"rewards/tag_count_reward": 0.3214285848662257,
"step": 51
},
{
"completion_length": 12172.411254882812,
"epoch": 0.832,
"grad_norm": 0.22523346543312073,
"kl": 0.0006231069564819336,
"learning_rate": 1.7143591022596842e-07,
"loss": 0.0,
"reward": 0.6294643133878708,
"reward_std": 0.5201228363439441,
"rewards/accuracy_reward": 0.0625000037252903,
"rewards/format_reward": 0.10714286286383867,
"rewards/tag_count_reward": 0.45982144586741924,
"step": 52
},
{
"completion_length": 12255.000625610352,
"epoch": 0.848,
"grad_norm": 0.13431492447853088,
"kl": 0.0006296634674072266,
"learning_rate": 1.5816426529610032e-07,
"loss": 0.0,
"reward": 0.6406250288709998,
"reward_std": 0.4715617010369897,
"rewards/accuracy_reward": 0.1964285783469677,
"rewards/format_reward": 0.06250000279396772,
"rewards/tag_count_reward": 0.38169644121080637,
"step": 53
},
{
"completion_length": 9697.938034057617,
"epoch": 0.864,
"grad_norm": 0.15816675126552582,
"kl": 0.0006450414657592773,
"learning_rate": 1.461707070888888e-07,
"loss": 0.0001,
"reward": 0.6540178917348385,
"reward_std": 0.459601366892457,
"rewards/accuracy_reward": 0.16964286752045155,
"rewards/format_reward": 0.06250000279396772,
"rewards/tag_count_reward": 0.4218750186264515,
"step": 54
},
{
"completion_length": 13606.697204589844,
"epoch": 0.88,
"grad_norm": 0.15856020152568817,
"kl": 0.000563502311706543,
"learning_rate": 1.354943560983175e-07,
"loss": 0.0,
"reward": 0.5580357415601611,
"reward_std": 0.43626588862389326,
"rewards/accuracy_reward": 0.16071429383009672,
"rewards/format_reward": 0.05357143096625805,
"rewards/tag_count_reward": 0.34375001583248377,
"step": 55
},
{
"completion_length": 11956.571960449219,
"epoch": 0.896,
"grad_norm": 0.22786200046539307,
"kl": 0.0005663633346557617,
"learning_rate": 1.2617003636221394e-07,
"loss": 0.0,
"reward": 0.642857177183032,
"reward_std": 0.49508959893137217,
"rewards/accuracy_reward": 0.1607142947614193,
"rewards/format_reward": 0.07142857555299997,
"rewards/tag_count_reward": 0.4107143022119999,
"step": 56
},
{
"completion_length": 11972.813079833984,
"epoch": 0.912,
"grad_norm": 0.1637219786643982,
"kl": 0.000640869140625,
"learning_rate": 1.1822816187347622e-07,
"loss": 0.0,
"reward": 0.6941964700818062,
"reward_std": 0.5343647776171565,
"rewards/accuracy_reward": 0.13392857927829027,
"rewards/format_reward": 0.09821429010480642,
"rewards/tag_count_reward": 0.46205359417945147,
"step": 57
},
{
"completion_length": 10837.938003540039,
"epoch": 0.928,
"grad_norm": 0.18956886231899261,
"kl": 0.0006705522537231445,
"learning_rate": 1.1169463737594995e-07,
"loss": 0.0,
"reward": 0.7120535988360643,
"reward_std": 0.45146402157843113,
"rewards/accuracy_reward": 0.10714285913854837,
"rewards/format_reward": 0.1250000074505806,
"rewards/tag_count_reward": 0.4799107350409031,
"step": 58
},
{
"completion_length": 11165.286224365234,
"epoch": 0.944,
"grad_norm": 0.2458626627922058,
"kl": 0.0006707906723022461,
"learning_rate": 1.0659077386853815e-07,
"loss": 0.0,
"reward": 0.6183036025613546,
"reward_std": 0.5205434840172529,
"rewards/accuracy_reward": 0.0357142873108387,
"rewards/format_reward": 0.09821429010480642,
"rewards/tag_count_reward": 0.48437502421438694,
"step": 59
},
{
"completion_length": 10342.821807861328,
"epoch": 0.96,
"grad_norm": 0.31647762656211853,
"kl": 0.0007990598678588867,
"learning_rate": 1.0293321909315241e-07,
"loss": 0.0,
"reward": 0.7611607518047094,
"reward_std": 0.5515336757525802,
"rewards/accuracy_reward": 0.1607142947614193,
"rewards/format_reward": 0.10714286286383867,
"rewards/tag_count_reward": 0.4933035932481289,
"step": 60
},
{
"completion_length": 13808.741760253906,
"epoch": 0.976,
"grad_norm": 0.182793527841568,
"kl": 0.000666499137878418,
"learning_rate": 1.0073390323323897e-07,
"loss": 0.0,
"reward": 0.5178571622818708,
"reward_std": 0.4044363498687744,
"rewards/accuracy_reward": 0.0714285746216774,
"rewards/format_reward": 0.044642859138548374,
"rewards/tag_count_reward": 0.40178572945296764,
"step": 61
},
{
"completion_length": 13192.063171386719,
"epoch": 0.992,
"grad_norm": 0.23750482499599457,
"kl": 0.0006361007690429688,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": 0.4955357387661934,
"reward_std": 0.3679911820217967,
"rewards/accuracy_reward": 0.026785715483129025,
"rewards/format_reward": 0.044642859138548374,
"rewards/tag_count_reward": 0.42410716228187084,
"step": 62
},
{
"epoch": 0.992,
"step": 62,
"total_flos": 0.0,
"train_loss": 1.9587027928480438e-05,
"train_runtime": 139509.2453,
"train_samples_per_second": 0.007,
"train_steps_per_second": 0.0
}
],
"logging_steps": 1,
"max_steps": 62,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}