Blancy's picture
Model save
a5f1ee4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9940119760479041,
"eval_steps": 500,
"global_step": 83,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 1190.78125,
"epoch": 0.011976047904191617,
"grad_norm": 36.3756103515625,
"kl": 0.0,
"learning_rate": 1.111111111111111e-07,
"loss": 0.0,
"reward": 0.6835937760770321,
"reward_std": 0.11635640449821949,
"rewards/accuracy_reward": 0.6302083432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0533854179084301,
"step": 1
},
{
"completion_length": 1470.078125,
"epoch": 0.023952095808383235,
"grad_norm": 27.217134475708008,
"kl": 0.0,
"learning_rate": 2.222222222222222e-07,
"loss": 0.0,
"reward": 0.5325521007180214,
"reward_std": 0.13405859377235174,
"rewards/accuracy_reward": 0.4947916716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0377604179084301,
"step": 2
},
{
"completion_length": 1473.5,
"epoch": 0.03592814371257485,
"grad_norm": 12.41641902923584,
"kl": 0.0007615089416503906,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"reward": 0.4713541716337204,
"reward_std": 0.11533699464052916,
"rewards/accuracy_reward": 0.42187501257285476,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0494791679084301,
"step": 3
},
{
"completion_length": 896.5625,
"epoch": 0.04790419161676647,
"grad_norm": 30.794960021972656,
"kl": 0.0019729137420654297,
"learning_rate": 4.444444444444444e-07,
"loss": 0.0001,
"reward": 0.5976562760770321,
"reward_std": 0.13709542341530323,
"rewards/accuracy_reward": 0.5625000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.03515625197906047,
"step": 4
},
{
"completion_length": 1053.5520935058594,
"epoch": 0.059880239520958084,
"grad_norm": 14.93273639678955,
"kl": 0.0015816688537597656,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0001,
"reward": 0.5976562611758709,
"reward_std": 0.16129255667328835,
"rewards/accuracy_reward": 0.5625000111758709,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.03515625,
"step": 5
},
{
"completion_length": 1321.125,
"epoch": 0.0718562874251497,
"grad_norm": 20.902679443359375,
"kl": 0.00043714046478271484,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"reward": 0.5429687525611371,
"reward_std": 0.07764231134206057,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.04296875069849193,
"step": 6
},
{
"completion_length": 996.9166717529297,
"epoch": 0.08383233532934131,
"grad_norm": 32.750858306884766,
"kl": 0.006168365478515625,
"learning_rate": 7.777777777777778e-07,
"loss": 0.0002,
"reward": 0.7981771156191826,
"reward_std": 0.12683826312422752,
"rewards/accuracy_reward": 0.7239583507180214,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0742187537252903,
"step": 7
},
{
"completion_length": 1024.3125,
"epoch": 0.09580838323353294,
"grad_norm": 18.83184051513672,
"kl": 0.016448974609375,
"learning_rate": 8.888888888888888e-07,
"loss": 0.0007,
"reward": 0.720052108168602,
"reward_std": 0.15430233627557755,
"rewards/accuracy_reward": 0.6718750149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.048177084885537624,
"step": 8
},
{
"completion_length": 1155.125,
"epoch": 0.10778443113772455,
"grad_norm": 16.292612075805664,
"kl": 0.025983810424804688,
"learning_rate": 1e-06,
"loss": 0.001,
"reward": 0.5546875074505806,
"reward_std": 0.10253959987312555,
"rewards/accuracy_reward": 0.5052083432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.049479166977107525,
"step": 9
},
{
"completion_length": 1780.625,
"epoch": 0.11976047904191617,
"grad_norm": 4.369594097137451,
"kl": 0.03295087814331055,
"learning_rate": 9.995945347921067e-07,
"loss": 0.0013,
"reward": 0.3697916716337204,
"reward_std": 0.13915570452809334,
"rewards/accuracy_reward": 0.3489583507180214,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.02083333395421505,
"step": 10
},
{
"completion_length": 1026.125,
"epoch": 0.1317365269461078,
"grad_norm": 19.12957763671875,
"kl": 0.23905563354492188,
"learning_rate": 9.983788698441369e-07,
"loss": 0.0096,
"reward": 0.6992187798023224,
"reward_std": 0.12939812522381544,
"rewards/accuracy_reward": 0.6406250074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.05859375256113708,
"step": 11
},
{
"completion_length": 1603.2135620117188,
"epoch": 0.1437125748502994,
"grad_norm": 18.756772994995117,
"kl": 0.11810016632080078,
"learning_rate": 9.963551958664945e-07,
"loss": 0.0047,
"reward": 0.3658854365348816,
"reward_std": 0.0706010814756155,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.03255208441987634,
"step": 12
},
{
"completion_length": 1135.5885620117188,
"epoch": 0.15568862275449102,
"grad_norm": 11.572704315185547,
"kl": 0.27099609375,
"learning_rate": 9.935271596564688e-07,
"loss": 0.0108,
"reward": 0.7382812723517418,
"reward_std": 0.11970062833279371,
"rewards/accuracy_reward": 0.6770833432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0611979179084301,
"step": 13
},
{
"completion_length": 1584.75,
"epoch": 0.16766467065868262,
"grad_norm": 4.483678817749023,
"kl": 0.29352617263793945,
"learning_rate": 9.898998575264588e-07,
"loss": 0.0117,
"reward": 0.4557291716337204,
"reward_std": 0.07522482145577669,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.03906250186264515,
"step": 14
},
{
"completion_length": 1438.75,
"epoch": 0.17964071856287425,
"grad_norm": 4.186062812805176,
"kl": 0.42242431640625,
"learning_rate": 9.854798261200746e-07,
"loss": 0.0169,
"reward": 0.502604179084301,
"reward_std": 0.1203515324741602,
"rewards/accuracy_reward": 0.43750001303851604,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06510416697710752,
"step": 15
},
{
"completion_length": 1288.34375,
"epoch": 0.19161676646706588,
"grad_norm": 15.665771484375,
"kl": 0.84912109375,
"learning_rate": 9.80275030632663e-07,
"loss": 0.034,
"reward": 0.6640625298023224,
"reward_std": 0.10871894843876362,
"rewards/accuracy_reward": 0.5885416865348816,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07552083674818277,
"step": 16
},
{
"completion_length": 1175.25,
"epoch": 0.20359281437125748,
"grad_norm": 8.767546653747559,
"kl": 0.7672920227050781,
"learning_rate": 9.742948504574879e-07,
"loss": 0.0306,
"reward": 0.6406250149011612,
"reward_std": 0.13799083977937698,
"rewards/accuracy_reward": 0.5572916753590107,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08333333488553762,
"step": 17
},
{
"completion_length": 1315.5,
"epoch": 0.2155688622754491,
"grad_norm": 2.373502016067505,
"kl": 0.21795654296875,
"learning_rate": 9.675500622834293e-07,
"loss": 0.0087,
"reward": 0.42578125,
"reward_std": 0.15298314206302166,
"rewards/accuracy_reward": 0.3645833507180214,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.061197918839752674,
"step": 18
},
{
"completion_length": 1290.0625,
"epoch": 0.2275449101796407,
"grad_norm": 8.432291984558105,
"kl": 0.6476497650146484,
"learning_rate": 9.60052820674661e-07,
"loss": 0.0259,
"reward": 0.673177108168602,
"reward_std": 0.12136406265199184,
"rewards/accuracy_reward": 0.6041666772216558,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06901041883975267,
"step": 19
},
{
"completion_length": 1698.1666870117188,
"epoch": 0.23952095808383234,
"grad_norm": 7.2242431640625,
"kl": 0.6529922485351562,
"learning_rate": 9.518166361673058e-07,
"loss": 0.0261,
"reward": 0.38151043420657516,
"reward_std": 0.08616631850600243,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.04817708535119891,
"step": 20
},
{
"completion_length": 1424.734375,
"epoch": 0.25149700598802394,
"grad_norm": 6.204524993896484,
"kl": 0.6253471374511719,
"learning_rate": 9.428563509225346e-07,
"loss": 0.0251,
"reward": 0.4765625149011612,
"reward_std": 0.08804207853972912,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.059895834885537624,
"step": 21
},
{
"completion_length": 1133.75,
"epoch": 0.2634730538922156,
"grad_norm": 3.5956451892852783,
"kl": 0.60986328125,
"learning_rate": 9.3318811197999e-07,
"loss": 0.0244,
"reward": 0.6497396156191826,
"reward_std": 0.106993043795228,
"rewards/accuracy_reward": 0.5937500074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.055989584885537624,
"step": 22
},
{
"completion_length": 2152.4479370117188,
"epoch": 0.2754491017964072,
"grad_norm": 1.9209452867507935,
"kl": 0.004711151123046875,
"learning_rate": 9.228293421597289e-07,
"loss": 0.0002,
"reward": 0.2018229179084301,
"reward_std": 0.0970163643360138,
"rewards/accuracy_reward": 0.17187500512227416,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.029947917442768812,
"step": 23
},
{
"completion_length": 1283.875,
"epoch": 0.2874251497005988,
"grad_norm": 3.2163443565368652,
"kl": 0.5288314819335938,
"learning_rate": 9.117987086651232e-07,
"loss": 0.0211,
"reward": 0.5130208432674408,
"reward_std": 0.13065862283110619,
"rewards/accuracy_reward": 0.44791667675599456,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06510416883975267,
"step": 24
},
{
"completion_length": 1132.5625,
"epoch": 0.2994011976047904,
"grad_norm": 3.4035706520080566,
"kl": 1.2806243896484375,
"learning_rate": 9.001160894432978e-07,
"loss": 0.0513,
"reward": 0.6106770932674408,
"reward_std": 0.13533879444003105,
"rewards/accuracy_reward": 0.5312500149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07942708488553762,
"step": 25
},
{
"completion_length": 1262.109375,
"epoch": 0.31137724550898205,
"grad_norm": 6.233963966369629,
"kl": 0.724578857421875,
"learning_rate": 8.878025373637259e-07,
"loss": 0.029,
"reward": 0.6315104318782687,
"reward_std": 0.13147221505641937,
"rewards/accuracy_reward": 0.5625000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06901041883975267,
"step": 26
},
{
"completion_length": 1111.1875,
"epoch": 0.32335329341317365,
"grad_norm": 11.196812629699707,
"kl": 1.4609375,
"learning_rate": 8.748802422795359e-07,
"loss": 0.0584,
"reward": 0.7526042014360428,
"reward_std": 0.11228201538324356,
"rewards/accuracy_reward": 0.666666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08593750279396772,
"step": 27
},
{
"completion_length": 1422.9375,
"epoch": 0.33532934131736525,
"grad_norm": 3.3535704612731934,
"kl": 0.4104576110839844,
"learning_rate": 8.613724910398959e-07,
"loss": 0.0164,
"reward": 0.6901042014360428,
"reward_std": 0.15475903172045946,
"rewards/accuracy_reward": 0.6093750149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08072917046956718,
"step": 28
},
{
"completion_length": 1568.6198120117188,
"epoch": 0.3473053892215569,
"grad_norm": 2.9321556091308594,
"kl": 0.41168212890625,
"learning_rate": 8.473036255255366e-07,
"loss": 0.0165,
"reward": 0.3984375111758709,
"reward_std": 0.19856118597090244,
"rewards/accuracy_reward": 0.338541679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.05989583395421505,
"step": 29
},
{
"completion_length": 1832.75,
"epoch": 0.3592814371257485,
"grad_norm": 4.1487956047058105,
"kl": 0.36643218994140625,
"learning_rate": 8.32698998783039e-07,
"loss": 0.0147,
"reward": 0.3750000149011612,
"reward_std": 0.17254010029137135,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0416666679084301,
"step": 30
},
{
"completion_length": 1245.125,
"epoch": 0.3712574850299401,
"grad_norm": 2.372616767883301,
"kl": 1.181640625,
"learning_rate": 8.17584929336929e-07,
"loss": 0.0472,
"reward": 0.604166679084301,
"reward_std": 0.1132371760904789,
"rewards/accuracy_reward": 0.5052083432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.09895833674818277,
"step": 31
},
{
"completion_length": 983.375,
"epoch": 0.38323353293413176,
"grad_norm": 6.431793212890625,
"kl": 1.734375,
"learning_rate": 8.019886537619179e-07,
"loss": 0.0694,
"reward": 0.614583358168602,
"reward_std": 0.13955211825668812,
"rewards/accuracy_reward": 0.5468750223517418,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06770833488553762,
"step": 32
},
{
"completion_length": 1390.5625,
"epoch": 0.39520958083832336,
"grad_norm": 3.662283420562744,
"kl": 0.57391357421875,
"learning_rate": 7.859382776007543e-07,
"loss": 0.023,
"reward": 0.48828126303851604,
"reward_std": 0.11240259557962418,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0716145858168602,
"step": 33
},
{
"completion_length": 1514.8958740234375,
"epoch": 0.40718562874251496,
"grad_norm": 4.9103546142578125,
"kl": 1.3856163024902344,
"learning_rate": 7.694627247161356e-07,
"loss": 0.0553,
"reward": 0.5234375055879354,
"reward_std": 0.10020329616963863,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.10677083767950535,
"step": 34
},
{
"completion_length": 1381.875,
"epoch": 0.41916167664670656,
"grad_norm": 15.807221412658691,
"kl": 1.150360107421875,
"learning_rate": 7.525916851679529e-07,
"loss": 0.0461,
"reward": 0.4739583432674408,
"reward_std": 0.09151106514036655,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.057291668839752674,
"step": 35
},
{
"completion_length": 1381.5,
"epoch": 0.4311377245508982,
"grad_norm": 2.794029712677002,
"kl": 0.950286865234375,
"learning_rate": 7.353555617097967e-07,
"loss": 0.038,
"reward": 0.5833333386108279,
"reward_std": 0.08740208484232426,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08333333674818277,
"step": 36
},
{
"completion_length": 1376.0625,
"epoch": 0.4431137724550898,
"grad_norm": 6.0238356590271,
"kl": 0.79541015625,
"learning_rate": 7.177854150011389e-07,
"loss": 0.0318,
"reward": 0.549479179084301,
"reward_std": 0.11059301160275936,
"rewards/accuracy_reward": 0.4739583432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07552083488553762,
"step": 37
},
{
"completion_length": 1776.8958740234375,
"epoch": 0.4550898203592814,
"grad_norm": 1.405756950378418,
"kl": 0.7037200927734375,
"learning_rate": 6.999129076339259e-07,
"loss": 0.028,
"reward": 0.39843751629814506,
"reward_std": 0.1277984417974949,
"rewards/accuracy_reward": 0.3489583386108279,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.04947916744276881,
"step": 38
},
{
"completion_length": 1105.6875,
"epoch": 0.46706586826347307,
"grad_norm": 3.609495162963867,
"kl": 1.002105712890625,
"learning_rate": 6.817702470744477e-07,
"loss": 0.0401,
"reward": 0.5859375149011612,
"reward_std": 0.10204238072037697,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0859375037252903,
"step": 39
},
{
"completion_length": 1078.125,
"epoch": 0.47904191616766467,
"grad_norm": 9.784010887145996,
"kl": 1.2890625,
"learning_rate": 6.633901276233064e-07,
"loss": 0.0517,
"reward": 0.673177108168602,
"reward_std": 0.11213659681379795,
"rewards/accuracy_reward": 0.5885416865348816,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08463541977107525,
"step": 40
},
{
"completion_length": 1378.8229370117188,
"epoch": 0.49101796407185627,
"grad_norm": 1.9631364345550537,
"kl": 0.4476318359375,
"learning_rate": 6.448056714980767e-07,
"loss": 0.0179,
"reward": 0.4648437676951289,
"reward_std": 0.12703735567629337,
"rewards/accuracy_reward": 0.3906250149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07421875093132257,
"step": 41
},
{
"completion_length": 1081.8125,
"epoch": 0.5029940119760479,
"grad_norm": 3.6088380813598633,
"kl": 1.1484375,
"learning_rate": 6.260503691448321e-07,
"loss": 0.046,
"reward": 0.8606771230697632,
"reward_std": 0.1145353289321065,
"rewards/accuracy_reward": 0.7552083432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1054687537252903,
"step": 42
},
{
"completion_length": 1213.71875,
"epoch": 0.5149700598802395,
"grad_norm": 6.979413032531738,
"kl": 0.8447265625,
"learning_rate": 6.071580188860954e-07,
"loss": 0.0339,
"reward": 0.5833333432674408,
"reward_std": 0.11912628076970577,
"rewards/accuracy_reward": 0.5052083507180214,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07812500186264515,
"step": 43
},
{
"completion_length": 1243.1823120117188,
"epoch": 0.5269461077844312,
"grad_norm": 4.323620319366455,
"kl": 0.6096343994140625,
"learning_rate": 5.881626660139791e-07,
"loss": 0.0245,
"reward": 0.5872395960614085,
"reward_std": 0.12293908558785915,
"rewards/accuracy_reward": 0.5052083432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08203125186264515,
"step": 44
},
{
"completion_length": 1625.1875,
"epoch": 0.5389221556886228,
"grad_norm": 6.710347652435303,
"kl": 0.7031707763671875,
"learning_rate": 5.690985414382668e-07,
"loss": 0.0281,
"reward": 0.5312500149011612,
"reward_std": 0.12612489983439445,
"rewards/accuracy_reward": 0.463541679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06770833535119891,
"step": 45
},
{
"completion_length": 1616.2864685058594,
"epoch": 0.5508982035928144,
"grad_norm": 5.034140110015869,
"kl": 0.63177490234375,
"learning_rate": 5.5e-07,
"loss": 0.0253,
"reward": 0.38932292722165585,
"reward_std": 0.09617834351956844,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0559895858168602,
"step": 46
},
{
"completion_length": 1878.4010620117188,
"epoch": 0.562874251497006,
"grad_norm": 6.528008460998535,
"kl": 0.196319580078125,
"learning_rate": 5.309014585617334e-07,
"loss": 0.0079,
"reward": 0.21614583837799728,
"reward_std": 0.08288709167391062,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.04947916860692203,
"step": 47
},
{
"completion_length": 1067.484375,
"epoch": 0.5748502994011976,
"grad_norm": 4.622894287109375,
"kl": 1.21875,
"learning_rate": 5.11837333986021e-07,
"loss": 0.0487,
"reward": 0.9218750298023224,
"reward_std": 0.09687121585011482,
"rewards/accuracy_reward": 0.8333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08854166977107525,
"step": 48
},
{
"completion_length": 1743.0573120117188,
"epoch": 0.5868263473053892,
"grad_norm": 5.265244007110596,
"kl": 0.7179183959960938,
"learning_rate": 4.928419811139045e-07,
"loss": 0.0287,
"reward": 0.4114583358168602,
"reward_std": 0.11536262556910515,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07812500186264515,
"step": 49
},
{
"completion_length": 1336.625,
"epoch": 0.5988023952095808,
"grad_norm": 5.1192240715026855,
"kl": 1.0614166259765625,
"learning_rate": 4.739496308551679e-07,
"loss": 0.0425,
"reward": 0.6015625204890966,
"reward_std": 0.11182517930865288,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.10156250558793545,
"step": 50
},
{
"completion_length": 1601.1458435058594,
"epoch": 0.6107784431137725,
"grad_norm": 7.076495170593262,
"kl": 1.0777587890625,
"learning_rate": 4.551943285019233e-07,
"loss": 0.0433,
"reward": 0.4830729365348816,
"reward_std": 0.09971196111291647,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06640625093132257,
"step": 51
},
{
"completion_length": 1865.421875,
"epoch": 0.6227544910179641,
"grad_norm": 1.5869081020355225,
"kl": 0.4460277557373047,
"learning_rate": 4.3660987237669377e-07,
"loss": 0.0178,
"reward": 0.3958333507180214,
"reward_std": 0.08472462091594934,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06250000186264515,
"step": 52
},
{
"completion_length": 1454.328125,
"epoch": 0.6347305389221557,
"grad_norm": 7.47260046005249,
"kl": 1.51171875,
"learning_rate": 4.182297529255524e-07,
"loss": 0.0607,
"reward": 0.4934896007180214,
"reward_std": 0.10642260871827602,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0768229179084301,
"step": 53
},
{
"completion_length": 1444.4427185058594,
"epoch": 0.6467065868263473,
"grad_norm": 4.976149559020996,
"kl": 1.0859375,
"learning_rate": 4.0008709236607405e-07,
"loss": 0.0434,
"reward": 0.4921875074505806,
"reward_std": 0.10759196057915688,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0755208358168602,
"step": 54
},
{
"completion_length": 1585.125,
"epoch": 0.6586826347305389,
"grad_norm": 2.437457323074341,
"kl": 1.212890625,
"learning_rate": 3.8221458499886115e-07,
"loss": 0.0486,
"reward": 0.6250000102445483,
"reward_std": 0.12861231248825788,
"rewards/accuracy_reward": 0.5520833432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07291666883975267,
"step": 55
},
{
"completion_length": 1848.125,
"epoch": 0.6706586826347305,
"grad_norm": 2.6519415378570557,
"kl": 0.339874267578125,
"learning_rate": 3.646444382902033e-07,
"loss": 0.0136,
"reward": 0.3164062574505806,
"reward_std": 0.10790392756462097,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06640625,
"step": 56
},
{
"completion_length": 1311.75,
"epoch": 0.6826347305389222,
"grad_norm": 4.369971752166748,
"kl": 1.4095611572265625,
"learning_rate": 3.474083148320469e-07,
"loss": 0.0565,
"reward": 0.6640625149011612,
"reward_std": 0.14868063479661942,
"rewards/accuracy_reward": 0.5520833432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.11197916883975267,
"step": 57
},
{
"completion_length": 1429.4219207763672,
"epoch": 0.6946107784431138,
"grad_norm": 6.277844429016113,
"kl": 0.970916748046875,
"learning_rate": 3.3053727528386457e-07,
"loss": 0.0389,
"reward": 0.5846354365348816,
"reward_std": 0.1101480070501566,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0846354179084301,
"step": 58
},
{
"completion_length": 911.0625,
"epoch": 0.7065868263473054,
"grad_norm": 5.268196105957031,
"kl": 1.7060546875,
"learning_rate": 3.140617223992458e-07,
"loss": 0.0683,
"reward": 0.856770858168602,
"reward_std": 0.12339456751942635,
"rewards/accuracy_reward": 0.7500000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1067708395421505,
"step": 59
},
{
"completion_length": 1568.5625,
"epoch": 0.718562874251497,
"grad_norm": 3.4410088062286377,
"kl": 0.6736679077148438,
"learning_rate": 2.980113462380821e-07,
"loss": 0.027,
"reward": 0.4453125223517418,
"reward_std": 0.13059347681701183,
"rewards/accuracy_reward": 0.3750000037252903,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07031250139698386,
"step": 60
},
{
"completion_length": 1302.6875,
"epoch": 0.7305389221556886,
"grad_norm": 5.513269901275635,
"kl": 1.0953369140625,
"learning_rate": 2.82415070663071e-07,
"loss": 0.0437,
"reward": 0.6171875223517418,
"reward_std": 0.11416286043822765,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.11718750186264515,
"step": 61
},
{
"completion_length": 1684.984375,
"epoch": 0.7425149700598802,
"grad_norm": 3.3617103099823,
"kl": 0.6853790283203125,
"learning_rate": 2.673010012169609e-07,
"loss": 0.0274,
"reward": 0.4010416744276881,
"reward_std": 0.08693839982151985,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.06770833488553762,
"step": 62
},
{
"completion_length": 1815.5,
"epoch": 0.7544910179640718,
"grad_norm": 3.155616283416748,
"kl": 0.395294189453125,
"learning_rate": 2.5269637447446345e-07,
"loss": 0.0158,
"reward": 0.3828125074505806,
"reward_std": 0.08006503619253635,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0494791679084301,
"step": 63
},
{
"completion_length": 1800.3333435058594,
"epoch": 0.7664670658682635,
"grad_norm": 1.9823267459869385,
"kl": 0.34970855712890625,
"learning_rate": 2.3862750896010425e-07,
"loss": 0.014,
"reward": 0.4036458432674408,
"reward_std": 0.08622701931744814,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.07031250279396772,
"step": 64
},
{
"completion_length": 1160.375,
"epoch": 0.7784431137724551,
"grad_norm": 5.189390659332275,
"kl": 1.3760986328125,
"learning_rate": 2.25119757720464e-07,
"loss": 0.0551,
"reward": 0.8632812798023224,
"reward_std": 0.11284597590565681,
"rewards/accuracy_reward": 0.7500000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.11328125279396772,
"step": 65
},
{
"completion_length": 1668.265625,
"epoch": 0.7904191616766467,
"grad_norm": 3.109102249145508,
"kl": 0.5458221435546875,
"learning_rate": 2.12197462636274e-07,
"loss": 0.0218,
"reward": 0.5104166828095913,
"reward_std": 0.12134900130331516,
"rewards/accuracy_reward": 0.4218750074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.08854166977107525,
"step": 66
},
{
"completion_length": 1529.2083435058594,
"epoch": 0.8023952095808383,
"grad_norm": 6.05462646484375,
"kl": 1.633209228515625,
"learning_rate": 1.998839105567023e-07,
"loss": 0.0652,
"reward": 0.4231770932674408,
"reward_std": 0.12092401646077633,
"rewards/accuracy_reward": 0.3333333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.0898437537252903,
"step": 67
},
{
"completion_length": 2048.0,
"epoch": 0.8143712574850299,
"grad_norm": 0.8508380651473999,
"kl": 0.00394439697265625,
"learning_rate": 1.882012913348768e-07,
"loss": 0.0002,
"reward": 0.3033854253590107,
"reward_std": 0.12076857313513756,
"rewards/accuracy_reward": 0.1666666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.13671875558793545,
"step": 68
},
{
"completion_length": 2048.0,
"epoch": 0.8263473053892215,
"grad_norm": 0.6287813782691956,
"kl": 0.00579071044921875,
"learning_rate": 1.7717065784027108e-07,
"loss": 0.0002,
"reward": 0.5859375260770321,
"reward_std": 0.11029668338596821,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1692708395421505,
"step": 69
},
{
"completion_length": 2048.0,
"epoch": 0.8383233532934131,
"grad_norm": 0.8328803777694702,
"kl": 0.00489044189453125,
"learning_rate": 1.6681188802000992e-07,
"loss": 0.0002,
"reward": 0.6432291939854622,
"reward_std": 0.13170602917671204,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1432291716337204,
"step": 70
},
{
"completion_length": 2032.9375,
"epoch": 0.8502994011976048,
"grad_norm": 0.7211276292800903,
"kl": 0.00424957275390625,
"learning_rate": 1.5714364907746534e-07,
"loss": 0.0002,
"reward": 0.44661460630595684,
"reward_std": 0.11995729431509972,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.11328125186264515,
"step": 71
},
{
"completion_length": 2048.0,
"epoch": 0.8622754491017964,
"grad_norm": 0.795598566532135,
"kl": 0.00417327880859375,
"learning_rate": 1.4818336383269423e-07,
"loss": 0.0002,
"reward": 0.805989608168602,
"reward_std": 0.12349414266645908,
"rewards/accuracy_reward": 0.666666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1393229216337204,
"step": 72
},
{
"completion_length": 2041.3385620117188,
"epoch": 0.874251497005988,
"grad_norm": 0.6561583876609802,
"kl": 0.00516510009765625,
"learning_rate": 1.3994717932533889e-07,
"loss": 0.0002,
"reward": 0.7122395932674408,
"reward_std": 0.12930710427463055,
"rewards/accuracy_reward": 0.5833333507180214,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.12890625558793545,
"step": 73
},
{
"completion_length": 2048.0,
"epoch": 0.8862275449101796,
"grad_norm": 0.6364233493804932,
"kl": 0.00426483154296875,
"learning_rate": 1.324499377165708e-07,
"loss": 0.0002,
"reward": 0.5507812574505806,
"reward_std": 0.12330615520477295,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1341145895421505,
"step": 74
},
{
"completion_length": 2048.0,
"epoch": 0.8982035928143712,
"grad_norm": 0.7673735618591309,
"kl": 0.00435638427734375,
"learning_rate": 1.257051495425121e-07,
"loss": 0.0002,
"reward": 0.5768229365348816,
"reward_std": 0.13856617361307144,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1601562574505806,
"step": 75
},
{
"completion_length": 2048.0,
"epoch": 0.9101796407185628,
"grad_norm": 0.773383378982544,
"kl": 0.00489044189453125,
"learning_rate": 1.197249693673371e-07,
"loss": 0.0002,
"reward": 0.8151041865348816,
"reward_std": 0.1366959922015667,
"rewards/accuracy_reward": 0.6666666865348816,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1484375074505806,
"step": 76
},
{
"completion_length": 2046.9375,
"epoch": 0.9221556886227545,
"grad_norm": 0.6188393235206604,
"kl": 0.0057525634765625,
"learning_rate": 1.145201738799255e-07,
"loss": 0.0002,
"reward": 0.6341145858168602,
"reward_std": 0.13442306593060493,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1341145858168602,
"step": 77
},
{
"completion_length": 2048.0,
"epoch": 0.9341317365269461,
"grad_norm": 0.771629273891449,
"kl": 0.004974365234375,
"learning_rate": 1.1010014247354125e-07,
"loss": 0.0002,
"reward": 0.5664062555879354,
"reward_std": 0.1288151517510414,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.14973958767950535,
"step": 78
},
{
"completion_length": 2037.3385620117188,
"epoch": 0.9461077844311377,
"grad_norm": 1.8673304319381714,
"kl": 0.01363372802734375,
"learning_rate": 1.064728403435312e-07,
"loss": 0.0005,
"reward": 0.6250000223517418,
"reward_std": 0.12153633683919907,
"rewards/accuracy_reward": 0.5000000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1250000037252903,
"step": 79
},
{
"completion_length": 2045.796875,
"epoch": 0.9580838323353293,
"grad_norm": 0.5428643226623535,
"kl": 0.003711700439453125,
"learning_rate": 1.0364480413350543e-07,
"loss": 0.0001,
"reward": 0.8554687947034836,
"reward_std": 0.09259135648608208,
"rewards/accuracy_reward": 0.7500000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1054687537252903,
"step": 80
},
{
"completion_length": 2048.0,
"epoch": 0.9700598802395209,
"grad_norm": 0.6405127048492432,
"kl": 0.00507354736328125,
"learning_rate": 1.0162113015586308e-07,
"loss": 0.0002,
"reward": 0.6484375223517418,
"reward_std": 0.12896526977419853,
"rewards/accuracy_reward": 0.5052083432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1432291716337204,
"step": 81
},
{
"completion_length": 2048.0,
"epoch": 0.9820359281437125,
"grad_norm": 0.6773039698600769,
"kl": 0.00487518310546875,
"learning_rate": 1.0040546520789337e-07,
"loss": 0.0002,
"reward": 0.7031250149011612,
"reward_std": 0.11840885132551193,
"rewards/accuracy_reward": 0.5833333507180214,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.1197916716337204,
"step": 82
},
{
"completion_length": 2048.0,
"epoch": 0.9940119760479041,
"grad_norm": 0.5921207070350647,
"kl": 0.00601959228515625,
"learning_rate": 1e-07,
"loss": 0.0002,
"reward": 0.7174479365348816,
"reward_std": 0.12807989306747913,
"rewards/accuracy_reward": 0.588541679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.12890625558793545,
"step": 83
},
{
"epoch": 0.9940119760479041,
"step": 83,
"total_flos": 0.0,
"train_loss": 0.02214584331980233,
"train_runtime": 5149.8835,
"train_samples_per_second": 0.194,
"train_steps_per_second": 0.016
}
],
"logging_steps": 1,
"max_steps": 83,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}