Qwen2.5-1.5B-Open-R1-Code-GRPO / trainer_state.json
Blancy's picture
Model save
6947641 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0615711252653928,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 746.1138916015625,
"epoch": 0.0021231422505307855,
"grad_norm": 0.21636255085468292,
"kl": 0.0,
"learning_rate": 3.3333333333333335e-07,
"loss": 0.0,
"reward": 0.11085444036871195,
"reward_std": 0.15387122705578804,
"rewards/code_reward": 0.11063122469931841,
"rewards/format_reward": 0.0022321429569274187,
"step": 1
},
{
"completion_length": 741.6986846923828,
"epoch": 0.004246284501061571,
"grad_norm": 0.21392129361629486,
"kl": 0.0,
"learning_rate": 6.666666666666667e-07,
"loss": 0.0,
"reward": 0.10614843107759953,
"reward_std": 0.15658440068364143,
"rewards/code_reward": 0.10592522472143173,
"rewards/format_reward": 0.0022321429569274187,
"step": 2
},
{
"completion_length": 756.122802734375,
"epoch": 0.006369426751592357,
"grad_norm": 0.21344353258609772,
"kl": 6.717443466186523e-05,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0,
"reward": 0.12699278071522713,
"reward_std": 0.15882046334445477,
"rewards/code_reward": 0.12676957063376904,
"rewards/format_reward": 0.0022321429569274187,
"step": 3
},
{
"completion_length": 760.8817291259766,
"epoch": 0.008492569002123142,
"grad_norm": 0.2038053572177887,
"kl": 7.62939453125e-05,
"learning_rate": 1.3333333333333334e-06,
"loss": 0.0,
"reward": 0.06782207638025284,
"reward_std": 0.1164214089512825,
"rewards/code_reward": 0.06782207870855927,
"rewards/format_reward": 0.0,
"step": 4
},
{
"completion_length": 764.8817291259766,
"epoch": 0.010615711252653927,
"grad_norm": 0.20332112908363342,
"kl": 7.510185241699219e-05,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": 0.07368321809917688,
"reward_std": 0.11412223428487778,
"rewards/code_reward": 0.07368322089314461,
"rewards/format_reward": 0.0,
"step": 5
},
{
"completion_length": 733.0937805175781,
"epoch": 0.012738853503184714,
"grad_norm": 0.2384093701839447,
"kl": 8.344650268554688e-05,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0,
"reward": 0.11057165823876858,
"reward_std": 0.13630107790231705,
"rewards/code_reward": 0.11057165637612343,
"rewards/format_reward": 0.0,
"step": 6
},
{
"completion_length": 747.8013763427734,
"epoch": 0.014861995753715499,
"grad_norm": 0.21190612018108368,
"kl": 9.250640869140625e-05,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.0,
"reward": 0.13999284896999598,
"reward_std": 0.14958541933447123,
"rewards/code_reward": 0.13999284896999598,
"rewards/format_reward": 0.0,
"step": 7
},
{
"completion_length": 747.6540679931641,
"epoch": 0.016985138004246284,
"grad_norm": 0.1906791776418686,
"kl": 0.00013947486877441406,
"learning_rate": 2.666666666666667e-06,
"loss": 0.0,
"reward": 0.0754421940073371,
"reward_std": 0.10426154918968678,
"rewards/code_reward": 0.07521897740662098,
"rewards/format_reward": 0.0022321429569274187,
"step": 8
},
{
"completion_length": 710.5401916503906,
"epoch": 0.01910828025477707,
"grad_norm": 0.20240359008312225,
"kl": 0.0002300739288330078,
"learning_rate": 3e-06,
"loss": 0.0,
"reward": 0.12234624661505222,
"reward_std": 0.10050993971526623,
"rewards/code_reward": 0.12234624475240707,
"rewards/format_reward": 0.0,
"step": 9
},
{
"completion_length": 760.1808319091797,
"epoch": 0.021231422505307854,
"grad_norm": 0.34670934081077576,
"kl": 0.0004100799560546875,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0,
"reward": 0.055801121750846505,
"reward_std": 0.06395915220491588,
"rewards/code_reward": 0.05580112128518522,
"rewards/format_reward": 0.0,
"step": 10
},
{
"completion_length": 750.3906555175781,
"epoch": 0.02335456475583864,
"grad_norm": 0.21112516522407532,
"kl": 0.0007238388061523438,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.0,
"reward": 0.06025231350213289,
"reward_std": 0.09869139082729816,
"rewards/code_reward": 0.06025231350213289,
"rewards/format_reward": 0.0,
"step": 11
},
{
"completion_length": 700.1384124755859,
"epoch": 0.025477707006369428,
"grad_norm": 0.22157742083072662,
"kl": 0.00104522705078125,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0,
"reward": 0.1535286195576191,
"reward_std": 0.16596542671322823,
"rewards/code_reward": 0.1530821956694126,
"rewards/format_reward": 0.004464285913854837,
"step": 12
},
{
"completion_length": 702.9152069091797,
"epoch": 0.027600849256900213,
"grad_norm": 0.24524690210819244,
"kl": 0.0017261505126953125,
"learning_rate": 4.333333333333334e-06,
"loss": 0.0,
"reward": 0.19020407181233168,
"reward_std": 0.16583579406142235,
"rewards/code_reward": 0.1902040634304285,
"rewards/format_reward": 0.0,
"step": 13
},
{
"completion_length": 714.9486999511719,
"epoch": 0.029723991507430998,
"grad_norm": 0.18179796636104584,
"kl": 0.00283050537109375,
"learning_rate": 4.666666666666667e-06,
"loss": 0.0,
"reward": 0.06596253952011466,
"reward_std": 0.08220406854525208,
"rewards/code_reward": 0.06596253253519535,
"rewards/format_reward": 0.0,
"step": 14
},
{
"completion_length": 683.7835083007812,
"epoch": 0.03184713375796178,
"grad_norm": 0.18836897611618042,
"kl": 0.004302978515625,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.10805188585072756,
"reward_std": 0.0908731259405613,
"rewards/code_reward": 0.10782866925001144,
"rewards/format_reward": 0.0022321429569274187,
"step": 15
},
{
"completion_length": 641.216552734375,
"epoch": 0.03397027600849257,
"grad_norm": 0.22146466374397278,
"kl": 0.00562286376953125,
"learning_rate": 4.999952797253148e-06,
"loss": 0.0001,
"reward": 0.20005132257938385,
"reward_std": 0.16782562248408794,
"rewards/code_reward": 0.19938167929649353,
"rewards/format_reward": 0.006696428870782256,
"step": 16
},
{
"completion_length": 644.9598541259766,
"epoch": 0.036093418259023353,
"grad_norm": 0.20937775075435638,
"kl": 0.00730133056640625,
"learning_rate": 4.9998111909931225e-06,
"loss": 0.0001,
"reward": 0.1508529670536518,
"reward_std": 0.1663584616035223,
"rewards/code_reward": 0.15040653757750988,
"rewards/format_reward": 0.004464285913854837,
"step": 17
},
{
"completion_length": 608.2477874755859,
"epoch": 0.03821656050955414,
"grad_norm": 0.2286761999130249,
"kl": 0.0104217529296875,
"learning_rate": 4.999575187161439e-06,
"loss": 0.0001,
"reward": 0.14321784488856792,
"reward_std": 0.1725912243127823,
"rewards/code_reward": 0.14321784675121307,
"rewards/format_reward": 0.0,
"step": 18
},
{
"completion_length": 650.0647583007812,
"epoch": 0.040339702760084924,
"grad_norm": 0.21597912907600403,
"kl": 0.0113677978515625,
"learning_rate": 4.9992447956603455e-06,
"loss": 0.0001,
"reward": 0.12854056991636753,
"reward_std": 0.15781505592167377,
"rewards/code_reward": 0.12854057550430298,
"rewards/format_reward": 0.0,
"step": 19
},
{
"completion_length": 624.8192138671875,
"epoch": 0.04246284501061571,
"grad_norm": 42.34319305419922,
"kl": 7.167266845703125,
"learning_rate": 4.998820030352409e-06,
"loss": 0.0716,
"reward": 0.12942847050726414,
"reward_std": 0.11835422366857529,
"rewards/code_reward": 0.1292052511125803,
"rewards/format_reward": 0.0022321429569274187,
"step": 20
},
{
"completion_length": 632.3504638671875,
"epoch": 0.044585987261146494,
"grad_norm": 0.23411324620246887,
"kl": 0.0178985595703125,
"learning_rate": 4.998300909059929e-06,
"loss": 0.0002,
"reward": 0.12214689701795578,
"reward_std": 0.1782115437090397,
"rewards/code_reward": 0.12214690260589123,
"rewards/format_reward": 0.0,
"step": 21
},
{
"completion_length": 610.6652069091797,
"epoch": 0.04670912951167728,
"grad_norm": 0.24601581692695618,
"kl": 0.020477294921875,
"learning_rate": 4.997687453564198e-06,
"loss": 0.0002,
"reward": 0.18596480786800385,
"reward_std": 0.184912770986557,
"rewards/code_reward": 0.18529516831040382,
"rewards/format_reward": 0.006696428870782256,
"step": 22
},
{
"completion_length": 612.2344055175781,
"epoch": 0.04883227176220807,
"grad_norm": 0.25667688250541687,
"kl": 0.02349853515625,
"learning_rate": 4.9969796896045775e-06,
"loss": 0.0002,
"reward": 0.1700380276888609,
"reward_std": 0.15014583989977837,
"rewards/code_reward": 0.1689219493418932,
"rewards/format_reward": 0.011160714784637094,
"step": 23
},
{
"completion_length": 582.0379791259766,
"epoch": 0.050955414012738856,
"grad_norm": 1.4910736083984375,
"kl": 0.054718017578125,
"learning_rate": 4.996177646877426e-06,
"loss": 0.0005,
"reward": 0.15169917233288288,
"reward_std": 0.16788329929113388,
"rewards/code_reward": 0.151029534637928,
"rewards/format_reward": 0.006696428870782256,
"step": 24
},
{
"completion_length": 605.2277069091797,
"epoch": 0.05307855626326964,
"grad_norm": 0.23100271821022034,
"kl": 0.030059814453125,
"learning_rate": 4.995281359034851e-06,
"loss": 0.0003,
"reward": 0.10647542215883732,
"reward_std": 0.13741069473326206,
"rewards/code_reward": 0.105805778875947,
"rewards/format_reward": 0.006696428870782256,
"step": 25
},
{
"completion_length": 595.9777069091797,
"epoch": 0.055201698513800426,
"grad_norm": 0.22825182974338531,
"kl": 0.03179931640625,
"learning_rate": 4.994290863683296e-06,
"loss": 0.0003,
"reward": 0.11801626486703753,
"reward_std": 0.12430650275200605,
"rewards/code_reward": 0.11779305664822459,
"rewards/format_reward": 0.0022321429569274187,
"step": 26
},
{
"completion_length": 594.2031402587891,
"epoch": 0.05732484076433121,
"grad_norm": 0.2523776590824127,
"kl": 0.0357666015625,
"learning_rate": 4.99320620238196e-06,
"loss": 0.0004,
"reward": 0.1666601337492466,
"reward_std": 0.20200489647686481,
"rewards/code_reward": 0.1655440628528595,
"rewards/format_reward": 0.011160714784637094,
"step": 27
},
{
"completion_length": 606.7299346923828,
"epoch": 0.059447983014861996,
"grad_norm": 0.24759377539157867,
"kl": 0.03466796875,
"learning_rate": 4.99202742064106e-06,
"loss": 0.0003,
"reward": 0.12888818327337503,
"reward_std": 0.14617390558123589,
"rewards/code_reward": 0.12732568103820086,
"rewards/format_reward": 0.01562500116415322,
"step": 28
},
{
"completion_length": 582.7879791259766,
"epoch": 0.06157112526539278,
"grad_norm": 0.22141791880130768,
"kl": 0.0360107421875,
"learning_rate": 4.990754567919917e-06,
"loss": 0.0004,
"reward": 0.1982099711894989,
"reward_std": 0.15798946656286716,
"rewards/code_reward": 0.1970939077436924,
"rewards/format_reward": 0.011160714784637094,
"step": 29
},
{
"completion_length": 582.3035888671875,
"epoch": 0.06369426751592357,
"grad_norm": 0.4308633804321289,
"kl": 0.04461669921875,
"learning_rate": 4.989387697624881e-06,
"loss": 0.0004,
"reward": 0.15222312323749065,
"reward_std": 0.13287453912198544,
"rewards/code_reward": 0.14999098517000675,
"rewards/format_reward": 0.022321429336443543,
"step": 30
},
{
"completion_length": 560.9888763427734,
"epoch": 0.06581740976645435,
"grad_norm": 0.44854736328125,
"kl": 0.05029296875,
"learning_rate": 4.987926867107095e-06,
"loss": 0.0005,
"reward": 0.17351704463362694,
"reward_std": 0.1594883631914854,
"rewards/code_reward": 0.17039205506443977,
"rewards/format_reward": 0.031250000931322575,
"step": 31
},
{
"completion_length": 520.7835006713867,
"epoch": 0.06794055201698514,
"grad_norm": 0.3021621108055115,
"kl": 0.0545654296875,
"learning_rate": 4.986372137660078e-06,
"loss": 0.0005,
"reward": 0.19399502873420715,
"reward_std": 0.18005169555544853,
"rewards/code_reward": 0.1872985940426588,
"rewards/format_reward": 0.0669642873108387,
"step": 32
},
{
"completion_length": 551.3750228881836,
"epoch": 0.07006369426751592,
"grad_norm": 0.38339507579803467,
"kl": 0.0712890625,
"learning_rate": 4.984723574517165e-06,
"loss": 0.0007,
"reward": 0.15828289464116096,
"reward_std": 0.18912290409207344,
"rewards/code_reward": 0.1453364696353674,
"rewards/format_reward": 0.12946429289877415,
"step": 33
},
{
"completion_length": 490.0558166503906,
"epoch": 0.07218683651804671,
"grad_norm": 0.5539775490760803,
"kl": 0.0887451171875,
"learning_rate": 4.9829812468487655e-06,
"loss": 0.0009,
"reward": 0.18788279965519905,
"reward_std": 0.19856177270412445,
"rewards/code_reward": 0.16578458063304424,
"rewards/format_reward": 0.2209821566939354,
"step": 34
},
{
"completion_length": 441.99778747558594,
"epoch": 0.07430997876857749,
"grad_norm": 0.35391107201576233,
"kl": 0.12060546875,
"learning_rate": 4.981145227759457e-06,
"loss": 0.0012,
"reward": 0.20366163551807404,
"reward_std": 0.1490145679563284,
"rewards/code_reward": 0.16392949409782887,
"rewards/format_reward": 0.3973214477300644,
"step": 35
},
{
"completion_length": 454.8236846923828,
"epoch": 0.07643312101910828,
"grad_norm": 0.34607765078544617,
"kl": 0.18994140625,
"learning_rate": 4.979215594284924e-06,
"loss": 0.0019,
"reward": 0.16812831349670887,
"reward_std": 0.16833286173641682,
"rewards/code_reward": 0.10094079561531544,
"rewards/format_reward": 0.6718750298023224,
"step": 36
},
{
"completion_length": 413.80358123779297,
"epoch": 0.07855626326963906,
"grad_norm": 0.30116426944732666,
"kl": 0.1982421875,
"learning_rate": 4.977192427388722e-06,
"loss": 0.002,
"reward": 0.24648623168468475,
"reward_std": 0.1688873954117298,
"rewards/code_reward": 0.16099514812231064,
"rewards/format_reward": 0.854910746216774,
"step": 37
},
{
"completion_length": 412.4464416503906,
"epoch": 0.08067940552016985,
"grad_norm": 0.3934517204761505,
"kl": 0.248046875,
"learning_rate": 4.9750758119588824e-06,
"loss": 0.0025,
"reward": 0.24308543279767036,
"reward_std": 0.14966130815446377,
"rewards/code_reward": 0.1495586484670639,
"rewards/format_reward": 0.9352678954601288,
"step": 38
},
{
"completion_length": 424.8928756713867,
"epoch": 0.08280254777070063,
"grad_norm": 0.3077991008758545,
"kl": 0.256103515625,
"learning_rate": 4.972865836804349e-06,
"loss": 0.0026,
"reward": 0.2948240712285042,
"reward_std": 0.17200535349547863,
"rewards/code_reward": 0.19995798915624619,
"rewards/format_reward": 0.9486607611179352,
"step": 39
},
{
"completion_length": 445.8326110839844,
"epoch": 0.08492569002123142,
"grad_norm": 0.3074510097503662,
"kl": 0.259765625,
"learning_rate": 4.970562594651254e-06,
"loss": 0.0026,
"reward": 0.2571263238787651,
"reward_std": 0.1593556720763445,
"rewards/code_reward": 0.16226024366915226,
"rewards/format_reward": 0.9486607611179352,
"step": 40
},
{
"completion_length": 474.68082427978516,
"epoch": 0.0870488322717622,
"grad_norm": 0.28787654638290405,
"kl": 0.2421875,
"learning_rate": 4.968166182139026e-06,
"loss": 0.0024,
"reward": 0.27686072885990143,
"reward_std": 0.16917606256902218,
"rewards/code_reward": 0.18378033302724361,
"rewards/format_reward": 0.9308036118745804,
"step": 41
},
{
"completion_length": 520.4397583007812,
"epoch": 0.08917197452229299,
"grad_norm": 0.2756073772907257,
"kl": 0.22216796875,
"learning_rate": 4.9656766998163306e-06,
"loss": 0.0023,
"reward": 0.29509423673152924,
"reward_std": 0.13862515799701214,
"rewards/code_reward": 0.20402280241250992,
"rewards/format_reward": 0.910714328289032,
"step": 42
},
{
"completion_length": 520.6406555175781,
"epoch": 0.09129511677282377,
"grad_norm": 0.2641509473323822,
"kl": 0.1728515625,
"learning_rate": 4.963094252136865e-06,
"loss": 0.0017,
"reward": 0.3755844831466675,
"reward_std": 0.19650832191109657,
"rewards/code_reward": 0.280941616743803,
"rewards/format_reward": 0.9464286118745804,
"step": 43
},
{
"completion_length": 520.0803909301758,
"epoch": 0.09341825902335456,
"grad_norm": 0.28843629360198975,
"kl": 0.206298828125,
"learning_rate": 4.960418947454958e-06,
"loss": 0.0021,
"reward": 0.21916456520557404,
"reward_std": 0.12000982835888863,
"rewards/code_reward": 0.12407528422772884,
"rewards/format_reward": 0.95089291036129,
"step": 44
},
{
"completion_length": 516.7455749511719,
"epoch": 0.09554140127388536,
"grad_norm": 0.9614177942276001,
"kl": 0.203125,
"learning_rate": 4.957650898021038e-06,
"loss": 0.002,
"reward": 0.26794980466365814,
"reward_std": 0.14450966753065586,
"rewards/code_reward": 0.17397657968103886,
"rewards/format_reward": 0.9397321939468384,
"step": 45
},
{
"completion_length": 517.7343978881836,
"epoch": 0.09766454352441614,
"grad_norm": 0.2904169261455536,
"kl": 0.17041015625,
"learning_rate": 4.954790219976915e-06,
"loss": 0.0017,
"reward": 0.3067335784435272,
"reward_std": 0.15805694833397865,
"rewards/code_reward": 0.21186750568449497,
"rewards/format_reward": 0.948660746216774,
"step": 46
},
{
"completion_length": 533.3594055175781,
"epoch": 0.09978768577494693,
"grad_norm": 0.25753694772720337,
"kl": 0.126953125,
"learning_rate": 4.95183703335091e-06,
"loss": 0.0013,
"reward": 0.22189904749393463,
"reward_std": 0.13265508972108364,
"rewards/code_reward": 0.12390796467661858,
"rewards/format_reward": 0.9799107611179352,
"step": 47
},
{
"completion_length": 548.950927734375,
"epoch": 0.10191082802547771,
"grad_norm": 0.26259344816207886,
"kl": 0.1424560546875,
"learning_rate": 4.948791462052819e-06,
"loss": 0.0014,
"reward": 0.22812815010547638,
"reward_std": 0.1622354220598936,
"rewards/code_reward": 0.12991385161876678,
"rewards/format_reward": 0.9821428954601288,
"step": 48
},
{
"completion_length": 571.6741485595703,
"epoch": 0.1040339702760085,
"grad_norm": 0.4155128300189972,
"kl": 0.20263671875,
"learning_rate": 4.945653633868716e-06,
"loss": 0.0021,
"reward": 0.24147583171725273,
"reward_std": 0.1386658363044262,
"rewards/code_reward": 0.1450472492724657,
"rewards/format_reward": 0.964285746216774,
"step": 49
},
{
"completion_length": 534.6093978881836,
"epoch": 0.10615711252653928,
"grad_norm": 0.24680602550506592,
"kl": 0.159912109375,
"learning_rate": 4.942423680455584e-06,
"loss": 0.0016,
"reward": 0.2133147530257702,
"reward_std": 0.14480553567409515,
"rewards/code_reward": 0.11643974296748638,
"rewards/format_reward": 0.9687500298023224,
"step": 50
},
{
"completion_length": 521.4888610839844,
"epoch": 0.10828025477707007,
"grad_norm": 0.27140846848487854,
"kl": 0.172119140625,
"learning_rate": 4.939101737335802e-06,
"loss": 0.0017,
"reward": 0.3708176761865616,
"reward_std": 0.1698193922638893,
"rewards/code_reward": 0.2730497941374779,
"rewards/format_reward": 0.9776786118745804,
"step": 51
},
{
"completion_length": 550.3236846923828,
"epoch": 0.11040339702760085,
"grad_norm": 0.24256259202957153,
"kl": 0.145751953125,
"learning_rate": 4.935687943891447e-06,
"loss": 0.0015,
"reward": 0.30257678776979446,
"reward_std": 0.1430999655276537,
"rewards/code_reward": 0.2057017907500267,
"rewards/format_reward": 0.9687500447034836,
"step": 52
},
{
"completion_length": 551.3772583007812,
"epoch": 0.11252653927813164,
"grad_norm": 0.2562994062900543,
"kl": 0.16259765625,
"learning_rate": 4.932182443358458e-06,
"loss": 0.0016,
"reward": 0.314239501953125,
"reward_std": 0.21334025636315346,
"rewards/code_reward": 0.21624841168522835,
"rewards/format_reward": 0.979910746216774,
"step": 53
},
{
"completion_length": 553.4955596923828,
"epoch": 0.11464968152866242,
"grad_norm": 0.23835241794586182,
"kl": 0.160888671875,
"learning_rate": 4.928585382820616e-06,
"loss": 0.0016,
"reward": 0.25176869705319405,
"reward_std": 0.11105065606534481,
"rewards/code_reward": 0.1535544078797102,
"rewards/format_reward": 0.9821428954601288,
"step": 54
},
{
"completion_length": 552.2366333007812,
"epoch": 0.11677282377919321,
"grad_norm": 0.2630121409893036,
"kl": 0.1552734375,
"learning_rate": 4.924896913203376e-06,
"loss": 0.0016,
"reward": 0.24022378027439117,
"reward_std": 0.15625984594225883,
"rewards/code_reward": 0.14133985061198473,
"rewards/format_reward": 0.988839328289032,
"step": 55
},
{
"completion_length": 574.1295013427734,
"epoch": 0.11889596602972399,
"grad_norm": 0.3262800872325897,
"kl": 0.1572265625,
"learning_rate": 4.921117189267535e-06,
"loss": 0.0016,
"reward": 0.32679247856140137,
"reward_std": 0.19292927533388138,
"rewards/code_reward": 0.22991745918989182,
"rewards/format_reward": 0.9687500298023224,
"step": 56
},
{
"completion_length": 541.9576110839844,
"epoch": 0.12101910828025478,
"grad_norm": 0.2467201203107834,
"kl": 0.17578125,
"learning_rate": 4.917246369602742e-06,
"loss": 0.0018,
"reward": 0.25976729951798916,
"reward_std": 0.1260015396401286,
"rewards/code_reward": 0.16110657062381506,
"rewards/format_reward": 0.986607164144516,
"step": 57
},
{
"completion_length": 553.419677734375,
"epoch": 0.12314225053078556,
"grad_norm": 0.2763681709766388,
"kl": 0.15478515625,
"learning_rate": 4.9132846166208355e-06,
"loss": 0.0016,
"reward": 0.2834607996046543,
"reward_std": 0.1603868044912815,
"rewards/code_reward": 0.1852465160191059,
"rewards/format_reward": 0.9821428954601288,
"step": 58
},
{
"completion_length": 542.2343826293945,
"epoch": 0.12526539278131635,
"grad_norm": 1.2009683847427368,
"kl": 0.203125,
"learning_rate": 4.9092320965490365e-06,
"loss": 0.002,
"reward": 0.36397186666727066,
"reward_std": 0.20367462560534477,
"rewards/code_reward": 0.26531114615499973,
"rewards/format_reward": 0.9866071790456772,
"step": 59
},
{
"completion_length": 502.7076110839844,
"epoch": 0.12738853503184713,
"grad_norm": 0.291824609041214,
"kl": 0.1533203125,
"learning_rate": 4.905088979422971e-06,
"loss": 0.0015,
"reward": 0.33501066267490387,
"reward_std": 0.17072956077754498,
"rewards/code_reward": 0.23701957240700722,
"rewards/format_reward": 0.979910746216774,
"step": 60
},
{
"completion_length": 512.5134201049805,
"epoch": 0.12951167728237792,
"grad_norm": 0.2763117849826813,
"kl": 0.1837158203125,
"learning_rate": 4.900855439079536e-06,
"loss": 0.0019,
"reward": 0.3404688164591789,
"reward_std": 0.19662801921367645,
"rewards/code_reward": 0.2453795075416565,
"rewards/format_reward": 0.9508928954601288,
"step": 61
},
{
"completion_length": 526.8995819091797,
"epoch": 0.1316348195329087,
"grad_norm": 0.2876502275466919,
"kl": 0.19580078125,
"learning_rate": 4.8965316531496055e-06,
"loss": 0.002,
"reward": 0.2866082601249218,
"reward_std": 0.16614584252238274,
"rewards/code_reward": 0.19129573553800583,
"rewards/format_reward": 0.9531250447034836,
"step": 62
},
{
"completion_length": 593.2924499511719,
"epoch": 0.1337579617834395,
"grad_norm": 2.4047532081604004,
"kl": 0.41357421875,
"learning_rate": 4.892117803050578e-06,
"loss": 0.0041,
"reward": 0.2631051279604435,
"reward_std": 0.2128530964255333,
"rewards/code_reward": 0.17359617352485657,
"rewards/format_reward": 0.895089328289032,
"step": 63
},
{
"completion_length": 563.3437805175781,
"epoch": 0.13588110403397027,
"grad_norm": 0.2846791446208954,
"kl": 0.197509765625,
"learning_rate": 4.887614073978761e-06,
"loss": 0.002,
"reward": 0.2669316381216049,
"reward_std": 0.14747418276965618,
"rewards/code_reward": 0.17630662396550179,
"rewards/format_reward": 0.9062500447034836,
"step": 64
},
{
"completion_length": 535.6183319091797,
"epoch": 0.13800424628450106,
"grad_norm": 0.2759235203266144,
"kl": 0.186767578125,
"learning_rate": 4.883020654901609e-06,
"loss": 0.0019,
"reward": 0.28016526997089386,
"reward_std": 0.17947101965546608,
"rewards/code_reward": 0.18730811774730682,
"rewards/format_reward": 0.928571492433548,
"step": 65
},
{
"completion_length": 616.9732360839844,
"epoch": 0.14012738853503184,
"grad_norm": 0.26271936297416687,
"kl": 0.23974609375,
"learning_rate": 4.878337738549785e-06,
"loss": 0.0024,
"reward": 0.23576084896922112,
"reward_std": 0.18645637948065996,
"rewards/code_reward": 0.1466983389109373,
"rewards/format_reward": 0.8906250447034836,
"step": 66
},
{
"completion_length": 591.013427734375,
"epoch": 0.14225053078556263,
"grad_norm": 0.2741730511188507,
"kl": 0.220947265625,
"learning_rate": 4.873565521409082e-06,
"loss": 0.0023,
"reward": 0.2887257859110832,
"reward_std": 0.15980570390820503,
"rewards/code_reward": 0.20055612176656723,
"rewards/format_reward": 0.8816964775323868,
"step": 67
},
{
"completion_length": 567.3995819091797,
"epoch": 0.14437367303609341,
"grad_norm": 0.30026066303253174,
"kl": 0.196533203125,
"learning_rate": 4.868704203712173e-06,
"loss": 0.002,
"reward": 0.2695513255894184,
"reward_std": 0.13891723938286304,
"rewards/code_reward": 0.18361380137503147,
"rewards/format_reward": 0.8593750447034836,
"step": 68
},
{
"completion_length": 557.5424346923828,
"epoch": 0.1464968152866242,
"grad_norm": 0.26508811116218567,
"kl": 0.2275390625,
"learning_rate": 4.86375398943021e-06,
"loss": 0.0023,
"reward": 0.2681450620293617,
"reward_std": 0.15566366165876389,
"rewards/code_reward": 0.17618075758218765,
"rewards/format_reward": 0.91964291036129,
"step": 69
},
{
"completion_length": 567.8192291259766,
"epoch": 0.14861995753715498,
"grad_norm": 0.2842702567577362,
"kl": 0.206787109375,
"learning_rate": 4.858715086264274e-06,
"loss": 0.0021,
"reward": 0.3179836943745613,
"reward_std": 0.17460669204592705,
"rewards/code_reward": 0.2246801033616066,
"rewards/format_reward": 0.9330357611179352,
"step": 70
},
{
"completion_length": 537.6094055175781,
"epoch": 0.15074309978768577,
"grad_norm": 0.2676061689853668,
"kl": 0.208740234375,
"learning_rate": 4.853587705636646e-06,
"loss": 0.0021,
"reward": 0.29776863381266594,
"reward_std": 0.16130083054304123,
"rewards/code_reward": 0.2037954218685627,
"rewards/format_reward": 0.9397321790456772,
"step": 71
},
{
"completion_length": 575.2143096923828,
"epoch": 0.15286624203821655,
"grad_norm": 0.2367551475763321,
"kl": 0.189208984375,
"learning_rate": 4.84837206268195e-06,
"loss": 0.0019,
"reward": 0.23160668835043907,
"reward_std": 0.13609608635306358,
"rewards/code_reward": 0.13785668183118105,
"rewards/format_reward": 0.9375000447034836,
"step": 72
},
{
"completion_length": 615.6518096923828,
"epoch": 0.15498938428874734,
"grad_norm": 0.2558582127094269,
"kl": 0.199462890625,
"learning_rate": 4.8430683762381195e-06,
"loss": 0.002,
"reward": 0.3226686045527458,
"reward_std": 0.18408508598804474,
"rewards/code_reward": 0.22847215831279755,
"rewards/format_reward": 0.9419643133878708,
"step": 73
},
{
"completion_length": 581.3549499511719,
"epoch": 0.15711252653927812,
"grad_norm": 0.25004705786705017,
"kl": 0.2255859375,
"learning_rate": 4.837676868837213e-06,
"loss": 0.0023,
"reward": 0.3521072790026665,
"reward_std": 0.18179307878017426,
"rewards/code_reward": 0.2556787021458149,
"rewards/format_reward": 0.9642857611179352,
"step": 74
},
{
"completion_length": 615.6384124755859,
"epoch": 0.1592356687898089,
"grad_norm": 0.2326764315366745,
"kl": 0.184326171875,
"learning_rate": 4.832197766696085e-06,
"loss": 0.002,
"reward": 0.3262624219059944,
"reward_std": 0.13872519508004189,
"rewards/code_reward": 0.22827134653925896,
"rewards/format_reward": 0.9799107611179352,
"step": 75
},
{
"completion_length": 626.1919708251953,
"epoch": 0.1613588110403397,
"grad_norm": 0.22483916580677032,
"kl": 0.2158203125,
"learning_rate": 4.826631299706887e-06,
"loss": 0.0022,
"reward": 0.24266962707042694,
"reward_std": 0.15623858594335616,
"rewards/code_reward": 0.14579462260007858,
"rewards/format_reward": 0.9687500447034836,
"step": 76
},
{
"completion_length": 638.6696624755859,
"epoch": 0.16348195329087048,
"grad_norm": 0.2549837827682495,
"kl": 0.2041015625,
"learning_rate": 4.820977701427424e-06,
"loss": 0.002,
"reward": 0.3548019379377365,
"reward_std": 0.19029108062386513,
"rewards/code_reward": 0.2577037066221237,
"rewards/format_reward": 0.9709821939468384,
"step": 77
},
{
"completion_length": 645.8727874755859,
"epoch": 0.16560509554140126,
"grad_norm": 0.2154364138841629,
"kl": 0.21875,
"learning_rate": 4.81523720907136e-06,
"loss": 0.0022,
"reward": 0.23285862803459167,
"reward_std": 0.129691231995821,
"rewards/code_reward": 0.13531397026963532,
"rewards/format_reward": 0.975446492433548,
"step": 78
},
{
"completion_length": 631.9732513427734,
"epoch": 0.16772823779193205,
"grad_norm": 0.22862014174461365,
"kl": 0.21435546875,
"learning_rate": 4.809410063498254e-06,
"loss": 0.0022,
"reward": 0.33913441002368927,
"reward_std": 0.1570077408105135,
"rewards/code_reward": 0.2411433346569538,
"rewards/format_reward": 0.9799107760190964,
"step": 79
},
{
"completion_length": 650.060302734375,
"epoch": 0.16985138004246284,
"grad_norm": 0.2403189241886139,
"kl": 0.218994140625,
"learning_rate": 4.8034965092034656e-06,
"loss": 0.0022,
"reward": 0.2641909271478653,
"reward_std": 0.15404854156076908,
"rewards/code_reward": 0.16664628125727177,
"rewards/format_reward": 0.9754464626312256,
"step": 80
},
{
"completion_length": 665.2299346923828,
"epoch": 0.17197452229299362,
"grad_norm": 0.22183021903038025,
"kl": 0.17236328125,
"learning_rate": 4.797496794307889e-06,
"loss": 0.0017,
"reward": 0.26636216044425964,
"reward_std": 0.159404331818223,
"rewards/code_reward": 0.16814786568284035,
"rewards/format_reward": 0.9821428954601288,
"step": 81
},
{
"completion_length": 646.7277069091797,
"epoch": 0.1740976645435244,
"grad_norm": 0.22360101342201233,
"kl": 0.1865234375,
"learning_rate": 4.791411170547545e-06,
"loss": 0.0019,
"reward": 0.2806714288890362,
"reward_std": 0.1345765646547079,
"rewards/code_reward": 0.1829035673290491,
"rewards/format_reward": 0.9776786267757416,
"step": 82
},
{
"completion_length": 649.9285888671875,
"epoch": 0.1762208067940552,
"grad_norm": 0.2544306218624115,
"kl": 0.173583984375,
"learning_rate": 4.785239893263017e-06,
"loss": 0.0017,
"reward": 0.26558100432157516,
"reward_std": 0.13677635975182056,
"rewards/code_reward": 0.16825956851243973,
"rewards/format_reward": 0.9732143431901932,
"step": 83
},
{
"completion_length": 686.3661041259766,
"epoch": 0.17834394904458598,
"grad_norm": 0.21967419981956482,
"kl": 0.16162109375,
"learning_rate": 4.778983221388742e-06,
"loss": 0.0016,
"reward": 0.24129238724708557,
"reward_std": 0.1258857063949108,
"rewards/code_reward": 0.14330130256712437,
"rewards/format_reward": 0.9799107760190964,
"step": 84
},
{
"completion_length": 634.2634124755859,
"epoch": 0.18046709129511676,
"grad_norm": 0.254304975271225,
"kl": 0.17724609375,
"learning_rate": 4.77264141744214e-06,
"loss": 0.0018,
"reward": 0.3212145194411278,
"reward_std": 0.1864020749926567,
"rewards/code_reward": 0.2236698605120182,
"rewards/format_reward": 0.9754464775323868,
"step": 85
},
{
"completion_length": 641.8393096923828,
"epoch": 0.18259023354564755,
"grad_norm": 0.24272438883781433,
"kl": 0.19873046875,
"learning_rate": 4.766214747512603e-06,
"loss": 0.002,
"reward": 0.31076986342668533,
"reward_std": 0.18200884014368057,
"rewards/code_reward": 0.2134484425187111,
"rewards/format_reward": 0.973214328289032,
"step": 86
},
{
"completion_length": 643.4263763427734,
"epoch": 0.18471337579617833,
"grad_norm": 0.2264644354581833,
"kl": 0.185791015625,
"learning_rate": 4.759703481250331e-06,
"loss": 0.0019,
"reward": 0.3214620351791382,
"reward_std": 0.14903312921524048,
"rewards/code_reward": 0.22347095608711243,
"rewards/format_reward": 0.979910746216774,
"step": 87
},
{
"completion_length": 649.5580749511719,
"epoch": 0.18683651804670912,
"grad_norm": 0.22847051918506622,
"kl": 0.169677734375,
"learning_rate": 4.753107891855015e-06,
"loss": 0.0018,
"reward": 0.25390685349702835,
"reward_std": 0.12118050269782543,
"rewards/code_reward": 0.15680862963199615,
"rewards/format_reward": 0.9709821939468384,
"step": 88
},
{
"completion_length": 652.5513610839844,
"epoch": 0.18895966029723993,
"grad_norm": 0.22527199983596802,
"kl": 0.19580078125,
"learning_rate": 4.746428256064375e-06,
"loss": 0.002,
"reward": 0.303693201392889,
"reward_std": 0.1710791066288948,
"rewards/code_reward": 0.20525570400059223,
"rewards/format_reward": 0.9843750298023224,
"step": 89
},
{
"completion_length": 693.4174499511719,
"epoch": 0.1910828025477707,
"grad_norm": 0.2056378573179245,
"kl": 0.17041015625,
"learning_rate": 4.7396648541425534e-06,
"loss": 0.0017,
"reward": 0.2523197568953037,
"reward_std": 0.1238141655921936,
"rewards/code_reward": 0.15432866849005222,
"rewards/format_reward": 0.979910746216774,
"step": 90
},
{
"completion_length": 679.5580596923828,
"epoch": 0.1932059447983015,
"grad_norm": 0.225660502910614,
"kl": 0.175537109375,
"learning_rate": 4.732817969868348e-06,
"loss": 0.0018,
"reward": 0.25567496195435524,
"reward_std": 0.16754142567515373,
"rewards/code_reward": 0.15813031047582626,
"rewards/format_reward": 0.9754464626312256,
"step": 91
},
{
"completion_length": 627.4464569091797,
"epoch": 0.19532908704883228,
"grad_norm": 0.24276329576969147,
"kl": 0.1419677734375,
"learning_rate": 4.7258878905233095e-06,
"loss": 0.0014,
"reward": 0.3579171895980835,
"reward_std": 0.21506508812308311,
"rewards/code_reward": 0.25992610678076744,
"rewards/format_reward": 0.9799107611179352,
"step": 92
},
{
"completion_length": 664.091552734375,
"epoch": 0.19745222929936307,
"grad_norm": 0.2459368258714676,
"kl": 0.17431640625,
"learning_rate": 4.718874906879688e-06,
"loss": 0.0017,
"reward": 0.25773513317108154,
"reward_std": 0.16034462675452232,
"rewards/code_reward": 0.16130654886364937,
"rewards/format_reward": 0.964285746216774,
"step": 93
},
{
"completion_length": 641.9576110839844,
"epoch": 0.19957537154989385,
"grad_norm": 0.20433549582958221,
"kl": 0.135986328125,
"learning_rate": 4.711779313188231e-06,
"loss": 0.0014,
"reward": 0.31772880256175995,
"reward_std": 0.12460769526660442,
"rewards/code_reward": 0.218844847753644,
"rewards/format_reward": 0.988839328289032,
"step": 94
},
{
"completion_length": 668.7522430419922,
"epoch": 0.20169851380042464,
"grad_norm": 0.228180393576622,
"kl": 0.1337890625,
"learning_rate": 4.70460140716584e-06,
"loss": 0.0014,
"reward": 0.23017888888716698,
"reward_std": 0.16307671833783388,
"rewards/code_reward": 0.13196459133177996,
"rewards/format_reward": 0.9821428954601288,
"step": 95
},
{
"completion_length": 666.0960083007812,
"epoch": 0.20382165605095542,
"grad_norm": 0.23849396407604218,
"kl": 0.132568359375,
"learning_rate": 4.697341489983076e-06,
"loss": 0.0013,
"reward": 0.38449443876743317,
"reward_std": 0.205027487128973,
"rewards/code_reward": 0.2869497686624527,
"rewards/format_reward": 0.9754464626312256,
"step": 96
},
{
"completion_length": 638.5513763427734,
"epoch": 0.2059447983014862,
"grad_norm": 0.23833003640174866,
"kl": 0.1219482421875,
"learning_rate": 4.6899998662515215e-06,
"loss": 0.0012,
"reward": 0.30101777240633965,
"reward_std": 0.18449735268950462,
"rewards/code_reward": 0.20235705375671387,
"rewards/format_reward": 0.9866071790456772,
"step": 97
},
{
"completion_length": 638.1161041259766,
"epoch": 0.208067940552017,
"grad_norm": 0.21731068193912506,
"kl": 0.146484375,
"learning_rate": 4.682576844011007e-06,
"loss": 0.0015,
"reward": 0.2744937762618065,
"reward_std": 0.16123195737600327,
"rewards/code_reward": 0.17650271020829678,
"rewards/format_reward": 0.979910746216774,
"step": 98
},
{
"completion_length": 616.8861846923828,
"epoch": 0.21019108280254778,
"grad_norm": 0.25706782937049866,
"kl": 0.134033203125,
"learning_rate": 4.675072734716678e-06,
"loss": 0.0013,
"reward": 0.27044272795319557,
"reward_std": 0.17698625475168228,
"rewards/code_reward": 0.17245164141058922,
"rewards/format_reward": 0.9799107313156128,
"step": 99
},
{
"completion_length": 617.6875305175781,
"epoch": 0.21231422505307856,
"grad_norm": 0.23481673002243042,
"kl": 0.123291015625,
"learning_rate": 4.667487853225931e-06,
"loss": 0.0013,
"reward": 0.27581261470913887,
"reward_std": 0.1309206485748291,
"rewards/code_reward": 0.17692868784070015,
"rewards/format_reward": 0.988839328289032,
"step": 100
},
{
"completion_length": 649.1205749511719,
"epoch": 0.21443736730360935,
"grad_norm": 0.22034895420074463,
"kl": 0.1197509765625,
"learning_rate": 4.659822517785203e-06,
"loss": 0.0012,
"reward": 0.3144006244838238,
"reward_std": 0.1468491405248642,
"rewards/code_reward": 0.21529346704483032,
"rewards/format_reward": 0.9910714477300644,
"step": 101
},
{
"completion_length": 637.7120819091797,
"epoch": 0.21656050955414013,
"grad_norm": 0.23196153342723846,
"kl": 0.1163330078125,
"learning_rate": 4.6520770500166165e-06,
"loss": 0.0012,
"reward": 0.2747727185487747,
"reward_std": 0.15404804423451424,
"rewards/code_reward": 0.17678163386881351,
"rewards/format_reward": 0.979910746216774,
"step": 102
},
{
"completion_length": 640.4375305175781,
"epoch": 0.21868365180467092,
"grad_norm": 0.21843470633029938,
"kl": 0.111083984375,
"learning_rate": 4.644251774904487e-06,
"loss": 0.0012,
"reward": 0.2366674654185772,
"reward_std": 0.12331773899495602,
"rewards/code_reward": 0.13889960199594498,
"rewards/format_reward": 0.9776786267757416,
"step": 103
},
{
"completion_length": 635.5982513427734,
"epoch": 0.2208067940552017,
"grad_norm": 0.2491585612297058,
"kl": 0.1253662109375,
"learning_rate": 4.636347020781684e-06,
"loss": 0.0013,
"reward": 0.26541591063141823,
"reward_std": 0.20751060917973518,
"rewards/code_reward": 0.16876413114368916,
"rewards/format_reward": 0.9665178954601288,
"step": 104
},
{
"completion_length": 636.6919860839844,
"epoch": 0.2229299363057325,
"grad_norm": 0.22667376697063446,
"kl": 0.1239013671875,
"learning_rate": 4.6283631193158605e-06,
"loss": 0.0013,
"reward": 0.28487036004662514,
"reward_std": 0.16408125311136246,
"rewards/code_reward": 0.18732571229338646,
"rewards/format_reward": 0.9754464775323868,
"step": 105
},
{
"completion_length": 645.0491333007812,
"epoch": 0.22505307855626328,
"grad_norm": 0.2283681333065033,
"kl": 0.124267578125,
"learning_rate": 4.620300405495532e-06,
"loss": 0.0013,
"reward": 0.2775597535073757,
"reward_std": 0.15426970086991787,
"rewards/code_reward": 0.17867580242455006,
"rewards/format_reward": 0.9888393133878708,
"step": 106
},
{
"completion_length": 633.8794860839844,
"epoch": 0.22717622080679406,
"grad_norm": 0.24237921833992004,
"kl": 0.1158447265625,
"learning_rate": 4.612159217616022e-06,
"loss": 0.0012,
"reward": 0.3130115121603012,
"reward_std": 0.20111830905079842,
"rewards/code_reward": 0.21502043306827545,
"rewards/format_reward": 0.979910746216774,
"step": 107
},
{
"completion_length": 607.8995666503906,
"epoch": 0.22929936305732485,
"grad_norm": 0.22627882659435272,
"kl": 0.1114501953125,
"learning_rate": 4.603939897265268e-06,
"loss": 0.0011,
"reward": 0.2647922486066818,
"reward_std": 0.13070931658148766,
"rewards/code_reward": 0.16546186804771423,
"rewards/format_reward": 0.9933035969734192,
"step": 108
},
{
"completion_length": 611.9308166503906,
"epoch": 0.23142250530785563,
"grad_norm": 0.24682241678237915,
"kl": 0.11474609375,
"learning_rate": 4.595642789309492e-06,
"loss": 0.0012,
"reward": 0.24479227885603905,
"reward_std": 0.14851071499288082,
"rewards/code_reward": 0.14657797291874886,
"rewards/format_reward": 0.9821428954601288,
"step": 109
},
{
"completion_length": 601.9018096923828,
"epoch": 0.23354564755838642,
"grad_norm": 0.22991596162319183,
"kl": 0.1337890625,
"learning_rate": 4.587268241878724e-06,
"loss": 0.0014,
"reward": 0.3472997844219208,
"reward_std": 0.20232820883393288,
"rewards/code_reward": 0.24953191354870796,
"rewards/format_reward": 0.9776785969734192,
"step": 110
},
{
"completion_length": 619.3036041259766,
"epoch": 0.2356687898089172,
"grad_norm": 0.23583151400089264,
"kl": 0.142822265625,
"learning_rate": 4.578816606352205e-06,
"loss": 0.0014,
"reward": 0.29563019424676895,
"reward_std": 0.17909668013453484,
"rewards/code_reward": 0.1987551935017109,
"rewards/format_reward": 0.9687500298023224,
"step": 111
},
{
"completion_length": 576.4419860839844,
"epoch": 0.23779193205944799,
"grad_norm": 0.2594500780105591,
"kl": 0.11865234375,
"learning_rate": 4.570288237343632e-06,
"loss": 0.0012,
"reward": 0.37235086783766747,
"reward_std": 0.21180756203830242,
"rewards/code_reward": 0.27346691489219666,
"rewards/format_reward": 0.9888393133878708,
"step": 112
},
{
"completion_length": 600.5937805175781,
"epoch": 0.23991507430997877,
"grad_norm": 0.24643369019031525,
"kl": 0.1270751953125,
"learning_rate": 4.561683492686289e-06,
"loss": 0.0013,
"reward": 0.31715739518404007,
"reward_std": 0.18861495703458786,
"rewards/code_reward": 0.21871986612677574,
"rewards/format_reward": 0.9843750596046448,
"step": 113
},
{
"completion_length": 587.732177734375,
"epoch": 0.24203821656050956,
"grad_norm": 0.23611418902873993,
"kl": 0.1268310546875,
"learning_rate": 4.5530027334180285e-06,
"loss": 0.0013,
"reward": 0.26467062532901764,
"reward_std": 0.17901071533560753,
"rewards/code_reward": 0.16712596639990807,
"rewards/format_reward": 0.9754464775323868,
"step": 114
},
{
"completion_length": 599.2254791259766,
"epoch": 0.24416135881104034,
"grad_norm": 0.24544627964496613,
"kl": 0.1339111328125,
"learning_rate": 4.544246323766122e-06,
"loss": 0.0014,
"reward": 0.27841826155781746,
"reward_std": 0.16098117642104626,
"rewards/code_reward": 0.18132001720368862,
"rewards/format_reward": 0.9709821939468384,
"step": 115
},
{
"completion_length": 570.107177734375,
"epoch": 0.24628450106157113,
"grad_norm": 0.25169771909713745,
"kl": 0.130859375,
"learning_rate": 4.535414631131983e-06,
"loss": 0.0013,
"reward": 0.34019989520311356,
"reward_std": 0.235354982316494,
"rewards/code_reward": 0.2422088049352169,
"rewards/format_reward": 0.9799107611179352,
"step": 116
},
{
"completion_length": 586.2834930419922,
"epoch": 0.2484076433121019,
"grad_norm": 0.2503570318222046,
"kl": 0.1285400390625,
"learning_rate": 4.526508026075746e-06,
"loss": 0.0013,
"reward": 0.33243585377931595,
"reward_std": 0.15851835533976555,
"rewards/code_reward": 0.23310547694563866,
"rewards/format_reward": 0.9933035969734192,
"step": 117
},
{
"completion_length": 619.6964569091797,
"epoch": 0.2505307855626327,
"grad_norm": 0.2021104097366333,
"kl": 0.1275634765625,
"learning_rate": 4.517526882300721e-06,
"loss": 0.0013,
"reward": 0.1987566240131855,
"reward_std": 0.12675911094993353,
"rewards/code_reward": 0.10143518354743719,
"rewards/format_reward": 0.973214328289032,
"step": 118
},
{
"completion_length": 577.7500305175781,
"epoch": 0.2526539278131635,
"grad_norm": 0.23152245581150055,
"kl": 0.139404296875,
"learning_rate": 4.508471576637713e-06,
"loss": 0.0014,
"reward": 0.24329132214188576,
"reward_std": 0.16570740193128586,
"rewards/code_reward": 0.14485381357371807,
"rewards/format_reward": 0.9843750447034836,
"step": 119
},
{
"completion_length": 599.3102874755859,
"epoch": 0.25477707006369427,
"grad_norm": 0.23706002533435822,
"kl": 0.1292724609375,
"learning_rate": 4.499342489029211e-06,
"loss": 0.0013,
"reward": 0.24242350459098816,
"reward_std": 0.14784781634807587,
"rewards/code_reward": 0.14398599043488503,
"rewards/format_reward": 0.9843750298023224,
"step": 120
},
{
"completion_length": 572.8839569091797,
"epoch": 0.25690021231422505,
"grad_norm": 0.2494058609008789,
"kl": 0.1270751953125,
"learning_rate": 4.490140002513449e-06,
"loss": 0.0013,
"reward": 0.26072419434785843,
"reward_std": 0.12450610846281052,
"rewards/code_reward": 0.16117061115801334,
"rewards/format_reward": 0.9955357313156128,
"step": 121
},
{
"completion_length": 601.8080749511719,
"epoch": 0.25902335456475584,
"grad_norm": 0.23028254508972168,
"kl": 0.1180419921875,
"learning_rate": 4.48086450320833e-06,
"loss": 0.0012,
"reward": 0.3514738455414772,
"reward_std": 0.16258227452635765,
"rewards/code_reward": 0.2525899298489094,
"rewards/format_reward": 0.988839328289032,
"step": 122
},
{
"completion_length": 590.4040374755859,
"epoch": 0.2611464968152866,
"grad_norm": 0.24208419024944305,
"kl": 0.1234130859375,
"learning_rate": 4.4715163802952266e-06,
"loss": 0.0012,
"reward": 0.3460327610373497,
"reward_std": 0.1636445987969637,
"rewards/code_reward": 0.24647919461131096,
"rewards/format_reward": 0.9955357313156128,
"step": 123
},
{
"completion_length": 609.7098388671875,
"epoch": 0.2632696390658174,
"grad_norm": 0.253397136926651,
"kl": 0.135009765625,
"learning_rate": 4.462096026002655e-06,
"loss": 0.0014,
"reward": 0.25145725160837173,
"reward_std": 0.16506105288863182,
"rewards/code_reward": 0.1530197374522686,
"rewards/format_reward": 0.9843750298023224,
"step": 124
},
{
"completion_length": 603.8973388671875,
"epoch": 0.2653927813163482,
"grad_norm": 0.2500688135623932,
"kl": 0.1434326171875,
"learning_rate": 4.4526038355898144e-06,
"loss": 0.0015,
"reward": 0.3970717117190361,
"reward_std": 0.2130543477833271,
"rewards/code_reward": 0.29908062517642975,
"rewards/format_reward": 0.979910746216774,
"step": 125
},
{
"completion_length": 615.4620819091797,
"epoch": 0.267515923566879,
"grad_norm": 0.20834827423095703,
"kl": 0.1336669921875,
"learning_rate": 4.4430402073300035e-06,
"loss": 0.0014,
"reward": 0.26642825454473495,
"reward_std": 0.1255171401426196,
"rewards/code_reward": 0.16799074038863182,
"rewards/format_reward": 0.9843750447034836,
"step": 126
},
{
"completion_length": 617.9687805175781,
"epoch": 0.26963906581740976,
"grad_norm": 0.23262540996074677,
"kl": 0.1351318359375,
"learning_rate": 4.433405542493909e-06,
"loss": 0.0014,
"reward": 0.2870429456233978,
"reward_std": 0.19062896817922592,
"rewards/code_reward": 0.18838223442435265,
"rewards/format_reward": 0.9866071939468384,
"step": 127
},
{
"completion_length": 662.2053833007812,
"epoch": 0.27176220806794055,
"grad_norm": 0.22548751533031464,
"kl": 0.1197509765625,
"learning_rate": 4.4237002453327734e-06,
"loss": 0.0013,
"reward": 0.30225350335240364,
"reward_std": 0.1395848747342825,
"rewards/code_reward": 0.203146331012249,
"rewards/format_reward": 0.9910714477300644,
"step": 128
},
{
"completion_length": 632.0022430419922,
"epoch": 0.27388535031847133,
"grad_norm": 0.24719858169555664,
"kl": 0.131103515625,
"learning_rate": 4.4139247230614245e-06,
"loss": 0.0013,
"reward": 0.32878731191158295,
"reward_std": 0.16303380951285362,
"rewards/code_reward": 0.22968016006052494,
"rewards/format_reward": 0.9910714477300644,
"step": 129
},
{
"completion_length": 636.7299346923828,
"epoch": 0.2760084925690021,
"grad_norm": 0.22243919968605042,
"kl": 0.1234130859375,
"learning_rate": 4.404079385841201e-06,
"loss": 0.0013,
"reward": 0.30703118816018105,
"reward_std": 0.12942655384540558,
"rewards/code_reward": 0.20792402233928442,
"rewards/format_reward": 0.9910714626312256,
"step": 130
},
{
"completion_length": 644.5468902587891,
"epoch": 0.2781316348195329,
"grad_norm": 0.220564067363739,
"kl": 0.123291015625,
"learning_rate": 4.394164646762734e-06,
"loss": 0.0013,
"reward": 0.296065516769886,
"reward_std": 0.18630750849843025,
"rewards/code_reward": 0.19673514552414417,
"rewards/format_reward": 0.9933035969734192,
"step": 131
},
{
"completion_length": 666.997802734375,
"epoch": 0.2802547770700637,
"grad_norm": 0.22151753306388855,
"kl": 0.1304931640625,
"learning_rate": 4.384180921828618e-06,
"loss": 0.0013,
"reward": 0.3110230341553688,
"reward_std": 0.1834610104560852,
"rewards/code_reward": 0.21370159462094307,
"rewards/format_reward": 0.973214328289032,
"step": 132
},
{
"completion_length": 665.1205749511719,
"epoch": 0.2823779193205945,
"grad_norm": 0.21862035989761353,
"kl": 0.1168212890625,
"learning_rate": 4.374128629935955e-06,
"loss": 0.0012,
"reward": 0.2876487486064434,
"reward_std": 0.21351643651723862,
"rewards/code_reward": 0.18965766951441765,
"rewards/format_reward": 0.9799107611179352,
"step": 133
},
{
"completion_length": 703.575927734375,
"epoch": 0.28450106157112526,
"grad_norm": 0.23025038838386536,
"kl": 0.1204833984375,
"learning_rate": 4.364008192858781e-06,
"loss": 0.0013,
"reward": 0.37238020449876785,
"reward_std": 0.17016195878386497,
"rewards/code_reward": 0.2737194746732712,
"rewards/format_reward": 0.986607164144516,
"step": 134
},
{
"completion_length": 726.8817138671875,
"epoch": 0.28662420382165604,
"grad_norm": 0.21229737997055054,
"kl": 0.121337890625,
"learning_rate": 4.353820035230366e-06,
"loss": 0.0012,
"reward": 0.20391739904880524,
"reward_std": 0.13868718035519123,
"rewards/code_reward": 0.10525666922330856,
"rewards/format_reward": 0.986607164144516,
"step": 135
},
{
"completion_length": 712.2098541259766,
"epoch": 0.28874734607218683,
"grad_norm": 0.2108394205570221,
"kl": 0.1138916015625,
"learning_rate": 4.3435645845254e-06,
"loss": 0.0012,
"reward": 0.3125154785811901,
"reward_std": 0.1781605463474989,
"rewards/code_reward": 0.2134083015844226,
"rewards/format_reward": 0.9910714626312256,
"step": 136
},
{
"completion_length": 718.5893249511719,
"epoch": 0.2908704883227176,
"grad_norm": 0.2107391357421875,
"kl": 0.1195068359375,
"learning_rate": 4.333242271042054e-06,
"loss": 0.0012,
"reward": 0.3177960254251957,
"reward_std": 0.1659994050860405,
"rewards/code_reward": 0.2186888586729765,
"rewards/format_reward": 0.9910714626312256,
"step": 137
},
{
"completion_length": 739.7545013427734,
"epoch": 0.2929936305732484,
"grad_norm": 0.2182048112154007,
"kl": 0.124755859375,
"learning_rate": 4.32285352788393e-06,
"loss": 0.0013,
"reward": 0.30886589735746384,
"reward_std": 0.1802590098232031,
"rewards/code_reward": 0.21020517125725746,
"rewards/format_reward": 0.9866071939468384,
"step": 138
},
{
"completion_length": 765.857177734375,
"epoch": 0.2951167728237792,
"grad_norm": 0.19854100048542023,
"kl": 0.115234375,
"learning_rate": 4.312398790941882e-06,
"loss": 0.0012,
"reward": 0.3000107705593109,
"reward_std": 0.15882322564721107,
"rewards/code_reward": 0.201126828789711,
"rewards/format_reward": 0.988839328289032,
"step": 139
},
{
"completion_length": 721.9687805175781,
"epoch": 0.29723991507430997,
"grad_norm": 0.23453111946582794,
"kl": 0.116455078125,
"learning_rate": 4.301878498875735e-06,
"loss": 0.0012,
"reward": 0.33861320093274117,
"reward_std": 0.1569173000752926,
"rewards/code_reward": 0.2401756690815091,
"rewards/format_reward": 0.9843750298023224,
"step": 140
},
{
"completion_length": 740.2031707763672,
"epoch": 0.29936305732484075,
"grad_norm": 0.21566148102283478,
"kl": 0.1102294921875,
"learning_rate": 4.291293093095873e-06,
"loss": 0.0011,
"reward": 0.3095410466194153,
"reward_std": 0.1992884911596775,
"rewards/code_reward": 0.2108803205192089,
"rewards/format_reward": 0.9866071790456772,
"step": 141
},
{
"completion_length": 710.2969207763672,
"epoch": 0.30148619957537154,
"grad_norm": 0.22105751931667328,
"kl": 0.12060546875,
"learning_rate": 4.280643017744723e-06,
"loss": 0.0013,
"reward": 0.36906543001532555,
"reward_std": 0.21653805300593376,
"rewards/code_reward": 0.2704046741127968,
"rewards/format_reward": 0.9866071790456772,
"step": 142
},
{
"completion_length": 751.5937805175781,
"epoch": 0.3036093418259023,
"grad_norm": 0.23819085955619812,
"kl": 0.1221923828125,
"learning_rate": 4.269928719678117e-06,
"loss": 0.0012,
"reward": 0.25049133971333504,
"reward_std": 0.17505915835499763,
"rewards/code_reward": 0.15160739235579967,
"rewards/format_reward": 0.988839328289032,
"step": 143
},
{
"completion_length": 735.9107513427734,
"epoch": 0.3057324840764331,
"grad_norm": 0.2204020470380783,
"kl": 0.1229248046875,
"learning_rate": 4.2591506484465426e-06,
"loss": 0.0012,
"reward": 0.26853859797120094,
"reward_std": 0.16319206822663546,
"rewards/code_reward": 0.17032429203391075,
"rewards/format_reward": 0.9821428805589676,
"step": 144
},
{
"completion_length": 730.8906707763672,
"epoch": 0.3078556263269639,
"grad_norm": 0.23124827444553375,
"kl": 0.119140625,
"learning_rate": 4.248309256276283e-06,
"loss": 0.0012,
"reward": 0.34641416370868683,
"reward_std": 0.15815678425133228,
"rewards/code_reward": 0.2479766495525837,
"rewards/format_reward": 0.9843750596046448,
"step": 145
},
{
"completion_length": 772.8192291259766,
"epoch": 0.3099787685774947,
"grad_norm": 0.21077241003513336,
"kl": 0.1168212890625,
"learning_rate": 4.23740499805044e-06,
"loss": 0.0012,
"reward": 0.2558128647506237,
"reward_std": 0.12367029674351215,
"rewards/code_reward": 0.15804500319063663,
"rewards/format_reward": 0.9776786267757416,
"step": 146
},
{
"completion_length": 740.8214721679688,
"epoch": 0.31210191082802546,
"grad_norm": 0.21619708836078644,
"kl": 0.125732421875,
"learning_rate": 4.22643833128985e-06,
"loss": 0.0013,
"reward": 0.33286692947149277,
"reward_std": 0.211603332310915,
"rewards/code_reward": 0.2342061996459961,
"rewards/format_reward": 0.9866071939468384,
"step": 147
},
{
"completion_length": 807.6071624755859,
"epoch": 0.31422505307855625,
"grad_norm": 0.2130916714668274,
"kl": 0.1212158203125,
"learning_rate": 4.215409716133885e-06,
"loss": 0.0012,
"reward": 0.3038931153714657,
"reward_std": 0.19640244916081429,
"rewards/code_reward": 0.2065716814249754,
"rewards/format_reward": 0.973214328289032,
"step": 148
},
{
"completion_length": 757.3281555175781,
"epoch": 0.31634819532908703,
"grad_norm": 0.21959362924098969,
"kl": 0.12744140625,
"learning_rate": 4.204319615321151e-06,
"loss": 0.0013,
"reward": 0.35224368050694466,
"reward_std": 0.1676900666207075,
"rewards/code_reward": 0.2542525976896286,
"rewards/format_reward": 0.9799107611179352,
"step": 149
},
{
"completion_length": 748.4687805175781,
"epoch": 0.3184713375796178,
"grad_norm": 0.22287501394748688,
"kl": 0.120361328125,
"learning_rate": 4.193168494170065e-06,
"loss": 0.0012,
"reward": 0.34373533725738525,
"reward_std": 0.18135884031653404,
"rewards/code_reward": 0.2457442507147789,
"rewards/format_reward": 0.9799107611179352,
"step": 150
},
{
"completion_length": 755.8906707763672,
"epoch": 0.3205944798301486,
"grad_norm": 1.9565397500991821,
"kl": 0.2320556640625,
"learning_rate": 4.181956820559339e-06,
"loss": 0.0023,
"reward": 0.3970649391412735,
"reward_std": 0.2402110919356346,
"rewards/code_reward": 0.2992970943450928,
"rewards/format_reward": 0.9776786118745804,
"step": 151
},
{
"completion_length": 779.8549499511719,
"epoch": 0.3227176220806794,
"grad_norm": 0.2783929109573364,
"kl": 0.15283203125,
"learning_rate": 4.170685064908342e-06,
"loss": 0.0016,
"reward": 0.19563322141766548,
"reward_std": 0.12617591954767704,
"rewards/code_reward": 0.11817785818129778,
"rewards/format_reward": 0.7745535969734192,
"step": 152
},
{
"completion_length": 726.2210083007812,
"epoch": 0.3248407643312102,
"grad_norm": 0.3019232451915741,
"kl": 0.157470703125,
"learning_rate": 4.159353700157365e-06,
"loss": 0.0016,
"reward": 0.17416437342762947,
"reward_std": 0.18693338334560394,
"rewards/code_reward": 0.14961079927161336,
"rewards/format_reward": 0.2455357201397419,
"step": 153
},
{
"completion_length": 698.8303833007812,
"epoch": 0.32696390658174096,
"grad_norm": 0.2774558365345001,
"kl": 0.1429443359375,
"learning_rate": 4.14796320174778e-06,
"loss": 0.0014,
"reward": 0.16863043326884508,
"reward_std": 0.1559329554438591,
"rewards/code_reward": 0.15412150975316763,
"rewards/format_reward": 0.145089291036129,
"step": 154
},
{
"completion_length": 675.5960083007812,
"epoch": 0.32908704883227174,
"grad_norm": 0.2807973027229309,
"kl": 0.1292724609375,
"learning_rate": 4.136514047602087e-06,
"loss": 0.0013,
"reward": 0.18624619487673044,
"reward_std": 0.18390434235334396,
"rewards/code_reward": 0.15968369878828526,
"rewards/format_reward": 0.2656250074505806,
"step": 155
},
{
"completion_length": 639.5625305175781,
"epoch": 0.33121019108280253,
"grad_norm": 0.2780408263206482,
"kl": 0.1229248046875,
"learning_rate": 4.1250067181038635e-06,
"loss": 0.0012,
"reward": 0.2191852517426014,
"reward_std": 0.13604657351970673,
"rewards/code_reward": 0.16851558908820152,
"rewards/format_reward": 0.5066964477300644,
"step": 156
},
{
"completion_length": 639.6652069091797,
"epoch": 0.3333333333333333,
"grad_norm": 0.26467737555503845,
"kl": 0.144775390625,
"learning_rate": 4.113441696077608e-06,
"loss": 0.0014,
"reward": 0.31026700511574745,
"reward_std": 0.202113538980484,
"rewards/code_reward": 0.2345973663032055,
"rewards/format_reward": 0.7566964626312256,
"step": 157
},
{
"completion_length": 659.5558319091797,
"epoch": 0.3354564755838641,
"grad_norm": 0.24479494988918304,
"kl": 0.1280517578125,
"learning_rate": 4.101819466768484e-06,
"loss": 0.0013,
"reward": 0.2640949599444866,
"reward_std": 0.1684006005525589,
"rewards/code_reward": 0.1763717383146286,
"rewards/format_reward": 0.8772321939468384,
"step": 158
},
{
"completion_length": 615.0558319091797,
"epoch": 0.3375796178343949,
"grad_norm": 0.24368631839752197,
"kl": 0.15087890625,
"learning_rate": 4.0901405178219535e-06,
"loss": 0.0015,
"reward": 0.345178809016943,
"reward_std": 0.19809392467141151,
"rewards/code_reward": 0.24941986054182053,
"rewards/format_reward": 0.957589328289032,
"step": 159
},
{
"completion_length": 624.1830444335938,
"epoch": 0.33970276008492567,
"grad_norm": 0.23896408081054688,
"kl": 0.154052734375,
"learning_rate": 4.078405339263326e-06,
"loss": 0.0015,
"reward": 0.37723641097545624,
"reward_std": 0.21996535174548626,
"rewards/code_reward": 0.28080783039331436,
"rewards/format_reward": 0.964285746216774,
"step": 160
},
{
"completion_length": 627.3236846923828,
"epoch": 0.34182590233545646,
"grad_norm": 0.2518380582332611,
"kl": 0.171142578125,
"learning_rate": 4.06661442347719e-06,
"loss": 0.0017,
"reward": 0.3184036388993263,
"reward_std": 0.19395017623901367,
"rewards/code_reward": 0.2217518538236618,
"rewards/format_reward": 0.9665178805589676,
"step": 161
},
{
"completion_length": 583.1674499511719,
"epoch": 0.34394904458598724,
"grad_norm": 0.2569345235824585,
"kl": 0.191162109375,
"learning_rate": 4.054768265186758e-06,
"loss": 0.0019,
"reward": 0.31612952798604965,
"reward_std": 0.20712972059845924,
"rewards/code_reward": 0.21836165338754654,
"rewards/format_reward": 0.9776785969734192,
"step": 162
},
{
"completion_length": 571.7857208251953,
"epoch": 0.346072186836518,
"grad_norm": 0.25089165568351746,
"kl": 0.207763671875,
"learning_rate": 4.0428673614331036e-06,
"loss": 0.0021,
"reward": 0.365755058825016,
"reward_std": 0.19836053252220154,
"rewards/code_reward": 0.2673175595700741,
"rewards/format_reward": 0.9843750447034836,
"step": 163
},
{
"completion_length": 596.4553833007812,
"epoch": 0.3481953290870488,
"grad_norm": 0.23911510407924652,
"kl": 0.22265625,
"learning_rate": 4.030912211554316e-06,
"loss": 0.0023,
"reward": 0.38677794113755226,
"reward_std": 0.18023086339235306,
"rewards/code_reward": 0.28744759038090706,
"rewards/format_reward": 0.9933035969734192,
"step": 164
},
{
"completion_length": 567.4486999511719,
"epoch": 0.3503184713375796,
"grad_norm": 0.24430882930755615,
"kl": 0.2099609375,
"learning_rate": 4.018903317164539e-06,
"loss": 0.0021,
"reward": 0.2250930406153202,
"reward_std": 0.19390171952545643,
"rewards/code_reward": 0.1277716178447008,
"rewards/format_reward": 0.9732143431901932,
"step": 165
},
{
"completion_length": 573.8594055175781,
"epoch": 0.3524416135881104,
"grad_norm": 0.2257012128829956,
"kl": 0.232666015625,
"learning_rate": 4.006841182132932e-06,
"loss": 0.0023,
"reward": 0.3599228076636791,
"reward_std": 0.20258177444338799,
"rewards/code_reward": 0.2603691965341568,
"rewards/format_reward": 0.9955357313156128,
"step": 166
},
{
"completion_length": 606.4911041259766,
"epoch": 0.35456475583864117,
"grad_norm": 0.238439679145813,
"kl": 0.252197265625,
"learning_rate": 3.9947263125625195e-06,
"loss": 0.0025,
"reward": 0.3261881247162819,
"reward_std": 0.1736624352633953,
"rewards/code_reward": 0.22775060683488846,
"rewards/format_reward": 0.9843750298023224,
"step": 167
},
{
"completion_length": 590.1027069091797,
"epoch": 0.35668789808917195,
"grad_norm": 0.22503866255283356,
"kl": 0.255615234375,
"learning_rate": 3.982559216768967e-06,
"loss": 0.0026,
"reward": 0.2961311787366867,
"reward_std": 0.1850012019276619,
"rewards/code_reward": 0.1968008242547512,
"rewards/format_reward": 0.9933035969734192,
"step": 168
},
{
"completion_length": 594.5491333007812,
"epoch": 0.35881104033970274,
"grad_norm": 0.22681432962417603,
"kl": 0.32421875,
"learning_rate": 3.970340405259245e-06,
"loss": 0.0033,
"reward": 0.4186030365526676,
"reward_std": 0.18541271798312664,
"rewards/code_reward": 0.31927267275750637,
"rewards/format_reward": 0.9933035969734192,
"step": 169
},
{
"completion_length": 594.2790298461914,
"epoch": 0.3609341825902335,
"grad_norm": 0.2294747531414032,
"kl": 0.32666015625,
"learning_rate": 3.958070390710214e-06,
"loss": 0.0033,
"reward": 0.36109255626797676,
"reward_std": 0.18586167134344578,
"rewards/code_reward": 0.26243184227496386,
"rewards/format_reward": 0.9866071790456772,
"step": 170
},
{
"completion_length": 592.4553680419922,
"epoch": 0.3630573248407643,
"grad_norm": 0.21662850677967072,
"kl": 0.247802734375,
"learning_rate": 3.945749687947109e-06,
"loss": 0.0025,
"reward": 0.24923527240753174,
"reward_std": 0.13961385935544968,
"rewards/code_reward": 0.15057454677298665,
"rewards/format_reward": 0.9866071939468384,
"step": 171
},
{
"completion_length": 570.9286041259766,
"epoch": 0.3651804670912951,
"grad_norm": 0.23902097344398499,
"kl": 0.22607421875,
"learning_rate": 3.933378813921942e-06,
"loss": 0.0023,
"reward": 0.3740244060754776,
"reward_std": 0.22742953523993492,
"rewards/code_reward": 0.27536366507411003,
"rewards/format_reward": 0.9866071790456772,
"step": 172
},
{
"completion_length": 598.2477874755859,
"epoch": 0.3673036093418259,
"grad_norm": 0.21673643589019775,
"kl": 0.213134765625,
"learning_rate": 3.920958287691811e-06,
"loss": 0.0021,
"reward": 0.2918965369462967,
"reward_std": 0.19641954079270363,
"rewards/code_reward": 0.19301261007785797,
"rewards/format_reward": 0.988839328289032,
"step": 173
},
{
"completion_length": 558.6718978881836,
"epoch": 0.36942675159235666,
"grad_norm": 0.2543295919895172,
"kl": 0.1962890625,
"learning_rate": 3.908488630397121e-06,
"loss": 0.002,
"reward": 0.41571951657533646,
"reward_std": 0.24686651676893234,
"rewards/code_reward": 0.31683557108044624,
"rewards/format_reward": 0.988839328289032,
"step": 174
},
{
"completion_length": 557.0201110839844,
"epoch": 0.37154989384288745,
"grad_norm": 0.23438729345798492,
"kl": 0.206298828125,
"learning_rate": 3.8959703652397175e-06,
"loss": 0.0021,
"reward": 0.38086430728435516,
"reward_std": 0.22366305626928806,
"rewards/code_reward": 0.28198035806417465,
"rewards/format_reward": 0.988839328289032,
"step": 175
},
{
"completion_length": 602.0536041259766,
"epoch": 0.37367303609341823,
"grad_norm": 0.24083319306373596,
"kl": 0.18701171875,
"learning_rate": 3.883404017460935e-06,
"loss": 0.0019,
"reward": 0.36414580047130585,
"reward_std": 0.22220248356461525,
"rewards/code_reward": 0.2657082974910736,
"rewards/format_reward": 0.9843750447034836,
"step": 176
},
{
"completion_length": 591.0893096923828,
"epoch": 0.37579617834394907,
"grad_norm": 0.25874680280685425,
"kl": 0.19873046875,
"learning_rate": 3.870790114319559e-06,
"loss": 0.002,
"reward": 0.3555009290575981,
"reward_std": 0.18250016495585442,
"rewards/code_reward": 0.25684019550681114,
"rewards/format_reward": 0.9866071939468384,
"step": 177
},
{
"completion_length": 587.404052734375,
"epoch": 0.37791932059447986,
"grad_norm": 0.22890929877758026,
"kl": 0.176513671875,
"learning_rate": 3.858129185069701e-06,
"loss": 0.0018,
"reward": 0.4567238390445709,
"reward_std": 0.2463996484875679,
"rewards/code_reward": 0.35806312412023544,
"rewards/format_reward": 0.9866071939468384,
"step": 178
},
{
"completion_length": 602.4486846923828,
"epoch": 0.38004246284501064,
"grad_norm": 0.22736036777496338,
"kl": 0.162353515625,
"learning_rate": 3.845421760938597e-06,
"loss": 0.0016,
"reward": 0.3570307157933712,
"reward_std": 0.16325377486646175,
"rewards/code_reward": 0.2583700120449066,
"rewards/format_reward": 0.9866071790456772,
"step": 179
},
{
"completion_length": 610.8638763427734,
"epoch": 0.3821656050955414,
"grad_norm": 0.2262299656867981,
"kl": 0.153076171875,
"learning_rate": 3.832668375104312e-06,
"loss": 0.0016,
"reward": 0.349903404712677,
"reward_std": 0.15386051312088966,
"rewards/code_reward": 0.2503498010337353,
"rewards/format_reward": 0.9955357313156128,
"step": 180
},
{
"completion_length": 639.8326110839844,
"epoch": 0.3842887473460722,
"grad_norm": 0.22941501438617706,
"kl": 0.17724609375,
"learning_rate": 3.8198695626733725e-06,
"loss": 0.0018,
"reward": 0.40823063999414444,
"reward_std": 0.2221880704164505,
"rewards/code_reward": 0.3093467131257057,
"rewards/format_reward": 0.988839328289032,
"step": 181
},
{
"completion_length": 638.1049346923828,
"epoch": 0.386411889596603,
"grad_norm": 0.23558823764324188,
"kl": 0.15283203125,
"learning_rate": 3.8070258606583156e-06,
"loss": 0.0016,
"reward": 0.36934422701597214,
"reward_std": 0.21686138212680817,
"rewards/code_reward": 0.27001385763287544,
"rewards/format_reward": 0.9933035969734192,
"step": 182
},
{
"completion_length": 625.5759124755859,
"epoch": 0.3885350318471338,
"grad_norm": 0.31238648295402527,
"kl": 0.166259765625,
"learning_rate": 3.7941378079551544e-06,
"loss": 0.0017,
"reward": 0.3830692619085312,
"reward_std": 0.24278680607676506,
"rewards/code_reward": 0.2835156861692667,
"rewards/format_reward": 0.9955357313156128,
"step": 183
},
{
"completion_length": 665.8192291259766,
"epoch": 0.39065817409766457,
"grad_norm": 0.3192687928676605,
"kl": 0.1513671875,
"learning_rate": 3.7812059453207677e-06,
"loss": 0.0015,
"reward": 0.3427841551601887,
"reward_std": 0.20133822225034237,
"rewards/code_reward": 0.2441234067082405,
"rewards/format_reward": 0.9866071939468384,
"step": 184
},
{
"completion_length": 655.4486846923828,
"epoch": 0.39278131634819535,
"grad_norm": 0.243864506483078,
"kl": 0.141845703125,
"learning_rate": 3.768230815350213e-06,
"loss": 0.0014,
"reward": 0.32591256499290466,
"reward_std": 0.1841282658278942,
"rewards/code_reward": 0.22680539265275002,
"rewards/format_reward": 0.9910714477300644,
"step": 185
},
{
"completion_length": 680.6942291259766,
"epoch": 0.39490445859872614,
"grad_norm": 2.7162351608276367,
"kl": 0.2255859375,
"learning_rate": 3.7552129624539557e-06,
"loss": 0.0023,
"reward": 0.38928014785051346,
"reward_std": 0.22797510400414467,
"rewards/code_reward": 0.2917355000972748,
"rewards/format_reward": 0.9754464626312256,
"step": 186
},
{
"completion_length": 670.8727874755859,
"epoch": 0.3970276008492569,
"grad_norm": 28.86046600341797,
"kl": 3.12841796875,
"learning_rate": 3.7421529328350316e-06,
"loss": 0.0313,
"reward": 0.33664827793836594,
"reward_std": 0.2122020348906517,
"rewards/code_reward": 0.2404429018497467,
"rewards/format_reward": 0.9620536118745804,
"step": 187
},
{
"completion_length": 683.8326110839844,
"epoch": 0.3991507430997877,
"grad_norm": 0.4426620602607727,
"kl": 0.14501953125,
"learning_rate": 3.7290512744661274e-06,
"loss": 0.0015,
"reward": 0.38399138301610947,
"reward_std": 0.1990874893963337,
"rewards/code_reward": 0.2860002890229225,
"rewards/format_reward": 0.9799107611179352,
"step": 188
},
{
"completion_length": 647.325927734375,
"epoch": 0.4012738853503185,
"grad_norm": 0.2341061532497406,
"kl": 0.1455078125,
"learning_rate": 3.715908537066589e-06,
"loss": 0.0015,
"reward": 0.42976176738739014,
"reward_std": 0.21941150352358818,
"rewards/code_reward": 0.33199387788772583,
"rewards/format_reward": 0.9776786267757416,
"step": 189
},
{
"completion_length": 698.1763610839844,
"epoch": 0.4033970276008493,
"grad_norm": 1.953539252281189,
"kl": 0.5579833984375,
"learning_rate": 3.7027252720793538e-06,
"loss": 0.0056,
"reward": 0.33469754457473755,
"reward_std": 0.19711985811591148,
"rewards/code_reward": 0.23692966997623444,
"rewards/format_reward": 0.9776786118745804,
"step": 190
},
{
"completion_length": 710.0982360839844,
"epoch": 0.40552016985138006,
"grad_norm": 0.24030916392803192,
"kl": 0.161865234375,
"learning_rate": 3.689502032647817e-06,
"loss": 0.0016,
"reward": 0.35261962562799454,
"reward_std": 0.2262839339673519,
"rewards/code_reward": 0.25552139058709145,
"rewards/format_reward": 0.970982164144516,
"step": 191
},
{
"completion_length": 672.5401916503906,
"epoch": 0.40764331210191085,
"grad_norm": 0.9592034816741943,
"kl": 0.154541015625,
"learning_rate": 3.6762393735926245e-06,
"loss": 0.0016,
"reward": 0.3493685219436884,
"reward_std": 0.1753272709902376,
"rewards/code_reward": 0.2524934969842434,
"rewards/format_reward": 0.9687500298023224,
"step": 192
},
{
"completion_length": 710.7299499511719,
"epoch": 0.40976645435244163,
"grad_norm": 0.3044726550579071,
"kl": 0.15185546875,
"learning_rate": 3.6629378513883852e-06,
"loss": 0.0015,
"reward": 0.4329136684536934,
"reward_std": 0.257048511877656,
"rewards/code_reward": 0.3346993774175644,
"rewards/format_reward": 0.98214291036129,
"step": 193
},
{
"completion_length": 718.3683166503906,
"epoch": 0.4118895966029724,
"grad_norm": 0.2441069632768631,
"kl": 0.1630859375,
"learning_rate": 3.6495980241403307e-06,
"loss": 0.0016,
"reward": 0.32557281479239464,
"reward_std": 0.19367647171020508,
"rewards/code_reward": 0.2271352931857109,
"rewards/format_reward": 0.9843750596046448,
"step": 194
},
{
"completion_length": 701.9486846923828,
"epoch": 0.4140127388535032,
"grad_norm": 0.22456014156341553,
"kl": 0.16064453125,
"learning_rate": 3.636220451560896e-06,
"loss": 0.0016,
"reward": 0.42680248618125916,
"reward_std": 0.2046816684305668,
"rewards/code_reward": 0.32903461158275604,
"rewards/format_reward": 0.9776786267757416,
"step": 195
},
{
"completion_length": 713.747802734375,
"epoch": 0.416135881104034,
"grad_norm": 0.45598000288009644,
"kl": 0.149169921875,
"learning_rate": 3.622805694946235e-06,
"loss": 0.0015,
"reward": 0.3776397071778774,
"reward_std": 0.18774981424212456,
"rewards/code_reward": 0.2803182378411293,
"rewards/format_reward": 0.9732143133878708,
"step": 196
},
{
"completion_length": 717.1406555175781,
"epoch": 0.4182590233545648,
"grad_norm": 0.21405339241027832,
"kl": 0.1429443359375,
"learning_rate": 3.609354317152667e-06,
"loss": 0.0015,
"reward": 0.38271288573741913,
"reward_std": 0.19382936879992485,
"rewards/code_reward": 0.28539142571389675,
"rewards/format_reward": 0.9732143133878708,
"step": 197
},
{
"completion_length": 678.2477874755859,
"epoch": 0.42038216560509556,
"grad_norm": 0.49006760120391846,
"kl": 0.2021484375,
"learning_rate": 3.595866882573063e-06,
"loss": 0.0021,
"reward": 0.4323223605751991,
"reward_std": 0.2277931533753872,
"rewards/code_reward": 0.3345545120537281,
"rewards/format_reward": 0.9776786118745804,
"step": 198
},
{
"completion_length": 728.9620971679688,
"epoch": 0.42250530785562634,
"grad_norm": 0.39922112226486206,
"kl": 0.184814453125,
"learning_rate": 3.5823439571131675e-06,
"loss": 0.0019,
"reward": 0.40869200229644775,
"reward_std": 0.2020891159772873,
"rewards/code_reward": 0.31159375607967377,
"rewards/format_reward": 0.9709821939468384,
"step": 199
},
{
"completion_length": 684.9576416015625,
"epoch": 0.42462845010615713,
"grad_norm": 0.23013651371002197,
"kl": 0.149658203125,
"learning_rate": 3.5687861081678477e-06,
"loss": 0.0015,
"reward": 0.4545319005846977,
"reward_std": 0.24276942387223244,
"rewards/code_reward": 0.3572104535996914,
"rewards/format_reward": 0.9732143431901932,
"step": 200
},
{
"completion_length": 704.5223541259766,
"epoch": 0.4267515923566879,
"grad_norm": 0.46601447463035583,
"kl": 0.145263671875,
"learning_rate": 3.555193904597291e-06,
"loss": 0.0015,
"reward": 0.3521813452243805,
"reward_std": 0.1790554393082857,
"rewards/code_reward": 0.2555295582860708,
"rewards/format_reward": 0.96651791036129,
"step": 201
},
{
"completion_length": 676.4754791259766,
"epoch": 0.4288747346072187,
"grad_norm": 0.24227948486804962,
"kl": 0.145751953125,
"learning_rate": 3.541567916703138e-06,
"loss": 0.0015,
"reward": 0.4256810247898102,
"reward_std": 0.2298164926469326,
"rewards/code_reward": 0.327689953148365,
"rewards/format_reward": 0.979910746216774,
"step": 202
},
{
"completion_length": 697.6027069091797,
"epoch": 0.4309978768577495,
"grad_norm": 0.32301369309425354,
"kl": 0.141845703125,
"learning_rate": 3.5279087162045517e-06,
"loss": 0.0014,
"reward": 0.27234210819005966,
"reward_std": 0.17865055054426193,
"rewards/code_reward": 0.17479745857417583,
"rewards/format_reward": 0.9754464775323868,
"step": 203
},
{
"completion_length": 696.3727874755859,
"epoch": 0.43312101910828027,
"grad_norm": 0.6582425236701965,
"kl": 0.14111328125,
"learning_rate": 3.5142168762142265e-06,
"loss": 0.0014,
"reward": 0.3229696787893772,
"reward_std": 0.1942148432135582,
"rewards/code_reward": 0.22542503476142883,
"rewards/format_reward": 0.9754464626312256,
"step": 204
},
{
"completion_length": 718.2232513427734,
"epoch": 0.43524416135881105,
"grad_norm": 0.30619704723358154,
"kl": 0.149169921875,
"learning_rate": 3.500492971214347e-06,
"loss": 0.0015,
"reward": 0.4395933449268341,
"reward_std": 0.265441432595253,
"rewards/code_reward": 0.3402629792690277,
"rewards/format_reward": 0.9933035969734192,
"step": 205
},
{
"completion_length": 699.310302734375,
"epoch": 0.43736730360934184,
"grad_norm": 0.3680998980998993,
"kl": 0.151611328125,
"learning_rate": 3.48673757703248e-06,
"loss": 0.0015,
"reward": 0.3385552614927292,
"reward_std": 0.24193225800991058,
"rewards/code_reward": 0.24101059883832932,
"rewards/format_reward": 0.9754464626312256,
"step": 206
},
{
"completion_length": 712.4799499511719,
"epoch": 0.4394904458598726,
"grad_norm": 0.22541551291942596,
"kl": 0.315673828125,
"learning_rate": 3.472951270817418e-06,
"loss": 0.0032,
"reward": 0.317364189773798,
"reward_std": 0.2289394848048687,
"rewards/code_reward": 0.2191498950123787,
"rewards/format_reward": 0.9821428954601288,
"step": 207
},
{
"completion_length": 725.1719207763672,
"epoch": 0.4416135881104034,
"grad_norm": 0.7986815571784973,
"kl": 0.8701171875,
"learning_rate": 3.4591346310149578e-06,
"loss": 0.0087,
"reward": 0.30210861191153526,
"reward_std": 0.1758405715227127,
"rewards/code_reward": 0.20545680448412895,
"rewards/format_reward": 0.9665178954601288,
"step": 208
},
{
"completion_length": 708.9174346923828,
"epoch": 0.4437367303609342,
"grad_norm": 0.6832094788551331,
"kl": 0.571533203125,
"learning_rate": 3.445288237343632e-06,
"loss": 0.0057,
"reward": 0.34425482153892517,
"reward_std": 0.17729798145592213,
"rewards/code_reward": 0.24559411033988,
"rewards/format_reward": 0.9866071939468384,
"step": 209
},
{
"completion_length": 664.4152069091797,
"epoch": 0.445859872611465,
"grad_norm": 0.5344778299331665,
"kl": 0.344970703125,
"learning_rate": 3.4314126707703895e-06,
"loss": 0.0035,
"reward": 0.3406968005001545,
"reward_std": 0.21405612863600254,
"rewards/code_reward": 0.2424825206398964,
"rewards/format_reward": 0.98214291036129,
"step": 210
},
{
"completion_length": 687.4955596923828,
"epoch": 0.44798301486199577,
"grad_norm": 0.2852449119091034,
"kl": 0.314208984375,
"learning_rate": 3.4175085134862128e-06,
"loss": 0.0031,
"reward": 0.37548423558473587,
"reward_std": 0.19767768681049347,
"rewards/code_reward": 0.2783860079944134,
"rewards/format_reward": 0.9709821939468384,
"step": 211
},
{
"completion_length": 694.372802734375,
"epoch": 0.45010615711252655,
"grad_norm": 0.8310821056365967,
"kl": 0.214111328125,
"learning_rate": 3.4035763488816953e-06,
"loss": 0.0021,
"reward": 0.5172732323408127,
"reward_std": 0.24472371861338615,
"rewards/code_reward": 0.41883569955825806,
"rewards/format_reward": 0.9843750447034836,
"step": 212
},
{
"completion_length": 678.0424499511719,
"epoch": 0.45222929936305734,
"grad_norm": 0.28591927886009216,
"kl": 0.14306640625,
"learning_rate": 3.3896167615225594e-06,
"loss": 0.0015,
"reward": 0.3543313890695572,
"reward_std": 0.21659231930971146,
"rewards/code_reward": 0.2567867375910282,
"rewards/format_reward": 0.9754464626312256,
"step": 213
},
{
"completion_length": 692.7522583007812,
"epoch": 0.4543524416135881,
"grad_norm": 0.5759381055831909,
"kl": 0.1455078125,
"learning_rate": 3.375630337125133e-06,
"loss": 0.0015,
"reward": 0.39294832199811935,
"reward_std": 0.26400984078645706,
"rewards/code_reward": 0.2960733026266098,
"rewards/format_reward": 0.9687500298023224,
"step": 214
},
{
"completion_length": 731.1272735595703,
"epoch": 0.4564755838641189,
"grad_norm": 0.23394830524921417,
"kl": 0.143798828125,
"learning_rate": 3.361617662531772e-06,
"loss": 0.0014,
"reward": 0.3601933494210243,
"reward_std": 0.25189225003123283,
"rewards/code_reward": 0.2619790583848953,
"rewards/format_reward": 0.9821428805589676,
"step": 215
},
{
"completion_length": 692.1428833007812,
"epoch": 0.4585987261146497,
"grad_norm": 0.24470415711402893,
"kl": 0.1317138671875,
"learning_rate": 3.347579325686237e-06,
"loss": 0.0013,
"reward": 0.3433048315346241,
"reward_std": 0.22325459122657776,
"rewards/code_reward": 0.245536956936121,
"rewards/format_reward": 0.9776786118745804,
"step": 216
},
{
"completion_length": 684.3705749511719,
"epoch": 0.4607218683651805,
"grad_norm": 0.364793062210083,
"kl": 0.122802734375,
"learning_rate": 3.333515915609027e-06,
"loss": 0.0012,
"reward": 0.4696499854326248,
"reward_std": 0.2734139449894428,
"rewards/code_reward": 0.3707660511136055,
"rewards/format_reward": 0.988839328289032,
"step": 217
},
{
"completion_length": 723.4129791259766,
"epoch": 0.46284501061571126,
"grad_norm": 0.36840999126434326,
"kl": 0.128173828125,
"learning_rate": 3.3194280223726616e-06,
"loss": 0.0013,
"reward": 0.3476767987012863,
"reward_std": 0.19485369697213173,
"rewards/code_reward": 0.24968570843338966,
"rewards/format_reward": 0.9799107611179352,
"step": 218
},
{
"completion_length": 660.7812805175781,
"epoch": 0.46496815286624205,
"grad_norm": 0.2917425036430359,
"kl": 0.142578125,
"learning_rate": 3.305316237076927e-06,
"loss": 0.0014,
"reward": 0.39485304057598114,
"reward_std": 0.23686816543340683,
"rewards/code_reward": 0.29708515852689743,
"rewards/format_reward": 0.9776786118745804,
"step": 219
},
{
"completion_length": 674.2410888671875,
"epoch": 0.46709129511677283,
"grad_norm": 0.27625608444213867,
"kl": 0.13134765625,
"learning_rate": 3.291181151824071e-06,
"loss": 0.0014,
"reward": 0.5001323744654655,
"reward_std": 0.2807440906763077,
"rewards/code_reward": 0.40124842897057533,
"rewards/format_reward": 0.9888393133878708,
"step": 220
},
{
"completion_length": 717.279052734375,
"epoch": 0.4692144373673036,
"grad_norm": 0.26342645287513733,
"kl": 0.132568359375,
"learning_rate": 3.27702335969396e-06,
"loss": 0.0014,
"reward": 0.438594788312912,
"reward_std": 0.2874513529241085,
"rewards/code_reward": 0.34060370177030563,
"rewards/format_reward": 0.979910746216774,
"step": 221
},
{
"completion_length": 740.8192291259766,
"epoch": 0.4713375796178344,
"grad_norm": 0.3312680423259735,
"kl": 0.144287109375,
"learning_rate": 3.2628434547191985e-06,
"loss": 0.0014,
"reward": 0.4112970530986786,
"reward_std": 0.2245728299021721,
"rewards/code_reward": 0.3137524016201496,
"rewards/format_reward": 0.9754464775323868,
"step": 222
},
{
"completion_length": 709.9174499511719,
"epoch": 0.4734607218683652,
"grad_norm": 1.5567724704742432,
"kl": 0.1339111328125,
"learning_rate": 3.2486420318601973e-06,
"loss": 0.0014,
"reward": 0.4251294732093811,
"reward_std": 0.18372783437371254,
"rewards/code_reward": 0.3291473314166069,
"rewards/format_reward": 0.9598214775323868,
"step": 223
},
{
"completion_length": 719.7656707763672,
"epoch": 0.47558386411889597,
"grad_norm": 0.2122294157743454,
"kl": 0.1273193359375,
"learning_rate": 3.2344196869802187e-06,
"loss": 0.0013,
"reward": 0.3450777679681778,
"reward_std": 0.24194234982132912,
"rewards/code_reward": 0.24730990827083588,
"rewards/format_reward": 0.9776786118745804,
"step": 224
},
{
"completion_length": 704.0045013427734,
"epoch": 0.47770700636942676,
"grad_norm": 0.9711757898330688,
"kl": 0.20751953125,
"learning_rate": 3.2201770168203694e-06,
"loss": 0.0021,
"reward": 0.4306853115558624,
"reward_std": 0.2568584829568863,
"rewards/code_reward": 0.33492637425661087,
"rewards/format_reward": 0.9575893133878708,
"step": 225
},
{
"completion_length": 727.6027069091797,
"epoch": 0.47983014861995754,
"grad_norm": 0.268039733171463,
"kl": 0.13818359375,
"learning_rate": 3.205914618974563e-06,
"loss": 0.0014,
"reward": 0.43213512748479843,
"reward_std": 0.2562938630580902,
"rewards/code_reward": 0.334367249161005,
"rewards/format_reward": 0.9776786267757416,
"step": 226
},
{
"completion_length": 732.9643249511719,
"epoch": 0.4819532908704883,
"grad_norm": 0.46155139803886414,
"kl": 0.198486328125,
"learning_rate": 3.1916330918644496e-06,
"loss": 0.002,
"reward": 0.31592320650815964,
"reward_std": 0.19539642706513405,
"rewards/code_reward": 0.2174856998026371,
"rewards/format_reward": 0.9843750596046448,
"step": 227
},
{
"completion_length": 770.1295013427734,
"epoch": 0.4840764331210191,
"grad_norm": 0.7360585331916809,
"kl": 0.3978271484375,
"learning_rate": 3.177333034714303e-06,
"loss": 0.004,
"reward": 0.35912561416625977,
"reward_std": 0.21681112423539162,
"rewards/code_reward": 0.26135774329304695,
"rewards/format_reward": 0.9776785969734192,
"step": 228
},
{
"completion_length": 706.9777069091797,
"epoch": 0.4861995753715499,
"grad_norm": 1.2824815511703491,
"kl": 0.615478515625,
"learning_rate": 3.1630150475258813e-06,
"loss": 0.0062,
"reward": 0.3668329790234566,
"reward_std": 0.2176014445722103,
"rewards/code_reward": 0.2699579633772373,
"rewards/format_reward": 0.9687500596046448,
"step": 229
},
{
"completion_length": 709.825927734375,
"epoch": 0.4883227176220807,
"grad_norm": 0.4730873107910156,
"kl": 0.4136962890625,
"learning_rate": 3.148679731053252e-06,
"loss": 0.0041,
"reward": 0.4401291459798813,
"reward_std": 0.2792894318699837,
"rewards/code_reward": 0.34213805943727493,
"rewards/format_reward": 0.9799107611179352,
"step": 230
},
{
"completion_length": 716.0312805175781,
"epoch": 0.49044585987261147,
"grad_norm": 0.226039856672287,
"kl": 0.1241455078125,
"learning_rate": 3.1343276867775805e-06,
"loss": 0.0013,
"reward": 0.3396586962044239,
"reward_std": 0.19299479201436043,
"rewards/code_reward": 0.24211404286324978,
"rewards/format_reward": 0.9754464775323868,
"step": 231
},
{
"completion_length": 699.6830749511719,
"epoch": 0.49256900212314225,
"grad_norm": 0.31895455718040466,
"kl": 0.50146484375,
"learning_rate": 3.1199595168819043e-06,
"loss": 0.0051,
"reward": 0.34284605644643307,
"reward_std": 0.14287223480641842,
"rewards/code_reward": 0.24463177705183625,
"rewards/format_reward": 0.98214291036129,
"step": 232
},
{
"completion_length": 781.5178985595703,
"epoch": 0.49469214437367304,
"grad_norm": 0.4143598973751068,
"kl": 0.249755859375,
"learning_rate": 3.105575824225852e-06,
"loss": 0.0025,
"reward": 0.38098950684070587,
"reward_std": 0.21905666589736938,
"rewards/code_reward": 0.28590018674731255,
"rewards/format_reward": 0.9508928954601288,
"step": 233
},
{
"completion_length": 725.3705749511719,
"epoch": 0.4968152866242038,
"grad_norm": 0.9609025120735168,
"kl": 0.401123046875,
"learning_rate": 3.091177212320363e-06,
"loss": 0.004,
"reward": 0.4063151776790619,
"reward_std": 0.25925979763269424,
"rewards/code_reward": 0.3076544553041458,
"rewards/format_reward": 0.986607164144516,
"step": 234
},
{
"completion_length": 730.1406555175781,
"epoch": 0.4989384288747346,
"grad_norm": 0.2471870481967926,
"kl": 0.233154296875,
"learning_rate": 3.0767642853023538e-06,
"loss": 0.0024,
"reward": 0.3827313929796219,
"reward_std": 0.21219320595264435,
"rewards/code_reward": 0.2858563922345638,
"rewards/format_reward": 0.9687500298023224,
"step": 235
},
{
"completion_length": 696.4219207763672,
"epoch": 0.5010615711252654,
"grad_norm": 0.6714680194854736,
"kl": 0.1856689453125,
"learning_rate": 3.062337647909376e-06,
"loss": 0.0019,
"reward": 0.4210161566734314,
"reward_std": 0.18587047047913074,
"rewards/code_reward": 0.3232482895255089,
"rewards/format_reward": 0.9776786267757416,
"step": 236
},
{
"completion_length": 744.200927734375,
"epoch": 0.5031847133757962,
"grad_norm": 0.5082603096961975,
"kl": 0.2071533203125,
"learning_rate": 3.04789790545424e-06,
"loss": 0.0021,
"reward": 0.4485570266842842,
"reward_std": 0.1916775107383728,
"rewards/code_reward": 0.3519052043557167,
"rewards/format_reward": 0.9665178954601288,
"step": 237
},
{
"completion_length": 758.0178985595703,
"epoch": 0.505307855626327,
"grad_norm": 0.69068843126297,
"kl": 0.19482421875,
"learning_rate": 3.033445663799621e-06,
"loss": 0.002,
"reward": 0.3711010664701462,
"reward_std": 0.1955837495625019,
"rewards/code_reward": 0.2742260619997978,
"rewards/format_reward": 0.9687500596046448,
"step": 238
},
{
"completion_length": 717.8147735595703,
"epoch": 0.5074309978768577,
"grad_norm": 0.40367022156715393,
"kl": 0.161865234375,
"learning_rate": 3.018981529332633e-06,
"loss": 0.0016,
"reward": 0.5175677761435509,
"reward_std": 0.2608077637851238,
"rewards/code_reward": 0.4193534851074219,
"rewards/format_reward": 0.9821428954601288,
"step": 239
},
{
"completion_length": 729.2545013427734,
"epoch": 0.5095541401273885,
"grad_norm": 0.5104432702064514,
"kl": 0.19384765625,
"learning_rate": 3.00450610893939e-06,
"loss": 0.002,
"reward": 0.40982675552368164,
"reward_std": 0.20613017305731773,
"rewards/code_reward": 0.31161245331168175,
"rewards/format_reward": 0.9821428954601288,
"step": 240
},
{
"completion_length": 705.7120819091797,
"epoch": 0.5116772823779193,
"grad_norm": 0.2226688116788864,
"kl": 0.167724609375,
"learning_rate": 2.9900200099795396e-06,
"loss": 0.0017,
"reward": 0.40758588910102844,
"reward_std": 0.22469941899180412,
"rewards/code_reward": 0.31048765778541565,
"rewards/format_reward": 0.970982164144516,
"step": 241
},
{
"completion_length": 716.6004791259766,
"epoch": 0.5138004246284501,
"grad_norm": 0.823330283164978,
"kl": 0.224853515625,
"learning_rate": 2.9755238402607826e-06,
"loss": 0.0023,
"reward": 0.381127692759037,
"reward_std": 0.1726750060915947,
"rewards/code_reward": 0.2817973233759403,
"rewards/format_reward": 0.9933035969734192,
"step": 242
},
{
"completion_length": 714.8460235595703,
"epoch": 0.5159235668789809,
"grad_norm": 0.5035973191261292,
"kl": 0.198486328125,
"learning_rate": 2.961018208013367e-06,
"loss": 0.002,
"reward": 0.3932320065796375,
"reward_std": 0.14925590343773365,
"rewards/code_reward": 0.295910551212728,
"rewards/format_reward": 0.973214328289032,
"step": 243
},
{
"completion_length": 714.6205749511719,
"epoch": 0.5180467091295117,
"grad_norm": 0.6857829689979553,
"kl": 0.16259765625,
"learning_rate": 2.9465037218645694e-06,
"loss": 0.0016,
"reward": 0.3965849094092846,
"reward_std": 0.20821771398186684,
"rewards/code_reward": 0.3001563027501106,
"rewards/format_reward": 0.9642857611179352,
"step": 244
},
{
"completion_length": 710.7254791259766,
"epoch": 0.5201698513800425,
"grad_norm": 1.60740327835083,
"kl": 0.131591796875,
"learning_rate": 2.9319809908131604e-06,
"loss": 0.0013,
"reward": 0.43405191600322723,
"reward_std": 0.25733353197574615,
"rewards/code_reward": 0.33539119362831116,
"rewards/format_reward": 0.9866071790456772,
"step": 245
},
{
"completion_length": 690.6027221679688,
"epoch": 0.5222929936305732,
"grad_norm": 0.2978648841381073,
"kl": 0.1689453125,
"learning_rate": 2.917450624203847e-06,
"loss": 0.0017,
"reward": 0.45192842930555344,
"reward_std": 0.24438033252954483,
"rewards/code_reward": 0.3539373278617859,
"rewards/format_reward": 0.9799107611179352,
"step": 246
},
{
"completion_length": 737.6094207763672,
"epoch": 0.524416135881104,
"grad_norm": 0.3084428310394287,
"kl": 0.1378173828125,
"learning_rate": 2.9029132317017118e-06,
"loss": 0.0014,
"reward": 0.46284686774015427,
"reward_std": 0.2403612770140171,
"rewards/code_reward": 0.36619507521390915,
"rewards/format_reward": 0.96651791036129,
"step": 247
},
{
"completion_length": 698.7411041259766,
"epoch": 0.5265392781316348,
"grad_norm": 1.3392090797424316,
"kl": 0.151123046875,
"learning_rate": 2.888369423266629e-06,
"loss": 0.0015,
"reward": 0.4595029503107071,
"reward_std": 0.19635827839374542,
"rewards/code_reward": 0.36218152195215225,
"rewards/format_reward": 0.9732143431901932,
"step": 248
},
{
"completion_length": 719.0134429931641,
"epoch": 0.5286624203821656,
"grad_norm": 0.21979840099811554,
"kl": 0.14111328125,
"learning_rate": 2.8738198091276712e-06,
"loss": 0.0014,
"reward": 0.36629121005535126,
"reward_std": 0.21069011464715004,
"rewards/code_reward": 0.2694162093102932,
"rewards/format_reward": 0.9687500298023224,
"step": 249
},
{
"completion_length": 745.263427734375,
"epoch": 0.5307855626326964,
"grad_norm": 0.8504329323768616,
"kl": 0.15234375,
"learning_rate": 2.859264999757509e-06,
"loss": 0.0016,
"reward": 0.37740468978881836,
"reward_std": 0.20382403209805489,
"rewards/code_reward": 0.2811993137001991,
"rewards/format_reward": 0.9620536267757416,
"step": 250
},
{
"completion_length": 723.1562957763672,
"epoch": 0.5329087048832272,
"grad_norm": 0.27034398913383484,
"kl": 0.1591796875,
"learning_rate": 2.8447056058467928e-06,
"loss": 0.0016,
"reward": 0.48566606640815735,
"reward_std": 0.21075040474534035,
"rewards/code_reward": 0.38789819926023483,
"rewards/format_reward": 0.9776786118745804,
"step": 251
},
{
"completion_length": 734.6986846923828,
"epoch": 0.535031847133758,
"grad_norm": 0.4998323917388916,
"kl": 0.145751953125,
"learning_rate": 2.830142238278531e-06,
"loss": 0.0015,
"reward": 0.3668500781059265,
"reward_std": 0.1973743811249733,
"rewards/code_reward": 0.2690822184085846,
"rewards/format_reward": 0.9776786118745804,
"step": 252
},
{
"completion_length": 722.1719207763672,
"epoch": 0.5371549893842887,
"grad_norm": 0.7386496663093567,
"kl": 0.16943359375,
"learning_rate": 2.81557550810246e-06,
"loss": 0.0017,
"reward": 0.5175603851675987,
"reward_std": 0.23075248673558235,
"rewards/code_reward": 0.41889964044094086,
"rewards/format_reward": 0.9866071790456772,
"step": 253
},
{
"completion_length": 731.4286041259766,
"epoch": 0.5392781316348195,
"grad_norm": 2.323516368865967,
"kl": 0.185791015625,
"learning_rate": 2.8010060265094026e-06,
"loss": 0.0019,
"reward": 0.4158123657107353,
"reward_std": 0.2362896017730236,
"rewards/code_reward": 0.3180444836616516,
"rewards/format_reward": 0.9776786267757416,
"step": 254
},
{
"completion_length": 714.1004791259766,
"epoch": 0.5414012738853503,
"grad_norm": 0.22996105253696442,
"kl": 0.193115234375,
"learning_rate": 2.786434404805629e-06,
"loss": 0.002,
"reward": 0.43036870658397675,
"reward_std": 0.17873099818825722,
"rewards/code_reward": 0.3323776051402092,
"rewards/format_reward": 0.979910746216774,
"step": 255
},
{
"completion_length": 755.794677734375,
"epoch": 0.5435244161358811,
"grad_norm": 0.5349477529525757,
"kl": 0.21728515625,
"learning_rate": 2.771861254387199e-06,
"loss": 0.0022,
"reward": 0.3905658796429634,
"reward_std": 0.24914883077144623,
"rewards/code_reward": 0.2939140759408474,
"rewards/format_reward": 0.96651791036129,
"step": 256
},
{
"completion_length": 736.7969207763672,
"epoch": 0.5456475583864119,
"grad_norm": 0.5384594202041626,
"kl": 0.44921875,
"learning_rate": 2.7572871867143204e-06,
"loss": 0.0045,
"reward": 0.4809773936867714,
"reward_std": 0.24490142613649368,
"rewards/code_reward": 0.3832095377147198,
"rewards/format_reward": 0.9776785969734192,
"step": 257
},
{
"completion_length": 779.904052734375,
"epoch": 0.5477707006369427,
"grad_norm": 0.3539630174636841,
"kl": 0.46142578125,
"learning_rate": 2.742712813285681e-06,
"loss": 0.0046,
"reward": 0.4002307578921318,
"reward_std": 0.26231593638658524,
"rewards/code_reward": 0.30447180569171906,
"rewards/format_reward": 0.9575893431901932,
"step": 258
},
{
"completion_length": 688.8013610839844,
"epoch": 0.5498938428874734,
"grad_norm": 0.3760126531124115,
"kl": 0.270263671875,
"learning_rate": 2.7281387456128017e-06,
"loss": 0.0027,
"reward": 0.5040838867425919,
"reward_std": 0.23536711558699608,
"rewards/code_reward": 0.40519992262125015,
"rewards/format_reward": 0.988839328289032,
"step": 259
},
{
"completion_length": 763.7924499511719,
"epoch": 0.5520169851380042,
"grad_norm": 0.28506824374198914,
"kl": 0.37060546875,
"learning_rate": 2.7135655951943716e-06,
"loss": 0.0037,
"reward": 0.4464469403028488,
"reward_std": 0.23727866262197495,
"rewards/code_reward": 0.3491254858672619,
"rewards/format_reward": 0.9732143133878708,
"step": 260
},
{
"completion_length": 735.7143096923828,
"epoch": 0.554140127388535,
"grad_norm": 0.5788131952285767,
"kl": 0.35546875,
"learning_rate": 2.698993973490598e-06,
"loss": 0.0036,
"reward": 0.5397853627800941,
"reward_std": 0.2762787565588951,
"rewards/code_reward": 0.4424639120697975,
"rewards/format_reward": 0.9732143133878708,
"step": 261
},
{
"completion_length": 770.4129791259766,
"epoch": 0.5562632696390658,
"grad_norm": 0.5763887166976929,
"kl": 0.4140625,
"learning_rate": 2.6844244918975416e-06,
"loss": 0.0041,
"reward": 0.4332207143306732,
"reward_std": 0.21580959856510162,
"rewards/code_reward": 0.3361224830150604,
"rewards/format_reward": 0.9709821939468384,
"step": 262
},
{
"completion_length": 756.357177734375,
"epoch": 0.5583864118895966,
"grad_norm": 0.23940207064151764,
"kl": 0.3953857421875,
"learning_rate": 2.66985776172147e-06,
"loss": 0.004,
"reward": 0.4067609831690788,
"reward_std": 0.15922481939196587,
"rewards/code_reward": 0.3078770413994789,
"rewards/format_reward": 0.9888393133878708,
"step": 263
},
{
"completion_length": 797.3772735595703,
"epoch": 0.5605095541401274,
"grad_norm": 1.1956448554992676,
"kl": 0.394775390625,
"learning_rate": 2.6552943941532088e-06,
"loss": 0.004,
"reward": 0.35336220264434814,
"reward_std": 0.21252319402992725,
"rewards/code_reward": 0.25447824597358704,
"rewards/format_reward": 0.988839328289032,
"step": 264
},
{
"completion_length": 816.9330749511719,
"epoch": 0.5626326963906582,
"grad_norm": 0.3272117078304291,
"kl": 0.33447265625,
"learning_rate": 2.6407350002424927e-06,
"loss": 0.0034,
"reward": 0.3648254945874214,
"reward_std": 0.19711337611079216,
"rewards/code_reward": 0.2675040401518345,
"rewards/format_reward": 0.973214328289032,
"step": 265
},
{
"completion_length": 790.0937805175781,
"epoch": 0.564755838641189,
"grad_norm": 0.3350137174129486,
"kl": 0.217529296875,
"learning_rate": 2.626180190872329e-06,
"loss": 0.0022,
"reward": 0.4639175459742546,
"reward_std": 0.19284814596176147,
"rewards/code_reward": 0.36592647433280945,
"rewards/format_reward": 0.9799107611179352,
"step": 266
},
{
"completion_length": 785.247802734375,
"epoch": 0.5668789808917197,
"grad_norm": 0.2253342717885971,
"kl": 0.1259765625,
"learning_rate": 2.611630576733372e-06,
"loss": 0.0013,
"reward": 0.42001737654209137,
"reward_std": 0.24298213049769402,
"rewards/code_reward": 0.32180308550596237,
"rewards/format_reward": 0.9821428805589676,
"step": 267
},
{
"completion_length": 874.1495971679688,
"epoch": 0.5690021231422505,
"grad_norm": 1.105658769607544,
"kl": 0.2879638671875,
"learning_rate": 2.5970867682982885e-06,
"loss": 0.0029,
"reward": 0.4009394347667694,
"reward_std": 0.2002662494778633,
"rewards/code_reward": 0.3031715527176857,
"rewards/format_reward": 0.9776786118745804,
"step": 268
},
{
"completion_length": 821.7812805175781,
"epoch": 0.5711252653927813,
"grad_norm": 0.39032891392707825,
"kl": 0.2081298828125,
"learning_rate": 2.582549375796154e-06,
"loss": 0.0021,
"reward": 0.4019026607275009,
"reward_std": 0.21826408058404922,
"rewards/code_reward": 0.3036883734166622,
"rewards/format_reward": 0.9821428805589676,
"step": 269
},
{
"completion_length": 804.6183471679688,
"epoch": 0.5732484076433121,
"grad_norm": 0.25958776473999023,
"kl": 0.179931640625,
"learning_rate": 2.568019009186841e-06,
"loss": 0.0019,
"reward": 0.4916309267282486,
"reward_std": 0.19469109177589417,
"rewards/code_reward": 0.3934166729450226,
"rewards/format_reward": 0.9821428954601288,
"step": 270
},
{
"completion_length": 825.3370971679688,
"epoch": 0.5753715498938429,
"grad_norm": 0.22188299894332886,
"kl": 0.1358642578125,
"learning_rate": 2.5534962781354317e-06,
"loss": 0.0014,
"reward": 0.4202270358800888,
"reward_std": 0.24190283194184303,
"rewards/code_reward": 0.32223593071103096,
"rewards/format_reward": 0.9799107611179352,
"step": 271
},
{
"completion_length": 780.4866485595703,
"epoch": 0.5774946921443737,
"grad_norm": 0.2580115497112274,
"kl": 0.1597900390625,
"learning_rate": 2.538981791986634e-06,
"loss": 0.0016,
"reward": 0.38698963820934296,
"reward_std": 0.22697532176971436,
"rewards/code_reward": 0.28877533972263336,
"rewards/format_reward": 0.98214291036129,
"step": 272
},
{
"completion_length": 823.0402069091797,
"epoch": 0.5796178343949044,
"grad_norm": 0.2385285347700119,
"kl": 0.141357421875,
"learning_rate": 2.524476159739218e-06,
"loss": 0.0015,
"reward": 0.43764442950487137,
"reward_std": 0.22844265773892403,
"rewards/code_reward": 0.34032295644283295,
"rewards/format_reward": 0.9732143431901932,
"step": 273
},
{
"completion_length": 782.2277221679688,
"epoch": 0.5817409766454352,
"grad_norm": 0.7892447710037231,
"kl": 0.1402587890625,
"learning_rate": 2.5099799900204607e-06,
"loss": 0.0014,
"reward": 0.47495051473379135,
"reward_std": 0.24570094048976898,
"rewards/code_reward": 0.37606657296419144,
"rewards/format_reward": 0.988839328289032,
"step": 274
},
{
"completion_length": 790.029052734375,
"epoch": 0.583864118895966,
"grad_norm": 1.390781044960022,
"kl": 0.1494140625,
"learning_rate": 2.4954938910606108e-06,
"loss": 0.0015,
"reward": 0.41709961369633675,
"reward_std": 0.22452056966722012,
"rewards/code_reward": 0.31910853274166584,
"rewards/format_reward": 0.979910746216774,
"step": 275
},
{
"completion_length": 766.544677734375,
"epoch": 0.5859872611464968,
"grad_norm": 0.3231986463069916,
"kl": 0.125732421875,
"learning_rate": 2.481018470667368e-06,
"loss": 0.0013,
"reward": 0.5159066766500473,
"reward_std": 0.2495804950594902,
"rewards/code_reward": 0.4188084527850151,
"rewards/format_reward": 0.9709821939468384,
"step": 276
},
{
"completion_length": 816.5491485595703,
"epoch": 0.5881104033970276,
"grad_norm": 0.3522323966026306,
"kl": 0.1474609375,
"learning_rate": 2.4665543362003802e-06,
"loss": 0.0016,
"reward": 0.5210660025477409,
"reward_std": 0.19517110101878643,
"rewards/code_reward": 0.42352132126688957,
"rewards/format_reward": 0.9754464775323868,
"step": 277
},
{
"completion_length": 815.3638610839844,
"epoch": 0.5902335456475584,
"grad_norm": 0.36454498767852783,
"kl": 0.156005859375,
"learning_rate": 2.4521020945457615e-06,
"loss": 0.0016,
"reward": 0.41319186985492706,
"reward_std": 0.21446501463651657,
"rewards/code_reward": 0.3152007535099983,
"rewards/format_reward": 0.979910746216774,
"step": 278
},
{
"completion_length": 830.2031707763672,
"epoch": 0.5923566878980892,
"grad_norm": 0.24598261713981628,
"kl": 0.182373046875,
"learning_rate": 2.4376623520906255e-06,
"loss": 0.0019,
"reward": 0.48784376308321953,
"reward_std": 0.25769151002168655,
"rewards/code_reward": 0.39141515642404556,
"rewards/format_reward": 0.9642857313156128,
"step": 279
},
{
"completion_length": 796.6272583007812,
"epoch": 0.5944798301486199,
"grad_norm": 0.24986568093299866,
"kl": 0.154541015625,
"learning_rate": 2.4232357146976478e-06,
"loss": 0.0016,
"reward": 0.3782888986170292,
"reward_std": 0.18982039019465446,
"rewards/code_reward": 0.28029780834913254,
"rewards/format_reward": 0.9799107611179352,
"step": 280
},
{
"completion_length": 789.513427734375,
"epoch": 0.5966029723991507,
"grad_norm": 0.28405284881591797,
"kl": 0.147705078125,
"learning_rate": 2.408822787679637e-06,
"loss": 0.0016,
"reward": 0.5121422186493874,
"reward_std": 0.2287510558962822,
"rewards/code_reward": 0.413704726845026,
"rewards/format_reward": 0.9843750298023224,
"step": 281
},
{
"completion_length": 808.3995819091797,
"epoch": 0.5987261146496815,
"grad_norm": 0.5303727984428406,
"kl": 0.144775390625,
"learning_rate": 2.3944241757741475e-06,
"loss": 0.0016,
"reward": 0.5508048385381699,
"reward_std": 0.18633075430989265,
"rewards/code_reward": 0.4516976475715637,
"rewards/format_reward": 0.9910714626312256,
"step": 282
},
{
"completion_length": 827.4531707763672,
"epoch": 0.6008492569002123,
"grad_norm": 0.2308008074760437,
"kl": 0.13037109375,
"learning_rate": 2.380040483118097e-06,
"loss": 0.0013,
"reward": 0.3481413722038269,
"reward_std": 0.20516538247466087,
"rewards/code_reward": 0.250150291249156,
"rewards/format_reward": 0.9799107313156128,
"step": 283
},
{
"completion_length": 786.8147583007812,
"epoch": 0.6029723991507431,
"grad_norm": 0.21248966455459595,
"kl": 0.136962890625,
"learning_rate": 2.365672313222419e-06,
"loss": 0.0014,
"reward": 0.4797332286834717,
"reward_std": 0.2214011587202549,
"rewards/code_reward": 0.3815189450979233,
"rewards/format_reward": 0.98214291036129,
"step": 284
},
{
"completion_length": 797.7723693847656,
"epoch": 0.6050955414012739,
"grad_norm": 0.2716215252876282,
"kl": 0.1376953125,
"learning_rate": 2.351320268946749e-06,
"loss": 0.0014,
"reward": 0.4847887381911278,
"reward_std": 0.26064618304371834,
"rewards/code_reward": 0.3861280009150505,
"rewards/format_reward": 0.9866071790456772,
"step": 285
},
{
"completion_length": 790.9308319091797,
"epoch": 0.6072186836518046,
"grad_norm": 0.22615957260131836,
"kl": 0.1334228515625,
"learning_rate": 2.336984952474119e-06,
"loss": 0.0014,
"reward": 0.4451970234513283,
"reward_std": 0.21199724823236465,
"rewards/code_reward": 0.34564343094825745,
"rewards/format_reward": 0.9955357313156128,
"step": 286
},
{
"completion_length": 774.716552734375,
"epoch": 0.6093418259023354,
"grad_norm": 0.24061597883701324,
"kl": 0.17236328125,
"learning_rate": 2.322666965285697e-06,
"loss": 0.0018,
"reward": 0.4680435359477997,
"reward_std": 0.2062854841351509,
"rewards/code_reward": 0.3700524792075157,
"rewards/format_reward": 0.9799107313156128,
"step": 287
},
{
"completion_length": 785.3437957763672,
"epoch": 0.6114649681528662,
"grad_norm": 0.2794930636882782,
"kl": 0.143798828125,
"learning_rate": 2.3083669081355507e-06,
"loss": 0.0015,
"reward": 0.41017772257328033,
"reward_std": 0.1858556531369686,
"rewards/code_reward": 0.31263307854533195,
"rewards/format_reward": 0.9754464626312256,
"step": 288
},
{
"completion_length": 768.2902221679688,
"epoch": 0.613588110403397,
"grad_norm": 0.2621839940547943,
"kl": 0.138427734375,
"learning_rate": 2.2940853810254377e-06,
"loss": 0.0014,
"reward": 0.4905528202652931,
"reward_std": 0.25080636143684387,
"rewards/code_reward": 0.39144565910100937,
"rewards/format_reward": 0.9910714477300644,
"step": 289
},
{
"completion_length": 788.9687805175781,
"epoch": 0.6157112526539278,
"grad_norm": 0.25945180654525757,
"kl": 0.1494140625,
"learning_rate": 2.2798229831796313e-06,
"loss": 0.0015,
"reward": 0.43350084125995636,
"reward_std": 0.1987269874662161,
"rewards/code_reward": 0.3370722308754921,
"rewards/format_reward": 0.9642857611179352,
"step": 290
},
{
"completion_length": 762.8616485595703,
"epoch": 0.6178343949044586,
"grad_norm": 0.28753146529197693,
"kl": 0.146484375,
"learning_rate": 2.2655803130197816e-06,
"loss": 0.0015,
"reward": 0.45754577219486237,
"reward_std": 0.20388228073716164,
"rewards/code_reward": 0.35977791622281075,
"rewards/format_reward": 0.9776786118745804,
"step": 291
},
{
"completion_length": 755.8594207763672,
"epoch": 0.6199575371549894,
"grad_norm": 0.2792350947856903,
"kl": 0.14794921875,
"learning_rate": 2.2513579681398034e-06,
"loss": 0.0016,
"reward": 0.4282514527440071,
"reward_std": 0.16725242137908936,
"rewards/code_reward": 0.32959069684147835,
"rewards/format_reward": 0.986607164144516,
"step": 292
},
{
"completion_length": 744.6138763427734,
"epoch": 0.6220806794055201,
"grad_norm": 0.2520155608654022,
"kl": 0.13720703125,
"learning_rate": 2.237156545280803e-06,
"loss": 0.0014,
"reward": 0.44700442999601364,
"reward_std": 0.21429810300469398,
"rewards/code_reward": 0.34812046587467194,
"rewards/format_reward": 0.9888393133878708,
"step": 293
},
{
"completion_length": 771.6897888183594,
"epoch": 0.6242038216560509,
"grad_norm": 0.41944995522499084,
"kl": 0.22412109375,
"learning_rate": 2.2229766403060403e-06,
"loss": 0.0023,
"reward": 0.4441903755068779,
"reward_std": 0.19182176142930984,
"rewards/code_reward": 0.3459760546684265,
"rewards/format_reward": 0.9821428954601288,
"step": 294
},
{
"completion_length": 778.763427734375,
"epoch": 0.6263269639065817,
"grad_norm": 0.2801876962184906,
"kl": 0.137939453125,
"learning_rate": 2.2088188481759305e-06,
"loss": 0.0014,
"reward": 0.46992357820272446,
"reward_std": 0.19111047685146332,
"rewards/code_reward": 0.37103963643312454,
"rewards/format_reward": 0.988839328289032,
"step": 295
},
{
"completion_length": 778.0893096923828,
"epoch": 0.6284501061571125,
"grad_norm": 0.21918566524982452,
"kl": 0.131103515625,
"learning_rate": 2.194683762923073e-06,
"loss": 0.0013,
"reward": 0.4984453171491623,
"reward_std": 0.22232287377119064,
"rewards/code_reward": 0.40045420452952385,
"rewards/format_reward": 0.979910746216774,
"step": 296
},
{
"completion_length": 740.3951110839844,
"epoch": 0.6305732484076433,
"grad_norm": 0.31050121784210205,
"kl": 0.1572265625,
"learning_rate": 2.1805719776273387e-06,
"loss": 0.0016,
"reward": 0.4212986081838608,
"reward_std": 0.1724853478372097,
"rewards/code_reward": 0.321968249976635,
"rewards/format_reward": 0.9933035969734192,
"step": 297
},
{
"completion_length": 682.138427734375,
"epoch": 0.6326963906581741,
"grad_norm": 0.24185748398303986,
"kl": 0.17529296875,
"learning_rate": 2.166484084390974e-06,
"loss": 0.0019,
"reward": 0.5747622847557068,
"reward_std": 0.18613022193312645,
"rewards/code_reward": 0.475878331810236,
"rewards/format_reward": 0.9888393133878708,
"step": 298
},
{
"completion_length": 716.6518096923828,
"epoch": 0.6348195329087049,
"grad_norm": 0.6314132213592529,
"kl": 0.166015625,
"learning_rate": 2.1524206743137636e-06,
"loss": 0.0017,
"reward": 0.36886315792798996,
"reward_std": 0.17360183410346508,
"rewards/code_reward": 0.2708720788359642,
"rewards/format_reward": 0.9799107611179352,
"step": 299
},
{
"completion_length": 737.8303833007812,
"epoch": 0.6369426751592356,
"grad_norm": 0.2968922555446625,
"kl": 0.19287109375,
"learning_rate": 2.1383823374682287e-06,
"loss": 0.0019,
"reward": 0.39945459365844727,
"reward_std": 0.20941082388162613,
"rewards/code_reward": 0.3014635145664215,
"rewards/format_reward": 0.9799107611179352,
"step": 300
},
{
"completion_length": 718.5714721679688,
"epoch": 0.6390658174097664,
"grad_norm": 19.51397132873535,
"kl": 0.275146484375,
"learning_rate": 2.124369662874868e-06,
"loss": 0.0029,
"reward": 0.503417618572712,
"reward_std": 0.15935716964304447,
"rewards/code_reward": 0.40631940215826035,
"rewards/format_reward": 0.9709821790456772,
"step": 301
},
{
"completion_length": 704.3393096923828,
"epoch": 0.6411889596602972,
"grad_norm": 0.35022857785224915,
"kl": 0.14697265625,
"learning_rate": 2.110383238477441e-06,
"loss": 0.0015,
"reward": 0.5569600984454155,
"reward_std": 0.20704489946365356,
"rewards/code_reward": 0.45785292237997055,
"rewards/format_reward": 0.9910714477300644,
"step": 302
},
{
"completion_length": 702.6986846923828,
"epoch": 0.643312101910828,
"grad_norm": 0.17607638239860535,
"kl": 0.13916015625,
"learning_rate": 2.096423651118305e-06,
"loss": 0.0014,
"reward": 0.2535444311797619,
"reward_std": 0.11278286523884162,
"rewards/code_reward": 0.15466050058603287,
"rewards/format_reward": 0.9888393133878708,
"step": 303
},
{
"completion_length": 701.2857360839844,
"epoch": 0.6454352441613588,
"grad_norm": 0.6241003274917603,
"kl": 0.1826171875,
"learning_rate": 2.082491486513788e-06,
"loss": 0.0019,
"reward": 0.5550656765699387,
"reward_std": 0.21512125991284847,
"rewards/code_reward": 0.45618174970149994,
"rewards/format_reward": 0.988839328289032,
"step": 304
},
{
"completion_length": 709.0379638671875,
"epoch": 0.6475583864118896,
"grad_norm": 0.696461021900177,
"kl": 0.1435546875,
"learning_rate": 2.0685873292296116e-06,
"loss": 0.0015,
"reward": 0.3796486109495163,
"reward_std": 0.15390164637938142,
"rewards/code_reward": 0.28121111169457436,
"rewards/format_reward": 0.9843750447034836,
"step": 305
},
{
"completion_length": 682.716552734375,
"epoch": 0.6496815286624203,
"grad_norm": 0.26720672845840454,
"kl": 0.162109375,
"learning_rate": 2.054711762656369e-06,
"loss": 0.0016,
"reward": 0.37838516384363174,
"reward_std": 0.16313385590910912,
"rewards/code_reward": 0.28061728924512863,
"rewards/format_reward": 0.9776786118745804,
"step": 306
},
{
"completion_length": 666.8080596923828,
"epoch": 0.6518046709129511,
"grad_norm": 0.8882589936256409,
"kl": 0.16259765625,
"learning_rate": 2.040865368985044e-06,
"loss": 0.0017,
"reward": 0.4301592782139778,
"reward_std": 0.20042868331074715,
"rewards/code_reward": 0.33105212450027466,
"rewards/format_reward": 0.9910714626312256,
"step": 307
},
{
"completion_length": 681.9129791259766,
"epoch": 0.6539278131634819,
"grad_norm": 0.23706179857254028,
"kl": 0.18310546875,
"learning_rate": 2.027048729182583e-06,
"loss": 0.0019,
"reward": 0.4861885607242584,
"reward_std": 0.16966554708778858,
"rewards/code_reward": 0.3881974592804909,
"rewards/format_reward": 0.9799107611179352,
"step": 308
},
{
"completion_length": 693.8147583007812,
"epoch": 0.6560509554140127,
"grad_norm": 0.5197703242301941,
"kl": 0.228271484375,
"learning_rate": 2.0132624229675205e-06,
"loss": 0.0024,
"reward": 0.511215090751648,
"reward_std": 0.18619069457054138,
"rewards/code_reward": 0.4127775654196739,
"rewards/format_reward": 0.9843750596046448,
"step": 309
},
{
"completion_length": 714.9777221679688,
"epoch": 0.6581740976645435,
"grad_norm": 0.24638721346855164,
"kl": 0.189453125,
"learning_rate": 1.9995070287856546e-06,
"loss": 0.002,
"reward": 0.5180679038167,
"reward_std": 0.21345077827572823,
"rewards/code_reward": 0.41963040083646774,
"rewards/format_reward": 0.9843750447034836,
"step": 310
},
{
"completion_length": 708.2723388671875,
"epoch": 0.6602972399150743,
"grad_norm": 0.422715961933136,
"kl": 0.18701171875,
"learning_rate": 1.985783123785774e-06,
"loss": 0.0019,
"reward": 0.5620292499661446,
"reward_std": 0.20659737288951874,
"rewards/code_reward": 0.46314531564712524,
"rewards/format_reward": 0.988839328289032,
"step": 311
},
{
"completion_length": 668.2589569091797,
"epoch": 0.6624203821656051,
"grad_norm": 0.6652376055717468,
"kl": 0.240478515625,
"learning_rate": 1.9720912837954486e-06,
"loss": 0.0025,
"reward": 0.4389989897608757,
"reward_std": 0.20384247601032257,
"rewards/code_reward": 0.33989182114601135,
"rewards/format_reward": 0.9910714626312256,
"step": 312
},
{
"completion_length": 671.5424346923828,
"epoch": 0.6645435244161358,
"grad_norm": 0.898223876953125,
"kl": 0.25927734375,
"learning_rate": 1.958432083296862e-06,
"loss": 0.0026,
"reward": 0.36031387001276016,
"reward_std": 0.2003210037946701,
"rewards/code_reward": 0.26254600286483765,
"rewards/format_reward": 0.9776786118745804,
"step": 313
},
{
"completion_length": 676.5602874755859,
"epoch": 0.6666666666666666,
"grad_norm": 0.7889689207077026,
"kl": 0.2135009765625,
"learning_rate": 1.9448060954027093e-06,
"loss": 0.0022,
"reward": 0.5204020366072655,
"reward_std": 0.16625045239925385,
"rewards/code_reward": 0.4212948679924011,
"rewards/format_reward": 0.9910714477300644,
"step": 314
},
{
"completion_length": 684.9866333007812,
"epoch": 0.6687898089171974,
"grad_norm": 1.3564072847366333,
"kl": 0.40185546875,
"learning_rate": 1.931213891832153e-06,
"loss": 0.0041,
"reward": 0.526521772146225,
"reward_std": 0.2212766855955124,
"rewards/code_reward": 0.4278610572218895,
"rewards/format_reward": 0.986607164144516,
"step": 315
},
{
"completion_length": 652.8102874755859,
"epoch": 0.6709129511677282,
"grad_norm": 0.24422591924667358,
"kl": 0.147216796875,
"learning_rate": 1.9176560428868336e-06,
"loss": 0.0015,
"reward": 0.3931754156947136,
"reward_std": 0.1695394441485405,
"rewards/code_reward": 0.29473789036273956,
"rewards/format_reward": 0.9843750447034836,
"step": 316
},
{
"completion_length": 687.8214569091797,
"epoch": 0.673036093418259,
"grad_norm": 0.4171687960624695,
"kl": 0.236328125,
"learning_rate": 1.9041331174269373e-06,
"loss": 0.0024,
"reward": 0.47731664031744003,
"reward_std": 0.20429091900587082,
"rewards/code_reward": 0.378879152238369,
"rewards/format_reward": 0.9843750298023224,
"step": 317
},
{
"completion_length": 682.6741485595703,
"epoch": 0.6751592356687898,
"grad_norm": 0.9241800308227539,
"kl": 0.36083984375,
"learning_rate": 1.8906456828473341e-06,
"loss": 0.0036,
"reward": 0.5124014094471931,
"reward_std": 0.21064380928874016,
"rewards/code_reward": 0.4132942706346512,
"rewards/format_reward": 0.9910714626312256,
"step": 318
},
{
"completion_length": 684.0759124755859,
"epoch": 0.6772823779193206,
"grad_norm": 0.24995659291744232,
"kl": 0.14794921875,
"learning_rate": 1.8771943050537656e-06,
"loss": 0.0016,
"reward": 0.592289388179779,
"reward_std": 0.2126442939043045,
"rewards/code_reward": 0.4942983016371727,
"rewards/format_reward": 0.9799107611179352,
"step": 319
},
{
"completion_length": 719.6786041259766,
"epoch": 0.6794055201698513,
"grad_norm": 0.24401088058948517,
"kl": 0.1395263671875,
"learning_rate": 1.8637795484391046e-06,
"loss": 0.0014,
"reward": 0.4689144790172577,
"reward_std": 0.25591350346803665,
"rewards/code_reward": 0.3711466044187546,
"rewards/format_reward": 0.9776786118745804,
"step": 320
},
{
"completion_length": 655.4844055175781,
"epoch": 0.6815286624203821,
"grad_norm": 0.3457026779651642,
"kl": 0.50341796875,
"learning_rate": 1.8504019758596698e-06,
"loss": 0.0051,
"reward": 0.5521439760923386,
"reward_std": 0.2452612817287445,
"rewards/code_reward": 0.45326002687215805,
"rewards/format_reward": 0.9888393133878708,
"step": 321
},
{
"completion_length": 714.7143096923828,
"epoch": 0.6836518046709129,
"grad_norm": 0.3326283395290375,
"kl": 0.1953125,
"learning_rate": 1.8370621486116163e-06,
"loss": 0.0021,
"reward": 0.5532227605581284,
"reward_std": 0.18401411548256874,
"rewards/code_reward": 0.4552316591143608,
"rewards/format_reward": 0.9799107760190964,
"step": 322
},
{
"completion_length": 677.7634124755859,
"epoch": 0.6857749469214437,
"grad_norm": 0.3285404145717621,
"kl": 0.23876953125,
"learning_rate": 1.823760626407377e-06,
"loss": 0.0025,
"reward": 0.4828302264213562,
"reward_std": 0.1928608939051628,
"rewards/code_reward": 0.384615920484066,
"rewards/format_reward": 0.9821429252624512,
"step": 323
},
{
"completion_length": 699.5982666015625,
"epoch": 0.6878980891719745,
"grad_norm": 0.34025460481643677,
"kl": 0.224365234375,
"learning_rate": 1.8104979673521838e-06,
"loss": 0.0023,
"reward": 0.42327145487070084,
"reward_std": 0.15393321216106415,
"rewards/code_reward": 0.32505714148283005,
"rewards/format_reward": 0.98214291036129,
"step": 324
},
{
"completion_length": 650.997802734375,
"epoch": 0.6900212314225053,
"grad_norm": 0.3025732636451721,
"kl": 0.24853515625,
"learning_rate": 1.7972747279206482e-06,
"loss": 0.0025,
"reward": 0.37195510417222977,
"reward_std": 0.19180476292967796,
"rewards/code_reward": 0.27418723329901695,
"rewards/format_reward": 0.9776786267757416,
"step": 325
},
{
"completion_length": 692.4464569091797,
"epoch": 0.692144373673036,
"grad_norm": 0.2389409989118576,
"kl": 0.148681640625,
"learning_rate": 1.7840914629334122e-06,
"loss": 0.0016,
"reward": 0.5394042208790779,
"reward_std": 0.22496159374713898,
"rewards/code_reward": 0.44185957312583923,
"rewards/format_reward": 0.9754464775323868,
"step": 326
},
{
"completion_length": 709.0669860839844,
"epoch": 0.6942675159235668,
"grad_norm": 0.28394991159439087,
"kl": 0.194091796875,
"learning_rate": 1.7709487255338731e-06,
"loss": 0.0021,
"reward": 0.4636544920504093,
"reward_std": 0.15700273029506207,
"rewards/code_reward": 0.36633305437862873,
"rewards/format_reward": 0.973214328289032,
"step": 327
},
{
"completion_length": 702.7768249511719,
"epoch": 0.6963906581740976,
"grad_norm": 0.22292962670326233,
"kl": 0.17431640625,
"learning_rate": 1.7578470671649684e-06,
"loss": 0.0019,
"reward": 0.4268321394920349,
"reward_std": 0.1670057326555252,
"rewards/code_reward": 0.32928748056292534,
"rewards/format_reward": 0.9754464775323868,
"step": 328
},
{
"completion_length": 694.5937805175781,
"epoch": 0.6985138004246284,
"grad_norm": 0.782927393913269,
"kl": 0.3328857421875,
"learning_rate": 1.744787037546045e-06,
"loss": 0.0034,
"reward": 0.46113383024930954,
"reward_std": 0.18688062392175198,
"rewards/code_reward": 0.3626963049173355,
"rewards/format_reward": 0.9843750298023224,
"step": 329
},
{
"completion_length": 706.1986999511719,
"epoch": 0.7006369426751592,
"grad_norm": 0.41430673003196716,
"kl": 0.1827392578125,
"learning_rate": 1.731769184649788e-06,
"loss": 0.0019,
"reward": 0.5658792853355408,
"reward_std": 0.23742860183119774,
"rewards/code_reward": 0.4683346152305603,
"rewards/format_reward": 0.9754464775323868,
"step": 330
},
{
"completion_length": 694.9576416015625,
"epoch": 0.70276008492569,
"grad_norm": 0.6622937917709351,
"kl": 0.214111328125,
"learning_rate": 1.7187940546792325e-06,
"loss": 0.0022,
"reward": 0.4137548431754112,
"reward_std": 0.1334713213145733,
"rewards/code_reward": 0.3155405670404434,
"rewards/format_reward": 0.9821428954601288,
"step": 331
},
{
"completion_length": 716.8393249511719,
"epoch": 0.7048832271762208,
"grad_norm": 0.22396574914455414,
"kl": 0.2607421875,
"learning_rate": 1.7058621920448465e-06,
"loss": 0.0027,
"reward": 0.4444565996527672,
"reward_std": 0.18423740193247795,
"rewards/code_reward": 0.34646550565958023,
"rewards/format_reward": 0.9799107611179352,
"step": 332
},
{
"completion_length": 703.0937805175781,
"epoch": 0.7070063694267515,
"grad_norm": 0.2483583688735962,
"kl": 0.160888671875,
"learning_rate": 1.6929741393416855e-06,
"loss": 0.0016,
"reward": 0.47170016914606094,
"reward_std": 0.18039512634277344,
"rewards/code_reward": 0.37393229454755783,
"rewards/format_reward": 0.9776786267757416,
"step": 333
},
{
"completion_length": 755.0312805175781,
"epoch": 0.7091295116772823,
"grad_norm": 0.4338986873626709,
"kl": 0.357177734375,
"learning_rate": 1.6801304373266286e-06,
"loss": 0.0036,
"reward": 0.4291260167956352,
"reward_std": 0.15267430432140827,
"rewards/code_reward": 0.3318046070635319,
"rewards/format_reward": 0.9732143133878708,
"step": 334
},
{
"completion_length": 767.3214569091797,
"epoch": 0.7112526539278131,
"grad_norm": 0.21925950050354004,
"kl": 0.137451171875,
"learning_rate": 1.667331624895689e-06,
"loss": 0.0014,
"reward": 0.4992447942495346,
"reward_std": 0.21635426208376884,
"rewards/code_reward": 0.4014769196510315,
"rewards/format_reward": 0.9776786118745804,
"step": 335
},
{
"completion_length": 750.1696929931641,
"epoch": 0.7133757961783439,
"grad_norm": 0.30118319392204285,
"kl": 0.359619140625,
"learning_rate": 1.6545782390614037e-06,
"loss": 0.0037,
"reward": 0.4922778084874153,
"reward_std": 0.1726557295769453,
"rewards/code_reward": 0.39317065104842186,
"rewards/format_reward": 0.9910714626312256,
"step": 336
},
{
"completion_length": 718.6339721679688,
"epoch": 0.7154989384288747,
"grad_norm": 0.41911348700523376,
"kl": 0.317626953125,
"learning_rate": 1.6418708149302992e-06,
"loss": 0.0033,
"reward": 0.44379642605781555,
"reward_std": 0.19296832010149956,
"rewards/code_reward": 0.3451356738805771,
"rewards/format_reward": 0.986607164144516,
"step": 337
},
{
"completion_length": 694.1138610839844,
"epoch": 0.7176220806794055,
"grad_norm": 0.7091541886329651,
"kl": 0.27783203125,
"learning_rate": 1.6292098856804423e-06,
"loss": 0.0028,
"reward": 0.4443873465061188,
"reward_std": 0.19508511200547218,
"rewards/code_reward": 0.3468426913022995,
"rewards/format_reward": 0.9754464775323868,
"step": 338
},
{
"completion_length": 720.607177734375,
"epoch": 0.7197452229299363,
"grad_norm": 0.6043697595596313,
"kl": 0.3173828125,
"learning_rate": 1.6165959825390661e-06,
"loss": 0.0033,
"reward": 0.43542125821113586,
"reward_std": 0.16308805532753468,
"rewards/code_reward": 0.33720696344971657,
"rewards/format_reward": 0.9821428954601288,
"step": 339
},
{
"completion_length": 706.1317291259766,
"epoch": 0.721868365180467,
"grad_norm": 0.2581160068511963,
"kl": 0.2353515625,
"learning_rate": 1.604029634760284e-06,
"loss": 0.0025,
"reward": 0.5382986813783646,
"reward_std": 0.14037772081792355,
"rewards/code_reward": 0.4403075948357582,
"rewards/format_reward": 0.9799107611179352,
"step": 340
},
{
"completion_length": 737.5469055175781,
"epoch": 0.7239915074309978,
"grad_norm": 0.4556562006473541,
"kl": 0.368408203125,
"learning_rate": 1.59151136960288e-06,
"loss": 0.0037,
"reward": 0.538652278482914,
"reward_std": 0.20831965655088425,
"rewards/code_reward": 0.44133080542087555,
"rewards/format_reward": 0.973214328289032,
"step": 341
},
{
"completion_length": 723.9486999511719,
"epoch": 0.7261146496815286,
"grad_norm": 0.2620218098163605,
"kl": 0.159912109375,
"learning_rate": 1.5790417123081903e-06,
"loss": 0.0017,
"reward": 0.45855508744716644,
"reward_std": 0.1777043156325817,
"rewards/code_reward": 0.3605640158057213,
"rewards/format_reward": 0.9799107611179352,
"step": 342
},
{
"completion_length": 686.8236999511719,
"epoch": 0.7282377919320594,
"grad_norm": 0.2753090560436249,
"kl": 0.16455078125,
"learning_rate": 1.5666211860780583e-06,
"loss": 0.0018,
"reward": 0.5850269198417664,
"reward_std": 0.19610749557614326,
"rewards/code_reward": 0.4870358556509018,
"rewards/format_reward": 0.9799107760190964,
"step": 343
},
{
"completion_length": 684.1205749511719,
"epoch": 0.7303609341825902,
"grad_norm": 0.23944684863090515,
"kl": 0.16455078125,
"learning_rate": 1.5542503120528918e-06,
"loss": 0.0017,
"reward": 0.5332599207758904,
"reward_std": 0.2457549162209034,
"rewards/code_reward": 0.43437594920396805,
"rewards/format_reward": 0.9888393133878708,
"step": 344
},
{
"completion_length": 720.3839569091797,
"epoch": 0.732484076433121,
"grad_norm": 0.31666672229766846,
"kl": 0.213134765625,
"learning_rate": 1.5419296092897866e-06,
"loss": 0.0022,
"reward": 0.5879708528518677,
"reward_std": 0.24002529680728912,
"rewards/code_reward": 0.4899797812104225,
"rewards/format_reward": 0.9799107611179352,
"step": 345
},
{
"completion_length": 693.6964569091797,
"epoch": 0.7346072186836518,
"grad_norm": 0.24176108837127686,
"kl": 0.15869140625,
"learning_rate": 1.529659594740755e-06,
"loss": 0.0016,
"reward": 0.4276282340288162,
"reward_std": 0.20496541634202003,
"rewards/code_reward": 0.32896753773093224,
"rewards/format_reward": 0.9866071790456772,
"step": 346
},
{
"completion_length": 704.2902221679688,
"epoch": 0.7367303609341825,
"grad_norm": 0.2568061351776123,
"kl": 0.15771484375,
"learning_rate": 1.5174407832310338e-06,
"loss": 0.0016,
"reward": 0.39445348642766476,
"reward_std": 0.13825338683091104,
"rewards/code_reward": 0.2962391600012779,
"rewards/format_reward": 0.9821428954601288,
"step": 347
},
{
"completion_length": 722.2545013427734,
"epoch": 0.7388535031847133,
"grad_norm": 0.49012815952301025,
"kl": 0.17578125,
"learning_rate": 1.5052736874374815e-06,
"loss": 0.0018,
"reward": 0.488083653151989,
"reward_std": 0.1927042007446289,
"rewards/code_reward": 0.39009255915880203,
"rewards/format_reward": 0.979910746216774,
"step": 348
},
{
"completion_length": 713.3147583007812,
"epoch": 0.7409766454352441,
"grad_norm": 0.6304606795310974,
"kl": 0.29345703125,
"learning_rate": 1.4931588178670695e-06,
"loss": 0.003,
"reward": 0.4815641790628433,
"reward_std": 0.16962832398712635,
"rewards/code_reward": 0.38357311114668846,
"rewards/format_reward": 0.9799107611179352,
"step": 349
},
{
"completion_length": 700.3214569091797,
"epoch": 0.7430997876857749,
"grad_norm": 0.43463101983070374,
"kl": 0.289306640625,
"learning_rate": 1.4810966828354605e-06,
"loss": 0.0029,
"reward": 0.45994506776332855,
"reward_std": 0.1931474320590496,
"rewards/code_reward": 0.36173076555132866,
"rewards/format_reward": 0.98214291036129,
"step": 350
},
{
"completion_length": 685.1384124755859,
"epoch": 0.7452229299363057,
"grad_norm": 0.34815892577171326,
"kl": 0.44140625,
"learning_rate": 1.469087788445684e-06,
"loss": 0.0045,
"reward": 0.5396069064736366,
"reward_std": 0.20336921885609627,
"rewards/code_reward": 0.44250866025686264,
"rewards/format_reward": 0.9709821939468384,
"step": 351
},
{
"completion_length": 698.6071929931641,
"epoch": 0.7473460721868365,
"grad_norm": 0.3489153981208801,
"kl": 0.533447265625,
"learning_rate": 1.4571326385668965e-06,
"loss": 0.0055,
"reward": 0.6215780973434448,
"reward_std": 0.202628992497921,
"rewards/code_reward": 0.5229173377156258,
"rewards/format_reward": 0.9866071790456772,
"step": 352
},
{
"completion_length": 713.6897583007812,
"epoch": 0.7494692144373672,
"grad_norm": 0.2902304232120514,
"kl": 0.160400390625,
"learning_rate": 1.4452317348132434e-06,
"loss": 0.0018,
"reward": 0.43891899287700653,
"reward_std": 0.1397520825266838,
"rewards/code_reward": 0.3393654003739357,
"rewards/format_reward": 0.9955357313156128,
"step": 353
},
{
"completion_length": 706.091552734375,
"epoch": 0.7515923566878981,
"grad_norm": 0.7335183024406433,
"kl": 0.34814453125,
"learning_rate": 1.4333855765228104e-06,
"loss": 0.0037,
"reward": 0.6451611816883087,
"reward_std": 0.20771214738488197,
"rewards/code_reward": 0.5465004742145538,
"rewards/format_reward": 0.9866071939468384,
"step": 354
},
{
"completion_length": 712.1607513427734,
"epoch": 0.7537154989384289,
"grad_norm": 0.7572880387306213,
"kl": 0.3447265625,
"learning_rate": 1.421594660736675e-06,
"loss": 0.0035,
"reward": 0.41940218955278397,
"reward_std": 0.1921430230140686,
"rewards/code_reward": 0.3202950209379196,
"rewards/format_reward": 0.9910714626312256,
"step": 355
},
{
"completion_length": 680.1986999511719,
"epoch": 0.7558386411889597,
"grad_norm": 0.3940925896167755,
"kl": 0.549560546875,
"learning_rate": 1.4098594821780476e-06,
"loss": 0.0056,
"reward": 0.6083894520998001,
"reward_std": 0.1597061362117529,
"rewards/code_reward": 0.5108448341488838,
"rewards/format_reward": 0.9754464626312256,
"step": 356
},
{
"completion_length": 665.372802734375,
"epoch": 0.7579617834394905,
"grad_norm": 0.2566499710083008,
"kl": 0.192138671875,
"learning_rate": 1.3981805332315174e-06,
"loss": 0.002,
"reward": 0.4351358078420162,
"reward_std": 0.1653740406036377,
"rewards/code_reward": 0.3360286522656679,
"rewards/format_reward": 0.9910714626312256,
"step": 357
},
{
"completion_length": 732.7589569091797,
"epoch": 0.7600849256900213,
"grad_norm": 0.35650861263275146,
"kl": 0.250732421875,
"learning_rate": 1.3865583039223929e-06,
"loss": 0.0026,
"reward": 0.5535444989800453,
"reward_std": 0.17830567993223667,
"rewards/code_reward": 0.4555533789098263,
"rewards/format_reward": 0.979910746216774,
"step": 358
},
{
"completion_length": 708.0424346923828,
"epoch": 0.7622080679405521,
"grad_norm": 0.24273599684238434,
"kl": 0.1611328125,
"learning_rate": 1.374993281896137e-06,
"loss": 0.0017,
"reward": 0.44518817216157913,
"reward_std": 0.19435212016105652,
"rewards/code_reward": 0.34697388112545013,
"rewards/format_reward": 0.98214291036129,
"step": 359
},
{
"completion_length": 765.1786041259766,
"epoch": 0.7643312101910829,
"grad_norm": 0.3510468304157257,
"kl": 0.197021484375,
"learning_rate": 1.3634859523979134e-06,
"loss": 0.002,
"reward": 0.47114741802215576,
"reward_std": 0.1812426745891571,
"rewards/code_reward": 0.3724866919219494,
"rewards/format_reward": 0.9866071939468384,
"step": 360
},
{
"completion_length": 724.216552734375,
"epoch": 0.7664543524416136,
"grad_norm": 1.1458288431167603,
"kl": 0.52978515625,
"learning_rate": 1.3520367982522208e-06,
"loss": 0.0053,
"reward": 0.45792729407548904,
"reward_std": 0.16464052349328995,
"rewards/code_reward": 0.35926656424999237,
"rewards/format_reward": 0.9866071939468384,
"step": 361
},
{
"completion_length": 705.4464721679688,
"epoch": 0.7685774946921444,
"grad_norm": 0.4750509560108185,
"kl": 0.23779296875,
"learning_rate": 1.3406462998426358e-06,
"loss": 0.0024,
"reward": 0.5133348107337952,
"reward_std": 0.24053634703159332,
"rewards/code_reward": 0.41445086151361465,
"rewards/format_reward": 0.988839328289032,
"step": 362
},
{
"completion_length": 743.1853179931641,
"epoch": 0.7707006369426752,
"grad_norm": 0.2608552575111389,
"kl": 0.325927734375,
"learning_rate": 1.3293149350916595e-06,
"loss": 0.0033,
"reward": 0.5553034171462059,
"reward_std": 0.19487734138965607,
"rewards/code_reward": 0.45731230080127716,
"rewards/format_reward": 0.979910746216774,
"step": 363
},
{
"completion_length": 678.2209930419922,
"epoch": 0.772823779193206,
"grad_norm": 0.22239775955677032,
"kl": 0.13037109375,
"learning_rate": 1.3180431794406623e-06,
"loss": 0.0015,
"reward": 0.6062557250261307,
"reward_std": 0.2048381306231022,
"rewards/code_reward": 0.5069253593683243,
"rewards/format_reward": 0.9933035969734192,
"step": 364
},
{
"completion_length": 707.4620971679688,
"epoch": 0.7749469214437368,
"grad_norm": 0.4696608781814575,
"kl": 0.270751953125,
"learning_rate": 1.3068315058299358e-06,
"loss": 0.0029,
"reward": 0.5663170740008354,
"reward_std": 0.15939603559672832,
"rewards/code_reward": 0.4678795412182808,
"rewards/format_reward": 0.9843750447034836,
"step": 365
},
{
"completion_length": 653.2879791259766,
"epoch": 0.7770700636942676,
"grad_norm": 1.1559607982635498,
"kl": 0.3037109375,
"learning_rate": 1.2956803846788503e-06,
"loss": 0.0032,
"reward": 0.618221327662468,
"reward_std": 0.22959138825535774,
"rewards/code_reward": 0.5193373411893845,
"rewards/format_reward": 0.988839328289032,
"step": 366
},
{
"completion_length": 731.2076263427734,
"epoch": 0.7791932059447984,
"grad_norm": 0.48825645446777344,
"kl": 0.210693359375,
"learning_rate": 1.284590283866116e-06,
"loss": 0.0021,
"reward": 0.33228749781847,
"reward_std": 0.15970432199537754,
"rewards/code_reward": 0.2345196194946766,
"rewards/format_reward": 0.9776786267757416,
"step": 367
},
{
"completion_length": 695.4576110839844,
"epoch": 0.7813163481953291,
"grad_norm": 1.4041056632995605,
"kl": 0.1883544921875,
"learning_rate": 1.2735616687101518e-06,
"loss": 0.002,
"reward": 0.40882231295108795,
"reward_std": 0.16854364797472954,
"rewards/code_reward": 0.3103848248720169,
"rewards/format_reward": 0.9843750298023224,
"step": 368
},
{
"completion_length": 696.4687805175781,
"epoch": 0.7834394904458599,
"grad_norm": 1.9169604778289795,
"kl": 0.201171875,
"learning_rate": 1.2625950019495614e-06,
"loss": 0.0021,
"reward": 0.5380031913518906,
"reward_std": 0.1728157363831997,
"rewards/code_reward": 0.4400121048092842,
"rewards/format_reward": 0.979910746216774,
"step": 369
},
{
"completion_length": 709.4598693847656,
"epoch": 0.7855626326963907,
"grad_norm": 0.3797023594379425,
"kl": 0.1640625,
"learning_rate": 1.251690743723718e-06,
"loss": 0.0017,
"reward": 0.5747079327702522,
"reward_std": 0.24513645470142365,
"rewards/code_reward": 0.4767168238759041,
"rewards/format_reward": 0.9799107611179352,
"step": 370
},
{
"completion_length": 647.7209930419922,
"epoch": 0.7876857749469215,
"grad_norm": 0.24551738798618317,
"kl": 0.150390625,
"learning_rate": 1.2408493515534581e-06,
"loss": 0.0016,
"reward": 0.6943890303373337,
"reward_std": 0.22319162264466286,
"rewards/code_reward": 0.5959515273571014,
"rewards/format_reward": 0.9843750298023224,
"step": 371
},
{
"completion_length": 692.2098693847656,
"epoch": 0.7898089171974523,
"grad_norm": 0.4829825460910797,
"kl": 0.406005859375,
"learning_rate": 1.2300712803218834e-06,
"loss": 0.0042,
"reward": 0.5234424099326134,
"reward_std": 0.1910531185567379,
"rewards/code_reward": 0.42388884723186493,
"rewards/format_reward": 0.9955357313156128,
"step": 372
},
{
"completion_length": 697.3036041259766,
"epoch": 0.7919320594479831,
"grad_norm": 114.25981140136719,
"kl": 16.0146484375,
"learning_rate": 1.2193569822552772e-06,
"loss": 0.1608,
"reward": 0.559485673904419,
"reward_std": 0.20258497074246407,
"rewards/code_reward": 0.4606017544865608,
"rewards/format_reward": 0.988839328289032,
"step": 373
},
{
"completion_length": 677.7567291259766,
"epoch": 0.7940552016985138,
"grad_norm": 0.3005722761154175,
"kl": 0.171875,
"learning_rate": 1.2087069069041268e-06,
"loss": 0.0018,
"reward": 0.5883411467075348,
"reward_std": 0.21694539301097393,
"rewards/code_reward": 0.4901268184185028,
"rewards/format_reward": 0.98214291036129,
"step": 374
},
{
"completion_length": 671.1495819091797,
"epoch": 0.7961783439490446,
"grad_norm": 0.6558151841163635,
"kl": 0.162841796875,
"learning_rate": 1.1981215011242654e-06,
"loss": 0.0017,
"reward": 0.5491671711206436,
"reward_std": 0.2353355698287487,
"rewards/code_reward": 0.45050643384456635,
"rewards/format_reward": 0.9866071939468384,
"step": 375
},
{
"completion_length": 663.4486999511719,
"epoch": 0.7983014861995754,
"grad_norm": 1.0574246644973755,
"kl": 0.168701171875,
"learning_rate": 1.1876012090581184e-06,
"loss": 0.0018,
"reward": 0.523729532957077,
"reward_std": 0.19741250574588776,
"rewards/code_reward": 0.42573845386505127,
"rewards/format_reward": 0.979910746216774,
"step": 376
},
{
"completion_length": 678.5826110839844,
"epoch": 0.8004246284501062,
"grad_norm": 0.28517383337020874,
"kl": 0.168212890625,
"learning_rate": 1.177146472116071e-06,
"loss": 0.0018,
"reward": 0.4997348487377167,
"reward_std": 0.16867511346936226,
"rewards/code_reward": 0.40196699649095535,
"rewards/format_reward": 0.9776785969734192,
"step": 377
},
{
"completion_length": 725.0000457763672,
"epoch": 0.802547770700637,
"grad_norm": 0.38322436809539795,
"kl": 0.176025390625,
"learning_rate": 1.1667577289579462e-06,
"loss": 0.0018,
"reward": 0.43969085440039635,
"reward_std": 0.16067362390458584,
"rewards/code_reward": 0.3425925988703966,
"rewards/format_reward": 0.9709821790456772,
"step": 378
},
{
"completion_length": 671.4710083007812,
"epoch": 0.8046709129511678,
"grad_norm": 0.24044837057590485,
"kl": 0.1435546875,
"learning_rate": 1.1564354154746007e-06,
"loss": 0.0015,
"reward": 0.5779925882816315,
"reward_std": 0.22314922511577606,
"rewards/code_reward": 0.479331873357296,
"rewards/format_reward": 0.9866071790456772,
"step": 379
},
{
"completion_length": 701.1875457763672,
"epoch": 0.8067940552016986,
"grad_norm": 0.2769814729690552,
"kl": 0.187255859375,
"learning_rate": 1.146179964769635e-06,
"loss": 0.002,
"reward": 0.5813698992133141,
"reward_std": 0.21280257403850555,
"rewards/code_reward": 0.48315558582544327,
"rewards/format_reward": 0.9821428954601288,
"step": 380
},
{
"completion_length": 703.2701263427734,
"epoch": 0.8089171974522293,
"grad_norm": 0.43315884470939636,
"kl": 0.28125,
"learning_rate": 1.1359918071412195e-06,
"loss": 0.003,
"reward": 0.5584300383925438,
"reward_std": 0.17897445522248745,
"rewards/code_reward": 0.4595461040735245,
"rewards/format_reward": 0.988839328289032,
"step": 381
},
{
"completion_length": 680.9174499511719,
"epoch": 0.8110403397027601,
"grad_norm": 0.3025217652320862,
"kl": 0.208251953125,
"learning_rate": 1.1258713700640456e-06,
"loss": 0.0022,
"reward": 0.47092022001743317,
"reward_std": 0.1665214579552412,
"rewards/code_reward": 0.3727059066295624,
"rewards/format_reward": 0.9821428954601288,
"step": 382
},
{
"completion_length": 672.3705596923828,
"epoch": 0.8131634819532909,
"grad_norm": 0.23662854731082916,
"kl": 0.1478271484375,
"learning_rate": 1.115819078171383e-06,
"loss": 0.0016,
"reward": 0.5290590599179268,
"reward_std": 0.21020712330937386,
"rewards/code_reward": 0.4312911853194237,
"rewards/format_reward": 0.9776786267757416,
"step": 383
},
{
"completion_length": 659.2678833007812,
"epoch": 0.8152866242038217,
"grad_norm": 0.2239212840795517,
"kl": 0.1688232421875,
"learning_rate": 1.1058353532372667e-06,
"loss": 0.0018,
"reward": 0.5600069090723991,
"reward_std": 0.20570005849003792,
"rewards/code_reward": 0.46067656576633453,
"rewards/format_reward": 0.9933035969734192,
"step": 384
},
{
"completion_length": 688.6205596923828,
"epoch": 0.8174097664543525,
"grad_norm": 0.25939956307411194,
"kl": 0.156982421875,
"learning_rate": 1.0959206141587998e-06,
"loss": 0.0016,
"reward": 0.461281917989254,
"reward_std": 0.2138805352151394,
"rewards/code_reward": 0.36329086124897003,
"rewards/format_reward": 0.9799107760190964,
"step": 385
},
{
"completion_length": 689.8527069091797,
"epoch": 0.8195329087048833,
"grad_norm": 0.564179003238678,
"kl": 0.34716796875,
"learning_rate": 1.0860752769385766e-06,
"loss": 0.0035,
"reward": 0.5820841789245605,
"reward_std": 0.23867543786764145,
"rewards/code_reward": 0.48320019245147705,
"rewards/format_reward": 0.9888393133878708,
"step": 386
},
{
"completion_length": 716.8995971679688,
"epoch": 0.821656050955414,
"grad_norm": 0.31268319487571716,
"kl": 0.2451171875,
"learning_rate": 1.0762997546672279e-06,
"loss": 0.0026,
"reward": 0.24600705318152905,
"reward_std": 0.06653665285557508,
"rewards/code_reward": 0.14823918044567108,
"rewards/format_reward": 0.9776786267757416,
"step": 387
},
{
"completion_length": 661.8973388671875,
"epoch": 0.8237791932059448,
"grad_norm": 0.23703983426094055,
"kl": 0.139892578125,
"learning_rate": 1.0665944575060914e-06,
"loss": 0.0015,
"reward": 0.5530121028423309,
"reward_std": 0.2014228142797947,
"rewards/code_reward": 0.45368169248104095,
"rewards/format_reward": 0.9933035969734192,
"step": 388
},
{
"completion_length": 671.0468902587891,
"epoch": 0.8259023354564756,
"grad_norm": 0.21562151610851288,
"kl": 0.14697265625,
"learning_rate": 1.056959792669997e-06,
"loss": 0.0016,
"reward": 0.6221778392791748,
"reward_std": 0.17795583605766296,
"rewards/code_reward": 0.5246331766247749,
"rewards/format_reward": 0.9754464626312256,
"step": 389
},
{
"completion_length": 707.1830749511719,
"epoch": 0.8280254777070064,
"grad_norm": 0.25027066469192505,
"kl": 0.15234375,
"learning_rate": 1.0473961644101856e-06,
"loss": 0.0016,
"reward": 0.49339308589696884,
"reward_std": 0.1599120758473873,
"rewards/code_reward": 0.39450912177562714,
"rewards/format_reward": 0.988839328289032,
"step": 390
},
{
"completion_length": 724.7522583007812,
"epoch": 0.8301486199575372,
"grad_norm": 0.2350330352783203,
"kl": 0.193603515625,
"learning_rate": 1.037903973997345e-06,
"loss": 0.0021,
"reward": 0.478931725025177,
"reward_std": 0.12047621235251427,
"rewards/code_reward": 0.3804941847920418,
"rewards/format_reward": 0.9843750596046448,
"step": 391
},
{
"completion_length": 702.982177734375,
"epoch": 0.832271762208068,
"grad_norm": 0.3609310984611511,
"kl": 0.179931640625,
"learning_rate": 1.0284836197047737e-06,
"loss": 0.0019,
"reward": 0.44246046990156174,
"reward_std": 0.1557149738073349,
"rewards/code_reward": 0.3444693833589554,
"rewards/format_reward": 0.9799107611179352,
"step": 392
},
{
"completion_length": 674.607177734375,
"epoch": 0.8343949044585988,
"grad_norm": 0.5464503765106201,
"kl": 0.248046875,
"learning_rate": 1.0191354967916712e-06,
"loss": 0.0026,
"reward": 0.5180330500006676,
"reward_std": 0.1834750883281231,
"rewards/code_reward": 0.4193723499774933,
"rewards/format_reward": 0.9866071939468384,
"step": 393
},
{
"completion_length": 684.0937805175781,
"epoch": 0.8365180467091295,
"grad_norm": 0.23662471771240234,
"kl": 0.1290283203125,
"learning_rate": 1.0098599974865515e-06,
"loss": 0.0014,
"reward": 0.5139395222067833,
"reward_std": 0.1551931146532297,
"rewards/code_reward": 0.41594842076301575,
"rewards/format_reward": 0.9799107611179352,
"step": 394
},
{
"completion_length": 692.5580596923828,
"epoch": 0.8386411889596603,
"grad_norm": 0.34932953119277954,
"kl": 0.154296875,
"learning_rate": 1.0006575109707898e-06,
"loss": 0.0017,
"reward": 0.5320730581879616,
"reward_std": 0.205118702724576,
"rewards/code_reward": 0.43318910896778107,
"rewards/format_reward": 0.988839328289032,
"step": 395
},
{
"completion_length": 678.8571624755859,
"epoch": 0.8407643312101911,
"grad_norm": 0.5195670127868652,
"kl": 0.1474609375,
"learning_rate": 9.915284233622877e-07,
"loss": 0.0016,
"reward": 0.4320642352104187,
"reward_std": 0.18216058425605297,
"rewards/code_reward": 0.33362672477960587,
"rewards/format_reward": 0.9843750298023224,
"step": 396
},
{
"completion_length": 706.8861999511719,
"epoch": 0.8428874734607219,
"grad_norm": 0.24882346391677856,
"kl": 0.148681640625,
"learning_rate": 9.824731176992796e-07,
"loss": 0.0016,
"reward": 0.5600469708442688,
"reward_std": 0.16885506361722946,
"rewards/code_reward": 0.4616094380617142,
"rewards/format_reward": 0.9843750596046448,
"step": 397
},
{
"completion_length": 669.0491333007812,
"epoch": 0.8450106157112527,
"grad_norm": 1.0406914949417114,
"kl": 0.364013671875,
"learning_rate": 9.734919739242543e-07,
"loss": 0.0037,
"reward": 0.5749830156564713,
"reward_std": 0.21774039044976234,
"rewards/code_reward": 0.47676874697208405,
"rewards/format_reward": 0.9821428954601288,
"step": 398
},
{
"completion_length": 723.325927734375,
"epoch": 0.8471337579617835,
"grad_norm": 0.5013810396194458,
"kl": 0.1451416015625,
"learning_rate": 9.645853688680177e-07,
"loss": 0.0016,
"reward": 0.5728159248828888,
"reward_std": 0.1670310366898775,
"rewards/code_reward": 0.4746016263961792,
"rewards/format_reward": 0.9821428805589676,
"step": 399
},
{
"completion_length": 700.310302734375,
"epoch": 0.8492569002123143,
"grad_norm": 0.8073310852050781,
"kl": 0.2965087890625,
"learning_rate": 9.557536762338786e-07,
"loss": 0.003,
"reward": 0.492939718067646,
"reward_std": 0.2011387124657631,
"rewards/code_reward": 0.39494864642620087,
"rewards/format_reward": 0.9799107611179352,
"step": 400
},
{
"completion_length": 693.0536041259766,
"epoch": 0.851380042462845,
"grad_norm": 0.3889514207839966,
"kl": 0.164306640625,
"learning_rate": 9.46997266581973e-07,
"loss": 0.0018,
"reward": 0.5752345323562622,
"reward_std": 0.19828381016850471,
"rewards/code_reward": 0.475680947303772,
"rewards/format_reward": 0.9955357313156128,
"step": 401
},
{
"completion_length": 706.841552734375,
"epoch": 0.8535031847133758,
"grad_norm": 4.799881458282471,
"kl": 0.4912109375,
"learning_rate": 9.383165073137115e-07,
"loss": 0.0051,
"reward": 0.5113906338810921,
"reward_std": 0.14735013246536255,
"rewards/code_reward": 0.41295309364795685,
"rewards/format_reward": 0.9843750596046448,
"step": 402
},
{
"completion_length": 691.2210235595703,
"epoch": 0.8556263269639066,
"grad_norm": 3.541896104812622,
"kl": 0.14697265625,
"learning_rate": 9.297117626563687e-07,
"loss": 0.0016,
"reward": 0.6038797795772552,
"reward_std": 0.18652482330799103,
"rewards/code_reward": 0.5065583363175392,
"rewards/format_reward": 0.973214328289032,
"step": 403
},
{
"completion_length": 725.4933471679688,
"epoch": 0.8577494692144374,
"grad_norm": 158.1067352294922,
"kl": 18.90283203125,
"learning_rate": 9.211833936477957e-07,
"loss": 0.1896,
"reward": 0.5942443758249283,
"reward_std": 0.12594054080545902,
"rewards/code_reward": 0.4960300847887993,
"rewards/format_reward": 0.9821428954601288,
"step": 404
},
{
"completion_length": 717.1585235595703,
"epoch": 0.8598726114649682,
"grad_norm": 2265.1884765625,
"kl": 230.10986328125,
"learning_rate": 9.127317581212753e-07,
"loss": 2.3015,
"reward": 0.53834218531847,
"reward_std": 0.1464555226266384,
"rewards/code_reward": 0.4394582211971283,
"rewards/format_reward": 0.988839328289032,
"step": 405
},
{
"completion_length": 727.5826110839844,
"epoch": 0.861995753715499,
"grad_norm": 0.2873145341873169,
"kl": 0.1866455078125,
"learning_rate": 9.043572106905084e-07,
"loss": 0.0019,
"reward": 0.5367319211363792,
"reward_std": 0.17168255895376205,
"rewards/code_reward": 0.43851763010025024,
"rewards/format_reward": 0.98214291036129,
"step": 406
},
{
"completion_length": 726.1674499511719,
"epoch": 0.8641188959660298,
"grad_norm": 0.2757129371166229,
"kl": 0.1365966796875,
"learning_rate": 8.960601027347321e-07,
"loss": 0.0014,
"reward": 0.5360690876841545,
"reward_std": 0.2111339271068573,
"rewards/code_reward": 0.4367387220263481,
"rewards/format_reward": 0.9933035969734192,
"step": 407
},
{
"completion_length": 708.4241333007812,
"epoch": 0.8662420382165605,
"grad_norm": 1.7461967468261719,
"kl": 0.15234375,
"learning_rate": 8.878407823839788e-07,
"loss": 0.0016,
"reward": 0.4714769721031189,
"reward_std": 0.17892321571707726,
"rewards/code_reward": 0.3723698630928993,
"rewards/format_reward": 0.9910714626312256,
"step": 408
},
{
"completion_length": 722.7344207763672,
"epoch": 0.8683651804670913,
"grad_norm": 1.359683632850647,
"kl": 0.1497802734375,
"learning_rate": 8.796995945044689e-07,
"loss": 0.0017,
"reward": 0.5647559985518456,
"reward_std": 0.16498099640011787,
"rewards/code_reward": 0.4656488224864006,
"rewards/format_reward": 0.9910714477300644,
"step": 409
},
{
"completion_length": 758.9174499511719,
"epoch": 0.8704883227176221,
"grad_norm": 0.34433820843696594,
"kl": 0.12939453125,
"learning_rate": 8.716368806841405e-07,
"loss": 0.0013,
"reward": 0.40852154791355133,
"reward_std": 0.19776060804724693,
"rewards/code_reward": 0.30919117480516434,
"rewards/format_reward": 0.9933035969734192,
"step": 410
},
{
"completion_length": 730.169677734375,
"epoch": 0.8726114649681529,
"grad_norm": 0.43445339798927307,
"kl": 0.132080078125,
"learning_rate": 8.636529792183171e-07,
"loss": 0.0014,
"reward": 0.5396310985088348,
"reward_std": 0.19617567211389542,
"rewards/code_reward": 0.44097036868333817,
"rewards/format_reward": 0.9866071790456772,
"step": 411
},
{
"completion_length": 717.8705596923828,
"epoch": 0.8747346072186837,
"grad_norm": 0.5580800771713257,
"kl": 0.192138671875,
"learning_rate": 8.557482250955144e-07,
"loss": 0.002,
"reward": 0.4667212590575218,
"reward_std": 0.20506682246923447,
"rewards/code_reward": 0.36850695312023163,
"rewards/format_reward": 0.9821428954601288,
"step": 412
},
{
"completion_length": 701.482177734375,
"epoch": 0.8768577494692145,
"grad_norm": 0.33230528235435486,
"kl": 0.150146484375,
"learning_rate": 8.479229499833844e-07,
"loss": 0.0015,
"reward": 0.5547576695680618,
"reward_std": 0.21152211725711823,
"rewards/code_reward": 0.4558737352490425,
"rewards/format_reward": 0.9888393133878708,
"step": 413
},
{
"completion_length": 704.0469055175781,
"epoch": 0.8789808917197452,
"grad_norm": 0.3372839093208313,
"kl": 0.1534423828125,
"learning_rate": 8.401774822147976e-07,
"loss": 0.0016,
"reward": 0.5494333058595657,
"reward_std": 0.24079465121030807,
"rewards/code_reward": 0.4505493566393852,
"rewards/format_reward": 0.988839328289032,
"step": 414
},
{
"completion_length": 723.966552734375,
"epoch": 0.881104033970276,
"grad_norm": 0.4163219630718231,
"kl": 0.26123046875,
"learning_rate": 8.325121467740695e-07,
"loss": 0.0026,
"reward": 0.3951665982604027,
"reward_std": 0.18642807379364967,
"rewards/code_reward": 0.29628264531493187,
"rewards/format_reward": 0.988839328289032,
"step": 415
},
{
"completion_length": 736.9129943847656,
"epoch": 0.8832271762208068,
"grad_norm": 0.6581453084945679,
"kl": 0.18310546875,
"learning_rate": 8.249272652833226e-07,
"loss": 0.0018,
"reward": 0.4613909646868706,
"reward_std": 0.14806298539042473,
"rewards/code_reward": 0.3633998855948448,
"rewards/format_reward": 0.9799107611179352,
"step": 416
},
{
"completion_length": 712.9754791259766,
"epoch": 0.8853503184713376,
"grad_norm": 1.2168219089508057,
"kl": 0.2080078125,
"learning_rate": 8.174231559889931e-07,
"loss": 0.0021,
"reward": 0.44138607382774353,
"reward_std": 0.22260471060872078,
"rewards/code_reward": 0.34317177161574364,
"rewards/format_reward": 0.9821428805589676,
"step": 417
},
{
"completion_length": 711.1049346923828,
"epoch": 0.8874734607218684,
"grad_norm": 1.5622974634170532,
"kl": 0.21630859375,
"learning_rate": 8.100001337484787e-07,
"loss": 0.0022,
"reward": 0.5736604407429695,
"reward_std": 0.20455688051879406,
"rewards/code_reward": 0.4747764840722084,
"rewards/format_reward": 0.9888393133878708,
"step": 418
},
{
"completion_length": 729.3861999511719,
"epoch": 0.8895966029723992,
"grad_norm": 0.5059235095977783,
"kl": 0.16796875,
"learning_rate": 8.026585100169251e-07,
"loss": 0.0017,
"reward": 0.4245912581682205,
"reward_std": 0.151387682184577,
"rewards/code_reward": 0.32637695223093033,
"rewards/format_reward": 0.9821428954601288,
"step": 419
},
{
"completion_length": 690.7187652587891,
"epoch": 0.89171974522293,
"grad_norm": 6.739729881286621,
"kl": 2.8837890625,
"learning_rate": 7.953985928341601e-07,
"loss": 0.0289,
"reward": 0.5304828435182571,
"reward_std": 0.157493332400918,
"rewards/code_reward": 0.4313756823539734,
"rewards/format_reward": 0.9910714477300644,
"step": 420
},
{
"completion_length": 694.5335083007812,
"epoch": 0.8938428874734607,
"grad_norm": 0.45631542801856995,
"kl": 0.1708984375,
"learning_rate": 7.882206868117693e-07,
"loss": 0.0018,
"reward": 0.4608374051749706,
"reward_std": 0.1782052293419838,
"rewards/code_reward": 0.36106058582663536,
"rewards/format_reward": 0.9977678656578064,
"step": 421
},
{
"completion_length": 733.1339721679688,
"epoch": 0.8959660297239915,
"grad_norm": 1.1783860921859741,
"kl": 0.181640625,
"learning_rate": 7.81125093120313e-07,
"loss": 0.0019,
"reward": 0.4884042590856552,
"reward_std": 0.164920412003994,
"rewards/code_reward": 0.3899667263031006,
"rewards/format_reward": 0.9843750298023224,
"step": 422
},
{
"completion_length": 694.8013610839844,
"epoch": 0.8980891719745223,
"grad_norm": 0.7121770977973938,
"kl": 0.24853515625,
"learning_rate": 7.741121094766916e-07,
"loss": 0.0026,
"reward": 0.5257243886590004,
"reward_std": 0.15851808711886406,
"rewards/code_reward": 0.426840465515852,
"rewards/format_reward": 0.988839328289032,
"step": 423
},
{
"completion_length": 681.4174346923828,
"epoch": 0.9002123142250531,
"grad_norm": 0.739496648311615,
"kl": 0.25439453125,
"learning_rate": 7.671820301316532e-07,
"loss": 0.0026,
"reward": 0.4978240504860878,
"reward_std": 0.17392848432064056,
"rewards/code_reward": 0.39983299374580383,
"rewards/format_reward": 0.979910746216774,
"step": 424
},
{
"completion_length": 727.6897583007812,
"epoch": 0.9023354564755839,
"grad_norm": 0.6177138090133667,
"kl": 0.183349609375,
"learning_rate": 7.603351458574474e-07,
"loss": 0.0019,
"reward": 0.44435514509677887,
"reward_std": 0.13320972956717014,
"rewards/code_reward": 0.34703367203474045,
"rewards/format_reward": 0.9732143133878708,
"step": 425
},
{
"completion_length": 721.8772735595703,
"epoch": 0.9044585987261147,
"grad_norm": 1.0612621307373047,
"kl": 0.2496337890625,
"learning_rate": 7.535717439356255e-07,
"loss": 0.0026,
"reward": 0.4390544593334198,
"reward_std": 0.15821044147014618,
"rewards/code_reward": 0.3408401757478714,
"rewards/format_reward": 0.9821428805589676,
"step": 426
},
{
"completion_length": 708.1295013427734,
"epoch": 0.9065817409766455,
"grad_norm": 0.3175060451030731,
"kl": 0.15869140625,
"learning_rate": 7.46892108144986e-07,
"loss": 0.0017,
"reward": 0.45070114731788635,
"reward_std": 0.1746504958719015,
"rewards/code_reward": 0.35181717574596405,
"rewards/format_reward": 0.988839328289032,
"step": 427
},
{
"completion_length": 752.310302734375,
"epoch": 0.9087048832271762,
"grad_norm": 18.601308822631836,
"kl": 3.44775390625,
"learning_rate": 7.402965187496697e-07,
"loss": 0.0348,
"reward": 0.46990416944026947,
"reward_std": 0.1597570963203907,
"rewards/code_reward": 0.3723594844341278,
"rewards/format_reward": 0.9754464626312256,
"step": 428
},
{
"completion_length": 730.3460235595703,
"epoch": 0.910828025477707,
"grad_norm": 8.600378036499023,
"kl": 1.468994140625,
"learning_rate": 7.337852524873974e-07,
"loss": 0.0148,
"reward": 0.6117217838764191,
"reward_std": 0.21035557612776756,
"rewards/code_reward": 0.5126146152615547,
"rewards/format_reward": 0.9910714626312256,
"step": 429
},
{
"completion_length": 710.6138610839844,
"epoch": 0.9129511677282378,
"grad_norm": 0.4506695568561554,
"kl": 0.20361328125,
"learning_rate": 7.273585825578608e-07,
"loss": 0.0022,
"reward": 0.4428362399339676,
"reward_std": 0.12803563103079796,
"rewards/code_reward": 0.34372907504439354,
"rewards/format_reward": 0.9910714626312256,
"step": 430
},
{
"completion_length": 658.6228179931641,
"epoch": 0.9150743099787686,
"grad_norm": 5.093682765960693,
"kl": 0.5947265625,
"learning_rate": 7.21016778611259e-07,
"loss": 0.0061,
"reward": 0.5427140817046165,
"reward_std": 0.19685931131243706,
"rewards/code_reward": 0.4447230063378811,
"rewards/format_reward": 0.9799107760190964,
"step": 431
},
{
"completion_length": 677.9040222167969,
"epoch": 0.9171974522292994,
"grad_norm": 38.870262145996094,
"kl": 5.4326171875,
"learning_rate": 7.147601067369835e-07,
"loss": 0.0545,
"reward": 0.5093298330903053,
"reward_std": 0.19096140936017036,
"rewards/code_reward": 0.41111550480127335,
"rewards/format_reward": 0.9821428954601288,
"step": 432
},
{
"completion_length": 694.513427734375,
"epoch": 0.9193205944798302,
"grad_norm": 0.5155165195465088,
"kl": 0.155029296875,
"learning_rate": 7.085888294524561e-07,
"loss": 0.0016,
"reward": 0.5259926542639732,
"reward_std": 0.18491110764443874,
"rewards/code_reward": 0.42733194679021835,
"rewards/format_reward": 0.9866071790456772,
"step": 433
},
{
"completion_length": 704.5580596923828,
"epoch": 0.921443736730361,
"grad_norm": 0.6282893419265747,
"kl": 0.3359375,
"learning_rate": 7.025032056921117e-07,
"loss": 0.0034,
"reward": 0.5899785161018372,
"reward_std": 0.19566836208105087,
"rewards/code_reward": 0.4913177192211151,
"rewards/format_reward": 0.9866071790456772,
"step": 434
},
{
"completion_length": 722.1562805175781,
"epoch": 0.9235668789808917,
"grad_norm": 1.048966884613037,
"kl": 0.4742431640625,
"learning_rate": 6.965034907965349e-07,
"loss": 0.0049,
"reward": 0.5559424459934235,
"reward_std": 0.2080874666571617,
"rewards/code_reward": 0.4588441997766495,
"rewards/format_reward": 0.9709821939468384,
"step": 435
},
{
"completion_length": 679.9933319091797,
"epoch": 0.9256900212314225,
"grad_norm": 0.6251688599586487,
"kl": 0.171142578125,
"learning_rate": 6.905899365017462e-07,
"loss": 0.0018,
"reward": 0.5245073512196541,
"reward_std": 0.17461021803319454,
"rewards/code_reward": 0.42606981843709946,
"rewards/format_reward": 0.9843750447034836,
"step": 436
},
{
"completion_length": 711.6942443847656,
"epoch": 0.9278131634819533,
"grad_norm": 1.1648685932159424,
"kl": 0.299560546875,
"learning_rate": 6.847627909286409e-07,
"loss": 0.003,
"reward": 0.41069934517145157,
"reward_std": 0.17594012804329395,
"rewards/code_reward": 0.31226181238889694,
"rewards/format_reward": 0.9843750447034836,
"step": 437
},
{
"completion_length": 702.3482513427734,
"epoch": 0.9299363057324841,
"grad_norm": 1.5311229228973389,
"kl": 0.31640625,
"learning_rate": 6.790222985725761e-07,
"loss": 0.0033,
"reward": 0.5770048946142197,
"reward_std": 0.1962369978427887,
"rewards/code_reward": 0.4790138080716133,
"rewards/format_reward": 0.9799107611179352,
"step": 438
},
{
"completion_length": 683.4151916503906,
"epoch": 0.9320594479830149,
"grad_norm": 8.243356704711914,
"kl": 3.05126953125,
"learning_rate": 6.733687002931141e-07,
"loss": 0.0306,
"reward": 0.5087217092514038,
"reward_std": 0.1651569865643978,
"rewards/code_reward": 0.4109538644552231,
"rewards/format_reward": 0.9776786267757416,
"step": 439
},
{
"completion_length": 713.6049346923828,
"epoch": 0.9341825902335457,
"grad_norm": 1.4741530418395996,
"kl": 0.967529296875,
"learning_rate": 6.678022333039158e-07,
"loss": 0.0098,
"reward": 0.587900809943676,
"reward_std": 0.16147084161639214,
"rewards/code_reward": 0.4903561547398567,
"rewards/format_reward": 0.9754464626312256,
"step": 440
},
{
"completion_length": 677.8303833007812,
"epoch": 0.9363057324840764,
"grad_norm": 0.3179962933063507,
"kl": 0.230224609375,
"learning_rate": 6.623231311627876e-07,
"loss": 0.0025,
"reward": 0.561469204723835,
"reward_std": 0.16684554889798164,
"rewards/code_reward": 0.4625852555036545,
"rewards/format_reward": 0.9888393133878708,
"step": 441
},
{
"completion_length": 725.1004791259766,
"epoch": 0.9384288747346072,
"grad_norm": 2.448838233947754,
"kl": 1.276123046875,
"learning_rate": 6.569316237618811e-07,
"loss": 0.0127,
"reward": 0.3736302964389324,
"reward_std": 0.18804692663252354,
"rewards/code_reward": 0.2751928083598614,
"rewards/format_reward": 0.9843750298023224,
"step": 442
},
{
"completion_length": 710.9286041259766,
"epoch": 0.940552016985138,
"grad_norm": 0.38171106576919556,
"kl": 0.2259521484375,
"learning_rate": 6.516279373180499e-07,
"loss": 0.0024,
"reward": 0.45342515781521797,
"reward_std": 0.16657396219670773,
"rewards/code_reward": 0.3540947772562504,
"rewards/format_reward": 0.9933035969734192,
"step": 443
},
{
"completion_length": 665.5625305175781,
"epoch": 0.9426751592356688,
"grad_norm": 0.5256981253623962,
"kl": 0.63818359375,
"learning_rate": 6.464122943633543e-07,
"loss": 0.0066,
"reward": 0.5117220133543015,
"reward_std": 0.17998000979423523,
"rewards/code_reward": 0.4126148596405983,
"rewards/format_reward": 0.9910714626312256,
"step": 444
},
{
"completion_length": 669.888427734375,
"epoch": 0.9447983014861996,
"grad_norm": 10.391777038574219,
"kl": 1.935546875,
"learning_rate": 6.412849137357271e-07,
"loss": 0.0195,
"reward": 0.577217735350132,
"reward_std": 0.18068324774503708,
"rewards/code_reward": 0.47878019511699677,
"rewards/format_reward": 0.9843750596046448,
"step": 445
},
{
"completion_length": 706.747802734375,
"epoch": 0.9469214437367304,
"grad_norm": 0.7888285517692566,
"kl": 0.395263671875,
"learning_rate": 6.3624601056979e-07,
"loss": 0.0041,
"reward": 0.5674577727913857,
"reward_std": 0.14589250087738037,
"rewards/code_reward": 0.469020277261734,
"rewards/format_reward": 0.9843750447034836,
"step": 446
},
{
"completion_length": 698.3861999511719,
"epoch": 0.9490445859872612,
"grad_norm": 0.5909515619277954,
"kl": 0.4178466796875,
"learning_rate": 6.312957962878278e-07,
"loss": 0.0042,
"reward": 0.44434136897325516,
"reward_std": 0.1476050168275833,
"rewards/code_reward": 0.3447878174483776,
"rewards/format_reward": 0.9955357313156128,
"step": 447
},
{
"completion_length": 697.1138763427734,
"epoch": 0.9511677282377919,
"grad_norm": 0.3049964904785156,
"kl": 0.36083984375,
"learning_rate": 6.264344785909181e-07,
"loss": 0.0036,
"reward": 0.5054452195763588,
"reward_std": 0.16559578850865364,
"rewards/code_reward": 0.40633804351091385,
"rewards/format_reward": 0.9910714477300644,
"step": 448
},
{
"completion_length": 699.5000457763672,
"epoch": 0.9532908704883227,
"grad_norm": 2.360261917114258,
"kl": 1.0093994140625,
"learning_rate": 6.216622614502149e-07,
"loss": 0.0102,
"reward": 0.43248920887708664,
"reward_std": 0.20951998233795166,
"rewards/code_reward": 0.3344981260597706,
"rewards/format_reward": 0.9799107611179352,
"step": 449
},
{
"completion_length": 711.5647735595703,
"epoch": 0.9554140127388535,
"grad_norm": 0.45019006729125977,
"kl": 0.396728515625,
"learning_rate": 6.169793450983916e-07,
"loss": 0.0041,
"reward": 0.4090769328176975,
"reward_std": 0.1387995146214962,
"rewards/code_reward": 0.30996978655457497,
"rewards/format_reward": 0.9910714626312256,
"step": 450
},
{
"completion_length": 687.4710083007812,
"epoch": 0.9575371549893843,
"grad_norm": 1.139167070388794,
"kl": 0.70947265625,
"learning_rate": 6.123859260212393e-07,
"loss": 0.0073,
"reward": 0.6231836080551147,
"reward_std": 0.18272383697330952,
"rewards/code_reward": 0.5249693095684052,
"rewards/format_reward": 0.9821428954601288,
"step": 451
},
{
"completion_length": 664.4620666503906,
"epoch": 0.9596602972399151,
"grad_norm": 11.963510513305664,
"kl": 2.9296875,
"learning_rate": 6.07882196949423e-07,
"loss": 0.0292,
"reward": 0.5648458003997803,
"reward_std": 0.21996057033538818,
"rewards/code_reward": 0.4666314870119095,
"rewards/format_reward": 0.9821428954601288,
"step": 452
},
{
"completion_length": 672.8326110839844,
"epoch": 0.9617834394904459,
"grad_norm": 0.23568426072597504,
"kl": 0.138427734375,
"learning_rate": 6.034683468503948e-07,
"loss": 0.0015,
"reward": 0.5011638775467873,
"reward_std": 0.1840323582291603,
"rewards/code_reward": 0.4020567089319229,
"rewards/format_reward": 0.9910714626312256,
"step": 453
},
{
"completion_length": 692.2835083007812,
"epoch": 0.9639065817409767,
"grad_norm": 1.3131980895996094,
"kl": 0.73291015625,
"learning_rate": 5.991445609204641e-07,
"loss": 0.0073,
"reward": 0.49861256778240204,
"reward_std": 0.19007166847586632,
"rewards/code_reward": 0.4006215110421181,
"rewards/format_reward": 0.979910746216774,
"step": 454
},
{
"completion_length": 680.5536041259766,
"epoch": 0.9660297239915074,
"grad_norm": 1.0419756174087524,
"kl": 0.8876953125,
"learning_rate": 5.949110205770292e-07,
"loss": 0.009,
"reward": 0.5448554530739784,
"reward_std": 0.19055398926138878,
"rewards/code_reward": 0.44686436653137207,
"rewards/format_reward": 0.9799107611179352,
"step": 455
},
{
"completion_length": 693.2187805175781,
"epoch": 0.9681528662420382,
"grad_norm": 0.7175102233886719,
"kl": 0.481201171875,
"learning_rate": 5.90767903450964e-07,
"loss": 0.0049,
"reward": 0.4721348285675049,
"reward_std": 0.14595188200473785,
"rewards/code_reward": 0.37302765995264053,
"rewards/format_reward": 0.9910714626312256,
"step": 456
},
{
"completion_length": 695.2723541259766,
"epoch": 0.970276008492569,
"grad_norm": 0.4296894371509552,
"kl": 0.26416015625,
"learning_rate": 5.867153833791652e-07,
"loss": 0.0027,
"reward": 0.6006196290254593,
"reward_std": 0.17042616941034794,
"rewards/code_reward": 0.5019589066505432,
"rewards/format_reward": 0.9866071790456772,
"step": 457
},
{
"completion_length": 692.6652221679688,
"epoch": 0.9723991507430998,
"grad_norm": 0.4421376585960388,
"kl": 0.31982421875,
"learning_rate": 5.827536303972587e-07,
"loss": 0.0033,
"reward": 0.5808815285563469,
"reward_std": 0.2226531021296978,
"rewards/code_reward": 0.4815511405467987,
"rewards/format_reward": 0.9933035969734192,
"step": 458
},
{
"completion_length": 679.247802734375,
"epoch": 0.9745222929936306,
"grad_norm": 0.40139421820640564,
"kl": 0.47216796875,
"learning_rate": 5.78882810732465e-07,
"loss": 0.0048,
"reward": 0.5371674299240112,
"reward_std": 0.22804895788431168,
"rewards/code_reward": 0.43962281197309494,
"rewards/format_reward": 0.9754464626312256,
"step": 459
},
{
"completion_length": 706.5379638671875,
"epoch": 0.9766454352441614,
"grad_norm": 0.8857892751693726,
"kl": 0.51220703125,
"learning_rate": 5.75103086796625e-07,
"loss": 0.0052,
"reward": 0.4905061312019825,
"reward_std": 0.1841362752020359,
"rewards/code_reward": 0.39206862077116966,
"rewards/format_reward": 0.9843750447034836,
"step": 460
},
{
"completion_length": 690.3415374755859,
"epoch": 0.9787685774946921,
"grad_norm": 0.6516154408454895,
"kl": 0.439697265625,
"learning_rate": 5.714146171793846e-07,
"loss": 0.0045,
"reward": 0.5876915380358696,
"reward_std": 0.15721704810857773,
"rewards/code_reward": 0.4892539754509926,
"rewards/format_reward": 0.9843750298023224,
"step": 461
},
{
"completion_length": 681.5156402587891,
"epoch": 0.9808917197452229,
"grad_norm": 0.618622362613678,
"kl": 0.48046875,
"learning_rate": 5.678175566415422e-07,
"loss": 0.0048,
"reward": 0.49158109724521637,
"reward_std": 0.1944441720843315,
"rewards/code_reward": 0.39381323754787445,
"rewards/format_reward": 0.9776786118745804,
"step": 462
},
{
"completion_length": 722.8750305175781,
"epoch": 0.9830148619957537,
"grad_norm": 0.7218803763389587,
"kl": 0.565185546875,
"learning_rate": 5.643120561085528e-07,
"loss": 0.0057,
"reward": 0.4738345965743065,
"reward_std": 0.24430794268846512,
"rewards/code_reward": 0.37651316076517105,
"rewards/format_reward": 0.9732143133878708,
"step": 463
},
{
"completion_length": 682.4219055175781,
"epoch": 0.9851380042462845,
"grad_norm": 0.7259976863861084,
"kl": 0.706298828125,
"learning_rate": 5.608982626641991e-07,
"loss": 0.0071,
"reward": 0.47413645684719086,
"reward_std": 0.21057153865695,
"rewards/code_reward": 0.3761453852057457,
"rewards/format_reward": 0.9799107611179352,
"step": 464
},
{
"completion_length": 716.5736999511719,
"epoch": 0.9872611464968153,
"grad_norm": 0.2483934909105301,
"kl": 0.260009765625,
"learning_rate": 5.575763195444166e-07,
"loss": 0.0027,
"reward": 0.5671171024441719,
"reward_std": 0.19927529990673065,
"rewards/code_reward": 0.46912601590156555,
"rewards/format_reward": 0.9799107611179352,
"step": 465
},
{
"completion_length": 680.6339416503906,
"epoch": 0.9893842887473461,
"grad_norm": 1.9431463479995728,
"kl": 1.3564453125,
"learning_rate": 5.543463661312847e-07,
"loss": 0.0136,
"reward": 0.417750583961606,
"reward_std": 0.13368695229291916,
"rewards/code_reward": 0.3197594955563545,
"rewards/format_reward": 0.9799107611179352,
"step": 466
},
{
"completion_length": 684.9911041259766,
"epoch": 0.9915074309978769,
"grad_norm": 0.8100730776786804,
"kl": 0.4619140625,
"learning_rate": 5.512085379471808e-07,
"loss": 0.0048,
"reward": 0.6499997675418854,
"reward_std": 0.20544839650392532,
"rewards/code_reward": 0.5511157959699631,
"rewards/format_reward": 0.988839328289032,
"step": 467
},
{
"completion_length": 681.5670013427734,
"epoch": 0.9936305732484076,
"grad_norm": 3.3746109008789062,
"kl": 1.414306640625,
"learning_rate": 5.481629666490903e-07,
"loss": 0.0142,
"reward": 0.5468520447611809,
"reward_std": 0.21051420643925667,
"rewards/code_reward": 0.44774486869573593,
"rewards/format_reward": 0.9910714477300644,
"step": 468
},
{
"completion_length": 688.2388610839844,
"epoch": 0.9957537154989384,
"grad_norm": 1.0109045505523682,
"kl": 1.15380859375,
"learning_rate": 5.452097800230853e-07,
"loss": 0.0116,
"reward": 0.6098516285419464,
"reward_std": 0.211056686937809,
"rewards/code_reward": 0.5116373002529144,
"rewards/format_reward": 0.9821428954601288,
"step": 469
},
{
"completion_length": 681.9821624755859,
"epoch": 0.9978768577494692,
"grad_norm": 0.7048155665397644,
"kl": 0.809814453125,
"learning_rate": 5.423491019789623e-07,
"loss": 0.0082,
"reward": 0.45874594151973724,
"reward_std": 0.14819572865962982,
"rewards/code_reward": 0.3596387729048729,
"rewards/format_reward": 0.9910714626312256,
"step": 470
},
{
"completion_length": 707.5000457763672,
"epoch": 1.0,
"grad_norm": 2.86737060546875,
"kl": 1.1783447265625,
"learning_rate": 5.395810525450425e-07,
"loss": 0.0118,
"reward": 0.5169450491666794,
"reward_std": 0.18883745186030865,
"rewards/code_reward": 0.41850756853818893,
"rewards/format_reward": 0.9843750298023224,
"step": 471
},
{
"completion_length": 677.5982360839844,
"epoch": 1.0021231422505308,
"grad_norm": 1.9091771841049194,
"kl": 1.307861328125,
"learning_rate": 5.369057478631359e-07,
"loss": 0.0132,
"reward": 0.5092417150735855,
"reward_std": 0.18099428340792656,
"rewards/code_reward": 0.4110274314880371,
"rewards/format_reward": 0.9821428805589676,
"step": 472
},
{
"completion_length": 711.9843902587891,
"epoch": 1.0042462845010616,
"grad_norm": 1.6537326574325562,
"kl": 1.21533203125,
"learning_rate": 5.343233001836694e-07,
"loss": 0.0122,
"reward": 0.48672058433294296,
"reward_std": 0.19152027182281017,
"rewards/code_reward": 0.38939911872148514,
"rewards/format_reward": 0.9732143133878708,
"step": 473
},
{
"completion_length": 707.3102874755859,
"epoch": 1.0063694267515924,
"grad_norm": 0.8889822959899902,
"kl": 0.529541015625,
"learning_rate": 5.318338178609754e-07,
"loss": 0.0054,
"reward": 0.5736411809921265,
"reward_std": 0.17692103423178196,
"rewards/code_reward": 0.4749804362654686,
"rewards/format_reward": 0.9866071939468384,
"step": 474
},
{
"completion_length": 736.5714721679688,
"epoch": 1.0084925690021231,
"grad_norm": 1.3358269929885864,
"kl": 0.98828125,
"learning_rate": 5.294374053487459e-07,
"loss": 0.0099,
"reward": 0.44529393315315247,
"reward_std": 0.17480986192822456,
"rewards/code_reward": 0.3468564301729202,
"rewards/format_reward": 0.9843750596046448,
"step": 475
},
{
"completion_length": 711.4241485595703,
"epoch": 1.010615711252654,
"grad_norm": 1.4289793968200684,
"kl": 1.22998046875,
"learning_rate": 5.271341631956511e-07,
"loss": 0.0123,
"reward": 0.5166614726185799,
"reward_std": 0.1912681832909584,
"rewards/code_reward": 0.42000964283943176,
"rewards/format_reward": 0.96651791036129,
"step": 476
},
{
"completion_length": 695.966552734375,
"epoch": 1.0127388535031847,
"grad_norm": 1.04447340965271,
"kl": 0.756103515625,
"learning_rate": 5.249241880411181e-07,
"loss": 0.0076,
"reward": 0.5925345048308372,
"reward_std": 0.20060284808278084,
"rewards/code_reward": 0.4952130541205406,
"rewards/format_reward": 0.9732143133878708,
"step": 477
},
{
"completion_length": 694.6986846923828,
"epoch": 1.0148619957537155,
"grad_norm": 0.8483067750930786,
"kl": 0.3671875,
"learning_rate": 5.228075726112785e-07,
"loss": 0.0039,
"reward": 0.5394521579146385,
"reward_std": 0.12158003821969032,
"rewards/code_reward": 0.44079139083623886,
"rewards/format_reward": 0.986607164144516,
"step": 478
},
{
"completion_length": 708.294677734375,
"epoch": 1.0169851380042463,
"grad_norm": 2.820655584335327,
"kl": 2.081787109375,
"learning_rate": 5.207844057150768e-07,
"loss": 0.0209,
"reward": 0.530554287135601,
"reward_std": 0.18540234863758087,
"rewards/code_reward": 0.4339024946093559,
"rewards/format_reward": 0.9665178954601288,
"step": 479
},
{
"completion_length": 717.2143249511719,
"epoch": 1.019108280254777,
"grad_norm": 0.23074620962142944,
"kl": 0.481689453125,
"learning_rate": 5.188547722405437e-07,
"loss": 0.005,
"reward": 0.6097277328372002,
"reward_std": 0.2097402885556221,
"rewards/code_reward": 0.5108437687158585,
"rewards/format_reward": 0.988839328289032,
"step": 480
},
{
"completion_length": 673.4553833007812,
"epoch": 1.0212314225053079,
"grad_norm": 18.151901245117188,
"kl": 6.5419921875,
"learning_rate": 5.170187531512351e-07,
"loss": 0.0654,
"reward": 0.4982636645436287,
"reward_std": 0.18235952779650688,
"rewards/code_reward": 0.4000493362545967,
"rewards/format_reward": 0.9821428954601288,
"step": 481
},
{
"completion_length": 657.0223541259766,
"epoch": 1.0233545647558386,
"grad_norm": 1.2006980180740356,
"kl": 0.98583984375,
"learning_rate": 5.152764254828348e-07,
"loss": 0.0101,
"reward": 0.6024035438895226,
"reward_std": 0.18741042539477348,
"rewards/code_reward": 0.5044124275445938,
"rewards/format_reward": 0.979910746216774,
"step": 482
},
{
"completion_length": 679.5446624755859,
"epoch": 1.0254777070063694,
"grad_norm": 6.640290260314941,
"kl": 2.26318359375,
"learning_rate": 5.136278623399225e-07,
"loss": 0.0229,
"reward": 0.6333309859037399,
"reward_std": 0.16317120380699635,
"rewards/code_reward": 0.5337774083018303,
"rewards/format_reward": 0.9955357313156128,
"step": 483
},
{
"completion_length": 690.9464721679688,
"epoch": 1.0276008492569002,
"grad_norm": 1.2075289487838745,
"kl": 0.635498046875,
"learning_rate": 5.120731328929058e-07,
"loss": 0.0065,
"reward": 0.6160075142979622,
"reward_std": 0.18631838634610176,
"rewards/code_reward": 0.516677126288414,
"rewards/format_reward": 0.9933035969734192,
"step": 484
},
{
"completion_length": 711.2254791259766,
"epoch": 1.029723991507431,
"grad_norm": 0.6530643105506897,
"kl": 0.88671875,
"learning_rate": 5.106123023751187e-07,
"loss": 0.009,
"reward": 0.5319265574216843,
"reward_std": 0.16448520869016647,
"rewards/code_reward": 0.43304260820150375,
"rewards/format_reward": 0.9888393133878708,
"step": 485
},
{
"completion_length": 697.0067291259766,
"epoch": 1.0318471337579618,
"grad_norm": 0.7889028787612915,
"kl": 0.486083984375,
"learning_rate": 5.092454320800833e-07,
"loss": 0.0049,
"reward": 0.5322659835219383,
"reward_std": 0.2203526459634304,
"rewards/code_reward": 0.4340517073869705,
"rewards/format_reward": 0.9821428954601288,
"step": 486
},
{
"completion_length": 707.5915374755859,
"epoch": 1.0339702760084926,
"grad_norm": 1.0391658544540405,
"kl": 1.347412109375,
"learning_rate": 5.079725793589405e-07,
"loss": 0.0136,
"reward": 0.5818885043263435,
"reward_std": 0.18653497844934464,
"rewards/code_reward": 0.4838974103331566,
"rewards/format_reward": 0.979910746216774,
"step": 487
},
{
"completion_length": 681.9509124755859,
"epoch": 1.0360934182590233,
"grad_norm": 1.4511501789093018,
"kl": 0.91943359375,
"learning_rate": 5.067937976180407e-07,
"loss": 0.0092,
"reward": 0.20120449364185333,
"reward_std": 0.06054047856014222,
"rewards/code_reward": 0.10365983843803406,
"rewards/format_reward": 0.9754464775323868,
"step": 488
},
{
"completion_length": 696.1696624755859,
"epoch": 1.0382165605095541,
"grad_norm": 1.0735193490982056,
"kl": 0.93994140625,
"learning_rate": 5.057091363167046e-07,
"loss": 0.0095,
"reward": 0.41191001795232296,
"reward_std": 0.11514822754543275,
"rewards/code_reward": 0.31324928998947144,
"rewards/format_reward": 0.9866071939468384,
"step": 489
},
{
"completion_length": 721.6607513427734,
"epoch": 1.040339702760085,
"grad_norm": 1.8616191148757935,
"kl": 1.632080078125,
"learning_rate": 5.047186409651489e-07,
"loss": 0.0165,
"reward": 0.5570781454443932,
"reward_std": 0.17984510958194733,
"rewards/code_reward": 0.45886383950710297,
"rewards/format_reward": 0.9821428954601288,
"step": 490
},
{
"completion_length": 674.4308319091797,
"epoch": 1.0424628450106157,
"grad_norm": 2.0347139835357666,
"kl": 1.854736328125,
"learning_rate": 5.038223531225742e-07,
"loss": 0.0186,
"reward": 0.4472319483757019,
"reward_std": 0.20929547771811485,
"rewards/code_reward": 0.3496873155236244,
"rewards/format_reward": 0.9754464626312256,
"step": 491
},
{
"completion_length": 684.2701263427734,
"epoch": 1.0445859872611465,
"grad_norm": 0.47032228112220764,
"kl": 0.452392578125,
"learning_rate": 5.030203103954232e-07,
"loss": 0.0046,
"reward": 0.6024687513709068,
"reward_std": 0.20001190528273582,
"rewards/code_reward": 0.5038080215454102,
"rewards/format_reward": 0.986607164144516,
"step": 492
},
{
"completion_length": 749.888427734375,
"epoch": 1.0467091295116773,
"grad_norm": 1.3009785413742065,
"kl": 0.7442626953125,
"learning_rate": 5.023125464358026e-07,
"loss": 0.0075,
"reward": 0.4289785400032997,
"reward_std": 0.19978297501802444,
"rewards/code_reward": 0.3307642340660095,
"rewards/format_reward": 0.9821428805589676,
"step": 493
},
{
"completion_length": 707.247802734375,
"epoch": 1.048832271762208,
"grad_norm": 2.7150216102600098,
"kl": 1.939453125,
"learning_rate": 5.016990909400709e-07,
"loss": 0.0195,
"reward": 0.48099584877491,
"reward_std": 0.17564579099416733,
"rewards/code_reward": 0.3834511674940586,
"rewards/format_reward": 0.9754464775323868,
"step": 494
},
{
"completion_length": 712.8861999511719,
"epoch": 1.0509554140127388,
"grad_norm": 1.259710431098938,
"kl": 1.7119140625,
"learning_rate": 5.011799696475915e-07,
"loss": 0.0172,
"reward": 0.5863819345831871,
"reward_std": 0.17422104254364967,
"rewards/code_reward": 0.48883724212646484,
"rewards/format_reward": 0.9754464775323868,
"step": 495
},
{
"completion_length": 670.3125305175781,
"epoch": 1.0530785562632696,
"grad_norm": 1.799985408782959,
"kl": 1.3544921875,
"learning_rate": 5.007552043396547e-07,
"loss": 0.0137,
"reward": 0.6773558109998703,
"reward_std": 0.21710924059152603,
"rewards/code_reward": 0.578471876680851,
"rewards/format_reward": 0.9888393133878708,
"step": 496
},
{
"completion_length": 647.8727874755859,
"epoch": 1.0552016985138004,
"grad_norm": 1.1735022068023682,
"kl": 0.455810546875,
"learning_rate": 5.004248128385618e-07,
"loss": 0.0047,
"reward": 0.6235345751047134,
"reward_std": 0.20276143215596676,
"rewards/code_reward": 0.5259898751974106,
"rewards/format_reward": 0.9754464775323868,
"step": 497
},
{
"completion_length": 720.7388610839844,
"epoch": 1.0573248407643312,
"grad_norm": 1.039088487625122,
"kl": 1.011474609375,
"learning_rate": 5.001888090068784e-07,
"loss": 0.0102,
"reward": 0.5388440862298012,
"reward_std": 0.1800019945949316,
"rewards/code_reward": 0.4397369250655174,
"rewards/format_reward": 0.9910714626312256,
"step": 498
},
{
"completion_length": 737.9375305175781,
"epoch": 1.059447983014862,
"grad_norm": 2.2365331649780273,
"kl": 0.947998046875,
"learning_rate": 5.000472027468528e-07,
"loss": 0.0095,
"reward": 0.5870940536260605,
"reward_std": 0.17025620490312576,
"rewards/code_reward": 0.4893261566758156,
"rewards/format_reward": 0.9776785969734192,
"step": 499
},
{
"completion_length": 686.7678833007812,
"epoch": 1.0615711252653928,
"grad_norm": 11.270224571228027,
"kl": 3.646484375,
"learning_rate": 5.000000000000001e-07,
"loss": 0.0367,
"reward": 0.2899981178343296,
"reward_std": 0.10342313535511494,
"rewards/code_reward": 0.1917838342487812,
"rewards/format_reward": 0.9821428954601288,
"step": 500
},
{
"epoch": 1.0615711252653928,
"step": 500,
"total_flos": 0.0,
"train_loss": 0.008756054809940243,
"train_runtime": 191583.7312,
"train_samples_per_second": 1.169,
"train_steps_per_second": 0.003
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}