{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0615711252653928, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 746.1138916015625, "epoch": 0.0021231422505307855, "grad_norm": 0.21636255085468292, "kl": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "reward": 0.11085444036871195, "reward_std": 0.15387122705578804, "rewards/code_reward": 0.11063122469931841, "rewards/format_reward": 0.0022321429569274187, "step": 1 }, { "completion_length": 741.6986846923828, "epoch": 0.004246284501061571, "grad_norm": 0.21392129361629486, "kl": 0.0, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "reward": 0.10614843107759953, "reward_std": 0.15658440068364143, "rewards/code_reward": 0.10592522472143173, "rewards/format_reward": 0.0022321429569274187, "step": 2 }, { "completion_length": 756.122802734375, "epoch": 0.006369426751592357, "grad_norm": 0.21344353258609772, "kl": 6.717443466186523e-05, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 0.12699278071522713, "reward_std": 0.15882046334445477, "rewards/code_reward": 0.12676957063376904, "rewards/format_reward": 0.0022321429569274187, "step": 3 }, { "completion_length": 760.8817291259766, "epoch": 0.008492569002123142, "grad_norm": 0.2038053572177887, "kl": 7.62939453125e-05, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": 0.06782207638025284, "reward_std": 0.1164214089512825, "rewards/code_reward": 0.06782207870855927, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 764.8817291259766, "epoch": 0.010615711252653927, "grad_norm": 0.20332112908363342, "kl": 7.510185241699219e-05, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": 0.07368321809917688, "reward_std": 0.11412223428487778, "rewards/code_reward": 0.07368322089314461, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 733.0937805175781, "epoch": 0.012738853503184714, "grad_norm": 0.2384093701839447, "kl": 8.344650268554688e-05, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": 0.11057165823876858, "reward_std": 0.13630107790231705, "rewards/code_reward": 0.11057165637612343, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 747.8013763427734, "epoch": 0.014861995753715499, "grad_norm": 0.21190612018108368, "kl": 9.250640869140625e-05, "learning_rate": 2.3333333333333336e-06, "loss": 0.0, "reward": 0.13999284896999598, "reward_std": 0.14958541933447123, "rewards/code_reward": 0.13999284896999598, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 747.6540679931641, "epoch": 0.016985138004246284, "grad_norm": 0.1906791776418686, "kl": 0.00013947486877441406, "learning_rate": 2.666666666666667e-06, "loss": 0.0, "reward": 0.0754421940073371, "reward_std": 0.10426154918968678, "rewards/code_reward": 0.07521897740662098, "rewards/format_reward": 0.0022321429569274187, "step": 8 }, { "completion_length": 710.5401916503906, "epoch": 0.01910828025477707, "grad_norm": 0.20240359008312225, "kl": 0.0002300739288330078, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.12234624661505222, "reward_std": 0.10050993971526623, "rewards/code_reward": 0.12234624475240707, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 760.1808319091797, "epoch": 0.021231422505307854, "grad_norm": 0.34670934081077576, "kl": 0.0004100799560546875, "learning_rate": 3.3333333333333333e-06, "loss": 0.0, "reward": 0.055801121750846505, "reward_std": 0.06395915220491588, "rewards/code_reward": 0.05580112128518522, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 750.3906555175781, "epoch": 0.02335456475583864, "grad_norm": 0.21112516522407532, "kl": 0.0007238388061523438, "learning_rate": 3.6666666666666666e-06, "loss": 0.0, "reward": 0.06025231350213289, "reward_std": 0.09869139082729816, "rewards/code_reward": 0.06025231350213289, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 700.1384124755859, "epoch": 0.025477707006369428, "grad_norm": 0.22157742083072662, "kl": 0.00104522705078125, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": 0.1535286195576191, "reward_std": 0.16596542671322823, "rewards/code_reward": 0.1530821956694126, "rewards/format_reward": 0.004464285913854837, "step": 12 }, { "completion_length": 702.9152069091797, "epoch": 0.027600849256900213, "grad_norm": 0.24524690210819244, "kl": 0.0017261505126953125, "learning_rate": 4.333333333333334e-06, "loss": 0.0, "reward": 0.19020407181233168, "reward_std": 0.16583579406142235, "rewards/code_reward": 0.1902040634304285, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 714.9486999511719, "epoch": 0.029723991507430998, "grad_norm": 0.18179796636104584, "kl": 0.00283050537109375, "learning_rate": 4.666666666666667e-06, "loss": 0.0, "reward": 0.06596253952011466, "reward_std": 0.08220406854525208, "rewards/code_reward": 0.06596253253519535, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 683.7835083007812, "epoch": 0.03184713375796178, "grad_norm": 0.18836897611618042, "kl": 0.004302978515625, "learning_rate": 5e-06, "loss": 0.0, "reward": 0.10805188585072756, "reward_std": 0.0908731259405613, "rewards/code_reward": 0.10782866925001144, "rewards/format_reward": 0.0022321429569274187, "step": 15 }, { "completion_length": 641.216552734375, "epoch": 0.03397027600849257, "grad_norm": 0.22146466374397278, "kl": 0.00562286376953125, "learning_rate": 4.999952797253148e-06, "loss": 0.0001, "reward": 0.20005132257938385, "reward_std": 0.16782562248408794, "rewards/code_reward": 0.19938167929649353, "rewards/format_reward": 0.006696428870782256, "step": 16 }, { "completion_length": 644.9598541259766, "epoch": 0.036093418259023353, "grad_norm": 0.20937775075435638, "kl": 0.00730133056640625, "learning_rate": 4.9998111909931225e-06, "loss": 0.0001, "reward": 0.1508529670536518, "reward_std": 0.1663584616035223, "rewards/code_reward": 0.15040653757750988, "rewards/format_reward": 0.004464285913854837, "step": 17 }, { "completion_length": 608.2477874755859, "epoch": 0.03821656050955414, "grad_norm": 0.2286761999130249, "kl": 0.0104217529296875, "learning_rate": 4.999575187161439e-06, "loss": 0.0001, "reward": 0.14321784488856792, "reward_std": 0.1725912243127823, "rewards/code_reward": 0.14321784675121307, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 650.0647583007812, "epoch": 0.040339702760084924, "grad_norm": 0.21597912907600403, "kl": 0.0113677978515625, "learning_rate": 4.9992447956603455e-06, "loss": 0.0001, "reward": 0.12854056991636753, "reward_std": 0.15781505592167377, "rewards/code_reward": 0.12854057550430298, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 624.8192138671875, "epoch": 0.04246284501061571, "grad_norm": 42.34319305419922, "kl": 7.167266845703125, "learning_rate": 4.998820030352409e-06, "loss": 0.0716, "reward": 0.12942847050726414, "reward_std": 0.11835422366857529, "rewards/code_reward": 0.1292052511125803, "rewards/format_reward": 0.0022321429569274187, "step": 20 }, { "completion_length": 632.3504638671875, "epoch": 0.044585987261146494, "grad_norm": 0.23411324620246887, "kl": 0.0178985595703125, "learning_rate": 4.998300909059929e-06, "loss": 0.0002, "reward": 0.12214689701795578, "reward_std": 0.1782115437090397, "rewards/code_reward": 0.12214690260589123, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 610.6652069091797, "epoch": 0.04670912951167728, "grad_norm": 0.24601581692695618, "kl": 0.020477294921875, "learning_rate": 4.997687453564198e-06, "loss": 0.0002, "reward": 0.18596480786800385, "reward_std": 0.184912770986557, "rewards/code_reward": 0.18529516831040382, "rewards/format_reward": 0.006696428870782256, "step": 22 }, { "completion_length": 612.2344055175781, "epoch": 0.04883227176220807, "grad_norm": 0.25667688250541687, "kl": 0.02349853515625, "learning_rate": 4.9969796896045775e-06, "loss": 0.0002, "reward": 0.1700380276888609, "reward_std": 0.15014583989977837, "rewards/code_reward": 0.1689219493418932, "rewards/format_reward": 0.011160714784637094, "step": 23 }, { "completion_length": 582.0379791259766, "epoch": 0.050955414012738856, "grad_norm": 1.4910736083984375, "kl": 0.054718017578125, "learning_rate": 4.996177646877426e-06, "loss": 0.0005, "reward": 0.15169917233288288, "reward_std": 0.16788329929113388, "rewards/code_reward": 0.151029534637928, "rewards/format_reward": 0.006696428870782256, "step": 24 }, { "completion_length": 605.2277069091797, "epoch": 0.05307855626326964, "grad_norm": 0.23100271821022034, "kl": 0.030059814453125, "learning_rate": 4.995281359034851e-06, "loss": 0.0003, "reward": 0.10647542215883732, "reward_std": 0.13741069473326206, "rewards/code_reward": 0.105805778875947, "rewards/format_reward": 0.006696428870782256, "step": 25 }, { "completion_length": 595.9777069091797, "epoch": 0.055201698513800426, "grad_norm": 0.22825182974338531, "kl": 0.03179931640625, "learning_rate": 4.994290863683296e-06, "loss": 0.0003, "reward": 0.11801626486703753, "reward_std": 0.12430650275200605, "rewards/code_reward": 0.11779305664822459, "rewards/format_reward": 0.0022321429569274187, "step": 26 }, { "completion_length": 594.2031402587891, "epoch": 0.05732484076433121, "grad_norm": 0.2523776590824127, "kl": 0.0357666015625, "learning_rate": 4.99320620238196e-06, "loss": 0.0004, "reward": 0.1666601337492466, "reward_std": 0.20200489647686481, "rewards/code_reward": 0.1655440628528595, "rewards/format_reward": 0.011160714784637094, "step": 27 }, { "completion_length": 606.7299346923828, "epoch": 0.059447983014861996, "grad_norm": 0.24759377539157867, "kl": 0.03466796875, "learning_rate": 4.99202742064106e-06, "loss": 0.0003, "reward": 0.12888818327337503, "reward_std": 0.14617390558123589, "rewards/code_reward": 0.12732568103820086, "rewards/format_reward": 0.01562500116415322, "step": 28 }, { "completion_length": 582.7879791259766, "epoch": 0.06157112526539278, "grad_norm": 0.22141791880130768, "kl": 0.0360107421875, "learning_rate": 4.990754567919917e-06, "loss": 0.0004, "reward": 0.1982099711894989, "reward_std": 0.15798946656286716, "rewards/code_reward": 0.1970939077436924, "rewards/format_reward": 0.011160714784637094, "step": 29 }, { "completion_length": 582.3035888671875, "epoch": 0.06369426751592357, "grad_norm": 0.4308633804321289, "kl": 0.04461669921875, "learning_rate": 4.989387697624881e-06, "loss": 0.0004, "reward": 0.15222312323749065, "reward_std": 0.13287453912198544, "rewards/code_reward": 0.14999098517000675, "rewards/format_reward": 0.022321429336443543, "step": 30 }, { "completion_length": 560.9888763427734, "epoch": 0.06581740976645435, "grad_norm": 0.44854736328125, "kl": 0.05029296875, "learning_rate": 4.987926867107095e-06, "loss": 0.0005, "reward": 0.17351704463362694, "reward_std": 0.1594883631914854, "rewards/code_reward": 0.17039205506443977, "rewards/format_reward": 0.031250000931322575, "step": 31 }, { "completion_length": 520.7835006713867, "epoch": 0.06794055201698514, "grad_norm": 0.3021621108055115, "kl": 0.0545654296875, "learning_rate": 4.986372137660078e-06, "loss": 0.0005, "reward": 0.19399502873420715, "reward_std": 0.18005169555544853, "rewards/code_reward": 0.1872985940426588, "rewards/format_reward": 0.0669642873108387, "step": 32 }, { "completion_length": 551.3750228881836, "epoch": 0.07006369426751592, "grad_norm": 0.38339507579803467, "kl": 0.0712890625, "learning_rate": 4.984723574517165e-06, "loss": 0.0007, "reward": 0.15828289464116096, "reward_std": 0.18912290409207344, "rewards/code_reward": 0.1453364696353674, "rewards/format_reward": 0.12946429289877415, "step": 33 }, { "completion_length": 490.0558166503906, "epoch": 0.07218683651804671, "grad_norm": 0.5539775490760803, "kl": 0.0887451171875, "learning_rate": 4.9829812468487655e-06, "loss": 0.0009, "reward": 0.18788279965519905, "reward_std": 0.19856177270412445, "rewards/code_reward": 0.16578458063304424, "rewards/format_reward": 0.2209821566939354, "step": 34 }, { "completion_length": 441.99778747558594, "epoch": 0.07430997876857749, "grad_norm": 0.35391107201576233, "kl": 0.12060546875, "learning_rate": 4.981145227759457e-06, "loss": 0.0012, "reward": 0.20366163551807404, "reward_std": 0.1490145679563284, "rewards/code_reward": 0.16392949409782887, "rewards/format_reward": 0.3973214477300644, "step": 35 }, { "completion_length": 454.8236846923828, "epoch": 0.07643312101910828, "grad_norm": 0.34607765078544617, "kl": 0.18994140625, "learning_rate": 4.979215594284924e-06, "loss": 0.0019, "reward": 0.16812831349670887, "reward_std": 0.16833286173641682, "rewards/code_reward": 0.10094079561531544, "rewards/format_reward": 0.6718750298023224, "step": 36 }, { "completion_length": 413.80358123779297, "epoch": 0.07855626326963906, "grad_norm": 0.30116426944732666, "kl": 0.1982421875, "learning_rate": 4.977192427388722e-06, "loss": 0.002, "reward": 0.24648623168468475, "reward_std": 0.1688873954117298, "rewards/code_reward": 0.16099514812231064, "rewards/format_reward": 0.854910746216774, "step": 37 }, { "completion_length": 412.4464416503906, "epoch": 0.08067940552016985, "grad_norm": 0.3934517204761505, "kl": 0.248046875, "learning_rate": 4.9750758119588824e-06, "loss": 0.0025, "reward": 0.24308543279767036, "reward_std": 0.14966130815446377, "rewards/code_reward": 0.1495586484670639, "rewards/format_reward": 0.9352678954601288, "step": 38 }, { "completion_length": 424.8928756713867, "epoch": 0.08280254777070063, "grad_norm": 0.3077991008758545, "kl": 0.256103515625, "learning_rate": 4.972865836804349e-06, "loss": 0.0026, "reward": 0.2948240712285042, "reward_std": 0.17200535349547863, "rewards/code_reward": 0.19995798915624619, "rewards/format_reward": 0.9486607611179352, "step": 39 }, { "completion_length": 445.8326110839844, "epoch": 0.08492569002123142, "grad_norm": 0.3074510097503662, "kl": 0.259765625, "learning_rate": 4.970562594651254e-06, "loss": 0.0026, "reward": 0.2571263238787651, "reward_std": 0.1593556720763445, "rewards/code_reward": 0.16226024366915226, "rewards/format_reward": 0.9486607611179352, "step": 40 }, { "completion_length": 474.68082427978516, "epoch": 0.0870488322717622, "grad_norm": 0.28787654638290405, "kl": 0.2421875, "learning_rate": 4.968166182139026e-06, "loss": 0.0024, "reward": 0.27686072885990143, "reward_std": 0.16917606256902218, "rewards/code_reward": 0.18378033302724361, "rewards/format_reward": 0.9308036118745804, "step": 41 }, { "completion_length": 520.4397583007812, "epoch": 0.08917197452229299, "grad_norm": 0.2756073772907257, "kl": 0.22216796875, "learning_rate": 4.9656766998163306e-06, "loss": 0.0023, "reward": 0.29509423673152924, "reward_std": 0.13862515799701214, "rewards/code_reward": 0.20402280241250992, "rewards/format_reward": 0.910714328289032, "step": 42 }, { "completion_length": 520.6406555175781, "epoch": 0.09129511677282377, "grad_norm": 0.2641509473323822, "kl": 0.1728515625, "learning_rate": 4.963094252136865e-06, "loss": 0.0017, "reward": 0.3755844831466675, "reward_std": 0.19650832191109657, "rewards/code_reward": 0.280941616743803, "rewards/format_reward": 0.9464286118745804, "step": 43 }, { "completion_length": 520.0803909301758, "epoch": 0.09341825902335456, "grad_norm": 0.28843629360198975, "kl": 0.206298828125, "learning_rate": 4.960418947454958e-06, "loss": 0.0021, "reward": 0.21916456520557404, "reward_std": 0.12000982835888863, "rewards/code_reward": 0.12407528422772884, "rewards/format_reward": 0.95089291036129, "step": 44 }, { "completion_length": 516.7455749511719, "epoch": 0.09554140127388536, "grad_norm": 0.9614177942276001, "kl": 0.203125, "learning_rate": 4.957650898021038e-06, "loss": 0.002, "reward": 0.26794980466365814, "reward_std": 0.14450966753065586, "rewards/code_reward": 0.17397657968103886, "rewards/format_reward": 0.9397321939468384, "step": 45 }, { "completion_length": 517.7343978881836, "epoch": 0.09766454352441614, "grad_norm": 0.2904169261455536, "kl": 0.17041015625, "learning_rate": 4.954790219976915e-06, "loss": 0.0017, "reward": 0.3067335784435272, "reward_std": 0.15805694833397865, "rewards/code_reward": 0.21186750568449497, "rewards/format_reward": 0.948660746216774, "step": 46 }, { "completion_length": 533.3594055175781, "epoch": 0.09978768577494693, "grad_norm": 0.25753694772720337, "kl": 0.126953125, "learning_rate": 4.95183703335091e-06, "loss": 0.0013, "reward": 0.22189904749393463, "reward_std": 0.13265508972108364, "rewards/code_reward": 0.12390796467661858, "rewards/format_reward": 0.9799107611179352, "step": 47 }, { "completion_length": 548.950927734375, "epoch": 0.10191082802547771, "grad_norm": 0.26259344816207886, "kl": 0.1424560546875, "learning_rate": 4.948791462052819e-06, "loss": 0.0014, "reward": 0.22812815010547638, "reward_std": 0.1622354220598936, "rewards/code_reward": 0.12991385161876678, "rewards/format_reward": 0.9821428954601288, "step": 48 }, { "completion_length": 571.6741485595703, "epoch": 0.1040339702760085, "grad_norm": 0.4155128300189972, "kl": 0.20263671875, "learning_rate": 4.945653633868716e-06, "loss": 0.0021, "reward": 0.24147583171725273, "reward_std": 0.1386658363044262, "rewards/code_reward": 0.1450472492724657, "rewards/format_reward": 0.964285746216774, "step": 49 }, { "completion_length": 534.6093978881836, "epoch": 0.10615711252653928, "grad_norm": 0.24680602550506592, "kl": 0.159912109375, "learning_rate": 4.942423680455584e-06, "loss": 0.0016, "reward": 0.2133147530257702, "reward_std": 0.14480553567409515, "rewards/code_reward": 0.11643974296748638, "rewards/format_reward": 0.9687500298023224, "step": 50 }, { "completion_length": 521.4888610839844, "epoch": 0.10828025477707007, "grad_norm": 0.27140846848487854, "kl": 0.172119140625, "learning_rate": 4.939101737335802e-06, "loss": 0.0017, "reward": 0.3708176761865616, "reward_std": 0.1698193922638893, "rewards/code_reward": 0.2730497941374779, "rewards/format_reward": 0.9776786118745804, "step": 51 }, { "completion_length": 550.3236846923828, "epoch": 0.11040339702760085, "grad_norm": 0.24256259202957153, "kl": 0.145751953125, "learning_rate": 4.935687943891447e-06, "loss": 0.0015, "reward": 0.30257678776979446, "reward_std": 0.1430999655276537, "rewards/code_reward": 0.2057017907500267, "rewards/format_reward": 0.9687500447034836, "step": 52 }, { "completion_length": 551.3772583007812, "epoch": 0.11252653927813164, "grad_norm": 0.2562994062900543, "kl": 0.16259765625, "learning_rate": 4.932182443358458e-06, "loss": 0.0016, "reward": 0.314239501953125, "reward_std": 0.21334025636315346, "rewards/code_reward": 0.21624841168522835, "rewards/format_reward": 0.979910746216774, "step": 53 }, { "completion_length": 553.4955596923828, "epoch": 0.11464968152866242, "grad_norm": 0.23835241794586182, "kl": 0.160888671875, "learning_rate": 4.928585382820616e-06, "loss": 0.0016, "reward": 0.25176869705319405, "reward_std": 0.11105065606534481, "rewards/code_reward": 0.1535544078797102, "rewards/format_reward": 0.9821428954601288, "step": 54 }, { "completion_length": 552.2366333007812, "epoch": 0.11677282377919321, "grad_norm": 0.2630121409893036, "kl": 0.1552734375, "learning_rate": 4.924896913203376e-06, "loss": 0.0016, "reward": 0.24022378027439117, "reward_std": 0.15625984594225883, "rewards/code_reward": 0.14133985061198473, "rewards/format_reward": 0.988839328289032, "step": 55 }, { "completion_length": 574.1295013427734, "epoch": 0.11889596602972399, "grad_norm": 0.3262800872325897, "kl": 0.1572265625, "learning_rate": 4.921117189267535e-06, "loss": 0.0016, "reward": 0.32679247856140137, "reward_std": 0.19292927533388138, "rewards/code_reward": 0.22991745918989182, "rewards/format_reward": 0.9687500298023224, "step": 56 }, { "completion_length": 541.9576110839844, "epoch": 0.12101910828025478, "grad_norm": 0.2467201203107834, "kl": 0.17578125, "learning_rate": 4.917246369602742e-06, "loss": 0.0018, "reward": 0.25976729951798916, "reward_std": 0.1260015396401286, "rewards/code_reward": 0.16110657062381506, "rewards/format_reward": 0.986607164144516, "step": 57 }, { "completion_length": 553.419677734375, "epoch": 0.12314225053078556, "grad_norm": 0.2763681709766388, "kl": 0.15478515625, "learning_rate": 4.9132846166208355e-06, "loss": 0.0016, "reward": 0.2834607996046543, "reward_std": 0.1603868044912815, "rewards/code_reward": 0.1852465160191059, "rewards/format_reward": 0.9821428954601288, "step": 58 }, { "completion_length": 542.2343826293945, "epoch": 0.12526539278131635, "grad_norm": 1.2009683847427368, "kl": 0.203125, "learning_rate": 4.9092320965490365e-06, "loss": 0.002, "reward": 0.36397186666727066, "reward_std": 0.20367462560534477, "rewards/code_reward": 0.26531114615499973, "rewards/format_reward": 0.9866071790456772, "step": 59 }, { "completion_length": 502.7076110839844, "epoch": 0.12738853503184713, "grad_norm": 0.291824609041214, "kl": 0.1533203125, "learning_rate": 4.905088979422971e-06, "loss": 0.0015, "reward": 0.33501066267490387, "reward_std": 0.17072956077754498, "rewards/code_reward": 0.23701957240700722, "rewards/format_reward": 0.979910746216774, "step": 60 }, { "completion_length": 512.5134201049805, "epoch": 0.12951167728237792, "grad_norm": 0.2763117849826813, "kl": 0.1837158203125, "learning_rate": 4.900855439079536e-06, "loss": 0.0019, "reward": 0.3404688164591789, "reward_std": 0.19662801921367645, "rewards/code_reward": 0.2453795075416565, "rewards/format_reward": 0.9508928954601288, "step": 61 }, { "completion_length": 526.8995819091797, "epoch": 0.1316348195329087, "grad_norm": 0.2876502275466919, "kl": 0.19580078125, "learning_rate": 4.8965316531496055e-06, "loss": 0.002, "reward": 0.2866082601249218, "reward_std": 0.16614584252238274, "rewards/code_reward": 0.19129573553800583, "rewards/format_reward": 0.9531250447034836, "step": 62 }, { "completion_length": 593.2924499511719, "epoch": 0.1337579617834395, "grad_norm": 2.4047532081604004, "kl": 0.41357421875, "learning_rate": 4.892117803050578e-06, "loss": 0.0041, "reward": 0.2631051279604435, "reward_std": 0.2128530964255333, "rewards/code_reward": 0.17359617352485657, "rewards/format_reward": 0.895089328289032, "step": 63 }, { "completion_length": 563.3437805175781, "epoch": 0.13588110403397027, "grad_norm": 0.2846791446208954, "kl": 0.197509765625, "learning_rate": 4.887614073978761e-06, "loss": 0.002, "reward": 0.2669316381216049, "reward_std": 0.14747418276965618, "rewards/code_reward": 0.17630662396550179, "rewards/format_reward": 0.9062500447034836, "step": 64 }, { "completion_length": 535.6183319091797, "epoch": 0.13800424628450106, "grad_norm": 0.2759235203266144, "kl": 0.186767578125, "learning_rate": 4.883020654901609e-06, "loss": 0.0019, "reward": 0.28016526997089386, "reward_std": 0.17947101965546608, "rewards/code_reward": 0.18730811774730682, "rewards/format_reward": 0.928571492433548, "step": 65 }, { "completion_length": 616.9732360839844, "epoch": 0.14012738853503184, "grad_norm": 0.26271936297416687, "kl": 0.23974609375, "learning_rate": 4.878337738549785e-06, "loss": 0.0024, "reward": 0.23576084896922112, "reward_std": 0.18645637948065996, "rewards/code_reward": 0.1466983389109373, "rewards/format_reward": 0.8906250447034836, "step": 66 }, { "completion_length": 591.013427734375, "epoch": 0.14225053078556263, "grad_norm": 0.2741730511188507, "kl": 0.220947265625, "learning_rate": 4.873565521409082e-06, "loss": 0.0023, "reward": 0.2887257859110832, "reward_std": 0.15980570390820503, "rewards/code_reward": 0.20055612176656723, "rewards/format_reward": 0.8816964775323868, "step": 67 }, { "completion_length": 567.3995819091797, "epoch": 0.14437367303609341, "grad_norm": 0.30026066303253174, "kl": 0.196533203125, "learning_rate": 4.868704203712173e-06, "loss": 0.002, "reward": 0.2695513255894184, "reward_std": 0.13891723938286304, "rewards/code_reward": 0.18361380137503147, "rewards/format_reward": 0.8593750447034836, "step": 68 }, { "completion_length": 557.5424346923828, "epoch": 0.1464968152866242, "grad_norm": 0.26508811116218567, "kl": 0.2275390625, "learning_rate": 4.86375398943021e-06, "loss": 0.0023, "reward": 0.2681450620293617, "reward_std": 0.15566366165876389, "rewards/code_reward": 0.17618075758218765, "rewards/format_reward": 0.91964291036129, "step": 69 }, { "completion_length": 567.8192291259766, "epoch": 0.14861995753715498, "grad_norm": 0.2842702567577362, "kl": 0.206787109375, "learning_rate": 4.858715086264274e-06, "loss": 0.0021, "reward": 0.3179836943745613, "reward_std": 0.17460669204592705, "rewards/code_reward": 0.2246801033616066, "rewards/format_reward": 0.9330357611179352, "step": 70 }, { "completion_length": 537.6094055175781, "epoch": 0.15074309978768577, "grad_norm": 0.2676061689853668, "kl": 0.208740234375, "learning_rate": 4.853587705636646e-06, "loss": 0.0021, "reward": 0.29776863381266594, "reward_std": 0.16130083054304123, "rewards/code_reward": 0.2037954218685627, "rewards/format_reward": 0.9397321790456772, "step": 71 }, { "completion_length": 575.2143096923828, "epoch": 0.15286624203821655, "grad_norm": 0.2367551475763321, "kl": 0.189208984375, "learning_rate": 4.84837206268195e-06, "loss": 0.0019, "reward": 0.23160668835043907, "reward_std": 0.13609608635306358, "rewards/code_reward": 0.13785668183118105, "rewards/format_reward": 0.9375000447034836, "step": 72 }, { "completion_length": 615.6518096923828, "epoch": 0.15498938428874734, "grad_norm": 0.2558582127094269, "kl": 0.199462890625, "learning_rate": 4.8430683762381195e-06, "loss": 0.002, "reward": 0.3226686045527458, "reward_std": 0.18408508598804474, "rewards/code_reward": 0.22847215831279755, "rewards/format_reward": 0.9419643133878708, "step": 73 }, { "completion_length": 581.3549499511719, "epoch": 0.15711252653927812, "grad_norm": 0.25004705786705017, "kl": 0.2255859375, "learning_rate": 4.837676868837213e-06, "loss": 0.0023, "reward": 0.3521072790026665, "reward_std": 0.18179307878017426, "rewards/code_reward": 0.2556787021458149, "rewards/format_reward": 0.9642857611179352, "step": 74 }, { "completion_length": 615.6384124755859, "epoch": 0.1592356687898089, "grad_norm": 0.2326764315366745, "kl": 0.184326171875, "learning_rate": 4.832197766696085e-06, "loss": 0.002, "reward": 0.3262624219059944, "reward_std": 0.13872519508004189, "rewards/code_reward": 0.22827134653925896, "rewards/format_reward": 0.9799107611179352, "step": 75 }, { "completion_length": 626.1919708251953, "epoch": 0.1613588110403397, "grad_norm": 0.22483916580677032, "kl": 0.2158203125, "learning_rate": 4.826631299706887e-06, "loss": 0.0022, "reward": 0.24266962707042694, "reward_std": 0.15623858594335616, "rewards/code_reward": 0.14579462260007858, "rewards/format_reward": 0.9687500447034836, "step": 76 }, { "completion_length": 638.6696624755859, "epoch": 0.16348195329087048, "grad_norm": 0.2549837827682495, "kl": 0.2041015625, "learning_rate": 4.820977701427424e-06, "loss": 0.002, "reward": 0.3548019379377365, "reward_std": 0.19029108062386513, "rewards/code_reward": 0.2577037066221237, "rewards/format_reward": 0.9709821939468384, "step": 77 }, { "completion_length": 645.8727874755859, "epoch": 0.16560509554140126, "grad_norm": 0.2154364138841629, "kl": 0.21875, "learning_rate": 4.81523720907136e-06, "loss": 0.0022, "reward": 0.23285862803459167, "reward_std": 0.129691231995821, "rewards/code_reward": 0.13531397026963532, "rewards/format_reward": 0.975446492433548, "step": 78 }, { "completion_length": 631.9732513427734, "epoch": 0.16772823779193205, "grad_norm": 0.22862014174461365, "kl": 0.21435546875, "learning_rate": 4.809410063498254e-06, "loss": 0.0022, "reward": 0.33913441002368927, "reward_std": 0.1570077408105135, "rewards/code_reward": 0.2411433346569538, "rewards/format_reward": 0.9799107760190964, "step": 79 }, { "completion_length": 650.060302734375, "epoch": 0.16985138004246284, "grad_norm": 0.2403189241886139, "kl": 0.218994140625, "learning_rate": 4.8034965092034656e-06, "loss": 0.0022, "reward": 0.2641909271478653, "reward_std": 0.15404854156076908, "rewards/code_reward": 0.16664628125727177, "rewards/format_reward": 0.9754464626312256, "step": 80 }, { "completion_length": 665.2299346923828, "epoch": 0.17197452229299362, "grad_norm": 0.22183021903038025, "kl": 0.17236328125, "learning_rate": 4.797496794307889e-06, "loss": 0.0017, "reward": 0.26636216044425964, "reward_std": 0.159404331818223, "rewards/code_reward": 0.16814786568284035, "rewards/format_reward": 0.9821428954601288, "step": 81 }, { "completion_length": 646.7277069091797, "epoch": 0.1740976645435244, "grad_norm": 0.22360101342201233, "kl": 0.1865234375, "learning_rate": 4.791411170547545e-06, "loss": 0.0019, "reward": 0.2806714288890362, "reward_std": 0.1345765646547079, "rewards/code_reward": 0.1829035673290491, "rewards/format_reward": 0.9776786267757416, "step": 82 }, { "completion_length": 649.9285888671875, "epoch": 0.1762208067940552, "grad_norm": 0.2544306218624115, "kl": 0.173583984375, "learning_rate": 4.785239893263017e-06, "loss": 0.0017, "reward": 0.26558100432157516, "reward_std": 0.13677635975182056, "rewards/code_reward": 0.16825956851243973, "rewards/format_reward": 0.9732143431901932, "step": 83 }, { "completion_length": 686.3661041259766, "epoch": 0.17834394904458598, "grad_norm": 0.21967419981956482, "kl": 0.16162109375, "learning_rate": 4.778983221388742e-06, "loss": 0.0016, "reward": 0.24129238724708557, "reward_std": 0.1258857063949108, "rewards/code_reward": 0.14330130256712437, "rewards/format_reward": 0.9799107760190964, "step": 84 }, { "completion_length": 634.2634124755859, "epoch": 0.18046709129511676, "grad_norm": 0.254304975271225, "kl": 0.17724609375, "learning_rate": 4.77264141744214e-06, "loss": 0.0018, "reward": 0.3212145194411278, "reward_std": 0.1864020749926567, "rewards/code_reward": 0.2236698605120182, "rewards/format_reward": 0.9754464775323868, "step": 85 }, { "completion_length": 641.8393096923828, "epoch": 0.18259023354564755, "grad_norm": 0.24272438883781433, "kl": 0.19873046875, "learning_rate": 4.766214747512603e-06, "loss": 0.002, "reward": 0.31076986342668533, "reward_std": 0.18200884014368057, "rewards/code_reward": 0.2134484425187111, "rewards/format_reward": 0.973214328289032, "step": 86 }, { "completion_length": 643.4263763427734, "epoch": 0.18471337579617833, "grad_norm": 0.2264644354581833, "kl": 0.185791015625, "learning_rate": 4.759703481250331e-06, "loss": 0.0019, "reward": 0.3214620351791382, "reward_std": 0.14903312921524048, "rewards/code_reward": 0.22347095608711243, "rewards/format_reward": 0.979910746216774, "step": 87 }, { "completion_length": 649.5580749511719, "epoch": 0.18683651804670912, "grad_norm": 0.22847051918506622, "kl": 0.169677734375, "learning_rate": 4.753107891855015e-06, "loss": 0.0018, "reward": 0.25390685349702835, "reward_std": 0.12118050269782543, "rewards/code_reward": 0.15680862963199615, "rewards/format_reward": 0.9709821939468384, "step": 88 }, { "completion_length": 652.5513610839844, "epoch": 0.18895966029723993, "grad_norm": 0.22527199983596802, "kl": 0.19580078125, "learning_rate": 4.746428256064375e-06, "loss": 0.002, "reward": 0.303693201392889, "reward_std": 0.1710791066288948, "rewards/code_reward": 0.20525570400059223, "rewards/format_reward": 0.9843750298023224, "step": 89 }, { "completion_length": 693.4174499511719, "epoch": 0.1910828025477707, "grad_norm": 0.2056378573179245, "kl": 0.17041015625, "learning_rate": 4.7396648541425534e-06, "loss": 0.0017, "reward": 0.2523197568953037, "reward_std": 0.1238141655921936, "rewards/code_reward": 0.15432866849005222, "rewards/format_reward": 0.979910746216774, "step": 90 }, { "completion_length": 679.5580596923828, "epoch": 0.1932059447983015, "grad_norm": 0.225660502910614, "kl": 0.175537109375, "learning_rate": 4.732817969868348e-06, "loss": 0.0018, "reward": 0.25567496195435524, "reward_std": 0.16754142567515373, "rewards/code_reward": 0.15813031047582626, "rewards/format_reward": 0.9754464626312256, "step": 91 }, { "completion_length": 627.4464569091797, "epoch": 0.19532908704883228, "grad_norm": 0.24276329576969147, "kl": 0.1419677734375, "learning_rate": 4.7258878905233095e-06, "loss": 0.0014, "reward": 0.3579171895980835, "reward_std": 0.21506508812308311, "rewards/code_reward": 0.25992610678076744, "rewards/format_reward": 0.9799107611179352, "step": 92 }, { "completion_length": 664.091552734375, "epoch": 0.19745222929936307, "grad_norm": 0.2459368258714676, "kl": 0.17431640625, "learning_rate": 4.718874906879688e-06, "loss": 0.0017, "reward": 0.25773513317108154, "reward_std": 0.16034462675452232, "rewards/code_reward": 0.16130654886364937, "rewards/format_reward": 0.964285746216774, "step": 93 }, { "completion_length": 641.9576110839844, "epoch": 0.19957537154989385, "grad_norm": 0.20433549582958221, "kl": 0.135986328125, "learning_rate": 4.711779313188231e-06, "loss": 0.0014, "reward": 0.31772880256175995, "reward_std": 0.12460769526660442, "rewards/code_reward": 0.218844847753644, "rewards/format_reward": 0.988839328289032, "step": 94 }, { "completion_length": 668.7522430419922, "epoch": 0.20169851380042464, "grad_norm": 0.228180393576622, "kl": 0.1337890625, "learning_rate": 4.70460140716584e-06, "loss": 0.0014, "reward": 0.23017888888716698, "reward_std": 0.16307671833783388, "rewards/code_reward": 0.13196459133177996, "rewards/format_reward": 0.9821428954601288, "step": 95 }, { "completion_length": 666.0960083007812, "epoch": 0.20382165605095542, "grad_norm": 0.23849396407604218, "kl": 0.132568359375, "learning_rate": 4.697341489983076e-06, "loss": 0.0013, "reward": 0.38449443876743317, "reward_std": 0.205027487128973, "rewards/code_reward": 0.2869497686624527, "rewards/format_reward": 0.9754464626312256, "step": 96 }, { "completion_length": 638.5513763427734, "epoch": 0.2059447983014862, "grad_norm": 0.23833003640174866, "kl": 0.1219482421875, "learning_rate": 4.6899998662515215e-06, "loss": 0.0012, "reward": 0.30101777240633965, "reward_std": 0.18449735268950462, "rewards/code_reward": 0.20235705375671387, "rewards/format_reward": 0.9866071790456772, "step": 97 }, { "completion_length": 638.1161041259766, "epoch": 0.208067940552017, "grad_norm": 0.21731068193912506, "kl": 0.146484375, "learning_rate": 4.682576844011007e-06, "loss": 0.0015, "reward": 0.2744937762618065, "reward_std": 0.16123195737600327, "rewards/code_reward": 0.17650271020829678, "rewards/format_reward": 0.979910746216774, "step": 98 }, { "completion_length": 616.8861846923828, "epoch": 0.21019108280254778, "grad_norm": 0.25706782937049866, "kl": 0.134033203125, "learning_rate": 4.675072734716678e-06, "loss": 0.0013, "reward": 0.27044272795319557, "reward_std": 0.17698625475168228, "rewards/code_reward": 0.17245164141058922, "rewards/format_reward": 0.9799107313156128, "step": 99 }, { "completion_length": 617.6875305175781, "epoch": 0.21231422505307856, "grad_norm": 0.23481673002243042, "kl": 0.123291015625, "learning_rate": 4.667487853225931e-06, "loss": 0.0013, "reward": 0.27581261470913887, "reward_std": 0.1309206485748291, "rewards/code_reward": 0.17692868784070015, "rewards/format_reward": 0.988839328289032, "step": 100 }, { "completion_length": 649.1205749511719, "epoch": 0.21443736730360935, "grad_norm": 0.22034895420074463, "kl": 0.1197509765625, "learning_rate": 4.659822517785203e-06, "loss": 0.0012, "reward": 0.3144006244838238, "reward_std": 0.1468491405248642, "rewards/code_reward": 0.21529346704483032, "rewards/format_reward": 0.9910714477300644, "step": 101 }, { "completion_length": 637.7120819091797, "epoch": 0.21656050955414013, "grad_norm": 0.23196153342723846, "kl": 0.1163330078125, "learning_rate": 4.6520770500166165e-06, "loss": 0.0012, "reward": 0.2747727185487747, "reward_std": 0.15404804423451424, "rewards/code_reward": 0.17678163386881351, "rewards/format_reward": 0.979910746216774, "step": 102 }, { "completion_length": 640.4375305175781, "epoch": 0.21868365180467092, "grad_norm": 0.21843470633029938, "kl": 0.111083984375, "learning_rate": 4.644251774904487e-06, "loss": 0.0012, "reward": 0.2366674654185772, "reward_std": 0.12331773899495602, "rewards/code_reward": 0.13889960199594498, "rewards/format_reward": 0.9776786267757416, "step": 103 }, { "completion_length": 635.5982513427734, "epoch": 0.2208067940552017, "grad_norm": 0.2491585612297058, "kl": 0.1253662109375, "learning_rate": 4.636347020781684e-06, "loss": 0.0013, "reward": 0.26541591063141823, "reward_std": 0.20751060917973518, "rewards/code_reward": 0.16876413114368916, "rewards/format_reward": 0.9665178954601288, "step": 104 }, { "completion_length": 636.6919860839844, "epoch": 0.2229299363057325, "grad_norm": 0.22667376697063446, "kl": 0.1239013671875, "learning_rate": 4.6283631193158605e-06, "loss": 0.0013, "reward": 0.28487036004662514, "reward_std": 0.16408125311136246, "rewards/code_reward": 0.18732571229338646, "rewards/format_reward": 0.9754464775323868, "step": 105 }, { "completion_length": 645.0491333007812, "epoch": 0.22505307855626328, "grad_norm": 0.2283681333065033, "kl": 0.124267578125, "learning_rate": 4.620300405495532e-06, "loss": 0.0013, "reward": 0.2775597535073757, "reward_std": 0.15426970086991787, "rewards/code_reward": 0.17867580242455006, "rewards/format_reward": 0.9888393133878708, "step": 106 }, { "completion_length": 633.8794860839844, "epoch": 0.22717622080679406, "grad_norm": 0.24237921833992004, "kl": 0.1158447265625, "learning_rate": 4.612159217616022e-06, "loss": 0.0012, "reward": 0.3130115121603012, "reward_std": 0.20111830905079842, "rewards/code_reward": 0.21502043306827545, "rewards/format_reward": 0.979910746216774, "step": 107 }, { "completion_length": 607.8995666503906, "epoch": 0.22929936305732485, "grad_norm": 0.22627882659435272, "kl": 0.1114501953125, "learning_rate": 4.603939897265268e-06, "loss": 0.0011, "reward": 0.2647922486066818, "reward_std": 0.13070931658148766, "rewards/code_reward": 0.16546186804771423, "rewards/format_reward": 0.9933035969734192, "step": 108 }, { "completion_length": 611.9308166503906, "epoch": 0.23142250530785563, "grad_norm": 0.24682241678237915, "kl": 0.11474609375, "learning_rate": 4.595642789309492e-06, "loss": 0.0012, "reward": 0.24479227885603905, "reward_std": 0.14851071499288082, "rewards/code_reward": 0.14657797291874886, "rewards/format_reward": 0.9821428954601288, "step": 109 }, { "completion_length": 601.9018096923828, "epoch": 0.23354564755838642, "grad_norm": 0.22991596162319183, "kl": 0.1337890625, "learning_rate": 4.587268241878724e-06, "loss": 0.0014, "reward": 0.3472997844219208, "reward_std": 0.20232820883393288, "rewards/code_reward": 0.24953191354870796, "rewards/format_reward": 0.9776785969734192, "step": 110 }, { "completion_length": 619.3036041259766, "epoch": 0.2356687898089172, "grad_norm": 0.23583151400089264, "kl": 0.142822265625, "learning_rate": 4.578816606352205e-06, "loss": 0.0014, "reward": 0.29563019424676895, "reward_std": 0.17909668013453484, "rewards/code_reward": 0.1987551935017109, "rewards/format_reward": 0.9687500298023224, "step": 111 }, { "completion_length": 576.4419860839844, "epoch": 0.23779193205944799, "grad_norm": 0.2594500780105591, "kl": 0.11865234375, "learning_rate": 4.570288237343632e-06, "loss": 0.0012, "reward": 0.37235086783766747, "reward_std": 0.21180756203830242, "rewards/code_reward": 0.27346691489219666, "rewards/format_reward": 0.9888393133878708, "step": 112 }, { "completion_length": 600.5937805175781, "epoch": 0.23991507430997877, "grad_norm": 0.24643369019031525, "kl": 0.1270751953125, "learning_rate": 4.561683492686289e-06, "loss": 0.0013, "reward": 0.31715739518404007, "reward_std": 0.18861495703458786, "rewards/code_reward": 0.21871986612677574, "rewards/format_reward": 0.9843750596046448, "step": 113 }, { "completion_length": 587.732177734375, "epoch": 0.24203821656050956, "grad_norm": 0.23611418902873993, "kl": 0.1268310546875, "learning_rate": 4.5530027334180285e-06, "loss": 0.0013, "reward": 0.26467062532901764, "reward_std": 0.17901071533560753, "rewards/code_reward": 0.16712596639990807, "rewards/format_reward": 0.9754464775323868, "step": 114 }, { "completion_length": 599.2254791259766, "epoch": 0.24416135881104034, "grad_norm": 0.24544627964496613, "kl": 0.1339111328125, "learning_rate": 4.544246323766122e-06, "loss": 0.0014, "reward": 0.27841826155781746, "reward_std": 0.16098117642104626, "rewards/code_reward": 0.18132001720368862, "rewards/format_reward": 0.9709821939468384, "step": 115 }, { "completion_length": 570.107177734375, "epoch": 0.24628450106157113, "grad_norm": 0.25169771909713745, "kl": 0.130859375, "learning_rate": 4.535414631131983e-06, "loss": 0.0013, "reward": 0.34019989520311356, "reward_std": 0.235354982316494, "rewards/code_reward": 0.2422088049352169, "rewards/format_reward": 0.9799107611179352, "step": 116 }, { "completion_length": 586.2834930419922, "epoch": 0.2484076433121019, "grad_norm": 0.2503570318222046, "kl": 0.1285400390625, "learning_rate": 4.526508026075746e-06, "loss": 0.0013, "reward": 0.33243585377931595, "reward_std": 0.15851835533976555, "rewards/code_reward": 0.23310547694563866, "rewards/format_reward": 0.9933035969734192, "step": 117 }, { "completion_length": 619.6964569091797, "epoch": 0.2505307855626327, "grad_norm": 0.2021104097366333, "kl": 0.1275634765625, "learning_rate": 4.517526882300721e-06, "loss": 0.0013, "reward": 0.1987566240131855, "reward_std": 0.12675911094993353, "rewards/code_reward": 0.10143518354743719, "rewards/format_reward": 0.973214328289032, "step": 118 }, { "completion_length": 577.7500305175781, "epoch": 0.2526539278131635, "grad_norm": 0.23152245581150055, "kl": 0.139404296875, "learning_rate": 4.508471576637713e-06, "loss": 0.0014, "reward": 0.24329132214188576, "reward_std": 0.16570740193128586, "rewards/code_reward": 0.14485381357371807, "rewards/format_reward": 0.9843750447034836, "step": 119 }, { "completion_length": 599.3102874755859, "epoch": 0.25477707006369427, "grad_norm": 0.23706002533435822, "kl": 0.1292724609375, "learning_rate": 4.499342489029211e-06, "loss": 0.0013, "reward": 0.24242350459098816, "reward_std": 0.14784781634807587, "rewards/code_reward": 0.14398599043488503, "rewards/format_reward": 0.9843750298023224, "step": 120 }, { "completion_length": 572.8839569091797, "epoch": 0.25690021231422505, "grad_norm": 0.2494058609008789, "kl": 0.1270751953125, "learning_rate": 4.490140002513449e-06, "loss": 0.0013, "reward": 0.26072419434785843, "reward_std": 0.12450610846281052, "rewards/code_reward": 0.16117061115801334, "rewards/format_reward": 0.9955357313156128, "step": 121 }, { "completion_length": 601.8080749511719, "epoch": 0.25902335456475584, "grad_norm": 0.23028254508972168, "kl": 0.1180419921875, "learning_rate": 4.48086450320833e-06, "loss": 0.0012, "reward": 0.3514738455414772, "reward_std": 0.16258227452635765, "rewards/code_reward": 0.2525899298489094, "rewards/format_reward": 0.988839328289032, "step": 122 }, { "completion_length": 590.4040374755859, "epoch": 0.2611464968152866, "grad_norm": 0.24208419024944305, "kl": 0.1234130859375, "learning_rate": 4.4715163802952266e-06, "loss": 0.0012, "reward": 0.3460327610373497, "reward_std": 0.1636445987969637, "rewards/code_reward": 0.24647919461131096, "rewards/format_reward": 0.9955357313156128, "step": 123 }, { "completion_length": 609.7098388671875, "epoch": 0.2632696390658174, "grad_norm": 0.253397136926651, "kl": 0.135009765625, "learning_rate": 4.462096026002655e-06, "loss": 0.0014, "reward": 0.25145725160837173, "reward_std": 0.16506105288863182, "rewards/code_reward": 0.1530197374522686, "rewards/format_reward": 0.9843750298023224, "step": 124 }, { "completion_length": 603.8973388671875, "epoch": 0.2653927813163482, "grad_norm": 0.2500688135623932, "kl": 0.1434326171875, "learning_rate": 4.4526038355898144e-06, "loss": 0.0015, "reward": 0.3970717117190361, "reward_std": 0.2130543477833271, "rewards/code_reward": 0.29908062517642975, "rewards/format_reward": 0.979910746216774, "step": 125 }, { "completion_length": 615.4620819091797, "epoch": 0.267515923566879, "grad_norm": 0.20834827423095703, "kl": 0.1336669921875, "learning_rate": 4.4430402073300035e-06, "loss": 0.0014, "reward": 0.26642825454473495, "reward_std": 0.1255171401426196, "rewards/code_reward": 0.16799074038863182, "rewards/format_reward": 0.9843750447034836, "step": 126 }, { "completion_length": 617.9687805175781, "epoch": 0.26963906581740976, "grad_norm": 0.23262540996074677, "kl": 0.1351318359375, "learning_rate": 4.433405542493909e-06, "loss": 0.0014, "reward": 0.2870429456233978, "reward_std": 0.19062896817922592, "rewards/code_reward": 0.18838223442435265, "rewards/format_reward": 0.9866071939468384, "step": 127 }, { "completion_length": 662.2053833007812, "epoch": 0.27176220806794055, "grad_norm": 0.22548751533031464, "kl": 0.1197509765625, "learning_rate": 4.4237002453327734e-06, "loss": 0.0013, "reward": 0.30225350335240364, "reward_std": 0.1395848747342825, "rewards/code_reward": 0.203146331012249, "rewards/format_reward": 0.9910714477300644, "step": 128 }, { "completion_length": 632.0022430419922, "epoch": 0.27388535031847133, "grad_norm": 0.24719858169555664, "kl": 0.131103515625, "learning_rate": 4.4139247230614245e-06, "loss": 0.0013, "reward": 0.32878731191158295, "reward_std": 0.16303380951285362, "rewards/code_reward": 0.22968016006052494, "rewards/format_reward": 0.9910714477300644, "step": 129 }, { "completion_length": 636.7299346923828, "epoch": 0.2760084925690021, "grad_norm": 0.22243919968605042, "kl": 0.1234130859375, "learning_rate": 4.404079385841201e-06, "loss": 0.0013, "reward": 0.30703118816018105, "reward_std": 0.12942655384540558, "rewards/code_reward": 0.20792402233928442, "rewards/format_reward": 0.9910714626312256, "step": 130 }, { "completion_length": 644.5468902587891, "epoch": 0.2781316348195329, "grad_norm": 0.220564067363739, "kl": 0.123291015625, "learning_rate": 4.394164646762734e-06, "loss": 0.0013, "reward": 0.296065516769886, "reward_std": 0.18630750849843025, "rewards/code_reward": 0.19673514552414417, "rewards/format_reward": 0.9933035969734192, "step": 131 }, { "completion_length": 666.997802734375, "epoch": 0.2802547770700637, "grad_norm": 0.22151753306388855, "kl": 0.1304931640625, "learning_rate": 4.384180921828618e-06, "loss": 0.0013, "reward": 0.3110230341553688, "reward_std": 0.1834610104560852, "rewards/code_reward": 0.21370159462094307, "rewards/format_reward": 0.973214328289032, "step": 132 }, { "completion_length": 665.1205749511719, "epoch": 0.2823779193205945, "grad_norm": 0.21862035989761353, "kl": 0.1168212890625, "learning_rate": 4.374128629935955e-06, "loss": 0.0012, "reward": 0.2876487486064434, "reward_std": 0.21351643651723862, "rewards/code_reward": 0.18965766951441765, "rewards/format_reward": 0.9799107611179352, "step": 133 }, { "completion_length": 703.575927734375, "epoch": 0.28450106157112526, "grad_norm": 0.23025038838386536, "kl": 0.1204833984375, "learning_rate": 4.364008192858781e-06, "loss": 0.0013, "reward": 0.37238020449876785, "reward_std": 0.17016195878386497, "rewards/code_reward": 0.2737194746732712, "rewards/format_reward": 0.986607164144516, "step": 134 }, { "completion_length": 726.8817138671875, "epoch": 0.28662420382165604, "grad_norm": 0.21229737997055054, "kl": 0.121337890625, "learning_rate": 4.353820035230366e-06, "loss": 0.0012, "reward": 0.20391739904880524, "reward_std": 0.13868718035519123, "rewards/code_reward": 0.10525666922330856, "rewards/format_reward": 0.986607164144516, "step": 135 }, { "completion_length": 712.2098541259766, "epoch": 0.28874734607218683, "grad_norm": 0.2108394205570221, "kl": 0.1138916015625, "learning_rate": 4.3435645845254e-06, "loss": 0.0012, "reward": 0.3125154785811901, "reward_std": 0.1781605463474989, "rewards/code_reward": 0.2134083015844226, "rewards/format_reward": 0.9910714626312256, "step": 136 }, { "completion_length": 718.5893249511719, "epoch": 0.2908704883227176, "grad_norm": 0.2107391357421875, "kl": 0.1195068359375, "learning_rate": 4.333242271042054e-06, "loss": 0.0012, "reward": 0.3177960254251957, "reward_std": 0.1659994050860405, "rewards/code_reward": 0.2186888586729765, "rewards/format_reward": 0.9910714626312256, "step": 137 }, { "completion_length": 739.7545013427734, "epoch": 0.2929936305732484, "grad_norm": 0.2182048112154007, "kl": 0.124755859375, "learning_rate": 4.32285352788393e-06, "loss": 0.0013, "reward": 0.30886589735746384, "reward_std": 0.1802590098232031, "rewards/code_reward": 0.21020517125725746, "rewards/format_reward": 0.9866071939468384, "step": 138 }, { "completion_length": 765.857177734375, "epoch": 0.2951167728237792, "grad_norm": 0.19854100048542023, "kl": 0.115234375, "learning_rate": 4.312398790941882e-06, "loss": 0.0012, "reward": 0.3000107705593109, "reward_std": 0.15882322564721107, "rewards/code_reward": 0.201126828789711, "rewards/format_reward": 0.988839328289032, "step": 139 }, { "completion_length": 721.9687805175781, "epoch": 0.29723991507430997, "grad_norm": 0.23453111946582794, "kl": 0.116455078125, "learning_rate": 4.301878498875735e-06, "loss": 0.0012, "reward": 0.33861320093274117, "reward_std": 0.1569173000752926, "rewards/code_reward": 0.2401756690815091, "rewards/format_reward": 0.9843750298023224, "step": 140 }, { "completion_length": 740.2031707763672, "epoch": 0.29936305732484075, "grad_norm": 0.21566148102283478, "kl": 0.1102294921875, "learning_rate": 4.291293093095873e-06, "loss": 0.0011, "reward": 0.3095410466194153, "reward_std": 0.1992884911596775, "rewards/code_reward": 0.2108803205192089, "rewards/format_reward": 0.9866071790456772, "step": 141 }, { "completion_length": 710.2969207763672, "epoch": 0.30148619957537154, "grad_norm": 0.22105751931667328, "kl": 0.12060546875, "learning_rate": 4.280643017744723e-06, "loss": 0.0013, "reward": 0.36906543001532555, "reward_std": 0.21653805300593376, "rewards/code_reward": 0.2704046741127968, "rewards/format_reward": 0.9866071790456772, "step": 142 }, { "completion_length": 751.5937805175781, "epoch": 0.3036093418259023, "grad_norm": 0.23819085955619812, "kl": 0.1221923828125, "learning_rate": 4.269928719678117e-06, "loss": 0.0012, "reward": 0.25049133971333504, "reward_std": 0.17505915835499763, "rewards/code_reward": 0.15160739235579967, "rewards/format_reward": 0.988839328289032, "step": 143 }, { "completion_length": 735.9107513427734, "epoch": 0.3057324840764331, "grad_norm": 0.2204020470380783, "kl": 0.1229248046875, "learning_rate": 4.2591506484465426e-06, "loss": 0.0012, "reward": 0.26853859797120094, "reward_std": 0.16319206822663546, "rewards/code_reward": 0.17032429203391075, "rewards/format_reward": 0.9821428805589676, "step": 144 }, { "completion_length": 730.8906707763672, "epoch": 0.3078556263269639, "grad_norm": 0.23124827444553375, "kl": 0.119140625, "learning_rate": 4.248309256276283e-06, "loss": 0.0012, "reward": 0.34641416370868683, "reward_std": 0.15815678425133228, "rewards/code_reward": 0.2479766495525837, "rewards/format_reward": 0.9843750596046448, "step": 145 }, { "completion_length": 772.8192291259766, "epoch": 0.3099787685774947, "grad_norm": 0.21077241003513336, "kl": 0.1168212890625, "learning_rate": 4.23740499805044e-06, "loss": 0.0012, "reward": 0.2558128647506237, "reward_std": 0.12367029674351215, "rewards/code_reward": 0.15804500319063663, "rewards/format_reward": 0.9776786267757416, "step": 146 }, { "completion_length": 740.8214721679688, "epoch": 0.31210191082802546, "grad_norm": 0.21619708836078644, "kl": 0.125732421875, "learning_rate": 4.22643833128985e-06, "loss": 0.0013, "reward": 0.33286692947149277, "reward_std": 0.211603332310915, "rewards/code_reward": 0.2342061996459961, "rewards/format_reward": 0.9866071939468384, "step": 147 }, { "completion_length": 807.6071624755859, "epoch": 0.31422505307855625, "grad_norm": 0.2130916714668274, "kl": 0.1212158203125, "learning_rate": 4.215409716133885e-06, "loss": 0.0012, "reward": 0.3038931153714657, "reward_std": 0.19640244916081429, "rewards/code_reward": 0.2065716814249754, "rewards/format_reward": 0.973214328289032, "step": 148 }, { "completion_length": 757.3281555175781, "epoch": 0.31634819532908703, "grad_norm": 0.21959362924098969, "kl": 0.12744140625, "learning_rate": 4.204319615321151e-06, "loss": 0.0013, "reward": 0.35224368050694466, "reward_std": 0.1676900666207075, "rewards/code_reward": 0.2542525976896286, "rewards/format_reward": 0.9799107611179352, "step": 149 }, { "completion_length": 748.4687805175781, "epoch": 0.3184713375796178, "grad_norm": 0.22287501394748688, "kl": 0.120361328125, "learning_rate": 4.193168494170065e-06, "loss": 0.0012, "reward": 0.34373533725738525, "reward_std": 0.18135884031653404, "rewards/code_reward": 0.2457442507147789, "rewards/format_reward": 0.9799107611179352, "step": 150 }, { "completion_length": 755.8906707763672, "epoch": 0.3205944798301486, "grad_norm": 1.9565397500991821, "kl": 0.2320556640625, "learning_rate": 4.181956820559339e-06, "loss": 0.0023, "reward": 0.3970649391412735, "reward_std": 0.2402110919356346, "rewards/code_reward": 0.2992970943450928, "rewards/format_reward": 0.9776786118745804, "step": 151 }, { "completion_length": 779.8549499511719, "epoch": 0.3227176220806794, "grad_norm": 0.2783929109573364, "kl": 0.15283203125, "learning_rate": 4.170685064908342e-06, "loss": 0.0016, "reward": 0.19563322141766548, "reward_std": 0.12617591954767704, "rewards/code_reward": 0.11817785818129778, "rewards/format_reward": 0.7745535969734192, "step": 152 }, { "completion_length": 726.2210083007812, "epoch": 0.3248407643312102, "grad_norm": 0.3019232451915741, "kl": 0.157470703125, "learning_rate": 4.159353700157365e-06, "loss": 0.0016, "reward": 0.17416437342762947, "reward_std": 0.18693338334560394, "rewards/code_reward": 0.14961079927161336, "rewards/format_reward": 0.2455357201397419, "step": 153 }, { "completion_length": 698.8303833007812, "epoch": 0.32696390658174096, "grad_norm": 0.2774558365345001, "kl": 0.1429443359375, "learning_rate": 4.14796320174778e-06, "loss": 0.0014, "reward": 0.16863043326884508, "reward_std": 0.1559329554438591, "rewards/code_reward": 0.15412150975316763, "rewards/format_reward": 0.145089291036129, "step": 154 }, { "completion_length": 675.5960083007812, "epoch": 0.32908704883227174, "grad_norm": 0.2807973027229309, "kl": 0.1292724609375, "learning_rate": 4.136514047602087e-06, "loss": 0.0013, "reward": 0.18624619487673044, "reward_std": 0.18390434235334396, "rewards/code_reward": 0.15968369878828526, "rewards/format_reward": 0.2656250074505806, "step": 155 }, { "completion_length": 639.5625305175781, "epoch": 0.33121019108280253, "grad_norm": 0.2780408263206482, "kl": 0.1229248046875, "learning_rate": 4.1250067181038635e-06, "loss": 0.0012, "reward": 0.2191852517426014, "reward_std": 0.13604657351970673, "rewards/code_reward": 0.16851558908820152, "rewards/format_reward": 0.5066964477300644, "step": 156 }, { "completion_length": 639.6652069091797, "epoch": 0.3333333333333333, "grad_norm": 0.26467737555503845, "kl": 0.144775390625, "learning_rate": 4.113441696077608e-06, "loss": 0.0014, "reward": 0.31026700511574745, "reward_std": 0.202113538980484, "rewards/code_reward": 0.2345973663032055, "rewards/format_reward": 0.7566964626312256, "step": 157 }, { "completion_length": 659.5558319091797, "epoch": 0.3354564755838641, "grad_norm": 0.24479494988918304, "kl": 0.1280517578125, "learning_rate": 4.101819466768484e-06, "loss": 0.0013, "reward": 0.2640949599444866, "reward_std": 0.1684006005525589, "rewards/code_reward": 0.1763717383146286, "rewards/format_reward": 0.8772321939468384, "step": 158 }, { "completion_length": 615.0558319091797, "epoch": 0.3375796178343949, "grad_norm": 0.24368631839752197, "kl": 0.15087890625, "learning_rate": 4.0901405178219535e-06, "loss": 0.0015, "reward": 0.345178809016943, "reward_std": 0.19809392467141151, "rewards/code_reward": 0.24941986054182053, "rewards/format_reward": 0.957589328289032, "step": 159 }, { "completion_length": 624.1830444335938, "epoch": 0.33970276008492567, "grad_norm": 0.23896408081054688, "kl": 0.154052734375, "learning_rate": 4.078405339263326e-06, "loss": 0.0015, "reward": 0.37723641097545624, "reward_std": 0.21996535174548626, "rewards/code_reward": 0.28080783039331436, "rewards/format_reward": 0.964285746216774, "step": 160 }, { "completion_length": 627.3236846923828, "epoch": 0.34182590233545646, "grad_norm": 0.2518380582332611, "kl": 0.171142578125, "learning_rate": 4.06661442347719e-06, "loss": 0.0017, "reward": 0.3184036388993263, "reward_std": 0.19395017623901367, "rewards/code_reward": 0.2217518538236618, "rewards/format_reward": 0.9665178805589676, "step": 161 }, { "completion_length": 583.1674499511719, "epoch": 0.34394904458598724, "grad_norm": 0.2569345235824585, "kl": 0.191162109375, "learning_rate": 4.054768265186758e-06, "loss": 0.0019, "reward": 0.31612952798604965, "reward_std": 0.20712972059845924, "rewards/code_reward": 0.21836165338754654, "rewards/format_reward": 0.9776785969734192, "step": 162 }, { "completion_length": 571.7857208251953, "epoch": 0.346072186836518, "grad_norm": 0.25089165568351746, "kl": 0.207763671875, "learning_rate": 4.0428673614331036e-06, "loss": 0.0021, "reward": 0.365755058825016, "reward_std": 0.19836053252220154, "rewards/code_reward": 0.2673175595700741, "rewards/format_reward": 0.9843750447034836, "step": 163 }, { "completion_length": 596.4553833007812, "epoch": 0.3481953290870488, "grad_norm": 0.23911510407924652, "kl": 0.22265625, "learning_rate": 4.030912211554316e-06, "loss": 0.0023, "reward": 0.38677794113755226, "reward_std": 0.18023086339235306, "rewards/code_reward": 0.28744759038090706, "rewards/format_reward": 0.9933035969734192, "step": 164 }, { "completion_length": 567.4486999511719, "epoch": 0.3503184713375796, "grad_norm": 0.24430882930755615, "kl": 0.2099609375, "learning_rate": 4.018903317164539e-06, "loss": 0.0021, "reward": 0.2250930406153202, "reward_std": 0.19390171952545643, "rewards/code_reward": 0.1277716178447008, "rewards/format_reward": 0.9732143431901932, "step": 165 }, { "completion_length": 573.8594055175781, "epoch": 0.3524416135881104, "grad_norm": 0.2257012128829956, "kl": 0.232666015625, "learning_rate": 4.006841182132932e-06, "loss": 0.0023, "reward": 0.3599228076636791, "reward_std": 0.20258177444338799, "rewards/code_reward": 0.2603691965341568, "rewards/format_reward": 0.9955357313156128, "step": 166 }, { "completion_length": 606.4911041259766, "epoch": 0.35456475583864117, "grad_norm": 0.238439679145813, "kl": 0.252197265625, "learning_rate": 3.9947263125625195e-06, "loss": 0.0025, "reward": 0.3261881247162819, "reward_std": 0.1736624352633953, "rewards/code_reward": 0.22775060683488846, "rewards/format_reward": 0.9843750298023224, "step": 167 }, { "completion_length": 590.1027069091797, "epoch": 0.35668789808917195, "grad_norm": 0.22503866255283356, "kl": 0.255615234375, "learning_rate": 3.982559216768967e-06, "loss": 0.0026, "reward": 0.2961311787366867, "reward_std": 0.1850012019276619, "rewards/code_reward": 0.1968008242547512, "rewards/format_reward": 0.9933035969734192, "step": 168 }, { "completion_length": 594.5491333007812, "epoch": 0.35881104033970274, "grad_norm": 0.22681432962417603, "kl": 0.32421875, "learning_rate": 3.970340405259245e-06, "loss": 0.0033, "reward": 0.4186030365526676, "reward_std": 0.18541271798312664, "rewards/code_reward": 0.31927267275750637, "rewards/format_reward": 0.9933035969734192, "step": 169 }, { "completion_length": 594.2790298461914, "epoch": 0.3609341825902335, "grad_norm": 0.2294747531414032, "kl": 0.32666015625, "learning_rate": 3.958070390710214e-06, "loss": 0.0033, "reward": 0.36109255626797676, "reward_std": 0.18586167134344578, "rewards/code_reward": 0.26243184227496386, "rewards/format_reward": 0.9866071790456772, "step": 170 }, { "completion_length": 592.4553680419922, "epoch": 0.3630573248407643, "grad_norm": 0.21662850677967072, "kl": 0.247802734375, "learning_rate": 3.945749687947109e-06, "loss": 0.0025, "reward": 0.24923527240753174, "reward_std": 0.13961385935544968, "rewards/code_reward": 0.15057454677298665, "rewards/format_reward": 0.9866071939468384, "step": 171 }, { "completion_length": 570.9286041259766, "epoch": 0.3651804670912951, "grad_norm": 0.23902097344398499, "kl": 0.22607421875, "learning_rate": 3.933378813921942e-06, "loss": 0.0023, "reward": 0.3740244060754776, "reward_std": 0.22742953523993492, "rewards/code_reward": 0.27536366507411003, "rewards/format_reward": 0.9866071790456772, "step": 172 }, { "completion_length": 598.2477874755859, "epoch": 0.3673036093418259, "grad_norm": 0.21673643589019775, "kl": 0.213134765625, "learning_rate": 3.920958287691811e-06, "loss": 0.0021, "reward": 0.2918965369462967, "reward_std": 0.19641954079270363, "rewards/code_reward": 0.19301261007785797, "rewards/format_reward": 0.988839328289032, "step": 173 }, { "completion_length": 558.6718978881836, "epoch": 0.36942675159235666, "grad_norm": 0.2543295919895172, "kl": 0.1962890625, "learning_rate": 3.908488630397121e-06, "loss": 0.002, "reward": 0.41571951657533646, "reward_std": 0.24686651676893234, "rewards/code_reward": 0.31683557108044624, "rewards/format_reward": 0.988839328289032, "step": 174 }, { "completion_length": 557.0201110839844, "epoch": 0.37154989384288745, "grad_norm": 0.23438729345798492, "kl": 0.206298828125, "learning_rate": 3.8959703652397175e-06, "loss": 0.0021, "reward": 0.38086430728435516, "reward_std": 0.22366305626928806, "rewards/code_reward": 0.28198035806417465, "rewards/format_reward": 0.988839328289032, "step": 175 }, { "completion_length": 602.0536041259766, "epoch": 0.37367303609341823, "grad_norm": 0.24083319306373596, "kl": 0.18701171875, "learning_rate": 3.883404017460935e-06, "loss": 0.0019, "reward": 0.36414580047130585, "reward_std": 0.22220248356461525, "rewards/code_reward": 0.2657082974910736, "rewards/format_reward": 0.9843750447034836, "step": 176 }, { "completion_length": 591.0893096923828, "epoch": 0.37579617834394907, "grad_norm": 0.25874680280685425, "kl": 0.19873046875, "learning_rate": 3.870790114319559e-06, "loss": 0.002, "reward": 0.3555009290575981, "reward_std": 0.18250016495585442, "rewards/code_reward": 0.25684019550681114, "rewards/format_reward": 0.9866071939468384, "step": 177 }, { "completion_length": 587.404052734375, "epoch": 0.37791932059447986, "grad_norm": 0.22890929877758026, "kl": 0.176513671875, "learning_rate": 3.858129185069701e-06, "loss": 0.0018, "reward": 0.4567238390445709, "reward_std": 0.2463996484875679, "rewards/code_reward": 0.35806312412023544, "rewards/format_reward": 0.9866071939468384, "step": 178 }, { "completion_length": 602.4486846923828, "epoch": 0.38004246284501064, "grad_norm": 0.22736036777496338, "kl": 0.162353515625, "learning_rate": 3.845421760938597e-06, "loss": 0.0016, "reward": 0.3570307157933712, "reward_std": 0.16325377486646175, "rewards/code_reward": 0.2583700120449066, "rewards/format_reward": 0.9866071790456772, "step": 179 }, { "completion_length": 610.8638763427734, "epoch": 0.3821656050955414, "grad_norm": 0.2262299656867981, "kl": 0.153076171875, "learning_rate": 3.832668375104312e-06, "loss": 0.0016, "reward": 0.349903404712677, "reward_std": 0.15386051312088966, "rewards/code_reward": 0.2503498010337353, "rewards/format_reward": 0.9955357313156128, "step": 180 }, { "completion_length": 639.8326110839844, "epoch": 0.3842887473460722, "grad_norm": 0.22941501438617706, "kl": 0.17724609375, "learning_rate": 3.8198695626733725e-06, "loss": 0.0018, "reward": 0.40823063999414444, "reward_std": 0.2221880704164505, "rewards/code_reward": 0.3093467131257057, "rewards/format_reward": 0.988839328289032, "step": 181 }, { "completion_length": 638.1049346923828, "epoch": 0.386411889596603, "grad_norm": 0.23558823764324188, "kl": 0.15283203125, "learning_rate": 3.8070258606583156e-06, "loss": 0.0016, "reward": 0.36934422701597214, "reward_std": 0.21686138212680817, "rewards/code_reward": 0.27001385763287544, "rewards/format_reward": 0.9933035969734192, "step": 182 }, { "completion_length": 625.5759124755859, "epoch": 0.3885350318471338, "grad_norm": 0.31238648295402527, "kl": 0.166259765625, "learning_rate": 3.7941378079551544e-06, "loss": 0.0017, "reward": 0.3830692619085312, "reward_std": 0.24278680607676506, "rewards/code_reward": 0.2835156861692667, "rewards/format_reward": 0.9955357313156128, "step": 183 }, { "completion_length": 665.8192291259766, "epoch": 0.39065817409766457, "grad_norm": 0.3192687928676605, "kl": 0.1513671875, "learning_rate": 3.7812059453207677e-06, "loss": 0.0015, "reward": 0.3427841551601887, "reward_std": 0.20133822225034237, "rewards/code_reward": 0.2441234067082405, "rewards/format_reward": 0.9866071939468384, "step": 184 }, { "completion_length": 655.4486846923828, "epoch": 0.39278131634819535, "grad_norm": 0.243864506483078, "kl": 0.141845703125, "learning_rate": 3.768230815350213e-06, "loss": 0.0014, "reward": 0.32591256499290466, "reward_std": 0.1841282658278942, "rewards/code_reward": 0.22680539265275002, "rewards/format_reward": 0.9910714477300644, "step": 185 }, { "completion_length": 680.6942291259766, "epoch": 0.39490445859872614, "grad_norm": 2.7162351608276367, "kl": 0.2255859375, "learning_rate": 3.7552129624539557e-06, "loss": 0.0023, "reward": 0.38928014785051346, "reward_std": 0.22797510400414467, "rewards/code_reward": 0.2917355000972748, "rewards/format_reward": 0.9754464626312256, "step": 186 }, { "completion_length": 670.8727874755859, "epoch": 0.3970276008492569, "grad_norm": 28.86046600341797, "kl": 3.12841796875, "learning_rate": 3.7421529328350316e-06, "loss": 0.0313, "reward": 0.33664827793836594, "reward_std": 0.2122020348906517, "rewards/code_reward": 0.2404429018497467, "rewards/format_reward": 0.9620536118745804, "step": 187 }, { "completion_length": 683.8326110839844, "epoch": 0.3991507430997877, "grad_norm": 0.4426620602607727, "kl": 0.14501953125, "learning_rate": 3.7290512744661274e-06, "loss": 0.0015, "reward": 0.38399138301610947, "reward_std": 0.1990874893963337, "rewards/code_reward": 0.2860002890229225, "rewards/format_reward": 0.9799107611179352, "step": 188 }, { "completion_length": 647.325927734375, "epoch": 0.4012738853503185, "grad_norm": 0.2341061532497406, "kl": 0.1455078125, "learning_rate": 3.715908537066589e-06, "loss": 0.0015, "reward": 0.42976176738739014, "reward_std": 0.21941150352358818, "rewards/code_reward": 0.33199387788772583, "rewards/format_reward": 0.9776786267757416, "step": 189 }, { "completion_length": 698.1763610839844, "epoch": 0.4033970276008493, "grad_norm": 1.953539252281189, "kl": 0.5579833984375, "learning_rate": 3.7027252720793538e-06, "loss": 0.0056, "reward": 0.33469754457473755, "reward_std": 0.19711985811591148, "rewards/code_reward": 0.23692966997623444, "rewards/format_reward": 0.9776786118745804, "step": 190 }, { "completion_length": 710.0982360839844, "epoch": 0.40552016985138006, "grad_norm": 0.24030916392803192, "kl": 0.161865234375, "learning_rate": 3.689502032647817e-06, "loss": 0.0016, "reward": 0.35261962562799454, "reward_std": 0.2262839339673519, "rewards/code_reward": 0.25552139058709145, "rewards/format_reward": 0.970982164144516, "step": 191 }, { "completion_length": 672.5401916503906, "epoch": 0.40764331210191085, "grad_norm": 0.9592034816741943, "kl": 0.154541015625, "learning_rate": 3.6762393735926245e-06, "loss": 0.0016, "reward": 0.3493685219436884, "reward_std": 0.1753272709902376, "rewards/code_reward": 0.2524934969842434, "rewards/format_reward": 0.9687500298023224, "step": 192 }, { "completion_length": 710.7299499511719, "epoch": 0.40976645435244163, "grad_norm": 0.3044726550579071, "kl": 0.15185546875, "learning_rate": 3.6629378513883852e-06, "loss": 0.0015, "reward": 0.4329136684536934, "reward_std": 0.257048511877656, "rewards/code_reward": 0.3346993774175644, "rewards/format_reward": 0.98214291036129, "step": 193 }, { "completion_length": 718.3683166503906, "epoch": 0.4118895966029724, "grad_norm": 0.2441069632768631, "kl": 0.1630859375, "learning_rate": 3.6495980241403307e-06, "loss": 0.0016, "reward": 0.32557281479239464, "reward_std": 0.19367647171020508, "rewards/code_reward": 0.2271352931857109, "rewards/format_reward": 0.9843750596046448, "step": 194 }, { "completion_length": 701.9486846923828, "epoch": 0.4140127388535032, "grad_norm": 0.22456014156341553, "kl": 0.16064453125, "learning_rate": 3.636220451560896e-06, "loss": 0.0016, "reward": 0.42680248618125916, "reward_std": 0.2046816684305668, "rewards/code_reward": 0.32903461158275604, "rewards/format_reward": 0.9776786267757416, "step": 195 }, { "completion_length": 713.747802734375, "epoch": 0.416135881104034, "grad_norm": 0.45598000288009644, "kl": 0.149169921875, "learning_rate": 3.622805694946235e-06, "loss": 0.0015, "reward": 0.3776397071778774, "reward_std": 0.18774981424212456, "rewards/code_reward": 0.2803182378411293, "rewards/format_reward": 0.9732143133878708, "step": 196 }, { "completion_length": 717.1406555175781, "epoch": 0.4182590233545648, "grad_norm": 0.21405339241027832, "kl": 0.1429443359375, "learning_rate": 3.609354317152667e-06, "loss": 0.0015, "reward": 0.38271288573741913, "reward_std": 0.19382936879992485, "rewards/code_reward": 0.28539142571389675, "rewards/format_reward": 0.9732143133878708, "step": 197 }, { "completion_length": 678.2477874755859, "epoch": 0.42038216560509556, "grad_norm": 0.49006760120391846, "kl": 0.2021484375, "learning_rate": 3.595866882573063e-06, "loss": 0.0021, "reward": 0.4323223605751991, "reward_std": 0.2277931533753872, "rewards/code_reward": 0.3345545120537281, "rewards/format_reward": 0.9776786118745804, "step": 198 }, { "completion_length": 728.9620971679688, "epoch": 0.42250530785562634, "grad_norm": 0.39922112226486206, "kl": 0.184814453125, "learning_rate": 3.5823439571131675e-06, "loss": 0.0019, "reward": 0.40869200229644775, "reward_std": 0.2020891159772873, "rewards/code_reward": 0.31159375607967377, "rewards/format_reward": 0.9709821939468384, "step": 199 }, { "completion_length": 684.9576416015625, "epoch": 0.42462845010615713, "grad_norm": 0.23013651371002197, "kl": 0.149658203125, "learning_rate": 3.5687861081678477e-06, "loss": 0.0015, "reward": 0.4545319005846977, "reward_std": 0.24276942387223244, "rewards/code_reward": 0.3572104535996914, "rewards/format_reward": 0.9732143431901932, "step": 200 }, { "completion_length": 704.5223541259766, "epoch": 0.4267515923566879, "grad_norm": 0.46601447463035583, "kl": 0.145263671875, "learning_rate": 3.555193904597291e-06, "loss": 0.0015, "reward": 0.3521813452243805, "reward_std": 0.1790554393082857, "rewards/code_reward": 0.2555295582860708, "rewards/format_reward": 0.96651791036129, "step": 201 }, { "completion_length": 676.4754791259766, "epoch": 0.4288747346072187, "grad_norm": 0.24227948486804962, "kl": 0.145751953125, "learning_rate": 3.541567916703138e-06, "loss": 0.0015, "reward": 0.4256810247898102, "reward_std": 0.2298164926469326, "rewards/code_reward": 0.327689953148365, "rewards/format_reward": 0.979910746216774, "step": 202 }, { "completion_length": 697.6027069091797, "epoch": 0.4309978768577495, "grad_norm": 0.32301369309425354, "kl": 0.141845703125, "learning_rate": 3.5279087162045517e-06, "loss": 0.0014, "reward": 0.27234210819005966, "reward_std": 0.17865055054426193, "rewards/code_reward": 0.17479745857417583, "rewards/format_reward": 0.9754464775323868, "step": 203 }, { "completion_length": 696.3727874755859, "epoch": 0.43312101910828027, "grad_norm": 0.6582425236701965, "kl": 0.14111328125, "learning_rate": 3.5142168762142265e-06, "loss": 0.0014, "reward": 0.3229696787893772, "reward_std": 0.1942148432135582, "rewards/code_reward": 0.22542503476142883, "rewards/format_reward": 0.9754464626312256, "step": 204 }, { "completion_length": 718.2232513427734, "epoch": 0.43524416135881105, "grad_norm": 0.30619704723358154, "kl": 0.149169921875, "learning_rate": 3.500492971214347e-06, "loss": 0.0015, "reward": 0.4395933449268341, "reward_std": 0.265441432595253, "rewards/code_reward": 0.3402629792690277, "rewards/format_reward": 0.9933035969734192, "step": 205 }, { "completion_length": 699.310302734375, "epoch": 0.43736730360934184, "grad_norm": 0.3680998980998993, "kl": 0.151611328125, "learning_rate": 3.48673757703248e-06, "loss": 0.0015, "reward": 0.3385552614927292, "reward_std": 0.24193225800991058, "rewards/code_reward": 0.24101059883832932, "rewards/format_reward": 0.9754464626312256, "step": 206 }, { "completion_length": 712.4799499511719, "epoch": 0.4394904458598726, "grad_norm": 0.22541551291942596, "kl": 0.315673828125, "learning_rate": 3.472951270817418e-06, "loss": 0.0032, "reward": 0.317364189773798, "reward_std": 0.2289394848048687, "rewards/code_reward": 0.2191498950123787, "rewards/format_reward": 0.9821428954601288, "step": 207 }, { "completion_length": 725.1719207763672, "epoch": 0.4416135881104034, "grad_norm": 0.7986815571784973, "kl": 0.8701171875, "learning_rate": 3.4591346310149578e-06, "loss": 0.0087, "reward": 0.30210861191153526, "reward_std": 0.1758405715227127, "rewards/code_reward": 0.20545680448412895, "rewards/format_reward": 0.9665178954601288, "step": 208 }, { "completion_length": 708.9174346923828, "epoch": 0.4437367303609342, "grad_norm": 0.6832094788551331, "kl": 0.571533203125, "learning_rate": 3.445288237343632e-06, "loss": 0.0057, "reward": 0.34425482153892517, "reward_std": 0.17729798145592213, "rewards/code_reward": 0.24559411033988, "rewards/format_reward": 0.9866071939468384, "step": 209 }, { "completion_length": 664.4152069091797, "epoch": 0.445859872611465, "grad_norm": 0.5344778299331665, "kl": 0.344970703125, "learning_rate": 3.4314126707703895e-06, "loss": 0.0035, "reward": 0.3406968005001545, "reward_std": 0.21405612863600254, "rewards/code_reward": 0.2424825206398964, "rewards/format_reward": 0.98214291036129, "step": 210 }, { "completion_length": 687.4955596923828, "epoch": 0.44798301486199577, "grad_norm": 0.2852449119091034, "kl": 0.314208984375, "learning_rate": 3.4175085134862128e-06, "loss": 0.0031, "reward": 0.37548423558473587, "reward_std": 0.19767768681049347, "rewards/code_reward": 0.2783860079944134, "rewards/format_reward": 0.9709821939468384, "step": 211 }, { "completion_length": 694.372802734375, "epoch": 0.45010615711252655, "grad_norm": 0.8310821056365967, "kl": 0.214111328125, "learning_rate": 3.4035763488816953e-06, "loss": 0.0021, "reward": 0.5172732323408127, "reward_std": 0.24472371861338615, "rewards/code_reward": 0.41883569955825806, "rewards/format_reward": 0.9843750447034836, "step": 212 }, { "completion_length": 678.0424499511719, "epoch": 0.45222929936305734, "grad_norm": 0.28591927886009216, "kl": 0.14306640625, "learning_rate": 3.3896167615225594e-06, "loss": 0.0015, "reward": 0.3543313890695572, "reward_std": 0.21659231930971146, "rewards/code_reward": 0.2567867375910282, "rewards/format_reward": 0.9754464626312256, "step": 213 }, { "completion_length": 692.7522583007812, "epoch": 0.4543524416135881, "grad_norm": 0.5759381055831909, "kl": 0.1455078125, "learning_rate": 3.375630337125133e-06, "loss": 0.0015, "reward": 0.39294832199811935, "reward_std": 0.26400984078645706, "rewards/code_reward": 0.2960733026266098, "rewards/format_reward": 0.9687500298023224, "step": 214 }, { "completion_length": 731.1272735595703, "epoch": 0.4564755838641189, "grad_norm": 0.23394830524921417, "kl": 0.143798828125, "learning_rate": 3.361617662531772e-06, "loss": 0.0014, "reward": 0.3601933494210243, "reward_std": 0.25189225003123283, "rewards/code_reward": 0.2619790583848953, "rewards/format_reward": 0.9821428805589676, "step": 215 }, { "completion_length": 692.1428833007812, "epoch": 0.4585987261146497, "grad_norm": 0.24470415711402893, "kl": 0.1317138671875, "learning_rate": 3.347579325686237e-06, "loss": 0.0013, "reward": 0.3433048315346241, "reward_std": 0.22325459122657776, "rewards/code_reward": 0.245536956936121, "rewards/format_reward": 0.9776786118745804, "step": 216 }, { "completion_length": 684.3705749511719, "epoch": 0.4607218683651805, "grad_norm": 0.364793062210083, "kl": 0.122802734375, "learning_rate": 3.333515915609027e-06, "loss": 0.0012, "reward": 0.4696499854326248, "reward_std": 0.2734139449894428, "rewards/code_reward": 0.3707660511136055, "rewards/format_reward": 0.988839328289032, "step": 217 }, { "completion_length": 723.4129791259766, "epoch": 0.46284501061571126, "grad_norm": 0.36840999126434326, "kl": 0.128173828125, "learning_rate": 3.3194280223726616e-06, "loss": 0.0013, "reward": 0.3476767987012863, "reward_std": 0.19485369697213173, "rewards/code_reward": 0.24968570843338966, "rewards/format_reward": 0.9799107611179352, "step": 218 }, { "completion_length": 660.7812805175781, "epoch": 0.46496815286624205, "grad_norm": 0.2917425036430359, "kl": 0.142578125, "learning_rate": 3.305316237076927e-06, "loss": 0.0014, "reward": 0.39485304057598114, "reward_std": 0.23686816543340683, "rewards/code_reward": 0.29708515852689743, "rewards/format_reward": 0.9776786118745804, "step": 219 }, { "completion_length": 674.2410888671875, "epoch": 0.46709129511677283, "grad_norm": 0.27625608444213867, "kl": 0.13134765625, "learning_rate": 3.291181151824071e-06, "loss": 0.0014, "reward": 0.5001323744654655, "reward_std": 0.2807440906763077, "rewards/code_reward": 0.40124842897057533, "rewards/format_reward": 0.9888393133878708, "step": 220 }, { "completion_length": 717.279052734375, "epoch": 0.4692144373673036, "grad_norm": 0.26342645287513733, "kl": 0.132568359375, "learning_rate": 3.27702335969396e-06, "loss": 0.0014, "reward": 0.438594788312912, "reward_std": 0.2874513529241085, "rewards/code_reward": 0.34060370177030563, "rewards/format_reward": 0.979910746216774, "step": 221 }, { "completion_length": 740.8192291259766, "epoch": 0.4713375796178344, "grad_norm": 0.3312680423259735, "kl": 0.144287109375, "learning_rate": 3.2628434547191985e-06, "loss": 0.0014, "reward": 0.4112970530986786, "reward_std": 0.2245728299021721, "rewards/code_reward": 0.3137524016201496, "rewards/format_reward": 0.9754464775323868, "step": 222 }, { "completion_length": 709.9174499511719, "epoch": 0.4734607218683652, "grad_norm": 1.5567724704742432, "kl": 0.1339111328125, "learning_rate": 3.2486420318601973e-06, "loss": 0.0014, "reward": 0.4251294732093811, "reward_std": 0.18372783437371254, "rewards/code_reward": 0.3291473314166069, "rewards/format_reward": 0.9598214775323868, "step": 223 }, { "completion_length": 719.7656707763672, "epoch": 0.47558386411889597, "grad_norm": 0.2122294157743454, "kl": 0.1273193359375, "learning_rate": 3.2344196869802187e-06, "loss": 0.0013, "reward": 0.3450777679681778, "reward_std": 0.24194234982132912, "rewards/code_reward": 0.24730990827083588, "rewards/format_reward": 0.9776786118745804, "step": 224 }, { "completion_length": 704.0045013427734, "epoch": 0.47770700636942676, "grad_norm": 0.9711757898330688, "kl": 0.20751953125, "learning_rate": 3.2201770168203694e-06, "loss": 0.0021, "reward": 0.4306853115558624, "reward_std": 0.2568584829568863, "rewards/code_reward": 0.33492637425661087, "rewards/format_reward": 0.9575893133878708, "step": 225 }, { "completion_length": 727.6027069091797, "epoch": 0.47983014861995754, "grad_norm": 0.268039733171463, "kl": 0.13818359375, "learning_rate": 3.205914618974563e-06, "loss": 0.0014, "reward": 0.43213512748479843, "reward_std": 0.2562938630580902, "rewards/code_reward": 0.334367249161005, "rewards/format_reward": 0.9776786267757416, "step": 226 }, { "completion_length": 732.9643249511719, "epoch": 0.4819532908704883, "grad_norm": 0.46155139803886414, "kl": 0.198486328125, "learning_rate": 3.1916330918644496e-06, "loss": 0.002, "reward": 0.31592320650815964, "reward_std": 0.19539642706513405, "rewards/code_reward": 0.2174856998026371, "rewards/format_reward": 0.9843750596046448, "step": 227 }, { "completion_length": 770.1295013427734, "epoch": 0.4840764331210191, "grad_norm": 0.7360585331916809, "kl": 0.3978271484375, "learning_rate": 3.177333034714303e-06, "loss": 0.004, "reward": 0.35912561416625977, "reward_std": 0.21681112423539162, "rewards/code_reward": 0.26135774329304695, "rewards/format_reward": 0.9776785969734192, "step": 228 }, { "completion_length": 706.9777069091797, "epoch": 0.4861995753715499, "grad_norm": 1.2824815511703491, "kl": 0.615478515625, "learning_rate": 3.1630150475258813e-06, "loss": 0.0062, "reward": 0.3668329790234566, "reward_std": 0.2176014445722103, "rewards/code_reward": 0.2699579633772373, "rewards/format_reward": 0.9687500596046448, "step": 229 }, { "completion_length": 709.825927734375, "epoch": 0.4883227176220807, "grad_norm": 0.4730873107910156, "kl": 0.4136962890625, "learning_rate": 3.148679731053252e-06, "loss": 0.0041, "reward": 0.4401291459798813, "reward_std": 0.2792894318699837, "rewards/code_reward": 0.34213805943727493, "rewards/format_reward": 0.9799107611179352, "step": 230 }, { "completion_length": 716.0312805175781, "epoch": 0.49044585987261147, "grad_norm": 0.226039856672287, "kl": 0.1241455078125, "learning_rate": 3.1343276867775805e-06, "loss": 0.0013, "reward": 0.3396586962044239, "reward_std": 0.19299479201436043, "rewards/code_reward": 0.24211404286324978, "rewards/format_reward": 0.9754464775323868, "step": 231 }, { "completion_length": 699.6830749511719, "epoch": 0.49256900212314225, "grad_norm": 0.31895455718040466, "kl": 0.50146484375, "learning_rate": 3.1199595168819043e-06, "loss": 0.0051, "reward": 0.34284605644643307, "reward_std": 0.14287223480641842, "rewards/code_reward": 0.24463177705183625, "rewards/format_reward": 0.98214291036129, "step": 232 }, { "completion_length": 781.5178985595703, "epoch": 0.49469214437367304, "grad_norm": 0.4143598973751068, "kl": 0.249755859375, "learning_rate": 3.105575824225852e-06, "loss": 0.0025, "reward": 0.38098950684070587, "reward_std": 0.21905666589736938, "rewards/code_reward": 0.28590018674731255, "rewards/format_reward": 0.9508928954601288, "step": 233 }, { "completion_length": 725.3705749511719, "epoch": 0.4968152866242038, "grad_norm": 0.9609025120735168, "kl": 0.401123046875, "learning_rate": 3.091177212320363e-06, "loss": 0.004, "reward": 0.4063151776790619, "reward_std": 0.25925979763269424, "rewards/code_reward": 0.3076544553041458, "rewards/format_reward": 0.986607164144516, "step": 234 }, { "completion_length": 730.1406555175781, "epoch": 0.4989384288747346, "grad_norm": 0.2471870481967926, "kl": 0.233154296875, "learning_rate": 3.0767642853023538e-06, "loss": 0.0024, "reward": 0.3827313929796219, "reward_std": 0.21219320595264435, "rewards/code_reward": 0.2858563922345638, "rewards/format_reward": 0.9687500298023224, "step": 235 }, { "completion_length": 696.4219207763672, "epoch": 0.5010615711252654, "grad_norm": 0.6714680194854736, "kl": 0.1856689453125, "learning_rate": 3.062337647909376e-06, "loss": 0.0019, "reward": 0.4210161566734314, "reward_std": 0.18587047047913074, "rewards/code_reward": 0.3232482895255089, "rewards/format_reward": 0.9776786267757416, "step": 236 }, { "completion_length": 744.200927734375, "epoch": 0.5031847133757962, "grad_norm": 0.5082603096961975, "kl": 0.2071533203125, "learning_rate": 3.04789790545424e-06, "loss": 0.0021, "reward": 0.4485570266842842, "reward_std": 0.1916775107383728, "rewards/code_reward": 0.3519052043557167, "rewards/format_reward": 0.9665178954601288, "step": 237 }, { "completion_length": 758.0178985595703, "epoch": 0.505307855626327, "grad_norm": 0.69068843126297, "kl": 0.19482421875, "learning_rate": 3.033445663799621e-06, "loss": 0.002, "reward": 0.3711010664701462, "reward_std": 0.1955837495625019, "rewards/code_reward": 0.2742260619997978, "rewards/format_reward": 0.9687500596046448, "step": 238 }, { "completion_length": 717.8147735595703, "epoch": 0.5074309978768577, "grad_norm": 0.40367022156715393, "kl": 0.161865234375, "learning_rate": 3.018981529332633e-06, "loss": 0.0016, "reward": 0.5175677761435509, "reward_std": 0.2608077637851238, "rewards/code_reward": 0.4193534851074219, "rewards/format_reward": 0.9821428954601288, "step": 239 }, { "completion_length": 729.2545013427734, "epoch": 0.5095541401273885, "grad_norm": 0.5104432702064514, "kl": 0.19384765625, "learning_rate": 3.00450610893939e-06, "loss": 0.002, "reward": 0.40982675552368164, "reward_std": 0.20613017305731773, "rewards/code_reward": 0.31161245331168175, "rewards/format_reward": 0.9821428954601288, "step": 240 }, { "completion_length": 705.7120819091797, "epoch": 0.5116772823779193, "grad_norm": 0.2226688116788864, "kl": 0.167724609375, "learning_rate": 2.9900200099795396e-06, "loss": 0.0017, "reward": 0.40758588910102844, "reward_std": 0.22469941899180412, "rewards/code_reward": 0.31048765778541565, "rewards/format_reward": 0.970982164144516, "step": 241 }, { "completion_length": 716.6004791259766, "epoch": 0.5138004246284501, "grad_norm": 0.823330283164978, "kl": 0.224853515625, "learning_rate": 2.9755238402607826e-06, "loss": 0.0023, "reward": 0.381127692759037, "reward_std": 0.1726750060915947, "rewards/code_reward": 0.2817973233759403, "rewards/format_reward": 0.9933035969734192, "step": 242 }, { "completion_length": 714.8460235595703, "epoch": 0.5159235668789809, "grad_norm": 0.5035973191261292, "kl": 0.198486328125, "learning_rate": 2.961018208013367e-06, "loss": 0.002, "reward": 0.3932320065796375, "reward_std": 0.14925590343773365, "rewards/code_reward": 0.295910551212728, "rewards/format_reward": 0.973214328289032, "step": 243 }, { "completion_length": 714.6205749511719, "epoch": 0.5180467091295117, "grad_norm": 0.6857829689979553, "kl": 0.16259765625, "learning_rate": 2.9465037218645694e-06, "loss": 0.0016, "reward": 0.3965849094092846, "reward_std": 0.20821771398186684, "rewards/code_reward": 0.3001563027501106, "rewards/format_reward": 0.9642857611179352, "step": 244 }, { "completion_length": 710.7254791259766, "epoch": 0.5201698513800425, "grad_norm": 1.60740327835083, "kl": 0.131591796875, "learning_rate": 2.9319809908131604e-06, "loss": 0.0013, "reward": 0.43405191600322723, "reward_std": 0.25733353197574615, "rewards/code_reward": 0.33539119362831116, "rewards/format_reward": 0.9866071790456772, "step": 245 }, { "completion_length": 690.6027221679688, "epoch": 0.5222929936305732, "grad_norm": 0.2978648841381073, "kl": 0.1689453125, "learning_rate": 2.917450624203847e-06, "loss": 0.0017, "reward": 0.45192842930555344, "reward_std": 0.24438033252954483, "rewards/code_reward": 0.3539373278617859, "rewards/format_reward": 0.9799107611179352, "step": 246 }, { "completion_length": 737.6094207763672, "epoch": 0.524416135881104, "grad_norm": 0.3084428310394287, "kl": 0.1378173828125, "learning_rate": 2.9029132317017118e-06, "loss": 0.0014, "reward": 0.46284686774015427, "reward_std": 0.2403612770140171, "rewards/code_reward": 0.36619507521390915, "rewards/format_reward": 0.96651791036129, "step": 247 }, { "completion_length": 698.7411041259766, "epoch": 0.5265392781316348, "grad_norm": 1.3392090797424316, "kl": 0.151123046875, "learning_rate": 2.888369423266629e-06, "loss": 0.0015, "reward": 0.4595029503107071, "reward_std": 0.19635827839374542, "rewards/code_reward": 0.36218152195215225, "rewards/format_reward": 0.9732143431901932, "step": 248 }, { "completion_length": 719.0134429931641, "epoch": 0.5286624203821656, "grad_norm": 0.21979840099811554, "kl": 0.14111328125, "learning_rate": 2.8738198091276712e-06, "loss": 0.0014, "reward": 0.36629121005535126, "reward_std": 0.21069011464715004, "rewards/code_reward": 0.2694162093102932, "rewards/format_reward": 0.9687500298023224, "step": 249 }, { "completion_length": 745.263427734375, "epoch": 0.5307855626326964, "grad_norm": 0.8504329323768616, "kl": 0.15234375, "learning_rate": 2.859264999757509e-06, "loss": 0.0016, "reward": 0.37740468978881836, "reward_std": 0.20382403209805489, "rewards/code_reward": 0.2811993137001991, "rewards/format_reward": 0.9620536267757416, "step": 250 }, { "completion_length": 723.1562957763672, "epoch": 0.5329087048832272, "grad_norm": 0.27034398913383484, "kl": 0.1591796875, "learning_rate": 2.8447056058467928e-06, "loss": 0.0016, "reward": 0.48566606640815735, "reward_std": 0.21075040474534035, "rewards/code_reward": 0.38789819926023483, "rewards/format_reward": 0.9776786118745804, "step": 251 }, { "completion_length": 734.6986846923828, "epoch": 0.535031847133758, "grad_norm": 0.4998323917388916, "kl": 0.145751953125, "learning_rate": 2.830142238278531e-06, "loss": 0.0015, "reward": 0.3668500781059265, "reward_std": 0.1973743811249733, "rewards/code_reward": 0.2690822184085846, "rewards/format_reward": 0.9776786118745804, "step": 252 }, { "completion_length": 722.1719207763672, "epoch": 0.5371549893842887, "grad_norm": 0.7386496663093567, "kl": 0.16943359375, "learning_rate": 2.81557550810246e-06, "loss": 0.0017, "reward": 0.5175603851675987, "reward_std": 0.23075248673558235, "rewards/code_reward": 0.41889964044094086, "rewards/format_reward": 0.9866071790456772, "step": 253 }, { "completion_length": 731.4286041259766, "epoch": 0.5392781316348195, "grad_norm": 2.323516368865967, "kl": 0.185791015625, "learning_rate": 2.8010060265094026e-06, "loss": 0.0019, "reward": 0.4158123657107353, "reward_std": 0.2362896017730236, "rewards/code_reward": 0.3180444836616516, "rewards/format_reward": 0.9776786267757416, "step": 254 }, { "completion_length": 714.1004791259766, "epoch": 0.5414012738853503, "grad_norm": 0.22996105253696442, "kl": 0.193115234375, "learning_rate": 2.786434404805629e-06, "loss": 0.002, "reward": 0.43036870658397675, "reward_std": 0.17873099818825722, "rewards/code_reward": 0.3323776051402092, "rewards/format_reward": 0.979910746216774, "step": 255 }, { "completion_length": 755.794677734375, "epoch": 0.5435244161358811, "grad_norm": 0.5349477529525757, "kl": 0.21728515625, "learning_rate": 2.771861254387199e-06, "loss": 0.0022, "reward": 0.3905658796429634, "reward_std": 0.24914883077144623, "rewards/code_reward": 0.2939140759408474, "rewards/format_reward": 0.96651791036129, "step": 256 }, { "completion_length": 736.7969207763672, "epoch": 0.5456475583864119, "grad_norm": 0.5384594202041626, "kl": 0.44921875, "learning_rate": 2.7572871867143204e-06, "loss": 0.0045, "reward": 0.4809773936867714, "reward_std": 0.24490142613649368, "rewards/code_reward": 0.3832095377147198, "rewards/format_reward": 0.9776785969734192, "step": 257 }, { "completion_length": 779.904052734375, "epoch": 0.5477707006369427, "grad_norm": 0.3539630174636841, "kl": 0.46142578125, "learning_rate": 2.742712813285681e-06, "loss": 0.0046, "reward": 0.4002307578921318, "reward_std": 0.26231593638658524, "rewards/code_reward": 0.30447180569171906, "rewards/format_reward": 0.9575893431901932, "step": 258 }, { "completion_length": 688.8013610839844, "epoch": 0.5498938428874734, "grad_norm": 0.3760126531124115, "kl": 0.270263671875, "learning_rate": 2.7281387456128017e-06, "loss": 0.0027, "reward": 0.5040838867425919, "reward_std": 0.23536711558699608, "rewards/code_reward": 0.40519992262125015, "rewards/format_reward": 0.988839328289032, "step": 259 }, { "completion_length": 763.7924499511719, "epoch": 0.5520169851380042, "grad_norm": 0.28506824374198914, "kl": 0.37060546875, "learning_rate": 2.7135655951943716e-06, "loss": 0.0037, "reward": 0.4464469403028488, "reward_std": 0.23727866262197495, "rewards/code_reward": 0.3491254858672619, "rewards/format_reward": 0.9732143133878708, "step": 260 }, { "completion_length": 735.7143096923828, "epoch": 0.554140127388535, "grad_norm": 0.5788131952285767, "kl": 0.35546875, "learning_rate": 2.698993973490598e-06, "loss": 0.0036, "reward": 0.5397853627800941, "reward_std": 0.2762787565588951, "rewards/code_reward": 0.4424639120697975, "rewards/format_reward": 0.9732143133878708, "step": 261 }, { "completion_length": 770.4129791259766, "epoch": 0.5562632696390658, "grad_norm": 0.5763887166976929, "kl": 0.4140625, "learning_rate": 2.6844244918975416e-06, "loss": 0.0041, "reward": 0.4332207143306732, "reward_std": 0.21580959856510162, "rewards/code_reward": 0.3361224830150604, "rewards/format_reward": 0.9709821939468384, "step": 262 }, { "completion_length": 756.357177734375, "epoch": 0.5583864118895966, "grad_norm": 0.23940207064151764, "kl": 0.3953857421875, "learning_rate": 2.66985776172147e-06, "loss": 0.004, "reward": 0.4067609831690788, "reward_std": 0.15922481939196587, "rewards/code_reward": 0.3078770413994789, "rewards/format_reward": 0.9888393133878708, "step": 263 }, { "completion_length": 797.3772735595703, "epoch": 0.5605095541401274, "grad_norm": 1.1956448554992676, "kl": 0.394775390625, "learning_rate": 2.6552943941532088e-06, "loss": 0.004, "reward": 0.35336220264434814, "reward_std": 0.21252319402992725, "rewards/code_reward": 0.25447824597358704, "rewards/format_reward": 0.988839328289032, "step": 264 }, { "completion_length": 816.9330749511719, "epoch": 0.5626326963906582, "grad_norm": 0.3272117078304291, "kl": 0.33447265625, "learning_rate": 2.6407350002424927e-06, "loss": 0.0034, "reward": 0.3648254945874214, "reward_std": 0.19711337611079216, "rewards/code_reward": 0.2675040401518345, "rewards/format_reward": 0.973214328289032, "step": 265 }, { "completion_length": 790.0937805175781, "epoch": 0.564755838641189, "grad_norm": 0.3350137174129486, "kl": 0.217529296875, "learning_rate": 2.626180190872329e-06, "loss": 0.0022, "reward": 0.4639175459742546, "reward_std": 0.19284814596176147, "rewards/code_reward": 0.36592647433280945, "rewards/format_reward": 0.9799107611179352, "step": 266 }, { "completion_length": 785.247802734375, "epoch": 0.5668789808917197, "grad_norm": 0.2253342717885971, "kl": 0.1259765625, "learning_rate": 2.611630576733372e-06, "loss": 0.0013, "reward": 0.42001737654209137, "reward_std": 0.24298213049769402, "rewards/code_reward": 0.32180308550596237, "rewards/format_reward": 0.9821428805589676, "step": 267 }, { "completion_length": 874.1495971679688, "epoch": 0.5690021231422505, "grad_norm": 1.105658769607544, "kl": 0.2879638671875, "learning_rate": 2.5970867682982885e-06, "loss": 0.0029, "reward": 0.4009394347667694, "reward_std": 0.2002662494778633, "rewards/code_reward": 0.3031715527176857, "rewards/format_reward": 0.9776786118745804, "step": 268 }, { "completion_length": 821.7812805175781, "epoch": 0.5711252653927813, "grad_norm": 0.39032891392707825, "kl": 0.2081298828125, "learning_rate": 2.582549375796154e-06, "loss": 0.0021, "reward": 0.4019026607275009, "reward_std": 0.21826408058404922, "rewards/code_reward": 0.3036883734166622, "rewards/format_reward": 0.9821428805589676, "step": 269 }, { "completion_length": 804.6183471679688, "epoch": 0.5732484076433121, "grad_norm": 0.25958776473999023, "kl": 0.179931640625, "learning_rate": 2.568019009186841e-06, "loss": 0.0019, "reward": 0.4916309267282486, "reward_std": 0.19469109177589417, "rewards/code_reward": 0.3934166729450226, "rewards/format_reward": 0.9821428954601288, "step": 270 }, { "completion_length": 825.3370971679688, "epoch": 0.5753715498938429, "grad_norm": 0.22188299894332886, "kl": 0.1358642578125, "learning_rate": 2.5534962781354317e-06, "loss": 0.0014, "reward": 0.4202270358800888, "reward_std": 0.24190283194184303, "rewards/code_reward": 0.32223593071103096, "rewards/format_reward": 0.9799107611179352, "step": 271 }, { "completion_length": 780.4866485595703, "epoch": 0.5774946921443737, "grad_norm": 0.2580115497112274, "kl": 0.1597900390625, "learning_rate": 2.538981791986634e-06, "loss": 0.0016, "reward": 0.38698963820934296, "reward_std": 0.22697532176971436, "rewards/code_reward": 0.28877533972263336, "rewards/format_reward": 0.98214291036129, "step": 272 }, { "completion_length": 823.0402069091797, "epoch": 0.5796178343949044, "grad_norm": 0.2385285347700119, "kl": 0.141357421875, "learning_rate": 2.524476159739218e-06, "loss": 0.0015, "reward": 0.43764442950487137, "reward_std": 0.22844265773892403, "rewards/code_reward": 0.34032295644283295, "rewards/format_reward": 0.9732143431901932, "step": 273 }, { "completion_length": 782.2277221679688, "epoch": 0.5817409766454352, "grad_norm": 0.7892447710037231, "kl": 0.1402587890625, "learning_rate": 2.5099799900204607e-06, "loss": 0.0014, "reward": 0.47495051473379135, "reward_std": 0.24570094048976898, "rewards/code_reward": 0.37606657296419144, "rewards/format_reward": 0.988839328289032, "step": 274 }, { "completion_length": 790.029052734375, "epoch": 0.583864118895966, "grad_norm": 1.390781044960022, "kl": 0.1494140625, "learning_rate": 2.4954938910606108e-06, "loss": 0.0015, "reward": 0.41709961369633675, "reward_std": 0.22452056966722012, "rewards/code_reward": 0.31910853274166584, "rewards/format_reward": 0.979910746216774, "step": 275 }, { "completion_length": 766.544677734375, "epoch": 0.5859872611464968, "grad_norm": 0.3231986463069916, "kl": 0.125732421875, "learning_rate": 2.481018470667368e-06, "loss": 0.0013, "reward": 0.5159066766500473, "reward_std": 0.2495804950594902, "rewards/code_reward": 0.4188084527850151, "rewards/format_reward": 0.9709821939468384, "step": 276 }, { "completion_length": 816.5491485595703, "epoch": 0.5881104033970276, "grad_norm": 0.3522323966026306, "kl": 0.1474609375, "learning_rate": 2.4665543362003802e-06, "loss": 0.0016, "reward": 0.5210660025477409, "reward_std": 0.19517110101878643, "rewards/code_reward": 0.42352132126688957, "rewards/format_reward": 0.9754464775323868, "step": 277 }, { "completion_length": 815.3638610839844, "epoch": 0.5902335456475584, "grad_norm": 0.36454498767852783, "kl": 0.156005859375, "learning_rate": 2.4521020945457615e-06, "loss": 0.0016, "reward": 0.41319186985492706, "reward_std": 0.21446501463651657, "rewards/code_reward": 0.3152007535099983, "rewards/format_reward": 0.979910746216774, "step": 278 }, { "completion_length": 830.2031707763672, "epoch": 0.5923566878980892, "grad_norm": 0.24598261713981628, "kl": 0.182373046875, "learning_rate": 2.4376623520906255e-06, "loss": 0.0019, "reward": 0.48784376308321953, "reward_std": 0.25769151002168655, "rewards/code_reward": 0.39141515642404556, "rewards/format_reward": 0.9642857313156128, "step": 279 }, { "completion_length": 796.6272583007812, "epoch": 0.5944798301486199, "grad_norm": 0.24986568093299866, "kl": 0.154541015625, "learning_rate": 2.4232357146976478e-06, "loss": 0.0016, "reward": 0.3782888986170292, "reward_std": 0.18982039019465446, "rewards/code_reward": 0.28029780834913254, "rewards/format_reward": 0.9799107611179352, "step": 280 }, { "completion_length": 789.513427734375, "epoch": 0.5966029723991507, "grad_norm": 0.28405284881591797, "kl": 0.147705078125, "learning_rate": 2.408822787679637e-06, "loss": 0.0016, "reward": 0.5121422186493874, "reward_std": 0.2287510558962822, "rewards/code_reward": 0.413704726845026, "rewards/format_reward": 0.9843750298023224, "step": 281 }, { "completion_length": 808.3995819091797, "epoch": 0.5987261146496815, "grad_norm": 0.5303727984428406, "kl": 0.144775390625, "learning_rate": 2.3944241757741475e-06, "loss": 0.0016, "reward": 0.5508048385381699, "reward_std": 0.18633075430989265, "rewards/code_reward": 0.4516976475715637, "rewards/format_reward": 0.9910714626312256, "step": 282 }, { "completion_length": 827.4531707763672, "epoch": 0.6008492569002123, "grad_norm": 0.2308008074760437, "kl": 0.13037109375, "learning_rate": 2.380040483118097e-06, "loss": 0.0013, "reward": 0.3481413722038269, "reward_std": 0.20516538247466087, "rewards/code_reward": 0.250150291249156, "rewards/format_reward": 0.9799107313156128, "step": 283 }, { "completion_length": 786.8147583007812, "epoch": 0.6029723991507431, "grad_norm": 0.21248966455459595, "kl": 0.136962890625, "learning_rate": 2.365672313222419e-06, "loss": 0.0014, "reward": 0.4797332286834717, "reward_std": 0.2214011587202549, "rewards/code_reward": 0.3815189450979233, "rewards/format_reward": 0.98214291036129, "step": 284 }, { "completion_length": 797.7723693847656, "epoch": 0.6050955414012739, "grad_norm": 0.2716215252876282, "kl": 0.1376953125, "learning_rate": 2.351320268946749e-06, "loss": 0.0014, "reward": 0.4847887381911278, "reward_std": 0.26064618304371834, "rewards/code_reward": 0.3861280009150505, "rewards/format_reward": 0.9866071790456772, "step": 285 }, { "completion_length": 790.9308319091797, "epoch": 0.6072186836518046, "grad_norm": 0.22615957260131836, "kl": 0.1334228515625, "learning_rate": 2.336984952474119e-06, "loss": 0.0014, "reward": 0.4451970234513283, "reward_std": 0.21199724823236465, "rewards/code_reward": 0.34564343094825745, "rewards/format_reward": 0.9955357313156128, "step": 286 }, { "completion_length": 774.716552734375, "epoch": 0.6093418259023354, "grad_norm": 0.24061597883701324, "kl": 0.17236328125, "learning_rate": 2.322666965285697e-06, "loss": 0.0018, "reward": 0.4680435359477997, "reward_std": 0.2062854841351509, "rewards/code_reward": 0.3700524792075157, "rewards/format_reward": 0.9799107313156128, "step": 287 }, { "completion_length": 785.3437957763672, "epoch": 0.6114649681528662, "grad_norm": 0.2794930636882782, "kl": 0.143798828125, "learning_rate": 2.3083669081355507e-06, "loss": 0.0015, "reward": 0.41017772257328033, "reward_std": 0.1858556531369686, "rewards/code_reward": 0.31263307854533195, "rewards/format_reward": 0.9754464626312256, "step": 288 }, { "completion_length": 768.2902221679688, "epoch": 0.613588110403397, "grad_norm": 0.2621839940547943, "kl": 0.138427734375, "learning_rate": 2.2940853810254377e-06, "loss": 0.0014, "reward": 0.4905528202652931, "reward_std": 0.25080636143684387, "rewards/code_reward": 0.39144565910100937, "rewards/format_reward": 0.9910714477300644, "step": 289 }, { "completion_length": 788.9687805175781, "epoch": 0.6157112526539278, "grad_norm": 0.25945180654525757, "kl": 0.1494140625, "learning_rate": 2.2798229831796313e-06, "loss": 0.0015, "reward": 0.43350084125995636, "reward_std": 0.1987269874662161, "rewards/code_reward": 0.3370722308754921, "rewards/format_reward": 0.9642857611179352, "step": 290 }, { "completion_length": 762.8616485595703, "epoch": 0.6178343949044586, "grad_norm": 0.28753146529197693, "kl": 0.146484375, "learning_rate": 2.2655803130197816e-06, "loss": 0.0015, "reward": 0.45754577219486237, "reward_std": 0.20388228073716164, "rewards/code_reward": 0.35977791622281075, "rewards/format_reward": 0.9776786118745804, "step": 291 }, { "completion_length": 755.8594207763672, "epoch": 0.6199575371549894, "grad_norm": 0.2792350947856903, "kl": 0.14794921875, "learning_rate": 2.2513579681398034e-06, "loss": 0.0016, "reward": 0.4282514527440071, "reward_std": 0.16725242137908936, "rewards/code_reward": 0.32959069684147835, "rewards/format_reward": 0.986607164144516, "step": 292 }, { "completion_length": 744.6138763427734, "epoch": 0.6220806794055201, "grad_norm": 0.2520155608654022, "kl": 0.13720703125, "learning_rate": 2.237156545280803e-06, "loss": 0.0014, "reward": 0.44700442999601364, "reward_std": 0.21429810300469398, "rewards/code_reward": 0.34812046587467194, "rewards/format_reward": 0.9888393133878708, "step": 293 }, { "completion_length": 771.6897888183594, "epoch": 0.6242038216560509, "grad_norm": 0.41944995522499084, "kl": 0.22412109375, "learning_rate": 2.2229766403060403e-06, "loss": 0.0023, "reward": 0.4441903755068779, "reward_std": 0.19182176142930984, "rewards/code_reward": 0.3459760546684265, "rewards/format_reward": 0.9821428954601288, "step": 294 }, { "completion_length": 778.763427734375, "epoch": 0.6263269639065817, "grad_norm": 0.2801876962184906, "kl": 0.137939453125, "learning_rate": 2.2088188481759305e-06, "loss": 0.0014, "reward": 0.46992357820272446, "reward_std": 0.19111047685146332, "rewards/code_reward": 0.37103963643312454, "rewards/format_reward": 0.988839328289032, "step": 295 }, { "completion_length": 778.0893096923828, "epoch": 0.6284501061571125, "grad_norm": 0.21918566524982452, "kl": 0.131103515625, "learning_rate": 2.194683762923073e-06, "loss": 0.0013, "reward": 0.4984453171491623, "reward_std": 0.22232287377119064, "rewards/code_reward": 0.40045420452952385, "rewards/format_reward": 0.979910746216774, "step": 296 }, { "completion_length": 740.3951110839844, "epoch": 0.6305732484076433, "grad_norm": 0.31050121784210205, "kl": 0.1572265625, "learning_rate": 2.1805719776273387e-06, "loss": 0.0016, "reward": 0.4212986081838608, "reward_std": 0.1724853478372097, "rewards/code_reward": 0.321968249976635, "rewards/format_reward": 0.9933035969734192, "step": 297 }, { "completion_length": 682.138427734375, "epoch": 0.6326963906581741, "grad_norm": 0.24185748398303986, "kl": 0.17529296875, "learning_rate": 2.166484084390974e-06, "loss": 0.0019, "reward": 0.5747622847557068, "reward_std": 0.18613022193312645, "rewards/code_reward": 0.475878331810236, "rewards/format_reward": 0.9888393133878708, "step": 298 }, { "completion_length": 716.6518096923828, "epoch": 0.6348195329087049, "grad_norm": 0.6314132213592529, "kl": 0.166015625, "learning_rate": 2.1524206743137636e-06, "loss": 0.0017, "reward": 0.36886315792798996, "reward_std": 0.17360183410346508, "rewards/code_reward": 0.2708720788359642, "rewards/format_reward": 0.9799107611179352, "step": 299 }, { "completion_length": 737.8303833007812, "epoch": 0.6369426751592356, "grad_norm": 0.2968922555446625, "kl": 0.19287109375, "learning_rate": 2.1383823374682287e-06, "loss": 0.0019, "reward": 0.39945459365844727, "reward_std": 0.20941082388162613, "rewards/code_reward": 0.3014635145664215, "rewards/format_reward": 0.9799107611179352, "step": 300 }, { "completion_length": 718.5714721679688, "epoch": 0.6390658174097664, "grad_norm": 19.51397132873535, "kl": 0.275146484375, "learning_rate": 2.124369662874868e-06, "loss": 0.0029, "reward": 0.503417618572712, "reward_std": 0.15935716964304447, "rewards/code_reward": 0.40631940215826035, "rewards/format_reward": 0.9709821790456772, "step": 301 }, { "completion_length": 704.3393096923828, "epoch": 0.6411889596602972, "grad_norm": 0.35022857785224915, "kl": 0.14697265625, "learning_rate": 2.110383238477441e-06, "loss": 0.0015, "reward": 0.5569600984454155, "reward_std": 0.20704489946365356, "rewards/code_reward": 0.45785292237997055, "rewards/format_reward": 0.9910714477300644, "step": 302 }, { "completion_length": 702.6986846923828, "epoch": 0.643312101910828, "grad_norm": 0.17607638239860535, "kl": 0.13916015625, "learning_rate": 2.096423651118305e-06, "loss": 0.0014, "reward": 0.2535444311797619, "reward_std": 0.11278286523884162, "rewards/code_reward": 0.15466050058603287, "rewards/format_reward": 0.9888393133878708, "step": 303 }, { "completion_length": 701.2857360839844, "epoch": 0.6454352441613588, "grad_norm": 0.6241003274917603, "kl": 0.1826171875, "learning_rate": 2.082491486513788e-06, "loss": 0.0019, "reward": 0.5550656765699387, "reward_std": 0.21512125991284847, "rewards/code_reward": 0.45618174970149994, "rewards/format_reward": 0.988839328289032, "step": 304 }, { "completion_length": 709.0379638671875, "epoch": 0.6475583864118896, "grad_norm": 0.696461021900177, "kl": 0.1435546875, "learning_rate": 2.0685873292296116e-06, "loss": 0.0015, "reward": 0.3796486109495163, "reward_std": 0.15390164637938142, "rewards/code_reward": 0.28121111169457436, "rewards/format_reward": 0.9843750447034836, "step": 305 }, { "completion_length": 682.716552734375, "epoch": 0.6496815286624203, "grad_norm": 0.26720672845840454, "kl": 0.162109375, "learning_rate": 2.054711762656369e-06, "loss": 0.0016, "reward": 0.37838516384363174, "reward_std": 0.16313385590910912, "rewards/code_reward": 0.28061728924512863, "rewards/format_reward": 0.9776786118745804, "step": 306 }, { "completion_length": 666.8080596923828, "epoch": 0.6518046709129511, "grad_norm": 0.8882589936256409, "kl": 0.16259765625, "learning_rate": 2.040865368985044e-06, "loss": 0.0017, "reward": 0.4301592782139778, "reward_std": 0.20042868331074715, "rewards/code_reward": 0.33105212450027466, "rewards/format_reward": 0.9910714626312256, "step": 307 }, { "completion_length": 681.9129791259766, "epoch": 0.6539278131634819, "grad_norm": 0.23706179857254028, "kl": 0.18310546875, "learning_rate": 2.027048729182583e-06, "loss": 0.0019, "reward": 0.4861885607242584, "reward_std": 0.16966554708778858, "rewards/code_reward": 0.3881974592804909, "rewards/format_reward": 0.9799107611179352, "step": 308 }, { "completion_length": 693.8147583007812, "epoch": 0.6560509554140127, "grad_norm": 0.5197703242301941, "kl": 0.228271484375, "learning_rate": 2.0132624229675205e-06, "loss": 0.0024, "reward": 0.511215090751648, "reward_std": 0.18619069457054138, "rewards/code_reward": 0.4127775654196739, "rewards/format_reward": 0.9843750596046448, "step": 309 }, { "completion_length": 714.9777221679688, "epoch": 0.6581740976645435, "grad_norm": 0.24638721346855164, "kl": 0.189453125, "learning_rate": 1.9995070287856546e-06, "loss": 0.002, "reward": 0.5180679038167, "reward_std": 0.21345077827572823, "rewards/code_reward": 0.41963040083646774, "rewards/format_reward": 0.9843750447034836, "step": 310 }, { "completion_length": 708.2723388671875, "epoch": 0.6602972399150743, "grad_norm": 0.422715961933136, "kl": 0.18701171875, "learning_rate": 1.985783123785774e-06, "loss": 0.0019, "reward": 0.5620292499661446, "reward_std": 0.20659737288951874, "rewards/code_reward": 0.46314531564712524, "rewards/format_reward": 0.988839328289032, "step": 311 }, { "completion_length": 668.2589569091797, "epoch": 0.6624203821656051, "grad_norm": 0.6652376055717468, "kl": 0.240478515625, "learning_rate": 1.9720912837954486e-06, "loss": 0.0025, "reward": 0.4389989897608757, "reward_std": 0.20384247601032257, "rewards/code_reward": 0.33989182114601135, "rewards/format_reward": 0.9910714626312256, "step": 312 }, { "completion_length": 671.5424346923828, "epoch": 0.6645435244161358, "grad_norm": 0.898223876953125, "kl": 0.25927734375, "learning_rate": 1.958432083296862e-06, "loss": 0.0026, "reward": 0.36031387001276016, "reward_std": 0.2003210037946701, "rewards/code_reward": 0.26254600286483765, "rewards/format_reward": 0.9776786118745804, "step": 313 }, { "completion_length": 676.5602874755859, "epoch": 0.6666666666666666, "grad_norm": 0.7889689207077026, "kl": 0.2135009765625, "learning_rate": 1.9448060954027093e-06, "loss": 0.0022, "reward": 0.5204020366072655, "reward_std": 0.16625045239925385, "rewards/code_reward": 0.4212948679924011, "rewards/format_reward": 0.9910714477300644, "step": 314 }, { "completion_length": 684.9866333007812, "epoch": 0.6687898089171974, "grad_norm": 1.3564072847366333, "kl": 0.40185546875, "learning_rate": 1.931213891832153e-06, "loss": 0.0041, "reward": 0.526521772146225, "reward_std": 0.2212766855955124, "rewards/code_reward": 0.4278610572218895, "rewards/format_reward": 0.986607164144516, "step": 315 }, { "completion_length": 652.8102874755859, "epoch": 0.6709129511677282, "grad_norm": 0.24422591924667358, "kl": 0.147216796875, "learning_rate": 1.9176560428868336e-06, "loss": 0.0015, "reward": 0.3931754156947136, "reward_std": 0.1695394441485405, "rewards/code_reward": 0.29473789036273956, "rewards/format_reward": 0.9843750447034836, "step": 316 }, { "completion_length": 687.8214569091797, "epoch": 0.673036093418259, "grad_norm": 0.4171687960624695, "kl": 0.236328125, "learning_rate": 1.9041331174269373e-06, "loss": 0.0024, "reward": 0.47731664031744003, "reward_std": 0.20429091900587082, "rewards/code_reward": 0.378879152238369, "rewards/format_reward": 0.9843750298023224, "step": 317 }, { "completion_length": 682.6741485595703, "epoch": 0.6751592356687898, "grad_norm": 0.9241800308227539, "kl": 0.36083984375, "learning_rate": 1.8906456828473341e-06, "loss": 0.0036, "reward": 0.5124014094471931, "reward_std": 0.21064380928874016, "rewards/code_reward": 0.4132942706346512, "rewards/format_reward": 0.9910714626312256, "step": 318 }, { "completion_length": 684.0759124755859, "epoch": 0.6772823779193206, "grad_norm": 0.24995659291744232, "kl": 0.14794921875, "learning_rate": 1.8771943050537656e-06, "loss": 0.0016, "reward": 0.592289388179779, "reward_std": 0.2126442939043045, "rewards/code_reward": 0.4942983016371727, "rewards/format_reward": 0.9799107611179352, "step": 319 }, { "completion_length": 719.6786041259766, "epoch": 0.6794055201698513, "grad_norm": 0.24401088058948517, "kl": 0.1395263671875, "learning_rate": 1.8637795484391046e-06, "loss": 0.0014, "reward": 0.4689144790172577, "reward_std": 0.25591350346803665, "rewards/code_reward": 0.3711466044187546, "rewards/format_reward": 0.9776786118745804, "step": 320 }, { "completion_length": 655.4844055175781, "epoch": 0.6815286624203821, "grad_norm": 0.3457026779651642, "kl": 0.50341796875, "learning_rate": 1.8504019758596698e-06, "loss": 0.0051, "reward": 0.5521439760923386, "reward_std": 0.2452612817287445, "rewards/code_reward": 0.45326002687215805, "rewards/format_reward": 0.9888393133878708, "step": 321 }, { "completion_length": 714.7143096923828, "epoch": 0.6836518046709129, "grad_norm": 0.3326283395290375, "kl": 0.1953125, "learning_rate": 1.8370621486116163e-06, "loss": 0.0021, "reward": 0.5532227605581284, "reward_std": 0.18401411548256874, "rewards/code_reward": 0.4552316591143608, "rewards/format_reward": 0.9799107760190964, "step": 322 }, { "completion_length": 677.7634124755859, "epoch": 0.6857749469214437, "grad_norm": 0.3285404145717621, "kl": 0.23876953125, "learning_rate": 1.823760626407377e-06, "loss": 0.0025, "reward": 0.4828302264213562, "reward_std": 0.1928608939051628, "rewards/code_reward": 0.384615920484066, "rewards/format_reward": 0.9821429252624512, "step": 323 }, { "completion_length": 699.5982666015625, "epoch": 0.6878980891719745, "grad_norm": 0.34025460481643677, "kl": 0.224365234375, "learning_rate": 1.8104979673521838e-06, "loss": 0.0023, "reward": 0.42327145487070084, "reward_std": 0.15393321216106415, "rewards/code_reward": 0.32505714148283005, "rewards/format_reward": 0.98214291036129, "step": 324 }, { "completion_length": 650.997802734375, "epoch": 0.6900212314225053, "grad_norm": 0.3025732636451721, "kl": 0.24853515625, "learning_rate": 1.7972747279206482e-06, "loss": 0.0025, "reward": 0.37195510417222977, "reward_std": 0.19180476292967796, "rewards/code_reward": 0.27418723329901695, "rewards/format_reward": 0.9776786267757416, "step": 325 }, { "completion_length": 692.4464569091797, "epoch": 0.692144373673036, "grad_norm": 0.2389409989118576, "kl": 0.148681640625, "learning_rate": 1.7840914629334122e-06, "loss": 0.0016, "reward": 0.5394042208790779, "reward_std": 0.22496159374713898, "rewards/code_reward": 0.44185957312583923, "rewards/format_reward": 0.9754464775323868, "step": 326 }, { "completion_length": 709.0669860839844, "epoch": 0.6942675159235668, "grad_norm": 0.28394991159439087, "kl": 0.194091796875, "learning_rate": 1.7709487255338731e-06, "loss": 0.0021, "reward": 0.4636544920504093, "reward_std": 0.15700273029506207, "rewards/code_reward": 0.36633305437862873, "rewards/format_reward": 0.973214328289032, "step": 327 }, { "completion_length": 702.7768249511719, "epoch": 0.6963906581740976, "grad_norm": 0.22292962670326233, "kl": 0.17431640625, "learning_rate": 1.7578470671649684e-06, "loss": 0.0019, "reward": 0.4268321394920349, "reward_std": 0.1670057326555252, "rewards/code_reward": 0.32928748056292534, "rewards/format_reward": 0.9754464775323868, "step": 328 }, { "completion_length": 694.5937805175781, "epoch": 0.6985138004246284, "grad_norm": 0.782927393913269, "kl": 0.3328857421875, "learning_rate": 1.744787037546045e-06, "loss": 0.0034, "reward": 0.46113383024930954, "reward_std": 0.18688062392175198, "rewards/code_reward": 0.3626963049173355, "rewards/format_reward": 0.9843750298023224, "step": 329 }, { "completion_length": 706.1986999511719, "epoch": 0.7006369426751592, "grad_norm": 0.41430673003196716, "kl": 0.1827392578125, "learning_rate": 1.731769184649788e-06, "loss": 0.0019, "reward": 0.5658792853355408, "reward_std": 0.23742860183119774, "rewards/code_reward": 0.4683346152305603, "rewards/format_reward": 0.9754464775323868, "step": 330 }, { "completion_length": 694.9576416015625, "epoch": 0.70276008492569, "grad_norm": 0.6622937917709351, "kl": 0.214111328125, "learning_rate": 1.7187940546792325e-06, "loss": 0.0022, "reward": 0.4137548431754112, "reward_std": 0.1334713213145733, "rewards/code_reward": 0.3155405670404434, "rewards/format_reward": 0.9821428954601288, "step": 331 }, { "completion_length": 716.8393249511719, "epoch": 0.7048832271762208, "grad_norm": 0.22396574914455414, "kl": 0.2607421875, "learning_rate": 1.7058621920448465e-06, "loss": 0.0027, "reward": 0.4444565996527672, "reward_std": 0.18423740193247795, "rewards/code_reward": 0.34646550565958023, "rewards/format_reward": 0.9799107611179352, "step": 332 }, { "completion_length": 703.0937805175781, "epoch": 0.7070063694267515, "grad_norm": 0.2483583688735962, "kl": 0.160888671875, "learning_rate": 1.6929741393416855e-06, "loss": 0.0016, "reward": 0.47170016914606094, "reward_std": 0.18039512634277344, "rewards/code_reward": 0.37393229454755783, "rewards/format_reward": 0.9776786267757416, "step": 333 }, { "completion_length": 755.0312805175781, "epoch": 0.7091295116772823, "grad_norm": 0.4338986873626709, "kl": 0.357177734375, "learning_rate": 1.6801304373266286e-06, "loss": 0.0036, "reward": 0.4291260167956352, "reward_std": 0.15267430432140827, "rewards/code_reward": 0.3318046070635319, "rewards/format_reward": 0.9732143133878708, "step": 334 }, { "completion_length": 767.3214569091797, "epoch": 0.7112526539278131, "grad_norm": 0.21925950050354004, "kl": 0.137451171875, "learning_rate": 1.667331624895689e-06, "loss": 0.0014, "reward": 0.4992447942495346, "reward_std": 0.21635426208376884, "rewards/code_reward": 0.4014769196510315, "rewards/format_reward": 0.9776786118745804, "step": 335 }, { "completion_length": 750.1696929931641, "epoch": 0.7133757961783439, "grad_norm": 0.30118319392204285, "kl": 0.359619140625, "learning_rate": 1.6545782390614037e-06, "loss": 0.0037, "reward": 0.4922778084874153, "reward_std": 0.1726557295769453, "rewards/code_reward": 0.39317065104842186, "rewards/format_reward": 0.9910714626312256, "step": 336 }, { "completion_length": 718.6339721679688, "epoch": 0.7154989384288747, "grad_norm": 0.41911348700523376, "kl": 0.317626953125, "learning_rate": 1.6418708149302992e-06, "loss": 0.0033, "reward": 0.44379642605781555, "reward_std": 0.19296832010149956, "rewards/code_reward": 0.3451356738805771, "rewards/format_reward": 0.986607164144516, "step": 337 }, { "completion_length": 694.1138610839844, "epoch": 0.7176220806794055, "grad_norm": 0.7091541886329651, "kl": 0.27783203125, "learning_rate": 1.6292098856804423e-06, "loss": 0.0028, "reward": 0.4443873465061188, "reward_std": 0.19508511200547218, "rewards/code_reward": 0.3468426913022995, "rewards/format_reward": 0.9754464775323868, "step": 338 }, { "completion_length": 720.607177734375, "epoch": 0.7197452229299363, "grad_norm": 0.6043697595596313, "kl": 0.3173828125, "learning_rate": 1.6165959825390661e-06, "loss": 0.0033, "reward": 0.43542125821113586, "reward_std": 0.16308805532753468, "rewards/code_reward": 0.33720696344971657, "rewards/format_reward": 0.9821428954601288, "step": 339 }, { "completion_length": 706.1317291259766, "epoch": 0.721868365180467, "grad_norm": 0.2581160068511963, "kl": 0.2353515625, "learning_rate": 1.604029634760284e-06, "loss": 0.0025, "reward": 0.5382986813783646, "reward_std": 0.14037772081792355, "rewards/code_reward": 0.4403075948357582, "rewards/format_reward": 0.9799107611179352, "step": 340 }, { "completion_length": 737.5469055175781, "epoch": 0.7239915074309978, "grad_norm": 0.4556562006473541, "kl": 0.368408203125, "learning_rate": 1.59151136960288e-06, "loss": 0.0037, "reward": 0.538652278482914, "reward_std": 0.20831965655088425, "rewards/code_reward": 0.44133080542087555, "rewards/format_reward": 0.973214328289032, "step": 341 }, { "completion_length": 723.9486999511719, "epoch": 0.7261146496815286, "grad_norm": 0.2620218098163605, "kl": 0.159912109375, "learning_rate": 1.5790417123081903e-06, "loss": 0.0017, "reward": 0.45855508744716644, "reward_std": 0.1777043156325817, "rewards/code_reward": 0.3605640158057213, "rewards/format_reward": 0.9799107611179352, "step": 342 }, { "completion_length": 686.8236999511719, "epoch": 0.7282377919320594, "grad_norm": 0.2753090560436249, "kl": 0.16455078125, "learning_rate": 1.5666211860780583e-06, "loss": 0.0018, "reward": 0.5850269198417664, "reward_std": 0.19610749557614326, "rewards/code_reward": 0.4870358556509018, "rewards/format_reward": 0.9799107760190964, "step": 343 }, { "completion_length": 684.1205749511719, "epoch": 0.7303609341825902, "grad_norm": 0.23944684863090515, "kl": 0.16455078125, "learning_rate": 1.5542503120528918e-06, "loss": 0.0017, "reward": 0.5332599207758904, "reward_std": 0.2457549162209034, "rewards/code_reward": 0.43437594920396805, "rewards/format_reward": 0.9888393133878708, "step": 344 }, { "completion_length": 720.3839569091797, "epoch": 0.732484076433121, "grad_norm": 0.31666672229766846, "kl": 0.213134765625, "learning_rate": 1.5419296092897866e-06, "loss": 0.0022, "reward": 0.5879708528518677, "reward_std": 0.24002529680728912, "rewards/code_reward": 0.4899797812104225, "rewards/format_reward": 0.9799107611179352, "step": 345 }, { "completion_length": 693.6964569091797, "epoch": 0.7346072186836518, "grad_norm": 0.24176108837127686, "kl": 0.15869140625, "learning_rate": 1.529659594740755e-06, "loss": 0.0016, "reward": 0.4276282340288162, "reward_std": 0.20496541634202003, "rewards/code_reward": 0.32896753773093224, "rewards/format_reward": 0.9866071790456772, "step": 346 }, { "completion_length": 704.2902221679688, "epoch": 0.7367303609341825, "grad_norm": 0.2568061351776123, "kl": 0.15771484375, "learning_rate": 1.5174407832310338e-06, "loss": 0.0016, "reward": 0.39445348642766476, "reward_std": 0.13825338683091104, "rewards/code_reward": 0.2962391600012779, "rewards/format_reward": 0.9821428954601288, "step": 347 }, { "completion_length": 722.2545013427734, "epoch": 0.7388535031847133, "grad_norm": 0.49012815952301025, "kl": 0.17578125, "learning_rate": 1.5052736874374815e-06, "loss": 0.0018, "reward": 0.488083653151989, "reward_std": 0.1927042007446289, "rewards/code_reward": 0.39009255915880203, "rewards/format_reward": 0.979910746216774, "step": 348 }, { "completion_length": 713.3147583007812, "epoch": 0.7409766454352441, "grad_norm": 0.6304606795310974, "kl": 0.29345703125, "learning_rate": 1.4931588178670695e-06, "loss": 0.003, "reward": 0.4815641790628433, "reward_std": 0.16962832398712635, "rewards/code_reward": 0.38357311114668846, "rewards/format_reward": 0.9799107611179352, "step": 349 }, { "completion_length": 700.3214569091797, "epoch": 0.7430997876857749, "grad_norm": 0.43463101983070374, "kl": 0.289306640625, "learning_rate": 1.4810966828354605e-06, "loss": 0.0029, "reward": 0.45994506776332855, "reward_std": 0.1931474320590496, "rewards/code_reward": 0.36173076555132866, "rewards/format_reward": 0.98214291036129, "step": 350 }, { "completion_length": 685.1384124755859, "epoch": 0.7452229299363057, "grad_norm": 0.34815892577171326, "kl": 0.44140625, "learning_rate": 1.469087788445684e-06, "loss": 0.0045, "reward": 0.5396069064736366, "reward_std": 0.20336921885609627, "rewards/code_reward": 0.44250866025686264, "rewards/format_reward": 0.9709821939468384, "step": 351 }, { "completion_length": 698.6071929931641, "epoch": 0.7473460721868365, "grad_norm": 0.3489153981208801, "kl": 0.533447265625, "learning_rate": 1.4571326385668965e-06, "loss": 0.0055, "reward": 0.6215780973434448, "reward_std": 0.202628992497921, "rewards/code_reward": 0.5229173377156258, "rewards/format_reward": 0.9866071790456772, "step": 352 }, { "completion_length": 713.6897583007812, "epoch": 0.7494692144373672, "grad_norm": 0.2902304232120514, "kl": 0.160400390625, "learning_rate": 1.4452317348132434e-06, "loss": 0.0018, "reward": 0.43891899287700653, "reward_std": 0.1397520825266838, "rewards/code_reward": 0.3393654003739357, "rewards/format_reward": 0.9955357313156128, "step": 353 }, { "completion_length": 706.091552734375, "epoch": 0.7515923566878981, "grad_norm": 0.7335183024406433, "kl": 0.34814453125, "learning_rate": 1.4333855765228104e-06, "loss": 0.0037, "reward": 0.6451611816883087, "reward_std": 0.20771214738488197, "rewards/code_reward": 0.5465004742145538, "rewards/format_reward": 0.9866071939468384, "step": 354 }, { "completion_length": 712.1607513427734, "epoch": 0.7537154989384289, "grad_norm": 0.7572880387306213, "kl": 0.3447265625, "learning_rate": 1.421594660736675e-06, "loss": 0.0035, "reward": 0.41940218955278397, "reward_std": 0.1921430230140686, "rewards/code_reward": 0.3202950209379196, "rewards/format_reward": 0.9910714626312256, "step": 355 }, { "completion_length": 680.1986999511719, "epoch": 0.7558386411889597, "grad_norm": 0.3940925896167755, "kl": 0.549560546875, "learning_rate": 1.4098594821780476e-06, "loss": 0.0056, "reward": 0.6083894520998001, "reward_std": 0.1597061362117529, "rewards/code_reward": 0.5108448341488838, "rewards/format_reward": 0.9754464626312256, "step": 356 }, { "completion_length": 665.372802734375, "epoch": 0.7579617834394905, "grad_norm": 0.2566499710083008, "kl": 0.192138671875, "learning_rate": 1.3981805332315174e-06, "loss": 0.002, "reward": 0.4351358078420162, "reward_std": 0.1653740406036377, "rewards/code_reward": 0.3360286522656679, "rewards/format_reward": 0.9910714626312256, "step": 357 }, { "completion_length": 732.7589569091797, "epoch": 0.7600849256900213, "grad_norm": 0.35650861263275146, "kl": 0.250732421875, "learning_rate": 1.3865583039223929e-06, "loss": 0.0026, "reward": 0.5535444989800453, "reward_std": 0.17830567993223667, "rewards/code_reward": 0.4555533789098263, "rewards/format_reward": 0.979910746216774, "step": 358 }, { "completion_length": 708.0424346923828, "epoch": 0.7622080679405521, "grad_norm": 0.24273599684238434, "kl": 0.1611328125, "learning_rate": 1.374993281896137e-06, "loss": 0.0017, "reward": 0.44518817216157913, "reward_std": 0.19435212016105652, "rewards/code_reward": 0.34697388112545013, "rewards/format_reward": 0.98214291036129, "step": 359 }, { "completion_length": 765.1786041259766, "epoch": 0.7643312101910829, "grad_norm": 0.3510468304157257, "kl": 0.197021484375, "learning_rate": 1.3634859523979134e-06, "loss": 0.002, "reward": 0.47114741802215576, "reward_std": 0.1812426745891571, "rewards/code_reward": 0.3724866919219494, "rewards/format_reward": 0.9866071939468384, "step": 360 }, { "completion_length": 724.216552734375, "epoch": 0.7664543524416136, "grad_norm": 1.1458288431167603, "kl": 0.52978515625, "learning_rate": 1.3520367982522208e-06, "loss": 0.0053, "reward": 0.45792729407548904, "reward_std": 0.16464052349328995, "rewards/code_reward": 0.35926656424999237, "rewards/format_reward": 0.9866071939468384, "step": 361 }, { "completion_length": 705.4464721679688, "epoch": 0.7685774946921444, "grad_norm": 0.4750509560108185, "kl": 0.23779296875, "learning_rate": 1.3406462998426358e-06, "loss": 0.0024, "reward": 0.5133348107337952, "reward_std": 0.24053634703159332, "rewards/code_reward": 0.41445086151361465, "rewards/format_reward": 0.988839328289032, "step": 362 }, { "completion_length": 743.1853179931641, "epoch": 0.7707006369426752, "grad_norm": 0.2608552575111389, "kl": 0.325927734375, "learning_rate": 1.3293149350916595e-06, "loss": 0.0033, "reward": 0.5553034171462059, "reward_std": 0.19487734138965607, "rewards/code_reward": 0.45731230080127716, "rewards/format_reward": 0.979910746216774, "step": 363 }, { "completion_length": 678.2209930419922, "epoch": 0.772823779193206, "grad_norm": 0.22239775955677032, "kl": 0.13037109375, "learning_rate": 1.3180431794406623e-06, "loss": 0.0015, "reward": 0.6062557250261307, "reward_std": 0.2048381306231022, "rewards/code_reward": 0.5069253593683243, "rewards/format_reward": 0.9933035969734192, "step": 364 }, { "completion_length": 707.4620971679688, "epoch": 0.7749469214437368, "grad_norm": 0.4696608781814575, "kl": 0.270751953125, "learning_rate": 1.3068315058299358e-06, "loss": 0.0029, "reward": 0.5663170740008354, "reward_std": 0.15939603559672832, "rewards/code_reward": 0.4678795412182808, "rewards/format_reward": 0.9843750447034836, "step": 365 }, { "completion_length": 653.2879791259766, "epoch": 0.7770700636942676, "grad_norm": 1.1559607982635498, "kl": 0.3037109375, "learning_rate": 1.2956803846788503e-06, "loss": 0.0032, "reward": 0.618221327662468, "reward_std": 0.22959138825535774, "rewards/code_reward": 0.5193373411893845, "rewards/format_reward": 0.988839328289032, "step": 366 }, { "completion_length": 731.2076263427734, "epoch": 0.7791932059447984, "grad_norm": 0.48825645446777344, "kl": 0.210693359375, "learning_rate": 1.284590283866116e-06, "loss": 0.0021, "reward": 0.33228749781847, "reward_std": 0.15970432199537754, "rewards/code_reward": 0.2345196194946766, "rewards/format_reward": 0.9776786267757416, "step": 367 }, { "completion_length": 695.4576110839844, "epoch": 0.7813163481953291, "grad_norm": 1.4041056632995605, "kl": 0.1883544921875, "learning_rate": 1.2735616687101518e-06, "loss": 0.002, "reward": 0.40882231295108795, "reward_std": 0.16854364797472954, "rewards/code_reward": 0.3103848248720169, "rewards/format_reward": 0.9843750298023224, "step": 368 }, { "completion_length": 696.4687805175781, "epoch": 0.7834394904458599, "grad_norm": 1.9169604778289795, "kl": 0.201171875, "learning_rate": 1.2625950019495614e-06, "loss": 0.0021, "reward": 0.5380031913518906, "reward_std": 0.1728157363831997, "rewards/code_reward": 0.4400121048092842, "rewards/format_reward": 0.979910746216774, "step": 369 }, { "completion_length": 709.4598693847656, "epoch": 0.7855626326963907, "grad_norm": 0.3797023594379425, "kl": 0.1640625, "learning_rate": 1.251690743723718e-06, "loss": 0.0017, "reward": 0.5747079327702522, "reward_std": 0.24513645470142365, "rewards/code_reward": 0.4767168238759041, "rewards/format_reward": 0.9799107611179352, "step": 370 }, { "completion_length": 647.7209930419922, "epoch": 0.7876857749469215, "grad_norm": 0.24551738798618317, "kl": 0.150390625, "learning_rate": 1.2408493515534581e-06, "loss": 0.0016, "reward": 0.6943890303373337, "reward_std": 0.22319162264466286, "rewards/code_reward": 0.5959515273571014, "rewards/format_reward": 0.9843750298023224, "step": 371 }, { "completion_length": 692.2098693847656, "epoch": 0.7898089171974523, "grad_norm": 0.4829825460910797, "kl": 0.406005859375, "learning_rate": 1.2300712803218834e-06, "loss": 0.0042, "reward": 0.5234424099326134, "reward_std": 0.1910531185567379, "rewards/code_reward": 0.42388884723186493, "rewards/format_reward": 0.9955357313156128, "step": 372 }, { "completion_length": 697.3036041259766, "epoch": 0.7919320594479831, "grad_norm": 114.25981140136719, "kl": 16.0146484375, "learning_rate": 1.2193569822552772e-06, "loss": 0.1608, "reward": 0.559485673904419, "reward_std": 0.20258497074246407, "rewards/code_reward": 0.4606017544865608, "rewards/format_reward": 0.988839328289032, "step": 373 }, { "completion_length": 677.7567291259766, "epoch": 0.7940552016985138, "grad_norm": 0.3005722761154175, "kl": 0.171875, "learning_rate": 1.2087069069041268e-06, "loss": 0.0018, "reward": 0.5883411467075348, "reward_std": 0.21694539301097393, "rewards/code_reward": 0.4901268184185028, "rewards/format_reward": 0.98214291036129, "step": 374 }, { "completion_length": 671.1495819091797, "epoch": 0.7961783439490446, "grad_norm": 0.6558151841163635, "kl": 0.162841796875, "learning_rate": 1.1981215011242654e-06, "loss": 0.0017, "reward": 0.5491671711206436, "reward_std": 0.2353355698287487, "rewards/code_reward": 0.45050643384456635, "rewards/format_reward": 0.9866071939468384, "step": 375 }, { "completion_length": 663.4486999511719, "epoch": 0.7983014861995754, "grad_norm": 1.0574246644973755, "kl": 0.168701171875, "learning_rate": 1.1876012090581184e-06, "loss": 0.0018, "reward": 0.523729532957077, "reward_std": 0.19741250574588776, "rewards/code_reward": 0.42573845386505127, "rewards/format_reward": 0.979910746216774, "step": 376 }, { "completion_length": 678.5826110839844, "epoch": 0.8004246284501062, "grad_norm": 0.28517383337020874, "kl": 0.168212890625, "learning_rate": 1.177146472116071e-06, "loss": 0.0018, "reward": 0.4997348487377167, "reward_std": 0.16867511346936226, "rewards/code_reward": 0.40196699649095535, "rewards/format_reward": 0.9776785969734192, "step": 377 }, { "completion_length": 725.0000457763672, "epoch": 0.802547770700637, "grad_norm": 0.38322436809539795, "kl": 0.176025390625, "learning_rate": 1.1667577289579462e-06, "loss": 0.0018, "reward": 0.43969085440039635, "reward_std": 0.16067362390458584, "rewards/code_reward": 0.3425925988703966, "rewards/format_reward": 0.9709821790456772, "step": 378 }, { "completion_length": 671.4710083007812, "epoch": 0.8046709129511678, "grad_norm": 0.24044837057590485, "kl": 0.1435546875, "learning_rate": 1.1564354154746007e-06, "loss": 0.0015, "reward": 0.5779925882816315, "reward_std": 0.22314922511577606, "rewards/code_reward": 0.479331873357296, "rewards/format_reward": 0.9866071790456772, "step": 379 }, { "completion_length": 701.1875457763672, "epoch": 0.8067940552016986, "grad_norm": 0.2769814729690552, "kl": 0.187255859375, "learning_rate": 1.146179964769635e-06, "loss": 0.002, "reward": 0.5813698992133141, "reward_std": 0.21280257403850555, "rewards/code_reward": 0.48315558582544327, "rewards/format_reward": 0.9821428954601288, "step": 380 }, { "completion_length": 703.2701263427734, "epoch": 0.8089171974522293, "grad_norm": 0.43315884470939636, "kl": 0.28125, "learning_rate": 1.1359918071412195e-06, "loss": 0.003, "reward": 0.5584300383925438, "reward_std": 0.17897445522248745, "rewards/code_reward": 0.4595461040735245, "rewards/format_reward": 0.988839328289032, "step": 381 }, { "completion_length": 680.9174499511719, "epoch": 0.8110403397027601, "grad_norm": 0.3025217652320862, "kl": 0.208251953125, "learning_rate": 1.1258713700640456e-06, "loss": 0.0022, "reward": 0.47092022001743317, "reward_std": 0.1665214579552412, "rewards/code_reward": 0.3727059066295624, "rewards/format_reward": 0.9821428954601288, "step": 382 }, { "completion_length": 672.3705596923828, "epoch": 0.8131634819532909, "grad_norm": 0.23662854731082916, "kl": 0.1478271484375, "learning_rate": 1.115819078171383e-06, "loss": 0.0016, "reward": 0.5290590599179268, "reward_std": 0.21020712330937386, "rewards/code_reward": 0.4312911853194237, "rewards/format_reward": 0.9776786267757416, "step": 383 }, { "completion_length": 659.2678833007812, "epoch": 0.8152866242038217, "grad_norm": 0.2239212840795517, "kl": 0.1688232421875, "learning_rate": 1.1058353532372667e-06, "loss": 0.0018, "reward": 0.5600069090723991, "reward_std": 0.20570005849003792, "rewards/code_reward": 0.46067656576633453, "rewards/format_reward": 0.9933035969734192, "step": 384 }, { "completion_length": 688.6205596923828, "epoch": 0.8174097664543525, "grad_norm": 0.25939956307411194, "kl": 0.156982421875, "learning_rate": 1.0959206141587998e-06, "loss": 0.0016, "reward": 0.461281917989254, "reward_std": 0.2138805352151394, "rewards/code_reward": 0.36329086124897003, "rewards/format_reward": 0.9799107760190964, "step": 385 }, { "completion_length": 689.8527069091797, "epoch": 0.8195329087048833, "grad_norm": 0.564179003238678, "kl": 0.34716796875, "learning_rate": 1.0860752769385766e-06, "loss": 0.0035, "reward": 0.5820841789245605, "reward_std": 0.23867543786764145, "rewards/code_reward": 0.48320019245147705, "rewards/format_reward": 0.9888393133878708, "step": 386 }, { "completion_length": 716.8995971679688, "epoch": 0.821656050955414, "grad_norm": 0.31268319487571716, "kl": 0.2451171875, "learning_rate": 1.0762997546672279e-06, "loss": 0.0026, "reward": 0.24600705318152905, "reward_std": 0.06653665285557508, "rewards/code_reward": 0.14823918044567108, "rewards/format_reward": 0.9776786267757416, "step": 387 }, { "completion_length": 661.8973388671875, "epoch": 0.8237791932059448, "grad_norm": 0.23703983426094055, "kl": 0.139892578125, "learning_rate": 1.0665944575060914e-06, "loss": 0.0015, "reward": 0.5530121028423309, "reward_std": 0.2014228142797947, "rewards/code_reward": 0.45368169248104095, "rewards/format_reward": 0.9933035969734192, "step": 388 }, { "completion_length": 671.0468902587891, "epoch": 0.8259023354564756, "grad_norm": 0.21562151610851288, "kl": 0.14697265625, "learning_rate": 1.056959792669997e-06, "loss": 0.0016, "reward": 0.6221778392791748, "reward_std": 0.17795583605766296, "rewards/code_reward": 0.5246331766247749, "rewards/format_reward": 0.9754464626312256, "step": 389 }, { "completion_length": 707.1830749511719, "epoch": 0.8280254777070064, "grad_norm": 0.25027066469192505, "kl": 0.15234375, "learning_rate": 1.0473961644101856e-06, "loss": 0.0016, "reward": 0.49339308589696884, "reward_std": 0.1599120758473873, "rewards/code_reward": 0.39450912177562714, "rewards/format_reward": 0.988839328289032, "step": 390 }, { "completion_length": 724.7522583007812, "epoch": 0.8301486199575372, "grad_norm": 0.2350330352783203, "kl": 0.193603515625, "learning_rate": 1.037903973997345e-06, "loss": 0.0021, "reward": 0.478931725025177, "reward_std": 0.12047621235251427, "rewards/code_reward": 0.3804941847920418, "rewards/format_reward": 0.9843750596046448, "step": 391 }, { "completion_length": 702.982177734375, "epoch": 0.832271762208068, "grad_norm": 0.3609310984611511, "kl": 0.179931640625, "learning_rate": 1.0284836197047737e-06, "loss": 0.0019, "reward": 0.44246046990156174, "reward_std": 0.1557149738073349, "rewards/code_reward": 0.3444693833589554, "rewards/format_reward": 0.9799107611179352, "step": 392 }, { "completion_length": 674.607177734375, "epoch": 0.8343949044585988, "grad_norm": 0.5464503765106201, "kl": 0.248046875, "learning_rate": 1.0191354967916712e-06, "loss": 0.0026, "reward": 0.5180330500006676, "reward_std": 0.1834750883281231, "rewards/code_reward": 0.4193723499774933, "rewards/format_reward": 0.9866071939468384, "step": 393 }, { "completion_length": 684.0937805175781, "epoch": 0.8365180467091295, "grad_norm": 0.23662471771240234, "kl": 0.1290283203125, "learning_rate": 1.0098599974865515e-06, "loss": 0.0014, "reward": 0.5139395222067833, "reward_std": 0.1551931146532297, "rewards/code_reward": 0.41594842076301575, "rewards/format_reward": 0.9799107611179352, "step": 394 }, { "completion_length": 692.5580596923828, "epoch": 0.8386411889596603, "grad_norm": 0.34932953119277954, "kl": 0.154296875, "learning_rate": 1.0006575109707898e-06, "loss": 0.0017, "reward": 0.5320730581879616, "reward_std": 0.205118702724576, "rewards/code_reward": 0.43318910896778107, "rewards/format_reward": 0.988839328289032, "step": 395 }, { "completion_length": 678.8571624755859, "epoch": 0.8407643312101911, "grad_norm": 0.5195670127868652, "kl": 0.1474609375, "learning_rate": 9.915284233622877e-07, "loss": 0.0016, "reward": 0.4320642352104187, "reward_std": 0.18216058425605297, "rewards/code_reward": 0.33362672477960587, "rewards/format_reward": 0.9843750298023224, "step": 396 }, { "completion_length": 706.8861999511719, "epoch": 0.8428874734607219, "grad_norm": 0.24882346391677856, "kl": 0.148681640625, "learning_rate": 9.824731176992796e-07, "loss": 0.0016, "reward": 0.5600469708442688, "reward_std": 0.16885506361722946, "rewards/code_reward": 0.4616094380617142, "rewards/format_reward": 0.9843750596046448, "step": 397 }, { "completion_length": 669.0491333007812, "epoch": 0.8450106157112527, "grad_norm": 1.0406914949417114, "kl": 0.364013671875, "learning_rate": 9.734919739242543e-07, "loss": 0.0037, "reward": 0.5749830156564713, "reward_std": 0.21774039044976234, "rewards/code_reward": 0.47676874697208405, "rewards/format_reward": 0.9821428954601288, "step": 398 }, { "completion_length": 723.325927734375, "epoch": 0.8471337579617835, "grad_norm": 0.5013810396194458, "kl": 0.1451416015625, "learning_rate": 9.645853688680177e-07, "loss": 0.0016, "reward": 0.5728159248828888, "reward_std": 0.1670310366898775, "rewards/code_reward": 0.4746016263961792, "rewards/format_reward": 0.9821428805589676, "step": 399 }, { "completion_length": 700.310302734375, "epoch": 0.8492569002123143, "grad_norm": 0.8073310852050781, "kl": 0.2965087890625, "learning_rate": 9.557536762338786e-07, "loss": 0.003, "reward": 0.492939718067646, "reward_std": 0.2011387124657631, "rewards/code_reward": 0.39494864642620087, "rewards/format_reward": 0.9799107611179352, "step": 400 }, { "completion_length": 693.0536041259766, "epoch": 0.851380042462845, "grad_norm": 0.3889514207839966, "kl": 0.164306640625, "learning_rate": 9.46997266581973e-07, "loss": 0.0018, "reward": 0.5752345323562622, "reward_std": 0.19828381016850471, "rewards/code_reward": 0.475680947303772, "rewards/format_reward": 0.9955357313156128, "step": 401 }, { "completion_length": 706.841552734375, "epoch": 0.8535031847133758, "grad_norm": 4.799881458282471, "kl": 0.4912109375, "learning_rate": 9.383165073137115e-07, "loss": 0.0051, "reward": 0.5113906338810921, "reward_std": 0.14735013246536255, "rewards/code_reward": 0.41295309364795685, "rewards/format_reward": 0.9843750596046448, "step": 402 }, { "completion_length": 691.2210235595703, "epoch": 0.8556263269639066, "grad_norm": 3.541896104812622, "kl": 0.14697265625, "learning_rate": 9.297117626563687e-07, "loss": 0.0016, "reward": 0.6038797795772552, "reward_std": 0.18652482330799103, "rewards/code_reward": 0.5065583363175392, "rewards/format_reward": 0.973214328289032, "step": 403 }, { "completion_length": 725.4933471679688, "epoch": 0.8577494692144374, "grad_norm": 158.1067352294922, "kl": 18.90283203125, "learning_rate": 9.211833936477957e-07, "loss": 0.1896, "reward": 0.5942443758249283, "reward_std": 0.12594054080545902, "rewards/code_reward": 0.4960300847887993, "rewards/format_reward": 0.9821428954601288, "step": 404 }, { "completion_length": 717.1585235595703, "epoch": 0.8598726114649682, "grad_norm": 2265.1884765625, "kl": 230.10986328125, "learning_rate": 9.127317581212753e-07, "loss": 2.3015, "reward": 0.53834218531847, "reward_std": 0.1464555226266384, "rewards/code_reward": 0.4394582211971283, "rewards/format_reward": 0.988839328289032, "step": 405 }, { "completion_length": 727.5826110839844, "epoch": 0.861995753715499, "grad_norm": 0.2873145341873169, "kl": 0.1866455078125, "learning_rate": 9.043572106905084e-07, "loss": 0.0019, "reward": 0.5367319211363792, "reward_std": 0.17168255895376205, "rewards/code_reward": 0.43851763010025024, "rewards/format_reward": 0.98214291036129, "step": 406 }, { "completion_length": 726.1674499511719, "epoch": 0.8641188959660298, "grad_norm": 0.2757129371166229, "kl": 0.1365966796875, "learning_rate": 8.960601027347321e-07, "loss": 0.0014, "reward": 0.5360690876841545, "reward_std": 0.2111339271068573, "rewards/code_reward": 0.4367387220263481, "rewards/format_reward": 0.9933035969734192, "step": 407 }, { "completion_length": 708.4241333007812, "epoch": 0.8662420382165605, "grad_norm": 1.7461967468261719, "kl": 0.15234375, "learning_rate": 8.878407823839788e-07, "loss": 0.0016, "reward": 0.4714769721031189, "reward_std": 0.17892321571707726, "rewards/code_reward": 0.3723698630928993, "rewards/format_reward": 0.9910714626312256, "step": 408 }, { "completion_length": 722.7344207763672, "epoch": 0.8683651804670913, "grad_norm": 1.359683632850647, "kl": 0.1497802734375, "learning_rate": 8.796995945044689e-07, "loss": 0.0017, "reward": 0.5647559985518456, "reward_std": 0.16498099640011787, "rewards/code_reward": 0.4656488224864006, "rewards/format_reward": 0.9910714477300644, "step": 409 }, { "completion_length": 758.9174499511719, "epoch": 0.8704883227176221, "grad_norm": 0.34433820843696594, "kl": 0.12939453125, "learning_rate": 8.716368806841405e-07, "loss": 0.0013, "reward": 0.40852154791355133, "reward_std": 0.19776060804724693, "rewards/code_reward": 0.30919117480516434, "rewards/format_reward": 0.9933035969734192, "step": 410 }, { "completion_length": 730.169677734375, "epoch": 0.8726114649681529, "grad_norm": 0.43445339798927307, "kl": 0.132080078125, "learning_rate": 8.636529792183171e-07, "loss": 0.0014, "reward": 0.5396310985088348, "reward_std": 0.19617567211389542, "rewards/code_reward": 0.44097036868333817, "rewards/format_reward": 0.9866071790456772, "step": 411 }, { "completion_length": 717.8705596923828, "epoch": 0.8747346072186837, "grad_norm": 0.5580800771713257, "kl": 0.192138671875, "learning_rate": 8.557482250955144e-07, "loss": 0.002, "reward": 0.4667212590575218, "reward_std": 0.20506682246923447, "rewards/code_reward": 0.36850695312023163, "rewards/format_reward": 0.9821428954601288, "step": 412 }, { "completion_length": 701.482177734375, "epoch": 0.8768577494692145, "grad_norm": 0.33230528235435486, "kl": 0.150146484375, "learning_rate": 8.479229499833844e-07, "loss": 0.0015, "reward": 0.5547576695680618, "reward_std": 0.21152211725711823, "rewards/code_reward": 0.4558737352490425, "rewards/format_reward": 0.9888393133878708, "step": 413 }, { "completion_length": 704.0469055175781, "epoch": 0.8789808917197452, "grad_norm": 0.3372839093208313, "kl": 0.1534423828125, "learning_rate": 8.401774822147976e-07, "loss": 0.0016, "reward": 0.5494333058595657, "reward_std": 0.24079465121030807, "rewards/code_reward": 0.4505493566393852, "rewards/format_reward": 0.988839328289032, "step": 414 }, { "completion_length": 723.966552734375, "epoch": 0.881104033970276, "grad_norm": 0.4163219630718231, "kl": 0.26123046875, "learning_rate": 8.325121467740695e-07, "loss": 0.0026, "reward": 0.3951665982604027, "reward_std": 0.18642807379364967, "rewards/code_reward": 0.29628264531493187, "rewards/format_reward": 0.988839328289032, "step": 415 }, { "completion_length": 736.9129943847656, "epoch": 0.8832271762208068, "grad_norm": 0.6581453084945679, "kl": 0.18310546875, "learning_rate": 8.249272652833226e-07, "loss": 0.0018, "reward": 0.4613909646868706, "reward_std": 0.14806298539042473, "rewards/code_reward": 0.3633998855948448, "rewards/format_reward": 0.9799107611179352, "step": 416 }, { "completion_length": 712.9754791259766, "epoch": 0.8853503184713376, "grad_norm": 1.2168219089508057, "kl": 0.2080078125, "learning_rate": 8.174231559889931e-07, "loss": 0.0021, "reward": 0.44138607382774353, "reward_std": 0.22260471060872078, "rewards/code_reward": 0.34317177161574364, "rewards/format_reward": 0.9821428805589676, "step": 417 }, { "completion_length": 711.1049346923828, "epoch": 0.8874734607218684, "grad_norm": 1.5622974634170532, "kl": 0.21630859375, "learning_rate": 8.100001337484787e-07, "loss": 0.0022, "reward": 0.5736604407429695, "reward_std": 0.20455688051879406, "rewards/code_reward": 0.4747764840722084, "rewards/format_reward": 0.9888393133878708, "step": 418 }, { "completion_length": 729.3861999511719, "epoch": 0.8895966029723992, "grad_norm": 0.5059235095977783, "kl": 0.16796875, "learning_rate": 8.026585100169251e-07, "loss": 0.0017, "reward": 0.4245912581682205, "reward_std": 0.151387682184577, "rewards/code_reward": 0.32637695223093033, "rewards/format_reward": 0.9821428954601288, "step": 419 }, { "completion_length": 690.7187652587891, "epoch": 0.89171974522293, "grad_norm": 6.739729881286621, "kl": 2.8837890625, "learning_rate": 7.953985928341601e-07, "loss": 0.0289, "reward": 0.5304828435182571, "reward_std": 0.157493332400918, "rewards/code_reward": 0.4313756823539734, "rewards/format_reward": 0.9910714477300644, "step": 420 }, { "completion_length": 694.5335083007812, "epoch": 0.8938428874734607, "grad_norm": 0.45631542801856995, "kl": 0.1708984375, "learning_rate": 7.882206868117693e-07, "loss": 0.0018, "reward": 0.4608374051749706, "reward_std": 0.1782052293419838, "rewards/code_reward": 0.36106058582663536, "rewards/format_reward": 0.9977678656578064, "step": 421 }, { "completion_length": 733.1339721679688, "epoch": 0.8959660297239915, "grad_norm": 1.1783860921859741, "kl": 0.181640625, "learning_rate": 7.81125093120313e-07, "loss": 0.0019, "reward": 0.4884042590856552, "reward_std": 0.164920412003994, "rewards/code_reward": 0.3899667263031006, "rewards/format_reward": 0.9843750298023224, "step": 422 }, { "completion_length": 694.8013610839844, "epoch": 0.8980891719745223, "grad_norm": 0.7121770977973938, "kl": 0.24853515625, "learning_rate": 7.741121094766916e-07, "loss": 0.0026, "reward": 0.5257243886590004, "reward_std": 0.15851808711886406, "rewards/code_reward": 0.426840465515852, "rewards/format_reward": 0.988839328289032, "step": 423 }, { "completion_length": 681.4174346923828, "epoch": 0.9002123142250531, "grad_norm": 0.739496648311615, "kl": 0.25439453125, "learning_rate": 7.671820301316532e-07, "loss": 0.0026, "reward": 0.4978240504860878, "reward_std": 0.17392848432064056, "rewards/code_reward": 0.39983299374580383, "rewards/format_reward": 0.979910746216774, "step": 424 }, { "completion_length": 727.6897583007812, "epoch": 0.9023354564755839, "grad_norm": 0.6177138090133667, "kl": 0.183349609375, "learning_rate": 7.603351458574474e-07, "loss": 0.0019, "reward": 0.44435514509677887, "reward_std": 0.13320972956717014, "rewards/code_reward": 0.34703367203474045, "rewards/format_reward": 0.9732143133878708, "step": 425 }, { "completion_length": 721.8772735595703, "epoch": 0.9044585987261147, "grad_norm": 1.0612621307373047, "kl": 0.2496337890625, "learning_rate": 7.535717439356255e-07, "loss": 0.0026, "reward": 0.4390544593334198, "reward_std": 0.15821044147014618, "rewards/code_reward": 0.3408401757478714, "rewards/format_reward": 0.9821428805589676, "step": 426 }, { "completion_length": 708.1295013427734, "epoch": 0.9065817409766455, "grad_norm": 0.3175060451030731, "kl": 0.15869140625, "learning_rate": 7.46892108144986e-07, "loss": 0.0017, "reward": 0.45070114731788635, "reward_std": 0.1746504958719015, "rewards/code_reward": 0.35181717574596405, "rewards/format_reward": 0.988839328289032, "step": 427 }, { "completion_length": 752.310302734375, "epoch": 0.9087048832271762, "grad_norm": 18.601308822631836, "kl": 3.44775390625, "learning_rate": 7.402965187496697e-07, "loss": 0.0348, "reward": 0.46990416944026947, "reward_std": 0.1597570963203907, "rewards/code_reward": 0.3723594844341278, "rewards/format_reward": 0.9754464626312256, "step": 428 }, { "completion_length": 730.3460235595703, "epoch": 0.910828025477707, "grad_norm": 8.600378036499023, "kl": 1.468994140625, "learning_rate": 7.337852524873974e-07, "loss": 0.0148, "reward": 0.6117217838764191, "reward_std": 0.21035557612776756, "rewards/code_reward": 0.5126146152615547, "rewards/format_reward": 0.9910714626312256, "step": 429 }, { "completion_length": 710.6138610839844, "epoch": 0.9129511677282378, "grad_norm": 0.4506695568561554, "kl": 0.20361328125, "learning_rate": 7.273585825578608e-07, "loss": 0.0022, "reward": 0.4428362399339676, "reward_std": 0.12803563103079796, "rewards/code_reward": 0.34372907504439354, "rewards/format_reward": 0.9910714626312256, "step": 430 }, { "completion_length": 658.6228179931641, "epoch": 0.9150743099787686, "grad_norm": 5.093682765960693, "kl": 0.5947265625, "learning_rate": 7.21016778611259e-07, "loss": 0.0061, "reward": 0.5427140817046165, "reward_std": 0.19685931131243706, "rewards/code_reward": 0.4447230063378811, "rewards/format_reward": 0.9799107760190964, "step": 431 }, { "completion_length": 677.9040222167969, "epoch": 0.9171974522292994, "grad_norm": 38.870262145996094, "kl": 5.4326171875, "learning_rate": 7.147601067369835e-07, "loss": 0.0545, "reward": 0.5093298330903053, "reward_std": 0.19096140936017036, "rewards/code_reward": 0.41111550480127335, "rewards/format_reward": 0.9821428954601288, "step": 432 }, { "completion_length": 694.513427734375, "epoch": 0.9193205944798302, "grad_norm": 0.5155165195465088, "kl": 0.155029296875, "learning_rate": 7.085888294524561e-07, "loss": 0.0016, "reward": 0.5259926542639732, "reward_std": 0.18491110764443874, "rewards/code_reward": 0.42733194679021835, "rewards/format_reward": 0.9866071790456772, "step": 433 }, { "completion_length": 704.5580596923828, "epoch": 0.921443736730361, "grad_norm": 0.6282893419265747, "kl": 0.3359375, "learning_rate": 7.025032056921117e-07, "loss": 0.0034, "reward": 0.5899785161018372, "reward_std": 0.19566836208105087, "rewards/code_reward": 0.4913177192211151, "rewards/format_reward": 0.9866071790456772, "step": 434 }, { "completion_length": 722.1562805175781, "epoch": 0.9235668789808917, "grad_norm": 1.048966884613037, "kl": 0.4742431640625, "learning_rate": 6.965034907965349e-07, "loss": 0.0049, "reward": 0.5559424459934235, "reward_std": 0.2080874666571617, "rewards/code_reward": 0.4588441997766495, "rewards/format_reward": 0.9709821939468384, "step": 435 }, { "completion_length": 679.9933319091797, "epoch": 0.9256900212314225, "grad_norm": 0.6251688599586487, "kl": 0.171142578125, "learning_rate": 6.905899365017462e-07, "loss": 0.0018, "reward": 0.5245073512196541, "reward_std": 0.17461021803319454, "rewards/code_reward": 0.42606981843709946, "rewards/format_reward": 0.9843750447034836, "step": 436 }, { "completion_length": 711.6942443847656, "epoch": 0.9278131634819533, "grad_norm": 1.1648685932159424, "kl": 0.299560546875, "learning_rate": 6.847627909286409e-07, "loss": 0.003, "reward": 0.41069934517145157, "reward_std": 0.17594012804329395, "rewards/code_reward": 0.31226181238889694, "rewards/format_reward": 0.9843750447034836, "step": 437 }, { "completion_length": 702.3482513427734, "epoch": 0.9299363057324841, "grad_norm": 1.5311229228973389, "kl": 0.31640625, "learning_rate": 6.790222985725761e-07, "loss": 0.0033, "reward": 0.5770048946142197, "reward_std": 0.1962369978427887, "rewards/code_reward": 0.4790138080716133, "rewards/format_reward": 0.9799107611179352, "step": 438 }, { "completion_length": 683.4151916503906, "epoch": 0.9320594479830149, "grad_norm": 8.243356704711914, "kl": 3.05126953125, "learning_rate": 6.733687002931141e-07, "loss": 0.0306, "reward": 0.5087217092514038, "reward_std": 0.1651569865643978, "rewards/code_reward": 0.4109538644552231, "rewards/format_reward": 0.9776786267757416, "step": 439 }, { "completion_length": 713.6049346923828, "epoch": 0.9341825902335457, "grad_norm": 1.4741530418395996, "kl": 0.967529296875, "learning_rate": 6.678022333039158e-07, "loss": 0.0098, "reward": 0.587900809943676, "reward_std": 0.16147084161639214, "rewards/code_reward": 0.4903561547398567, "rewards/format_reward": 0.9754464626312256, "step": 440 }, { "completion_length": 677.8303833007812, "epoch": 0.9363057324840764, "grad_norm": 0.3179962933063507, "kl": 0.230224609375, "learning_rate": 6.623231311627876e-07, "loss": 0.0025, "reward": 0.561469204723835, "reward_std": 0.16684554889798164, "rewards/code_reward": 0.4625852555036545, "rewards/format_reward": 0.9888393133878708, "step": 441 }, { "completion_length": 725.1004791259766, "epoch": 0.9384288747346072, "grad_norm": 2.448838233947754, "kl": 1.276123046875, "learning_rate": 6.569316237618811e-07, "loss": 0.0127, "reward": 0.3736302964389324, "reward_std": 0.18804692663252354, "rewards/code_reward": 0.2751928083598614, "rewards/format_reward": 0.9843750298023224, "step": 442 }, { "completion_length": 710.9286041259766, "epoch": 0.940552016985138, "grad_norm": 0.38171106576919556, "kl": 0.2259521484375, "learning_rate": 6.516279373180499e-07, "loss": 0.0024, "reward": 0.45342515781521797, "reward_std": 0.16657396219670773, "rewards/code_reward": 0.3540947772562504, "rewards/format_reward": 0.9933035969734192, "step": 443 }, { "completion_length": 665.5625305175781, "epoch": 0.9426751592356688, "grad_norm": 0.5256981253623962, "kl": 0.63818359375, "learning_rate": 6.464122943633543e-07, "loss": 0.0066, "reward": 0.5117220133543015, "reward_std": 0.17998000979423523, "rewards/code_reward": 0.4126148596405983, "rewards/format_reward": 0.9910714626312256, "step": 444 }, { "completion_length": 669.888427734375, "epoch": 0.9447983014861996, "grad_norm": 10.391777038574219, "kl": 1.935546875, "learning_rate": 6.412849137357271e-07, "loss": 0.0195, "reward": 0.577217735350132, "reward_std": 0.18068324774503708, "rewards/code_reward": 0.47878019511699677, "rewards/format_reward": 0.9843750596046448, "step": 445 }, { "completion_length": 706.747802734375, "epoch": 0.9469214437367304, "grad_norm": 0.7888285517692566, "kl": 0.395263671875, "learning_rate": 6.3624601056979e-07, "loss": 0.0041, "reward": 0.5674577727913857, "reward_std": 0.14589250087738037, "rewards/code_reward": 0.469020277261734, "rewards/format_reward": 0.9843750447034836, "step": 446 }, { "completion_length": 698.3861999511719, "epoch": 0.9490445859872612, "grad_norm": 0.5909515619277954, "kl": 0.4178466796875, "learning_rate": 6.312957962878278e-07, "loss": 0.0042, "reward": 0.44434136897325516, "reward_std": 0.1476050168275833, "rewards/code_reward": 0.3447878174483776, "rewards/format_reward": 0.9955357313156128, "step": 447 }, { "completion_length": 697.1138763427734, "epoch": 0.9511677282377919, "grad_norm": 0.3049964904785156, "kl": 0.36083984375, "learning_rate": 6.264344785909181e-07, "loss": 0.0036, "reward": 0.5054452195763588, "reward_std": 0.16559578850865364, "rewards/code_reward": 0.40633804351091385, "rewards/format_reward": 0.9910714477300644, "step": 448 }, { "completion_length": 699.5000457763672, "epoch": 0.9532908704883227, "grad_norm": 2.360261917114258, "kl": 1.0093994140625, "learning_rate": 6.216622614502149e-07, "loss": 0.0102, "reward": 0.43248920887708664, "reward_std": 0.20951998233795166, "rewards/code_reward": 0.3344981260597706, "rewards/format_reward": 0.9799107611179352, "step": 449 }, { "completion_length": 711.5647735595703, "epoch": 0.9554140127388535, "grad_norm": 0.45019006729125977, "kl": 0.396728515625, "learning_rate": 6.169793450983916e-07, "loss": 0.0041, "reward": 0.4090769328176975, "reward_std": 0.1387995146214962, "rewards/code_reward": 0.30996978655457497, "rewards/format_reward": 0.9910714626312256, "step": 450 }, { "completion_length": 687.4710083007812, "epoch": 0.9575371549893843, "grad_norm": 1.139167070388794, "kl": 0.70947265625, "learning_rate": 6.123859260212393e-07, "loss": 0.0073, "reward": 0.6231836080551147, "reward_std": 0.18272383697330952, "rewards/code_reward": 0.5249693095684052, "rewards/format_reward": 0.9821428954601288, "step": 451 }, { "completion_length": 664.4620666503906, "epoch": 0.9596602972399151, "grad_norm": 11.963510513305664, "kl": 2.9296875, "learning_rate": 6.07882196949423e-07, "loss": 0.0292, "reward": 0.5648458003997803, "reward_std": 0.21996057033538818, "rewards/code_reward": 0.4666314870119095, "rewards/format_reward": 0.9821428954601288, "step": 452 }, { "completion_length": 672.8326110839844, "epoch": 0.9617834394904459, "grad_norm": 0.23568426072597504, "kl": 0.138427734375, "learning_rate": 6.034683468503948e-07, "loss": 0.0015, "reward": 0.5011638775467873, "reward_std": 0.1840323582291603, "rewards/code_reward": 0.4020567089319229, "rewards/format_reward": 0.9910714626312256, "step": 453 }, { "completion_length": 692.2835083007812, "epoch": 0.9639065817409767, "grad_norm": 1.3131980895996094, "kl": 0.73291015625, "learning_rate": 5.991445609204641e-07, "loss": 0.0073, "reward": 0.49861256778240204, "reward_std": 0.19007166847586632, "rewards/code_reward": 0.4006215110421181, "rewards/format_reward": 0.979910746216774, "step": 454 }, { "completion_length": 680.5536041259766, "epoch": 0.9660297239915074, "grad_norm": 1.0419756174087524, "kl": 0.8876953125, "learning_rate": 5.949110205770292e-07, "loss": 0.009, "reward": 0.5448554530739784, "reward_std": 0.19055398926138878, "rewards/code_reward": 0.44686436653137207, "rewards/format_reward": 0.9799107611179352, "step": 455 }, { "completion_length": 693.2187805175781, "epoch": 0.9681528662420382, "grad_norm": 0.7175102233886719, "kl": 0.481201171875, "learning_rate": 5.90767903450964e-07, "loss": 0.0049, "reward": 0.4721348285675049, "reward_std": 0.14595188200473785, "rewards/code_reward": 0.37302765995264053, "rewards/format_reward": 0.9910714626312256, "step": 456 }, { "completion_length": 695.2723541259766, "epoch": 0.970276008492569, "grad_norm": 0.4296894371509552, "kl": 0.26416015625, "learning_rate": 5.867153833791652e-07, "loss": 0.0027, "reward": 0.6006196290254593, "reward_std": 0.17042616941034794, "rewards/code_reward": 0.5019589066505432, "rewards/format_reward": 0.9866071790456772, "step": 457 }, { "completion_length": 692.6652221679688, "epoch": 0.9723991507430998, "grad_norm": 0.4421376585960388, "kl": 0.31982421875, "learning_rate": 5.827536303972587e-07, "loss": 0.0033, "reward": 0.5808815285563469, "reward_std": 0.2226531021296978, "rewards/code_reward": 0.4815511405467987, "rewards/format_reward": 0.9933035969734192, "step": 458 }, { "completion_length": 679.247802734375, "epoch": 0.9745222929936306, "grad_norm": 0.40139421820640564, "kl": 0.47216796875, "learning_rate": 5.78882810732465e-07, "loss": 0.0048, "reward": 0.5371674299240112, "reward_std": 0.22804895788431168, "rewards/code_reward": 0.43962281197309494, "rewards/format_reward": 0.9754464626312256, "step": 459 }, { "completion_length": 706.5379638671875, "epoch": 0.9766454352441614, "grad_norm": 0.8857892751693726, "kl": 0.51220703125, "learning_rate": 5.75103086796625e-07, "loss": 0.0052, "reward": 0.4905061312019825, "reward_std": 0.1841362752020359, "rewards/code_reward": 0.39206862077116966, "rewards/format_reward": 0.9843750447034836, "step": 460 }, { "completion_length": 690.3415374755859, "epoch": 0.9787685774946921, "grad_norm": 0.6516154408454895, "kl": 0.439697265625, "learning_rate": 5.714146171793846e-07, "loss": 0.0045, "reward": 0.5876915380358696, "reward_std": 0.15721704810857773, "rewards/code_reward": 0.4892539754509926, "rewards/format_reward": 0.9843750298023224, "step": 461 }, { "completion_length": 681.5156402587891, "epoch": 0.9808917197452229, "grad_norm": 0.618622362613678, "kl": 0.48046875, "learning_rate": 5.678175566415422e-07, "loss": 0.0048, "reward": 0.49158109724521637, "reward_std": 0.1944441720843315, "rewards/code_reward": 0.39381323754787445, "rewards/format_reward": 0.9776786118745804, "step": 462 }, { "completion_length": 722.8750305175781, "epoch": 0.9830148619957537, "grad_norm": 0.7218803763389587, "kl": 0.565185546875, "learning_rate": 5.643120561085528e-07, "loss": 0.0057, "reward": 0.4738345965743065, "reward_std": 0.24430794268846512, "rewards/code_reward": 0.37651316076517105, "rewards/format_reward": 0.9732143133878708, "step": 463 }, { "completion_length": 682.4219055175781, "epoch": 0.9851380042462845, "grad_norm": 0.7259976863861084, "kl": 0.706298828125, "learning_rate": 5.608982626641991e-07, "loss": 0.0071, "reward": 0.47413645684719086, "reward_std": 0.21057153865695, "rewards/code_reward": 0.3761453852057457, "rewards/format_reward": 0.9799107611179352, "step": 464 }, { "completion_length": 716.5736999511719, "epoch": 0.9872611464968153, "grad_norm": 0.2483934909105301, "kl": 0.260009765625, "learning_rate": 5.575763195444166e-07, "loss": 0.0027, "reward": 0.5671171024441719, "reward_std": 0.19927529990673065, "rewards/code_reward": 0.46912601590156555, "rewards/format_reward": 0.9799107611179352, "step": 465 }, { "completion_length": 680.6339416503906, "epoch": 0.9893842887473461, "grad_norm": 1.9431463479995728, "kl": 1.3564453125, "learning_rate": 5.543463661312847e-07, "loss": 0.0136, "reward": 0.417750583961606, "reward_std": 0.13368695229291916, "rewards/code_reward": 0.3197594955563545, "rewards/format_reward": 0.9799107611179352, "step": 466 }, { "completion_length": 684.9911041259766, "epoch": 0.9915074309978769, "grad_norm": 0.8100730776786804, "kl": 0.4619140625, "learning_rate": 5.512085379471808e-07, "loss": 0.0048, "reward": 0.6499997675418854, "reward_std": 0.20544839650392532, "rewards/code_reward": 0.5511157959699631, "rewards/format_reward": 0.988839328289032, "step": 467 }, { "completion_length": 681.5670013427734, "epoch": 0.9936305732484076, "grad_norm": 3.3746109008789062, "kl": 1.414306640625, "learning_rate": 5.481629666490903e-07, "loss": 0.0142, "reward": 0.5468520447611809, "reward_std": 0.21051420643925667, "rewards/code_reward": 0.44774486869573593, "rewards/format_reward": 0.9910714477300644, "step": 468 }, { "completion_length": 688.2388610839844, "epoch": 0.9957537154989384, "grad_norm": 1.0109045505523682, "kl": 1.15380859375, "learning_rate": 5.452097800230853e-07, "loss": 0.0116, "reward": 0.6098516285419464, "reward_std": 0.211056686937809, "rewards/code_reward": 0.5116373002529144, "rewards/format_reward": 0.9821428954601288, "step": 469 }, { "completion_length": 681.9821624755859, "epoch": 0.9978768577494692, "grad_norm": 0.7048155665397644, "kl": 0.809814453125, "learning_rate": 5.423491019789623e-07, "loss": 0.0082, "reward": 0.45874594151973724, "reward_std": 0.14819572865962982, "rewards/code_reward": 0.3596387729048729, "rewards/format_reward": 0.9910714626312256, "step": 470 }, { "completion_length": 707.5000457763672, "epoch": 1.0, "grad_norm": 2.86737060546875, "kl": 1.1783447265625, "learning_rate": 5.395810525450425e-07, "loss": 0.0118, "reward": 0.5169450491666794, "reward_std": 0.18883745186030865, "rewards/code_reward": 0.41850756853818893, "rewards/format_reward": 0.9843750298023224, "step": 471 }, { "completion_length": 677.5982360839844, "epoch": 1.0021231422505308, "grad_norm": 1.9091771841049194, "kl": 1.307861328125, "learning_rate": 5.369057478631359e-07, "loss": 0.0132, "reward": 0.5092417150735855, "reward_std": 0.18099428340792656, "rewards/code_reward": 0.4110274314880371, "rewards/format_reward": 0.9821428805589676, "step": 472 }, { "completion_length": 711.9843902587891, "epoch": 1.0042462845010616, "grad_norm": 1.6537326574325562, "kl": 1.21533203125, "learning_rate": 5.343233001836694e-07, "loss": 0.0122, "reward": 0.48672058433294296, "reward_std": 0.19152027182281017, "rewards/code_reward": 0.38939911872148514, "rewards/format_reward": 0.9732143133878708, "step": 473 }, { "completion_length": 707.3102874755859, "epoch": 1.0063694267515924, "grad_norm": 0.8889822959899902, "kl": 0.529541015625, "learning_rate": 5.318338178609754e-07, "loss": 0.0054, "reward": 0.5736411809921265, "reward_std": 0.17692103423178196, "rewards/code_reward": 0.4749804362654686, "rewards/format_reward": 0.9866071939468384, "step": 474 }, { "completion_length": 736.5714721679688, "epoch": 1.0084925690021231, "grad_norm": 1.3358269929885864, "kl": 0.98828125, "learning_rate": 5.294374053487459e-07, "loss": 0.0099, "reward": 0.44529393315315247, "reward_std": 0.17480986192822456, "rewards/code_reward": 0.3468564301729202, "rewards/format_reward": 0.9843750596046448, "step": 475 }, { "completion_length": 711.4241485595703, "epoch": 1.010615711252654, "grad_norm": 1.4289793968200684, "kl": 1.22998046875, "learning_rate": 5.271341631956511e-07, "loss": 0.0123, "reward": 0.5166614726185799, "reward_std": 0.1912681832909584, "rewards/code_reward": 0.42000964283943176, "rewards/format_reward": 0.96651791036129, "step": 476 }, { "completion_length": 695.966552734375, "epoch": 1.0127388535031847, "grad_norm": 1.04447340965271, "kl": 0.756103515625, "learning_rate": 5.249241880411181e-07, "loss": 0.0076, "reward": 0.5925345048308372, "reward_std": 0.20060284808278084, "rewards/code_reward": 0.4952130541205406, "rewards/format_reward": 0.9732143133878708, "step": 477 }, { "completion_length": 694.6986846923828, "epoch": 1.0148619957537155, "grad_norm": 0.8483067750930786, "kl": 0.3671875, "learning_rate": 5.228075726112785e-07, "loss": 0.0039, "reward": 0.5394521579146385, "reward_std": 0.12158003821969032, "rewards/code_reward": 0.44079139083623886, "rewards/format_reward": 0.986607164144516, "step": 478 }, { "completion_length": 708.294677734375, "epoch": 1.0169851380042463, "grad_norm": 2.820655584335327, "kl": 2.081787109375, "learning_rate": 5.207844057150768e-07, "loss": 0.0209, "reward": 0.530554287135601, "reward_std": 0.18540234863758087, "rewards/code_reward": 0.4339024946093559, "rewards/format_reward": 0.9665178954601288, "step": 479 }, { "completion_length": 717.2143249511719, "epoch": 1.019108280254777, "grad_norm": 0.23074620962142944, "kl": 0.481689453125, "learning_rate": 5.188547722405437e-07, "loss": 0.005, "reward": 0.6097277328372002, "reward_std": 0.2097402885556221, "rewards/code_reward": 0.5108437687158585, "rewards/format_reward": 0.988839328289032, "step": 480 }, { "completion_length": 673.4553833007812, "epoch": 1.0212314225053079, "grad_norm": 18.151901245117188, "kl": 6.5419921875, "learning_rate": 5.170187531512351e-07, "loss": 0.0654, "reward": 0.4982636645436287, "reward_std": 0.18235952779650688, "rewards/code_reward": 0.4000493362545967, "rewards/format_reward": 0.9821428954601288, "step": 481 }, { "completion_length": 657.0223541259766, "epoch": 1.0233545647558386, "grad_norm": 1.2006980180740356, "kl": 0.98583984375, "learning_rate": 5.152764254828348e-07, "loss": 0.0101, "reward": 0.6024035438895226, "reward_std": 0.18741042539477348, "rewards/code_reward": 0.5044124275445938, "rewards/format_reward": 0.979910746216774, "step": 482 }, { "completion_length": 679.5446624755859, "epoch": 1.0254777070063694, "grad_norm": 6.640290260314941, "kl": 2.26318359375, "learning_rate": 5.136278623399225e-07, "loss": 0.0229, "reward": 0.6333309859037399, "reward_std": 0.16317120380699635, "rewards/code_reward": 0.5337774083018303, "rewards/format_reward": 0.9955357313156128, "step": 483 }, { "completion_length": 690.9464721679688, "epoch": 1.0276008492569002, "grad_norm": 1.2075289487838745, "kl": 0.635498046875, "learning_rate": 5.120731328929058e-07, "loss": 0.0065, "reward": 0.6160075142979622, "reward_std": 0.18631838634610176, "rewards/code_reward": 0.516677126288414, "rewards/format_reward": 0.9933035969734192, "step": 484 }, { "completion_length": 711.2254791259766, "epoch": 1.029723991507431, "grad_norm": 0.6530643105506897, "kl": 0.88671875, "learning_rate": 5.106123023751187e-07, "loss": 0.009, "reward": 0.5319265574216843, "reward_std": 0.16448520869016647, "rewards/code_reward": 0.43304260820150375, "rewards/format_reward": 0.9888393133878708, "step": 485 }, { "completion_length": 697.0067291259766, "epoch": 1.0318471337579618, "grad_norm": 0.7889028787612915, "kl": 0.486083984375, "learning_rate": 5.092454320800833e-07, "loss": 0.0049, "reward": 0.5322659835219383, "reward_std": 0.2203526459634304, "rewards/code_reward": 0.4340517073869705, "rewards/format_reward": 0.9821428954601288, "step": 486 }, { "completion_length": 707.5915374755859, "epoch": 1.0339702760084926, "grad_norm": 1.0391658544540405, "kl": 1.347412109375, "learning_rate": 5.079725793589405e-07, "loss": 0.0136, "reward": 0.5818885043263435, "reward_std": 0.18653497844934464, "rewards/code_reward": 0.4838974103331566, "rewards/format_reward": 0.979910746216774, "step": 487 }, { "completion_length": 681.9509124755859, "epoch": 1.0360934182590233, "grad_norm": 1.4511501789093018, "kl": 0.91943359375, "learning_rate": 5.067937976180407e-07, "loss": 0.0092, "reward": 0.20120449364185333, "reward_std": 0.06054047856014222, "rewards/code_reward": 0.10365983843803406, "rewards/format_reward": 0.9754464775323868, "step": 488 }, { "completion_length": 696.1696624755859, "epoch": 1.0382165605095541, "grad_norm": 1.0735193490982056, "kl": 0.93994140625, "learning_rate": 5.057091363167046e-07, "loss": 0.0095, "reward": 0.41191001795232296, "reward_std": 0.11514822754543275, "rewards/code_reward": 0.31324928998947144, "rewards/format_reward": 0.9866071939468384, "step": 489 }, { "completion_length": 721.6607513427734, "epoch": 1.040339702760085, "grad_norm": 1.8616191148757935, "kl": 1.632080078125, "learning_rate": 5.047186409651489e-07, "loss": 0.0165, "reward": 0.5570781454443932, "reward_std": 0.17984510958194733, "rewards/code_reward": 0.45886383950710297, "rewards/format_reward": 0.9821428954601288, "step": 490 }, { "completion_length": 674.4308319091797, "epoch": 1.0424628450106157, "grad_norm": 2.0347139835357666, "kl": 1.854736328125, "learning_rate": 5.038223531225742e-07, "loss": 0.0186, "reward": 0.4472319483757019, "reward_std": 0.20929547771811485, "rewards/code_reward": 0.3496873155236244, "rewards/format_reward": 0.9754464626312256, "step": 491 }, { "completion_length": 684.2701263427734, "epoch": 1.0445859872611465, "grad_norm": 0.47032228112220764, "kl": 0.452392578125, "learning_rate": 5.030203103954232e-07, "loss": 0.0046, "reward": 0.6024687513709068, "reward_std": 0.20001190528273582, "rewards/code_reward": 0.5038080215454102, "rewards/format_reward": 0.986607164144516, "step": 492 }, { "completion_length": 749.888427734375, "epoch": 1.0467091295116773, "grad_norm": 1.3009785413742065, "kl": 0.7442626953125, "learning_rate": 5.023125464358026e-07, "loss": 0.0075, "reward": 0.4289785400032997, "reward_std": 0.19978297501802444, "rewards/code_reward": 0.3307642340660095, "rewards/format_reward": 0.9821428805589676, "step": 493 }, { "completion_length": 707.247802734375, "epoch": 1.048832271762208, "grad_norm": 2.7150216102600098, "kl": 1.939453125, "learning_rate": 5.016990909400709e-07, "loss": 0.0195, "reward": 0.48099584877491, "reward_std": 0.17564579099416733, "rewards/code_reward": 0.3834511674940586, "rewards/format_reward": 0.9754464775323868, "step": 494 }, { "completion_length": 712.8861999511719, "epoch": 1.0509554140127388, "grad_norm": 1.259710431098938, "kl": 1.7119140625, "learning_rate": 5.011799696475915e-07, "loss": 0.0172, "reward": 0.5863819345831871, "reward_std": 0.17422104254364967, "rewards/code_reward": 0.48883724212646484, "rewards/format_reward": 0.9754464775323868, "step": 495 }, { "completion_length": 670.3125305175781, "epoch": 1.0530785562632696, "grad_norm": 1.799985408782959, "kl": 1.3544921875, "learning_rate": 5.007552043396547e-07, "loss": 0.0137, "reward": 0.6773558109998703, "reward_std": 0.21710924059152603, "rewards/code_reward": 0.578471876680851, "rewards/format_reward": 0.9888393133878708, "step": 496 }, { "completion_length": 647.8727874755859, "epoch": 1.0552016985138004, "grad_norm": 1.1735022068023682, "kl": 0.455810546875, "learning_rate": 5.004248128385618e-07, "loss": 0.0047, "reward": 0.6235345751047134, "reward_std": 0.20276143215596676, "rewards/code_reward": 0.5259898751974106, "rewards/format_reward": 0.9754464775323868, "step": 497 }, { "completion_length": 720.7388610839844, "epoch": 1.0573248407643312, "grad_norm": 1.039088487625122, "kl": 1.011474609375, "learning_rate": 5.001888090068784e-07, "loss": 0.0102, "reward": 0.5388440862298012, "reward_std": 0.1800019945949316, "rewards/code_reward": 0.4397369250655174, "rewards/format_reward": 0.9910714626312256, "step": 498 }, { "completion_length": 737.9375305175781, "epoch": 1.059447983014862, "grad_norm": 2.2365331649780273, "kl": 0.947998046875, "learning_rate": 5.000472027468528e-07, "loss": 0.0095, "reward": 0.5870940536260605, "reward_std": 0.17025620490312576, "rewards/code_reward": 0.4893261566758156, "rewards/format_reward": 0.9776785969734192, "step": 499 }, { "completion_length": 686.7678833007812, "epoch": 1.0615711252653928, "grad_norm": 11.270224571228027, "kl": 3.646484375, "learning_rate": 5.000000000000001e-07, "loss": 0.0367, "reward": 0.2899981178343296, "reward_std": 0.10342313535511494, "rewards/code_reward": 0.1917838342487812, "rewards/format_reward": 0.9821428954601288, "step": 500 }, { "epoch": 1.0615711252653928, "step": 500, "total_flos": 0.0, "train_loss": 0.008756054809940243, "train_runtime": 191583.7312, "train_samples_per_second": 1.169, "train_steps_per_second": 0.003 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }