|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0615711252653928, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 746.1138916015625, |
|
"epoch": 0.0021231422505307855, |
|
"grad_norm": 0.21636255085468292, |
|
"kl": 0.0, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 0.0, |
|
"reward": 0.11085444036871195, |
|
"reward_std": 0.15387122705578804, |
|
"rewards/code_reward": 0.11063122469931841, |
|
"rewards/format_reward": 0.0022321429569274187, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 741.6986846923828, |
|
"epoch": 0.004246284501061571, |
|
"grad_norm": 0.21392129361629486, |
|
"kl": 0.0, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.10614843107759953, |
|
"reward_std": 0.15658440068364143, |
|
"rewards/code_reward": 0.10592522472143173, |
|
"rewards/format_reward": 0.0022321429569274187, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 756.122802734375, |
|
"epoch": 0.006369426751592357, |
|
"grad_norm": 0.21344353258609772, |
|
"kl": 6.717443466186523e-05, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0, |
|
"reward": 0.12699278071522713, |
|
"reward_std": 0.15882046334445477, |
|
"rewards/code_reward": 0.12676957063376904, |
|
"rewards/format_reward": 0.0022321429569274187, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 760.8817291259766, |
|
"epoch": 0.008492569002123142, |
|
"grad_norm": 0.2038053572177887, |
|
"kl": 7.62939453125e-05, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.0, |
|
"reward": 0.06782207638025284, |
|
"reward_std": 0.1164214089512825, |
|
"rewards/code_reward": 0.06782207870855927, |
|
"rewards/format_reward": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 764.8817291259766, |
|
"epoch": 0.010615711252653927, |
|
"grad_norm": 0.20332112908363342, |
|
"kl": 7.510185241699219e-05, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0, |
|
"reward": 0.07368321809917688, |
|
"reward_std": 0.11412223428487778, |
|
"rewards/code_reward": 0.07368322089314461, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 733.0937805175781, |
|
"epoch": 0.012738853503184714, |
|
"grad_norm": 0.2384093701839447, |
|
"kl": 8.344650268554688e-05, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0, |
|
"reward": 0.11057165823876858, |
|
"reward_std": 0.13630107790231705, |
|
"rewards/code_reward": 0.11057165637612343, |
|
"rewards/format_reward": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 747.8013763427734, |
|
"epoch": 0.014861995753715499, |
|
"grad_norm": 0.21190612018108368, |
|
"kl": 9.250640869140625e-05, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 0.0, |
|
"reward": 0.13999284896999598, |
|
"reward_std": 0.14958541933447123, |
|
"rewards/code_reward": 0.13999284896999598, |
|
"rewards/format_reward": 0.0, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 747.6540679931641, |
|
"epoch": 0.016985138004246284, |
|
"grad_norm": 0.1906791776418686, |
|
"kl": 0.00013947486877441406, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.0, |
|
"reward": 0.0754421940073371, |
|
"reward_std": 0.10426154918968678, |
|
"rewards/code_reward": 0.07521897740662098, |
|
"rewards/format_reward": 0.0022321429569274187, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 710.5401916503906, |
|
"epoch": 0.01910828025477707, |
|
"grad_norm": 0.20240359008312225, |
|
"kl": 0.0002300739288330078, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0, |
|
"reward": 0.12234624661505222, |
|
"reward_std": 0.10050993971526623, |
|
"rewards/code_reward": 0.12234624475240707, |
|
"rewards/format_reward": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 760.1808319091797, |
|
"epoch": 0.021231422505307854, |
|
"grad_norm": 0.34670934081077576, |
|
"kl": 0.0004100799560546875, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0, |
|
"reward": 0.055801121750846505, |
|
"reward_std": 0.06395915220491588, |
|
"rewards/code_reward": 0.05580112128518522, |
|
"rewards/format_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 750.3906555175781, |
|
"epoch": 0.02335456475583864, |
|
"grad_norm": 0.21112516522407532, |
|
"kl": 0.0007238388061523438, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 0.0, |
|
"reward": 0.06025231350213289, |
|
"reward_std": 0.09869139082729816, |
|
"rewards/code_reward": 0.06025231350213289, |
|
"rewards/format_reward": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 700.1384124755859, |
|
"epoch": 0.025477707006369428, |
|
"grad_norm": 0.22157742083072662, |
|
"kl": 0.00104522705078125, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0, |
|
"reward": 0.1535286195576191, |
|
"reward_std": 0.16596542671322823, |
|
"rewards/code_reward": 0.1530821956694126, |
|
"rewards/format_reward": 0.004464285913854837, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 702.9152069091797, |
|
"epoch": 0.027600849256900213, |
|
"grad_norm": 0.24524690210819244, |
|
"kl": 0.0017261505126953125, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 0.0, |
|
"reward": 0.19020407181233168, |
|
"reward_std": 0.16583579406142235, |
|
"rewards/code_reward": 0.1902040634304285, |
|
"rewards/format_reward": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 714.9486999511719, |
|
"epoch": 0.029723991507430998, |
|
"grad_norm": 0.18179796636104584, |
|
"kl": 0.00283050537109375, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.0, |
|
"reward": 0.06596253952011466, |
|
"reward_std": 0.08220406854525208, |
|
"rewards/code_reward": 0.06596253253519535, |
|
"rewards/format_reward": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 683.7835083007812, |
|
"epoch": 0.03184713375796178, |
|
"grad_norm": 0.18836897611618042, |
|
"kl": 0.004302978515625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0, |
|
"reward": 0.10805188585072756, |
|
"reward_std": 0.0908731259405613, |
|
"rewards/code_reward": 0.10782866925001144, |
|
"rewards/format_reward": 0.0022321429569274187, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 641.216552734375, |
|
"epoch": 0.03397027600849257, |
|
"grad_norm": 0.22146466374397278, |
|
"kl": 0.00562286376953125, |
|
"learning_rate": 4.999952797253148e-06, |
|
"loss": 0.0001, |
|
"reward": 0.20005132257938385, |
|
"reward_std": 0.16782562248408794, |
|
"rewards/code_reward": 0.19938167929649353, |
|
"rewards/format_reward": 0.006696428870782256, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 644.9598541259766, |
|
"epoch": 0.036093418259023353, |
|
"grad_norm": 0.20937775075435638, |
|
"kl": 0.00730133056640625, |
|
"learning_rate": 4.9998111909931225e-06, |
|
"loss": 0.0001, |
|
"reward": 0.1508529670536518, |
|
"reward_std": 0.1663584616035223, |
|
"rewards/code_reward": 0.15040653757750988, |
|
"rewards/format_reward": 0.004464285913854837, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 608.2477874755859, |
|
"epoch": 0.03821656050955414, |
|
"grad_norm": 0.2286761999130249, |
|
"kl": 0.0104217529296875, |
|
"learning_rate": 4.999575187161439e-06, |
|
"loss": 0.0001, |
|
"reward": 0.14321784488856792, |
|
"reward_std": 0.1725912243127823, |
|
"rewards/code_reward": 0.14321784675121307, |
|
"rewards/format_reward": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 650.0647583007812, |
|
"epoch": 0.040339702760084924, |
|
"grad_norm": 0.21597912907600403, |
|
"kl": 0.0113677978515625, |
|
"learning_rate": 4.9992447956603455e-06, |
|
"loss": 0.0001, |
|
"reward": 0.12854056991636753, |
|
"reward_std": 0.15781505592167377, |
|
"rewards/code_reward": 0.12854057550430298, |
|
"rewards/format_reward": 0.0, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 624.8192138671875, |
|
"epoch": 0.04246284501061571, |
|
"grad_norm": 42.34319305419922, |
|
"kl": 7.167266845703125, |
|
"learning_rate": 4.998820030352409e-06, |
|
"loss": 0.0716, |
|
"reward": 0.12942847050726414, |
|
"reward_std": 0.11835422366857529, |
|
"rewards/code_reward": 0.1292052511125803, |
|
"rewards/format_reward": 0.0022321429569274187, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 632.3504638671875, |
|
"epoch": 0.044585987261146494, |
|
"grad_norm": 0.23411324620246887, |
|
"kl": 0.0178985595703125, |
|
"learning_rate": 4.998300909059929e-06, |
|
"loss": 0.0002, |
|
"reward": 0.12214689701795578, |
|
"reward_std": 0.1782115437090397, |
|
"rewards/code_reward": 0.12214690260589123, |
|
"rewards/format_reward": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 610.6652069091797, |
|
"epoch": 0.04670912951167728, |
|
"grad_norm": 0.24601581692695618, |
|
"kl": 0.020477294921875, |
|
"learning_rate": 4.997687453564198e-06, |
|
"loss": 0.0002, |
|
"reward": 0.18596480786800385, |
|
"reward_std": 0.184912770986557, |
|
"rewards/code_reward": 0.18529516831040382, |
|
"rewards/format_reward": 0.006696428870782256, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 612.2344055175781, |
|
"epoch": 0.04883227176220807, |
|
"grad_norm": 0.25667688250541687, |
|
"kl": 0.02349853515625, |
|
"learning_rate": 4.9969796896045775e-06, |
|
"loss": 0.0002, |
|
"reward": 0.1700380276888609, |
|
"reward_std": 0.15014583989977837, |
|
"rewards/code_reward": 0.1689219493418932, |
|
"rewards/format_reward": 0.011160714784637094, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 582.0379791259766, |
|
"epoch": 0.050955414012738856, |
|
"grad_norm": 1.4910736083984375, |
|
"kl": 0.054718017578125, |
|
"learning_rate": 4.996177646877426e-06, |
|
"loss": 0.0005, |
|
"reward": 0.15169917233288288, |
|
"reward_std": 0.16788329929113388, |
|
"rewards/code_reward": 0.151029534637928, |
|
"rewards/format_reward": 0.006696428870782256, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 605.2277069091797, |
|
"epoch": 0.05307855626326964, |
|
"grad_norm": 0.23100271821022034, |
|
"kl": 0.030059814453125, |
|
"learning_rate": 4.995281359034851e-06, |
|
"loss": 0.0003, |
|
"reward": 0.10647542215883732, |
|
"reward_std": 0.13741069473326206, |
|
"rewards/code_reward": 0.105805778875947, |
|
"rewards/format_reward": 0.006696428870782256, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 595.9777069091797, |
|
"epoch": 0.055201698513800426, |
|
"grad_norm": 0.22825182974338531, |
|
"kl": 0.03179931640625, |
|
"learning_rate": 4.994290863683296e-06, |
|
"loss": 0.0003, |
|
"reward": 0.11801626486703753, |
|
"reward_std": 0.12430650275200605, |
|
"rewards/code_reward": 0.11779305664822459, |
|
"rewards/format_reward": 0.0022321429569274187, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 594.2031402587891, |
|
"epoch": 0.05732484076433121, |
|
"grad_norm": 0.2523776590824127, |
|
"kl": 0.0357666015625, |
|
"learning_rate": 4.99320620238196e-06, |
|
"loss": 0.0004, |
|
"reward": 0.1666601337492466, |
|
"reward_std": 0.20200489647686481, |
|
"rewards/code_reward": 0.1655440628528595, |
|
"rewards/format_reward": 0.011160714784637094, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 606.7299346923828, |
|
"epoch": 0.059447983014861996, |
|
"grad_norm": 0.24759377539157867, |
|
"kl": 0.03466796875, |
|
"learning_rate": 4.99202742064106e-06, |
|
"loss": 0.0003, |
|
"reward": 0.12888818327337503, |
|
"reward_std": 0.14617390558123589, |
|
"rewards/code_reward": 0.12732568103820086, |
|
"rewards/format_reward": 0.01562500116415322, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 582.7879791259766, |
|
"epoch": 0.06157112526539278, |
|
"grad_norm": 0.22141791880130768, |
|
"kl": 0.0360107421875, |
|
"learning_rate": 4.990754567919917e-06, |
|
"loss": 0.0004, |
|
"reward": 0.1982099711894989, |
|
"reward_std": 0.15798946656286716, |
|
"rewards/code_reward": 0.1970939077436924, |
|
"rewards/format_reward": 0.011160714784637094, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 582.3035888671875, |
|
"epoch": 0.06369426751592357, |
|
"grad_norm": 0.4308633804321289, |
|
"kl": 0.04461669921875, |
|
"learning_rate": 4.989387697624881e-06, |
|
"loss": 0.0004, |
|
"reward": 0.15222312323749065, |
|
"reward_std": 0.13287453912198544, |
|
"rewards/code_reward": 0.14999098517000675, |
|
"rewards/format_reward": 0.022321429336443543, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 560.9888763427734, |
|
"epoch": 0.06581740976645435, |
|
"grad_norm": 0.44854736328125, |
|
"kl": 0.05029296875, |
|
"learning_rate": 4.987926867107095e-06, |
|
"loss": 0.0005, |
|
"reward": 0.17351704463362694, |
|
"reward_std": 0.1594883631914854, |
|
"rewards/code_reward": 0.17039205506443977, |
|
"rewards/format_reward": 0.031250000931322575, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 520.7835006713867, |
|
"epoch": 0.06794055201698514, |
|
"grad_norm": 0.3021621108055115, |
|
"kl": 0.0545654296875, |
|
"learning_rate": 4.986372137660078e-06, |
|
"loss": 0.0005, |
|
"reward": 0.19399502873420715, |
|
"reward_std": 0.18005169555544853, |
|
"rewards/code_reward": 0.1872985940426588, |
|
"rewards/format_reward": 0.0669642873108387, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 551.3750228881836, |
|
"epoch": 0.07006369426751592, |
|
"grad_norm": 0.38339507579803467, |
|
"kl": 0.0712890625, |
|
"learning_rate": 4.984723574517165e-06, |
|
"loss": 0.0007, |
|
"reward": 0.15828289464116096, |
|
"reward_std": 0.18912290409207344, |
|
"rewards/code_reward": 0.1453364696353674, |
|
"rewards/format_reward": 0.12946429289877415, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 490.0558166503906, |
|
"epoch": 0.07218683651804671, |
|
"grad_norm": 0.5539775490760803, |
|
"kl": 0.0887451171875, |
|
"learning_rate": 4.9829812468487655e-06, |
|
"loss": 0.0009, |
|
"reward": 0.18788279965519905, |
|
"reward_std": 0.19856177270412445, |
|
"rewards/code_reward": 0.16578458063304424, |
|
"rewards/format_reward": 0.2209821566939354, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 441.99778747558594, |
|
"epoch": 0.07430997876857749, |
|
"grad_norm": 0.35391107201576233, |
|
"kl": 0.12060546875, |
|
"learning_rate": 4.981145227759457e-06, |
|
"loss": 0.0012, |
|
"reward": 0.20366163551807404, |
|
"reward_std": 0.1490145679563284, |
|
"rewards/code_reward": 0.16392949409782887, |
|
"rewards/format_reward": 0.3973214477300644, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 454.8236846923828, |
|
"epoch": 0.07643312101910828, |
|
"grad_norm": 0.34607765078544617, |
|
"kl": 0.18994140625, |
|
"learning_rate": 4.979215594284924e-06, |
|
"loss": 0.0019, |
|
"reward": 0.16812831349670887, |
|
"reward_std": 0.16833286173641682, |
|
"rewards/code_reward": 0.10094079561531544, |
|
"rewards/format_reward": 0.6718750298023224, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 413.80358123779297, |
|
"epoch": 0.07855626326963906, |
|
"grad_norm": 0.30116426944732666, |
|
"kl": 0.1982421875, |
|
"learning_rate": 4.977192427388722e-06, |
|
"loss": 0.002, |
|
"reward": 0.24648623168468475, |
|
"reward_std": 0.1688873954117298, |
|
"rewards/code_reward": 0.16099514812231064, |
|
"rewards/format_reward": 0.854910746216774, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 412.4464416503906, |
|
"epoch": 0.08067940552016985, |
|
"grad_norm": 0.3934517204761505, |
|
"kl": 0.248046875, |
|
"learning_rate": 4.9750758119588824e-06, |
|
"loss": 0.0025, |
|
"reward": 0.24308543279767036, |
|
"reward_std": 0.14966130815446377, |
|
"rewards/code_reward": 0.1495586484670639, |
|
"rewards/format_reward": 0.9352678954601288, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 424.8928756713867, |
|
"epoch": 0.08280254777070063, |
|
"grad_norm": 0.3077991008758545, |
|
"kl": 0.256103515625, |
|
"learning_rate": 4.972865836804349e-06, |
|
"loss": 0.0026, |
|
"reward": 0.2948240712285042, |
|
"reward_std": 0.17200535349547863, |
|
"rewards/code_reward": 0.19995798915624619, |
|
"rewards/format_reward": 0.9486607611179352, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 445.8326110839844, |
|
"epoch": 0.08492569002123142, |
|
"grad_norm": 0.3074510097503662, |
|
"kl": 0.259765625, |
|
"learning_rate": 4.970562594651254e-06, |
|
"loss": 0.0026, |
|
"reward": 0.2571263238787651, |
|
"reward_std": 0.1593556720763445, |
|
"rewards/code_reward": 0.16226024366915226, |
|
"rewards/format_reward": 0.9486607611179352, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 474.68082427978516, |
|
"epoch": 0.0870488322717622, |
|
"grad_norm": 0.28787654638290405, |
|
"kl": 0.2421875, |
|
"learning_rate": 4.968166182139026e-06, |
|
"loss": 0.0024, |
|
"reward": 0.27686072885990143, |
|
"reward_std": 0.16917606256902218, |
|
"rewards/code_reward": 0.18378033302724361, |
|
"rewards/format_reward": 0.9308036118745804, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 520.4397583007812, |
|
"epoch": 0.08917197452229299, |
|
"grad_norm": 0.2756073772907257, |
|
"kl": 0.22216796875, |
|
"learning_rate": 4.9656766998163306e-06, |
|
"loss": 0.0023, |
|
"reward": 0.29509423673152924, |
|
"reward_std": 0.13862515799701214, |
|
"rewards/code_reward": 0.20402280241250992, |
|
"rewards/format_reward": 0.910714328289032, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 520.6406555175781, |
|
"epoch": 0.09129511677282377, |
|
"grad_norm": 0.2641509473323822, |
|
"kl": 0.1728515625, |
|
"learning_rate": 4.963094252136865e-06, |
|
"loss": 0.0017, |
|
"reward": 0.3755844831466675, |
|
"reward_std": 0.19650832191109657, |
|
"rewards/code_reward": 0.280941616743803, |
|
"rewards/format_reward": 0.9464286118745804, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 520.0803909301758, |
|
"epoch": 0.09341825902335456, |
|
"grad_norm": 0.28843629360198975, |
|
"kl": 0.206298828125, |
|
"learning_rate": 4.960418947454958e-06, |
|
"loss": 0.0021, |
|
"reward": 0.21916456520557404, |
|
"reward_std": 0.12000982835888863, |
|
"rewards/code_reward": 0.12407528422772884, |
|
"rewards/format_reward": 0.95089291036129, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 516.7455749511719, |
|
"epoch": 0.09554140127388536, |
|
"grad_norm": 0.9614177942276001, |
|
"kl": 0.203125, |
|
"learning_rate": 4.957650898021038e-06, |
|
"loss": 0.002, |
|
"reward": 0.26794980466365814, |
|
"reward_std": 0.14450966753065586, |
|
"rewards/code_reward": 0.17397657968103886, |
|
"rewards/format_reward": 0.9397321939468384, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 517.7343978881836, |
|
"epoch": 0.09766454352441614, |
|
"grad_norm": 0.2904169261455536, |
|
"kl": 0.17041015625, |
|
"learning_rate": 4.954790219976915e-06, |
|
"loss": 0.0017, |
|
"reward": 0.3067335784435272, |
|
"reward_std": 0.15805694833397865, |
|
"rewards/code_reward": 0.21186750568449497, |
|
"rewards/format_reward": 0.948660746216774, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 533.3594055175781, |
|
"epoch": 0.09978768577494693, |
|
"grad_norm": 0.25753694772720337, |
|
"kl": 0.126953125, |
|
"learning_rate": 4.95183703335091e-06, |
|
"loss": 0.0013, |
|
"reward": 0.22189904749393463, |
|
"reward_std": 0.13265508972108364, |
|
"rewards/code_reward": 0.12390796467661858, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 548.950927734375, |
|
"epoch": 0.10191082802547771, |
|
"grad_norm": 0.26259344816207886, |
|
"kl": 0.1424560546875, |
|
"learning_rate": 4.948791462052819e-06, |
|
"loss": 0.0014, |
|
"reward": 0.22812815010547638, |
|
"reward_std": 0.1622354220598936, |
|
"rewards/code_reward": 0.12991385161876678, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 571.6741485595703, |
|
"epoch": 0.1040339702760085, |
|
"grad_norm": 0.4155128300189972, |
|
"kl": 0.20263671875, |
|
"learning_rate": 4.945653633868716e-06, |
|
"loss": 0.0021, |
|
"reward": 0.24147583171725273, |
|
"reward_std": 0.1386658363044262, |
|
"rewards/code_reward": 0.1450472492724657, |
|
"rewards/format_reward": 0.964285746216774, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 534.6093978881836, |
|
"epoch": 0.10615711252653928, |
|
"grad_norm": 0.24680602550506592, |
|
"kl": 0.159912109375, |
|
"learning_rate": 4.942423680455584e-06, |
|
"loss": 0.0016, |
|
"reward": 0.2133147530257702, |
|
"reward_std": 0.14480553567409515, |
|
"rewards/code_reward": 0.11643974296748638, |
|
"rewards/format_reward": 0.9687500298023224, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 521.4888610839844, |
|
"epoch": 0.10828025477707007, |
|
"grad_norm": 0.27140846848487854, |
|
"kl": 0.172119140625, |
|
"learning_rate": 4.939101737335802e-06, |
|
"loss": 0.0017, |
|
"reward": 0.3708176761865616, |
|
"reward_std": 0.1698193922638893, |
|
"rewards/code_reward": 0.2730497941374779, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 550.3236846923828, |
|
"epoch": 0.11040339702760085, |
|
"grad_norm": 0.24256259202957153, |
|
"kl": 0.145751953125, |
|
"learning_rate": 4.935687943891447e-06, |
|
"loss": 0.0015, |
|
"reward": 0.30257678776979446, |
|
"reward_std": 0.1430999655276537, |
|
"rewards/code_reward": 0.2057017907500267, |
|
"rewards/format_reward": 0.9687500447034836, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 551.3772583007812, |
|
"epoch": 0.11252653927813164, |
|
"grad_norm": 0.2562994062900543, |
|
"kl": 0.16259765625, |
|
"learning_rate": 4.932182443358458e-06, |
|
"loss": 0.0016, |
|
"reward": 0.314239501953125, |
|
"reward_std": 0.21334025636315346, |
|
"rewards/code_reward": 0.21624841168522835, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 553.4955596923828, |
|
"epoch": 0.11464968152866242, |
|
"grad_norm": 0.23835241794586182, |
|
"kl": 0.160888671875, |
|
"learning_rate": 4.928585382820616e-06, |
|
"loss": 0.0016, |
|
"reward": 0.25176869705319405, |
|
"reward_std": 0.11105065606534481, |
|
"rewards/code_reward": 0.1535544078797102, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 552.2366333007812, |
|
"epoch": 0.11677282377919321, |
|
"grad_norm": 0.2630121409893036, |
|
"kl": 0.1552734375, |
|
"learning_rate": 4.924896913203376e-06, |
|
"loss": 0.0016, |
|
"reward": 0.24022378027439117, |
|
"reward_std": 0.15625984594225883, |
|
"rewards/code_reward": 0.14133985061198473, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 574.1295013427734, |
|
"epoch": 0.11889596602972399, |
|
"grad_norm": 0.3262800872325897, |
|
"kl": 0.1572265625, |
|
"learning_rate": 4.921117189267535e-06, |
|
"loss": 0.0016, |
|
"reward": 0.32679247856140137, |
|
"reward_std": 0.19292927533388138, |
|
"rewards/code_reward": 0.22991745918989182, |
|
"rewards/format_reward": 0.9687500298023224, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 541.9576110839844, |
|
"epoch": 0.12101910828025478, |
|
"grad_norm": 0.2467201203107834, |
|
"kl": 0.17578125, |
|
"learning_rate": 4.917246369602742e-06, |
|
"loss": 0.0018, |
|
"reward": 0.25976729951798916, |
|
"reward_std": 0.1260015396401286, |
|
"rewards/code_reward": 0.16110657062381506, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 553.419677734375, |
|
"epoch": 0.12314225053078556, |
|
"grad_norm": 0.2763681709766388, |
|
"kl": 0.15478515625, |
|
"learning_rate": 4.9132846166208355e-06, |
|
"loss": 0.0016, |
|
"reward": 0.2834607996046543, |
|
"reward_std": 0.1603868044912815, |
|
"rewards/code_reward": 0.1852465160191059, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 542.2343826293945, |
|
"epoch": 0.12526539278131635, |
|
"grad_norm": 1.2009683847427368, |
|
"kl": 0.203125, |
|
"learning_rate": 4.9092320965490365e-06, |
|
"loss": 0.002, |
|
"reward": 0.36397186666727066, |
|
"reward_std": 0.20367462560534477, |
|
"rewards/code_reward": 0.26531114615499973, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 502.7076110839844, |
|
"epoch": 0.12738853503184713, |
|
"grad_norm": 0.291824609041214, |
|
"kl": 0.1533203125, |
|
"learning_rate": 4.905088979422971e-06, |
|
"loss": 0.0015, |
|
"reward": 0.33501066267490387, |
|
"reward_std": 0.17072956077754498, |
|
"rewards/code_reward": 0.23701957240700722, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 512.5134201049805, |
|
"epoch": 0.12951167728237792, |
|
"grad_norm": 0.2763117849826813, |
|
"kl": 0.1837158203125, |
|
"learning_rate": 4.900855439079536e-06, |
|
"loss": 0.0019, |
|
"reward": 0.3404688164591789, |
|
"reward_std": 0.19662801921367645, |
|
"rewards/code_reward": 0.2453795075416565, |
|
"rewards/format_reward": 0.9508928954601288, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 526.8995819091797, |
|
"epoch": 0.1316348195329087, |
|
"grad_norm": 0.2876502275466919, |
|
"kl": 0.19580078125, |
|
"learning_rate": 4.8965316531496055e-06, |
|
"loss": 0.002, |
|
"reward": 0.2866082601249218, |
|
"reward_std": 0.16614584252238274, |
|
"rewards/code_reward": 0.19129573553800583, |
|
"rewards/format_reward": 0.9531250447034836, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 593.2924499511719, |
|
"epoch": 0.1337579617834395, |
|
"grad_norm": 2.4047532081604004, |
|
"kl": 0.41357421875, |
|
"learning_rate": 4.892117803050578e-06, |
|
"loss": 0.0041, |
|
"reward": 0.2631051279604435, |
|
"reward_std": 0.2128530964255333, |
|
"rewards/code_reward": 0.17359617352485657, |
|
"rewards/format_reward": 0.895089328289032, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 563.3437805175781, |
|
"epoch": 0.13588110403397027, |
|
"grad_norm": 0.2846791446208954, |
|
"kl": 0.197509765625, |
|
"learning_rate": 4.887614073978761e-06, |
|
"loss": 0.002, |
|
"reward": 0.2669316381216049, |
|
"reward_std": 0.14747418276965618, |
|
"rewards/code_reward": 0.17630662396550179, |
|
"rewards/format_reward": 0.9062500447034836, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 535.6183319091797, |
|
"epoch": 0.13800424628450106, |
|
"grad_norm": 0.2759235203266144, |
|
"kl": 0.186767578125, |
|
"learning_rate": 4.883020654901609e-06, |
|
"loss": 0.0019, |
|
"reward": 0.28016526997089386, |
|
"reward_std": 0.17947101965546608, |
|
"rewards/code_reward": 0.18730811774730682, |
|
"rewards/format_reward": 0.928571492433548, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 616.9732360839844, |
|
"epoch": 0.14012738853503184, |
|
"grad_norm": 0.26271936297416687, |
|
"kl": 0.23974609375, |
|
"learning_rate": 4.878337738549785e-06, |
|
"loss": 0.0024, |
|
"reward": 0.23576084896922112, |
|
"reward_std": 0.18645637948065996, |
|
"rewards/code_reward": 0.1466983389109373, |
|
"rewards/format_reward": 0.8906250447034836, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 591.013427734375, |
|
"epoch": 0.14225053078556263, |
|
"grad_norm": 0.2741730511188507, |
|
"kl": 0.220947265625, |
|
"learning_rate": 4.873565521409082e-06, |
|
"loss": 0.0023, |
|
"reward": 0.2887257859110832, |
|
"reward_std": 0.15980570390820503, |
|
"rewards/code_reward": 0.20055612176656723, |
|
"rewards/format_reward": 0.8816964775323868, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 567.3995819091797, |
|
"epoch": 0.14437367303609341, |
|
"grad_norm": 0.30026066303253174, |
|
"kl": 0.196533203125, |
|
"learning_rate": 4.868704203712173e-06, |
|
"loss": 0.002, |
|
"reward": 0.2695513255894184, |
|
"reward_std": 0.13891723938286304, |
|
"rewards/code_reward": 0.18361380137503147, |
|
"rewards/format_reward": 0.8593750447034836, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 557.5424346923828, |
|
"epoch": 0.1464968152866242, |
|
"grad_norm": 0.26508811116218567, |
|
"kl": 0.2275390625, |
|
"learning_rate": 4.86375398943021e-06, |
|
"loss": 0.0023, |
|
"reward": 0.2681450620293617, |
|
"reward_std": 0.15566366165876389, |
|
"rewards/code_reward": 0.17618075758218765, |
|
"rewards/format_reward": 0.91964291036129, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 567.8192291259766, |
|
"epoch": 0.14861995753715498, |
|
"grad_norm": 0.2842702567577362, |
|
"kl": 0.206787109375, |
|
"learning_rate": 4.858715086264274e-06, |
|
"loss": 0.0021, |
|
"reward": 0.3179836943745613, |
|
"reward_std": 0.17460669204592705, |
|
"rewards/code_reward": 0.2246801033616066, |
|
"rewards/format_reward": 0.9330357611179352, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 537.6094055175781, |
|
"epoch": 0.15074309978768577, |
|
"grad_norm": 0.2676061689853668, |
|
"kl": 0.208740234375, |
|
"learning_rate": 4.853587705636646e-06, |
|
"loss": 0.0021, |
|
"reward": 0.29776863381266594, |
|
"reward_std": 0.16130083054304123, |
|
"rewards/code_reward": 0.2037954218685627, |
|
"rewards/format_reward": 0.9397321790456772, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 575.2143096923828, |
|
"epoch": 0.15286624203821655, |
|
"grad_norm": 0.2367551475763321, |
|
"kl": 0.189208984375, |
|
"learning_rate": 4.84837206268195e-06, |
|
"loss": 0.0019, |
|
"reward": 0.23160668835043907, |
|
"reward_std": 0.13609608635306358, |
|
"rewards/code_reward": 0.13785668183118105, |
|
"rewards/format_reward": 0.9375000447034836, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 615.6518096923828, |
|
"epoch": 0.15498938428874734, |
|
"grad_norm": 0.2558582127094269, |
|
"kl": 0.199462890625, |
|
"learning_rate": 4.8430683762381195e-06, |
|
"loss": 0.002, |
|
"reward": 0.3226686045527458, |
|
"reward_std": 0.18408508598804474, |
|
"rewards/code_reward": 0.22847215831279755, |
|
"rewards/format_reward": 0.9419643133878708, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 581.3549499511719, |
|
"epoch": 0.15711252653927812, |
|
"grad_norm": 0.25004705786705017, |
|
"kl": 0.2255859375, |
|
"learning_rate": 4.837676868837213e-06, |
|
"loss": 0.0023, |
|
"reward": 0.3521072790026665, |
|
"reward_std": 0.18179307878017426, |
|
"rewards/code_reward": 0.2556787021458149, |
|
"rewards/format_reward": 0.9642857611179352, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 615.6384124755859, |
|
"epoch": 0.1592356687898089, |
|
"grad_norm": 0.2326764315366745, |
|
"kl": 0.184326171875, |
|
"learning_rate": 4.832197766696085e-06, |
|
"loss": 0.002, |
|
"reward": 0.3262624219059944, |
|
"reward_std": 0.13872519508004189, |
|
"rewards/code_reward": 0.22827134653925896, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 626.1919708251953, |
|
"epoch": 0.1613588110403397, |
|
"grad_norm": 0.22483916580677032, |
|
"kl": 0.2158203125, |
|
"learning_rate": 4.826631299706887e-06, |
|
"loss": 0.0022, |
|
"reward": 0.24266962707042694, |
|
"reward_std": 0.15623858594335616, |
|
"rewards/code_reward": 0.14579462260007858, |
|
"rewards/format_reward": 0.9687500447034836, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 638.6696624755859, |
|
"epoch": 0.16348195329087048, |
|
"grad_norm": 0.2549837827682495, |
|
"kl": 0.2041015625, |
|
"learning_rate": 4.820977701427424e-06, |
|
"loss": 0.002, |
|
"reward": 0.3548019379377365, |
|
"reward_std": 0.19029108062386513, |
|
"rewards/code_reward": 0.2577037066221237, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 645.8727874755859, |
|
"epoch": 0.16560509554140126, |
|
"grad_norm": 0.2154364138841629, |
|
"kl": 0.21875, |
|
"learning_rate": 4.81523720907136e-06, |
|
"loss": 0.0022, |
|
"reward": 0.23285862803459167, |
|
"reward_std": 0.129691231995821, |
|
"rewards/code_reward": 0.13531397026963532, |
|
"rewards/format_reward": 0.975446492433548, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 631.9732513427734, |
|
"epoch": 0.16772823779193205, |
|
"grad_norm": 0.22862014174461365, |
|
"kl": 0.21435546875, |
|
"learning_rate": 4.809410063498254e-06, |
|
"loss": 0.0022, |
|
"reward": 0.33913441002368927, |
|
"reward_std": 0.1570077408105135, |
|
"rewards/code_reward": 0.2411433346569538, |
|
"rewards/format_reward": 0.9799107760190964, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 650.060302734375, |
|
"epoch": 0.16985138004246284, |
|
"grad_norm": 0.2403189241886139, |
|
"kl": 0.218994140625, |
|
"learning_rate": 4.8034965092034656e-06, |
|
"loss": 0.0022, |
|
"reward": 0.2641909271478653, |
|
"reward_std": 0.15404854156076908, |
|
"rewards/code_reward": 0.16664628125727177, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 665.2299346923828, |
|
"epoch": 0.17197452229299362, |
|
"grad_norm": 0.22183021903038025, |
|
"kl": 0.17236328125, |
|
"learning_rate": 4.797496794307889e-06, |
|
"loss": 0.0017, |
|
"reward": 0.26636216044425964, |
|
"reward_std": 0.159404331818223, |
|
"rewards/code_reward": 0.16814786568284035, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 646.7277069091797, |
|
"epoch": 0.1740976645435244, |
|
"grad_norm": 0.22360101342201233, |
|
"kl": 0.1865234375, |
|
"learning_rate": 4.791411170547545e-06, |
|
"loss": 0.0019, |
|
"reward": 0.2806714288890362, |
|
"reward_std": 0.1345765646547079, |
|
"rewards/code_reward": 0.1829035673290491, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 649.9285888671875, |
|
"epoch": 0.1762208067940552, |
|
"grad_norm": 0.2544306218624115, |
|
"kl": 0.173583984375, |
|
"learning_rate": 4.785239893263017e-06, |
|
"loss": 0.0017, |
|
"reward": 0.26558100432157516, |
|
"reward_std": 0.13677635975182056, |
|
"rewards/code_reward": 0.16825956851243973, |
|
"rewards/format_reward": 0.9732143431901932, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 686.3661041259766, |
|
"epoch": 0.17834394904458598, |
|
"grad_norm": 0.21967419981956482, |
|
"kl": 0.16162109375, |
|
"learning_rate": 4.778983221388742e-06, |
|
"loss": 0.0016, |
|
"reward": 0.24129238724708557, |
|
"reward_std": 0.1258857063949108, |
|
"rewards/code_reward": 0.14330130256712437, |
|
"rewards/format_reward": 0.9799107760190964, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 634.2634124755859, |
|
"epoch": 0.18046709129511676, |
|
"grad_norm": 0.254304975271225, |
|
"kl": 0.17724609375, |
|
"learning_rate": 4.77264141744214e-06, |
|
"loss": 0.0018, |
|
"reward": 0.3212145194411278, |
|
"reward_std": 0.1864020749926567, |
|
"rewards/code_reward": 0.2236698605120182, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 641.8393096923828, |
|
"epoch": 0.18259023354564755, |
|
"grad_norm": 0.24272438883781433, |
|
"kl": 0.19873046875, |
|
"learning_rate": 4.766214747512603e-06, |
|
"loss": 0.002, |
|
"reward": 0.31076986342668533, |
|
"reward_std": 0.18200884014368057, |
|
"rewards/code_reward": 0.2134484425187111, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 643.4263763427734, |
|
"epoch": 0.18471337579617833, |
|
"grad_norm": 0.2264644354581833, |
|
"kl": 0.185791015625, |
|
"learning_rate": 4.759703481250331e-06, |
|
"loss": 0.0019, |
|
"reward": 0.3214620351791382, |
|
"reward_std": 0.14903312921524048, |
|
"rewards/code_reward": 0.22347095608711243, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 649.5580749511719, |
|
"epoch": 0.18683651804670912, |
|
"grad_norm": 0.22847051918506622, |
|
"kl": 0.169677734375, |
|
"learning_rate": 4.753107891855015e-06, |
|
"loss": 0.0018, |
|
"reward": 0.25390685349702835, |
|
"reward_std": 0.12118050269782543, |
|
"rewards/code_reward": 0.15680862963199615, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 652.5513610839844, |
|
"epoch": 0.18895966029723993, |
|
"grad_norm": 0.22527199983596802, |
|
"kl": 0.19580078125, |
|
"learning_rate": 4.746428256064375e-06, |
|
"loss": 0.002, |
|
"reward": 0.303693201392889, |
|
"reward_std": 0.1710791066288948, |
|
"rewards/code_reward": 0.20525570400059223, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 693.4174499511719, |
|
"epoch": 0.1910828025477707, |
|
"grad_norm": 0.2056378573179245, |
|
"kl": 0.17041015625, |
|
"learning_rate": 4.7396648541425534e-06, |
|
"loss": 0.0017, |
|
"reward": 0.2523197568953037, |
|
"reward_std": 0.1238141655921936, |
|
"rewards/code_reward": 0.15432866849005222, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 679.5580596923828, |
|
"epoch": 0.1932059447983015, |
|
"grad_norm": 0.225660502910614, |
|
"kl": 0.175537109375, |
|
"learning_rate": 4.732817969868348e-06, |
|
"loss": 0.0018, |
|
"reward": 0.25567496195435524, |
|
"reward_std": 0.16754142567515373, |
|
"rewards/code_reward": 0.15813031047582626, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 627.4464569091797, |
|
"epoch": 0.19532908704883228, |
|
"grad_norm": 0.24276329576969147, |
|
"kl": 0.1419677734375, |
|
"learning_rate": 4.7258878905233095e-06, |
|
"loss": 0.0014, |
|
"reward": 0.3579171895980835, |
|
"reward_std": 0.21506508812308311, |
|
"rewards/code_reward": 0.25992610678076744, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 664.091552734375, |
|
"epoch": 0.19745222929936307, |
|
"grad_norm": 0.2459368258714676, |
|
"kl": 0.17431640625, |
|
"learning_rate": 4.718874906879688e-06, |
|
"loss": 0.0017, |
|
"reward": 0.25773513317108154, |
|
"reward_std": 0.16034462675452232, |
|
"rewards/code_reward": 0.16130654886364937, |
|
"rewards/format_reward": 0.964285746216774, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 641.9576110839844, |
|
"epoch": 0.19957537154989385, |
|
"grad_norm": 0.20433549582958221, |
|
"kl": 0.135986328125, |
|
"learning_rate": 4.711779313188231e-06, |
|
"loss": 0.0014, |
|
"reward": 0.31772880256175995, |
|
"reward_std": 0.12460769526660442, |
|
"rewards/code_reward": 0.218844847753644, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 668.7522430419922, |
|
"epoch": 0.20169851380042464, |
|
"grad_norm": 0.228180393576622, |
|
"kl": 0.1337890625, |
|
"learning_rate": 4.70460140716584e-06, |
|
"loss": 0.0014, |
|
"reward": 0.23017888888716698, |
|
"reward_std": 0.16307671833783388, |
|
"rewards/code_reward": 0.13196459133177996, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 666.0960083007812, |
|
"epoch": 0.20382165605095542, |
|
"grad_norm": 0.23849396407604218, |
|
"kl": 0.132568359375, |
|
"learning_rate": 4.697341489983076e-06, |
|
"loss": 0.0013, |
|
"reward": 0.38449443876743317, |
|
"reward_std": 0.205027487128973, |
|
"rewards/code_reward": 0.2869497686624527, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 638.5513763427734, |
|
"epoch": 0.2059447983014862, |
|
"grad_norm": 0.23833003640174866, |
|
"kl": 0.1219482421875, |
|
"learning_rate": 4.6899998662515215e-06, |
|
"loss": 0.0012, |
|
"reward": 0.30101777240633965, |
|
"reward_std": 0.18449735268950462, |
|
"rewards/code_reward": 0.20235705375671387, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 638.1161041259766, |
|
"epoch": 0.208067940552017, |
|
"grad_norm": 0.21731068193912506, |
|
"kl": 0.146484375, |
|
"learning_rate": 4.682576844011007e-06, |
|
"loss": 0.0015, |
|
"reward": 0.2744937762618065, |
|
"reward_std": 0.16123195737600327, |
|
"rewards/code_reward": 0.17650271020829678, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 616.8861846923828, |
|
"epoch": 0.21019108280254778, |
|
"grad_norm": 0.25706782937049866, |
|
"kl": 0.134033203125, |
|
"learning_rate": 4.675072734716678e-06, |
|
"loss": 0.0013, |
|
"reward": 0.27044272795319557, |
|
"reward_std": 0.17698625475168228, |
|
"rewards/code_reward": 0.17245164141058922, |
|
"rewards/format_reward": 0.9799107313156128, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 617.6875305175781, |
|
"epoch": 0.21231422505307856, |
|
"grad_norm": 0.23481673002243042, |
|
"kl": 0.123291015625, |
|
"learning_rate": 4.667487853225931e-06, |
|
"loss": 0.0013, |
|
"reward": 0.27581261470913887, |
|
"reward_std": 0.1309206485748291, |
|
"rewards/code_reward": 0.17692868784070015, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 649.1205749511719, |
|
"epoch": 0.21443736730360935, |
|
"grad_norm": 0.22034895420074463, |
|
"kl": 0.1197509765625, |
|
"learning_rate": 4.659822517785203e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3144006244838238, |
|
"reward_std": 0.1468491405248642, |
|
"rewards/code_reward": 0.21529346704483032, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 637.7120819091797, |
|
"epoch": 0.21656050955414013, |
|
"grad_norm": 0.23196153342723846, |
|
"kl": 0.1163330078125, |
|
"learning_rate": 4.6520770500166165e-06, |
|
"loss": 0.0012, |
|
"reward": 0.2747727185487747, |
|
"reward_std": 0.15404804423451424, |
|
"rewards/code_reward": 0.17678163386881351, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 640.4375305175781, |
|
"epoch": 0.21868365180467092, |
|
"grad_norm": 0.21843470633029938, |
|
"kl": 0.111083984375, |
|
"learning_rate": 4.644251774904487e-06, |
|
"loss": 0.0012, |
|
"reward": 0.2366674654185772, |
|
"reward_std": 0.12331773899495602, |
|
"rewards/code_reward": 0.13889960199594498, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 635.5982513427734, |
|
"epoch": 0.2208067940552017, |
|
"grad_norm": 0.2491585612297058, |
|
"kl": 0.1253662109375, |
|
"learning_rate": 4.636347020781684e-06, |
|
"loss": 0.0013, |
|
"reward": 0.26541591063141823, |
|
"reward_std": 0.20751060917973518, |
|
"rewards/code_reward": 0.16876413114368916, |
|
"rewards/format_reward": 0.9665178954601288, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 636.6919860839844, |
|
"epoch": 0.2229299363057325, |
|
"grad_norm": 0.22667376697063446, |
|
"kl": 0.1239013671875, |
|
"learning_rate": 4.6283631193158605e-06, |
|
"loss": 0.0013, |
|
"reward": 0.28487036004662514, |
|
"reward_std": 0.16408125311136246, |
|
"rewards/code_reward": 0.18732571229338646, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 645.0491333007812, |
|
"epoch": 0.22505307855626328, |
|
"grad_norm": 0.2283681333065033, |
|
"kl": 0.124267578125, |
|
"learning_rate": 4.620300405495532e-06, |
|
"loss": 0.0013, |
|
"reward": 0.2775597535073757, |
|
"reward_std": 0.15426970086991787, |
|
"rewards/code_reward": 0.17867580242455006, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 633.8794860839844, |
|
"epoch": 0.22717622080679406, |
|
"grad_norm": 0.24237921833992004, |
|
"kl": 0.1158447265625, |
|
"learning_rate": 4.612159217616022e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3130115121603012, |
|
"reward_std": 0.20111830905079842, |
|
"rewards/code_reward": 0.21502043306827545, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 607.8995666503906, |
|
"epoch": 0.22929936305732485, |
|
"grad_norm": 0.22627882659435272, |
|
"kl": 0.1114501953125, |
|
"learning_rate": 4.603939897265268e-06, |
|
"loss": 0.0011, |
|
"reward": 0.2647922486066818, |
|
"reward_std": 0.13070931658148766, |
|
"rewards/code_reward": 0.16546186804771423, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 611.9308166503906, |
|
"epoch": 0.23142250530785563, |
|
"grad_norm": 0.24682241678237915, |
|
"kl": 0.11474609375, |
|
"learning_rate": 4.595642789309492e-06, |
|
"loss": 0.0012, |
|
"reward": 0.24479227885603905, |
|
"reward_std": 0.14851071499288082, |
|
"rewards/code_reward": 0.14657797291874886, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 601.9018096923828, |
|
"epoch": 0.23354564755838642, |
|
"grad_norm": 0.22991596162319183, |
|
"kl": 0.1337890625, |
|
"learning_rate": 4.587268241878724e-06, |
|
"loss": 0.0014, |
|
"reward": 0.3472997844219208, |
|
"reward_std": 0.20232820883393288, |
|
"rewards/code_reward": 0.24953191354870796, |
|
"rewards/format_reward": 0.9776785969734192, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 619.3036041259766, |
|
"epoch": 0.2356687898089172, |
|
"grad_norm": 0.23583151400089264, |
|
"kl": 0.142822265625, |
|
"learning_rate": 4.578816606352205e-06, |
|
"loss": 0.0014, |
|
"reward": 0.29563019424676895, |
|
"reward_std": 0.17909668013453484, |
|
"rewards/code_reward": 0.1987551935017109, |
|
"rewards/format_reward": 0.9687500298023224, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 576.4419860839844, |
|
"epoch": 0.23779193205944799, |
|
"grad_norm": 0.2594500780105591, |
|
"kl": 0.11865234375, |
|
"learning_rate": 4.570288237343632e-06, |
|
"loss": 0.0012, |
|
"reward": 0.37235086783766747, |
|
"reward_std": 0.21180756203830242, |
|
"rewards/code_reward": 0.27346691489219666, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 600.5937805175781, |
|
"epoch": 0.23991507430997877, |
|
"grad_norm": 0.24643369019031525, |
|
"kl": 0.1270751953125, |
|
"learning_rate": 4.561683492686289e-06, |
|
"loss": 0.0013, |
|
"reward": 0.31715739518404007, |
|
"reward_std": 0.18861495703458786, |
|
"rewards/code_reward": 0.21871986612677574, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 587.732177734375, |
|
"epoch": 0.24203821656050956, |
|
"grad_norm": 0.23611418902873993, |
|
"kl": 0.1268310546875, |
|
"learning_rate": 4.5530027334180285e-06, |
|
"loss": 0.0013, |
|
"reward": 0.26467062532901764, |
|
"reward_std": 0.17901071533560753, |
|
"rewards/code_reward": 0.16712596639990807, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 599.2254791259766, |
|
"epoch": 0.24416135881104034, |
|
"grad_norm": 0.24544627964496613, |
|
"kl": 0.1339111328125, |
|
"learning_rate": 4.544246323766122e-06, |
|
"loss": 0.0014, |
|
"reward": 0.27841826155781746, |
|
"reward_std": 0.16098117642104626, |
|
"rewards/code_reward": 0.18132001720368862, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 570.107177734375, |
|
"epoch": 0.24628450106157113, |
|
"grad_norm": 0.25169771909713745, |
|
"kl": 0.130859375, |
|
"learning_rate": 4.535414631131983e-06, |
|
"loss": 0.0013, |
|
"reward": 0.34019989520311356, |
|
"reward_std": 0.235354982316494, |
|
"rewards/code_reward": 0.2422088049352169, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 586.2834930419922, |
|
"epoch": 0.2484076433121019, |
|
"grad_norm": 0.2503570318222046, |
|
"kl": 0.1285400390625, |
|
"learning_rate": 4.526508026075746e-06, |
|
"loss": 0.0013, |
|
"reward": 0.33243585377931595, |
|
"reward_std": 0.15851835533976555, |
|
"rewards/code_reward": 0.23310547694563866, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 619.6964569091797, |
|
"epoch": 0.2505307855626327, |
|
"grad_norm": 0.2021104097366333, |
|
"kl": 0.1275634765625, |
|
"learning_rate": 4.517526882300721e-06, |
|
"loss": 0.0013, |
|
"reward": 0.1987566240131855, |
|
"reward_std": 0.12675911094993353, |
|
"rewards/code_reward": 0.10143518354743719, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 577.7500305175781, |
|
"epoch": 0.2526539278131635, |
|
"grad_norm": 0.23152245581150055, |
|
"kl": 0.139404296875, |
|
"learning_rate": 4.508471576637713e-06, |
|
"loss": 0.0014, |
|
"reward": 0.24329132214188576, |
|
"reward_std": 0.16570740193128586, |
|
"rewards/code_reward": 0.14485381357371807, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 599.3102874755859, |
|
"epoch": 0.25477707006369427, |
|
"grad_norm": 0.23706002533435822, |
|
"kl": 0.1292724609375, |
|
"learning_rate": 4.499342489029211e-06, |
|
"loss": 0.0013, |
|
"reward": 0.24242350459098816, |
|
"reward_std": 0.14784781634807587, |
|
"rewards/code_reward": 0.14398599043488503, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 572.8839569091797, |
|
"epoch": 0.25690021231422505, |
|
"grad_norm": 0.2494058609008789, |
|
"kl": 0.1270751953125, |
|
"learning_rate": 4.490140002513449e-06, |
|
"loss": 0.0013, |
|
"reward": 0.26072419434785843, |
|
"reward_std": 0.12450610846281052, |
|
"rewards/code_reward": 0.16117061115801334, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 601.8080749511719, |
|
"epoch": 0.25902335456475584, |
|
"grad_norm": 0.23028254508972168, |
|
"kl": 0.1180419921875, |
|
"learning_rate": 4.48086450320833e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3514738455414772, |
|
"reward_std": 0.16258227452635765, |
|
"rewards/code_reward": 0.2525899298489094, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 590.4040374755859, |
|
"epoch": 0.2611464968152866, |
|
"grad_norm": 0.24208419024944305, |
|
"kl": 0.1234130859375, |
|
"learning_rate": 4.4715163802952266e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3460327610373497, |
|
"reward_std": 0.1636445987969637, |
|
"rewards/code_reward": 0.24647919461131096, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 609.7098388671875, |
|
"epoch": 0.2632696390658174, |
|
"grad_norm": 0.253397136926651, |
|
"kl": 0.135009765625, |
|
"learning_rate": 4.462096026002655e-06, |
|
"loss": 0.0014, |
|
"reward": 0.25145725160837173, |
|
"reward_std": 0.16506105288863182, |
|
"rewards/code_reward": 0.1530197374522686, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 603.8973388671875, |
|
"epoch": 0.2653927813163482, |
|
"grad_norm": 0.2500688135623932, |
|
"kl": 0.1434326171875, |
|
"learning_rate": 4.4526038355898144e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3970717117190361, |
|
"reward_std": 0.2130543477833271, |
|
"rewards/code_reward": 0.29908062517642975, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 615.4620819091797, |
|
"epoch": 0.267515923566879, |
|
"grad_norm": 0.20834827423095703, |
|
"kl": 0.1336669921875, |
|
"learning_rate": 4.4430402073300035e-06, |
|
"loss": 0.0014, |
|
"reward": 0.26642825454473495, |
|
"reward_std": 0.1255171401426196, |
|
"rewards/code_reward": 0.16799074038863182, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 617.9687805175781, |
|
"epoch": 0.26963906581740976, |
|
"grad_norm": 0.23262540996074677, |
|
"kl": 0.1351318359375, |
|
"learning_rate": 4.433405542493909e-06, |
|
"loss": 0.0014, |
|
"reward": 0.2870429456233978, |
|
"reward_std": 0.19062896817922592, |
|
"rewards/code_reward": 0.18838223442435265, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 662.2053833007812, |
|
"epoch": 0.27176220806794055, |
|
"grad_norm": 0.22548751533031464, |
|
"kl": 0.1197509765625, |
|
"learning_rate": 4.4237002453327734e-06, |
|
"loss": 0.0013, |
|
"reward": 0.30225350335240364, |
|
"reward_std": 0.1395848747342825, |
|
"rewards/code_reward": 0.203146331012249, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 632.0022430419922, |
|
"epoch": 0.27388535031847133, |
|
"grad_norm": 0.24719858169555664, |
|
"kl": 0.131103515625, |
|
"learning_rate": 4.4139247230614245e-06, |
|
"loss": 0.0013, |
|
"reward": 0.32878731191158295, |
|
"reward_std": 0.16303380951285362, |
|
"rewards/code_reward": 0.22968016006052494, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 636.7299346923828, |
|
"epoch": 0.2760084925690021, |
|
"grad_norm": 0.22243919968605042, |
|
"kl": 0.1234130859375, |
|
"learning_rate": 4.404079385841201e-06, |
|
"loss": 0.0013, |
|
"reward": 0.30703118816018105, |
|
"reward_std": 0.12942655384540558, |
|
"rewards/code_reward": 0.20792402233928442, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 644.5468902587891, |
|
"epoch": 0.2781316348195329, |
|
"grad_norm": 0.220564067363739, |
|
"kl": 0.123291015625, |
|
"learning_rate": 4.394164646762734e-06, |
|
"loss": 0.0013, |
|
"reward": 0.296065516769886, |
|
"reward_std": 0.18630750849843025, |
|
"rewards/code_reward": 0.19673514552414417, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 666.997802734375, |
|
"epoch": 0.2802547770700637, |
|
"grad_norm": 0.22151753306388855, |
|
"kl": 0.1304931640625, |
|
"learning_rate": 4.384180921828618e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3110230341553688, |
|
"reward_std": 0.1834610104560852, |
|
"rewards/code_reward": 0.21370159462094307, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 665.1205749511719, |
|
"epoch": 0.2823779193205945, |
|
"grad_norm": 0.21862035989761353, |
|
"kl": 0.1168212890625, |
|
"learning_rate": 4.374128629935955e-06, |
|
"loss": 0.0012, |
|
"reward": 0.2876487486064434, |
|
"reward_std": 0.21351643651723862, |
|
"rewards/code_reward": 0.18965766951441765, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 703.575927734375, |
|
"epoch": 0.28450106157112526, |
|
"grad_norm": 0.23025038838386536, |
|
"kl": 0.1204833984375, |
|
"learning_rate": 4.364008192858781e-06, |
|
"loss": 0.0013, |
|
"reward": 0.37238020449876785, |
|
"reward_std": 0.17016195878386497, |
|
"rewards/code_reward": 0.2737194746732712, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 726.8817138671875, |
|
"epoch": 0.28662420382165604, |
|
"grad_norm": 0.21229737997055054, |
|
"kl": 0.121337890625, |
|
"learning_rate": 4.353820035230366e-06, |
|
"loss": 0.0012, |
|
"reward": 0.20391739904880524, |
|
"reward_std": 0.13868718035519123, |
|
"rewards/code_reward": 0.10525666922330856, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 712.2098541259766, |
|
"epoch": 0.28874734607218683, |
|
"grad_norm": 0.2108394205570221, |
|
"kl": 0.1138916015625, |
|
"learning_rate": 4.3435645845254e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3125154785811901, |
|
"reward_std": 0.1781605463474989, |
|
"rewards/code_reward": 0.2134083015844226, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 718.5893249511719, |
|
"epoch": 0.2908704883227176, |
|
"grad_norm": 0.2107391357421875, |
|
"kl": 0.1195068359375, |
|
"learning_rate": 4.333242271042054e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3177960254251957, |
|
"reward_std": 0.1659994050860405, |
|
"rewards/code_reward": 0.2186888586729765, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 739.7545013427734, |
|
"epoch": 0.2929936305732484, |
|
"grad_norm": 0.2182048112154007, |
|
"kl": 0.124755859375, |
|
"learning_rate": 4.32285352788393e-06, |
|
"loss": 0.0013, |
|
"reward": 0.30886589735746384, |
|
"reward_std": 0.1802590098232031, |
|
"rewards/code_reward": 0.21020517125725746, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 765.857177734375, |
|
"epoch": 0.2951167728237792, |
|
"grad_norm": 0.19854100048542023, |
|
"kl": 0.115234375, |
|
"learning_rate": 4.312398790941882e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3000107705593109, |
|
"reward_std": 0.15882322564721107, |
|
"rewards/code_reward": 0.201126828789711, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 721.9687805175781, |
|
"epoch": 0.29723991507430997, |
|
"grad_norm": 0.23453111946582794, |
|
"kl": 0.116455078125, |
|
"learning_rate": 4.301878498875735e-06, |
|
"loss": 0.0012, |
|
"reward": 0.33861320093274117, |
|
"reward_std": 0.1569173000752926, |
|
"rewards/code_reward": 0.2401756690815091, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 740.2031707763672, |
|
"epoch": 0.29936305732484075, |
|
"grad_norm": 0.21566148102283478, |
|
"kl": 0.1102294921875, |
|
"learning_rate": 4.291293093095873e-06, |
|
"loss": 0.0011, |
|
"reward": 0.3095410466194153, |
|
"reward_std": 0.1992884911596775, |
|
"rewards/code_reward": 0.2108803205192089, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 710.2969207763672, |
|
"epoch": 0.30148619957537154, |
|
"grad_norm": 0.22105751931667328, |
|
"kl": 0.12060546875, |
|
"learning_rate": 4.280643017744723e-06, |
|
"loss": 0.0013, |
|
"reward": 0.36906543001532555, |
|
"reward_std": 0.21653805300593376, |
|
"rewards/code_reward": 0.2704046741127968, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 751.5937805175781, |
|
"epoch": 0.3036093418259023, |
|
"grad_norm": 0.23819085955619812, |
|
"kl": 0.1221923828125, |
|
"learning_rate": 4.269928719678117e-06, |
|
"loss": 0.0012, |
|
"reward": 0.25049133971333504, |
|
"reward_std": 0.17505915835499763, |
|
"rewards/code_reward": 0.15160739235579967, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 735.9107513427734, |
|
"epoch": 0.3057324840764331, |
|
"grad_norm": 0.2204020470380783, |
|
"kl": 0.1229248046875, |
|
"learning_rate": 4.2591506484465426e-06, |
|
"loss": 0.0012, |
|
"reward": 0.26853859797120094, |
|
"reward_std": 0.16319206822663546, |
|
"rewards/code_reward": 0.17032429203391075, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 730.8906707763672, |
|
"epoch": 0.3078556263269639, |
|
"grad_norm": 0.23124827444553375, |
|
"kl": 0.119140625, |
|
"learning_rate": 4.248309256276283e-06, |
|
"loss": 0.0012, |
|
"reward": 0.34641416370868683, |
|
"reward_std": 0.15815678425133228, |
|
"rewards/code_reward": 0.2479766495525837, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 772.8192291259766, |
|
"epoch": 0.3099787685774947, |
|
"grad_norm": 0.21077241003513336, |
|
"kl": 0.1168212890625, |
|
"learning_rate": 4.23740499805044e-06, |
|
"loss": 0.0012, |
|
"reward": 0.2558128647506237, |
|
"reward_std": 0.12367029674351215, |
|
"rewards/code_reward": 0.15804500319063663, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 740.8214721679688, |
|
"epoch": 0.31210191082802546, |
|
"grad_norm": 0.21619708836078644, |
|
"kl": 0.125732421875, |
|
"learning_rate": 4.22643833128985e-06, |
|
"loss": 0.0013, |
|
"reward": 0.33286692947149277, |
|
"reward_std": 0.211603332310915, |
|
"rewards/code_reward": 0.2342061996459961, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 807.6071624755859, |
|
"epoch": 0.31422505307855625, |
|
"grad_norm": 0.2130916714668274, |
|
"kl": 0.1212158203125, |
|
"learning_rate": 4.215409716133885e-06, |
|
"loss": 0.0012, |
|
"reward": 0.3038931153714657, |
|
"reward_std": 0.19640244916081429, |
|
"rewards/code_reward": 0.2065716814249754, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 757.3281555175781, |
|
"epoch": 0.31634819532908703, |
|
"grad_norm": 0.21959362924098969, |
|
"kl": 0.12744140625, |
|
"learning_rate": 4.204319615321151e-06, |
|
"loss": 0.0013, |
|
"reward": 0.35224368050694466, |
|
"reward_std": 0.1676900666207075, |
|
"rewards/code_reward": 0.2542525976896286, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 748.4687805175781, |
|
"epoch": 0.3184713375796178, |
|
"grad_norm": 0.22287501394748688, |
|
"kl": 0.120361328125, |
|
"learning_rate": 4.193168494170065e-06, |
|
"loss": 0.0012, |
|
"reward": 0.34373533725738525, |
|
"reward_std": 0.18135884031653404, |
|
"rewards/code_reward": 0.2457442507147789, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 755.8906707763672, |
|
"epoch": 0.3205944798301486, |
|
"grad_norm": 1.9565397500991821, |
|
"kl": 0.2320556640625, |
|
"learning_rate": 4.181956820559339e-06, |
|
"loss": 0.0023, |
|
"reward": 0.3970649391412735, |
|
"reward_std": 0.2402110919356346, |
|
"rewards/code_reward": 0.2992970943450928, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 779.8549499511719, |
|
"epoch": 0.3227176220806794, |
|
"grad_norm": 0.2783929109573364, |
|
"kl": 0.15283203125, |
|
"learning_rate": 4.170685064908342e-06, |
|
"loss": 0.0016, |
|
"reward": 0.19563322141766548, |
|
"reward_std": 0.12617591954767704, |
|
"rewards/code_reward": 0.11817785818129778, |
|
"rewards/format_reward": 0.7745535969734192, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 726.2210083007812, |
|
"epoch": 0.3248407643312102, |
|
"grad_norm": 0.3019232451915741, |
|
"kl": 0.157470703125, |
|
"learning_rate": 4.159353700157365e-06, |
|
"loss": 0.0016, |
|
"reward": 0.17416437342762947, |
|
"reward_std": 0.18693338334560394, |
|
"rewards/code_reward": 0.14961079927161336, |
|
"rewards/format_reward": 0.2455357201397419, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 698.8303833007812, |
|
"epoch": 0.32696390658174096, |
|
"grad_norm": 0.2774558365345001, |
|
"kl": 0.1429443359375, |
|
"learning_rate": 4.14796320174778e-06, |
|
"loss": 0.0014, |
|
"reward": 0.16863043326884508, |
|
"reward_std": 0.1559329554438591, |
|
"rewards/code_reward": 0.15412150975316763, |
|
"rewards/format_reward": 0.145089291036129, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 675.5960083007812, |
|
"epoch": 0.32908704883227174, |
|
"grad_norm": 0.2807973027229309, |
|
"kl": 0.1292724609375, |
|
"learning_rate": 4.136514047602087e-06, |
|
"loss": 0.0013, |
|
"reward": 0.18624619487673044, |
|
"reward_std": 0.18390434235334396, |
|
"rewards/code_reward": 0.15968369878828526, |
|
"rewards/format_reward": 0.2656250074505806, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 639.5625305175781, |
|
"epoch": 0.33121019108280253, |
|
"grad_norm": 0.2780408263206482, |
|
"kl": 0.1229248046875, |
|
"learning_rate": 4.1250067181038635e-06, |
|
"loss": 0.0012, |
|
"reward": 0.2191852517426014, |
|
"reward_std": 0.13604657351970673, |
|
"rewards/code_reward": 0.16851558908820152, |
|
"rewards/format_reward": 0.5066964477300644, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 639.6652069091797, |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.26467737555503845, |
|
"kl": 0.144775390625, |
|
"learning_rate": 4.113441696077608e-06, |
|
"loss": 0.0014, |
|
"reward": 0.31026700511574745, |
|
"reward_std": 0.202113538980484, |
|
"rewards/code_reward": 0.2345973663032055, |
|
"rewards/format_reward": 0.7566964626312256, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 659.5558319091797, |
|
"epoch": 0.3354564755838641, |
|
"grad_norm": 0.24479494988918304, |
|
"kl": 0.1280517578125, |
|
"learning_rate": 4.101819466768484e-06, |
|
"loss": 0.0013, |
|
"reward": 0.2640949599444866, |
|
"reward_std": 0.1684006005525589, |
|
"rewards/code_reward": 0.1763717383146286, |
|
"rewards/format_reward": 0.8772321939468384, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 615.0558319091797, |
|
"epoch": 0.3375796178343949, |
|
"grad_norm": 0.24368631839752197, |
|
"kl": 0.15087890625, |
|
"learning_rate": 4.0901405178219535e-06, |
|
"loss": 0.0015, |
|
"reward": 0.345178809016943, |
|
"reward_std": 0.19809392467141151, |
|
"rewards/code_reward": 0.24941986054182053, |
|
"rewards/format_reward": 0.957589328289032, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 624.1830444335938, |
|
"epoch": 0.33970276008492567, |
|
"grad_norm": 0.23896408081054688, |
|
"kl": 0.154052734375, |
|
"learning_rate": 4.078405339263326e-06, |
|
"loss": 0.0015, |
|
"reward": 0.37723641097545624, |
|
"reward_std": 0.21996535174548626, |
|
"rewards/code_reward": 0.28080783039331436, |
|
"rewards/format_reward": 0.964285746216774, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 627.3236846923828, |
|
"epoch": 0.34182590233545646, |
|
"grad_norm": 0.2518380582332611, |
|
"kl": 0.171142578125, |
|
"learning_rate": 4.06661442347719e-06, |
|
"loss": 0.0017, |
|
"reward": 0.3184036388993263, |
|
"reward_std": 0.19395017623901367, |
|
"rewards/code_reward": 0.2217518538236618, |
|
"rewards/format_reward": 0.9665178805589676, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 583.1674499511719, |
|
"epoch": 0.34394904458598724, |
|
"grad_norm": 0.2569345235824585, |
|
"kl": 0.191162109375, |
|
"learning_rate": 4.054768265186758e-06, |
|
"loss": 0.0019, |
|
"reward": 0.31612952798604965, |
|
"reward_std": 0.20712972059845924, |
|
"rewards/code_reward": 0.21836165338754654, |
|
"rewards/format_reward": 0.9776785969734192, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 571.7857208251953, |
|
"epoch": 0.346072186836518, |
|
"grad_norm": 0.25089165568351746, |
|
"kl": 0.207763671875, |
|
"learning_rate": 4.0428673614331036e-06, |
|
"loss": 0.0021, |
|
"reward": 0.365755058825016, |
|
"reward_std": 0.19836053252220154, |
|
"rewards/code_reward": 0.2673175595700741, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 596.4553833007812, |
|
"epoch": 0.3481953290870488, |
|
"grad_norm": 0.23911510407924652, |
|
"kl": 0.22265625, |
|
"learning_rate": 4.030912211554316e-06, |
|
"loss": 0.0023, |
|
"reward": 0.38677794113755226, |
|
"reward_std": 0.18023086339235306, |
|
"rewards/code_reward": 0.28744759038090706, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 567.4486999511719, |
|
"epoch": 0.3503184713375796, |
|
"grad_norm": 0.24430882930755615, |
|
"kl": 0.2099609375, |
|
"learning_rate": 4.018903317164539e-06, |
|
"loss": 0.0021, |
|
"reward": 0.2250930406153202, |
|
"reward_std": 0.19390171952545643, |
|
"rewards/code_reward": 0.1277716178447008, |
|
"rewards/format_reward": 0.9732143431901932, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 573.8594055175781, |
|
"epoch": 0.3524416135881104, |
|
"grad_norm": 0.2257012128829956, |
|
"kl": 0.232666015625, |
|
"learning_rate": 4.006841182132932e-06, |
|
"loss": 0.0023, |
|
"reward": 0.3599228076636791, |
|
"reward_std": 0.20258177444338799, |
|
"rewards/code_reward": 0.2603691965341568, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 606.4911041259766, |
|
"epoch": 0.35456475583864117, |
|
"grad_norm": 0.238439679145813, |
|
"kl": 0.252197265625, |
|
"learning_rate": 3.9947263125625195e-06, |
|
"loss": 0.0025, |
|
"reward": 0.3261881247162819, |
|
"reward_std": 0.1736624352633953, |
|
"rewards/code_reward": 0.22775060683488846, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 590.1027069091797, |
|
"epoch": 0.35668789808917195, |
|
"grad_norm": 0.22503866255283356, |
|
"kl": 0.255615234375, |
|
"learning_rate": 3.982559216768967e-06, |
|
"loss": 0.0026, |
|
"reward": 0.2961311787366867, |
|
"reward_std": 0.1850012019276619, |
|
"rewards/code_reward": 0.1968008242547512, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 594.5491333007812, |
|
"epoch": 0.35881104033970274, |
|
"grad_norm": 0.22681432962417603, |
|
"kl": 0.32421875, |
|
"learning_rate": 3.970340405259245e-06, |
|
"loss": 0.0033, |
|
"reward": 0.4186030365526676, |
|
"reward_std": 0.18541271798312664, |
|
"rewards/code_reward": 0.31927267275750637, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 594.2790298461914, |
|
"epoch": 0.3609341825902335, |
|
"grad_norm": 0.2294747531414032, |
|
"kl": 0.32666015625, |
|
"learning_rate": 3.958070390710214e-06, |
|
"loss": 0.0033, |
|
"reward": 0.36109255626797676, |
|
"reward_std": 0.18586167134344578, |
|
"rewards/code_reward": 0.26243184227496386, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 592.4553680419922, |
|
"epoch": 0.3630573248407643, |
|
"grad_norm": 0.21662850677967072, |
|
"kl": 0.247802734375, |
|
"learning_rate": 3.945749687947109e-06, |
|
"loss": 0.0025, |
|
"reward": 0.24923527240753174, |
|
"reward_std": 0.13961385935544968, |
|
"rewards/code_reward": 0.15057454677298665, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 570.9286041259766, |
|
"epoch": 0.3651804670912951, |
|
"grad_norm": 0.23902097344398499, |
|
"kl": 0.22607421875, |
|
"learning_rate": 3.933378813921942e-06, |
|
"loss": 0.0023, |
|
"reward": 0.3740244060754776, |
|
"reward_std": 0.22742953523993492, |
|
"rewards/code_reward": 0.27536366507411003, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 598.2477874755859, |
|
"epoch": 0.3673036093418259, |
|
"grad_norm": 0.21673643589019775, |
|
"kl": 0.213134765625, |
|
"learning_rate": 3.920958287691811e-06, |
|
"loss": 0.0021, |
|
"reward": 0.2918965369462967, |
|
"reward_std": 0.19641954079270363, |
|
"rewards/code_reward": 0.19301261007785797, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 558.6718978881836, |
|
"epoch": 0.36942675159235666, |
|
"grad_norm": 0.2543295919895172, |
|
"kl": 0.1962890625, |
|
"learning_rate": 3.908488630397121e-06, |
|
"loss": 0.002, |
|
"reward": 0.41571951657533646, |
|
"reward_std": 0.24686651676893234, |
|
"rewards/code_reward": 0.31683557108044624, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 557.0201110839844, |
|
"epoch": 0.37154989384288745, |
|
"grad_norm": 0.23438729345798492, |
|
"kl": 0.206298828125, |
|
"learning_rate": 3.8959703652397175e-06, |
|
"loss": 0.0021, |
|
"reward": 0.38086430728435516, |
|
"reward_std": 0.22366305626928806, |
|
"rewards/code_reward": 0.28198035806417465, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 602.0536041259766, |
|
"epoch": 0.37367303609341823, |
|
"grad_norm": 0.24083319306373596, |
|
"kl": 0.18701171875, |
|
"learning_rate": 3.883404017460935e-06, |
|
"loss": 0.0019, |
|
"reward": 0.36414580047130585, |
|
"reward_std": 0.22220248356461525, |
|
"rewards/code_reward": 0.2657082974910736, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 591.0893096923828, |
|
"epoch": 0.37579617834394907, |
|
"grad_norm": 0.25874680280685425, |
|
"kl": 0.19873046875, |
|
"learning_rate": 3.870790114319559e-06, |
|
"loss": 0.002, |
|
"reward": 0.3555009290575981, |
|
"reward_std": 0.18250016495585442, |
|
"rewards/code_reward": 0.25684019550681114, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 587.404052734375, |
|
"epoch": 0.37791932059447986, |
|
"grad_norm": 0.22890929877758026, |
|
"kl": 0.176513671875, |
|
"learning_rate": 3.858129185069701e-06, |
|
"loss": 0.0018, |
|
"reward": 0.4567238390445709, |
|
"reward_std": 0.2463996484875679, |
|
"rewards/code_reward": 0.35806312412023544, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 602.4486846923828, |
|
"epoch": 0.38004246284501064, |
|
"grad_norm": 0.22736036777496338, |
|
"kl": 0.162353515625, |
|
"learning_rate": 3.845421760938597e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3570307157933712, |
|
"reward_std": 0.16325377486646175, |
|
"rewards/code_reward": 0.2583700120449066, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 610.8638763427734, |
|
"epoch": 0.3821656050955414, |
|
"grad_norm": 0.2262299656867981, |
|
"kl": 0.153076171875, |
|
"learning_rate": 3.832668375104312e-06, |
|
"loss": 0.0016, |
|
"reward": 0.349903404712677, |
|
"reward_std": 0.15386051312088966, |
|
"rewards/code_reward": 0.2503498010337353, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 639.8326110839844, |
|
"epoch": 0.3842887473460722, |
|
"grad_norm": 0.22941501438617706, |
|
"kl": 0.17724609375, |
|
"learning_rate": 3.8198695626733725e-06, |
|
"loss": 0.0018, |
|
"reward": 0.40823063999414444, |
|
"reward_std": 0.2221880704164505, |
|
"rewards/code_reward": 0.3093467131257057, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 638.1049346923828, |
|
"epoch": 0.386411889596603, |
|
"grad_norm": 0.23558823764324188, |
|
"kl": 0.15283203125, |
|
"learning_rate": 3.8070258606583156e-06, |
|
"loss": 0.0016, |
|
"reward": 0.36934422701597214, |
|
"reward_std": 0.21686138212680817, |
|
"rewards/code_reward": 0.27001385763287544, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 625.5759124755859, |
|
"epoch": 0.3885350318471338, |
|
"grad_norm": 0.31238648295402527, |
|
"kl": 0.166259765625, |
|
"learning_rate": 3.7941378079551544e-06, |
|
"loss": 0.0017, |
|
"reward": 0.3830692619085312, |
|
"reward_std": 0.24278680607676506, |
|
"rewards/code_reward": 0.2835156861692667, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 665.8192291259766, |
|
"epoch": 0.39065817409766457, |
|
"grad_norm": 0.3192687928676605, |
|
"kl": 0.1513671875, |
|
"learning_rate": 3.7812059453207677e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3427841551601887, |
|
"reward_std": 0.20133822225034237, |
|
"rewards/code_reward": 0.2441234067082405, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 655.4486846923828, |
|
"epoch": 0.39278131634819535, |
|
"grad_norm": 0.243864506483078, |
|
"kl": 0.141845703125, |
|
"learning_rate": 3.768230815350213e-06, |
|
"loss": 0.0014, |
|
"reward": 0.32591256499290466, |
|
"reward_std": 0.1841282658278942, |
|
"rewards/code_reward": 0.22680539265275002, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 680.6942291259766, |
|
"epoch": 0.39490445859872614, |
|
"grad_norm": 2.7162351608276367, |
|
"kl": 0.2255859375, |
|
"learning_rate": 3.7552129624539557e-06, |
|
"loss": 0.0023, |
|
"reward": 0.38928014785051346, |
|
"reward_std": 0.22797510400414467, |
|
"rewards/code_reward": 0.2917355000972748, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 670.8727874755859, |
|
"epoch": 0.3970276008492569, |
|
"grad_norm": 28.86046600341797, |
|
"kl": 3.12841796875, |
|
"learning_rate": 3.7421529328350316e-06, |
|
"loss": 0.0313, |
|
"reward": 0.33664827793836594, |
|
"reward_std": 0.2122020348906517, |
|
"rewards/code_reward": 0.2404429018497467, |
|
"rewards/format_reward": 0.9620536118745804, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 683.8326110839844, |
|
"epoch": 0.3991507430997877, |
|
"grad_norm": 0.4426620602607727, |
|
"kl": 0.14501953125, |
|
"learning_rate": 3.7290512744661274e-06, |
|
"loss": 0.0015, |
|
"reward": 0.38399138301610947, |
|
"reward_std": 0.1990874893963337, |
|
"rewards/code_reward": 0.2860002890229225, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 647.325927734375, |
|
"epoch": 0.4012738853503185, |
|
"grad_norm": 0.2341061532497406, |
|
"kl": 0.1455078125, |
|
"learning_rate": 3.715908537066589e-06, |
|
"loss": 0.0015, |
|
"reward": 0.42976176738739014, |
|
"reward_std": 0.21941150352358818, |
|
"rewards/code_reward": 0.33199387788772583, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 698.1763610839844, |
|
"epoch": 0.4033970276008493, |
|
"grad_norm": 1.953539252281189, |
|
"kl": 0.5579833984375, |
|
"learning_rate": 3.7027252720793538e-06, |
|
"loss": 0.0056, |
|
"reward": 0.33469754457473755, |
|
"reward_std": 0.19711985811591148, |
|
"rewards/code_reward": 0.23692966997623444, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 710.0982360839844, |
|
"epoch": 0.40552016985138006, |
|
"grad_norm": 0.24030916392803192, |
|
"kl": 0.161865234375, |
|
"learning_rate": 3.689502032647817e-06, |
|
"loss": 0.0016, |
|
"reward": 0.35261962562799454, |
|
"reward_std": 0.2262839339673519, |
|
"rewards/code_reward": 0.25552139058709145, |
|
"rewards/format_reward": 0.970982164144516, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 672.5401916503906, |
|
"epoch": 0.40764331210191085, |
|
"grad_norm": 0.9592034816741943, |
|
"kl": 0.154541015625, |
|
"learning_rate": 3.6762393735926245e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3493685219436884, |
|
"reward_std": 0.1753272709902376, |
|
"rewards/code_reward": 0.2524934969842434, |
|
"rewards/format_reward": 0.9687500298023224, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 710.7299499511719, |
|
"epoch": 0.40976645435244163, |
|
"grad_norm": 0.3044726550579071, |
|
"kl": 0.15185546875, |
|
"learning_rate": 3.6629378513883852e-06, |
|
"loss": 0.0015, |
|
"reward": 0.4329136684536934, |
|
"reward_std": 0.257048511877656, |
|
"rewards/code_reward": 0.3346993774175644, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 718.3683166503906, |
|
"epoch": 0.4118895966029724, |
|
"grad_norm": 0.2441069632768631, |
|
"kl": 0.1630859375, |
|
"learning_rate": 3.6495980241403307e-06, |
|
"loss": 0.0016, |
|
"reward": 0.32557281479239464, |
|
"reward_std": 0.19367647171020508, |
|
"rewards/code_reward": 0.2271352931857109, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 701.9486846923828, |
|
"epoch": 0.4140127388535032, |
|
"grad_norm": 0.22456014156341553, |
|
"kl": 0.16064453125, |
|
"learning_rate": 3.636220451560896e-06, |
|
"loss": 0.0016, |
|
"reward": 0.42680248618125916, |
|
"reward_std": 0.2046816684305668, |
|
"rewards/code_reward": 0.32903461158275604, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 713.747802734375, |
|
"epoch": 0.416135881104034, |
|
"grad_norm": 0.45598000288009644, |
|
"kl": 0.149169921875, |
|
"learning_rate": 3.622805694946235e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3776397071778774, |
|
"reward_std": 0.18774981424212456, |
|
"rewards/code_reward": 0.2803182378411293, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 717.1406555175781, |
|
"epoch": 0.4182590233545648, |
|
"grad_norm": 0.21405339241027832, |
|
"kl": 0.1429443359375, |
|
"learning_rate": 3.609354317152667e-06, |
|
"loss": 0.0015, |
|
"reward": 0.38271288573741913, |
|
"reward_std": 0.19382936879992485, |
|
"rewards/code_reward": 0.28539142571389675, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 678.2477874755859, |
|
"epoch": 0.42038216560509556, |
|
"grad_norm": 0.49006760120391846, |
|
"kl": 0.2021484375, |
|
"learning_rate": 3.595866882573063e-06, |
|
"loss": 0.0021, |
|
"reward": 0.4323223605751991, |
|
"reward_std": 0.2277931533753872, |
|
"rewards/code_reward": 0.3345545120537281, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 728.9620971679688, |
|
"epoch": 0.42250530785562634, |
|
"grad_norm": 0.39922112226486206, |
|
"kl": 0.184814453125, |
|
"learning_rate": 3.5823439571131675e-06, |
|
"loss": 0.0019, |
|
"reward": 0.40869200229644775, |
|
"reward_std": 0.2020891159772873, |
|
"rewards/code_reward": 0.31159375607967377, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 684.9576416015625, |
|
"epoch": 0.42462845010615713, |
|
"grad_norm": 0.23013651371002197, |
|
"kl": 0.149658203125, |
|
"learning_rate": 3.5687861081678477e-06, |
|
"loss": 0.0015, |
|
"reward": 0.4545319005846977, |
|
"reward_std": 0.24276942387223244, |
|
"rewards/code_reward": 0.3572104535996914, |
|
"rewards/format_reward": 0.9732143431901932, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 704.5223541259766, |
|
"epoch": 0.4267515923566879, |
|
"grad_norm": 0.46601447463035583, |
|
"kl": 0.145263671875, |
|
"learning_rate": 3.555193904597291e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3521813452243805, |
|
"reward_std": 0.1790554393082857, |
|
"rewards/code_reward": 0.2555295582860708, |
|
"rewards/format_reward": 0.96651791036129, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 676.4754791259766, |
|
"epoch": 0.4288747346072187, |
|
"grad_norm": 0.24227948486804962, |
|
"kl": 0.145751953125, |
|
"learning_rate": 3.541567916703138e-06, |
|
"loss": 0.0015, |
|
"reward": 0.4256810247898102, |
|
"reward_std": 0.2298164926469326, |
|
"rewards/code_reward": 0.327689953148365, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 697.6027069091797, |
|
"epoch": 0.4309978768577495, |
|
"grad_norm": 0.32301369309425354, |
|
"kl": 0.141845703125, |
|
"learning_rate": 3.5279087162045517e-06, |
|
"loss": 0.0014, |
|
"reward": 0.27234210819005966, |
|
"reward_std": 0.17865055054426193, |
|
"rewards/code_reward": 0.17479745857417583, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 696.3727874755859, |
|
"epoch": 0.43312101910828027, |
|
"grad_norm": 0.6582425236701965, |
|
"kl": 0.14111328125, |
|
"learning_rate": 3.5142168762142265e-06, |
|
"loss": 0.0014, |
|
"reward": 0.3229696787893772, |
|
"reward_std": 0.1942148432135582, |
|
"rewards/code_reward": 0.22542503476142883, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 718.2232513427734, |
|
"epoch": 0.43524416135881105, |
|
"grad_norm": 0.30619704723358154, |
|
"kl": 0.149169921875, |
|
"learning_rate": 3.500492971214347e-06, |
|
"loss": 0.0015, |
|
"reward": 0.4395933449268341, |
|
"reward_std": 0.265441432595253, |
|
"rewards/code_reward": 0.3402629792690277, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 699.310302734375, |
|
"epoch": 0.43736730360934184, |
|
"grad_norm": 0.3680998980998993, |
|
"kl": 0.151611328125, |
|
"learning_rate": 3.48673757703248e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3385552614927292, |
|
"reward_std": 0.24193225800991058, |
|
"rewards/code_reward": 0.24101059883832932, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 712.4799499511719, |
|
"epoch": 0.4394904458598726, |
|
"grad_norm": 0.22541551291942596, |
|
"kl": 0.315673828125, |
|
"learning_rate": 3.472951270817418e-06, |
|
"loss": 0.0032, |
|
"reward": 0.317364189773798, |
|
"reward_std": 0.2289394848048687, |
|
"rewards/code_reward": 0.2191498950123787, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 725.1719207763672, |
|
"epoch": 0.4416135881104034, |
|
"grad_norm": 0.7986815571784973, |
|
"kl": 0.8701171875, |
|
"learning_rate": 3.4591346310149578e-06, |
|
"loss": 0.0087, |
|
"reward": 0.30210861191153526, |
|
"reward_std": 0.1758405715227127, |
|
"rewards/code_reward": 0.20545680448412895, |
|
"rewards/format_reward": 0.9665178954601288, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 708.9174346923828, |
|
"epoch": 0.4437367303609342, |
|
"grad_norm": 0.6832094788551331, |
|
"kl": 0.571533203125, |
|
"learning_rate": 3.445288237343632e-06, |
|
"loss": 0.0057, |
|
"reward": 0.34425482153892517, |
|
"reward_std": 0.17729798145592213, |
|
"rewards/code_reward": 0.24559411033988, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 664.4152069091797, |
|
"epoch": 0.445859872611465, |
|
"grad_norm": 0.5344778299331665, |
|
"kl": 0.344970703125, |
|
"learning_rate": 3.4314126707703895e-06, |
|
"loss": 0.0035, |
|
"reward": 0.3406968005001545, |
|
"reward_std": 0.21405612863600254, |
|
"rewards/code_reward": 0.2424825206398964, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 687.4955596923828, |
|
"epoch": 0.44798301486199577, |
|
"grad_norm": 0.2852449119091034, |
|
"kl": 0.314208984375, |
|
"learning_rate": 3.4175085134862128e-06, |
|
"loss": 0.0031, |
|
"reward": 0.37548423558473587, |
|
"reward_std": 0.19767768681049347, |
|
"rewards/code_reward": 0.2783860079944134, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 694.372802734375, |
|
"epoch": 0.45010615711252655, |
|
"grad_norm": 0.8310821056365967, |
|
"kl": 0.214111328125, |
|
"learning_rate": 3.4035763488816953e-06, |
|
"loss": 0.0021, |
|
"reward": 0.5172732323408127, |
|
"reward_std": 0.24472371861338615, |
|
"rewards/code_reward": 0.41883569955825806, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 678.0424499511719, |
|
"epoch": 0.45222929936305734, |
|
"grad_norm": 0.28591927886009216, |
|
"kl": 0.14306640625, |
|
"learning_rate": 3.3896167615225594e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3543313890695572, |
|
"reward_std": 0.21659231930971146, |
|
"rewards/code_reward": 0.2567867375910282, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 692.7522583007812, |
|
"epoch": 0.4543524416135881, |
|
"grad_norm": 0.5759381055831909, |
|
"kl": 0.1455078125, |
|
"learning_rate": 3.375630337125133e-06, |
|
"loss": 0.0015, |
|
"reward": 0.39294832199811935, |
|
"reward_std": 0.26400984078645706, |
|
"rewards/code_reward": 0.2960733026266098, |
|
"rewards/format_reward": 0.9687500298023224, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 731.1272735595703, |
|
"epoch": 0.4564755838641189, |
|
"grad_norm": 0.23394830524921417, |
|
"kl": 0.143798828125, |
|
"learning_rate": 3.361617662531772e-06, |
|
"loss": 0.0014, |
|
"reward": 0.3601933494210243, |
|
"reward_std": 0.25189225003123283, |
|
"rewards/code_reward": 0.2619790583848953, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 692.1428833007812, |
|
"epoch": 0.4585987261146497, |
|
"grad_norm": 0.24470415711402893, |
|
"kl": 0.1317138671875, |
|
"learning_rate": 3.347579325686237e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3433048315346241, |
|
"reward_std": 0.22325459122657776, |
|
"rewards/code_reward": 0.245536956936121, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 684.3705749511719, |
|
"epoch": 0.4607218683651805, |
|
"grad_norm": 0.364793062210083, |
|
"kl": 0.122802734375, |
|
"learning_rate": 3.333515915609027e-06, |
|
"loss": 0.0012, |
|
"reward": 0.4696499854326248, |
|
"reward_std": 0.2734139449894428, |
|
"rewards/code_reward": 0.3707660511136055, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 723.4129791259766, |
|
"epoch": 0.46284501061571126, |
|
"grad_norm": 0.36840999126434326, |
|
"kl": 0.128173828125, |
|
"learning_rate": 3.3194280223726616e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3476767987012863, |
|
"reward_std": 0.19485369697213173, |
|
"rewards/code_reward": 0.24968570843338966, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 660.7812805175781, |
|
"epoch": 0.46496815286624205, |
|
"grad_norm": 0.2917425036430359, |
|
"kl": 0.142578125, |
|
"learning_rate": 3.305316237076927e-06, |
|
"loss": 0.0014, |
|
"reward": 0.39485304057598114, |
|
"reward_std": 0.23686816543340683, |
|
"rewards/code_reward": 0.29708515852689743, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 674.2410888671875, |
|
"epoch": 0.46709129511677283, |
|
"grad_norm": 0.27625608444213867, |
|
"kl": 0.13134765625, |
|
"learning_rate": 3.291181151824071e-06, |
|
"loss": 0.0014, |
|
"reward": 0.5001323744654655, |
|
"reward_std": 0.2807440906763077, |
|
"rewards/code_reward": 0.40124842897057533, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 717.279052734375, |
|
"epoch": 0.4692144373673036, |
|
"grad_norm": 0.26342645287513733, |
|
"kl": 0.132568359375, |
|
"learning_rate": 3.27702335969396e-06, |
|
"loss": 0.0014, |
|
"reward": 0.438594788312912, |
|
"reward_std": 0.2874513529241085, |
|
"rewards/code_reward": 0.34060370177030563, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 740.8192291259766, |
|
"epoch": 0.4713375796178344, |
|
"grad_norm": 0.3312680423259735, |
|
"kl": 0.144287109375, |
|
"learning_rate": 3.2628434547191985e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4112970530986786, |
|
"reward_std": 0.2245728299021721, |
|
"rewards/code_reward": 0.3137524016201496, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 709.9174499511719, |
|
"epoch": 0.4734607218683652, |
|
"grad_norm": 1.5567724704742432, |
|
"kl": 0.1339111328125, |
|
"learning_rate": 3.2486420318601973e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4251294732093811, |
|
"reward_std": 0.18372783437371254, |
|
"rewards/code_reward": 0.3291473314166069, |
|
"rewards/format_reward": 0.9598214775323868, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 719.7656707763672, |
|
"epoch": 0.47558386411889597, |
|
"grad_norm": 0.2122294157743454, |
|
"kl": 0.1273193359375, |
|
"learning_rate": 3.2344196869802187e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3450777679681778, |
|
"reward_std": 0.24194234982132912, |
|
"rewards/code_reward": 0.24730990827083588, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 704.0045013427734, |
|
"epoch": 0.47770700636942676, |
|
"grad_norm": 0.9711757898330688, |
|
"kl": 0.20751953125, |
|
"learning_rate": 3.2201770168203694e-06, |
|
"loss": 0.0021, |
|
"reward": 0.4306853115558624, |
|
"reward_std": 0.2568584829568863, |
|
"rewards/code_reward": 0.33492637425661087, |
|
"rewards/format_reward": 0.9575893133878708, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 727.6027069091797, |
|
"epoch": 0.47983014861995754, |
|
"grad_norm": 0.268039733171463, |
|
"kl": 0.13818359375, |
|
"learning_rate": 3.205914618974563e-06, |
|
"loss": 0.0014, |
|
"reward": 0.43213512748479843, |
|
"reward_std": 0.2562938630580902, |
|
"rewards/code_reward": 0.334367249161005, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 732.9643249511719, |
|
"epoch": 0.4819532908704883, |
|
"grad_norm": 0.46155139803886414, |
|
"kl": 0.198486328125, |
|
"learning_rate": 3.1916330918644496e-06, |
|
"loss": 0.002, |
|
"reward": 0.31592320650815964, |
|
"reward_std": 0.19539642706513405, |
|
"rewards/code_reward": 0.2174856998026371, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 770.1295013427734, |
|
"epoch": 0.4840764331210191, |
|
"grad_norm": 0.7360585331916809, |
|
"kl": 0.3978271484375, |
|
"learning_rate": 3.177333034714303e-06, |
|
"loss": 0.004, |
|
"reward": 0.35912561416625977, |
|
"reward_std": 0.21681112423539162, |
|
"rewards/code_reward": 0.26135774329304695, |
|
"rewards/format_reward": 0.9776785969734192, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 706.9777069091797, |
|
"epoch": 0.4861995753715499, |
|
"grad_norm": 1.2824815511703491, |
|
"kl": 0.615478515625, |
|
"learning_rate": 3.1630150475258813e-06, |
|
"loss": 0.0062, |
|
"reward": 0.3668329790234566, |
|
"reward_std": 0.2176014445722103, |
|
"rewards/code_reward": 0.2699579633772373, |
|
"rewards/format_reward": 0.9687500596046448, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 709.825927734375, |
|
"epoch": 0.4883227176220807, |
|
"grad_norm": 0.4730873107910156, |
|
"kl": 0.4136962890625, |
|
"learning_rate": 3.148679731053252e-06, |
|
"loss": 0.0041, |
|
"reward": 0.4401291459798813, |
|
"reward_std": 0.2792894318699837, |
|
"rewards/code_reward": 0.34213805943727493, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 716.0312805175781, |
|
"epoch": 0.49044585987261147, |
|
"grad_norm": 0.226039856672287, |
|
"kl": 0.1241455078125, |
|
"learning_rate": 3.1343276867775805e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3396586962044239, |
|
"reward_std": 0.19299479201436043, |
|
"rewards/code_reward": 0.24211404286324978, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 699.6830749511719, |
|
"epoch": 0.49256900212314225, |
|
"grad_norm": 0.31895455718040466, |
|
"kl": 0.50146484375, |
|
"learning_rate": 3.1199595168819043e-06, |
|
"loss": 0.0051, |
|
"reward": 0.34284605644643307, |
|
"reward_std": 0.14287223480641842, |
|
"rewards/code_reward": 0.24463177705183625, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 781.5178985595703, |
|
"epoch": 0.49469214437367304, |
|
"grad_norm": 0.4143598973751068, |
|
"kl": 0.249755859375, |
|
"learning_rate": 3.105575824225852e-06, |
|
"loss": 0.0025, |
|
"reward": 0.38098950684070587, |
|
"reward_std": 0.21905666589736938, |
|
"rewards/code_reward": 0.28590018674731255, |
|
"rewards/format_reward": 0.9508928954601288, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 725.3705749511719, |
|
"epoch": 0.4968152866242038, |
|
"grad_norm": 0.9609025120735168, |
|
"kl": 0.401123046875, |
|
"learning_rate": 3.091177212320363e-06, |
|
"loss": 0.004, |
|
"reward": 0.4063151776790619, |
|
"reward_std": 0.25925979763269424, |
|
"rewards/code_reward": 0.3076544553041458, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 730.1406555175781, |
|
"epoch": 0.4989384288747346, |
|
"grad_norm": 0.2471870481967926, |
|
"kl": 0.233154296875, |
|
"learning_rate": 3.0767642853023538e-06, |
|
"loss": 0.0024, |
|
"reward": 0.3827313929796219, |
|
"reward_std": 0.21219320595264435, |
|
"rewards/code_reward": 0.2858563922345638, |
|
"rewards/format_reward": 0.9687500298023224, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 696.4219207763672, |
|
"epoch": 0.5010615711252654, |
|
"grad_norm": 0.6714680194854736, |
|
"kl": 0.1856689453125, |
|
"learning_rate": 3.062337647909376e-06, |
|
"loss": 0.0019, |
|
"reward": 0.4210161566734314, |
|
"reward_std": 0.18587047047913074, |
|
"rewards/code_reward": 0.3232482895255089, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 744.200927734375, |
|
"epoch": 0.5031847133757962, |
|
"grad_norm": 0.5082603096961975, |
|
"kl": 0.2071533203125, |
|
"learning_rate": 3.04789790545424e-06, |
|
"loss": 0.0021, |
|
"reward": 0.4485570266842842, |
|
"reward_std": 0.1916775107383728, |
|
"rewards/code_reward": 0.3519052043557167, |
|
"rewards/format_reward": 0.9665178954601288, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 758.0178985595703, |
|
"epoch": 0.505307855626327, |
|
"grad_norm": 0.69068843126297, |
|
"kl": 0.19482421875, |
|
"learning_rate": 3.033445663799621e-06, |
|
"loss": 0.002, |
|
"reward": 0.3711010664701462, |
|
"reward_std": 0.1955837495625019, |
|
"rewards/code_reward": 0.2742260619997978, |
|
"rewards/format_reward": 0.9687500596046448, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 717.8147735595703, |
|
"epoch": 0.5074309978768577, |
|
"grad_norm": 0.40367022156715393, |
|
"kl": 0.161865234375, |
|
"learning_rate": 3.018981529332633e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5175677761435509, |
|
"reward_std": 0.2608077637851238, |
|
"rewards/code_reward": 0.4193534851074219, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 729.2545013427734, |
|
"epoch": 0.5095541401273885, |
|
"grad_norm": 0.5104432702064514, |
|
"kl": 0.19384765625, |
|
"learning_rate": 3.00450610893939e-06, |
|
"loss": 0.002, |
|
"reward": 0.40982675552368164, |
|
"reward_std": 0.20613017305731773, |
|
"rewards/code_reward": 0.31161245331168175, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 705.7120819091797, |
|
"epoch": 0.5116772823779193, |
|
"grad_norm": 0.2226688116788864, |
|
"kl": 0.167724609375, |
|
"learning_rate": 2.9900200099795396e-06, |
|
"loss": 0.0017, |
|
"reward": 0.40758588910102844, |
|
"reward_std": 0.22469941899180412, |
|
"rewards/code_reward": 0.31048765778541565, |
|
"rewards/format_reward": 0.970982164144516, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 716.6004791259766, |
|
"epoch": 0.5138004246284501, |
|
"grad_norm": 0.823330283164978, |
|
"kl": 0.224853515625, |
|
"learning_rate": 2.9755238402607826e-06, |
|
"loss": 0.0023, |
|
"reward": 0.381127692759037, |
|
"reward_std": 0.1726750060915947, |
|
"rewards/code_reward": 0.2817973233759403, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 714.8460235595703, |
|
"epoch": 0.5159235668789809, |
|
"grad_norm": 0.5035973191261292, |
|
"kl": 0.198486328125, |
|
"learning_rate": 2.961018208013367e-06, |
|
"loss": 0.002, |
|
"reward": 0.3932320065796375, |
|
"reward_std": 0.14925590343773365, |
|
"rewards/code_reward": 0.295910551212728, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 714.6205749511719, |
|
"epoch": 0.5180467091295117, |
|
"grad_norm": 0.6857829689979553, |
|
"kl": 0.16259765625, |
|
"learning_rate": 2.9465037218645694e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3965849094092846, |
|
"reward_std": 0.20821771398186684, |
|
"rewards/code_reward": 0.3001563027501106, |
|
"rewards/format_reward": 0.9642857611179352, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 710.7254791259766, |
|
"epoch": 0.5201698513800425, |
|
"grad_norm": 1.60740327835083, |
|
"kl": 0.131591796875, |
|
"learning_rate": 2.9319809908131604e-06, |
|
"loss": 0.0013, |
|
"reward": 0.43405191600322723, |
|
"reward_std": 0.25733353197574615, |
|
"rewards/code_reward": 0.33539119362831116, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 690.6027221679688, |
|
"epoch": 0.5222929936305732, |
|
"grad_norm": 0.2978648841381073, |
|
"kl": 0.1689453125, |
|
"learning_rate": 2.917450624203847e-06, |
|
"loss": 0.0017, |
|
"reward": 0.45192842930555344, |
|
"reward_std": 0.24438033252954483, |
|
"rewards/code_reward": 0.3539373278617859, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 737.6094207763672, |
|
"epoch": 0.524416135881104, |
|
"grad_norm": 0.3084428310394287, |
|
"kl": 0.1378173828125, |
|
"learning_rate": 2.9029132317017118e-06, |
|
"loss": 0.0014, |
|
"reward": 0.46284686774015427, |
|
"reward_std": 0.2403612770140171, |
|
"rewards/code_reward": 0.36619507521390915, |
|
"rewards/format_reward": 0.96651791036129, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 698.7411041259766, |
|
"epoch": 0.5265392781316348, |
|
"grad_norm": 1.3392090797424316, |
|
"kl": 0.151123046875, |
|
"learning_rate": 2.888369423266629e-06, |
|
"loss": 0.0015, |
|
"reward": 0.4595029503107071, |
|
"reward_std": 0.19635827839374542, |
|
"rewards/code_reward": 0.36218152195215225, |
|
"rewards/format_reward": 0.9732143431901932, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 719.0134429931641, |
|
"epoch": 0.5286624203821656, |
|
"grad_norm": 0.21979840099811554, |
|
"kl": 0.14111328125, |
|
"learning_rate": 2.8738198091276712e-06, |
|
"loss": 0.0014, |
|
"reward": 0.36629121005535126, |
|
"reward_std": 0.21069011464715004, |
|
"rewards/code_reward": 0.2694162093102932, |
|
"rewards/format_reward": 0.9687500298023224, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 745.263427734375, |
|
"epoch": 0.5307855626326964, |
|
"grad_norm": 0.8504329323768616, |
|
"kl": 0.15234375, |
|
"learning_rate": 2.859264999757509e-06, |
|
"loss": 0.0016, |
|
"reward": 0.37740468978881836, |
|
"reward_std": 0.20382403209805489, |
|
"rewards/code_reward": 0.2811993137001991, |
|
"rewards/format_reward": 0.9620536267757416, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 723.1562957763672, |
|
"epoch": 0.5329087048832272, |
|
"grad_norm": 0.27034398913383484, |
|
"kl": 0.1591796875, |
|
"learning_rate": 2.8447056058467928e-06, |
|
"loss": 0.0016, |
|
"reward": 0.48566606640815735, |
|
"reward_std": 0.21075040474534035, |
|
"rewards/code_reward": 0.38789819926023483, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 734.6986846923828, |
|
"epoch": 0.535031847133758, |
|
"grad_norm": 0.4998323917388916, |
|
"kl": 0.145751953125, |
|
"learning_rate": 2.830142238278531e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3668500781059265, |
|
"reward_std": 0.1973743811249733, |
|
"rewards/code_reward": 0.2690822184085846, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 722.1719207763672, |
|
"epoch": 0.5371549893842887, |
|
"grad_norm": 0.7386496663093567, |
|
"kl": 0.16943359375, |
|
"learning_rate": 2.81557550810246e-06, |
|
"loss": 0.0017, |
|
"reward": 0.5175603851675987, |
|
"reward_std": 0.23075248673558235, |
|
"rewards/code_reward": 0.41889964044094086, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 731.4286041259766, |
|
"epoch": 0.5392781316348195, |
|
"grad_norm": 2.323516368865967, |
|
"kl": 0.185791015625, |
|
"learning_rate": 2.8010060265094026e-06, |
|
"loss": 0.0019, |
|
"reward": 0.4158123657107353, |
|
"reward_std": 0.2362896017730236, |
|
"rewards/code_reward": 0.3180444836616516, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 714.1004791259766, |
|
"epoch": 0.5414012738853503, |
|
"grad_norm": 0.22996105253696442, |
|
"kl": 0.193115234375, |
|
"learning_rate": 2.786434404805629e-06, |
|
"loss": 0.002, |
|
"reward": 0.43036870658397675, |
|
"reward_std": 0.17873099818825722, |
|
"rewards/code_reward": 0.3323776051402092, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 755.794677734375, |
|
"epoch": 0.5435244161358811, |
|
"grad_norm": 0.5349477529525757, |
|
"kl": 0.21728515625, |
|
"learning_rate": 2.771861254387199e-06, |
|
"loss": 0.0022, |
|
"reward": 0.3905658796429634, |
|
"reward_std": 0.24914883077144623, |
|
"rewards/code_reward": 0.2939140759408474, |
|
"rewards/format_reward": 0.96651791036129, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 736.7969207763672, |
|
"epoch": 0.5456475583864119, |
|
"grad_norm": 0.5384594202041626, |
|
"kl": 0.44921875, |
|
"learning_rate": 2.7572871867143204e-06, |
|
"loss": 0.0045, |
|
"reward": 0.4809773936867714, |
|
"reward_std": 0.24490142613649368, |
|
"rewards/code_reward": 0.3832095377147198, |
|
"rewards/format_reward": 0.9776785969734192, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 779.904052734375, |
|
"epoch": 0.5477707006369427, |
|
"grad_norm": 0.3539630174636841, |
|
"kl": 0.46142578125, |
|
"learning_rate": 2.742712813285681e-06, |
|
"loss": 0.0046, |
|
"reward": 0.4002307578921318, |
|
"reward_std": 0.26231593638658524, |
|
"rewards/code_reward": 0.30447180569171906, |
|
"rewards/format_reward": 0.9575893431901932, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 688.8013610839844, |
|
"epoch": 0.5498938428874734, |
|
"grad_norm": 0.3760126531124115, |
|
"kl": 0.270263671875, |
|
"learning_rate": 2.7281387456128017e-06, |
|
"loss": 0.0027, |
|
"reward": 0.5040838867425919, |
|
"reward_std": 0.23536711558699608, |
|
"rewards/code_reward": 0.40519992262125015, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 763.7924499511719, |
|
"epoch": 0.5520169851380042, |
|
"grad_norm": 0.28506824374198914, |
|
"kl": 0.37060546875, |
|
"learning_rate": 2.7135655951943716e-06, |
|
"loss": 0.0037, |
|
"reward": 0.4464469403028488, |
|
"reward_std": 0.23727866262197495, |
|
"rewards/code_reward": 0.3491254858672619, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 735.7143096923828, |
|
"epoch": 0.554140127388535, |
|
"grad_norm": 0.5788131952285767, |
|
"kl": 0.35546875, |
|
"learning_rate": 2.698993973490598e-06, |
|
"loss": 0.0036, |
|
"reward": 0.5397853627800941, |
|
"reward_std": 0.2762787565588951, |
|
"rewards/code_reward": 0.4424639120697975, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 770.4129791259766, |
|
"epoch": 0.5562632696390658, |
|
"grad_norm": 0.5763887166976929, |
|
"kl": 0.4140625, |
|
"learning_rate": 2.6844244918975416e-06, |
|
"loss": 0.0041, |
|
"reward": 0.4332207143306732, |
|
"reward_std": 0.21580959856510162, |
|
"rewards/code_reward": 0.3361224830150604, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 756.357177734375, |
|
"epoch": 0.5583864118895966, |
|
"grad_norm": 0.23940207064151764, |
|
"kl": 0.3953857421875, |
|
"learning_rate": 2.66985776172147e-06, |
|
"loss": 0.004, |
|
"reward": 0.4067609831690788, |
|
"reward_std": 0.15922481939196587, |
|
"rewards/code_reward": 0.3078770413994789, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 797.3772735595703, |
|
"epoch": 0.5605095541401274, |
|
"grad_norm": 1.1956448554992676, |
|
"kl": 0.394775390625, |
|
"learning_rate": 2.6552943941532088e-06, |
|
"loss": 0.004, |
|
"reward": 0.35336220264434814, |
|
"reward_std": 0.21252319402992725, |
|
"rewards/code_reward": 0.25447824597358704, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 816.9330749511719, |
|
"epoch": 0.5626326963906582, |
|
"grad_norm": 0.3272117078304291, |
|
"kl": 0.33447265625, |
|
"learning_rate": 2.6407350002424927e-06, |
|
"loss": 0.0034, |
|
"reward": 0.3648254945874214, |
|
"reward_std": 0.19711337611079216, |
|
"rewards/code_reward": 0.2675040401518345, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 790.0937805175781, |
|
"epoch": 0.564755838641189, |
|
"grad_norm": 0.3350137174129486, |
|
"kl": 0.217529296875, |
|
"learning_rate": 2.626180190872329e-06, |
|
"loss": 0.0022, |
|
"reward": 0.4639175459742546, |
|
"reward_std": 0.19284814596176147, |
|
"rewards/code_reward": 0.36592647433280945, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 785.247802734375, |
|
"epoch": 0.5668789808917197, |
|
"grad_norm": 0.2253342717885971, |
|
"kl": 0.1259765625, |
|
"learning_rate": 2.611630576733372e-06, |
|
"loss": 0.0013, |
|
"reward": 0.42001737654209137, |
|
"reward_std": 0.24298213049769402, |
|
"rewards/code_reward": 0.32180308550596237, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 874.1495971679688, |
|
"epoch": 0.5690021231422505, |
|
"grad_norm": 1.105658769607544, |
|
"kl": 0.2879638671875, |
|
"learning_rate": 2.5970867682982885e-06, |
|
"loss": 0.0029, |
|
"reward": 0.4009394347667694, |
|
"reward_std": 0.2002662494778633, |
|
"rewards/code_reward": 0.3031715527176857, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 821.7812805175781, |
|
"epoch": 0.5711252653927813, |
|
"grad_norm": 0.39032891392707825, |
|
"kl": 0.2081298828125, |
|
"learning_rate": 2.582549375796154e-06, |
|
"loss": 0.0021, |
|
"reward": 0.4019026607275009, |
|
"reward_std": 0.21826408058404922, |
|
"rewards/code_reward": 0.3036883734166622, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 804.6183471679688, |
|
"epoch": 0.5732484076433121, |
|
"grad_norm": 0.25958776473999023, |
|
"kl": 0.179931640625, |
|
"learning_rate": 2.568019009186841e-06, |
|
"loss": 0.0019, |
|
"reward": 0.4916309267282486, |
|
"reward_std": 0.19469109177589417, |
|
"rewards/code_reward": 0.3934166729450226, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 825.3370971679688, |
|
"epoch": 0.5753715498938429, |
|
"grad_norm": 0.22188299894332886, |
|
"kl": 0.1358642578125, |
|
"learning_rate": 2.5534962781354317e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4202270358800888, |
|
"reward_std": 0.24190283194184303, |
|
"rewards/code_reward": 0.32223593071103096, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 780.4866485595703, |
|
"epoch": 0.5774946921443737, |
|
"grad_norm": 0.2580115497112274, |
|
"kl": 0.1597900390625, |
|
"learning_rate": 2.538981791986634e-06, |
|
"loss": 0.0016, |
|
"reward": 0.38698963820934296, |
|
"reward_std": 0.22697532176971436, |
|
"rewards/code_reward": 0.28877533972263336, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 823.0402069091797, |
|
"epoch": 0.5796178343949044, |
|
"grad_norm": 0.2385285347700119, |
|
"kl": 0.141357421875, |
|
"learning_rate": 2.524476159739218e-06, |
|
"loss": 0.0015, |
|
"reward": 0.43764442950487137, |
|
"reward_std": 0.22844265773892403, |
|
"rewards/code_reward": 0.34032295644283295, |
|
"rewards/format_reward": 0.9732143431901932, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 782.2277221679688, |
|
"epoch": 0.5817409766454352, |
|
"grad_norm": 0.7892447710037231, |
|
"kl": 0.1402587890625, |
|
"learning_rate": 2.5099799900204607e-06, |
|
"loss": 0.0014, |
|
"reward": 0.47495051473379135, |
|
"reward_std": 0.24570094048976898, |
|
"rewards/code_reward": 0.37606657296419144, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 790.029052734375, |
|
"epoch": 0.583864118895966, |
|
"grad_norm": 1.390781044960022, |
|
"kl": 0.1494140625, |
|
"learning_rate": 2.4954938910606108e-06, |
|
"loss": 0.0015, |
|
"reward": 0.41709961369633675, |
|
"reward_std": 0.22452056966722012, |
|
"rewards/code_reward": 0.31910853274166584, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 766.544677734375, |
|
"epoch": 0.5859872611464968, |
|
"grad_norm": 0.3231986463069916, |
|
"kl": 0.125732421875, |
|
"learning_rate": 2.481018470667368e-06, |
|
"loss": 0.0013, |
|
"reward": 0.5159066766500473, |
|
"reward_std": 0.2495804950594902, |
|
"rewards/code_reward": 0.4188084527850151, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 816.5491485595703, |
|
"epoch": 0.5881104033970276, |
|
"grad_norm": 0.3522323966026306, |
|
"kl": 0.1474609375, |
|
"learning_rate": 2.4665543362003802e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5210660025477409, |
|
"reward_std": 0.19517110101878643, |
|
"rewards/code_reward": 0.42352132126688957, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 815.3638610839844, |
|
"epoch": 0.5902335456475584, |
|
"grad_norm": 0.36454498767852783, |
|
"kl": 0.156005859375, |
|
"learning_rate": 2.4521020945457615e-06, |
|
"loss": 0.0016, |
|
"reward": 0.41319186985492706, |
|
"reward_std": 0.21446501463651657, |
|
"rewards/code_reward": 0.3152007535099983, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 830.2031707763672, |
|
"epoch": 0.5923566878980892, |
|
"grad_norm": 0.24598261713981628, |
|
"kl": 0.182373046875, |
|
"learning_rate": 2.4376623520906255e-06, |
|
"loss": 0.0019, |
|
"reward": 0.48784376308321953, |
|
"reward_std": 0.25769151002168655, |
|
"rewards/code_reward": 0.39141515642404556, |
|
"rewards/format_reward": 0.9642857313156128, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 796.6272583007812, |
|
"epoch": 0.5944798301486199, |
|
"grad_norm": 0.24986568093299866, |
|
"kl": 0.154541015625, |
|
"learning_rate": 2.4232357146976478e-06, |
|
"loss": 0.0016, |
|
"reward": 0.3782888986170292, |
|
"reward_std": 0.18982039019465446, |
|
"rewards/code_reward": 0.28029780834913254, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 789.513427734375, |
|
"epoch": 0.5966029723991507, |
|
"grad_norm": 0.28405284881591797, |
|
"kl": 0.147705078125, |
|
"learning_rate": 2.408822787679637e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5121422186493874, |
|
"reward_std": 0.2287510558962822, |
|
"rewards/code_reward": 0.413704726845026, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 808.3995819091797, |
|
"epoch": 0.5987261146496815, |
|
"grad_norm": 0.5303727984428406, |
|
"kl": 0.144775390625, |
|
"learning_rate": 2.3944241757741475e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5508048385381699, |
|
"reward_std": 0.18633075430989265, |
|
"rewards/code_reward": 0.4516976475715637, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 827.4531707763672, |
|
"epoch": 0.6008492569002123, |
|
"grad_norm": 0.2308008074760437, |
|
"kl": 0.13037109375, |
|
"learning_rate": 2.380040483118097e-06, |
|
"loss": 0.0013, |
|
"reward": 0.3481413722038269, |
|
"reward_std": 0.20516538247466087, |
|
"rewards/code_reward": 0.250150291249156, |
|
"rewards/format_reward": 0.9799107313156128, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 786.8147583007812, |
|
"epoch": 0.6029723991507431, |
|
"grad_norm": 0.21248966455459595, |
|
"kl": 0.136962890625, |
|
"learning_rate": 2.365672313222419e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4797332286834717, |
|
"reward_std": 0.2214011587202549, |
|
"rewards/code_reward": 0.3815189450979233, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 797.7723693847656, |
|
"epoch": 0.6050955414012739, |
|
"grad_norm": 0.2716215252876282, |
|
"kl": 0.1376953125, |
|
"learning_rate": 2.351320268946749e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4847887381911278, |
|
"reward_std": 0.26064618304371834, |
|
"rewards/code_reward": 0.3861280009150505, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 790.9308319091797, |
|
"epoch": 0.6072186836518046, |
|
"grad_norm": 0.22615957260131836, |
|
"kl": 0.1334228515625, |
|
"learning_rate": 2.336984952474119e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4451970234513283, |
|
"reward_std": 0.21199724823236465, |
|
"rewards/code_reward": 0.34564343094825745, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 774.716552734375, |
|
"epoch": 0.6093418259023354, |
|
"grad_norm": 0.24061597883701324, |
|
"kl": 0.17236328125, |
|
"learning_rate": 2.322666965285697e-06, |
|
"loss": 0.0018, |
|
"reward": 0.4680435359477997, |
|
"reward_std": 0.2062854841351509, |
|
"rewards/code_reward": 0.3700524792075157, |
|
"rewards/format_reward": 0.9799107313156128, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 785.3437957763672, |
|
"epoch": 0.6114649681528662, |
|
"grad_norm": 0.2794930636882782, |
|
"kl": 0.143798828125, |
|
"learning_rate": 2.3083669081355507e-06, |
|
"loss": 0.0015, |
|
"reward": 0.41017772257328033, |
|
"reward_std": 0.1858556531369686, |
|
"rewards/code_reward": 0.31263307854533195, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 768.2902221679688, |
|
"epoch": 0.613588110403397, |
|
"grad_norm": 0.2621839940547943, |
|
"kl": 0.138427734375, |
|
"learning_rate": 2.2940853810254377e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4905528202652931, |
|
"reward_std": 0.25080636143684387, |
|
"rewards/code_reward": 0.39144565910100937, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 788.9687805175781, |
|
"epoch": 0.6157112526539278, |
|
"grad_norm": 0.25945180654525757, |
|
"kl": 0.1494140625, |
|
"learning_rate": 2.2798229831796313e-06, |
|
"loss": 0.0015, |
|
"reward": 0.43350084125995636, |
|
"reward_std": 0.1987269874662161, |
|
"rewards/code_reward": 0.3370722308754921, |
|
"rewards/format_reward": 0.9642857611179352, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 762.8616485595703, |
|
"epoch": 0.6178343949044586, |
|
"grad_norm": 0.28753146529197693, |
|
"kl": 0.146484375, |
|
"learning_rate": 2.2655803130197816e-06, |
|
"loss": 0.0015, |
|
"reward": 0.45754577219486237, |
|
"reward_std": 0.20388228073716164, |
|
"rewards/code_reward": 0.35977791622281075, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 755.8594207763672, |
|
"epoch": 0.6199575371549894, |
|
"grad_norm": 0.2792350947856903, |
|
"kl": 0.14794921875, |
|
"learning_rate": 2.2513579681398034e-06, |
|
"loss": 0.0016, |
|
"reward": 0.4282514527440071, |
|
"reward_std": 0.16725242137908936, |
|
"rewards/code_reward": 0.32959069684147835, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 744.6138763427734, |
|
"epoch": 0.6220806794055201, |
|
"grad_norm": 0.2520155608654022, |
|
"kl": 0.13720703125, |
|
"learning_rate": 2.237156545280803e-06, |
|
"loss": 0.0014, |
|
"reward": 0.44700442999601364, |
|
"reward_std": 0.21429810300469398, |
|
"rewards/code_reward": 0.34812046587467194, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 771.6897888183594, |
|
"epoch": 0.6242038216560509, |
|
"grad_norm": 0.41944995522499084, |
|
"kl": 0.22412109375, |
|
"learning_rate": 2.2229766403060403e-06, |
|
"loss": 0.0023, |
|
"reward": 0.4441903755068779, |
|
"reward_std": 0.19182176142930984, |
|
"rewards/code_reward": 0.3459760546684265, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 778.763427734375, |
|
"epoch": 0.6263269639065817, |
|
"grad_norm": 0.2801876962184906, |
|
"kl": 0.137939453125, |
|
"learning_rate": 2.2088188481759305e-06, |
|
"loss": 0.0014, |
|
"reward": 0.46992357820272446, |
|
"reward_std": 0.19111047685146332, |
|
"rewards/code_reward": 0.37103963643312454, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 778.0893096923828, |
|
"epoch": 0.6284501061571125, |
|
"grad_norm": 0.21918566524982452, |
|
"kl": 0.131103515625, |
|
"learning_rate": 2.194683762923073e-06, |
|
"loss": 0.0013, |
|
"reward": 0.4984453171491623, |
|
"reward_std": 0.22232287377119064, |
|
"rewards/code_reward": 0.40045420452952385, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 740.3951110839844, |
|
"epoch": 0.6305732484076433, |
|
"grad_norm": 0.31050121784210205, |
|
"kl": 0.1572265625, |
|
"learning_rate": 2.1805719776273387e-06, |
|
"loss": 0.0016, |
|
"reward": 0.4212986081838608, |
|
"reward_std": 0.1724853478372097, |
|
"rewards/code_reward": 0.321968249976635, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 682.138427734375, |
|
"epoch": 0.6326963906581741, |
|
"grad_norm": 0.24185748398303986, |
|
"kl": 0.17529296875, |
|
"learning_rate": 2.166484084390974e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5747622847557068, |
|
"reward_std": 0.18613022193312645, |
|
"rewards/code_reward": 0.475878331810236, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 716.6518096923828, |
|
"epoch": 0.6348195329087049, |
|
"grad_norm": 0.6314132213592529, |
|
"kl": 0.166015625, |
|
"learning_rate": 2.1524206743137636e-06, |
|
"loss": 0.0017, |
|
"reward": 0.36886315792798996, |
|
"reward_std": 0.17360183410346508, |
|
"rewards/code_reward": 0.2708720788359642, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 737.8303833007812, |
|
"epoch": 0.6369426751592356, |
|
"grad_norm": 0.2968922555446625, |
|
"kl": 0.19287109375, |
|
"learning_rate": 2.1383823374682287e-06, |
|
"loss": 0.0019, |
|
"reward": 0.39945459365844727, |
|
"reward_std": 0.20941082388162613, |
|
"rewards/code_reward": 0.3014635145664215, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 718.5714721679688, |
|
"epoch": 0.6390658174097664, |
|
"grad_norm": 19.51397132873535, |
|
"kl": 0.275146484375, |
|
"learning_rate": 2.124369662874868e-06, |
|
"loss": 0.0029, |
|
"reward": 0.503417618572712, |
|
"reward_std": 0.15935716964304447, |
|
"rewards/code_reward": 0.40631940215826035, |
|
"rewards/format_reward": 0.9709821790456772, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 704.3393096923828, |
|
"epoch": 0.6411889596602972, |
|
"grad_norm": 0.35022857785224915, |
|
"kl": 0.14697265625, |
|
"learning_rate": 2.110383238477441e-06, |
|
"loss": 0.0015, |
|
"reward": 0.5569600984454155, |
|
"reward_std": 0.20704489946365356, |
|
"rewards/code_reward": 0.45785292237997055, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 702.6986846923828, |
|
"epoch": 0.643312101910828, |
|
"grad_norm": 0.17607638239860535, |
|
"kl": 0.13916015625, |
|
"learning_rate": 2.096423651118305e-06, |
|
"loss": 0.0014, |
|
"reward": 0.2535444311797619, |
|
"reward_std": 0.11278286523884162, |
|
"rewards/code_reward": 0.15466050058603287, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 701.2857360839844, |
|
"epoch": 0.6454352441613588, |
|
"grad_norm": 0.6241003274917603, |
|
"kl": 0.1826171875, |
|
"learning_rate": 2.082491486513788e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5550656765699387, |
|
"reward_std": 0.21512125991284847, |
|
"rewards/code_reward": 0.45618174970149994, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 709.0379638671875, |
|
"epoch": 0.6475583864118896, |
|
"grad_norm": 0.696461021900177, |
|
"kl": 0.1435546875, |
|
"learning_rate": 2.0685873292296116e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3796486109495163, |
|
"reward_std": 0.15390164637938142, |
|
"rewards/code_reward": 0.28121111169457436, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 682.716552734375, |
|
"epoch": 0.6496815286624203, |
|
"grad_norm": 0.26720672845840454, |
|
"kl": 0.162109375, |
|
"learning_rate": 2.054711762656369e-06, |
|
"loss": 0.0016, |
|
"reward": 0.37838516384363174, |
|
"reward_std": 0.16313385590910912, |
|
"rewards/code_reward": 0.28061728924512863, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 666.8080596923828, |
|
"epoch": 0.6518046709129511, |
|
"grad_norm": 0.8882589936256409, |
|
"kl": 0.16259765625, |
|
"learning_rate": 2.040865368985044e-06, |
|
"loss": 0.0017, |
|
"reward": 0.4301592782139778, |
|
"reward_std": 0.20042868331074715, |
|
"rewards/code_reward": 0.33105212450027466, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 681.9129791259766, |
|
"epoch": 0.6539278131634819, |
|
"grad_norm": 0.23706179857254028, |
|
"kl": 0.18310546875, |
|
"learning_rate": 2.027048729182583e-06, |
|
"loss": 0.0019, |
|
"reward": 0.4861885607242584, |
|
"reward_std": 0.16966554708778858, |
|
"rewards/code_reward": 0.3881974592804909, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 693.8147583007812, |
|
"epoch": 0.6560509554140127, |
|
"grad_norm": 0.5197703242301941, |
|
"kl": 0.228271484375, |
|
"learning_rate": 2.0132624229675205e-06, |
|
"loss": 0.0024, |
|
"reward": 0.511215090751648, |
|
"reward_std": 0.18619069457054138, |
|
"rewards/code_reward": 0.4127775654196739, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 714.9777221679688, |
|
"epoch": 0.6581740976645435, |
|
"grad_norm": 0.24638721346855164, |
|
"kl": 0.189453125, |
|
"learning_rate": 1.9995070287856546e-06, |
|
"loss": 0.002, |
|
"reward": 0.5180679038167, |
|
"reward_std": 0.21345077827572823, |
|
"rewards/code_reward": 0.41963040083646774, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 708.2723388671875, |
|
"epoch": 0.6602972399150743, |
|
"grad_norm": 0.422715961933136, |
|
"kl": 0.18701171875, |
|
"learning_rate": 1.985783123785774e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5620292499661446, |
|
"reward_std": 0.20659737288951874, |
|
"rewards/code_reward": 0.46314531564712524, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 668.2589569091797, |
|
"epoch": 0.6624203821656051, |
|
"grad_norm": 0.6652376055717468, |
|
"kl": 0.240478515625, |
|
"learning_rate": 1.9720912837954486e-06, |
|
"loss": 0.0025, |
|
"reward": 0.4389989897608757, |
|
"reward_std": 0.20384247601032257, |
|
"rewards/code_reward": 0.33989182114601135, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 671.5424346923828, |
|
"epoch": 0.6645435244161358, |
|
"grad_norm": 0.898223876953125, |
|
"kl": 0.25927734375, |
|
"learning_rate": 1.958432083296862e-06, |
|
"loss": 0.0026, |
|
"reward": 0.36031387001276016, |
|
"reward_std": 0.2003210037946701, |
|
"rewards/code_reward": 0.26254600286483765, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 676.5602874755859, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.7889689207077026, |
|
"kl": 0.2135009765625, |
|
"learning_rate": 1.9448060954027093e-06, |
|
"loss": 0.0022, |
|
"reward": 0.5204020366072655, |
|
"reward_std": 0.16625045239925385, |
|
"rewards/code_reward": 0.4212948679924011, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 684.9866333007812, |
|
"epoch": 0.6687898089171974, |
|
"grad_norm": 1.3564072847366333, |
|
"kl": 0.40185546875, |
|
"learning_rate": 1.931213891832153e-06, |
|
"loss": 0.0041, |
|
"reward": 0.526521772146225, |
|
"reward_std": 0.2212766855955124, |
|
"rewards/code_reward": 0.4278610572218895, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 652.8102874755859, |
|
"epoch": 0.6709129511677282, |
|
"grad_norm": 0.24422591924667358, |
|
"kl": 0.147216796875, |
|
"learning_rate": 1.9176560428868336e-06, |
|
"loss": 0.0015, |
|
"reward": 0.3931754156947136, |
|
"reward_std": 0.1695394441485405, |
|
"rewards/code_reward": 0.29473789036273956, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 687.8214569091797, |
|
"epoch": 0.673036093418259, |
|
"grad_norm": 0.4171687960624695, |
|
"kl": 0.236328125, |
|
"learning_rate": 1.9041331174269373e-06, |
|
"loss": 0.0024, |
|
"reward": 0.47731664031744003, |
|
"reward_std": 0.20429091900587082, |
|
"rewards/code_reward": 0.378879152238369, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 682.6741485595703, |
|
"epoch": 0.6751592356687898, |
|
"grad_norm": 0.9241800308227539, |
|
"kl": 0.36083984375, |
|
"learning_rate": 1.8906456828473341e-06, |
|
"loss": 0.0036, |
|
"reward": 0.5124014094471931, |
|
"reward_std": 0.21064380928874016, |
|
"rewards/code_reward": 0.4132942706346512, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 684.0759124755859, |
|
"epoch": 0.6772823779193206, |
|
"grad_norm": 0.24995659291744232, |
|
"kl": 0.14794921875, |
|
"learning_rate": 1.8771943050537656e-06, |
|
"loss": 0.0016, |
|
"reward": 0.592289388179779, |
|
"reward_std": 0.2126442939043045, |
|
"rewards/code_reward": 0.4942983016371727, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 719.6786041259766, |
|
"epoch": 0.6794055201698513, |
|
"grad_norm": 0.24401088058948517, |
|
"kl": 0.1395263671875, |
|
"learning_rate": 1.8637795484391046e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4689144790172577, |
|
"reward_std": 0.25591350346803665, |
|
"rewards/code_reward": 0.3711466044187546, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 655.4844055175781, |
|
"epoch": 0.6815286624203821, |
|
"grad_norm": 0.3457026779651642, |
|
"kl": 0.50341796875, |
|
"learning_rate": 1.8504019758596698e-06, |
|
"loss": 0.0051, |
|
"reward": 0.5521439760923386, |
|
"reward_std": 0.2452612817287445, |
|
"rewards/code_reward": 0.45326002687215805, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 714.7143096923828, |
|
"epoch": 0.6836518046709129, |
|
"grad_norm": 0.3326283395290375, |
|
"kl": 0.1953125, |
|
"learning_rate": 1.8370621486116163e-06, |
|
"loss": 0.0021, |
|
"reward": 0.5532227605581284, |
|
"reward_std": 0.18401411548256874, |
|
"rewards/code_reward": 0.4552316591143608, |
|
"rewards/format_reward": 0.9799107760190964, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 677.7634124755859, |
|
"epoch": 0.6857749469214437, |
|
"grad_norm": 0.3285404145717621, |
|
"kl": 0.23876953125, |
|
"learning_rate": 1.823760626407377e-06, |
|
"loss": 0.0025, |
|
"reward": 0.4828302264213562, |
|
"reward_std": 0.1928608939051628, |
|
"rewards/code_reward": 0.384615920484066, |
|
"rewards/format_reward": 0.9821429252624512, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 699.5982666015625, |
|
"epoch": 0.6878980891719745, |
|
"grad_norm": 0.34025460481643677, |
|
"kl": 0.224365234375, |
|
"learning_rate": 1.8104979673521838e-06, |
|
"loss": 0.0023, |
|
"reward": 0.42327145487070084, |
|
"reward_std": 0.15393321216106415, |
|
"rewards/code_reward": 0.32505714148283005, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 650.997802734375, |
|
"epoch": 0.6900212314225053, |
|
"grad_norm": 0.3025732636451721, |
|
"kl": 0.24853515625, |
|
"learning_rate": 1.7972747279206482e-06, |
|
"loss": 0.0025, |
|
"reward": 0.37195510417222977, |
|
"reward_std": 0.19180476292967796, |
|
"rewards/code_reward": 0.27418723329901695, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 692.4464569091797, |
|
"epoch": 0.692144373673036, |
|
"grad_norm": 0.2389409989118576, |
|
"kl": 0.148681640625, |
|
"learning_rate": 1.7840914629334122e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5394042208790779, |
|
"reward_std": 0.22496159374713898, |
|
"rewards/code_reward": 0.44185957312583923, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 709.0669860839844, |
|
"epoch": 0.6942675159235668, |
|
"grad_norm": 0.28394991159439087, |
|
"kl": 0.194091796875, |
|
"learning_rate": 1.7709487255338731e-06, |
|
"loss": 0.0021, |
|
"reward": 0.4636544920504093, |
|
"reward_std": 0.15700273029506207, |
|
"rewards/code_reward": 0.36633305437862873, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 702.7768249511719, |
|
"epoch": 0.6963906581740976, |
|
"grad_norm": 0.22292962670326233, |
|
"kl": 0.17431640625, |
|
"learning_rate": 1.7578470671649684e-06, |
|
"loss": 0.0019, |
|
"reward": 0.4268321394920349, |
|
"reward_std": 0.1670057326555252, |
|
"rewards/code_reward": 0.32928748056292534, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 694.5937805175781, |
|
"epoch": 0.6985138004246284, |
|
"grad_norm": 0.782927393913269, |
|
"kl": 0.3328857421875, |
|
"learning_rate": 1.744787037546045e-06, |
|
"loss": 0.0034, |
|
"reward": 0.46113383024930954, |
|
"reward_std": 0.18688062392175198, |
|
"rewards/code_reward": 0.3626963049173355, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 706.1986999511719, |
|
"epoch": 0.7006369426751592, |
|
"grad_norm": 0.41430673003196716, |
|
"kl": 0.1827392578125, |
|
"learning_rate": 1.731769184649788e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5658792853355408, |
|
"reward_std": 0.23742860183119774, |
|
"rewards/code_reward": 0.4683346152305603, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 694.9576416015625, |
|
"epoch": 0.70276008492569, |
|
"grad_norm": 0.6622937917709351, |
|
"kl": 0.214111328125, |
|
"learning_rate": 1.7187940546792325e-06, |
|
"loss": 0.0022, |
|
"reward": 0.4137548431754112, |
|
"reward_std": 0.1334713213145733, |
|
"rewards/code_reward": 0.3155405670404434, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 716.8393249511719, |
|
"epoch": 0.7048832271762208, |
|
"grad_norm": 0.22396574914455414, |
|
"kl": 0.2607421875, |
|
"learning_rate": 1.7058621920448465e-06, |
|
"loss": 0.0027, |
|
"reward": 0.4444565996527672, |
|
"reward_std": 0.18423740193247795, |
|
"rewards/code_reward": 0.34646550565958023, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 703.0937805175781, |
|
"epoch": 0.7070063694267515, |
|
"grad_norm": 0.2483583688735962, |
|
"kl": 0.160888671875, |
|
"learning_rate": 1.6929741393416855e-06, |
|
"loss": 0.0016, |
|
"reward": 0.47170016914606094, |
|
"reward_std": 0.18039512634277344, |
|
"rewards/code_reward": 0.37393229454755783, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 755.0312805175781, |
|
"epoch": 0.7091295116772823, |
|
"grad_norm": 0.4338986873626709, |
|
"kl": 0.357177734375, |
|
"learning_rate": 1.6801304373266286e-06, |
|
"loss": 0.0036, |
|
"reward": 0.4291260167956352, |
|
"reward_std": 0.15267430432140827, |
|
"rewards/code_reward": 0.3318046070635319, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 767.3214569091797, |
|
"epoch": 0.7112526539278131, |
|
"grad_norm": 0.21925950050354004, |
|
"kl": 0.137451171875, |
|
"learning_rate": 1.667331624895689e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4992447942495346, |
|
"reward_std": 0.21635426208376884, |
|
"rewards/code_reward": 0.4014769196510315, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 750.1696929931641, |
|
"epoch": 0.7133757961783439, |
|
"grad_norm": 0.30118319392204285, |
|
"kl": 0.359619140625, |
|
"learning_rate": 1.6545782390614037e-06, |
|
"loss": 0.0037, |
|
"reward": 0.4922778084874153, |
|
"reward_std": 0.1726557295769453, |
|
"rewards/code_reward": 0.39317065104842186, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 718.6339721679688, |
|
"epoch": 0.7154989384288747, |
|
"grad_norm": 0.41911348700523376, |
|
"kl": 0.317626953125, |
|
"learning_rate": 1.6418708149302992e-06, |
|
"loss": 0.0033, |
|
"reward": 0.44379642605781555, |
|
"reward_std": 0.19296832010149956, |
|
"rewards/code_reward": 0.3451356738805771, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 694.1138610839844, |
|
"epoch": 0.7176220806794055, |
|
"grad_norm": 0.7091541886329651, |
|
"kl": 0.27783203125, |
|
"learning_rate": 1.6292098856804423e-06, |
|
"loss": 0.0028, |
|
"reward": 0.4443873465061188, |
|
"reward_std": 0.19508511200547218, |
|
"rewards/code_reward": 0.3468426913022995, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 720.607177734375, |
|
"epoch": 0.7197452229299363, |
|
"grad_norm": 0.6043697595596313, |
|
"kl": 0.3173828125, |
|
"learning_rate": 1.6165959825390661e-06, |
|
"loss": 0.0033, |
|
"reward": 0.43542125821113586, |
|
"reward_std": 0.16308805532753468, |
|
"rewards/code_reward": 0.33720696344971657, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 706.1317291259766, |
|
"epoch": 0.721868365180467, |
|
"grad_norm": 0.2581160068511963, |
|
"kl": 0.2353515625, |
|
"learning_rate": 1.604029634760284e-06, |
|
"loss": 0.0025, |
|
"reward": 0.5382986813783646, |
|
"reward_std": 0.14037772081792355, |
|
"rewards/code_reward": 0.4403075948357582, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 737.5469055175781, |
|
"epoch": 0.7239915074309978, |
|
"grad_norm": 0.4556562006473541, |
|
"kl": 0.368408203125, |
|
"learning_rate": 1.59151136960288e-06, |
|
"loss": 0.0037, |
|
"reward": 0.538652278482914, |
|
"reward_std": 0.20831965655088425, |
|
"rewards/code_reward": 0.44133080542087555, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 723.9486999511719, |
|
"epoch": 0.7261146496815286, |
|
"grad_norm": 0.2620218098163605, |
|
"kl": 0.159912109375, |
|
"learning_rate": 1.5790417123081903e-06, |
|
"loss": 0.0017, |
|
"reward": 0.45855508744716644, |
|
"reward_std": 0.1777043156325817, |
|
"rewards/code_reward": 0.3605640158057213, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 686.8236999511719, |
|
"epoch": 0.7282377919320594, |
|
"grad_norm": 0.2753090560436249, |
|
"kl": 0.16455078125, |
|
"learning_rate": 1.5666211860780583e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5850269198417664, |
|
"reward_std": 0.19610749557614326, |
|
"rewards/code_reward": 0.4870358556509018, |
|
"rewards/format_reward": 0.9799107760190964, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 684.1205749511719, |
|
"epoch": 0.7303609341825902, |
|
"grad_norm": 0.23944684863090515, |
|
"kl": 0.16455078125, |
|
"learning_rate": 1.5542503120528918e-06, |
|
"loss": 0.0017, |
|
"reward": 0.5332599207758904, |
|
"reward_std": 0.2457549162209034, |
|
"rewards/code_reward": 0.43437594920396805, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 720.3839569091797, |
|
"epoch": 0.732484076433121, |
|
"grad_norm": 0.31666672229766846, |
|
"kl": 0.213134765625, |
|
"learning_rate": 1.5419296092897866e-06, |
|
"loss": 0.0022, |
|
"reward": 0.5879708528518677, |
|
"reward_std": 0.24002529680728912, |
|
"rewards/code_reward": 0.4899797812104225, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 693.6964569091797, |
|
"epoch": 0.7346072186836518, |
|
"grad_norm": 0.24176108837127686, |
|
"kl": 0.15869140625, |
|
"learning_rate": 1.529659594740755e-06, |
|
"loss": 0.0016, |
|
"reward": 0.4276282340288162, |
|
"reward_std": 0.20496541634202003, |
|
"rewards/code_reward": 0.32896753773093224, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 704.2902221679688, |
|
"epoch": 0.7367303609341825, |
|
"grad_norm": 0.2568061351776123, |
|
"kl": 0.15771484375, |
|
"learning_rate": 1.5174407832310338e-06, |
|
"loss": 0.0016, |
|
"reward": 0.39445348642766476, |
|
"reward_std": 0.13825338683091104, |
|
"rewards/code_reward": 0.2962391600012779, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 722.2545013427734, |
|
"epoch": 0.7388535031847133, |
|
"grad_norm": 0.49012815952301025, |
|
"kl": 0.17578125, |
|
"learning_rate": 1.5052736874374815e-06, |
|
"loss": 0.0018, |
|
"reward": 0.488083653151989, |
|
"reward_std": 0.1927042007446289, |
|
"rewards/code_reward": 0.39009255915880203, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 713.3147583007812, |
|
"epoch": 0.7409766454352441, |
|
"grad_norm": 0.6304606795310974, |
|
"kl": 0.29345703125, |
|
"learning_rate": 1.4931588178670695e-06, |
|
"loss": 0.003, |
|
"reward": 0.4815641790628433, |
|
"reward_std": 0.16962832398712635, |
|
"rewards/code_reward": 0.38357311114668846, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 700.3214569091797, |
|
"epoch": 0.7430997876857749, |
|
"grad_norm": 0.43463101983070374, |
|
"kl": 0.289306640625, |
|
"learning_rate": 1.4810966828354605e-06, |
|
"loss": 0.0029, |
|
"reward": 0.45994506776332855, |
|
"reward_std": 0.1931474320590496, |
|
"rewards/code_reward": 0.36173076555132866, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 685.1384124755859, |
|
"epoch": 0.7452229299363057, |
|
"grad_norm": 0.34815892577171326, |
|
"kl": 0.44140625, |
|
"learning_rate": 1.469087788445684e-06, |
|
"loss": 0.0045, |
|
"reward": 0.5396069064736366, |
|
"reward_std": 0.20336921885609627, |
|
"rewards/code_reward": 0.44250866025686264, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 698.6071929931641, |
|
"epoch": 0.7473460721868365, |
|
"grad_norm": 0.3489153981208801, |
|
"kl": 0.533447265625, |
|
"learning_rate": 1.4571326385668965e-06, |
|
"loss": 0.0055, |
|
"reward": 0.6215780973434448, |
|
"reward_std": 0.202628992497921, |
|
"rewards/code_reward": 0.5229173377156258, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 713.6897583007812, |
|
"epoch": 0.7494692144373672, |
|
"grad_norm": 0.2902304232120514, |
|
"kl": 0.160400390625, |
|
"learning_rate": 1.4452317348132434e-06, |
|
"loss": 0.0018, |
|
"reward": 0.43891899287700653, |
|
"reward_std": 0.1397520825266838, |
|
"rewards/code_reward": 0.3393654003739357, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 706.091552734375, |
|
"epoch": 0.7515923566878981, |
|
"grad_norm": 0.7335183024406433, |
|
"kl": 0.34814453125, |
|
"learning_rate": 1.4333855765228104e-06, |
|
"loss": 0.0037, |
|
"reward": 0.6451611816883087, |
|
"reward_std": 0.20771214738488197, |
|
"rewards/code_reward": 0.5465004742145538, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 712.1607513427734, |
|
"epoch": 0.7537154989384289, |
|
"grad_norm": 0.7572880387306213, |
|
"kl": 0.3447265625, |
|
"learning_rate": 1.421594660736675e-06, |
|
"loss": 0.0035, |
|
"reward": 0.41940218955278397, |
|
"reward_std": 0.1921430230140686, |
|
"rewards/code_reward": 0.3202950209379196, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 680.1986999511719, |
|
"epoch": 0.7558386411889597, |
|
"grad_norm": 0.3940925896167755, |
|
"kl": 0.549560546875, |
|
"learning_rate": 1.4098594821780476e-06, |
|
"loss": 0.0056, |
|
"reward": 0.6083894520998001, |
|
"reward_std": 0.1597061362117529, |
|
"rewards/code_reward": 0.5108448341488838, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 665.372802734375, |
|
"epoch": 0.7579617834394905, |
|
"grad_norm": 0.2566499710083008, |
|
"kl": 0.192138671875, |
|
"learning_rate": 1.3981805332315174e-06, |
|
"loss": 0.002, |
|
"reward": 0.4351358078420162, |
|
"reward_std": 0.1653740406036377, |
|
"rewards/code_reward": 0.3360286522656679, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 732.7589569091797, |
|
"epoch": 0.7600849256900213, |
|
"grad_norm": 0.35650861263275146, |
|
"kl": 0.250732421875, |
|
"learning_rate": 1.3865583039223929e-06, |
|
"loss": 0.0026, |
|
"reward": 0.5535444989800453, |
|
"reward_std": 0.17830567993223667, |
|
"rewards/code_reward": 0.4555533789098263, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 708.0424346923828, |
|
"epoch": 0.7622080679405521, |
|
"grad_norm": 0.24273599684238434, |
|
"kl": 0.1611328125, |
|
"learning_rate": 1.374993281896137e-06, |
|
"loss": 0.0017, |
|
"reward": 0.44518817216157913, |
|
"reward_std": 0.19435212016105652, |
|
"rewards/code_reward": 0.34697388112545013, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 765.1786041259766, |
|
"epoch": 0.7643312101910829, |
|
"grad_norm": 0.3510468304157257, |
|
"kl": 0.197021484375, |
|
"learning_rate": 1.3634859523979134e-06, |
|
"loss": 0.002, |
|
"reward": 0.47114741802215576, |
|
"reward_std": 0.1812426745891571, |
|
"rewards/code_reward": 0.3724866919219494, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 724.216552734375, |
|
"epoch": 0.7664543524416136, |
|
"grad_norm": 1.1458288431167603, |
|
"kl": 0.52978515625, |
|
"learning_rate": 1.3520367982522208e-06, |
|
"loss": 0.0053, |
|
"reward": 0.45792729407548904, |
|
"reward_std": 0.16464052349328995, |
|
"rewards/code_reward": 0.35926656424999237, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 705.4464721679688, |
|
"epoch": 0.7685774946921444, |
|
"grad_norm": 0.4750509560108185, |
|
"kl": 0.23779296875, |
|
"learning_rate": 1.3406462998426358e-06, |
|
"loss": 0.0024, |
|
"reward": 0.5133348107337952, |
|
"reward_std": 0.24053634703159332, |
|
"rewards/code_reward": 0.41445086151361465, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 743.1853179931641, |
|
"epoch": 0.7707006369426752, |
|
"grad_norm": 0.2608552575111389, |
|
"kl": 0.325927734375, |
|
"learning_rate": 1.3293149350916595e-06, |
|
"loss": 0.0033, |
|
"reward": 0.5553034171462059, |
|
"reward_std": 0.19487734138965607, |
|
"rewards/code_reward": 0.45731230080127716, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 678.2209930419922, |
|
"epoch": 0.772823779193206, |
|
"grad_norm": 0.22239775955677032, |
|
"kl": 0.13037109375, |
|
"learning_rate": 1.3180431794406623e-06, |
|
"loss": 0.0015, |
|
"reward": 0.6062557250261307, |
|
"reward_std": 0.2048381306231022, |
|
"rewards/code_reward": 0.5069253593683243, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 707.4620971679688, |
|
"epoch": 0.7749469214437368, |
|
"grad_norm": 0.4696608781814575, |
|
"kl": 0.270751953125, |
|
"learning_rate": 1.3068315058299358e-06, |
|
"loss": 0.0029, |
|
"reward": 0.5663170740008354, |
|
"reward_std": 0.15939603559672832, |
|
"rewards/code_reward": 0.4678795412182808, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 653.2879791259766, |
|
"epoch": 0.7770700636942676, |
|
"grad_norm": 1.1559607982635498, |
|
"kl": 0.3037109375, |
|
"learning_rate": 1.2956803846788503e-06, |
|
"loss": 0.0032, |
|
"reward": 0.618221327662468, |
|
"reward_std": 0.22959138825535774, |
|
"rewards/code_reward": 0.5193373411893845, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 731.2076263427734, |
|
"epoch": 0.7791932059447984, |
|
"grad_norm": 0.48825645446777344, |
|
"kl": 0.210693359375, |
|
"learning_rate": 1.284590283866116e-06, |
|
"loss": 0.0021, |
|
"reward": 0.33228749781847, |
|
"reward_std": 0.15970432199537754, |
|
"rewards/code_reward": 0.2345196194946766, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 695.4576110839844, |
|
"epoch": 0.7813163481953291, |
|
"grad_norm": 1.4041056632995605, |
|
"kl": 0.1883544921875, |
|
"learning_rate": 1.2735616687101518e-06, |
|
"loss": 0.002, |
|
"reward": 0.40882231295108795, |
|
"reward_std": 0.16854364797472954, |
|
"rewards/code_reward": 0.3103848248720169, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 696.4687805175781, |
|
"epoch": 0.7834394904458599, |
|
"grad_norm": 1.9169604778289795, |
|
"kl": 0.201171875, |
|
"learning_rate": 1.2625950019495614e-06, |
|
"loss": 0.0021, |
|
"reward": 0.5380031913518906, |
|
"reward_std": 0.1728157363831997, |
|
"rewards/code_reward": 0.4400121048092842, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 709.4598693847656, |
|
"epoch": 0.7855626326963907, |
|
"grad_norm": 0.3797023594379425, |
|
"kl": 0.1640625, |
|
"learning_rate": 1.251690743723718e-06, |
|
"loss": 0.0017, |
|
"reward": 0.5747079327702522, |
|
"reward_std": 0.24513645470142365, |
|
"rewards/code_reward": 0.4767168238759041, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 647.7209930419922, |
|
"epoch": 0.7876857749469215, |
|
"grad_norm": 0.24551738798618317, |
|
"kl": 0.150390625, |
|
"learning_rate": 1.2408493515534581e-06, |
|
"loss": 0.0016, |
|
"reward": 0.6943890303373337, |
|
"reward_std": 0.22319162264466286, |
|
"rewards/code_reward": 0.5959515273571014, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 692.2098693847656, |
|
"epoch": 0.7898089171974523, |
|
"grad_norm": 0.4829825460910797, |
|
"kl": 0.406005859375, |
|
"learning_rate": 1.2300712803218834e-06, |
|
"loss": 0.0042, |
|
"reward": 0.5234424099326134, |
|
"reward_std": 0.1910531185567379, |
|
"rewards/code_reward": 0.42388884723186493, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 697.3036041259766, |
|
"epoch": 0.7919320594479831, |
|
"grad_norm": 114.25981140136719, |
|
"kl": 16.0146484375, |
|
"learning_rate": 1.2193569822552772e-06, |
|
"loss": 0.1608, |
|
"reward": 0.559485673904419, |
|
"reward_std": 0.20258497074246407, |
|
"rewards/code_reward": 0.4606017544865608, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 677.7567291259766, |
|
"epoch": 0.7940552016985138, |
|
"grad_norm": 0.3005722761154175, |
|
"kl": 0.171875, |
|
"learning_rate": 1.2087069069041268e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5883411467075348, |
|
"reward_std": 0.21694539301097393, |
|
"rewards/code_reward": 0.4901268184185028, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 671.1495819091797, |
|
"epoch": 0.7961783439490446, |
|
"grad_norm": 0.6558151841163635, |
|
"kl": 0.162841796875, |
|
"learning_rate": 1.1981215011242654e-06, |
|
"loss": 0.0017, |
|
"reward": 0.5491671711206436, |
|
"reward_std": 0.2353355698287487, |
|
"rewards/code_reward": 0.45050643384456635, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 663.4486999511719, |
|
"epoch": 0.7983014861995754, |
|
"grad_norm": 1.0574246644973755, |
|
"kl": 0.168701171875, |
|
"learning_rate": 1.1876012090581184e-06, |
|
"loss": 0.0018, |
|
"reward": 0.523729532957077, |
|
"reward_std": 0.19741250574588776, |
|
"rewards/code_reward": 0.42573845386505127, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 678.5826110839844, |
|
"epoch": 0.8004246284501062, |
|
"grad_norm": 0.28517383337020874, |
|
"kl": 0.168212890625, |
|
"learning_rate": 1.177146472116071e-06, |
|
"loss": 0.0018, |
|
"reward": 0.4997348487377167, |
|
"reward_std": 0.16867511346936226, |
|
"rewards/code_reward": 0.40196699649095535, |
|
"rewards/format_reward": 0.9776785969734192, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 725.0000457763672, |
|
"epoch": 0.802547770700637, |
|
"grad_norm": 0.38322436809539795, |
|
"kl": 0.176025390625, |
|
"learning_rate": 1.1667577289579462e-06, |
|
"loss": 0.0018, |
|
"reward": 0.43969085440039635, |
|
"reward_std": 0.16067362390458584, |
|
"rewards/code_reward": 0.3425925988703966, |
|
"rewards/format_reward": 0.9709821790456772, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 671.4710083007812, |
|
"epoch": 0.8046709129511678, |
|
"grad_norm": 0.24044837057590485, |
|
"kl": 0.1435546875, |
|
"learning_rate": 1.1564354154746007e-06, |
|
"loss": 0.0015, |
|
"reward": 0.5779925882816315, |
|
"reward_std": 0.22314922511577606, |
|
"rewards/code_reward": 0.479331873357296, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 701.1875457763672, |
|
"epoch": 0.8067940552016986, |
|
"grad_norm": 0.2769814729690552, |
|
"kl": 0.187255859375, |
|
"learning_rate": 1.146179964769635e-06, |
|
"loss": 0.002, |
|
"reward": 0.5813698992133141, |
|
"reward_std": 0.21280257403850555, |
|
"rewards/code_reward": 0.48315558582544327, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 703.2701263427734, |
|
"epoch": 0.8089171974522293, |
|
"grad_norm": 0.43315884470939636, |
|
"kl": 0.28125, |
|
"learning_rate": 1.1359918071412195e-06, |
|
"loss": 0.003, |
|
"reward": 0.5584300383925438, |
|
"reward_std": 0.17897445522248745, |
|
"rewards/code_reward": 0.4595461040735245, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 680.9174499511719, |
|
"epoch": 0.8110403397027601, |
|
"grad_norm": 0.3025217652320862, |
|
"kl": 0.208251953125, |
|
"learning_rate": 1.1258713700640456e-06, |
|
"loss": 0.0022, |
|
"reward": 0.47092022001743317, |
|
"reward_std": 0.1665214579552412, |
|
"rewards/code_reward": 0.3727059066295624, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 672.3705596923828, |
|
"epoch": 0.8131634819532909, |
|
"grad_norm": 0.23662854731082916, |
|
"kl": 0.1478271484375, |
|
"learning_rate": 1.115819078171383e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5290590599179268, |
|
"reward_std": 0.21020712330937386, |
|
"rewards/code_reward": 0.4312911853194237, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 659.2678833007812, |
|
"epoch": 0.8152866242038217, |
|
"grad_norm": 0.2239212840795517, |
|
"kl": 0.1688232421875, |
|
"learning_rate": 1.1058353532372667e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5600069090723991, |
|
"reward_std": 0.20570005849003792, |
|
"rewards/code_reward": 0.46067656576633453, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 688.6205596923828, |
|
"epoch": 0.8174097664543525, |
|
"grad_norm": 0.25939956307411194, |
|
"kl": 0.156982421875, |
|
"learning_rate": 1.0959206141587998e-06, |
|
"loss": 0.0016, |
|
"reward": 0.461281917989254, |
|
"reward_std": 0.2138805352151394, |
|
"rewards/code_reward": 0.36329086124897003, |
|
"rewards/format_reward": 0.9799107760190964, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 689.8527069091797, |
|
"epoch": 0.8195329087048833, |
|
"grad_norm": 0.564179003238678, |
|
"kl": 0.34716796875, |
|
"learning_rate": 1.0860752769385766e-06, |
|
"loss": 0.0035, |
|
"reward": 0.5820841789245605, |
|
"reward_std": 0.23867543786764145, |
|
"rewards/code_reward": 0.48320019245147705, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 716.8995971679688, |
|
"epoch": 0.821656050955414, |
|
"grad_norm": 0.31268319487571716, |
|
"kl": 0.2451171875, |
|
"learning_rate": 1.0762997546672279e-06, |
|
"loss": 0.0026, |
|
"reward": 0.24600705318152905, |
|
"reward_std": 0.06653665285557508, |
|
"rewards/code_reward": 0.14823918044567108, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 387 |
|
}, |
|
{ |
|
"completion_length": 661.8973388671875, |
|
"epoch": 0.8237791932059448, |
|
"grad_norm": 0.23703983426094055, |
|
"kl": 0.139892578125, |
|
"learning_rate": 1.0665944575060914e-06, |
|
"loss": 0.0015, |
|
"reward": 0.5530121028423309, |
|
"reward_std": 0.2014228142797947, |
|
"rewards/code_reward": 0.45368169248104095, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 671.0468902587891, |
|
"epoch": 0.8259023354564756, |
|
"grad_norm": 0.21562151610851288, |
|
"kl": 0.14697265625, |
|
"learning_rate": 1.056959792669997e-06, |
|
"loss": 0.0016, |
|
"reward": 0.6221778392791748, |
|
"reward_std": 0.17795583605766296, |
|
"rewards/code_reward": 0.5246331766247749, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 389 |
|
}, |
|
{ |
|
"completion_length": 707.1830749511719, |
|
"epoch": 0.8280254777070064, |
|
"grad_norm": 0.25027066469192505, |
|
"kl": 0.15234375, |
|
"learning_rate": 1.0473961644101856e-06, |
|
"loss": 0.0016, |
|
"reward": 0.49339308589696884, |
|
"reward_std": 0.1599120758473873, |
|
"rewards/code_reward": 0.39450912177562714, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 724.7522583007812, |
|
"epoch": 0.8301486199575372, |
|
"grad_norm": 0.2350330352783203, |
|
"kl": 0.193603515625, |
|
"learning_rate": 1.037903973997345e-06, |
|
"loss": 0.0021, |
|
"reward": 0.478931725025177, |
|
"reward_std": 0.12047621235251427, |
|
"rewards/code_reward": 0.3804941847920418, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 391 |
|
}, |
|
{ |
|
"completion_length": 702.982177734375, |
|
"epoch": 0.832271762208068, |
|
"grad_norm": 0.3609310984611511, |
|
"kl": 0.179931640625, |
|
"learning_rate": 1.0284836197047737e-06, |
|
"loss": 0.0019, |
|
"reward": 0.44246046990156174, |
|
"reward_std": 0.1557149738073349, |
|
"rewards/code_reward": 0.3444693833589554, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 674.607177734375, |
|
"epoch": 0.8343949044585988, |
|
"grad_norm": 0.5464503765106201, |
|
"kl": 0.248046875, |
|
"learning_rate": 1.0191354967916712e-06, |
|
"loss": 0.0026, |
|
"reward": 0.5180330500006676, |
|
"reward_std": 0.1834750883281231, |
|
"rewards/code_reward": 0.4193723499774933, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 393 |
|
}, |
|
{ |
|
"completion_length": 684.0937805175781, |
|
"epoch": 0.8365180467091295, |
|
"grad_norm": 0.23662471771240234, |
|
"kl": 0.1290283203125, |
|
"learning_rate": 1.0098599974865515e-06, |
|
"loss": 0.0014, |
|
"reward": 0.5139395222067833, |
|
"reward_std": 0.1551931146532297, |
|
"rewards/code_reward": 0.41594842076301575, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 692.5580596923828, |
|
"epoch": 0.8386411889596603, |
|
"grad_norm": 0.34932953119277954, |
|
"kl": 0.154296875, |
|
"learning_rate": 1.0006575109707898e-06, |
|
"loss": 0.0017, |
|
"reward": 0.5320730581879616, |
|
"reward_std": 0.205118702724576, |
|
"rewards/code_reward": 0.43318910896778107, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 678.8571624755859, |
|
"epoch": 0.8407643312101911, |
|
"grad_norm": 0.5195670127868652, |
|
"kl": 0.1474609375, |
|
"learning_rate": 9.915284233622877e-07, |
|
"loss": 0.0016, |
|
"reward": 0.4320642352104187, |
|
"reward_std": 0.18216058425605297, |
|
"rewards/code_reward": 0.33362672477960587, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 706.8861999511719, |
|
"epoch": 0.8428874734607219, |
|
"grad_norm": 0.24882346391677856, |
|
"kl": 0.148681640625, |
|
"learning_rate": 9.824731176992796e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5600469708442688, |
|
"reward_std": 0.16885506361722946, |
|
"rewards/code_reward": 0.4616094380617142, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 397 |
|
}, |
|
{ |
|
"completion_length": 669.0491333007812, |
|
"epoch": 0.8450106157112527, |
|
"grad_norm": 1.0406914949417114, |
|
"kl": 0.364013671875, |
|
"learning_rate": 9.734919739242543e-07, |
|
"loss": 0.0037, |
|
"reward": 0.5749830156564713, |
|
"reward_std": 0.21774039044976234, |
|
"rewards/code_reward": 0.47676874697208405, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 723.325927734375, |
|
"epoch": 0.8471337579617835, |
|
"grad_norm": 0.5013810396194458, |
|
"kl": 0.1451416015625, |
|
"learning_rate": 9.645853688680177e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5728159248828888, |
|
"reward_std": 0.1670310366898775, |
|
"rewards/code_reward": 0.4746016263961792, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 399 |
|
}, |
|
{ |
|
"completion_length": 700.310302734375, |
|
"epoch": 0.8492569002123143, |
|
"grad_norm": 0.8073310852050781, |
|
"kl": 0.2965087890625, |
|
"learning_rate": 9.557536762338786e-07, |
|
"loss": 0.003, |
|
"reward": 0.492939718067646, |
|
"reward_std": 0.2011387124657631, |
|
"rewards/code_reward": 0.39494864642620087, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 693.0536041259766, |
|
"epoch": 0.851380042462845, |
|
"grad_norm": 0.3889514207839966, |
|
"kl": 0.164306640625, |
|
"learning_rate": 9.46997266581973e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5752345323562622, |
|
"reward_std": 0.19828381016850471, |
|
"rewards/code_reward": 0.475680947303772, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 401 |
|
}, |
|
{ |
|
"completion_length": 706.841552734375, |
|
"epoch": 0.8535031847133758, |
|
"grad_norm": 4.799881458282471, |
|
"kl": 0.4912109375, |
|
"learning_rate": 9.383165073137115e-07, |
|
"loss": 0.0051, |
|
"reward": 0.5113906338810921, |
|
"reward_std": 0.14735013246536255, |
|
"rewards/code_reward": 0.41295309364795685, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 691.2210235595703, |
|
"epoch": 0.8556263269639066, |
|
"grad_norm": 3.541896104812622, |
|
"kl": 0.14697265625, |
|
"learning_rate": 9.297117626563687e-07, |
|
"loss": 0.0016, |
|
"reward": 0.6038797795772552, |
|
"reward_std": 0.18652482330799103, |
|
"rewards/code_reward": 0.5065583363175392, |
|
"rewards/format_reward": 0.973214328289032, |
|
"step": 403 |
|
}, |
|
{ |
|
"completion_length": 725.4933471679688, |
|
"epoch": 0.8577494692144374, |
|
"grad_norm": 158.1067352294922, |
|
"kl": 18.90283203125, |
|
"learning_rate": 9.211833936477957e-07, |
|
"loss": 0.1896, |
|
"reward": 0.5942443758249283, |
|
"reward_std": 0.12594054080545902, |
|
"rewards/code_reward": 0.4960300847887993, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 404 |
|
}, |
|
{ |
|
"completion_length": 717.1585235595703, |
|
"epoch": 0.8598726114649682, |
|
"grad_norm": 2265.1884765625, |
|
"kl": 230.10986328125, |
|
"learning_rate": 9.127317581212753e-07, |
|
"loss": 2.3015, |
|
"reward": 0.53834218531847, |
|
"reward_std": 0.1464555226266384, |
|
"rewards/code_reward": 0.4394582211971283, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 727.5826110839844, |
|
"epoch": 0.861995753715499, |
|
"grad_norm": 0.2873145341873169, |
|
"kl": 0.1866455078125, |
|
"learning_rate": 9.043572106905084e-07, |
|
"loss": 0.0019, |
|
"reward": 0.5367319211363792, |
|
"reward_std": 0.17168255895376205, |
|
"rewards/code_reward": 0.43851763010025024, |
|
"rewards/format_reward": 0.98214291036129, |
|
"step": 406 |
|
}, |
|
{ |
|
"completion_length": 726.1674499511719, |
|
"epoch": 0.8641188959660298, |
|
"grad_norm": 0.2757129371166229, |
|
"kl": 0.1365966796875, |
|
"learning_rate": 8.960601027347321e-07, |
|
"loss": 0.0014, |
|
"reward": 0.5360690876841545, |
|
"reward_std": 0.2111339271068573, |
|
"rewards/code_reward": 0.4367387220263481, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 407 |
|
}, |
|
{ |
|
"completion_length": 708.4241333007812, |
|
"epoch": 0.8662420382165605, |
|
"grad_norm": 1.7461967468261719, |
|
"kl": 0.15234375, |
|
"learning_rate": 8.878407823839788e-07, |
|
"loss": 0.0016, |
|
"reward": 0.4714769721031189, |
|
"reward_std": 0.17892321571707726, |
|
"rewards/code_reward": 0.3723698630928993, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 722.7344207763672, |
|
"epoch": 0.8683651804670913, |
|
"grad_norm": 1.359683632850647, |
|
"kl": 0.1497802734375, |
|
"learning_rate": 8.796995945044689e-07, |
|
"loss": 0.0017, |
|
"reward": 0.5647559985518456, |
|
"reward_std": 0.16498099640011787, |
|
"rewards/code_reward": 0.4656488224864006, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 409 |
|
}, |
|
{ |
|
"completion_length": 758.9174499511719, |
|
"epoch": 0.8704883227176221, |
|
"grad_norm": 0.34433820843696594, |
|
"kl": 0.12939453125, |
|
"learning_rate": 8.716368806841405e-07, |
|
"loss": 0.0013, |
|
"reward": 0.40852154791355133, |
|
"reward_std": 0.19776060804724693, |
|
"rewards/code_reward": 0.30919117480516434, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 730.169677734375, |
|
"epoch": 0.8726114649681529, |
|
"grad_norm": 0.43445339798927307, |
|
"kl": 0.132080078125, |
|
"learning_rate": 8.636529792183171e-07, |
|
"loss": 0.0014, |
|
"reward": 0.5396310985088348, |
|
"reward_std": 0.19617567211389542, |
|
"rewards/code_reward": 0.44097036868333817, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 411 |
|
}, |
|
{ |
|
"completion_length": 717.8705596923828, |
|
"epoch": 0.8747346072186837, |
|
"grad_norm": 0.5580800771713257, |
|
"kl": 0.192138671875, |
|
"learning_rate": 8.557482250955144e-07, |
|
"loss": 0.002, |
|
"reward": 0.4667212590575218, |
|
"reward_std": 0.20506682246923447, |
|
"rewards/code_reward": 0.36850695312023163, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 412 |
|
}, |
|
{ |
|
"completion_length": 701.482177734375, |
|
"epoch": 0.8768577494692145, |
|
"grad_norm": 0.33230528235435486, |
|
"kl": 0.150146484375, |
|
"learning_rate": 8.479229499833844e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5547576695680618, |
|
"reward_std": 0.21152211725711823, |
|
"rewards/code_reward": 0.4558737352490425, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 413 |
|
}, |
|
{ |
|
"completion_length": 704.0469055175781, |
|
"epoch": 0.8789808917197452, |
|
"grad_norm": 0.3372839093208313, |
|
"kl": 0.1534423828125, |
|
"learning_rate": 8.401774822147976e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5494333058595657, |
|
"reward_std": 0.24079465121030807, |
|
"rewards/code_reward": 0.4505493566393852, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 723.966552734375, |
|
"epoch": 0.881104033970276, |
|
"grad_norm": 0.4163219630718231, |
|
"kl": 0.26123046875, |
|
"learning_rate": 8.325121467740695e-07, |
|
"loss": 0.0026, |
|
"reward": 0.3951665982604027, |
|
"reward_std": 0.18642807379364967, |
|
"rewards/code_reward": 0.29628264531493187, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 736.9129943847656, |
|
"epoch": 0.8832271762208068, |
|
"grad_norm": 0.6581453084945679, |
|
"kl": 0.18310546875, |
|
"learning_rate": 8.249272652833226e-07, |
|
"loss": 0.0018, |
|
"reward": 0.4613909646868706, |
|
"reward_std": 0.14806298539042473, |
|
"rewards/code_reward": 0.3633998855948448, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 416 |
|
}, |
|
{ |
|
"completion_length": 712.9754791259766, |
|
"epoch": 0.8853503184713376, |
|
"grad_norm": 1.2168219089508057, |
|
"kl": 0.2080078125, |
|
"learning_rate": 8.174231559889931e-07, |
|
"loss": 0.0021, |
|
"reward": 0.44138607382774353, |
|
"reward_std": 0.22260471060872078, |
|
"rewards/code_reward": 0.34317177161574364, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 417 |
|
}, |
|
{ |
|
"completion_length": 711.1049346923828, |
|
"epoch": 0.8874734607218684, |
|
"grad_norm": 1.5622974634170532, |
|
"kl": 0.21630859375, |
|
"learning_rate": 8.100001337484787e-07, |
|
"loss": 0.0022, |
|
"reward": 0.5736604407429695, |
|
"reward_std": 0.20455688051879406, |
|
"rewards/code_reward": 0.4747764840722084, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 418 |
|
}, |
|
{ |
|
"completion_length": 729.3861999511719, |
|
"epoch": 0.8895966029723992, |
|
"grad_norm": 0.5059235095977783, |
|
"kl": 0.16796875, |
|
"learning_rate": 8.026585100169251e-07, |
|
"loss": 0.0017, |
|
"reward": 0.4245912581682205, |
|
"reward_std": 0.151387682184577, |
|
"rewards/code_reward": 0.32637695223093033, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 419 |
|
}, |
|
{ |
|
"completion_length": 690.7187652587891, |
|
"epoch": 0.89171974522293, |
|
"grad_norm": 6.739729881286621, |
|
"kl": 2.8837890625, |
|
"learning_rate": 7.953985928341601e-07, |
|
"loss": 0.0289, |
|
"reward": 0.5304828435182571, |
|
"reward_std": 0.157493332400918, |
|
"rewards/code_reward": 0.4313756823539734, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 694.5335083007812, |
|
"epoch": 0.8938428874734607, |
|
"grad_norm": 0.45631542801856995, |
|
"kl": 0.1708984375, |
|
"learning_rate": 7.882206868117693e-07, |
|
"loss": 0.0018, |
|
"reward": 0.4608374051749706, |
|
"reward_std": 0.1782052293419838, |
|
"rewards/code_reward": 0.36106058582663536, |
|
"rewards/format_reward": 0.9977678656578064, |
|
"step": 421 |
|
}, |
|
{ |
|
"completion_length": 733.1339721679688, |
|
"epoch": 0.8959660297239915, |
|
"grad_norm": 1.1783860921859741, |
|
"kl": 0.181640625, |
|
"learning_rate": 7.81125093120313e-07, |
|
"loss": 0.0019, |
|
"reward": 0.4884042590856552, |
|
"reward_std": 0.164920412003994, |
|
"rewards/code_reward": 0.3899667263031006, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 422 |
|
}, |
|
{ |
|
"completion_length": 694.8013610839844, |
|
"epoch": 0.8980891719745223, |
|
"grad_norm": 0.7121770977973938, |
|
"kl": 0.24853515625, |
|
"learning_rate": 7.741121094766916e-07, |
|
"loss": 0.0026, |
|
"reward": 0.5257243886590004, |
|
"reward_std": 0.15851808711886406, |
|
"rewards/code_reward": 0.426840465515852, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 423 |
|
}, |
|
{ |
|
"completion_length": 681.4174346923828, |
|
"epoch": 0.9002123142250531, |
|
"grad_norm": 0.739496648311615, |
|
"kl": 0.25439453125, |
|
"learning_rate": 7.671820301316532e-07, |
|
"loss": 0.0026, |
|
"reward": 0.4978240504860878, |
|
"reward_std": 0.17392848432064056, |
|
"rewards/code_reward": 0.39983299374580383, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 424 |
|
}, |
|
{ |
|
"completion_length": 727.6897583007812, |
|
"epoch": 0.9023354564755839, |
|
"grad_norm": 0.6177138090133667, |
|
"kl": 0.183349609375, |
|
"learning_rate": 7.603351458574474e-07, |
|
"loss": 0.0019, |
|
"reward": 0.44435514509677887, |
|
"reward_std": 0.13320972956717014, |
|
"rewards/code_reward": 0.34703367203474045, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 721.8772735595703, |
|
"epoch": 0.9044585987261147, |
|
"grad_norm": 1.0612621307373047, |
|
"kl": 0.2496337890625, |
|
"learning_rate": 7.535717439356255e-07, |
|
"loss": 0.0026, |
|
"reward": 0.4390544593334198, |
|
"reward_std": 0.15821044147014618, |
|
"rewards/code_reward": 0.3408401757478714, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 708.1295013427734, |
|
"epoch": 0.9065817409766455, |
|
"grad_norm": 0.3175060451030731, |
|
"kl": 0.15869140625, |
|
"learning_rate": 7.46892108144986e-07, |
|
"loss": 0.0017, |
|
"reward": 0.45070114731788635, |
|
"reward_std": 0.1746504958719015, |
|
"rewards/code_reward": 0.35181717574596405, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 427 |
|
}, |
|
{ |
|
"completion_length": 752.310302734375, |
|
"epoch": 0.9087048832271762, |
|
"grad_norm": 18.601308822631836, |
|
"kl": 3.44775390625, |
|
"learning_rate": 7.402965187496697e-07, |
|
"loss": 0.0348, |
|
"reward": 0.46990416944026947, |
|
"reward_std": 0.1597570963203907, |
|
"rewards/code_reward": 0.3723594844341278, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 428 |
|
}, |
|
{ |
|
"completion_length": 730.3460235595703, |
|
"epoch": 0.910828025477707, |
|
"grad_norm": 8.600378036499023, |
|
"kl": 1.468994140625, |
|
"learning_rate": 7.337852524873974e-07, |
|
"loss": 0.0148, |
|
"reward": 0.6117217838764191, |
|
"reward_std": 0.21035557612776756, |
|
"rewards/code_reward": 0.5126146152615547, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 429 |
|
}, |
|
{ |
|
"completion_length": 710.6138610839844, |
|
"epoch": 0.9129511677282378, |
|
"grad_norm": 0.4506695568561554, |
|
"kl": 0.20361328125, |
|
"learning_rate": 7.273585825578608e-07, |
|
"loss": 0.0022, |
|
"reward": 0.4428362399339676, |
|
"reward_std": 0.12803563103079796, |
|
"rewards/code_reward": 0.34372907504439354, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 658.6228179931641, |
|
"epoch": 0.9150743099787686, |
|
"grad_norm": 5.093682765960693, |
|
"kl": 0.5947265625, |
|
"learning_rate": 7.21016778611259e-07, |
|
"loss": 0.0061, |
|
"reward": 0.5427140817046165, |
|
"reward_std": 0.19685931131243706, |
|
"rewards/code_reward": 0.4447230063378811, |
|
"rewards/format_reward": 0.9799107760190964, |
|
"step": 431 |
|
}, |
|
{ |
|
"completion_length": 677.9040222167969, |
|
"epoch": 0.9171974522292994, |
|
"grad_norm": 38.870262145996094, |
|
"kl": 5.4326171875, |
|
"learning_rate": 7.147601067369835e-07, |
|
"loss": 0.0545, |
|
"reward": 0.5093298330903053, |
|
"reward_std": 0.19096140936017036, |
|
"rewards/code_reward": 0.41111550480127335, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 694.513427734375, |
|
"epoch": 0.9193205944798302, |
|
"grad_norm": 0.5155165195465088, |
|
"kl": 0.155029296875, |
|
"learning_rate": 7.085888294524561e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5259926542639732, |
|
"reward_std": 0.18491110764443874, |
|
"rewards/code_reward": 0.42733194679021835, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 433 |
|
}, |
|
{ |
|
"completion_length": 704.5580596923828, |
|
"epoch": 0.921443736730361, |
|
"grad_norm": 0.6282893419265747, |
|
"kl": 0.3359375, |
|
"learning_rate": 7.025032056921117e-07, |
|
"loss": 0.0034, |
|
"reward": 0.5899785161018372, |
|
"reward_std": 0.19566836208105087, |
|
"rewards/code_reward": 0.4913177192211151, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 434 |
|
}, |
|
{ |
|
"completion_length": 722.1562805175781, |
|
"epoch": 0.9235668789808917, |
|
"grad_norm": 1.048966884613037, |
|
"kl": 0.4742431640625, |
|
"learning_rate": 6.965034907965349e-07, |
|
"loss": 0.0049, |
|
"reward": 0.5559424459934235, |
|
"reward_std": 0.2080874666571617, |
|
"rewards/code_reward": 0.4588441997766495, |
|
"rewards/format_reward": 0.9709821939468384, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 679.9933319091797, |
|
"epoch": 0.9256900212314225, |
|
"grad_norm": 0.6251688599586487, |
|
"kl": 0.171142578125, |
|
"learning_rate": 6.905899365017462e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5245073512196541, |
|
"reward_std": 0.17461021803319454, |
|
"rewards/code_reward": 0.42606981843709946, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 436 |
|
}, |
|
{ |
|
"completion_length": 711.6942443847656, |
|
"epoch": 0.9278131634819533, |
|
"grad_norm": 1.1648685932159424, |
|
"kl": 0.299560546875, |
|
"learning_rate": 6.847627909286409e-07, |
|
"loss": 0.003, |
|
"reward": 0.41069934517145157, |
|
"reward_std": 0.17594012804329395, |
|
"rewards/code_reward": 0.31226181238889694, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 437 |
|
}, |
|
{ |
|
"completion_length": 702.3482513427734, |
|
"epoch": 0.9299363057324841, |
|
"grad_norm": 1.5311229228973389, |
|
"kl": 0.31640625, |
|
"learning_rate": 6.790222985725761e-07, |
|
"loss": 0.0033, |
|
"reward": 0.5770048946142197, |
|
"reward_std": 0.1962369978427887, |
|
"rewards/code_reward": 0.4790138080716133, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 683.4151916503906, |
|
"epoch": 0.9320594479830149, |
|
"grad_norm": 8.243356704711914, |
|
"kl": 3.05126953125, |
|
"learning_rate": 6.733687002931141e-07, |
|
"loss": 0.0306, |
|
"reward": 0.5087217092514038, |
|
"reward_std": 0.1651569865643978, |
|
"rewards/code_reward": 0.4109538644552231, |
|
"rewards/format_reward": 0.9776786267757416, |
|
"step": 439 |
|
}, |
|
{ |
|
"completion_length": 713.6049346923828, |
|
"epoch": 0.9341825902335457, |
|
"grad_norm": 1.4741530418395996, |
|
"kl": 0.967529296875, |
|
"learning_rate": 6.678022333039158e-07, |
|
"loss": 0.0098, |
|
"reward": 0.587900809943676, |
|
"reward_std": 0.16147084161639214, |
|
"rewards/code_reward": 0.4903561547398567, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 677.8303833007812, |
|
"epoch": 0.9363057324840764, |
|
"grad_norm": 0.3179962933063507, |
|
"kl": 0.230224609375, |
|
"learning_rate": 6.623231311627876e-07, |
|
"loss": 0.0025, |
|
"reward": 0.561469204723835, |
|
"reward_std": 0.16684554889798164, |
|
"rewards/code_reward": 0.4625852555036545, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 441 |
|
}, |
|
{ |
|
"completion_length": 725.1004791259766, |
|
"epoch": 0.9384288747346072, |
|
"grad_norm": 2.448838233947754, |
|
"kl": 1.276123046875, |
|
"learning_rate": 6.569316237618811e-07, |
|
"loss": 0.0127, |
|
"reward": 0.3736302964389324, |
|
"reward_std": 0.18804692663252354, |
|
"rewards/code_reward": 0.2751928083598614, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 442 |
|
}, |
|
{ |
|
"completion_length": 710.9286041259766, |
|
"epoch": 0.940552016985138, |
|
"grad_norm": 0.38171106576919556, |
|
"kl": 0.2259521484375, |
|
"learning_rate": 6.516279373180499e-07, |
|
"loss": 0.0024, |
|
"reward": 0.45342515781521797, |
|
"reward_std": 0.16657396219670773, |
|
"rewards/code_reward": 0.3540947772562504, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 443 |
|
}, |
|
{ |
|
"completion_length": 665.5625305175781, |
|
"epoch": 0.9426751592356688, |
|
"grad_norm": 0.5256981253623962, |
|
"kl": 0.63818359375, |
|
"learning_rate": 6.464122943633543e-07, |
|
"loss": 0.0066, |
|
"reward": 0.5117220133543015, |
|
"reward_std": 0.17998000979423523, |
|
"rewards/code_reward": 0.4126148596405983, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 669.888427734375, |
|
"epoch": 0.9447983014861996, |
|
"grad_norm": 10.391777038574219, |
|
"kl": 1.935546875, |
|
"learning_rate": 6.412849137357271e-07, |
|
"loss": 0.0195, |
|
"reward": 0.577217735350132, |
|
"reward_std": 0.18068324774503708, |
|
"rewards/code_reward": 0.47878019511699677, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 706.747802734375, |
|
"epoch": 0.9469214437367304, |
|
"grad_norm": 0.7888285517692566, |
|
"kl": 0.395263671875, |
|
"learning_rate": 6.3624601056979e-07, |
|
"loss": 0.0041, |
|
"reward": 0.5674577727913857, |
|
"reward_std": 0.14589250087738037, |
|
"rewards/code_reward": 0.469020277261734, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 446 |
|
}, |
|
{ |
|
"completion_length": 698.3861999511719, |
|
"epoch": 0.9490445859872612, |
|
"grad_norm": 0.5909515619277954, |
|
"kl": 0.4178466796875, |
|
"learning_rate": 6.312957962878278e-07, |
|
"loss": 0.0042, |
|
"reward": 0.44434136897325516, |
|
"reward_std": 0.1476050168275833, |
|
"rewards/code_reward": 0.3447878174483776, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 447 |
|
}, |
|
{ |
|
"completion_length": 697.1138763427734, |
|
"epoch": 0.9511677282377919, |
|
"grad_norm": 0.3049964904785156, |
|
"kl": 0.36083984375, |
|
"learning_rate": 6.264344785909181e-07, |
|
"loss": 0.0036, |
|
"reward": 0.5054452195763588, |
|
"reward_std": 0.16559578850865364, |
|
"rewards/code_reward": 0.40633804351091385, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 448 |
|
}, |
|
{ |
|
"completion_length": 699.5000457763672, |
|
"epoch": 0.9532908704883227, |
|
"grad_norm": 2.360261917114258, |
|
"kl": 1.0093994140625, |
|
"learning_rate": 6.216622614502149e-07, |
|
"loss": 0.0102, |
|
"reward": 0.43248920887708664, |
|
"reward_std": 0.20951998233795166, |
|
"rewards/code_reward": 0.3344981260597706, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 449 |
|
}, |
|
{ |
|
"completion_length": 711.5647735595703, |
|
"epoch": 0.9554140127388535, |
|
"grad_norm": 0.45019006729125977, |
|
"kl": 0.396728515625, |
|
"learning_rate": 6.169793450983916e-07, |
|
"loss": 0.0041, |
|
"reward": 0.4090769328176975, |
|
"reward_std": 0.1387995146214962, |
|
"rewards/code_reward": 0.30996978655457497, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 687.4710083007812, |
|
"epoch": 0.9575371549893843, |
|
"grad_norm": 1.139167070388794, |
|
"kl": 0.70947265625, |
|
"learning_rate": 6.123859260212393e-07, |
|
"loss": 0.0073, |
|
"reward": 0.6231836080551147, |
|
"reward_std": 0.18272383697330952, |
|
"rewards/code_reward": 0.5249693095684052, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 451 |
|
}, |
|
{ |
|
"completion_length": 664.4620666503906, |
|
"epoch": 0.9596602972399151, |
|
"grad_norm": 11.963510513305664, |
|
"kl": 2.9296875, |
|
"learning_rate": 6.07882196949423e-07, |
|
"loss": 0.0292, |
|
"reward": 0.5648458003997803, |
|
"reward_std": 0.21996057033538818, |
|
"rewards/code_reward": 0.4666314870119095, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 452 |
|
}, |
|
{ |
|
"completion_length": 672.8326110839844, |
|
"epoch": 0.9617834394904459, |
|
"grad_norm": 0.23568426072597504, |
|
"kl": 0.138427734375, |
|
"learning_rate": 6.034683468503948e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5011638775467873, |
|
"reward_std": 0.1840323582291603, |
|
"rewards/code_reward": 0.4020567089319229, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 453 |
|
}, |
|
{ |
|
"completion_length": 692.2835083007812, |
|
"epoch": 0.9639065817409767, |
|
"grad_norm": 1.3131980895996094, |
|
"kl": 0.73291015625, |
|
"learning_rate": 5.991445609204641e-07, |
|
"loss": 0.0073, |
|
"reward": 0.49861256778240204, |
|
"reward_std": 0.19007166847586632, |
|
"rewards/code_reward": 0.4006215110421181, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 454 |
|
}, |
|
{ |
|
"completion_length": 680.5536041259766, |
|
"epoch": 0.9660297239915074, |
|
"grad_norm": 1.0419756174087524, |
|
"kl": 0.8876953125, |
|
"learning_rate": 5.949110205770292e-07, |
|
"loss": 0.009, |
|
"reward": 0.5448554530739784, |
|
"reward_std": 0.19055398926138878, |
|
"rewards/code_reward": 0.44686436653137207, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 693.2187805175781, |
|
"epoch": 0.9681528662420382, |
|
"grad_norm": 0.7175102233886719, |
|
"kl": 0.481201171875, |
|
"learning_rate": 5.90767903450964e-07, |
|
"loss": 0.0049, |
|
"reward": 0.4721348285675049, |
|
"reward_std": 0.14595188200473785, |
|
"rewards/code_reward": 0.37302765995264053, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 456 |
|
}, |
|
{ |
|
"completion_length": 695.2723541259766, |
|
"epoch": 0.970276008492569, |
|
"grad_norm": 0.4296894371509552, |
|
"kl": 0.26416015625, |
|
"learning_rate": 5.867153833791652e-07, |
|
"loss": 0.0027, |
|
"reward": 0.6006196290254593, |
|
"reward_std": 0.17042616941034794, |
|
"rewards/code_reward": 0.5019589066505432, |
|
"rewards/format_reward": 0.9866071790456772, |
|
"step": 457 |
|
}, |
|
{ |
|
"completion_length": 692.6652221679688, |
|
"epoch": 0.9723991507430998, |
|
"grad_norm": 0.4421376585960388, |
|
"kl": 0.31982421875, |
|
"learning_rate": 5.827536303972587e-07, |
|
"loss": 0.0033, |
|
"reward": 0.5808815285563469, |
|
"reward_std": 0.2226531021296978, |
|
"rewards/code_reward": 0.4815511405467987, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 458 |
|
}, |
|
{ |
|
"completion_length": 679.247802734375, |
|
"epoch": 0.9745222929936306, |
|
"grad_norm": 0.40139421820640564, |
|
"kl": 0.47216796875, |
|
"learning_rate": 5.78882810732465e-07, |
|
"loss": 0.0048, |
|
"reward": 0.5371674299240112, |
|
"reward_std": 0.22804895788431168, |
|
"rewards/code_reward": 0.43962281197309494, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 459 |
|
}, |
|
{ |
|
"completion_length": 706.5379638671875, |
|
"epoch": 0.9766454352441614, |
|
"grad_norm": 0.8857892751693726, |
|
"kl": 0.51220703125, |
|
"learning_rate": 5.75103086796625e-07, |
|
"loss": 0.0052, |
|
"reward": 0.4905061312019825, |
|
"reward_std": 0.1841362752020359, |
|
"rewards/code_reward": 0.39206862077116966, |
|
"rewards/format_reward": 0.9843750447034836, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 690.3415374755859, |
|
"epoch": 0.9787685774946921, |
|
"grad_norm": 0.6516154408454895, |
|
"kl": 0.439697265625, |
|
"learning_rate": 5.714146171793846e-07, |
|
"loss": 0.0045, |
|
"reward": 0.5876915380358696, |
|
"reward_std": 0.15721704810857773, |
|
"rewards/code_reward": 0.4892539754509926, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 461 |
|
}, |
|
{ |
|
"completion_length": 681.5156402587891, |
|
"epoch": 0.9808917197452229, |
|
"grad_norm": 0.618622362613678, |
|
"kl": 0.48046875, |
|
"learning_rate": 5.678175566415422e-07, |
|
"loss": 0.0048, |
|
"reward": 0.49158109724521637, |
|
"reward_std": 0.1944441720843315, |
|
"rewards/code_reward": 0.39381323754787445, |
|
"rewards/format_reward": 0.9776786118745804, |
|
"step": 462 |
|
}, |
|
{ |
|
"completion_length": 722.8750305175781, |
|
"epoch": 0.9830148619957537, |
|
"grad_norm": 0.7218803763389587, |
|
"kl": 0.565185546875, |
|
"learning_rate": 5.643120561085528e-07, |
|
"loss": 0.0057, |
|
"reward": 0.4738345965743065, |
|
"reward_std": 0.24430794268846512, |
|
"rewards/code_reward": 0.37651316076517105, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 463 |
|
}, |
|
{ |
|
"completion_length": 682.4219055175781, |
|
"epoch": 0.9851380042462845, |
|
"grad_norm": 0.7259976863861084, |
|
"kl": 0.706298828125, |
|
"learning_rate": 5.608982626641991e-07, |
|
"loss": 0.0071, |
|
"reward": 0.47413645684719086, |
|
"reward_std": 0.21057153865695, |
|
"rewards/code_reward": 0.3761453852057457, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 464 |
|
}, |
|
{ |
|
"completion_length": 716.5736999511719, |
|
"epoch": 0.9872611464968153, |
|
"grad_norm": 0.2483934909105301, |
|
"kl": 0.260009765625, |
|
"learning_rate": 5.575763195444166e-07, |
|
"loss": 0.0027, |
|
"reward": 0.5671171024441719, |
|
"reward_std": 0.19927529990673065, |
|
"rewards/code_reward": 0.46912601590156555, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 680.6339416503906, |
|
"epoch": 0.9893842887473461, |
|
"grad_norm": 1.9431463479995728, |
|
"kl": 1.3564453125, |
|
"learning_rate": 5.543463661312847e-07, |
|
"loss": 0.0136, |
|
"reward": 0.417750583961606, |
|
"reward_std": 0.13368695229291916, |
|
"rewards/code_reward": 0.3197594955563545, |
|
"rewards/format_reward": 0.9799107611179352, |
|
"step": 466 |
|
}, |
|
{ |
|
"completion_length": 684.9911041259766, |
|
"epoch": 0.9915074309978769, |
|
"grad_norm": 0.8100730776786804, |
|
"kl": 0.4619140625, |
|
"learning_rate": 5.512085379471808e-07, |
|
"loss": 0.0048, |
|
"reward": 0.6499997675418854, |
|
"reward_std": 0.20544839650392532, |
|
"rewards/code_reward": 0.5511157959699631, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 467 |
|
}, |
|
{ |
|
"completion_length": 681.5670013427734, |
|
"epoch": 0.9936305732484076, |
|
"grad_norm": 3.3746109008789062, |
|
"kl": 1.414306640625, |
|
"learning_rate": 5.481629666490903e-07, |
|
"loss": 0.0142, |
|
"reward": 0.5468520447611809, |
|
"reward_std": 0.21051420643925667, |
|
"rewards/code_reward": 0.44774486869573593, |
|
"rewards/format_reward": 0.9910714477300644, |
|
"step": 468 |
|
}, |
|
{ |
|
"completion_length": 688.2388610839844, |
|
"epoch": 0.9957537154989384, |
|
"grad_norm": 1.0109045505523682, |
|
"kl": 1.15380859375, |
|
"learning_rate": 5.452097800230853e-07, |
|
"loss": 0.0116, |
|
"reward": 0.6098516285419464, |
|
"reward_std": 0.211056686937809, |
|
"rewards/code_reward": 0.5116373002529144, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 469 |
|
}, |
|
{ |
|
"completion_length": 681.9821624755859, |
|
"epoch": 0.9978768577494692, |
|
"grad_norm": 0.7048155665397644, |
|
"kl": 0.809814453125, |
|
"learning_rate": 5.423491019789623e-07, |
|
"loss": 0.0082, |
|
"reward": 0.45874594151973724, |
|
"reward_std": 0.14819572865962982, |
|
"rewards/code_reward": 0.3596387729048729, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 707.5000457763672, |
|
"epoch": 1.0, |
|
"grad_norm": 2.86737060546875, |
|
"kl": 1.1783447265625, |
|
"learning_rate": 5.395810525450425e-07, |
|
"loss": 0.0118, |
|
"reward": 0.5169450491666794, |
|
"reward_std": 0.18883745186030865, |
|
"rewards/code_reward": 0.41850756853818893, |
|
"rewards/format_reward": 0.9843750298023224, |
|
"step": 471 |
|
}, |
|
{ |
|
"completion_length": 677.5982360839844, |
|
"epoch": 1.0021231422505308, |
|
"grad_norm": 1.9091771841049194, |
|
"kl": 1.307861328125, |
|
"learning_rate": 5.369057478631359e-07, |
|
"loss": 0.0132, |
|
"reward": 0.5092417150735855, |
|
"reward_std": 0.18099428340792656, |
|
"rewards/code_reward": 0.4110274314880371, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 472 |
|
}, |
|
{ |
|
"completion_length": 711.9843902587891, |
|
"epoch": 1.0042462845010616, |
|
"grad_norm": 1.6537326574325562, |
|
"kl": 1.21533203125, |
|
"learning_rate": 5.343233001836694e-07, |
|
"loss": 0.0122, |
|
"reward": 0.48672058433294296, |
|
"reward_std": 0.19152027182281017, |
|
"rewards/code_reward": 0.38939911872148514, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 473 |
|
}, |
|
{ |
|
"completion_length": 707.3102874755859, |
|
"epoch": 1.0063694267515924, |
|
"grad_norm": 0.8889822959899902, |
|
"kl": 0.529541015625, |
|
"learning_rate": 5.318338178609754e-07, |
|
"loss": 0.0054, |
|
"reward": 0.5736411809921265, |
|
"reward_std": 0.17692103423178196, |
|
"rewards/code_reward": 0.4749804362654686, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 474 |
|
}, |
|
{ |
|
"completion_length": 736.5714721679688, |
|
"epoch": 1.0084925690021231, |
|
"grad_norm": 1.3358269929885864, |
|
"kl": 0.98828125, |
|
"learning_rate": 5.294374053487459e-07, |
|
"loss": 0.0099, |
|
"reward": 0.44529393315315247, |
|
"reward_std": 0.17480986192822456, |
|
"rewards/code_reward": 0.3468564301729202, |
|
"rewards/format_reward": 0.9843750596046448, |
|
"step": 475 |
|
}, |
|
{ |
|
"completion_length": 711.4241485595703, |
|
"epoch": 1.010615711252654, |
|
"grad_norm": 1.4289793968200684, |
|
"kl": 1.22998046875, |
|
"learning_rate": 5.271341631956511e-07, |
|
"loss": 0.0123, |
|
"reward": 0.5166614726185799, |
|
"reward_std": 0.1912681832909584, |
|
"rewards/code_reward": 0.42000964283943176, |
|
"rewards/format_reward": 0.96651791036129, |
|
"step": 476 |
|
}, |
|
{ |
|
"completion_length": 695.966552734375, |
|
"epoch": 1.0127388535031847, |
|
"grad_norm": 1.04447340965271, |
|
"kl": 0.756103515625, |
|
"learning_rate": 5.249241880411181e-07, |
|
"loss": 0.0076, |
|
"reward": 0.5925345048308372, |
|
"reward_std": 0.20060284808278084, |
|
"rewards/code_reward": 0.4952130541205406, |
|
"rewards/format_reward": 0.9732143133878708, |
|
"step": 477 |
|
}, |
|
{ |
|
"completion_length": 694.6986846923828, |
|
"epoch": 1.0148619957537155, |
|
"grad_norm": 0.8483067750930786, |
|
"kl": 0.3671875, |
|
"learning_rate": 5.228075726112785e-07, |
|
"loss": 0.0039, |
|
"reward": 0.5394521579146385, |
|
"reward_std": 0.12158003821969032, |
|
"rewards/code_reward": 0.44079139083623886, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 478 |
|
}, |
|
{ |
|
"completion_length": 708.294677734375, |
|
"epoch": 1.0169851380042463, |
|
"grad_norm": 2.820655584335327, |
|
"kl": 2.081787109375, |
|
"learning_rate": 5.207844057150768e-07, |
|
"loss": 0.0209, |
|
"reward": 0.530554287135601, |
|
"reward_std": 0.18540234863758087, |
|
"rewards/code_reward": 0.4339024946093559, |
|
"rewards/format_reward": 0.9665178954601288, |
|
"step": 479 |
|
}, |
|
{ |
|
"completion_length": 717.2143249511719, |
|
"epoch": 1.019108280254777, |
|
"grad_norm": 0.23074620962142944, |
|
"kl": 0.481689453125, |
|
"learning_rate": 5.188547722405437e-07, |
|
"loss": 0.005, |
|
"reward": 0.6097277328372002, |
|
"reward_std": 0.2097402885556221, |
|
"rewards/code_reward": 0.5108437687158585, |
|
"rewards/format_reward": 0.988839328289032, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 673.4553833007812, |
|
"epoch": 1.0212314225053079, |
|
"grad_norm": 18.151901245117188, |
|
"kl": 6.5419921875, |
|
"learning_rate": 5.170187531512351e-07, |
|
"loss": 0.0654, |
|
"reward": 0.4982636645436287, |
|
"reward_std": 0.18235952779650688, |
|
"rewards/code_reward": 0.4000493362545967, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 481 |
|
}, |
|
{ |
|
"completion_length": 657.0223541259766, |
|
"epoch": 1.0233545647558386, |
|
"grad_norm": 1.2006980180740356, |
|
"kl": 0.98583984375, |
|
"learning_rate": 5.152764254828348e-07, |
|
"loss": 0.0101, |
|
"reward": 0.6024035438895226, |
|
"reward_std": 0.18741042539477348, |
|
"rewards/code_reward": 0.5044124275445938, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 482 |
|
}, |
|
{ |
|
"completion_length": 679.5446624755859, |
|
"epoch": 1.0254777070063694, |
|
"grad_norm": 6.640290260314941, |
|
"kl": 2.26318359375, |
|
"learning_rate": 5.136278623399225e-07, |
|
"loss": 0.0229, |
|
"reward": 0.6333309859037399, |
|
"reward_std": 0.16317120380699635, |
|
"rewards/code_reward": 0.5337774083018303, |
|
"rewards/format_reward": 0.9955357313156128, |
|
"step": 483 |
|
}, |
|
{ |
|
"completion_length": 690.9464721679688, |
|
"epoch": 1.0276008492569002, |
|
"grad_norm": 1.2075289487838745, |
|
"kl": 0.635498046875, |
|
"learning_rate": 5.120731328929058e-07, |
|
"loss": 0.0065, |
|
"reward": 0.6160075142979622, |
|
"reward_std": 0.18631838634610176, |
|
"rewards/code_reward": 0.516677126288414, |
|
"rewards/format_reward": 0.9933035969734192, |
|
"step": 484 |
|
}, |
|
{ |
|
"completion_length": 711.2254791259766, |
|
"epoch": 1.029723991507431, |
|
"grad_norm": 0.6530643105506897, |
|
"kl": 0.88671875, |
|
"learning_rate": 5.106123023751187e-07, |
|
"loss": 0.009, |
|
"reward": 0.5319265574216843, |
|
"reward_std": 0.16448520869016647, |
|
"rewards/code_reward": 0.43304260820150375, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 485 |
|
}, |
|
{ |
|
"completion_length": 697.0067291259766, |
|
"epoch": 1.0318471337579618, |
|
"grad_norm": 0.7889028787612915, |
|
"kl": 0.486083984375, |
|
"learning_rate": 5.092454320800833e-07, |
|
"loss": 0.0049, |
|
"reward": 0.5322659835219383, |
|
"reward_std": 0.2203526459634304, |
|
"rewards/code_reward": 0.4340517073869705, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 486 |
|
}, |
|
{ |
|
"completion_length": 707.5915374755859, |
|
"epoch": 1.0339702760084926, |
|
"grad_norm": 1.0391658544540405, |
|
"kl": 1.347412109375, |
|
"learning_rate": 5.079725793589405e-07, |
|
"loss": 0.0136, |
|
"reward": 0.5818885043263435, |
|
"reward_std": 0.18653497844934464, |
|
"rewards/code_reward": 0.4838974103331566, |
|
"rewards/format_reward": 0.979910746216774, |
|
"step": 487 |
|
}, |
|
{ |
|
"completion_length": 681.9509124755859, |
|
"epoch": 1.0360934182590233, |
|
"grad_norm": 1.4511501789093018, |
|
"kl": 0.91943359375, |
|
"learning_rate": 5.067937976180407e-07, |
|
"loss": 0.0092, |
|
"reward": 0.20120449364185333, |
|
"reward_std": 0.06054047856014222, |
|
"rewards/code_reward": 0.10365983843803406, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 488 |
|
}, |
|
{ |
|
"completion_length": 696.1696624755859, |
|
"epoch": 1.0382165605095541, |
|
"grad_norm": 1.0735193490982056, |
|
"kl": 0.93994140625, |
|
"learning_rate": 5.057091363167046e-07, |
|
"loss": 0.0095, |
|
"reward": 0.41191001795232296, |
|
"reward_std": 0.11514822754543275, |
|
"rewards/code_reward": 0.31324928998947144, |
|
"rewards/format_reward": 0.9866071939468384, |
|
"step": 489 |
|
}, |
|
{ |
|
"completion_length": 721.6607513427734, |
|
"epoch": 1.040339702760085, |
|
"grad_norm": 1.8616191148757935, |
|
"kl": 1.632080078125, |
|
"learning_rate": 5.047186409651489e-07, |
|
"loss": 0.0165, |
|
"reward": 0.5570781454443932, |
|
"reward_std": 0.17984510958194733, |
|
"rewards/code_reward": 0.45886383950710297, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 674.4308319091797, |
|
"epoch": 1.0424628450106157, |
|
"grad_norm": 2.0347139835357666, |
|
"kl": 1.854736328125, |
|
"learning_rate": 5.038223531225742e-07, |
|
"loss": 0.0186, |
|
"reward": 0.4472319483757019, |
|
"reward_std": 0.20929547771811485, |
|
"rewards/code_reward": 0.3496873155236244, |
|
"rewards/format_reward": 0.9754464626312256, |
|
"step": 491 |
|
}, |
|
{ |
|
"completion_length": 684.2701263427734, |
|
"epoch": 1.0445859872611465, |
|
"grad_norm": 0.47032228112220764, |
|
"kl": 0.452392578125, |
|
"learning_rate": 5.030203103954232e-07, |
|
"loss": 0.0046, |
|
"reward": 0.6024687513709068, |
|
"reward_std": 0.20001190528273582, |
|
"rewards/code_reward": 0.5038080215454102, |
|
"rewards/format_reward": 0.986607164144516, |
|
"step": 492 |
|
}, |
|
{ |
|
"completion_length": 749.888427734375, |
|
"epoch": 1.0467091295116773, |
|
"grad_norm": 1.3009785413742065, |
|
"kl": 0.7442626953125, |
|
"learning_rate": 5.023125464358026e-07, |
|
"loss": 0.0075, |
|
"reward": 0.4289785400032997, |
|
"reward_std": 0.19978297501802444, |
|
"rewards/code_reward": 0.3307642340660095, |
|
"rewards/format_reward": 0.9821428805589676, |
|
"step": 493 |
|
}, |
|
{ |
|
"completion_length": 707.247802734375, |
|
"epoch": 1.048832271762208, |
|
"grad_norm": 2.7150216102600098, |
|
"kl": 1.939453125, |
|
"learning_rate": 5.016990909400709e-07, |
|
"loss": 0.0195, |
|
"reward": 0.48099584877491, |
|
"reward_std": 0.17564579099416733, |
|
"rewards/code_reward": 0.3834511674940586, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 494 |
|
}, |
|
{ |
|
"completion_length": 712.8861999511719, |
|
"epoch": 1.0509554140127388, |
|
"grad_norm": 1.259710431098938, |
|
"kl": 1.7119140625, |
|
"learning_rate": 5.011799696475915e-07, |
|
"loss": 0.0172, |
|
"reward": 0.5863819345831871, |
|
"reward_std": 0.17422104254364967, |
|
"rewards/code_reward": 0.48883724212646484, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 495 |
|
}, |
|
{ |
|
"completion_length": 670.3125305175781, |
|
"epoch": 1.0530785562632696, |
|
"grad_norm": 1.799985408782959, |
|
"kl": 1.3544921875, |
|
"learning_rate": 5.007552043396547e-07, |
|
"loss": 0.0137, |
|
"reward": 0.6773558109998703, |
|
"reward_std": 0.21710924059152603, |
|
"rewards/code_reward": 0.578471876680851, |
|
"rewards/format_reward": 0.9888393133878708, |
|
"step": 496 |
|
}, |
|
{ |
|
"completion_length": 647.8727874755859, |
|
"epoch": 1.0552016985138004, |
|
"grad_norm": 1.1735022068023682, |
|
"kl": 0.455810546875, |
|
"learning_rate": 5.004248128385618e-07, |
|
"loss": 0.0047, |
|
"reward": 0.6235345751047134, |
|
"reward_std": 0.20276143215596676, |
|
"rewards/code_reward": 0.5259898751974106, |
|
"rewards/format_reward": 0.9754464775323868, |
|
"step": 497 |
|
}, |
|
{ |
|
"completion_length": 720.7388610839844, |
|
"epoch": 1.0573248407643312, |
|
"grad_norm": 1.039088487625122, |
|
"kl": 1.011474609375, |
|
"learning_rate": 5.001888090068784e-07, |
|
"loss": 0.0102, |
|
"reward": 0.5388440862298012, |
|
"reward_std": 0.1800019945949316, |
|
"rewards/code_reward": 0.4397369250655174, |
|
"rewards/format_reward": 0.9910714626312256, |
|
"step": 498 |
|
}, |
|
{ |
|
"completion_length": 737.9375305175781, |
|
"epoch": 1.059447983014862, |
|
"grad_norm": 2.2365331649780273, |
|
"kl": 0.947998046875, |
|
"learning_rate": 5.000472027468528e-07, |
|
"loss": 0.0095, |
|
"reward": 0.5870940536260605, |
|
"reward_std": 0.17025620490312576, |
|
"rewards/code_reward": 0.4893261566758156, |
|
"rewards/format_reward": 0.9776785969734192, |
|
"step": 499 |
|
}, |
|
{ |
|
"completion_length": 686.7678833007812, |
|
"epoch": 1.0615711252653928, |
|
"grad_norm": 11.270224571228027, |
|
"kl": 3.646484375, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.0367, |
|
"reward": 0.2899981178343296, |
|
"reward_std": 0.10342313535511494, |
|
"rewards/code_reward": 0.1917838342487812, |
|
"rewards/format_reward": 0.9821428954601288, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0615711252653928, |
|
"step": 500, |
|
"total_flos": 0.0, |
|
"train_loss": 0.008756054809940243, |
|
"train_runtime": 191583.7312, |
|
"train_samples_per_second": 1.169, |
|
"train_steps_per_second": 0.003 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 250, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|