{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9965010496850945,
"eval_steps": 100,
"global_step": 178,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 548.2455501556396,
"epoch": 0.005598320503848845,
"grad_norm": 0.003676735097542405,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03125000128056854,
"reward_std": 0.035294653847813606,
"rewards/accuracy_reward": 0.03125000128056854,
"step": 1
},
{
"completion_length": 521.3355770111084,
"epoch": 0.01119664100769769,
"grad_norm": 0.001673316117376089,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.005952381179668009,
"reward_std": 0.008138235658407211,
"rewards/accuracy_reward": 0.005952381179668009,
"step": 2
},
{
"completion_length": 565.3147449493408,
"epoch": 0.016794961511546535,
"grad_norm": 0.002517723012715578,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.009672619285993278,
"reward_std": 0.01636804547160864,
"rewards/accuracy_reward": 0.009672619285993278,
"step": 3
},
{
"completion_length": 577.5029792785645,
"epoch": 0.02239328201539538,
"grad_norm": 0.003224034095183015,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.029761905316263437,
"reward_std": 0.02953372267074883,
"rewards/accuracy_reward": 0.029761905316263437,
"step": 4
},
{
"completion_length": 564.710578918457,
"epoch": 0.02799160251924423,
"grad_norm": 0.0038144055288285017,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.020833333779592067,
"reward_std": 0.022426264360547066,
"rewards/accuracy_reward": 0.020833333779592067,
"step": 5
},
{
"completion_length": 534.8861694335938,
"epoch": 0.03358992302309307,
"grad_norm": 0.003479737089946866,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.026785714842844754,
"reward_std": 0.031758472323417664,
"rewards/accuracy_reward": 0.026785714842844754,
"step": 6
},
{
"completion_length": 580.7991256713867,
"epoch": 0.03918824352694192,
"grad_norm": 0.0025944788940250874,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.014880952483508736,
"reward_std": 0.010090996511280537,
"rewards/accuracy_reward": 0.014880952483508736,
"step": 7
},
{
"completion_length": 515.3296279907227,
"epoch": 0.04478656403079076,
"grad_norm": 0.003000692930072546,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.021577381354290992,
"reward_std": 0.019644177984446287,
"rewards/accuracy_reward": 0.021577381354290992,
"step": 8
},
{
"completion_length": 571.0290279388428,
"epoch": 0.05038488453463961,
"grad_norm": 0.003332852851599455,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02901785750873387,
"reward_std": 0.026531722396612167,
"rewards/accuracy_reward": 0.02901785750873387,
"step": 9
},
{
"completion_length": 580.1770935058594,
"epoch": 0.05598320503848846,
"grad_norm": 0.0018766584107652307,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0074404762126505375,
"reward_std": 0.003475441597402096,
"rewards/accuracy_reward": 0.0074404762126505375,
"step": 10
},
{
"completion_length": 576.3244209289551,
"epoch": 0.0615815255423373,
"grad_norm": 0.004447213374078274,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03497023892123252,
"reward_std": 0.03141464572399855,
"rewards/accuracy_reward": 0.03497023892123252,
"step": 11
},
{
"completion_length": 555.8616218566895,
"epoch": 0.06717984604618614,
"grad_norm": 0.003440001280978322,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.033482144062872976,
"reward_std": 0.03174867480993271,
"rewards/accuracy_reward": 0.033482144062872976,
"step": 12
},
{
"completion_length": 555.1770973205566,
"epoch": 0.072778166550035,
"grad_norm": 0.004410188645124435,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02604166732635349,
"reward_std": 0.03677397267892957,
"rewards/accuracy_reward": 0.02604166732635349,
"step": 13
},
{
"completion_length": 582.8259029388428,
"epoch": 0.07837648705388384,
"grad_norm": 0.0032948441803455353,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.019345238630194217,
"reward_std": 0.02020683465525508,
"rewards/accuracy_reward": 0.019345238630194217,
"step": 14
},
{
"completion_length": 573.2150402069092,
"epoch": 0.08397480755773268,
"grad_norm": 0.004153064452111721,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02083333395421505,
"reward_std": 0.024796947836875916,
"rewards/accuracy_reward": 0.02083333395421505,
"step": 15
},
{
"completion_length": 511.9151840209961,
"epoch": 0.08957312806158152,
"grad_norm": 0.0052322824485599995,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04017857229337096,
"reward_std": 0.04290085076354444,
"rewards/accuracy_reward": 0.04017857229337096,
"step": 16
},
{
"completion_length": 538.8735218048096,
"epoch": 0.09517144856543037,
"grad_norm": 0.005073050037026405,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0349702388048172,
"reward_std": 0.03157654171809554,
"rewards/accuracy_reward": 0.0349702388048172,
"step": 17
},
{
"completion_length": 594.4494113922119,
"epoch": 0.10076976906927922,
"grad_norm": 0.004708003718405962,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.030505953240208328,
"reward_std": 0.035523281432688236,
"rewards/accuracy_reward": 0.030505953240208328,
"step": 18
},
{
"completion_length": 557.1242733001709,
"epoch": 0.10636808957312806,
"grad_norm": 0.0028041391633450985,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02827381028328091,
"reward_std": 0.01928615104407072,
"rewards/accuracy_reward": 0.02827381028328091,
"step": 19
},
{
"completion_length": 560.6994209289551,
"epoch": 0.11196641007697691,
"grad_norm": 0.004033979959785938,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.029761905258055776,
"reward_std": 0.03157439874485135,
"rewards/accuracy_reward": 0.029761905258055776,
"step": 20
},
{
"completion_length": 565.3564147949219,
"epoch": 0.11756473058082575,
"grad_norm": 0.0020209557842463255,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.004464285797439516,
"reward_std": 0.00787156680598855,
"rewards/accuracy_reward": 0.004464285797439516,
"step": 21
},
{
"completion_length": 618.4695091247559,
"epoch": 0.1231630510846746,
"grad_norm": 0.002670794492587447,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.021577381703536958,
"reward_std": 0.01789072621613741,
"rewards/accuracy_reward": 0.021577381703536958,
"step": 22
},
{
"completion_length": 584.6480770111084,
"epoch": 0.12876137158852344,
"grad_norm": 0.005253300536423922,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03720238187815994,
"reward_std": 0.03154852241277695,
"rewards/accuracy_reward": 0.03720238187815994,
"step": 23
},
{
"completion_length": 581.5640029907227,
"epoch": 0.13435969209237228,
"grad_norm": 0.0021275475155562162,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.005952381179668009,
"reward_std": 0.008138235658407211,
"rewards/accuracy_reward": 0.005952381179668009,
"step": 24
},
{
"completion_length": 545.2701072692871,
"epoch": 0.13995801259622112,
"grad_norm": 0.002449671970680356,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.015625000465661287,
"reward_std": 0.017629378009587526,
"rewards/accuracy_reward": 0.015625000465661287,
"step": 25
},
{
"completion_length": 545.0669708251953,
"epoch": 0.14555633310007,
"grad_norm": 0.0028808764182031155,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0200892859720625,
"reward_std": 0.018164015375077724,
"rewards/accuracy_reward": 0.0200892859720625,
"step": 26
},
{
"completion_length": 561.1131038665771,
"epoch": 0.15115465360391883,
"grad_norm": 0.004521294496953487,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.043898809934034944,
"reward_std": 0.027857428416609764,
"rewards/accuracy_reward": 0.043898809934034944,
"step": 27
},
{
"completion_length": 573.0468807220459,
"epoch": 0.15675297410776767,
"grad_norm": 0.004434187430888414,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03050595335662365,
"reward_std": 0.03036304796114564,
"rewards/accuracy_reward": 0.03050595335662365,
"step": 28
},
{
"completion_length": 573.6540298461914,
"epoch": 0.16235129461161651,
"grad_norm": 0.0023913481272757053,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03422619198681787,
"reward_std": 0.01964203454554081,
"rewards/accuracy_reward": 0.03422619198681787,
"step": 29
},
{
"completion_length": 582.3474769592285,
"epoch": 0.16794961511546536,
"grad_norm": 0.0027055193204432726,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.027529762592166662,
"reward_std": 0.02218207810074091,
"rewards/accuracy_reward": 0.027529762592166662,
"step": 30
},
{
"completion_length": 583.9047756195068,
"epoch": 0.1735479356193142,
"grad_norm": 0.003626331454142928,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03720238181995228,
"reward_std": 0.028277710545808077,
"rewards/accuracy_reward": 0.03720238181995228,
"step": 31
},
{
"completion_length": 576.6346855163574,
"epoch": 0.17914625612316304,
"grad_norm": 0.0032239772845059633,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.015625000291038305,
"reward_std": 0.022000661585479975,
"rewards/accuracy_reward": 0.015625000291038305,
"step": 32
},
{
"completion_length": 568.7916736602783,
"epoch": 0.1847445766270119,
"grad_norm": 0.0029333089478313923,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.025297619693446904,
"reward_std": 0.020201513543725014,
"rewards/accuracy_reward": 0.025297619693446904,
"step": 33
},
{
"completion_length": 555.8154830932617,
"epoch": 0.19034289713086075,
"grad_norm": 0.003082014387473464,
"learning_rate": 3e-07,
"loss": -0.0,
"reward": 0.01264880975941196,
"reward_std": 0.019351367838680744,
"rewards/accuracy_reward": 0.01264880975941196,
"step": 34
},
{
"completion_length": 593.186767578125,
"epoch": 0.1959412176347096,
"grad_norm": 0.003542139893397689,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.029017857741564512,
"reward_std": 0.03906076308339834,
"rewards/accuracy_reward": 0.029017857741564512,
"step": 35
},
{
"completion_length": 539.3273983001709,
"epoch": 0.20153953813855843,
"grad_norm": 0.0035734642297029495,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04092262004269287,
"reward_std": 0.02163945697247982,
"rewards/accuracy_reward": 0.04092262004269287,
"step": 36
},
{
"completion_length": 528.0818519592285,
"epoch": 0.20713785864240727,
"grad_norm": 0.004155021160840988,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04241071513388306,
"reward_std": 0.031058762688189745,
"rewards/accuracy_reward": 0.04241071513388306,
"step": 37
},
{
"completion_length": 572.0044708251953,
"epoch": 0.21273617914625612,
"grad_norm": 0.0043424940668046474,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.031250000349245965,
"reward_std": 0.03391032665967941,
"rewards/accuracy_reward": 0.031250000349245965,
"step": 38
},
{
"completion_length": 561.321439743042,
"epoch": 0.21833449965010496,
"grad_norm": 0.0013517189072445035,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.014136905316263437,
"reward_std": 0.004597577266395092,
"rewards/accuracy_reward": 0.014136905316263437,
"step": 39
},
{
"completion_length": 604.6815567016602,
"epoch": 0.22393282015395383,
"grad_norm": 0.004611727315932512,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04464285826543346,
"reward_std": 0.042770151514559984,
"rewards/accuracy_reward": 0.04464285826543346,
"step": 40
},
{
"completion_length": 591.9910755157471,
"epoch": 0.22953114065780267,
"grad_norm": 0.003368583507835865,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03720238246023655,
"reward_std": 0.022582839708775282,
"rewards/accuracy_reward": 0.03720238246023655,
"step": 41
},
{
"completion_length": 585.0193538665771,
"epoch": 0.2351294611616515,
"grad_norm": 0.0034278561361134052,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04315476305782795,
"reward_std": 0.02897424390539527,
"rewards/accuracy_reward": 0.04315476305782795,
"step": 42
},
{
"completion_length": 588.648078918457,
"epoch": 0.24072778166550035,
"grad_norm": 0.003546294756233692,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04092262068297714,
"reward_std": 0.02937500481493771,
"rewards/accuracy_reward": 0.04092262068297714,
"step": 43
},
{
"completion_length": 591.8616180419922,
"epoch": 0.2463261021693492,
"grad_norm": 0.0032863786909729242,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.029017857741564512,
"reward_std": 0.02016195748001337,
"rewards/accuracy_reward": 0.029017857741564512,
"step": 44
},
{
"completion_length": 595.4494113922119,
"epoch": 0.25192442267319803,
"grad_norm": 0.003891779575496912,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0513392873108387,
"reward_std": 0.021664298605173826,
"rewards/accuracy_reward": 0.0513392873108387,
"step": 45
},
{
"completion_length": 576.5751647949219,
"epoch": 0.2575227431770469,
"grad_norm": 0.004738082177937031,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04092262021731585,
"reward_std": 0.03612527949735522,
"rewards/accuracy_reward": 0.04092262021731585,
"step": 46
},
{
"completion_length": 582.7514915466309,
"epoch": 0.2631210636808957,
"grad_norm": 0.005118933971971273,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.044642857857979834,
"reward_std": 0.02991444803774357,
"rewards/accuracy_reward": 0.044642857857979834,
"step": 47
},
{
"completion_length": 597.6398983001709,
"epoch": 0.26871938418474456,
"grad_norm": 0.003335759276524186,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.017857143247965723,
"reward_std": 0.02291816775687039,
"rewards/accuracy_reward": 0.017857143247965723,
"step": 48
},
{
"completion_length": 632.6994171142578,
"epoch": 0.2743177046885934,
"grad_norm": 0.002710092579945922,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.026785714959260076,
"reward_std": 0.018883246928453445,
"rewards/accuracy_reward": 0.026785714959260076,
"step": 49
},
{
"completion_length": 614.4695072174072,
"epoch": 0.27991602519244224,
"grad_norm": 0.004074351862072945,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.034970239736139774,
"reward_std": 0.0352512919344008,
"rewards/accuracy_reward": 0.034970239736139774,
"step": 50
},
{
"completion_length": 564.6369132995605,
"epoch": 0.28551434569629114,
"grad_norm": 0.00433464627712965,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04761904897168279,
"reward_std": 0.036211316008120775,
"rewards/accuracy_reward": 0.04761904897168279,
"step": 51
},
{
"completion_length": 574.7514991760254,
"epoch": 0.29111266620014,
"grad_norm": 0.0021451774518936872,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02232142904540524,
"reward_std": 0.01412591733969748,
"rewards/accuracy_reward": 0.02232142904540524,
"step": 52
},
{
"completion_length": 587.8154850006104,
"epoch": 0.2967109867039888,
"grad_norm": 0.004314142279326916,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04017857281723991,
"reward_std": 0.03161609871312976,
"rewards/accuracy_reward": 0.04017857281723991,
"step": 53
},
{
"completion_length": 612.7529888153076,
"epoch": 0.30230930720783766,
"grad_norm": 0.003322584554553032,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.023809524485841393,
"reward_std": 0.029193072579801083,
"rewards/accuracy_reward": 0.023809524485841393,
"step": 54
},
{
"completion_length": 602.6979274749756,
"epoch": 0.3079076277116865,
"grad_norm": 0.0024285970721393824,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.014136904967017472,
"reward_std": 0.017268173396587372,
"rewards/accuracy_reward": 0.014136904967017472,
"step": 55
},
{
"completion_length": 594.776798248291,
"epoch": 0.31350594821553535,
"grad_norm": 0.002405191073194146,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.031250000931322575,
"reward_std": 0.008076196536421776,
"rewards/accuracy_reward": 0.031250000931322575,
"step": 56
},
{
"completion_length": 569.1354274749756,
"epoch": 0.3191042687193842,
"grad_norm": 0.004245223011821508,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.056547619809862226,
"reward_std": 0.03459409927017987,
"rewards/accuracy_reward": 0.056547619809862226,
"step": 57
},
{
"completion_length": 610.0952472686768,
"epoch": 0.32470258922323303,
"grad_norm": 0.003974903374910355,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.035714287078008056,
"reward_std": 0.02040082309395075,
"rewards/accuracy_reward": 0.035714287078008056,
"step": 58
},
{
"completion_length": 645.4836387634277,
"epoch": 0.33030090972708187,
"grad_norm": 0.0026164394803345203,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.011160714784637094,
"reward_std": 0.011207811534404755,
"rewards/accuracy_reward": 0.011160714784637094,
"step": 59
},
{
"completion_length": 626.8445014953613,
"epoch": 0.3358992302309307,
"grad_norm": 0.0031215217895805836,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.024553572467993945,
"reward_std": 0.02602896187454462,
"rewards/accuracy_reward": 0.024553572467993945,
"step": 60
},
{
"completion_length": 602.254472732544,
"epoch": 0.34149755073477955,
"grad_norm": 0.0025609612930566072,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03273809637175873,
"reward_std": 0.02431014971807599,
"rewards/accuracy_reward": 0.03273809637175873,
"step": 61
},
{
"completion_length": 605.808048248291,
"epoch": 0.3470958712386284,
"grad_norm": 0.0041216155514121056,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0223214291036129,
"reward_std": 0.027785591781139374,
"rewards/accuracy_reward": 0.0223214291036129,
"step": 62
},
{
"completion_length": 620.4248657226562,
"epoch": 0.35269419174247724,
"grad_norm": 0.0027605677023530006,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.020089286204893142,
"reward_std": 0.02404030319303274,
"rewards/accuracy_reward": 0.020089286204893142,
"step": 63
},
{
"completion_length": 616.587064743042,
"epoch": 0.3582925122463261,
"grad_norm": 0.006433432921767235,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.034970238571986556,
"reward_std": 0.0240207826718688,
"rewards/accuracy_reward": 0.034970238571986556,
"step": 64
},
{
"completion_length": 656.1890068054199,
"epoch": 0.363890832750175,
"grad_norm": 0.002678812015801668,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.038690476736519486,
"reward_std": 0.026529580354690552,
"rewards/accuracy_reward": 0.038690476736519486,
"step": 65
},
{
"completion_length": 615.4241199493408,
"epoch": 0.3694891532540238,
"grad_norm": 0.004133144393563271,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.055059525300748646,
"reward_std": 0.030434885527938604,
"rewards/accuracy_reward": 0.055059525300748646,
"step": 66
},
{
"completion_length": 631.1212978363037,
"epoch": 0.37508747375787266,
"grad_norm": 0.004037661012262106,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.038690477376803756,
"reward_std": 0.028071781154721975,
"rewards/accuracy_reward": 0.038690477376803756,
"step": 67
},
{
"completion_length": 577.1934661865234,
"epoch": 0.3806857942617215,
"grad_norm": 0.0031959640327841043,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.023809524427633733,
"reward_std": 0.0161665934138,
"rewards/accuracy_reward": 0.023809524427633733,
"step": 68
},
{
"completion_length": 645.9285850524902,
"epoch": 0.38628411476557034,
"grad_norm": 0.004407494328916073,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.030505953065585345,
"reward_std": 0.03433060785755515,
"rewards/accuracy_reward": 0.030505953065585345,
"step": 69
},
{
"completion_length": 629.8266506195068,
"epoch": 0.3918824352694192,
"grad_norm": 0.0024215257726609707,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.019345238513778895,
"reward_std": 0.02312279725447297,
"rewards/accuracy_reward": 0.019345238513778895,
"step": 70
},
{
"completion_length": 604.0178699493408,
"epoch": 0.397480755773268,
"grad_norm": 0.003344905562698841,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02306547691114247,
"reward_std": 0.019507942255586386,
"rewards/accuracy_reward": 0.02306547691114247,
"step": 71
},
{
"completion_length": 590.7827472686768,
"epoch": 0.40307907627711687,
"grad_norm": 0.0028942637145519257,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05803571501746774,
"reward_std": 0.03128055343404412,
"rewards/accuracy_reward": 0.05803571501746774,
"step": 72
},
{
"completion_length": 599.7842330932617,
"epoch": 0.4086773967809657,
"grad_norm": 0.0034949486143887043,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05580357339931652,
"reward_std": 0.015248052775859833,
"rewards/accuracy_reward": 0.05580357339931652,
"step": 73
},
{
"completion_length": 635.7009048461914,
"epoch": 0.41427571728481455,
"grad_norm": 0.0022625280544161797,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02678571466822177,
"reward_std": 0.021662155631929636,
"rewards/accuracy_reward": 0.02678571466822177,
"step": 74
},
{
"completion_length": 631.5788726806641,
"epoch": 0.4198740377886634,
"grad_norm": 0.0037094622384756804,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02604166726814583,
"reward_std": 0.02264805557206273,
"rewards/accuracy_reward": 0.02604166726814583,
"step": 75
},
{
"completion_length": 636.7358722686768,
"epoch": 0.42547235829251223,
"grad_norm": 0.003503567073494196,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03273809503298253,
"reward_std": 0.02112219762057066,
"rewards/accuracy_reward": 0.03273809503298253,
"step": 76
},
{
"completion_length": 635.9628219604492,
"epoch": 0.4310706787963611,
"grad_norm": 0.0047474331222474575,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04166666779201478,
"reward_std": 0.03789610881358385,
"rewards/accuracy_reward": 0.04166666779201478,
"step": 77
},
{
"completion_length": 638.1703987121582,
"epoch": 0.4366689993002099,
"grad_norm": 0.0018300635274499655,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05282738315872848,
"reward_std": 0.01760453707538545,
"rewards/accuracy_reward": 0.05282738315872848,
"step": 78
},
{
"completion_length": 617.7492637634277,
"epoch": 0.44226731980405876,
"grad_norm": 0.0036325210239738226,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04092262010090053,
"reward_std": 0.03621981432661414,
"rewards/accuracy_reward": 0.04092262010090053,
"step": 79
},
{
"completion_length": 621.1123561859131,
"epoch": 0.44786564030790765,
"grad_norm": 0.0025254676584154367,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04092262004269287,
"reward_std": 0.02054001996293664,
"rewards/accuracy_reward": 0.04092262004269287,
"step": 80
},
{
"completion_length": 670.346004486084,
"epoch": 0.4534639608117565,
"grad_norm": 0.0038352429401129484,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03497023903764784,
"reward_std": 0.03099754173308611,
"rewards/accuracy_reward": 0.03497023903764784,
"step": 81
},
{
"completion_length": 632.6949424743652,
"epoch": 0.45906228131560534,
"grad_norm": 0.005097648594528437,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05059524026000872,
"reward_std": 0.02744494192302227,
"rewards/accuracy_reward": 0.05059524026000872,
"step": 82
},
{
"completion_length": 630.6183204650879,
"epoch": 0.4646606018194542,
"grad_norm": 0.0024996348656713963,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.012648809934034944,
"reward_std": 0.014683252666145563,
"rewards/accuracy_reward": 0.012648809934034944,
"step": 83
},
{
"completion_length": 582.6726245880127,
"epoch": 0.470258922323303,
"grad_norm": 0.0026008612476289272,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.037202382169198245,
"reward_std": 0.021662155631929636,
"rewards/accuracy_reward": 0.037202382169198245,
"step": 84
},
{
"completion_length": 665.8355827331543,
"epoch": 0.47585724282715186,
"grad_norm": 0.002596538746729493,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04389881080714986,
"reward_std": 0.02897106483578682,
"rewards/accuracy_reward": 0.04389881080714986,
"step": 85
},
{
"completion_length": 635.4821529388428,
"epoch": 0.4814555633310007,
"grad_norm": 0.0027116115670651197,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.024553571769502014,
"reward_std": 0.025856829015538096,
"rewards/accuracy_reward": 0.024553571769502014,
"step": 86
},
{
"completion_length": 642.225456237793,
"epoch": 0.48705388383484954,
"grad_norm": 0.00335879810154438,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0535714304423891,
"reward_std": 0.04018289828673005,
"rewards/accuracy_reward": 0.0535714304423891,
"step": 87
},
{
"completion_length": 611.2046222686768,
"epoch": 0.4926522043386984,
"grad_norm": 0.004874168895184994,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.06994047714397311,
"reward_std": 0.030564499087631702,
"rewards/accuracy_reward": 0.06994047714397311,
"step": 88
},
{
"completion_length": 634.81325340271,
"epoch": 0.4982505248425472,
"grad_norm": 0.0030458923429250717,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03273809637175873,
"reward_std": 0.025404266081750393,
"rewards/accuracy_reward": 0.03273809637175873,
"step": 89
},
{
"completion_length": 606.618314743042,
"epoch": 0.5038488453463961,
"grad_norm": 0.0029052915051579475,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.034226191812194884,
"reward_std": 0.018181392922997475,
"rewards/accuracy_reward": 0.034226191812194884,
"step": 90
},
{
"completion_length": 649.934534072876,
"epoch": 0.509447165850245,
"grad_norm": 0.0023663390893489122,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.028273810050450265,
"reward_std": 0.014822450000792742,
"rewards/accuracy_reward": 0.028273810050450265,
"step": 91
},
{
"completion_length": 645.944206237793,
"epoch": 0.5150454863540938,
"grad_norm": 0.002700702054426074,
"learning_rate": 3e-07,
"loss": -0.0,
"reward": 0.026041666977107525,
"reward_std": 0.023188014514744282,
"rewards/accuracy_reward": 0.026041666977107525,
"step": 92
},
{
"completion_length": 606.7001571655273,
"epoch": 0.5206438068579426,
"grad_norm": 0.002376874443143606,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02455357206054032,
"reward_std": 0.02123525319620967,
"rewards/accuracy_reward": 0.02455357206054032,
"step": 93
},
{
"completion_length": 635.0580425262451,
"epoch": 0.5262421273617914,
"grad_norm": 0.0022008493542671204,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02678571513388306,
"reward_std": 0.016507241874933243,
"rewards/accuracy_reward": 0.02678571513388306,
"step": 94
},
{
"completion_length": 651.9226360321045,
"epoch": 0.5318404478656403,
"grad_norm": 0.003217194229364395,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04464285826543346,
"reward_std": 0.018883246928453445,
"rewards/accuracy_reward": 0.04464285826543346,
"step": 95
},
{
"completion_length": 621.9144458770752,
"epoch": 0.5374387683694891,
"grad_norm": 0.0026905853301286697,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04166666849050671,
"reward_std": 0.022358688060194254,
"rewards/accuracy_reward": 0.04166666849050671,
"step": 96
},
{
"completion_length": 630.8422737121582,
"epoch": 0.543037088873338,
"grad_norm": 0.00430277269333601,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03422619169577956,
"reward_std": 0.03490746580064297,
"rewards/accuracy_reward": 0.03422619169577956,
"step": 97
},
{
"completion_length": 635.1577453613281,
"epoch": 0.5486354093771868,
"grad_norm": 0.0027177336160093546,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.024553572409786284,
"reward_std": 0.01907807867974043,
"rewards/accuracy_reward": 0.024553572409786284,
"step": 98
},
{
"completion_length": 621.9509105682373,
"epoch": 0.5542337298810357,
"grad_norm": 0.0029670181684195995,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04761904920451343,
"reward_std": 0.024105519521981478,
"rewards/accuracy_reward": 0.04761904920451343,
"step": 99
},
{
"completion_length": 645.11012840271,
"epoch": 0.5598320503848845,
"grad_norm": 0.004191585350781679,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03720238246023655,
"reward_std": 0.027021698653697968,
"rewards/accuracy_reward": 0.03720238246023655,
"step": 100
},
{
"completion_length": 628.647331237793,
"epoch": 0.5654303708887334,
"grad_norm": 0.005200605373829603,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04017857293365523,
"reward_std": 0.032400546595454216,
"rewards/accuracy_reward": 0.04017857293365523,
"step": 101
},
{
"completion_length": 674.1391506195068,
"epoch": 0.5710286913925823,
"grad_norm": 0.00332686142064631,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.039434525242540985,
"reward_std": 0.02585682924836874,
"rewards/accuracy_reward": 0.039434525242540985,
"step": 102
},
{
"completion_length": 635.6376628875732,
"epoch": 0.5766270118964311,
"grad_norm": 0.002928570844233036,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03348214377183467,
"reward_std": 0.017823366448283195,
"rewards/accuracy_reward": 0.03348214377183467,
"step": 103
},
{
"completion_length": 667.8132553100586,
"epoch": 0.58222533240028,
"grad_norm": 0.0027422241400927305,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05059524020180106,
"reward_std": 0.02381271030753851,
"rewards/accuracy_reward": 0.05059524020180106,
"step": 104
},
{
"completion_length": 607.5372142791748,
"epoch": 0.5878236529041287,
"grad_norm": 0.0026503645349293947,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.023809524660464376,
"reward_std": 0.021662155631929636,
"rewards/accuracy_reward": 0.023809524660464376,
"step": 105
},
{
"completion_length": 622.7715854644775,
"epoch": 0.5934219734079776,
"grad_norm": 0.003517127363011241,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04836309718666598,
"reward_std": 0.03329852968454361,
"rewards/accuracy_reward": 0.04836309718666598,
"step": 106
},
{
"completion_length": 647.0959930419922,
"epoch": 0.5990202939118264,
"grad_norm": 0.0036508163902908564,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.061755954287946224,
"reward_std": 0.03932873113080859,
"rewards/accuracy_reward": 0.061755954287946224,
"step": 107
},
{
"completion_length": 623.8727836608887,
"epoch": 0.6046186144156753,
"grad_norm": 0.0034824141766875982,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05282738321693614,
"reward_std": 0.023543079383671284,
"rewards/accuracy_reward": 0.05282738321693614,
"step": 108
},
{
"completion_length": 646.765645980835,
"epoch": 0.6102169349195241,
"grad_norm": 0.0026973283383995295,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.017113095498643816,
"reward_std": 0.017985261976718903,
"rewards/accuracy_reward": 0.017113095498643816,
"step": 109
},
{
"completion_length": 625.6376571655273,
"epoch": 0.615815255423373,
"grad_norm": 0.0037007054779678583,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.06547619227785617,
"reward_std": 0.040159355383366346,
"rewards/accuracy_reward": 0.06547619227785617,
"step": 110
},
{
"completion_length": 657.3631019592285,
"epoch": 0.6214135759272218,
"grad_norm": 0.0027858198154717684,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.027529762883204967,
"reward_std": 0.016143894754350185,
"rewards/accuracy_reward": 0.027529762883204967,
"step": 111
},
{
"completion_length": 627.3422756195068,
"epoch": 0.6270118964310707,
"grad_norm": 0.0035545658320188522,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05059523967793211,
"reward_std": 0.03195094550028443,
"rewards/accuracy_reward": 0.05059523967793211,
"step": 112
},
{
"completion_length": 629.8177185058594,
"epoch": 0.6326102169349195,
"grad_norm": 0.0040982505306601524,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05357142956927419,
"reward_std": 0.03337568882852793,
"rewards/accuracy_reward": 0.05357142956927419,
"step": 113
},
{
"completion_length": 650.6034278869629,
"epoch": 0.6382085374387684,
"grad_norm": 0.0028056029696017504,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05208333529299125,
"reward_std": 0.01970939477905631,
"rewards/accuracy_reward": 0.05208333529299125,
"step": 114
},
{
"completion_length": 649.5796318054199,
"epoch": 0.6438068579426172,
"grad_norm": 0.0023333376739174128,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.014880952483508736,
"reward_std": 0.010090996511280537,
"rewards/accuracy_reward": 0.014880952483508736,
"step": 115
},
{
"completion_length": 645.5312652587891,
"epoch": 0.6494051784464661,
"grad_norm": 0.0031304731965065002,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.022321428870782256,
"reward_std": 0.015947763342410326,
"rewards/accuracy_reward": 0.022321428870782256,
"step": 116
},
{
"completion_length": 606.377986907959,
"epoch": 0.655003498950315,
"grad_norm": 0.004076777026057243,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.030505952949170023,
"reward_std": 0.028484483249485493,
"rewards/accuracy_reward": 0.030505952949170023,
"step": 117
},
{
"completion_length": 633.9389953613281,
"epoch": 0.6606018194541637,
"grad_norm": 0.0019400623859837651,
"learning_rate": 3e-07,
"loss": -0.0,
"reward": 0.01934523874660954,
"reward_std": 0.014487121719866991,
"rewards/accuracy_reward": 0.01934523874660954,
"step": 118
},
{
"completion_length": 626.8236713409424,
"epoch": 0.6662001399580126,
"grad_norm": 0.0019256924279034138,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.040178572409786284,
"reward_std": 0.013790588825941086,
"rewards/accuracy_reward": 0.040178572409786284,
"step": 119
},
{
"completion_length": 633.7790336608887,
"epoch": 0.6717984604618614,
"grad_norm": 0.0031517883762717247,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04092261998448521,
"reward_std": 0.022219491191208363,
"rewards/accuracy_reward": 0.04092261998448521,
"step": 120
},
{
"completion_length": 648.1837844848633,
"epoch": 0.6773967809657103,
"grad_norm": 0.0026536902878433466,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.032738096197135746,
"reward_std": 0.02146284654736519,
"rewards/accuracy_reward": 0.032738096197135746,
"step": 121
},
{
"completion_length": 652.9933204650879,
"epoch": 0.6829951014695591,
"grad_norm": 0.0033722377847880125,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.023809524544049054,
"reward_std": 0.020562718622386456,
"rewards/accuracy_reward": 0.023809524544049054,
"step": 122
},
{
"completion_length": 656.3534355163574,
"epoch": 0.688593421973408,
"grad_norm": 0.002776005771011114,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.034970239794347435,
"reward_std": 0.01705795805901289,
"rewards/accuracy_reward": 0.034970239794347435,
"step": 123
},
{
"completion_length": 610.5788822174072,
"epoch": 0.6941917424772568,
"grad_norm": 0.0037796536926180124,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.07142857392318547,
"reward_std": 0.03704408532939851,
"rewards/accuracy_reward": 0.07142857392318547,
"step": 124
},
{
"completion_length": 639.4814128875732,
"epoch": 0.6997900629811057,
"grad_norm": 0.003291892819106579,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05133928789291531,
"reward_std": 0.033701435662806034,
"rewards/accuracy_reward": 0.05133928789291531,
"step": 125
},
{
"completion_length": 627.3214416503906,
"epoch": 0.7053883834849545,
"grad_norm": 0.002914518816396594,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0386904776096344,
"reward_std": 0.02166215470060706,
"rewards/accuracy_reward": 0.0386904776096344,
"step": 126
},
{
"completion_length": 614.4308052062988,
"epoch": 0.7109867039888034,
"grad_norm": 0.003878154093399644,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03348214388824999,
"reward_std": 0.018164014909416437,
"rewards/accuracy_reward": 0.03348214388824999,
"step": 127
},
{
"completion_length": 637.3244171142578,
"epoch": 0.7165850244926522,
"grad_norm": 0.002592942677438259,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.026785714901052415,
"reward_std": 0.01852204231545329,
"rewards/accuracy_reward": 0.026785714901052415,
"step": 128
},
{
"completion_length": 662.0282936096191,
"epoch": 0.722183344996501,
"grad_norm": 0.0030688210390508175,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.0424107150756754,
"reward_std": 0.029373969649896026,
"rewards/accuracy_reward": 0.0424107150756754,
"step": 129
},
{
"completion_length": 609.4747123718262,
"epoch": 0.72778166550035,
"grad_norm": 0.0026889187283813953,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03869047720218077,
"reward_std": 0.021642634645104408,
"rewards/accuracy_reward": 0.03869047720218077,
"step": 130
},
{
"completion_length": 628.831859588623,
"epoch": 0.7333799860041987,
"grad_norm": 0.0032478254288434982,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.019345238571986556,
"reward_std": 0.01975191291421652,
"rewards/accuracy_reward": 0.019345238571986556,
"step": 131
},
{
"completion_length": 642.3846893310547,
"epoch": 0.7389783065080476,
"grad_norm": 0.0019529862329363823,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.01116071455180645,
"reward_std": 0.013233252801001072,
"rewards/accuracy_reward": 0.01116071455180645,
"step": 132
},
{
"completion_length": 634.9181728363037,
"epoch": 0.7445766270118964,
"grad_norm": 0.004057868849486113,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.031250000873114914,
"reward_std": 0.040879431180655956,
"rewards/accuracy_reward": 0.031250000873114914,
"step": 133
},
{
"completion_length": 609.6331977844238,
"epoch": 0.7501749475157453,
"grad_norm": 0.003426521783694625,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.038690477260388434,
"reward_std": 0.029601082671433687,
"rewards/accuracy_reward": 0.038690477260388434,
"step": 134
},
{
"completion_length": 639.4613227844238,
"epoch": 0.7557732680195941,
"grad_norm": 0.003970231860876083,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.054315477260388434,
"reward_std": 0.029604259878396988,
"rewards/accuracy_reward": 0.054315477260388434,
"step": 135
},
{
"completion_length": 614.8199501037598,
"epoch": 0.761371588523443,
"grad_norm": 0.0031252307817339897,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05654762062476948,
"reward_std": 0.02807308081537485,
"rewards/accuracy_reward": 0.05654762062476948,
"step": 136
},
{
"completion_length": 640.7924194335938,
"epoch": 0.7669699090272918,
"grad_norm": 0.0020789685659110546,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.019345238397363573,
"reward_std": 0.015046600718051195,
"rewards/accuracy_reward": 0.019345238397363573,
"step": 137
},
{
"completion_length": 640.4226322174072,
"epoch": 0.7725682295311407,
"grad_norm": 0.0019399580778554082,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.012648809934034944,
"reward_std": 0.013656496535986662,
"rewards/accuracy_reward": 0.012648809934034944,
"step": 138
},
{
"completion_length": 594.6205444335938,
"epoch": 0.7781665500349895,
"grad_norm": 0.0063768248073756695,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.052083334885537624,
"reward_std": 0.023117476608604193,
"rewards/accuracy_reward": 0.052083334885537624,
"step": 139
},
{
"completion_length": 613.655517578125,
"epoch": 0.7837648705388384,
"grad_norm": 0.0031261774711310863,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.039434525300748646,
"reward_std": 0.020764170680195093,
"rewards/accuracy_reward": 0.039434525300748646,
"step": 140
},
{
"completion_length": 645.6837902069092,
"epoch": 0.7893631910426872,
"grad_norm": 0.0014134430093690753,
"learning_rate": 3e-07,
"loss": -0.0,
"reward": 0.02157738187815994,
"reward_std": 0.007513539865612984,
"rewards/accuracy_reward": 0.02157738187815994,
"step": 141
},
{
"completion_length": 630.043176651001,
"epoch": 0.794961511546536,
"grad_norm": 0.004378010053187609,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04241071571595967,
"reward_std": 0.03774271160364151,
"rewards/accuracy_reward": 0.04241071571595967,
"step": 142
},
{
"completion_length": 622.1555137634277,
"epoch": 0.8005598320503848,
"grad_norm": 0.0029741383623331785,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.00892857153667137,
"reward_std": 0.016726072411984205,
"rewards/accuracy_reward": 0.00892857153667137,
"step": 143
},
{
"completion_length": 646.7626647949219,
"epoch": 0.8061581525542337,
"grad_norm": 0.0028938716277480125,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.021577381528913975,
"reward_std": 0.022778970655053854,
"rewards/accuracy_reward": 0.021577381528913975,
"step": 144
},
{
"completion_length": 651.0178699493408,
"epoch": 0.8117564730580826,
"grad_norm": 0.003098010318353772,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.023809524718672037,
"reward_std": 0.00955103849992156,
"rewards/accuracy_reward": 0.023809524718672037,
"step": 145
},
{
"completion_length": 655.3891487121582,
"epoch": 0.8173547935619314,
"grad_norm": 0.002411956200376153,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.028273810166865587,
"reward_std": 0.024912146851420403,
"rewards/accuracy_reward": 0.028273810166865587,
"step": 146
},
{
"completion_length": 660.3393001556396,
"epoch": 0.8229531140657803,
"grad_norm": 0.0024566147476434708,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03199404838960618,
"reward_std": 0.021460703574121,
"rewards/accuracy_reward": 0.03199404838960618,
"step": 147
},
{
"completion_length": 660.4471855163574,
"epoch": 0.8285514345696291,
"grad_norm": 0.0033500257413834333,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03199404838960618,
"reward_std": 0.026453721337020397,
"rewards/accuracy_reward": 0.03199404838960618,
"step": 148
},
{
"completion_length": 616.9553737640381,
"epoch": 0.834149755073478,
"grad_norm": 0.0035977945663034916,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04613095358945429,
"reward_std": 0.03803316270932555,
"rewards/accuracy_reward": 0.04613095358945429,
"step": 149
},
{
"completion_length": 624.0811023712158,
"epoch": 0.8397480755773268,
"grad_norm": 0.0024934441316872835,
"learning_rate": 3e-07,
"loss": -0.0,
"reward": 0.020089286321308464,
"reward_std": 0.014129094779491425,
"rewards/accuracy_reward": 0.020089286321308464,
"step": 150
},
{
"completion_length": 649.3125076293945,
"epoch": 0.8453463960811757,
"grad_norm": 0.0040410468354821205,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.08035714528523386,
"reward_std": 0.037069959565997124,
"rewards/accuracy_reward": 0.08035714528523386,
"step": 151
},
{
"completion_length": 655.87575340271,
"epoch": 0.8509447165850245,
"grad_norm": 0.0036378325894474983,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.020089286554139107,
"reward_std": 0.025742772268131375,
"rewards/accuracy_reward": 0.020089286554139107,
"step": 152
},
{
"completion_length": 608.526050567627,
"epoch": 0.8565430370888734,
"grad_norm": 0.0019003109773620963,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.019345238339155912,
"reward_std": 0.013455044478178024,
"rewards/accuracy_reward": 0.019345238339155912,
"step": 153
},
{
"completion_length": 605.8340892791748,
"epoch": 0.8621413575927221,
"grad_norm": 0.002737295813858509,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04538690613117069,
"reward_std": 0.025694933719933033,
"rewards/accuracy_reward": 0.04538690613117069,
"step": 154
},
{
"completion_length": 649.8913841247559,
"epoch": 0.867739678096571,
"grad_norm": 0.0013863355852663517,
"learning_rate": 3e-07,
"loss": -0.0,
"reward": 0.012648810050450265,
"reward_std": 0.007513539865612984,
"rewards/accuracy_reward": 0.012648810050450265,
"step": 155
},
{
"completion_length": 649.567720413208,
"epoch": 0.8733379986004198,
"grad_norm": 0.003731328761205077,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.020089286379516125,
"reward_std": 0.022558840923011303,
"rewards/accuracy_reward": 0.020089286379516125,
"step": 156
},
{
"completion_length": 661.0640029907227,
"epoch": 0.8789363191042687,
"grad_norm": 0.0034664925187826157,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05505952559178695,
"reward_std": 0.035452743992209435,
"rewards/accuracy_reward": 0.05505952559178695,
"step": 157
},
{
"completion_length": 616.1681728363037,
"epoch": 0.8845346396081175,
"grad_norm": 0.003077705856412649,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.020833333721384406,
"reward_std": 0.023389466106891632,
"rewards/accuracy_reward": 0.020833333721384406,
"step": 158
},
{
"completion_length": 645.6845321655273,
"epoch": 0.8901329601119664,
"grad_norm": 0.0034521420020610094,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02380952425301075,
"reward_std": 0.01677391119301319,
"rewards/accuracy_reward": 0.02380952425301075,
"step": 159
},
{
"completion_length": 621.7961502075195,
"epoch": 0.8957312806158153,
"grad_norm": 0.0036637301091104746,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04092262004269287,
"reward_std": 0.02751574432477355,
"rewards/accuracy_reward": 0.04092262004269287,
"step": 160
},
{
"completion_length": 661.4003067016602,
"epoch": 0.9013296011196641,
"grad_norm": 0.0030420024413615465,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.031250000989530236,
"reward_std": 0.022196792997419834,
"rewards/accuracy_reward": 0.031250000989530236,
"step": 161
},
{
"completion_length": 636.8690567016602,
"epoch": 0.906927921623513,
"grad_norm": 0.003414291888475418,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05357143084984273,
"reward_std": 0.021321506705135107,
"rewards/accuracy_reward": 0.05357143084984273,
"step": 162
},
{
"completion_length": 641.8973331451416,
"epoch": 0.9125262421273618,
"grad_norm": 0.0033480250276625156,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03720238176174462,
"reward_std": 0.029198394622653723,
"rewards/accuracy_reward": 0.03720238176174462,
"step": 163
},
{
"completion_length": 623.6264972686768,
"epoch": 0.9181245626312107,
"grad_norm": 0.003853818401694298,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05729166802484542,
"reward_std": 0.037587220780551434,
"rewards/accuracy_reward": 0.05729166802484542,
"step": 164
},
{
"completion_length": 649.7075996398926,
"epoch": 0.9237228831350595,
"grad_norm": 0.0024723373353481293,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.019345238513778895,
"reward_std": 0.01928615104407072,
"rewards/accuracy_reward": 0.019345238513778895,
"step": 165
},
{
"completion_length": 680.0818481445312,
"epoch": 0.9293212036389084,
"grad_norm": 0.00249727675691247,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03869047743501142,
"reward_std": 0.015805388800799847,
"rewards/accuracy_reward": 0.03869047743501142,
"step": 166
},
{
"completion_length": 606.4137096405029,
"epoch": 0.9349195241427571,
"grad_norm": 0.0031826056074351072,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.04166666814126074,
"reward_std": 0.02047350350767374,
"rewards/accuracy_reward": 0.04166666814126074,
"step": 167
},
{
"completion_length": 620.4821529388428,
"epoch": 0.940517844646606,
"grad_norm": 0.0025734296068549156,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.02380952431121841,
"reward_std": 0.0181671935133636,
"rewards/accuracy_reward": 0.02380952431121841,
"step": 168
},
{
"completion_length": 617.4836406707764,
"epoch": 0.9461161651504548,
"grad_norm": 0.0036193837877362967,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03199404873885214,
"reward_std": 0.018744049593806267,
"rewards/accuracy_reward": 0.03199404873885214,
"step": 169
},
{
"completion_length": 630.1562633514404,
"epoch": 0.9517144856543037,
"grad_norm": 0.0025704463478177786,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03497023897944018,
"reward_std": 0.01300378143787384,
"rewards/accuracy_reward": 0.03497023897944018,
"step": 170
},
{
"completion_length": 608.0171241760254,
"epoch": 0.9573128061581525,
"grad_norm": 0.003403712995350361,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05357143055880442,
"reward_std": 0.030789734097197652,
"rewards/accuracy_reward": 0.05357143055880442,
"step": 171
},
{
"completion_length": 624.5669803619385,
"epoch": 0.9629111266620014,
"grad_norm": 0.00244723167270422,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.018601191113702953,
"reward_std": 0.014688573777675629,
"rewards/accuracy_reward": 0.018601191113702953,
"step": 172
},
{
"completion_length": 619.2753105163574,
"epoch": 0.9685094471658502,
"grad_norm": 0.003216799348592758,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.015625000465661287,
"reward_std": 0.011909665539860725,
"rewards/accuracy_reward": 0.015625000465661287,
"step": 173
},
{
"completion_length": 603.6629619598389,
"epoch": 0.9741077676696991,
"grad_norm": 0.004921557381749153,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.1026785732829012,
"reward_std": 0.035813949070870876,
"rewards/accuracy_reward": 0.1026785732829012,
"step": 174
},
{
"completion_length": 619.9709911346436,
"epoch": 0.979706088173548,
"grad_norm": 0.0021367412991821766,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03943452483508736,
"reward_std": 0.017064577899873257,
"rewards/accuracy_reward": 0.03943452483508736,
"step": 175
},
{
"completion_length": 661.3519515991211,
"epoch": 0.9853044086773968,
"grad_norm": 0.003553919028490782,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03422619169577956,
"reward_std": 0.02724563330411911,
"rewards/accuracy_reward": 0.03422619169577956,
"step": 176
},
{
"completion_length": 638.8660793304443,
"epoch": 0.9909027291812457,
"grad_norm": 0.0027842505369335413,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.05803571594879031,
"reward_std": 0.021928824484348297,
"rewards/accuracy_reward": 0.05803571594879031,
"step": 177
},
{
"completion_length": 666.0945014953613,
"epoch": 0.9965010496850945,
"grad_norm": 0.004953757394105196,
"learning_rate": 3e-07,
"loss": 0.0,
"reward": 0.03571428661234677,
"reward_std": 0.03300916403532028,
"rewards/accuracy_reward": 0.03571428661234677,
"step": 178
},
{
"epoch": 0.9965010496850945,
"step": 178,
"total_flos": 0.0,
"train_loss": 1.5679008615386834e-09,
"train_runtime": 33567.1333,
"train_samples_per_second": 0.596,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 178,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}