tongliuphysics's picture
Model save
820ab46 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.983132530120482,
"eval_steps": 20,
"global_step": 255,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03855421686746988,
"grad_norm": 2.0457221564142256,
"learning_rate": 3.846153846153847e-07,
"loss": 0.2354,
"mean_token_accuracy": 0.930065356194973,
"num_tokens": 131072.0,
"step": 2
},
{
"epoch": 0.07710843373493977,
"grad_norm": 2.1086974646270145,
"learning_rate": 1.153846153846154e-06,
"loss": 0.2508,
"mean_token_accuracy": 0.9255465492606163,
"num_tokens": 262144.0,
"step": 4
},
{
"epoch": 0.11566265060240964,
"grad_norm": 1.698182282959437,
"learning_rate": 1.9230769230769234e-06,
"loss": 0.2473,
"mean_token_accuracy": 0.9256381466984749,
"num_tokens": 393216.0,
"step": 6
},
{
"epoch": 0.15421686746987953,
"grad_norm": 1.4331583326698771,
"learning_rate": 2.6923076923076923e-06,
"loss": 0.2193,
"mean_token_accuracy": 0.9314393177628517,
"num_tokens": 524288.0,
"step": 8
},
{
"epoch": 0.1927710843373494,
"grad_norm": 1.280978852144958,
"learning_rate": 3.4615384615384617e-06,
"loss": 0.2205,
"mean_token_accuracy": 0.930450152605772,
"num_tokens": 654484.0,
"step": 10
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.8255955634911271,
"learning_rate": 4.230769230769231e-06,
"loss": 0.2117,
"mean_token_accuracy": 0.9317141100764275,
"num_tokens": 785556.0,
"step": 12
},
{
"epoch": 0.26987951807228916,
"grad_norm": 0.7584680371226415,
"learning_rate": 5e-06,
"loss": 0.206,
"mean_token_accuracy": 0.9338631108403206,
"num_tokens": 915519.0,
"step": 14
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.9495192852210463,
"learning_rate": 5.769230769230769e-06,
"loss": 0.1982,
"mean_token_accuracy": 0.9358359947800636,
"num_tokens": 1046591.0,
"step": 16
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.9714974283482016,
"learning_rate": 6.538461538461539e-06,
"loss": 0.2055,
"mean_token_accuracy": 0.9338132180273533,
"num_tokens": 1177663.0,
"step": 18
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.6339236056292388,
"learning_rate": 7.307692307692308e-06,
"loss": 0.1917,
"mean_token_accuracy": 0.9378740377724171,
"num_tokens": 1308735.0,
"step": 20
},
{
"epoch": 0.3855421686746988,
"eval_loss": 0.3343917727470398,
"eval_mean_token_accuracy": 0.9013295725127247,
"eval_num_tokens": 1308735.0,
"eval_runtime": 70.0593,
"eval_samples_per_second": 12.204,
"eval_steps_per_second": 1.527,
"step": 20
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.7315888499202351,
"learning_rate": 8.076923076923077e-06,
"loss": 0.1809,
"mean_token_accuracy": 0.9400189444422722,
"num_tokens": 1439807.0,
"step": 22
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.7642349616310066,
"learning_rate": 8.846153846153847e-06,
"loss": 0.1928,
"mean_token_accuracy": 0.9367095269262791,
"num_tokens": 1570062.0,
"step": 24
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.6114978913375759,
"learning_rate": 9.615384615384616e-06,
"loss": 0.1828,
"mean_token_accuracy": 0.9394693598151207,
"num_tokens": 1701134.0,
"step": 26
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.6229653774047121,
"learning_rate": 9.999529497453782e-06,
"loss": 0.1806,
"mean_token_accuracy": 0.9402282536029816,
"num_tokens": 1832133.0,
"step": 28
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.6722415161460822,
"learning_rate": 9.99576600836172e-06,
"loss": 0.1896,
"mean_token_accuracy": 0.9363855794072151,
"num_tokens": 1963205.0,
"step": 30
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.5974286474799401,
"learning_rate": 9.988241863214212e-06,
"loss": 0.1814,
"mean_token_accuracy": 0.9404540322721004,
"num_tokens": 2094277.0,
"step": 32
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.601035342701654,
"learning_rate": 9.976962725951878e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.9400342106819153,
"num_tokens": 2225349.0,
"step": 34
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.5765003488310966,
"learning_rate": 9.961937087155697e-06,
"loss": 0.1828,
"mean_token_accuracy": 0.9392519034445286,
"num_tokens": 2355263.0,
"step": 36
},
{
"epoch": 0.7325301204819277,
"grad_norm": 34.52047518558373,
"learning_rate": 9.943176257655567e-06,
"loss": 0.2098,
"mean_token_accuracy": 0.9331491328775883,
"num_tokens": 2486335.0,
"step": 38
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.6276699276820382,
"learning_rate": 9.920694360015864e-06,
"loss": 0.1745,
"mean_token_accuracy": 0.9413929060101509,
"num_tokens": 2617407.0,
"step": 40
},
{
"epoch": 0.7710843373493976,
"eval_loss": 0.32280808687210083,
"eval_mean_token_accuracy": 0.9021720039510281,
"eval_num_tokens": 2617407.0,
"eval_runtime": 69.6577,
"eval_samples_per_second": 12.274,
"eval_steps_per_second": 1.536,
"step": 40
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.6015365123041743,
"learning_rate": 9.894508317904418e-06,
"loss": 0.1751,
"mean_token_accuracy": 0.9412707760930061,
"num_tokens": 2748479.0,
"step": 42
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.6316203175238668,
"learning_rate": 9.864637843352916e-06,
"loss": 0.184,
"mean_token_accuracy": 0.9374923817813396,
"num_tokens": 2879551.0,
"step": 44
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.5904610746669308,
"learning_rate": 9.831105421918287e-06,
"loss": 0.1777,
"mean_token_accuracy": 0.9405580870807171,
"num_tokens": 3010185.0,
"step": 46
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.5994215271575196,
"learning_rate": 9.793936295756292e-06,
"loss": 0.187,
"mean_token_accuracy": 0.9375152811408043,
"num_tokens": 3141257.0,
"step": 48
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.5854742456446934,
"learning_rate": 9.753158444620013e-06,
"loss": 0.1815,
"mean_token_accuracy": 0.9394976831972599,
"num_tokens": 3271788.0,
"step": 50
},
{
"epoch": 1.0192771084337349,
"grad_norm": 0.957499837849808,
"learning_rate": 9.70880256479758e-06,
"loss": 0.2534,
"mean_token_accuracy": 0.9437652796506881,
"num_tokens": 3435628.0,
"step": 52
},
{
"epoch": 1.0578313253012048,
"grad_norm": 0.6854514205992324,
"learning_rate": 9.660902046004954e-06,
"loss": 0.151,
"mean_token_accuracy": 0.9503083899617195,
"num_tokens": 3566700.0,
"step": 54
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.6080507225701574,
"learning_rate": 9.60949294625121e-06,
"loss": 0.1415,
"mean_token_accuracy": 0.9535066671669483,
"num_tokens": 3697772.0,
"step": 56
},
{
"epoch": 1.1349397590361445,
"grad_norm": 0.6054065882233389,
"learning_rate": 9.554613964695189e-06,
"loss": 0.1493,
"mean_token_accuracy": 0.9502549581229687,
"num_tokens": 3828844.0,
"step": 58
},
{
"epoch": 1.1734939759036145,
"grad_norm": 0.7694600057204949,
"learning_rate": 9.496306412513989e-06,
"loss": 0.1462,
"mean_token_accuracy": 0.9519953094422817,
"num_tokens": 3959916.0,
"step": 60
},
{
"epoch": 1.1734939759036145,
"eval_loss": 0.359206885099411,
"eval_mean_token_accuracy": 0.9007850846397543,
"eval_num_tokens": 3959916.0,
"eval_runtime": 69.8215,
"eval_samples_per_second": 12.246,
"eval_steps_per_second": 1.532,
"step": 60
},
{
"epoch": 1.2120481927710842,
"grad_norm": 0.6845669867023433,
"learning_rate": 9.434614181805203e-06,
"loss": 0.1407,
"mean_token_accuracy": 0.9533876590430737,
"num_tokens": 4089879.0,
"step": 62
},
{
"epoch": 1.2506024096385542,
"grad_norm": 0.6197114152379135,
"learning_rate": 9.369583712546322e-06,
"loss": 0.1349,
"mean_token_accuracy": 0.9554836452007294,
"num_tokens": 4220951.0,
"step": 64
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.6172158164875755,
"learning_rate": 9.30126395763618e-06,
"loss": 0.1535,
"mean_token_accuracy": 0.95006413012743,
"num_tokens": 4352023.0,
"step": 66
},
{
"epoch": 1.3277108433734939,
"grad_norm": 0.6409060214608714,
"learning_rate": 9.229706346044749e-06,
"loss": 0.156,
"mean_token_accuracy": 0.9484306424856186,
"num_tokens": 4483095.0,
"step": 68
},
{
"epoch": 1.3662650602409638,
"grad_norm": 0.6166450609513697,
"learning_rate": 9.154964744099006e-06,
"loss": 0.1419,
"mean_token_accuracy": 0.9533540047705173,
"num_tokens": 4614167.0,
"step": 70
},
{
"epoch": 1.4048192771084338,
"grad_norm": 0.6058092262037136,
"learning_rate": 9.077095414934076e-06,
"loss": 0.1439,
"mean_token_accuracy": 0.9524685628712177,
"num_tokens": 4745239.0,
"step": 72
},
{
"epoch": 1.4433734939759035,
"grad_norm": 0.6464674278239464,
"learning_rate": 8.996156976140088e-06,
"loss": 0.1427,
"mean_token_accuracy": 0.9521632380783558,
"num_tokens": 4876311.0,
"step": 74
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.6232124362016298,
"learning_rate": 8.91221035563669e-06,
"loss": 0.1387,
"mean_token_accuracy": 0.9537738263607025,
"num_tokens": 5007383.0,
"step": 76
},
{
"epoch": 1.5204819277108435,
"grad_norm": 0.6251055517263481,
"learning_rate": 8.82531874580844e-06,
"loss": 0.1544,
"mean_token_accuracy": 0.9496977403759956,
"num_tokens": 5138455.0,
"step": 78
},
{
"epoch": 1.5590361445783132,
"grad_norm": 0.6597130966145244,
"learning_rate": 8.735547555935538e-06,
"loss": 0.1467,
"mean_token_accuracy": 0.951957143843174,
"num_tokens": 5269527.0,
"step": 80
},
{
"epoch": 1.5590361445783132,
"eval_loss": 0.34304243326187134,
"eval_mean_token_accuracy": 0.9011661727851796,
"eval_num_tokens": 5269527.0,
"eval_runtime": 69.6573,
"eval_samples_per_second": 12.274,
"eval_steps_per_second": 1.536,
"step": 80
},
{
"epoch": 1.5975903614457831,
"grad_norm": 0.6093216234766912,
"learning_rate": 8.642964362955781e-06,
"loss": 0.145,
"mean_token_accuracy": 0.9515700563788414,
"num_tokens": 5400161.0,
"step": 82
},
{
"epoch": 1.636144578313253,
"grad_norm": 0.5687703380048487,
"learning_rate": 8.547638860594765e-06,
"loss": 0.1484,
"mean_token_accuracy": 0.9509495720267296,
"num_tokens": 5531233.0,
"step": 84
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.6551898466798518,
"learning_rate": 8.449642806902623e-06,
"loss": 0.1568,
"mean_token_accuracy": 0.9481558501720428,
"num_tokens": 5662305.0,
"step": 86
},
{
"epoch": 1.7132530120481928,
"grad_norm": 0.6433780292504243,
"learning_rate": 8.349049970236822e-06,
"loss": 0.1349,
"mean_token_accuracy": 0.954715259373188,
"num_tokens": 5792219.0,
"step": 88
},
{
"epoch": 1.7518072289156628,
"grad_norm": 0.5701046312406493,
"learning_rate": 8.245936073731654e-06,
"loss": 0.147,
"mean_token_accuracy": 0.9507969096302986,
"num_tokens": 5923291.0,
"step": 90
},
{
"epoch": 1.7903614457831325,
"grad_norm": 0.6865332623152001,
"learning_rate": 8.140378738296233e-06,
"loss": 0.1529,
"mean_token_accuracy": 0.9498768150806427,
"num_tokens": 6053822.0,
"step": 92
},
{
"epoch": 1.8289156626506025,
"grad_norm": 0.6305307568855328,
"learning_rate": 8.032457424183909e-06,
"loss": 0.1476,
"mean_token_accuracy": 0.9505984485149384,
"num_tokens": 6184894.0,
"step": 94
},
{
"epoch": 1.8674698795180724,
"grad_norm": 0.5748443476790706,
"learning_rate": 7.922253371177081e-06,
"loss": 0.155,
"mean_token_accuracy": 0.9482144415378571,
"num_tokens": 6315149.0,
"step": 96
},
{
"epoch": 1.9060240963855422,
"grad_norm": 0.5993128969226361,
"learning_rate": 7.809849537432432e-06,
"loss": 0.1434,
"mean_token_accuracy": 0.9525645859539509,
"num_tokens": 6445345.0,
"step": 98
},
{
"epoch": 1.944578313253012,
"grad_norm": 0.6280456904784001,
"learning_rate": 7.695330537032629e-06,
"loss": 0.1445,
"mean_token_accuracy": 0.9512222707271576,
"num_tokens": 6576344.0,
"step": 100
},
{
"epoch": 1.944578313253012,
"eval_loss": 0.3398211598396301,
"eval_mean_token_accuracy": 0.901328669530209,
"eval_num_tokens": 6576344.0,
"eval_runtime": 69.654,
"eval_samples_per_second": 12.275,
"eval_steps_per_second": 1.536,
"step": 100
},
{
"epoch": 1.983132530120482,
"grad_norm": 0.6197902890500856,
"learning_rate": 7.578782576291501e-06,
"loss": 0.1506,
"mean_token_accuracy": 0.9492092207074165,
"num_tokens": 6707416.0,
"step": 102
},
{
"epoch": 2.0385542168674697,
"grad_norm": 0.6409344863530665,
"learning_rate": 7.460293388860616e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.9643502771854401,
"num_tokens": 6871256.0,
"step": 104
},
{
"epoch": 2.07710843373494,
"grad_norm": 0.6097248296204885,
"learning_rate": 7.3399521696861505e-06,
"loss": 0.1092,
"mean_token_accuracy": 0.9659219309687614,
"num_tokens": 7002255.0,
"step": 106
},
{
"epoch": 2.1156626506024097,
"grad_norm": 0.5903613108322504,
"learning_rate": 7.217849507865724e-06,
"loss": 0.1066,
"mean_token_accuracy": 0.9660860486328602,
"num_tokens": 7133327.0,
"step": 108
},
{
"epoch": 2.1542168674698794,
"grad_norm": 0.625091072426359,
"learning_rate": 7.094077318455762e-06,
"loss": 0.1091,
"mean_token_accuracy": 0.9645588099956512,
"num_tokens": 7263523.0,
"step": 110
},
{
"epoch": 2.1927710843373496,
"grad_norm": 0.6604015968164485,
"learning_rate": 6.96872877328073e-06,
"loss": 0.1052,
"mean_token_accuracy": 0.9661929123103619,
"num_tokens": 7394595.0,
"step": 112
},
{
"epoch": 2.2313253012048193,
"grad_norm": 0.7455880093770229,
"learning_rate": 6.841898230796302e-06,
"loss": 0.1049,
"mean_token_accuracy": 0.9661089479923248,
"num_tokens": 7525667.0,
"step": 114
},
{
"epoch": 2.269879518072289,
"grad_norm": 0.6028303919109465,
"learning_rate": 6.713681165059271e-06,
"loss": 0.1127,
"mean_token_accuracy": 0.9631625637412071,
"num_tokens": 7656739.0,
"step": 116
},
{
"epoch": 2.3084337349397592,
"grad_norm": 0.6799912009709536,
"learning_rate": 6.584174093857676e-06,
"loss": 0.1035,
"mean_token_accuracy": 0.9669562242925167,
"num_tokens": 7787811.0,
"step": 118
},
{
"epoch": 2.346987951807229,
"grad_norm": 0.6255570427114552,
"learning_rate": 6.453474506055228e-06,
"loss": 0.1176,
"mean_token_accuracy": 0.9615787602961063,
"num_tokens": 7916616.0,
"step": 120
},
{
"epoch": 2.346987951807229,
"eval_loss": 0.38193774223327637,
"eval_mean_token_accuracy": 0.8994210568543907,
"eval_num_tokens": 7916616.0,
"eval_runtime": 69.6436,
"eval_samples_per_second": 12.277,
"eval_steps_per_second": 1.536,
"step": 120
},
{
"epoch": 2.3855421686746987,
"grad_norm": 0.6279356138996781,
"learning_rate": 6.3216807882047585e-06,
"loss": 0.0974,
"mean_token_accuracy": 0.968185156583786,
"num_tokens": 8047688.0,
"step": 122
},
{
"epoch": 2.4240963855421684,
"grad_norm": 0.6479503216427691,
"learning_rate": 6.188892150485904e-06,
"loss": 0.1087,
"mean_token_accuracy": 0.9651853404939175,
"num_tokens": 8178760.0,
"step": 124
},
{
"epoch": 2.4626506024096386,
"grad_norm": 0.7228376218883897,
"learning_rate": 6.0552085520227875e-06,
"loss": 0.1136,
"mean_token_accuracy": 0.9631396643817425,
"num_tokens": 8309832.0,
"step": 126
},
{
"epoch": 2.5012048192771084,
"grad_norm": 0.6292530226739607,
"learning_rate": 5.920730625637934e-06,
"loss": 0.1043,
"mean_token_accuracy": 0.9666203670203686,
"num_tokens": 8440904.0,
"step": 128
},
{
"epoch": 2.539759036144578,
"grad_norm": 0.6120273359022707,
"learning_rate": 5.785559602099019e-06,
"loss": 0.1073,
"mean_token_accuracy": 0.9648876488208771,
"num_tokens": 8571976.0,
"step": 130
},
{
"epoch": 2.5783132530120483,
"grad_norm": 0.6294342722298523,
"learning_rate": 5.649797233915539e-06,
"loss": 0.1092,
"mean_token_accuracy": 0.9644067622721195,
"num_tokens": 8703048.0,
"step": 132
},
{
"epoch": 2.616867469879518,
"grad_norm": 0.5665304014502571,
"learning_rate": 5.513545718742702e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9646815545856953,
"num_tokens": 8834120.0,
"step": 134
},
{
"epoch": 2.6554216867469878,
"grad_norm": 0.5673111264101424,
"learning_rate": 5.376907622450229e-06,
"loss": 0.1154,
"mean_token_accuracy": 0.9624109007418156,
"num_tokens": 8964375.0,
"step": 136
},
{
"epoch": 2.693975903614458,
"grad_norm": 0.5636466902202368,
"learning_rate": 5.2399858019140005e-06,
"loss": 0.1045,
"mean_token_accuracy": 0.9666311480104923,
"num_tokens": 9094906.0,
"step": 138
},
{
"epoch": 2.7325301204819277,
"grad_norm": 0.5754464602822424,
"learning_rate": 5.102883327588608e-06,
"loss": 0.1075,
"mean_token_accuracy": 0.9647044539451599,
"num_tokens": 9225978.0,
"step": 140
},
{
"epoch": 2.7325301204819277,
"eval_loss": 0.37826669216156006,
"eval_mean_token_accuracy": 0.8995784972315637,
"eval_num_tokens": 9225978.0,
"eval_runtime": 69.6803,
"eval_samples_per_second": 12.27,
"eval_steps_per_second": 1.536,
"step": 140
},
{
"epoch": 2.7710843373493974,
"grad_norm": 0.5987257906687522,
"learning_rate": 4.965703405919154e-06,
"loss": 0.1041,
"mean_token_accuracy": 0.9660173505544662,
"num_tokens": 9357050.0,
"step": 142
},
{
"epoch": 2.8096385542168676,
"grad_norm": 0.6727909756019579,
"learning_rate": 4.828549301650673e-06,
"loss": 0.1165,
"mean_token_accuracy": 0.9626206122338772,
"num_tokens": 9488122.0,
"step": 144
},
{
"epoch": 2.8481927710843373,
"grad_norm": 0.5483728501054262,
"learning_rate": 4.691524260093672e-06,
"loss": 0.1101,
"mean_token_accuracy": 0.9640556387603283,
"num_tokens": 9619194.0,
"step": 146
},
{
"epoch": 2.886746987951807,
"grad_norm": 0.6578615356471254,
"learning_rate": 4.554731429404293e-06,
"loss": 0.1167,
"mean_token_accuracy": 0.9623610861599445,
"num_tokens": 9750266.0,
"step": 148
},
{
"epoch": 2.9253012048192772,
"grad_norm": 0.544341897970942,
"learning_rate": 4.4182737829376135e-06,
"loss": 0.1068,
"mean_token_accuracy": 0.965429600328207,
"num_tokens": 9881338.0,
"step": 150
},
{
"epoch": 2.963855421686747,
"grad_norm": 0.5807218274090602,
"learning_rate": 4.28225404173254e-06,
"loss": 0.1058,
"mean_token_accuracy": 0.965176422148943,
"num_tokens": 10011972.0,
"step": 152
},
{
"epoch": 3.019277108433735,
"grad_norm": 1.007803950038667,
"learning_rate": 4.146774597186622e-06,
"loss": 0.1488,
"mean_token_accuracy": 0.9695591181516647,
"num_tokens": 10175812.0,
"step": 154
},
{
"epoch": 3.057831325301205,
"grad_norm": 0.6613641201206724,
"learning_rate": 4.011937433979014e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9746656753122807,
"num_tokens": 10306884.0,
"step": 156
},
{
"epoch": 3.0963855421686746,
"grad_norm": 0.5427167115705699,
"learning_rate": 3.87784405329962e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9741344675421715,
"num_tokens": 10437883.0,
"step": 158
},
{
"epoch": 3.1349397590361447,
"grad_norm": 0.5059704125761413,
"learning_rate": 3.744595396442169e-06,
"loss": 0.0814,
"mean_token_accuracy": 0.9750473313033581,
"num_tokens": 10568955.0,
"step": 160
},
{
"epoch": 3.1349397590361447,
"eval_loss": 0.4201391637325287,
"eval_mean_token_accuracy": 0.8986482670374005,
"eval_num_tokens": 10568955.0,
"eval_runtime": 69.8903,
"eval_samples_per_second": 12.233,
"eval_steps_per_second": 1.531,
"step": 160
},
{
"epoch": 3.1734939759036145,
"grad_norm": 0.4955524619584041,
"learning_rate": 3.612291768818772e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.9744977466762066,
"num_tokens": 10700027.0,
"step": 162
},
{
"epoch": 3.212048192771084,
"grad_norm": 0.5481909266796648,
"learning_rate": 3.4810327644531606e-06,
"loss": 0.0804,
"mean_token_accuracy": 0.9746122434735298,
"num_tokens": 10831099.0,
"step": 164
},
{
"epoch": 3.2506024096385544,
"grad_norm": 0.5869274418415635,
"learning_rate": 3.3509171910094162e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9735665060579777,
"num_tokens": 10962171.0,
"step": 166
},
{
"epoch": 3.289156626506024,
"grad_norm": 0.5997938570160334,
"learning_rate": 3.222042995412669e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.9744274839758873,
"num_tokens": 11092367.0,
"step": 168
},
{
"epoch": 3.327710843373494,
"grad_norm": 0.5638967234440626,
"learning_rate": 3.094507190117715e-06,
"loss": 0.0752,
"mean_token_accuracy": 0.9760014712810516,
"num_tokens": 11223439.0,
"step": 170
},
{
"epoch": 3.3662650602409636,
"grad_norm": 0.5677450107311146,
"learning_rate": 2.9684057800810844e-06,
"loss": 0.0849,
"mean_token_accuracy": 0.9734520092606544,
"num_tokens": 11354511.0,
"step": 172
},
{
"epoch": 3.404819277108434,
"grad_norm": 0.5694190125459168,
"learning_rate": 2.8438336904915186e-06,
"loss": 0.0907,
"mean_token_accuracy": 0.9719940833747387,
"num_tokens": 11485583.0,
"step": 174
},
{
"epoch": 3.4433734939759035,
"grad_norm": 0.5008764813796651,
"learning_rate": 2.7208846953132685e-06,
"loss": 0.0782,
"mean_token_accuracy": 0.9755356945097446,
"num_tokens": 11616217.0,
"step": 176
},
{
"epoch": 3.4819277108433733,
"grad_norm": 0.5027767263738213,
"learning_rate": 2.599651346695979e-06,
"loss": 0.0773,
"mean_token_accuracy": 0.9762609973549843,
"num_tokens": 11747289.0,
"step": 178
},
{
"epoch": 3.5204819277108435,
"grad_norm": 0.5747857741850161,
"learning_rate": 2.4802249053043525e-06,
"loss": 0.0777,
"mean_token_accuracy": 0.976215198636055,
"num_tokens": 11878361.0,
"step": 180
},
{
"epoch": 3.5204819277108435,
"eval_loss": 0.43149346113204956,
"eval_mean_token_accuracy": 0.898219308563482,
"eval_num_tokens": 11878361.0,
"eval_runtime": 69.6743,
"eval_samples_per_second": 12.271,
"eval_steps_per_second": 1.536,
"step": 180
},
{
"epoch": 3.559036144578313,
"grad_norm": 0.5115273312879999,
"learning_rate": 2.3626952716199647e-06,
"loss": 0.0792,
"mean_token_accuracy": 0.9750167988240719,
"num_tokens": 12009433.0,
"step": 182
},
{
"epoch": 3.597590361445783,
"grad_norm": 0.5172911491980401,
"learning_rate": 2.247150918267008e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9730398207902908,
"num_tokens": 12140505.0,
"step": 184
},
{
"epoch": 3.636144578313253,
"grad_norm": 0.5260093719963543,
"learning_rate": 2.133678823412873e-06,
"loss": 0.0797,
"mean_token_accuracy": 0.9751236625015736,
"num_tokens": 12271577.0,
"step": 186
},
{
"epoch": 3.674698795180723,
"grad_norm": 0.5267292864138245,
"learning_rate": 2.022364405293703e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9738947302103043,
"num_tokens": 12402649.0,
"step": 188
},
{
"epoch": 3.7132530120481926,
"grad_norm": 0.5065512725199254,
"learning_rate": 1.913291457914234e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9732001163065434,
"num_tokens": 12533721.0,
"step": 190
},
{
"epoch": 3.7518072289156628,
"grad_norm": 0.5465242770321679,
"learning_rate": 1.8065420879702888e-06,
"loss": 0.0838,
"mean_token_accuracy": 0.9731762520968914,
"num_tokens": 12663435.0,
"step": 192
},
{
"epoch": 3.7903614457831325,
"grad_norm": 0.7823063875533764,
"learning_rate": 1.7021966530414303e-06,
"loss": 0.0762,
"mean_token_accuracy": 0.9758411757647991,
"num_tokens": 12794507.0,
"step": 194
},
{
"epoch": 3.8289156626506022,
"grad_norm": 0.571380544699335,
"learning_rate": 1.6003337011002928e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9734901748597622,
"num_tokens": 12925579.0,
"step": 196
},
{
"epoch": 3.8674698795180724,
"grad_norm": 0.5400258981871386,
"learning_rate": 1.5010299113841397e-06,
"loss": 0.0807,
"mean_token_accuracy": 0.9752305261790752,
"num_tokens": 13056651.0,
"step": 198
},
{
"epoch": 3.906024096385542,
"grad_norm": 0.5204832843446408,
"learning_rate": 1.4043600366731213e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9745206460356712,
"num_tokens": 13187723.0,
"step": 200
},
{
"epoch": 3.906024096385542,
"eval_loss": 0.43459072709083557,
"eval_mean_token_accuracy": 0.8980461002510285,
"eval_num_tokens": 13187723.0,
"eval_runtime": 69.6812,
"eval_samples_per_second": 12.27,
"eval_steps_per_second": 1.536,
"step": 200
},
{
"epoch": 3.944578313253012,
"grad_norm": 0.5732935867678565,
"learning_rate": 1.3103968470187384e-06,
"loss": 0.0841,
"mean_token_accuracy": 0.973306454718113,
"num_tokens": 13317686.0,
"step": 202
},
{
"epoch": 3.983132530120482,
"grad_norm": 0.5049593156468802,
"learning_rate": 1.2192110749648233e-06,
"loss": 0.0783,
"mean_token_accuracy": 0.9752342775464058,
"num_tokens": 13447600.0,
"step": 204
},
{
"epoch": 4.03855421686747,
"grad_norm": 0.4900616503984239,
"learning_rate": 1.1308713623022988e-06,
"loss": 0.1075,
"mean_token_accuracy": 0.9786272644996643,
"num_tokens": 13611440.0,
"step": 206
},
{
"epoch": 4.0771084337349395,
"grad_norm": 0.4917129834327916,
"learning_rate": 1.045444208397791e-06,
"loss": 0.0676,
"mean_token_accuracy": 0.9801687188446522,
"num_tokens": 13740537.0,
"step": 208
},
{
"epoch": 4.11566265060241,
"grad_norm": 0.47200516762524886,
"learning_rate": 9.629939201349852e-07,
"loss": 0.0723,
"mean_token_accuracy": 0.9782837741076946,
"num_tokens": 13871609.0,
"step": 210
},
{
"epoch": 4.15421686746988,
"grad_norm": 0.44277012092487705,
"learning_rate": 8.835825635064266e-07,
"loss": 0.0729,
"mean_token_accuracy": 0.9780853129923344,
"num_tokens": 14002681.0,
"step": 212
},
{
"epoch": 4.192771084337349,
"grad_norm": 0.4753962832603972,
"learning_rate": 8.072699168921827e-07,
"loss": 0.0749,
"mean_token_accuracy": 0.9778944849967957,
"num_tokens": 14133753.0,
"step": 214
},
{
"epoch": 4.231325301204819,
"grad_norm": 0.48346978347475456,
"learning_rate": 7.341134260605537e-07,
"loss": 0.0692,
"mean_token_accuracy": 0.9793745614588261,
"num_tokens": 14264314.0,
"step": 216
},
{
"epoch": 4.2698795180722895,
"grad_norm": 0.4328206037632282,
"learning_rate": 6.641681609246981e-07,
"loss": 0.066,
"mean_token_accuracy": 0.9801309891045094,
"num_tokens": 14395386.0,
"step": 218
},
{
"epoch": 4.308433734939759,
"grad_norm": 0.46221534542018206,
"learning_rate": 5.974867740877282e-07,
"loss": 0.0696,
"mean_token_accuracy": 0.9789478555321693,
"num_tokens": 14526458.0,
"step": 220
},
{
"epoch": 4.308433734939759,
"eval_loss": 0.4595886468887329,
"eval_mean_token_accuracy": 0.897223442514366,
"eval_num_tokens": 14526458.0,
"eval_runtime": 69.6441,
"eval_samples_per_second": 12.277,
"eval_steps_per_second": 1.536,
"step": 220
},
{
"epoch": 4.346987951807229,
"grad_norm": 0.4739286679144528,
"learning_rate": 5.341194612074824e-07,
"loss": 0.068,
"mean_token_accuracy": 0.9796868488192558,
"num_tokens": 14656421.0,
"step": 222
},
{
"epoch": 4.385542168674699,
"grad_norm": 0.43096986690967987,
"learning_rate": 4.7411392321080606e-07,
"loss": 0.0663,
"mean_token_accuracy": 0.9802683852612972,
"num_tokens": 14787493.0,
"step": 224
},
{
"epoch": 4.424096385542168,
"grad_norm": 0.46557922408208563,
"learning_rate": 4.175153303857887e-07,
"loss": 0.0654,
"mean_token_accuracy": 0.9804821126163006,
"num_tokens": 14918565.0,
"step": 226
},
{
"epoch": 4.462650602409639,
"grad_norm": 0.5546707256189516,
"learning_rate": 3.643662883789878e-07,
"loss": 0.0673,
"mean_token_accuracy": 0.979527972638607,
"num_tokens": 15049637.0,
"step": 228
},
{
"epoch": 4.501204819277109,
"grad_norm": 0.49021519394663,
"learning_rate": 3.1470680612323503e-07,
"loss": 0.07,
"mean_token_accuracy": 0.9785585664212704,
"num_tokens": 15180709.0,
"step": 230
},
{
"epoch": 4.539759036144578,
"grad_norm": 0.45571708386475684,
"learning_rate": 2.685742657201601e-07,
"loss": 0.0697,
"mean_token_accuracy": 0.9785204008221626,
"num_tokens": 15311781.0,
"step": 232
},
{
"epoch": 4.578313253012048,
"grad_norm": 0.5641008416839415,
"learning_rate": 2.260033943001244e-07,
"loss": 0.0663,
"mean_token_accuracy": 0.9797416999936104,
"num_tokens": 15442853.0,
"step": 234
},
{
"epoch": 4.6168674698795185,
"grad_norm": 0.5607141029792978,
"learning_rate": 1.8702623788072028e-07,
"loss": 0.0793,
"mean_token_accuracy": 0.9755663834512234,
"num_tokens": 15573925.0,
"step": 236
},
{
"epoch": 4.655421686746988,
"grad_norm": 0.46095439859311127,
"learning_rate": 1.5167213724353426e-07,
"loss": 0.0714,
"mean_token_accuracy": 0.9779479168355465,
"num_tokens": 15704997.0,
"step": 238
},
{
"epoch": 4.693975903614458,
"grad_norm": 0.464368810663561,
"learning_rate": 1.199677058473292e-07,
"loss": 0.066,
"mean_token_accuracy": 0.980153888463974,
"num_tokens": 15836069.0,
"step": 240
},
{
"epoch": 4.693975903614458,
"eval_loss": 0.46903374791145325,
"eval_mean_token_accuracy": 0.8968599628065234,
"eval_num_tokens": 15836069.0,
"eval_runtime": 69.6558,
"eval_samples_per_second": 12.275,
"eval_steps_per_second": 1.536,
"step": 240
},
{
"epoch": 4.732530120481927,
"grad_norm": 0.5162077757262011,
"learning_rate": 9.193680979426189e-08,
"loss": 0.0775,
"mean_token_accuracy": 0.9764594584703445,
"num_tokens": 15967141.0,
"step": 242
},
{
"epoch": 4.771084337349397,
"grad_norm": 0.4482450270539155,
"learning_rate": 6.760054986423459e-08,
"loss": 0.0632,
"mean_token_accuracy": 0.9808179698884487,
"num_tokens": 16098213.0,
"step": 244
},
{
"epoch": 4.809638554216868,
"grad_norm": 0.4698597407866022,
"learning_rate": 4.697724563088646e-08,
"loss": 0.0681,
"mean_token_accuracy": 0.9797111675143242,
"num_tokens": 16229285.0,
"step": 246
},
{
"epoch": 4.848192771084337,
"grad_norm": 0.4662674319978425,
"learning_rate": 3.0082421671192576e-08,
"loss": 0.0688,
"mean_token_accuracy": 0.97944400832057,
"num_tokens": 16360357.0,
"step": 248
},
{
"epoch": 4.886746987951807,
"grad_norm": 0.46327536754981147,
"learning_rate": 1.692879587904983e-08,
"loss": 0.0662,
"mean_token_accuracy": 0.9799401611089706,
"num_tokens": 16491429.0,
"step": 250
},
{
"epoch": 4.925301204819277,
"grad_norm": 0.4688691090714117,
"learning_rate": 7.526269891646176e-09,
"loss": 0.0642,
"mean_token_accuracy": 0.9807046689093113,
"num_tokens": 16621960.0,
"step": 252
},
{
"epoch": 4.9638554216867465,
"grad_norm": 0.4516057398304381,
"learning_rate": 1.8819216358156865e-09,
"loss": 0.0688,
"mean_token_accuracy": 0.9792744368314743,
"num_tokens": 16752156.0,
"step": 254
},
{
"epoch": 4.983132530120482,
"mean_token_accuracy": 0.976367861032486,
"num_tokens": 16817692.0,
"step": 255,
"total_flos": 24409842647040.0,
"train_loss": 0.12274208276295194,
"train_runtime": 3782.9235,
"train_samples_per_second": 2.194,
"train_steps_per_second": 0.067
}
],
"logging_steps": 2,
"max_steps": 255,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 24409842647040.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}