{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.983132530120482, "eval_steps": 20, "global_step": 255, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03855421686746988, "grad_norm": 2.0457221564142256, "learning_rate": 3.846153846153847e-07, "loss": 0.2354, "mean_token_accuracy": 0.930065356194973, "num_tokens": 131072.0, "step": 2 }, { "epoch": 0.07710843373493977, "grad_norm": 2.1086974646270145, "learning_rate": 1.153846153846154e-06, "loss": 0.2508, "mean_token_accuracy": 0.9255465492606163, "num_tokens": 262144.0, "step": 4 }, { "epoch": 0.11566265060240964, "grad_norm": 1.698182282959437, "learning_rate": 1.9230769230769234e-06, "loss": 0.2473, "mean_token_accuracy": 0.9256381466984749, "num_tokens": 393216.0, "step": 6 }, { "epoch": 0.15421686746987953, "grad_norm": 1.4331583326698771, "learning_rate": 2.6923076923076923e-06, "loss": 0.2193, "mean_token_accuracy": 0.9314393177628517, "num_tokens": 524288.0, "step": 8 }, { "epoch": 0.1927710843373494, "grad_norm": 1.280978852144958, "learning_rate": 3.4615384615384617e-06, "loss": 0.2205, "mean_token_accuracy": 0.930450152605772, "num_tokens": 654484.0, "step": 10 }, { "epoch": 0.23132530120481928, "grad_norm": 0.8255955634911271, "learning_rate": 4.230769230769231e-06, "loss": 0.2117, "mean_token_accuracy": 0.9317141100764275, "num_tokens": 785556.0, "step": 12 }, { "epoch": 0.26987951807228916, "grad_norm": 0.7584680371226415, "learning_rate": 5e-06, "loss": 0.206, "mean_token_accuracy": 0.9338631108403206, "num_tokens": 915519.0, "step": 14 }, { "epoch": 0.30843373493975906, "grad_norm": 0.9495192852210463, "learning_rate": 5.769230769230769e-06, "loss": 0.1982, "mean_token_accuracy": 0.9358359947800636, "num_tokens": 1046591.0, "step": 16 }, { "epoch": 0.3469879518072289, "grad_norm": 0.9714974283482016, "learning_rate": 6.538461538461539e-06, "loss": 0.2055, "mean_token_accuracy": 0.9338132180273533, "num_tokens": 1177663.0, "step": 18 }, { "epoch": 0.3855421686746988, "grad_norm": 0.6339236056292388, "learning_rate": 7.307692307692308e-06, "loss": 0.1917, "mean_token_accuracy": 0.9378740377724171, "num_tokens": 1308735.0, "step": 20 }, { "epoch": 0.3855421686746988, "eval_loss": 0.3343917727470398, "eval_mean_token_accuracy": 0.9013295725127247, "eval_num_tokens": 1308735.0, "eval_runtime": 70.0593, "eval_samples_per_second": 12.204, "eval_steps_per_second": 1.527, "step": 20 }, { "epoch": 0.42409638554216866, "grad_norm": 0.7315888499202351, "learning_rate": 8.076923076923077e-06, "loss": 0.1809, "mean_token_accuracy": 0.9400189444422722, "num_tokens": 1439807.0, "step": 22 }, { "epoch": 0.46265060240963857, "grad_norm": 0.7642349616310066, "learning_rate": 8.846153846153847e-06, "loss": 0.1928, "mean_token_accuracy": 0.9367095269262791, "num_tokens": 1570062.0, "step": 24 }, { "epoch": 0.5012048192771085, "grad_norm": 0.6114978913375759, "learning_rate": 9.615384615384616e-06, "loss": 0.1828, "mean_token_accuracy": 0.9394693598151207, "num_tokens": 1701134.0, "step": 26 }, { "epoch": 0.5397590361445783, "grad_norm": 0.6229653774047121, "learning_rate": 9.999529497453782e-06, "loss": 0.1806, "mean_token_accuracy": 0.9402282536029816, "num_tokens": 1832133.0, "step": 28 }, { "epoch": 0.5783132530120482, "grad_norm": 0.6722415161460822, "learning_rate": 9.99576600836172e-06, "loss": 0.1896, "mean_token_accuracy": 0.9363855794072151, "num_tokens": 1963205.0, "step": 30 }, { "epoch": 0.6168674698795181, "grad_norm": 0.5974286474799401, "learning_rate": 9.988241863214212e-06, "loss": 0.1814, "mean_token_accuracy": 0.9404540322721004, "num_tokens": 2094277.0, "step": 32 }, { "epoch": 0.655421686746988, "grad_norm": 0.601035342701654, "learning_rate": 9.976962725951878e-06, "loss": 0.1801, "mean_token_accuracy": 0.9400342106819153, "num_tokens": 2225349.0, "step": 34 }, { "epoch": 0.6939759036144578, "grad_norm": 0.5765003488310966, "learning_rate": 9.961937087155697e-06, "loss": 0.1828, "mean_token_accuracy": 0.9392519034445286, "num_tokens": 2355263.0, "step": 36 }, { "epoch": 0.7325301204819277, "grad_norm": 34.52047518558373, "learning_rate": 9.943176257655567e-06, "loss": 0.2098, "mean_token_accuracy": 0.9331491328775883, "num_tokens": 2486335.0, "step": 38 }, { "epoch": 0.7710843373493976, "grad_norm": 0.6276699276820382, "learning_rate": 9.920694360015864e-06, "loss": 0.1745, "mean_token_accuracy": 0.9413929060101509, "num_tokens": 2617407.0, "step": 40 }, { "epoch": 0.7710843373493976, "eval_loss": 0.32280808687210083, "eval_mean_token_accuracy": 0.9021720039510281, "eval_num_tokens": 2617407.0, "eval_runtime": 69.6577, "eval_samples_per_second": 12.274, "eval_steps_per_second": 1.536, "step": 40 }, { "epoch": 0.8096385542168675, "grad_norm": 0.6015365123041743, "learning_rate": 9.894508317904418e-06, "loss": 0.1751, "mean_token_accuracy": 0.9412707760930061, "num_tokens": 2748479.0, "step": 42 }, { "epoch": 0.8481927710843373, "grad_norm": 0.6316203175238668, "learning_rate": 9.864637843352916e-06, "loss": 0.184, "mean_token_accuracy": 0.9374923817813396, "num_tokens": 2879551.0, "step": 44 }, { "epoch": 0.8867469879518072, "grad_norm": 0.5904610746669308, "learning_rate": 9.831105421918287e-06, "loss": 0.1777, "mean_token_accuracy": 0.9405580870807171, "num_tokens": 3010185.0, "step": 46 }, { "epoch": 0.9253012048192771, "grad_norm": 0.5994215271575196, "learning_rate": 9.793936295756292e-06, "loss": 0.187, "mean_token_accuracy": 0.9375152811408043, "num_tokens": 3141257.0, "step": 48 }, { "epoch": 0.963855421686747, "grad_norm": 0.5854742456446934, "learning_rate": 9.753158444620013e-06, "loss": 0.1815, "mean_token_accuracy": 0.9394976831972599, "num_tokens": 3271788.0, "step": 50 }, { "epoch": 1.0192771084337349, "grad_norm": 0.957499837849808, "learning_rate": 9.70880256479758e-06, "loss": 0.2534, "mean_token_accuracy": 0.9437652796506881, "num_tokens": 3435628.0, "step": 52 }, { "epoch": 1.0578313253012048, "grad_norm": 0.6854514205992324, "learning_rate": 9.660902046004954e-06, "loss": 0.151, "mean_token_accuracy": 0.9503083899617195, "num_tokens": 3566700.0, "step": 54 }, { "epoch": 1.0963855421686748, "grad_norm": 0.6080507225701574, "learning_rate": 9.60949294625121e-06, "loss": 0.1415, "mean_token_accuracy": 0.9535066671669483, "num_tokens": 3697772.0, "step": 56 }, { "epoch": 1.1349397590361445, "grad_norm": 0.6054065882233389, "learning_rate": 9.554613964695189e-06, "loss": 0.1493, "mean_token_accuracy": 0.9502549581229687, "num_tokens": 3828844.0, "step": 58 }, { "epoch": 1.1734939759036145, "grad_norm": 0.7694600057204949, "learning_rate": 9.496306412513989e-06, "loss": 0.1462, "mean_token_accuracy": 0.9519953094422817, "num_tokens": 3959916.0, "step": 60 }, { "epoch": 1.1734939759036145, "eval_loss": 0.359206885099411, "eval_mean_token_accuracy": 0.9007850846397543, "eval_num_tokens": 3959916.0, "eval_runtime": 69.8215, "eval_samples_per_second": 12.246, "eval_steps_per_second": 1.532, "step": 60 }, { "epoch": 1.2120481927710842, "grad_norm": 0.6845669867023433, "learning_rate": 9.434614181805203e-06, "loss": 0.1407, "mean_token_accuracy": 0.9533876590430737, "num_tokens": 4089879.0, "step": 62 }, { "epoch": 1.2506024096385542, "grad_norm": 0.6197114152379135, "learning_rate": 9.369583712546322e-06, "loss": 0.1349, "mean_token_accuracy": 0.9554836452007294, "num_tokens": 4220951.0, "step": 64 }, { "epoch": 1.2891566265060241, "grad_norm": 0.6172158164875755, "learning_rate": 9.30126395763618e-06, "loss": 0.1535, "mean_token_accuracy": 0.95006413012743, "num_tokens": 4352023.0, "step": 66 }, { "epoch": 1.3277108433734939, "grad_norm": 0.6409060214608714, "learning_rate": 9.229706346044749e-06, "loss": 0.156, "mean_token_accuracy": 0.9484306424856186, "num_tokens": 4483095.0, "step": 68 }, { "epoch": 1.3662650602409638, "grad_norm": 0.6166450609513697, "learning_rate": 9.154964744099006e-06, "loss": 0.1419, "mean_token_accuracy": 0.9533540047705173, "num_tokens": 4614167.0, "step": 70 }, { "epoch": 1.4048192771084338, "grad_norm": 0.6058092262037136, "learning_rate": 9.077095414934076e-06, "loss": 0.1439, "mean_token_accuracy": 0.9524685628712177, "num_tokens": 4745239.0, "step": 72 }, { "epoch": 1.4433734939759035, "grad_norm": 0.6464674278239464, "learning_rate": 8.996156976140088e-06, "loss": 0.1427, "mean_token_accuracy": 0.9521632380783558, "num_tokens": 4876311.0, "step": 74 }, { "epoch": 1.4819277108433735, "grad_norm": 0.6232124362016298, "learning_rate": 8.91221035563669e-06, "loss": 0.1387, "mean_token_accuracy": 0.9537738263607025, "num_tokens": 5007383.0, "step": 76 }, { "epoch": 1.5204819277108435, "grad_norm": 0.6251055517263481, "learning_rate": 8.82531874580844e-06, "loss": 0.1544, "mean_token_accuracy": 0.9496977403759956, "num_tokens": 5138455.0, "step": 78 }, { "epoch": 1.5590361445783132, "grad_norm": 0.6597130966145244, "learning_rate": 8.735547555935538e-06, "loss": 0.1467, "mean_token_accuracy": 0.951957143843174, "num_tokens": 5269527.0, "step": 80 }, { "epoch": 1.5590361445783132, "eval_loss": 0.34304243326187134, "eval_mean_token_accuracy": 0.9011661727851796, "eval_num_tokens": 5269527.0, "eval_runtime": 69.6573, "eval_samples_per_second": 12.274, "eval_steps_per_second": 1.536, "step": 80 }, { "epoch": 1.5975903614457831, "grad_norm": 0.6093216234766912, "learning_rate": 8.642964362955781e-06, "loss": 0.145, "mean_token_accuracy": 0.9515700563788414, "num_tokens": 5400161.0, "step": 82 }, { "epoch": 1.636144578313253, "grad_norm": 0.5687703380048487, "learning_rate": 8.547638860594765e-06, "loss": 0.1484, "mean_token_accuracy": 0.9509495720267296, "num_tokens": 5531233.0, "step": 84 }, { "epoch": 1.6746987951807228, "grad_norm": 0.6551898466798518, "learning_rate": 8.449642806902623e-06, "loss": 0.1568, "mean_token_accuracy": 0.9481558501720428, "num_tokens": 5662305.0, "step": 86 }, { "epoch": 1.7132530120481928, "grad_norm": 0.6433780292504243, "learning_rate": 8.349049970236822e-06, "loss": 0.1349, "mean_token_accuracy": 0.954715259373188, "num_tokens": 5792219.0, "step": 88 }, { "epoch": 1.7518072289156628, "grad_norm": 0.5701046312406493, "learning_rate": 8.245936073731654e-06, "loss": 0.147, "mean_token_accuracy": 0.9507969096302986, "num_tokens": 5923291.0, "step": 90 }, { "epoch": 1.7903614457831325, "grad_norm": 0.6865332623152001, "learning_rate": 8.140378738296233e-06, "loss": 0.1529, "mean_token_accuracy": 0.9498768150806427, "num_tokens": 6053822.0, "step": 92 }, { "epoch": 1.8289156626506025, "grad_norm": 0.6305307568855328, "learning_rate": 8.032457424183909e-06, "loss": 0.1476, "mean_token_accuracy": 0.9505984485149384, "num_tokens": 6184894.0, "step": 94 }, { "epoch": 1.8674698795180724, "grad_norm": 0.5748443476790706, "learning_rate": 7.922253371177081e-06, "loss": 0.155, "mean_token_accuracy": 0.9482144415378571, "num_tokens": 6315149.0, "step": 96 }, { "epoch": 1.9060240963855422, "grad_norm": 0.5993128969226361, "learning_rate": 7.809849537432432e-06, "loss": 0.1434, "mean_token_accuracy": 0.9525645859539509, "num_tokens": 6445345.0, "step": 98 }, { "epoch": 1.944578313253012, "grad_norm": 0.6280456904784001, "learning_rate": 7.695330537032629e-06, "loss": 0.1445, "mean_token_accuracy": 0.9512222707271576, "num_tokens": 6576344.0, "step": 100 }, { "epoch": 1.944578313253012, "eval_loss": 0.3398211598396301, "eval_mean_token_accuracy": 0.901328669530209, "eval_num_tokens": 6576344.0, "eval_runtime": 69.654, "eval_samples_per_second": 12.275, "eval_steps_per_second": 1.536, "step": 100 }, { "epoch": 1.983132530120482, "grad_norm": 0.6197902890500856, "learning_rate": 7.578782576291501e-06, "loss": 0.1506, "mean_token_accuracy": 0.9492092207074165, "num_tokens": 6707416.0, "step": 102 }, { "epoch": 2.0385542168674697, "grad_norm": 0.6409344863530665, "learning_rate": 7.460293388860616e-06, "loss": 0.1754, "mean_token_accuracy": 0.9643502771854401, "num_tokens": 6871256.0, "step": 104 }, { "epoch": 2.07710843373494, "grad_norm": 0.6097248296204885, "learning_rate": 7.3399521696861505e-06, "loss": 0.1092, "mean_token_accuracy": 0.9659219309687614, "num_tokens": 7002255.0, "step": 106 }, { "epoch": 2.1156626506024097, "grad_norm": 0.5903613108322504, "learning_rate": 7.217849507865724e-06, "loss": 0.1066, "mean_token_accuracy": 0.9660860486328602, "num_tokens": 7133327.0, "step": 108 }, { "epoch": 2.1542168674698794, "grad_norm": 0.625091072426359, "learning_rate": 7.094077318455762e-06, "loss": 0.1091, "mean_token_accuracy": 0.9645588099956512, "num_tokens": 7263523.0, "step": 110 }, { "epoch": 2.1927710843373496, "grad_norm": 0.6604015968164485, "learning_rate": 6.96872877328073e-06, "loss": 0.1052, "mean_token_accuracy": 0.9661929123103619, "num_tokens": 7394595.0, "step": 112 }, { "epoch": 2.2313253012048193, "grad_norm": 0.7455880093770229, "learning_rate": 6.841898230796302e-06, "loss": 0.1049, "mean_token_accuracy": 0.9661089479923248, "num_tokens": 7525667.0, "step": 114 }, { "epoch": 2.269879518072289, "grad_norm": 0.6028303919109465, "learning_rate": 6.713681165059271e-06, "loss": 0.1127, "mean_token_accuracy": 0.9631625637412071, "num_tokens": 7656739.0, "step": 116 }, { "epoch": 2.3084337349397592, "grad_norm": 0.6799912009709536, "learning_rate": 6.584174093857676e-06, "loss": 0.1035, "mean_token_accuracy": 0.9669562242925167, "num_tokens": 7787811.0, "step": 118 }, { "epoch": 2.346987951807229, "grad_norm": 0.6255570427114552, "learning_rate": 6.453474506055228e-06, "loss": 0.1176, "mean_token_accuracy": 0.9615787602961063, "num_tokens": 7916616.0, "step": 120 }, { "epoch": 2.346987951807229, "eval_loss": 0.38193774223327637, "eval_mean_token_accuracy": 0.8994210568543907, "eval_num_tokens": 7916616.0, "eval_runtime": 69.6436, "eval_samples_per_second": 12.277, "eval_steps_per_second": 1.536, "step": 120 }, { "epoch": 2.3855421686746987, "grad_norm": 0.6279356138996781, "learning_rate": 6.3216807882047585e-06, "loss": 0.0974, "mean_token_accuracy": 0.968185156583786, "num_tokens": 8047688.0, "step": 122 }, { "epoch": 2.4240963855421684, "grad_norm": 0.6479503216427691, "learning_rate": 6.188892150485904e-06, "loss": 0.1087, "mean_token_accuracy": 0.9651853404939175, "num_tokens": 8178760.0, "step": 124 }, { "epoch": 2.4626506024096386, "grad_norm": 0.7228376218883897, "learning_rate": 6.0552085520227875e-06, "loss": 0.1136, "mean_token_accuracy": 0.9631396643817425, "num_tokens": 8309832.0, "step": 126 }, { "epoch": 2.5012048192771084, "grad_norm": 0.6292530226739607, "learning_rate": 5.920730625637934e-06, "loss": 0.1043, "mean_token_accuracy": 0.9666203670203686, "num_tokens": 8440904.0, "step": 128 }, { "epoch": 2.539759036144578, "grad_norm": 0.6120273359022707, "learning_rate": 5.785559602099019e-06, "loss": 0.1073, "mean_token_accuracy": 0.9648876488208771, "num_tokens": 8571976.0, "step": 130 }, { "epoch": 2.5783132530120483, "grad_norm": 0.6294342722298523, "learning_rate": 5.649797233915539e-06, "loss": 0.1092, "mean_token_accuracy": 0.9644067622721195, "num_tokens": 8703048.0, "step": 132 }, { "epoch": 2.616867469879518, "grad_norm": 0.5665304014502571, "learning_rate": 5.513545718742702e-06, "loss": 0.1086, "mean_token_accuracy": 0.9646815545856953, "num_tokens": 8834120.0, "step": 134 }, { "epoch": 2.6554216867469878, "grad_norm": 0.5673111264101424, "learning_rate": 5.376907622450229e-06, "loss": 0.1154, "mean_token_accuracy": 0.9624109007418156, "num_tokens": 8964375.0, "step": 136 }, { "epoch": 2.693975903614458, "grad_norm": 0.5636466902202368, "learning_rate": 5.2399858019140005e-06, "loss": 0.1045, "mean_token_accuracy": 0.9666311480104923, "num_tokens": 9094906.0, "step": 138 }, { "epoch": 2.7325301204819277, "grad_norm": 0.5754464602822424, "learning_rate": 5.102883327588608e-06, "loss": 0.1075, "mean_token_accuracy": 0.9647044539451599, "num_tokens": 9225978.0, "step": 140 }, { "epoch": 2.7325301204819277, "eval_loss": 0.37826669216156006, "eval_mean_token_accuracy": 0.8995784972315637, "eval_num_tokens": 9225978.0, "eval_runtime": 69.6803, "eval_samples_per_second": 12.27, "eval_steps_per_second": 1.536, "step": 140 }, { "epoch": 2.7710843373493974, "grad_norm": 0.5987257906687522, "learning_rate": 4.965703405919154e-06, "loss": 0.1041, "mean_token_accuracy": 0.9660173505544662, "num_tokens": 9357050.0, "step": 142 }, { "epoch": 2.8096385542168676, "grad_norm": 0.6727909756019579, "learning_rate": 4.828549301650673e-06, "loss": 0.1165, "mean_token_accuracy": 0.9626206122338772, "num_tokens": 9488122.0, "step": 144 }, { "epoch": 2.8481927710843373, "grad_norm": 0.5483728501054262, "learning_rate": 4.691524260093672e-06, "loss": 0.1101, "mean_token_accuracy": 0.9640556387603283, "num_tokens": 9619194.0, "step": 146 }, { "epoch": 2.886746987951807, "grad_norm": 0.6578615356471254, "learning_rate": 4.554731429404293e-06, "loss": 0.1167, "mean_token_accuracy": 0.9623610861599445, "num_tokens": 9750266.0, "step": 148 }, { "epoch": 2.9253012048192772, "grad_norm": 0.544341897970942, "learning_rate": 4.4182737829376135e-06, "loss": 0.1068, "mean_token_accuracy": 0.965429600328207, "num_tokens": 9881338.0, "step": 150 }, { "epoch": 2.963855421686747, "grad_norm": 0.5807218274090602, "learning_rate": 4.28225404173254e-06, "loss": 0.1058, "mean_token_accuracy": 0.965176422148943, "num_tokens": 10011972.0, "step": 152 }, { "epoch": 3.019277108433735, "grad_norm": 1.007803950038667, "learning_rate": 4.146774597186622e-06, "loss": 0.1488, "mean_token_accuracy": 0.9695591181516647, "num_tokens": 10175812.0, "step": 154 }, { "epoch": 3.057831325301205, "grad_norm": 0.6613641201206724, "learning_rate": 4.011937433979014e-06, "loss": 0.0847, "mean_token_accuracy": 0.9746656753122807, "num_tokens": 10306884.0, "step": 156 }, { "epoch": 3.0963855421686746, "grad_norm": 0.5427167115705699, "learning_rate": 3.87784405329962e-06, "loss": 0.0838, "mean_token_accuracy": 0.9741344675421715, "num_tokens": 10437883.0, "step": 158 }, { "epoch": 3.1349397590361447, "grad_norm": 0.5059704125761413, "learning_rate": 3.744595396442169e-06, "loss": 0.0814, "mean_token_accuracy": 0.9750473313033581, "num_tokens": 10568955.0, "step": 160 }, { "epoch": 3.1349397590361447, "eval_loss": 0.4201391637325287, "eval_mean_token_accuracy": 0.8986482670374005, "eval_num_tokens": 10568955.0, "eval_runtime": 69.8903, "eval_samples_per_second": 12.233, "eval_steps_per_second": 1.531, "step": 160 }, { "epoch": 3.1734939759036145, "grad_norm": 0.4955524619584041, "learning_rate": 3.612291768818772e-06, "loss": 0.0827, "mean_token_accuracy": 0.9744977466762066, "num_tokens": 10700027.0, "step": 162 }, { "epoch": 3.212048192771084, "grad_norm": 0.5481909266796648, "learning_rate": 3.4810327644531606e-06, "loss": 0.0804, "mean_token_accuracy": 0.9746122434735298, "num_tokens": 10831099.0, "step": 164 }, { "epoch": 3.2506024096385544, "grad_norm": 0.5869274418415635, "learning_rate": 3.3509171910094162e-06, "loss": 0.0849, "mean_token_accuracy": 0.9735665060579777, "num_tokens": 10962171.0, "step": 166 }, { "epoch": 3.289156626506024, "grad_norm": 0.5997938570160334, "learning_rate": 3.222042995412669e-06, "loss": 0.0826, "mean_token_accuracy": 0.9744274839758873, "num_tokens": 11092367.0, "step": 168 }, { "epoch": 3.327710843373494, "grad_norm": 0.5638967234440626, "learning_rate": 3.094507190117715e-06, "loss": 0.0752, "mean_token_accuracy": 0.9760014712810516, "num_tokens": 11223439.0, "step": 170 }, { "epoch": 3.3662650602409636, "grad_norm": 0.5677450107311146, "learning_rate": 2.9684057800810844e-06, "loss": 0.0849, "mean_token_accuracy": 0.9734520092606544, "num_tokens": 11354511.0, "step": 172 }, { "epoch": 3.404819277108434, "grad_norm": 0.5694190125459168, "learning_rate": 2.8438336904915186e-06, "loss": 0.0907, "mean_token_accuracy": 0.9719940833747387, "num_tokens": 11485583.0, "step": 174 }, { "epoch": 3.4433734939759035, "grad_norm": 0.5008764813796651, "learning_rate": 2.7208846953132685e-06, "loss": 0.0782, "mean_token_accuracy": 0.9755356945097446, "num_tokens": 11616217.0, "step": 176 }, { "epoch": 3.4819277108433733, "grad_norm": 0.5027767263738213, "learning_rate": 2.599651346695979e-06, "loss": 0.0773, "mean_token_accuracy": 0.9762609973549843, "num_tokens": 11747289.0, "step": 178 }, { "epoch": 3.5204819277108435, "grad_norm": 0.5747857741850161, "learning_rate": 2.4802249053043525e-06, "loss": 0.0777, "mean_token_accuracy": 0.976215198636055, "num_tokens": 11878361.0, "step": 180 }, { "epoch": 3.5204819277108435, "eval_loss": 0.43149346113204956, "eval_mean_token_accuracy": 0.898219308563482, "eval_num_tokens": 11878361.0, "eval_runtime": 69.6743, "eval_samples_per_second": 12.271, "eval_steps_per_second": 1.536, "step": 180 }, { "epoch": 3.559036144578313, "grad_norm": 0.5115273312879999, "learning_rate": 2.3626952716199647e-06, "loss": 0.0792, "mean_token_accuracy": 0.9750167988240719, "num_tokens": 12009433.0, "step": 182 }, { "epoch": 3.597590361445783, "grad_norm": 0.5172911491980401, "learning_rate": 2.247150918267008e-06, "loss": 0.0851, "mean_token_accuracy": 0.9730398207902908, "num_tokens": 12140505.0, "step": 184 }, { "epoch": 3.636144578313253, "grad_norm": 0.5260093719963543, "learning_rate": 2.133678823412873e-06, "loss": 0.0797, "mean_token_accuracy": 0.9751236625015736, "num_tokens": 12271577.0, "step": 186 }, { "epoch": 3.674698795180723, "grad_norm": 0.5267292864138245, "learning_rate": 2.022364405293703e-06, "loss": 0.0832, "mean_token_accuracy": 0.9738947302103043, "num_tokens": 12402649.0, "step": 188 }, { "epoch": 3.7132530120481926, "grad_norm": 0.5065512725199254, "learning_rate": 1.913291457914234e-06, "loss": 0.0856, "mean_token_accuracy": 0.9732001163065434, "num_tokens": 12533721.0, "step": 190 }, { "epoch": 3.7518072289156628, "grad_norm": 0.5465242770321679, "learning_rate": 1.8065420879702888e-06, "loss": 0.0838, "mean_token_accuracy": 0.9731762520968914, "num_tokens": 12663435.0, "step": 192 }, { "epoch": 3.7903614457831325, "grad_norm": 0.7823063875533764, "learning_rate": 1.7021966530414303e-06, "loss": 0.0762, "mean_token_accuracy": 0.9758411757647991, "num_tokens": 12794507.0, "step": 194 }, { "epoch": 3.8289156626506022, "grad_norm": 0.571380544699335, "learning_rate": 1.6003337011002928e-06, "loss": 0.084, "mean_token_accuracy": 0.9734901748597622, "num_tokens": 12925579.0, "step": 196 }, { "epoch": 3.8674698795180724, "grad_norm": 0.5400258981871386, "learning_rate": 1.5010299113841397e-06, "loss": 0.0807, "mean_token_accuracy": 0.9752305261790752, "num_tokens": 13056651.0, "step": 198 }, { "epoch": 3.906024096385542, "grad_norm": 0.5204832843446408, "learning_rate": 1.4043600366731213e-06, "loss": 0.0821, "mean_token_accuracy": 0.9745206460356712, "num_tokens": 13187723.0, "step": 200 }, { "epoch": 3.906024096385542, "eval_loss": 0.43459072709083557, "eval_mean_token_accuracy": 0.8980461002510285, "eval_num_tokens": 13187723.0, "eval_runtime": 69.6812, "eval_samples_per_second": 12.27, "eval_steps_per_second": 1.536, "step": 200 }, { "epoch": 3.944578313253012, "grad_norm": 0.5732935867678565, "learning_rate": 1.3103968470187384e-06, "loss": 0.0841, "mean_token_accuracy": 0.973306454718113, "num_tokens": 13317686.0, "step": 202 }, { "epoch": 3.983132530120482, "grad_norm": 0.5049593156468802, "learning_rate": 1.2192110749648233e-06, "loss": 0.0783, "mean_token_accuracy": 0.9752342775464058, "num_tokens": 13447600.0, "step": 204 }, { "epoch": 4.03855421686747, "grad_norm": 0.4900616503984239, "learning_rate": 1.1308713623022988e-06, "loss": 0.1075, "mean_token_accuracy": 0.9786272644996643, "num_tokens": 13611440.0, "step": 206 }, { "epoch": 4.0771084337349395, "grad_norm": 0.4917129834327916, "learning_rate": 1.045444208397791e-06, "loss": 0.0676, "mean_token_accuracy": 0.9801687188446522, "num_tokens": 13740537.0, "step": 208 }, { "epoch": 4.11566265060241, "grad_norm": 0.47200516762524886, "learning_rate": 9.629939201349852e-07, "loss": 0.0723, "mean_token_accuracy": 0.9782837741076946, "num_tokens": 13871609.0, "step": 210 }, { "epoch": 4.15421686746988, "grad_norm": 0.44277012092487705, "learning_rate": 8.835825635064266e-07, "loss": 0.0729, "mean_token_accuracy": 0.9780853129923344, "num_tokens": 14002681.0, "step": 212 }, { "epoch": 4.192771084337349, "grad_norm": 0.4753962832603972, "learning_rate": 8.072699168921827e-07, "loss": 0.0749, "mean_token_accuracy": 0.9778944849967957, "num_tokens": 14133753.0, "step": 214 }, { "epoch": 4.231325301204819, "grad_norm": 0.48346978347475456, "learning_rate": 7.341134260605537e-07, "loss": 0.0692, "mean_token_accuracy": 0.9793745614588261, "num_tokens": 14264314.0, "step": 216 }, { "epoch": 4.2698795180722895, "grad_norm": 0.4328206037632282, "learning_rate": 6.641681609246981e-07, "loss": 0.066, "mean_token_accuracy": 0.9801309891045094, "num_tokens": 14395386.0, "step": 218 }, { "epoch": 4.308433734939759, "grad_norm": 0.46221534542018206, "learning_rate": 5.974867740877282e-07, "loss": 0.0696, "mean_token_accuracy": 0.9789478555321693, "num_tokens": 14526458.0, "step": 220 }, { "epoch": 4.308433734939759, "eval_loss": 0.4595886468887329, "eval_mean_token_accuracy": 0.897223442514366, "eval_num_tokens": 14526458.0, "eval_runtime": 69.6441, "eval_samples_per_second": 12.277, "eval_steps_per_second": 1.536, "step": 220 }, { "epoch": 4.346987951807229, "grad_norm": 0.4739286679144528, "learning_rate": 5.341194612074824e-07, "loss": 0.068, "mean_token_accuracy": 0.9796868488192558, "num_tokens": 14656421.0, "step": 222 }, { "epoch": 4.385542168674699, "grad_norm": 0.43096986690967987, "learning_rate": 4.7411392321080606e-07, "loss": 0.0663, "mean_token_accuracy": 0.9802683852612972, "num_tokens": 14787493.0, "step": 224 }, { "epoch": 4.424096385542168, "grad_norm": 0.46557922408208563, "learning_rate": 4.175153303857887e-07, "loss": 0.0654, "mean_token_accuracy": 0.9804821126163006, "num_tokens": 14918565.0, "step": 226 }, { "epoch": 4.462650602409639, "grad_norm": 0.5546707256189516, "learning_rate": 3.643662883789878e-07, "loss": 0.0673, "mean_token_accuracy": 0.979527972638607, "num_tokens": 15049637.0, "step": 228 }, { "epoch": 4.501204819277109, "grad_norm": 0.49021519394663, "learning_rate": 3.1470680612323503e-07, "loss": 0.07, "mean_token_accuracy": 0.9785585664212704, "num_tokens": 15180709.0, "step": 230 }, { "epoch": 4.539759036144578, "grad_norm": 0.45571708386475684, "learning_rate": 2.685742657201601e-07, "loss": 0.0697, "mean_token_accuracy": 0.9785204008221626, "num_tokens": 15311781.0, "step": 232 }, { "epoch": 4.578313253012048, "grad_norm": 0.5641008416839415, "learning_rate": 2.260033943001244e-07, "loss": 0.0663, "mean_token_accuracy": 0.9797416999936104, "num_tokens": 15442853.0, "step": 234 }, { "epoch": 4.6168674698795185, "grad_norm": 0.5607141029792978, "learning_rate": 1.8702623788072028e-07, "loss": 0.0793, "mean_token_accuracy": 0.9755663834512234, "num_tokens": 15573925.0, "step": 236 }, { "epoch": 4.655421686746988, "grad_norm": 0.46095439859311127, "learning_rate": 1.5167213724353426e-07, "loss": 0.0714, "mean_token_accuracy": 0.9779479168355465, "num_tokens": 15704997.0, "step": 238 }, { "epoch": 4.693975903614458, "grad_norm": 0.464368810663561, "learning_rate": 1.199677058473292e-07, "loss": 0.066, "mean_token_accuracy": 0.980153888463974, "num_tokens": 15836069.0, "step": 240 }, { "epoch": 4.693975903614458, "eval_loss": 0.46903374791145325, "eval_mean_token_accuracy": 0.8968599628065234, "eval_num_tokens": 15836069.0, "eval_runtime": 69.6558, "eval_samples_per_second": 12.275, "eval_steps_per_second": 1.536, "step": 240 }, { "epoch": 4.732530120481927, "grad_norm": 0.5162077757262011, "learning_rate": 9.193680979426189e-08, "loss": 0.0775, "mean_token_accuracy": 0.9764594584703445, "num_tokens": 15967141.0, "step": 242 }, { "epoch": 4.771084337349397, "grad_norm": 0.4482450270539155, "learning_rate": 6.760054986423459e-08, "loss": 0.0632, "mean_token_accuracy": 0.9808179698884487, "num_tokens": 16098213.0, "step": 244 }, { "epoch": 4.809638554216868, "grad_norm": 0.4698597407866022, "learning_rate": 4.697724563088646e-08, "loss": 0.0681, "mean_token_accuracy": 0.9797111675143242, "num_tokens": 16229285.0, "step": 246 }, { "epoch": 4.848192771084337, "grad_norm": 0.4662674319978425, "learning_rate": 3.0082421671192576e-08, "loss": 0.0688, "mean_token_accuracy": 0.97944400832057, "num_tokens": 16360357.0, "step": 248 }, { "epoch": 4.886746987951807, "grad_norm": 0.46327536754981147, "learning_rate": 1.692879587904983e-08, "loss": 0.0662, "mean_token_accuracy": 0.9799401611089706, "num_tokens": 16491429.0, "step": 250 }, { "epoch": 4.925301204819277, "grad_norm": 0.4688691090714117, "learning_rate": 7.526269891646176e-09, "loss": 0.0642, "mean_token_accuracy": 0.9807046689093113, "num_tokens": 16621960.0, "step": 252 }, { "epoch": 4.9638554216867465, "grad_norm": 0.4516057398304381, "learning_rate": 1.8819216358156865e-09, "loss": 0.0688, "mean_token_accuracy": 0.9792744368314743, "num_tokens": 16752156.0, "step": 254 }, { "epoch": 4.983132530120482, "mean_token_accuracy": 0.976367861032486, "num_tokens": 16817692.0, "step": 255, "total_flos": 24409842647040.0, "train_loss": 0.12274208276295194, "train_runtime": 3782.9235, "train_samples_per_second": 2.194, "train_steps_per_second": 0.067 } ], "logging_steps": 2, "max_steps": 255, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 24409842647040.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }