|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.983132530120482, |
|
"eval_steps": 20, |
|
"global_step": 255, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03855421686746988, |
|
"grad_norm": 2.0457221564142256, |
|
"learning_rate": 3.846153846153847e-07, |
|
"loss": 0.2354, |
|
"mean_token_accuracy": 0.930065356194973, |
|
"num_tokens": 131072.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.07710843373493977, |
|
"grad_norm": 2.1086974646270145, |
|
"learning_rate": 1.153846153846154e-06, |
|
"loss": 0.2508, |
|
"mean_token_accuracy": 0.9255465492606163, |
|
"num_tokens": 262144.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.11566265060240964, |
|
"grad_norm": 1.698182282959437, |
|
"learning_rate": 1.9230769230769234e-06, |
|
"loss": 0.2473, |
|
"mean_token_accuracy": 0.9256381466984749, |
|
"num_tokens": 393216.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.15421686746987953, |
|
"grad_norm": 1.4331583326698771, |
|
"learning_rate": 2.6923076923076923e-06, |
|
"loss": 0.2193, |
|
"mean_token_accuracy": 0.9314393177628517, |
|
"num_tokens": 524288.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1927710843373494, |
|
"grad_norm": 1.280978852144958, |
|
"learning_rate": 3.4615384615384617e-06, |
|
"loss": 0.2205, |
|
"mean_token_accuracy": 0.930450152605772, |
|
"num_tokens": 654484.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.23132530120481928, |
|
"grad_norm": 0.8255955634911271, |
|
"learning_rate": 4.230769230769231e-06, |
|
"loss": 0.2117, |
|
"mean_token_accuracy": 0.9317141100764275, |
|
"num_tokens": 785556.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.26987951807228916, |
|
"grad_norm": 0.7584680371226415, |
|
"learning_rate": 5e-06, |
|
"loss": 0.206, |
|
"mean_token_accuracy": 0.9338631108403206, |
|
"num_tokens": 915519.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.30843373493975906, |
|
"grad_norm": 0.9495192852210463, |
|
"learning_rate": 5.769230769230769e-06, |
|
"loss": 0.1982, |
|
"mean_token_accuracy": 0.9358359947800636, |
|
"num_tokens": 1046591.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.3469879518072289, |
|
"grad_norm": 0.9714974283482016, |
|
"learning_rate": 6.538461538461539e-06, |
|
"loss": 0.2055, |
|
"mean_token_accuracy": 0.9338132180273533, |
|
"num_tokens": 1177663.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.3855421686746988, |
|
"grad_norm": 0.6339236056292388, |
|
"learning_rate": 7.307692307692308e-06, |
|
"loss": 0.1917, |
|
"mean_token_accuracy": 0.9378740377724171, |
|
"num_tokens": 1308735.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3855421686746988, |
|
"eval_loss": 0.3343917727470398, |
|
"eval_mean_token_accuracy": 0.9013295725127247, |
|
"eval_num_tokens": 1308735.0, |
|
"eval_runtime": 70.0593, |
|
"eval_samples_per_second": 12.204, |
|
"eval_steps_per_second": 1.527, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.42409638554216866, |
|
"grad_norm": 0.7315888499202351, |
|
"learning_rate": 8.076923076923077e-06, |
|
"loss": 0.1809, |
|
"mean_token_accuracy": 0.9400189444422722, |
|
"num_tokens": 1439807.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.46265060240963857, |
|
"grad_norm": 0.7642349616310066, |
|
"learning_rate": 8.846153846153847e-06, |
|
"loss": 0.1928, |
|
"mean_token_accuracy": 0.9367095269262791, |
|
"num_tokens": 1570062.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.5012048192771085, |
|
"grad_norm": 0.6114978913375759, |
|
"learning_rate": 9.615384615384616e-06, |
|
"loss": 0.1828, |
|
"mean_token_accuracy": 0.9394693598151207, |
|
"num_tokens": 1701134.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.5397590361445783, |
|
"grad_norm": 0.6229653774047121, |
|
"learning_rate": 9.999529497453782e-06, |
|
"loss": 0.1806, |
|
"mean_token_accuracy": 0.9402282536029816, |
|
"num_tokens": 1832133.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.5783132530120482, |
|
"grad_norm": 0.6722415161460822, |
|
"learning_rate": 9.99576600836172e-06, |
|
"loss": 0.1896, |
|
"mean_token_accuracy": 0.9363855794072151, |
|
"num_tokens": 1963205.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6168674698795181, |
|
"grad_norm": 0.5974286474799401, |
|
"learning_rate": 9.988241863214212e-06, |
|
"loss": 0.1814, |
|
"mean_token_accuracy": 0.9404540322721004, |
|
"num_tokens": 2094277.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.655421686746988, |
|
"grad_norm": 0.601035342701654, |
|
"learning_rate": 9.976962725951878e-06, |
|
"loss": 0.1801, |
|
"mean_token_accuracy": 0.9400342106819153, |
|
"num_tokens": 2225349.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.6939759036144578, |
|
"grad_norm": 0.5765003488310966, |
|
"learning_rate": 9.961937087155697e-06, |
|
"loss": 0.1828, |
|
"mean_token_accuracy": 0.9392519034445286, |
|
"num_tokens": 2355263.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.7325301204819277, |
|
"grad_norm": 34.52047518558373, |
|
"learning_rate": 9.943176257655567e-06, |
|
"loss": 0.2098, |
|
"mean_token_accuracy": 0.9331491328775883, |
|
"num_tokens": 2486335.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.7710843373493976, |
|
"grad_norm": 0.6276699276820382, |
|
"learning_rate": 9.920694360015864e-06, |
|
"loss": 0.1745, |
|
"mean_token_accuracy": 0.9413929060101509, |
|
"num_tokens": 2617407.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7710843373493976, |
|
"eval_loss": 0.32280808687210083, |
|
"eval_mean_token_accuracy": 0.9021720039510281, |
|
"eval_num_tokens": 2617407.0, |
|
"eval_runtime": 69.6577, |
|
"eval_samples_per_second": 12.274, |
|
"eval_steps_per_second": 1.536, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8096385542168675, |
|
"grad_norm": 0.6015365123041743, |
|
"learning_rate": 9.894508317904418e-06, |
|
"loss": 0.1751, |
|
"mean_token_accuracy": 0.9412707760930061, |
|
"num_tokens": 2748479.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.8481927710843373, |
|
"grad_norm": 0.6316203175238668, |
|
"learning_rate": 9.864637843352916e-06, |
|
"loss": 0.184, |
|
"mean_token_accuracy": 0.9374923817813396, |
|
"num_tokens": 2879551.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.8867469879518072, |
|
"grad_norm": 0.5904610746669308, |
|
"learning_rate": 9.831105421918287e-06, |
|
"loss": 0.1777, |
|
"mean_token_accuracy": 0.9405580870807171, |
|
"num_tokens": 3010185.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.9253012048192771, |
|
"grad_norm": 0.5994215271575196, |
|
"learning_rate": 9.793936295756292e-06, |
|
"loss": 0.187, |
|
"mean_token_accuracy": 0.9375152811408043, |
|
"num_tokens": 3141257.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.5854742456446934, |
|
"learning_rate": 9.753158444620013e-06, |
|
"loss": 0.1815, |
|
"mean_token_accuracy": 0.9394976831972599, |
|
"num_tokens": 3271788.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.0192771084337349, |
|
"grad_norm": 0.957499837849808, |
|
"learning_rate": 9.70880256479758e-06, |
|
"loss": 0.2534, |
|
"mean_token_accuracy": 0.9437652796506881, |
|
"num_tokens": 3435628.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.0578313253012048, |
|
"grad_norm": 0.6854514205992324, |
|
"learning_rate": 9.660902046004954e-06, |
|
"loss": 0.151, |
|
"mean_token_accuracy": 0.9503083899617195, |
|
"num_tokens": 3566700.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.0963855421686748, |
|
"grad_norm": 0.6080507225701574, |
|
"learning_rate": 9.60949294625121e-06, |
|
"loss": 0.1415, |
|
"mean_token_accuracy": 0.9535066671669483, |
|
"num_tokens": 3697772.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.1349397590361445, |
|
"grad_norm": 0.6054065882233389, |
|
"learning_rate": 9.554613964695189e-06, |
|
"loss": 0.1493, |
|
"mean_token_accuracy": 0.9502549581229687, |
|
"num_tokens": 3828844.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.1734939759036145, |
|
"grad_norm": 0.7694600057204949, |
|
"learning_rate": 9.496306412513989e-06, |
|
"loss": 0.1462, |
|
"mean_token_accuracy": 0.9519953094422817, |
|
"num_tokens": 3959916.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.1734939759036145, |
|
"eval_loss": 0.359206885099411, |
|
"eval_mean_token_accuracy": 0.9007850846397543, |
|
"eval_num_tokens": 3959916.0, |
|
"eval_runtime": 69.8215, |
|
"eval_samples_per_second": 12.246, |
|
"eval_steps_per_second": 1.532, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.2120481927710842, |
|
"grad_norm": 0.6845669867023433, |
|
"learning_rate": 9.434614181805203e-06, |
|
"loss": 0.1407, |
|
"mean_token_accuracy": 0.9533876590430737, |
|
"num_tokens": 4089879.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.2506024096385542, |
|
"grad_norm": 0.6197114152379135, |
|
"learning_rate": 9.369583712546322e-06, |
|
"loss": 0.1349, |
|
"mean_token_accuracy": 0.9554836452007294, |
|
"num_tokens": 4220951.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.2891566265060241, |
|
"grad_norm": 0.6172158164875755, |
|
"learning_rate": 9.30126395763618e-06, |
|
"loss": 0.1535, |
|
"mean_token_accuracy": 0.95006413012743, |
|
"num_tokens": 4352023.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.3277108433734939, |
|
"grad_norm": 0.6409060214608714, |
|
"learning_rate": 9.229706346044749e-06, |
|
"loss": 0.156, |
|
"mean_token_accuracy": 0.9484306424856186, |
|
"num_tokens": 4483095.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.3662650602409638, |
|
"grad_norm": 0.6166450609513697, |
|
"learning_rate": 9.154964744099006e-06, |
|
"loss": 0.1419, |
|
"mean_token_accuracy": 0.9533540047705173, |
|
"num_tokens": 4614167.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.4048192771084338, |
|
"grad_norm": 0.6058092262037136, |
|
"learning_rate": 9.077095414934076e-06, |
|
"loss": 0.1439, |
|
"mean_token_accuracy": 0.9524685628712177, |
|
"num_tokens": 4745239.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.4433734939759035, |
|
"grad_norm": 0.6464674278239464, |
|
"learning_rate": 8.996156976140088e-06, |
|
"loss": 0.1427, |
|
"mean_token_accuracy": 0.9521632380783558, |
|
"num_tokens": 4876311.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.4819277108433735, |
|
"grad_norm": 0.6232124362016298, |
|
"learning_rate": 8.91221035563669e-06, |
|
"loss": 0.1387, |
|
"mean_token_accuracy": 0.9537738263607025, |
|
"num_tokens": 5007383.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.5204819277108435, |
|
"grad_norm": 0.6251055517263481, |
|
"learning_rate": 8.82531874580844e-06, |
|
"loss": 0.1544, |
|
"mean_token_accuracy": 0.9496977403759956, |
|
"num_tokens": 5138455.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.5590361445783132, |
|
"grad_norm": 0.6597130966145244, |
|
"learning_rate": 8.735547555935538e-06, |
|
"loss": 0.1467, |
|
"mean_token_accuracy": 0.951957143843174, |
|
"num_tokens": 5269527.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.5590361445783132, |
|
"eval_loss": 0.34304243326187134, |
|
"eval_mean_token_accuracy": 0.9011661727851796, |
|
"eval_num_tokens": 5269527.0, |
|
"eval_runtime": 69.6573, |
|
"eval_samples_per_second": 12.274, |
|
"eval_steps_per_second": 1.536, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.5975903614457831, |
|
"grad_norm": 0.6093216234766912, |
|
"learning_rate": 8.642964362955781e-06, |
|
"loss": 0.145, |
|
"mean_token_accuracy": 0.9515700563788414, |
|
"num_tokens": 5400161.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.636144578313253, |
|
"grad_norm": 0.5687703380048487, |
|
"learning_rate": 8.547638860594765e-06, |
|
"loss": 0.1484, |
|
"mean_token_accuracy": 0.9509495720267296, |
|
"num_tokens": 5531233.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.6746987951807228, |
|
"grad_norm": 0.6551898466798518, |
|
"learning_rate": 8.449642806902623e-06, |
|
"loss": 0.1568, |
|
"mean_token_accuracy": 0.9481558501720428, |
|
"num_tokens": 5662305.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.7132530120481928, |
|
"grad_norm": 0.6433780292504243, |
|
"learning_rate": 8.349049970236822e-06, |
|
"loss": 0.1349, |
|
"mean_token_accuracy": 0.954715259373188, |
|
"num_tokens": 5792219.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.7518072289156628, |
|
"grad_norm": 0.5701046312406493, |
|
"learning_rate": 8.245936073731654e-06, |
|
"loss": 0.147, |
|
"mean_token_accuracy": 0.9507969096302986, |
|
"num_tokens": 5923291.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.7903614457831325, |
|
"grad_norm": 0.6865332623152001, |
|
"learning_rate": 8.140378738296233e-06, |
|
"loss": 0.1529, |
|
"mean_token_accuracy": 0.9498768150806427, |
|
"num_tokens": 6053822.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.8289156626506025, |
|
"grad_norm": 0.6305307568855328, |
|
"learning_rate": 8.032457424183909e-06, |
|
"loss": 0.1476, |
|
"mean_token_accuracy": 0.9505984485149384, |
|
"num_tokens": 6184894.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.8674698795180724, |
|
"grad_norm": 0.5748443476790706, |
|
"learning_rate": 7.922253371177081e-06, |
|
"loss": 0.155, |
|
"mean_token_accuracy": 0.9482144415378571, |
|
"num_tokens": 6315149.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.9060240963855422, |
|
"grad_norm": 0.5993128969226361, |
|
"learning_rate": 7.809849537432432e-06, |
|
"loss": 0.1434, |
|
"mean_token_accuracy": 0.9525645859539509, |
|
"num_tokens": 6445345.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.944578313253012, |
|
"grad_norm": 0.6280456904784001, |
|
"learning_rate": 7.695330537032629e-06, |
|
"loss": 0.1445, |
|
"mean_token_accuracy": 0.9512222707271576, |
|
"num_tokens": 6576344.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.944578313253012, |
|
"eval_loss": 0.3398211598396301, |
|
"eval_mean_token_accuracy": 0.901328669530209, |
|
"eval_num_tokens": 6576344.0, |
|
"eval_runtime": 69.654, |
|
"eval_samples_per_second": 12.275, |
|
"eval_steps_per_second": 1.536, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.983132530120482, |
|
"grad_norm": 0.6197902890500856, |
|
"learning_rate": 7.578782576291501e-06, |
|
"loss": 0.1506, |
|
"mean_token_accuracy": 0.9492092207074165, |
|
"num_tokens": 6707416.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.0385542168674697, |
|
"grad_norm": 0.6409344863530665, |
|
"learning_rate": 7.460293388860616e-06, |
|
"loss": 0.1754, |
|
"mean_token_accuracy": 0.9643502771854401, |
|
"num_tokens": 6871256.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 2.07710843373494, |
|
"grad_norm": 0.6097248296204885, |
|
"learning_rate": 7.3399521696861505e-06, |
|
"loss": 0.1092, |
|
"mean_token_accuracy": 0.9659219309687614, |
|
"num_tokens": 7002255.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 2.1156626506024097, |
|
"grad_norm": 0.5903613108322504, |
|
"learning_rate": 7.217849507865724e-06, |
|
"loss": 0.1066, |
|
"mean_token_accuracy": 0.9660860486328602, |
|
"num_tokens": 7133327.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 2.1542168674698794, |
|
"grad_norm": 0.625091072426359, |
|
"learning_rate": 7.094077318455762e-06, |
|
"loss": 0.1091, |
|
"mean_token_accuracy": 0.9645588099956512, |
|
"num_tokens": 7263523.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.1927710843373496, |
|
"grad_norm": 0.6604015968164485, |
|
"learning_rate": 6.96872877328073e-06, |
|
"loss": 0.1052, |
|
"mean_token_accuracy": 0.9661929123103619, |
|
"num_tokens": 7394595.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.2313253012048193, |
|
"grad_norm": 0.7455880093770229, |
|
"learning_rate": 6.841898230796302e-06, |
|
"loss": 0.1049, |
|
"mean_token_accuracy": 0.9661089479923248, |
|
"num_tokens": 7525667.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.269879518072289, |
|
"grad_norm": 0.6028303919109465, |
|
"learning_rate": 6.713681165059271e-06, |
|
"loss": 0.1127, |
|
"mean_token_accuracy": 0.9631625637412071, |
|
"num_tokens": 7656739.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.3084337349397592, |
|
"grad_norm": 0.6799912009709536, |
|
"learning_rate": 6.584174093857676e-06, |
|
"loss": 0.1035, |
|
"mean_token_accuracy": 0.9669562242925167, |
|
"num_tokens": 7787811.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.346987951807229, |
|
"grad_norm": 0.6255570427114552, |
|
"learning_rate": 6.453474506055228e-06, |
|
"loss": 0.1176, |
|
"mean_token_accuracy": 0.9615787602961063, |
|
"num_tokens": 7916616.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.346987951807229, |
|
"eval_loss": 0.38193774223327637, |
|
"eval_mean_token_accuracy": 0.8994210568543907, |
|
"eval_num_tokens": 7916616.0, |
|
"eval_runtime": 69.6436, |
|
"eval_samples_per_second": 12.277, |
|
"eval_steps_per_second": 1.536, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.3855421686746987, |
|
"grad_norm": 0.6279356138996781, |
|
"learning_rate": 6.3216807882047585e-06, |
|
"loss": 0.0974, |
|
"mean_token_accuracy": 0.968185156583786, |
|
"num_tokens": 8047688.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.4240963855421684, |
|
"grad_norm": 0.6479503216427691, |
|
"learning_rate": 6.188892150485904e-06, |
|
"loss": 0.1087, |
|
"mean_token_accuracy": 0.9651853404939175, |
|
"num_tokens": 8178760.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.4626506024096386, |
|
"grad_norm": 0.7228376218883897, |
|
"learning_rate": 6.0552085520227875e-06, |
|
"loss": 0.1136, |
|
"mean_token_accuracy": 0.9631396643817425, |
|
"num_tokens": 8309832.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.5012048192771084, |
|
"grad_norm": 0.6292530226739607, |
|
"learning_rate": 5.920730625637934e-06, |
|
"loss": 0.1043, |
|
"mean_token_accuracy": 0.9666203670203686, |
|
"num_tokens": 8440904.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.539759036144578, |
|
"grad_norm": 0.6120273359022707, |
|
"learning_rate": 5.785559602099019e-06, |
|
"loss": 0.1073, |
|
"mean_token_accuracy": 0.9648876488208771, |
|
"num_tokens": 8571976.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.5783132530120483, |
|
"grad_norm": 0.6294342722298523, |
|
"learning_rate": 5.649797233915539e-06, |
|
"loss": 0.1092, |
|
"mean_token_accuracy": 0.9644067622721195, |
|
"num_tokens": 8703048.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.616867469879518, |
|
"grad_norm": 0.5665304014502571, |
|
"learning_rate": 5.513545718742702e-06, |
|
"loss": 0.1086, |
|
"mean_token_accuracy": 0.9646815545856953, |
|
"num_tokens": 8834120.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.6554216867469878, |
|
"grad_norm": 0.5673111264101424, |
|
"learning_rate": 5.376907622450229e-06, |
|
"loss": 0.1154, |
|
"mean_token_accuracy": 0.9624109007418156, |
|
"num_tokens": 8964375.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.693975903614458, |
|
"grad_norm": 0.5636466902202368, |
|
"learning_rate": 5.2399858019140005e-06, |
|
"loss": 0.1045, |
|
"mean_token_accuracy": 0.9666311480104923, |
|
"num_tokens": 9094906.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.7325301204819277, |
|
"grad_norm": 0.5754464602822424, |
|
"learning_rate": 5.102883327588608e-06, |
|
"loss": 0.1075, |
|
"mean_token_accuracy": 0.9647044539451599, |
|
"num_tokens": 9225978.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.7325301204819277, |
|
"eval_loss": 0.37826669216156006, |
|
"eval_mean_token_accuracy": 0.8995784972315637, |
|
"eval_num_tokens": 9225978.0, |
|
"eval_runtime": 69.6803, |
|
"eval_samples_per_second": 12.27, |
|
"eval_steps_per_second": 1.536, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.7710843373493974, |
|
"grad_norm": 0.5987257906687522, |
|
"learning_rate": 4.965703405919154e-06, |
|
"loss": 0.1041, |
|
"mean_token_accuracy": 0.9660173505544662, |
|
"num_tokens": 9357050.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.8096385542168676, |
|
"grad_norm": 0.6727909756019579, |
|
"learning_rate": 4.828549301650673e-06, |
|
"loss": 0.1165, |
|
"mean_token_accuracy": 0.9626206122338772, |
|
"num_tokens": 9488122.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.8481927710843373, |
|
"grad_norm": 0.5483728501054262, |
|
"learning_rate": 4.691524260093672e-06, |
|
"loss": 0.1101, |
|
"mean_token_accuracy": 0.9640556387603283, |
|
"num_tokens": 9619194.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.886746987951807, |
|
"grad_norm": 0.6578615356471254, |
|
"learning_rate": 4.554731429404293e-06, |
|
"loss": 0.1167, |
|
"mean_token_accuracy": 0.9623610861599445, |
|
"num_tokens": 9750266.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.9253012048192772, |
|
"grad_norm": 0.544341897970942, |
|
"learning_rate": 4.4182737829376135e-06, |
|
"loss": 0.1068, |
|
"mean_token_accuracy": 0.965429600328207, |
|
"num_tokens": 9881338.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.963855421686747, |
|
"grad_norm": 0.5807218274090602, |
|
"learning_rate": 4.28225404173254e-06, |
|
"loss": 0.1058, |
|
"mean_token_accuracy": 0.965176422148943, |
|
"num_tokens": 10011972.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 3.019277108433735, |
|
"grad_norm": 1.007803950038667, |
|
"learning_rate": 4.146774597186622e-06, |
|
"loss": 0.1488, |
|
"mean_token_accuracy": 0.9695591181516647, |
|
"num_tokens": 10175812.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 3.057831325301205, |
|
"grad_norm": 0.6613641201206724, |
|
"learning_rate": 4.011937433979014e-06, |
|
"loss": 0.0847, |
|
"mean_token_accuracy": 0.9746656753122807, |
|
"num_tokens": 10306884.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 3.0963855421686746, |
|
"grad_norm": 0.5427167115705699, |
|
"learning_rate": 3.87784405329962e-06, |
|
"loss": 0.0838, |
|
"mean_token_accuracy": 0.9741344675421715, |
|
"num_tokens": 10437883.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 3.1349397590361447, |
|
"grad_norm": 0.5059704125761413, |
|
"learning_rate": 3.744595396442169e-06, |
|
"loss": 0.0814, |
|
"mean_token_accuracy": 0.9750473313033581, |
|
"num_tokens": 10568955.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.1349397590361447, |
|
"eval_loss": 0.4201391637325287, |
|
"eval_mean_token_accuracy": 0.8986482670374005, |
|
"eval_num_tokens": 10568955.0, |
|
"eval_runtime": 69.8903, |
|
"eval_samples_per_second": 12.233, |
|
"eval_steps_per_second": 1.531, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.1734939759036145, |
|
"grad_norm": 0.4955524619584041, |
|
"learning_rate": 3.612291768818772e-06, |
|
"loss": 0.0827, |
|
"mean_token_accuracy": 0.9744977466762066, |
|
"num_tokens": 10700027.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 3.212048192771084, |
|
"grad_norm": 0.5481909266796648, |
|
"learning_rate": 3.4810327644531606e-06, |
|
"loss": 0.0804, |
|
"mean_token_accuracy": 0.9746122434735298, |
|
"num_tokens": 10831099.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 3.2506024096385544, |
|
"grad_norm": 0.5869274418415635, |
|
"learning_rate": 3.3509171910094162e-06, |
|
"loss": 0.0849, |
|
"mean_token_accuracy": 0.9735665060579777, |
|
"num_tokens": 10962171.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 3.289156626506024, |
|
"grad_norm": 0.5997938570160334, |
|
"learning_rate": 3.222042995412669e-06, |
|
"loss": 0.0826, |
|
"mean_token_accuracy": 0.9744274839758873, |
|
"num_tokens": 11092367.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 3.327710843373494, |
|
"grad_norm": 0.5638967234440626, |
|
"learning_rate": 3.094507190117715e-06, |
|
"loss": 0.0752, |
|
"mean_token_accuracy": 0.9760014712810516, |
|
"num_tokens": 11223439.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.3662650602409636, |
|
"grad_norm": 0.5677450107311146, |
|
"learning_rate": 2.9684057800810844e-06, |
|
"loss": 0.0849, |
|
"mean_token_accuracy": 0.9734520092606544, |
|
"num_tokens": 11354511.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 3.404819277108434, |
|
"grad_norm": 0.5694190125459168, |
|
"learning_rate": 2.8438336904915186e-06, |
|
"loss": 0.0907, |
|
"mean_token_accuracy": 0.9719940833747387, |
|
"num_tokens": 11485583.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 3.4433734939759035, |
|
"grad_norm": 0.5008764813796651, |
|
"learning_rate": 2.7208846953132685e-06, |
|
"loss": 0.0782, |
|
"mean_token_accuracy": 0.9755356945097446, |
|
"num_tokens": 11616217.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 3.4819277108433733, |
|
"grad_norm": 0.5027767263738213, |
|
"learning_rate": 2.599651346695979e-06, |
|
"loss": 0.0773, |
|
"mean_token_accuracy": 0.9762609973549843, |
|
"num_tokens": 11747289.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 3.5204819277108435, |
|
"grad_norm": 0.5747857741850161, |
|
"learning_rate": 2.4802249053043525e-06, |
|
"loss": 0.0777, |
|
"mean_token_accuracy": 0.976215198636055, |
|
"num_tokens": 11878361.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.5204819277108435, |
|
"eval_loss": 0.43149346113204956, |
|
"eval_mean_token_accuracy": 0.898219308563482, |
|
"eval_num_tokens": 11878361.0, |
|
"eval_runtime": 69.6743, |
|
"eval_samples_per_second": 12.271, |
|
"eval_steps_per_second": 1.536, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.559036144578313, |
|
"grad_norm": 0.5115273312879999, |
|
"learning_rate": 2.3626952716199647e-06, |
|
"loss": 0.0792, |
|
"mean_token_accuracy": 0.9750167988240719, |
|
"num_tokens": 12009433.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 3.597590361445783, |
|
"grad_norm": 0.5172911491980401, |
|
"learning_rate": 2.247150918267008e-06, |
|
"loss": 0.0851, |
|
"mean_token_accuracy": 0.9730398207902908, |
|
"num_tokens": 12140505.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 3.636144578313253, |
|
"grad_norm": 0.5260093719963543, |
|
"learning_rate": 2.133678823412873e-06, |
|
"loss": 0.0797, |
|
"mean_token_accuracy": 0.9751236625015736, |
|
"num_tokens": 12271577.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 3.674698795180723, |
|
"grad_norm": 0.5267292864138245, |
|
"learning_rate": 2.022364405293703e-06, |
|
"loss": 0.0832, |
|
"mean_token_accuracy": 0.9738947302103043, |
|
"num_tokens": 12402649.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 3.7132530120481926, |
|
"grad_norm": 0.5065512725199254, |
|
"learning_rate": 1.913291457914234e-06, |
|
"loss": 0.0856, |
|
"mean_token_accuracy": 0.9732001163065434, |
|
"num_tokens": 12533721.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.7518072289156628, |
|
"grad_norm": 0.5465242770321679, |
|
"learning_rate": 1.8065420879702888e-06, |
|
"loss": 0.0838, |
|
"mean_token_accuracy": 0.9731762520968914, |
|
"num_tokens": 12663435.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 3.7903614457831325, |
|
"grad_norm": 0.7823063875533764, |
|
"learning_rate": 1.7021966530414303e-06, |
|
"loss": 0.0762, |
|
"mean_token_accuracy": 0.9758411757647991, |
|
"num_tokens": 12794507.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 3.8289156626506022, |
|
"grad_norm": 0.571380544699335, |
|
"learning_rate": 1.6003337011002928e-06, |
|
"loss": 0.084, |
|
"mean_token_accuracy": 0.9734901748597622, |
|
"num_tokens": 12925579.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.8674698795180724, |
|
"grad_norm": 0.5400258981871386, |
|
"learning_rate": 1.5010299113841397e-06, |
|
"loss": 0.0807, |
|
"mean_token_accuracy": 0.9752305261790752, |
|
"num_tokens": 13056651.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 3.906024096385542, |
|
"grad_norm": 0.5204832843446408, |
|
"learning_rate": 1.4043600366731213e-06, |
|
"loss": 0.0821, |
|
"mean_token_accuracy": 0.9745206460356712, |
|
"num_tokens": 13187723.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.906024096385542, |
|
"eval_loss": 0.43459072709083557, |
|
"eval_mean_token_accuracy": 0.8980461002510285, |
|
"eval_num_tokens": 13187723.0, |
|
"eval_runtime": 69.6812, |
|
"eval_samples_per_second": 12.27, |
|
"eval_steps_per_second": 1.536, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.944578313253012, |
|
"grad_norm": 0.5732935867678565, |
|
"learning_rate": 1.3103968470187384e-06, |
|
"loss": 0.0841, |
|
"mean_token_accuracy": 0.973306454718113, |
|
"num_tokens": 13317686.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 3.983132530120482, |
|
"grad_norm": 0.5049593156468802, |
|
"learning_rate": 1.2192110749648233e-06, |
|
"loss": 0.0783, |
|
"mean_token_accuracy": 0.9752342775464058, |
|
"num_tokens": 13447600.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 4.03855421686747, |
|
"grad_norm": 0.4900616503984239, |
|
"learning_rate": 1.1308713623022988e-06, |
|
"loss": 0.1075, |
|
"mean_token_accuracy": 0.9786272644996643, |
|
"num_tokens": 13611440.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 4.0771084337349395, |
|
"grad_norm": 0.4917129834327916, |
|
"learning_rate": 1.045444208397791e-06, |
|
"loss": 0.0676, |
|
"mean_token_accuracy": 0.9801687188446522, |
|
"num_tokens": 13740537.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 4.11566265060241, |
|
"grad_norm": 0.47200516762524886, |
|
"learning_rate": 9.629939201349852e-07, |
|
"loss": 0.0723, |
|
"mean_token_accuracy": 0.9782837741076946, |
|
"num_tokens": 13871609.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.15421686746988, |
|
"grad_norm": 0.44277012092487705, |
|
"learning_rate": 8.835825635064266e-07, |
|
"loss": 0.0729, |
|
"mean_token_accuracy": 0.9780853129923344, |
|
"num_tokens": 14002681.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 4.192771084337349, |
|
"grad_norm": 0.4753962832603972, |
|
"learning_rate": 8.072699168921827e-07, |
|
"loss": 0.0749, |
|
"mean_token_accuracy": 0.9778944849967957, |
|
"num_tokens": 14133753.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 4.231325301204819, |
|
"grad_norm": 0.48346978347475456, |
|
"learning_rate": 7.341134260605537e-07, |
|
"loss": 0.0692, |
|
"mean_token_accuracy": 0.9793745614588261, |
|
"num_tokens": 14264314.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 4.2698795180722895, |
|
"grad_norm": 0.4328206037632282, |
|
"learning_rate": 6.641681609246981e-07, |
|
"loss": 0.066, |
|
"mean_token_accuracy": 0.9801309891045094, |
|
"num_tokens": 14395386.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 4.308433734939759, |
|
"grad_norm": 0.46221534542018206, |
|
"learning_rate": 5.974867740877282e-07, |
|
"loss": 0.0696, |
|
"mean_token_accuracy": 0.9789478555321693, |
|
"num_tokens": 14526458.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.308433734939759, |
|
"eval_loss": 0.4595886468887329, |
|
"eval_mean_token_accuracy": 0.897223442514366, |
|
"eval_num_tokens": 14526458.0, |
|
"eval_runtime": 69.6441, |
|
"eval_samples_per_second": 12.277, |
|
"eval_steps_per_second": 1.536, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.346987951807229, |
|
"grad_norm": 0.4739286679144528, |
|
"learning_rate": 5.341194612074824e-07, |
|
"loss": 0.068, |
|
"mean_token_accuracy": 0.9796868488192558, |
|
"num_tokens": 14656421.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 4.385542168674699, |
|
"grad_norm": 0.43096986690967987, |
|
"learning_rate": 4.7411392321080606e-07, |
|
"loss": 0.0663, |
|
"mean_token_accuracy": 0.9802683852612972, |
|
"num_tokens": 14787493.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 4.424096385542168, |
|
"grad_norm": 0.46557922408208563, |
|
"learning_rate": 4.175153303857887e-07, |
|
"loss": 0.0654, |
|
"mean_token_accuracy": 0.9804821126163006, |
|
"num_tokens": 14918565.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 4.462650602409639, |
|
"grad_norm": 0.5546707256189516, |
|
"learning_rate": 3.643662883789878e-07, |
|
"loss": 0.0673, |
|
"mean_token_accuracy": 0.979527972638607, |
|
"num_tokens": 15049637.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 4.501204819277109, |
|
"grad_norm": 0.49021519394663, |
|
"learning_rate": 3.1470680612323503e-07, |
|
"loss": 0.07, |
|
"mean_token_accuracy": 0.9785585664212704, |
|
"num_tokens": 15180709.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.539759036144578, |
|
"grad_norm": 0.45571708386475684, |
|
"learning_rate": 2.685742657201601e-07, |
|
"loss": 0.0697, |
|
"mean_token_accuracy": 0.9785204008221626, |
|
"num_tokens": 15311781.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 4.578313253012048, |
|
"grad_norm": 0.5641008416839415, |
|
"learning_rate": 2.260033943001244e-07, |
|
"loss": 0.0663, |
|
"mean_token_accuracy": 0.9797416999936104, |
|
"num_tokens": 15442853.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 4.6168674698795185, |
|
"grad_norm": 0.5607141029792978, |
|
"learning_rate": 1.8702623788072028e-07, |
|
"loss": 0.0793, |
|
"mean_token_accuracy": 0.9755663834512234, |
|
"num_tokens": 15573925.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 4.655421686746988, |
|
"grad_norm": 0.46095439859311127, |
|
"learning_rate": 1.5167213724353426e-07, |
|
"loss": 0.0714, |
|
"mean_token_accuracy": 0.9779479168355465, |
|
"num_tokens": 15704997.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 4.693975903614458, |
|
"grad_norm": 0.464368810663561, |
|
"learning_rate": 1.199677058473292e-07, |
|
"loss": 0.066, |
|
"mean_token_accuracy": 0.980153888463974, |
|
"num_tokens": 15836069.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.693975903614458, |
|
"eval_loss": 0.46903374791145325, |
|
"eval_mean_token_accuracy": 0.8968599628065234, |
|
"eval_num_tokens": 15836069.0, |
|
"eval_runtime": 69.6558, |
|
"eval_samples_per_second": 12.275, |
|
"eval_steps_per_second": 1.536, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.732530120481927, |
|
"grad_norm": 0.5162077757262011, |
|
"learning_rate": 9.193680979426189e-08, |
|
"loss": 0.0775, |
|
"mean_token_accuracy": 0.9764594584703445, |
|
"num_tokens": 15967141.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 4.771084337349397, |
|
"grad_norm": 0.4482450270539155, |
|
"learning_rate": 6.760054986423459e-08, |
|
"loss": 0.0632, |
|
"mean_token_accuracy": 0.9808179698884487, |
|
"num_tokens": 16098213.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 4.809638554216868, |
|
"grad_norm": 0.4698597407866022, |
|
"learning_rate": 4.697724563088646e-08, |
|
"loss": 0.0681, |
|
"mean_token_accuracy": 0.9797111675143242, |
|
"num_tokens": 16229285.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 4.848192771084337, |
|
"grad_norm": 0.4662674319978425, |
|
"learning_rate": 3.0082421671192576e-08, |
|
"loss": 0.0688, |
|
"mean_token_accuracy": 0.97944400832057, |
|
"num_tokens": 16360357.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 4.886746987951807, |
|
"grad_norm": 0.46327536754981147, |
|
"learning_rate": 1.692879587904983e-08, |
|
"loss": 0.0662, |
|
"mean_token_accuracy": 0.9799401611089706, |
|
"num_tokens": 16491429.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.925301204819277, |
|
"grad_norm": 0.4688691090714117, |
|
"learning_rate": 7.526269891646176e-09, |
|
"loss": 0.0642, |
|
"mean_token_accuracy": 0.9807046689093113, |
|
"num_tokens": 16621960.0, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 4.9638554216867465, |
|
"grad_norm": 0.4516057398304381, |
|
"learning_rate": 1.8819216358156865e-09, |
|
"loss": 0.0688, |
|
"mean_token_accuracy": 0.9792744368314743, |
|
"num_tokens": 16752156.0, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 4.983132530120482, |
|
"mean_token_accuracy": 0.976367861032486, |
|
"num_tokens": 16817692.0, |
|
"step": 255, |
|
"total_flos": 24409842647040.0, |
|
"train_loss": 0.12274208276295194, |
|
"train_runtime": 3782.9235, |
|
"train_samples_per_second": 2.194, |
|
"train_steps_per_second": 0.067 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 255, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 1.0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 24409842647040.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|