|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.983132530120482, |
|
"eval_steps": 20, |
|
"global_step": 408, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03855421686746988, |
|
"grad_norm": 1.3406128184465234, |
|
"learning_rate": 9.999851776425575e-06, |
|
"loss": 0.2215, |
|
"mean_token_accuracy": 0.9319889023900032, |
|
"num_tokens": 131072.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.07710843373493977, |
|
"grad_norm": 1.0868872715450462, |
|
"learning_rate": 9.998666040558187e-06, |
|
"loss": 0.2017, |
|
"mean_token_accuracy": 0.9355688355863094, |
|
"num_tokens": 262144.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.11566265060240964, |
|
"grad_norm": 0.8046336700273372, |
|
"learning_rate": 9.996294850025658e-06, |
|
"loss": 0.1993, |
|
"mean_token_accuracy": 0.9353551082313061, |
|
"num_tokens": 393216.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.15421686746987953, |
|
"grad_norm": 0.6584361140812891, |
|
"learning_rate": 9.992738767165791e-06, |
|
"loss": 0.1778, |
|
"mean_token_accuracy": 0.9412478767335415, |
|
"num_tokens": 524288.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1927710843373494, |
|
"grad_norm": 0.6774335920541141, |
|
"learning_rate": 9.987998635318586e-06, |
|
"loss": 0.1902, |
|
"mean_token_accuracy": 0.9374810568988323, |
|
"num_tokens": 654484.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.23132530120481928, |
|
"grad_norm": 0.6105295086669309, |
|
"learning_rate": 9.982075578626235e-06, |
|
"loss": 0.1966, |
|
"mean_token_accuracy": 0.9350955821573734, |
|
"num_tokens": 785556.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.26987951807228916, |
|
"grad_norm": 0.6597942712481647, |
|
"learning_rate": 9.974971001766534e-06, |
|
"loss": 0.1967, |
|
"mean_token_accuracy": 0.9350586608052254, |
|
"num_tokens": 915519.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.30843373493975906, |
|
"grad_norm": 0.6435049991004593, |
|
"learning_rate": 9.96668658961975e-06, |
|
"loss": 0.1941, |
|
"mean_token_accuracy": 0.9362405501306057, |
|
"num_tokens": 1046591.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.3469879518072289, |
|
"grad_norm": 0.5815050330318928, |
|
"learning_rate": 9.957224306869053e-06, |
|
"loss": 0.1978, |
|
"mean_token_accuracy": 0.9344849325716496, |
|
"num_tokens": 1177663.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.3855421686746988, |
|
"grad_norm": 0.6062885715121378, |
|
"learning_rate": 9.946586397534572e-06, |
|
"loss": 0.1901, |
|
"mean_token_accuracy": 0.9375763460993767, |
|
"num_tokens": 1308735.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3855421686746988, |
|
"eval_loss": 0.3250451982021332, |
|
"eval_mean_token_accuracy": 0.9015540636588479, |
|
"eval_num_tokens": 1308735.0, |
|
"eval_runtime": 32.894, |
|
"eval_samples_per_second": 25.993, |
|
"eval_steps_per_second": 3.253, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.42409638554216866, |
|
"grad_norm": 0.6159276714953087, |
|
"learning_rate": 9.93477538444123e-06, |
|
"loss": 0.1792, |
|
"mean_token_accuracy": 0.9396754540503025, |
|
"num_tokens": 1439807.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.46265060240963857, |
|
"grad_norm": 0.5991422890793899, |
|
"learning_rate": 9.92179406862043e-06, |
|
"loss": 0.1898, |
|
"mean_token_accuracy": 0.9368689768016338, |
|
"num_tokens": 1570062.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.5012048192771085, |
|
"grad_norm": 0.6010392974067564, |
|
"learning_rate": 9.907645528645791e-06, |
|
"loss": 0.1823, |
|
"mean_token_accuracy": 0.9397899508476257, |
|
"num_tokens": 1701134.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.5397590361445783, |
|
"grad_norm": 0.6463308628240576, |
|
"learning_rate": 9.892333119903045e-06, |
|
"loss": 0.1801, |
|
"mean_token_accuracy": 0.9400751106441021, |
|
"num_tokens": 1832133.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.5783132530120482, |
|
"grad_norm": 0.5855780926666729, |
|
"learning_rate": 9.875860473794302e-06, |
|
"loss": 0.1887, |
|
"mean_token_accuracy": 0.9366145730018616, |
|
"num_tokens": 1963205.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.6168674698795181, |
|
"grad_norm": 0.5767450209509862, |
|
"learning_rate": 9.85823149687683e-06, |
|
"loss": 0.1806, |
|
"mean_token_accuracy": 0.9403013698756695, |
|
"num_tokens": 2094277.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.655421686746988, |
|
"grad_norm": 0.6078338694602209, |
|
"learning_rate": 9.839450369936615e-06, |
|
"loss": 0.1804, |
|
"mean_token_accuracy": 0.9400189444422722, |
|
"num_tokens": 2225349.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.6939759036144578, |
|
"grad_norm": 0.5890734557721334, |
|
"learning_rate": 9.819521546996864e-06, |
|
"loss": 0.1824, |
|
"mean_token_accuracy": 0.9395788721740246, |
|
"num_tokens": 2355263.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.7325301204819277, |
|
"grad_norm": 31.323491593405347, |
|
"learning_rate": 9.798449754261716e-06, |
|
"loss": 0.2134, |
|
"mean_token_accuracy": 0.9329964704811573, |
|
"num_tokens": 2486335.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.7710843373493976, |
|
"grad_norm": 0.668421005752788, |
|
"learning_rate": 9.776239988995401e-06, |
|
"loss": 0.1756, |
|
"mean_token_accuracy": 0.9409272857010365, |
|
"num_tokens": 2617407.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7710843373493976, |
|
"eval_loss": 0.323132187128067, |
|
"eval_mean_token_accuracy": 0.9019459669835099, |
|
"eval_num_tokens": 2617407.0, |
|
"eval_runtime": 32.8659, |
|
"eval_samples_per_second": 26.015, |
|
"eval_steps_per_second": 3.256, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.8096385542168675, |
|
"grad_norm": 0.6550725405812086, |
|
"learning_rate": 9.752897518337117e-06, |
|
"loss": 0.176, |
|
"mean_token_accuracy": 0.9411181136965752, |
|
"num_tokens": 2748479.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.8481927710843373, |
|
"grad_norm": 0.6735202725511205, |
|
"learning_rate": 9.72842787805191e-06, |
|
"loss": 0.1854, |
|
"mean_token_accuracy": 0.9372099563479424, |
|
"num_tokens": 2879551.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.8867469879518072, |
|
"grad_norm": 0.6034625832801067, |
|
"learning_rate": 9.702836871217838e-06, |
|
"loss": 0.1783, |
|
"mean_token_accuracy": 0.9399880617856979, |
|
"num_tokens": 3010185.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.9253012048192771, |
|
"grad_norm": 0.5972724356437912, |
|
"learning_rate": 9.676130566849757e-06, |
|
"loss": 0.1878, |
|
"mean_token_accuracy": 0.936996228992939, |
|
"num_tokens": 3141257.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.6007518988428301, |
|
"learning_rate": 9.64831529846001e-06, |
|
"loss": 0.1824, |
|
"mean_token_accuracy": 0.9391285218298435, |
|
"num_tokens": 3271788.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.0192771084337349, |
|
"grad_norm": 0.9787825024544956, |
|
"learning_rate": 9.619397662556434e-06, |
|
"loss": 0.2482, |
|
"mean_token_accuracy": 0.9454934179782868, |
|
"num_tokens": 3435628.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.0578313253012048, |
|
"grad_norm": 0.7523664235798198, |
|
"learning_rate": 9.589384517077945e-06, |
|
"loss": 0.1415, |
|
"mean_token_accuracy": 0.9537432938814163, |
|
"num_tokens": 3566700.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.0963855421686748, |
|
"grad_norm": 0.6132260326219375, |
|
"learning_rate": 9.558282979768164e-06, |
|
"loss": 0.1356, |
|
"mean_token_accuracy": 0.9553309828042984, |
|
"num_tokens": 3697772.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.1349397590361445, |
|
"grad_norm": 0.659449619068034, |
|
"learning_rate": 9.52610042648741e-06, |
|
"loss": 0.14, |
|
"mean_token_accuracy": 0.9534532353281975, |
|
"num_tokens": 3828844.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.1734939759036145, |
|
"grad_norm": 0.798456158560065, |
|
"learning_rate": 9.492844489463486e-06, |
|
"loss": 0.1402, |
|
"mean_token_accuracy": 0.9540104530751705, |
|
"num_tokens": 3959916.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.1734939759036145, |
|
"eval_loss": 0.3658570647239685, |
|
"eval_mean_token_accuracy": 0.9004586478260076, |
|
"eval_num_tokens": 3959916.0, |
|
"eval_runtime": 32.9568, |
|
"eval_samples_per_second": 25.943, |
|
"eval_steps_per_second": 3.247, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.2120481927710842, |
|
"grad_norm": 0.7106569795117357, |
|
"learning_rate": 9.458523055481658e-06, |
|
"loss": 0.1328, |
|
"mean_token_accuracy": 0.9561785310506821, |
|
"num_tokens": 4089879.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 1.2506024096385542, |
|
"grad_norm": 0.622759262640549, |
|
"learning_rate": 9.423144264014278e-06, |
|
"loss": 0.1269, |
|
"mean_token_accuracy": 0.9574377238750458, |
|
"num_tokens": 4220951.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 1.2891566265060241, |
|
"grad_norm": 0.6303844538287657, |
|
"learning_rate": 9.386716505290467e-06, |
|
"loss": 0.1413, |
|
"mean_token_accuracy": 0.9541173167526722, |
|
"num_tokens": 4352023.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 1.3277108433734939, |
|
"grad_norm": 0.6507157381170863, |
|
"learning_rate": 9.349248418306347e-06, |
|
"loss": 0.1455, |
|
"mean_token_accuracy": 0.9517892152070999, |
|
"num_tokens": 4483095.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 1.3662650602409638, |
|
"grad_norm": 0.602439194952957, |
|
"learning_rate": 9.310748888776254e-06, |
|
"loss": 0.1309, |
|
"mean_token_accuracy": 0.9569797366857529, |
|
"num_tokens": 4614167.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.4048192771084338, |
|
"grad_norm": 0.603487797008034, |
|
"learning_rate": 9.271227047025462e-06, |
|
"loss": 0.1333, |
|
"mean_token_accuracy": 0.9561706259846687, |
|
"num_tokens": 4745239.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 1.4433734939759035, |
|
"grad_norm": 0.6146915599817614, |
|
"learning_rate": 9.230692265824888e-06, |
|
"loss": 0.1333, |
|
"mean_token_accuracy": 0.9552088528871536, |
|
"num_tokens": 4876311.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 1.4819277108433735, |
|
"grad_norm": 0.6588127818916016, |
|
"learning_rate": 9.189154158168293e-06, |
|
"loss": 0.1308, |
|
"mean_token_accuracy": 0.9567278437316418, |
|
"num_tokens": 5007383.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 1.5204819277108435, |
|
"grad_norm": 0.6120032825642568, |
|
"learning_rate": 9.146622574992528e-06, |
|
"loss": 0.1461, |
|
"mean_token_accuracy": 0.9520487412810326, |
|
"num_tokens": 5138455.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 1.5590361445783132, |
|
"grad_norm": 0.6485104767700068, |
|
"learning_rate": 9.103107602841341e-06, |
|
"loss": 0.1384, |
|
"mean_token_accuracy": 0.9545066058635712, |
|
"num_tokens": 5269527.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.5590361445783132, |
|
"eval_loss": 0.3478308916091919, |
|
"eval_mean_token_accuracy": 0.9005198361717652, |
|
"eval_num_tokens": 5269527.0, |
|
"eval_runtime": 32.9211, |
|
"eval_samples_per_second": 25.971, |
|
"eval_steps_per_second": 3.25, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.5975903614457831, |
|
"grad_norm": 0.6036456378189476, |
|
"learning_rate": 9.058619561473308e-06, |
|
"loss": 0.1351, |
|
"mean_token_accuracy": 0.9550928063690662, |
|
"num_tokens": 5400161.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 1.636144578313253, |
|
"grad_norm": 0.5662228136563036, |
|
"learning_rate": 9.013169001414458e-06, |
|
"loss": 0.1389, |
|
"mean_token_accuracy": 0.9542012810707092, |
|
"num_tokens": 5531233.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 1.6746987951807228, |
|
"grad_norm": 0.668176616317804, |
|
"learning_rate": 8.966766701456177e-06, |
|
"loss": 0.1472, |
|
"mean_token_accuracy": 0.9509572051465511, |
|
"num_tokens": 5662305.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 1.7132530120481928, |
|
"grad_norm": 0.6659519140143432, |
|
"learning_rate": 8.91942366609897e-06, |
|
"loss": 0.1278, |
|
"mean_token_accuracy": 0.9569377563893795, |
|
"num_tokens": 5792219.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 1.7518072289156628, |
|
"grad_norm": 0.5730839087389755, |
|
"learning_rate": 8.871151122942692e-06, |
|
"loss": 0.1383, |
|
"mean_token_accuracy": 0.9540486186742783, |
|
"num_tokens": 5923291.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.7903614457831325, |
|
"grad_norm": 0.7339967333799423, |
|
"learning_rate": 8.821960520023884e-06, |
|
"loss": 0.1473, |
|
"mean_token_accuracy": 0.9519094601273537, |
|
"num_tokens": 6053822.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.8289156626506025, |
|
"grad_norm": 0.6406836390946679, |
|
"learning_rate": 8.771863523100821e-06, |
|
"loss": 0.1412, |
|
"mean_token_accuracy": 0.9523235335946083, |
|
"num_tokens": 6184894.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.8674698795180724, |
|
"grad_norm": 0.5600258442949189, |
|
"learning_rate": 8.720872012886918e-06, |
|
"loss": 0.1477, |
|
"mean_token_accuracy": 0.9504381828010082, |
|
"num_tokens": 6315149.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.9060240963855422, |
|
"grad_norm": 0.5933525309833521, |
|
"learning_rate": 8.668998082233186e-06, |
|
"loss": 0.1356, |
|
"mean_token_accuracy": 0.9551424346864223, |
|
"num_tokens": 6445345.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.944578313253012, |
|
"grad_norm": 0.6535174387269015, |
|
"learning_rate": 8.616254033260351e-06, |
|
"loss": 0.1365, |
|
"mean_token_accuracy": 0.9537569470703602, |
|
"num_tokens": 6576344.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.944578313253012, |
|
"eval_loss": 0.34725868701934814, |
|
"eval_mean_token_accuracy": 0.9006962408529264, |
|
"eval_num_tokens": 6576344.0, |
|
"eval_runtime": 32.8904, |
|
"eval_samples_per_second": 25.995, |
|
"eval_steps_per_second": 3.253, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.983132530120482, |
|
"grad_norm": 0.660872027341127, |
|
"learning_rate": 8.56265237444135e-06, |
|
"loss": 0.1433, |
|
"mean_token_accuracy": 0.9518960788846016, |
|
"num_tokens": 6707416.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.0385542168674697, |
|
"grad_norm": 0.6405367004816473, |
|
"learning_rate": 8.508205817634908e-06, |
|
"loss": 0.1641, |
|
"mean_token_accuracy": 0.9666707456111908, |
|
"num_tokens": 6871256.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 2.07710843373494, |
|
"grad_norm": 0.6396095656248479, |
|
"learning_rate": 8.452927275070858e-06, |
|
"loss": 0.1026, |
|
"mean_token_accuracy": 0.968053013086319, |
|
"num_tokens": 7002255.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 2.1156626506024097, |
|
"grad_norm": 0.6728841385013591, |
|
"learning_rate": 8.39682985628795e-06, |
|
"loss": 0.1013, |
|
"mean_token_accuracy": 0.9683454521000385, |
|
"num_tokens": 7133327.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 2.1542168674698794, |
|
"grad_norm": 0.6356701289090003, |
|
"learning_rate": 8.339926865024871e-06, |
|
"loss": 0.1015, |
|
"mean_token_accuracy": 0.9669992625713348, |
|
"num_tokens": 7263523.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.1927710843373496, |
|
"grad_norm": 0.6909824113954701, |
|
"learning_rate": 8.282231796065215e-06, |
|
"loss": 0.0982, |
|
"mean_token_accuracy": 0.9683454521000385, |
|
"num_tokens": 7394595.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 2.2313253012048193, |
|
"grad_norm": 0.6949675615015238, |
|
"learning_rate": 8.223758332037121e-06, |
|
"loss": 0.0971, |
|
"mean_token_accuracy": 0.9685210138559341, |
|
"num_tokens": 7525667.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 2.269879518072289, |
|
"grad_norm": 0.5752622256768609, |
|
"learning_rate": 8.164520340168404e-06, |
|
"loss": 0.1046, |
|
"mean_token_accuracy": 0.9659944511950016, |
|
"num_tokens": 7656739.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 2.3084337349397592, |
|
"grad_norm": 0.6456834319746314, |
|
"learning_rate": 8.104531868997858e-06, |
|
"loss": 0.1, |
|
"mean_token_accuracy": 0.9683759845793247, |
|
"num_tokens": 7787811.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 2.346987951807229, |
|
"grad_norm": 0.6268887677724969, |
|
"learning_rate": 8.043807145043604e-06, |
|
"loss": 0.1089, |
|
"mean_token_accuracy": 0.9643549546599388, |
|
"num_tokens": 7916616.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.346987951807229, |
|
"eval_loss": 0.3908001780509949, |
|
"eval_mean_token_accuracy": 0.8989515037180107, |
|
"eval_num_tokens": 7916616.0, |
|
"eval_runtime": 32.9535, |
|
"eval_samples_per_second": 25.946, |
|
"eval_steps_per_second": 3.247, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.3855421686746987, |
|
"grad_norm": 0.594448184955674, |
|
"learning_rate": 7.982360569429206e-06, |
|
"loss": 0.0919, |
|
"mean_token_accuracy": 0.9701926670968533, |
|
"num_tokens": 8047688.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 2.4240963855421684, |
|
"grad_norm": 0.6509482507816816, |
|
"learning_rate": 7.920206714468383e-06, |
|
"loss": 0.1, |
|
"mean_token_accuracy": 0.9679943285882473, |
|
"num_tokens": 8178760.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 2.4626506024096386, |
|
"grad_norm": 0.6114017480150069, |
|
"learning_rate": 7.857360320209126e-06, |
|
"loss": 0.1079, |
|
"mean_token_accuracy": 0.9651777073740959, |
|
"num_tokens": 8309832.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 2.5012048192771084, |
|
"grad_norm": 0.6679792242974475, |
|
"learning_rate": 7.793836290938026e-06, |
|
"loss": 0.0967, |
|
"mean_token_accuracy": 0.9691850952804089, |
|
"num_tokens": 8440904.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 2.539759036144578, |
|
"grad_norm": 0.5886521368506863, |
|
"learning_rate": 7.729649691645673e-06, |
|
"loss": 0.0991, |
|
"mean_token_accuracy": 0.9681088253855705, |
|
"num_tokens": 8571976.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.5783132530120483, |
|
"grad_norm": 0.6059968735763466, |
|
"learning_rate": 7.664815744453918e-06, |
|
"loss": 0.1017, |
|
"mean_token_accuracy": 0.9668340943753719, |
|
"num_tokens": 8703048.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 2.616867469879518, |
|
"grad_norm": 0.63703209859714, |
|
"learning_rate": 7.599349825005892e-06, |
|
"loss": 0.1013, |
|
"mean_token_accuracy": 0.9671317860484123, |
|
"num_tokens": 8834120.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 2.6554216867469878, |
|
"grad_norm": 0.6404376584659325, |
|
"learning_rate": 7.533267458819597e-06, |
|
"loss": 0.1081, |
|
"mean_token_accuracy": 0.9647228345274925, |
|
"num_tokens": 8964375.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 2.693975903614458, |
|
"grad_norm": 0.5731515889981167, |
|
"learning_rate": 7.466584317605978e-06, |
|
"loss": 0.0988, |
|
"mean_token_accuracy": 0.9682356528937817, |
|
"num_tokens": 9094906.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 2.7325301204819277, |
|
"grad_norm": 0.5848185748214219, |
|
"learning_rate": 7.399316215552296e-06, |
|
"loss": 0.1016, |
|
"mean_token_accuracy": 0.9662845097482204, |
|
"num_tokens": 9225978.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.7325301204819277, |
|
"eval_loss": 0.3856795132160187, |
|
"eval_mean_token_accuracy": 0.8990027659407286, |
|
"eval_num_tokens": 9225978.0, |
|
"eval_runtime": 32.897, |
|
"eval_samples_per_second": 25.99, |
|
"eval_steps_per_second": 3.253, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.7710843373493974, |
|
"grad_norm": 0.6029958264244917, |
|
"learning_rate": 7.33147910557174e-06, |
|
"loss": 0.0958, |
|
"mean_token_accuracy": 0.9687576405704021, |
|
"num_tokens": 9357050.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 2.8096385542168676, |
|
"grad_norm": 0.6783830228161134, |
|
"learning_rate": 7.26308907552012e-06, |
|
"loss": 0.1078, |
|
"mean_token_accuracy": 0.9650097787380219, |
|
"num_tokens": 9488122.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 2.8481927710843373, |
|
"grad_norm": 0.5827966276168525, |
|
"learning_rate": 7.194162344380561e-06, |
|
"loss": 0.1031, |
|
"mean_token_accuracy": 0.9667501300573349, |
|
"num_tokens": 9619194.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 2.886746987951807, |
|
"grad_norm": 0.6450599697404067, |
|
"learning_rate": 7.124715258417111e-06, |
|
"loss": 0.1087, |
|
"mean_token_accuracy": 0.9652311392128468, |
|
"num_tokens": 9750266.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 2.9253012048192772, |
|
"grad_norm": 0.5260422478675805, |
|
"learning_rate": 7.05476428729815e-06, |
|
"loss": 0.0976, |
|
"mean_token_accuracy": 0.9683836176991463, |
|
"num_tokens": 9881338.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.963855421686747, |
|
"grad_norm": 0.6011167506104717, |
|
"learning_rate": 6.984326020190544e-06, |
|
"loss": 0.0996, |
|
"mean_token_accuracy": 0.9671653471887112, |
|
"num_tokens": 10011972.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 3.019277108433735, |
|
"grad_norm": 1.067138524628119, |
|
"learning_rate": 6.913417161825449e-06, |
|
"loss": 0.1396, |
|
"mean_token_accuracy": 0.9718551605939865, |
|
"num_tokens": 10175812.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 3.057831325301205, |
|
"grad_norm": 0.6433380773043862, |
|
"learning_rate": 6.842054528536717e-06, |
|
"loss": 0.0748, |
|
"mean_token_accuracy": 0.9779708161950111, |
|
"num_tokens": 10306884.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 3.0963855421686746, |
|
"grad_norm": 0.5518681598947663, |
|
"learning_rate": 6.770255044272826e-06, |
|
"loss": 0.0736, |
|
"mean_token_accuracy": 0.9776161313056946, |
|
"num_tokens": 10437883.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 3.1349397590361447, |
|
"grad_norm": 0.5716170366461122, |
|
"learning_rate": 6.698035736583307e-06, |
|
"loss": 0.0704, |
|
"mean_token_accuracy": 0.9789860211312771, |
|
"num_tokens": 10568955.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.1349397590361447, |
|
"eval_loss": 0.4618055820465088, |
|
"eval_mean_token_accuracy": 0.8969022353118825, |
|
"eval_num_tokens": 10568955.0, |
|
"eval_runtime": 32.9015, |
|
"eval_samples_per_second": 25.987, |
|
"eval_steps_per_second": 3.252, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.1734939759036145, |
|
"grad_norm": 0.5660248629147534, |
|
"learning_rate": 6.625413732580577e-06, |
|
"loss": 0.0718, |
|
"mean_token_accuracy": 0.9778792187571526, |
|
"num_tokens": 10700027.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 3.212048192771084, |
|
"grad_norm": 0.6184270631568148, |
|
"learning_rate": 6.552406254878175e-06, |
|
"loss": 0.0708, |
|
"mean_token_accuracy": 0.9779784493148327, |
|
"num_tokens": 10831099.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 3.2506024096385544, |
|
"grad_norm": 0.7235816673040518, |
|
"learning_rate": 6.4790306175063535e-06, |
|
"loss": 0.0752, |
|
"mean_token_accuracy": 0.9767876826226711, |
|
"num_tokens": 10962171.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 3.289156626506024, |
|
"grad_norm": 0.6381234994007693, |
|
"learning_rate": 6.405304221805972e-06, |
|
"loss": 0.0713, |
|
"mean_token_accuracy": 0.978339895606041, |
|
"num_tokens": 11092367.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 3.327710843373494, |
|
"grad_norm": 0.625884194442065, |
|
"learning_rate": 6.331244552301705e-06, |
|
"loss": 0.0675, |
|
"mean_token_accuracy": 0.9787493944168091, |
|
"num_tokens": 11223439.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.3662650602409636, |
|
"grad_norm": 0.6347808690260591, |
|
"learning_rate": 6.2568691725555144e-06, |
|
"loss": 0.074, |
|
"mean_token_accuracy": 0.9773525334894657, |
|
"num_tokens": 11354511.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 3.404819277108434, |
|
"grad_norm": 0.5756416236482357, |
|
"learning_rate": 6.182195721001366e-06, |
|
"loss": 0.0804, |
|
"mean_token_accuracy": 0.9753831885755062, |
|
"num_tokens": 11485583.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 3.4433734939759035, |
|
"grad_norm": 0.5275947037323039, |
|
"learning_rate": 6.107241906762214e-06, |
|
"loss": 0.069, |
|
"mean_token_accuracy": 0.9786049015820026, |
|
"num_tokens": 11616217.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 3.4819277108433733, |
|
"grad_norm": 0.5665166566187169, |
|
"learning_rate": 6.0320255054501985e-06, |
|
"loss": 0.0677, |
|
"mean_token_accuracy": 0.9791157841682434, |
|
"num_tokens": 11747289.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 3.5204819277108435, |
|
"grad_norm": 0.6415976004443386, |
|
"learning_rate": 5.956564354951091e-06, |
|
"loss": 0.0704, |
|
"mean_token_accuracy": 0.9784059040248394, |
|
"num_tokens": 11878361.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.5204819277108435, |
|
"eval_loss": 0.45516592264175415, |
|
"eval_mean_token_accuracy": 0.8970169062926391, |
|
"eval_num_tokens": 11878361.0, |
|
"eval_runtime": 32.9299, |
|
"eval_samples_per_second": 25.964, |
|
"eval_steps_per_second": 3.249, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.559036144578313, |
|
"grad_norm": 0.5965252456073897, |
|
"learning_rate": 5.880876351193956e-06, |
|
"loss": 0.0719, |
|
"mean_token_accuracy": 0.9771998710930347, |
|
"num_tokens": 12009433.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 3.597590361445783, |
|
"grad_norm": 0.5761543620705893, |
|
"learning_rate": 5.804979443907065e-06, |
|
"loss": 0.0768, |
|
"mean_token_accuracy": 0.9757114127278328, |
|
"num_tokens": 12140505.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 3.636144578313253, |
|
"grad_norm": 0.6381827427058839, |
|
"learning_rate": 5.728891632361043e-06, |
|
"loss": 0.072, |
|
"mean_token_accuracy": 0.9778639525175095, |
|
"num_tokens": 12271577.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 3.674698795180723, |
|
"grad_norm": 0.5760484373146149, |
|
"learning_rate": 5.65263096110026e-06, |
|
"loss": 0.0737, |
|
"mean_token_accuracy": 0.9771769717335701, |
|
"num_tokens": 12402649.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 3.7132530120481926, |
|
"grad_norm": 0.6007872958623798, |
|
"learning_rate": 5.576215515663489e-06, |
|
"loss": 0.0771, |
|
"mean_token_accuracy": 0.9759022407233715, |
|
"num_tokens": 12533721.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.7518072289156628, |
|
"grad_norm": 0.5865088698200008, |
|
"learning_rate": 5.499663418294858e-06, |
|
"loss": 0.0765, |
|
"mean_token_accuracy": 0.9760418757796288, |
|
"num_tokens": 12663435.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 3.7903614457831325, |
|
"grad_norm": 0.5869508573232244, |
|
"learning_rate": 5.4229928236460705e-06, |
|
"loss": 0.0688, |
|
"mean_token_accuracy": 0.9785509333014488, |
|
"num_tokens": 12794507.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 3.8289156626506022, |
|
"grad_norm": 0.6151614870939663, |
|
"learning_rate": 5.346221914470959e-06, |
|
"loss": 0.076, |
|
"mean_token_accuracy": 0.9760396368801594, |
|
"num_tokens": 12925579.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 3.8674698795180724, |
|
"grad_norm": 0.5387118456522078, |
|
"learning_rate": 5.2693688973133675e-06, |
|
"loss": 0.0705, |
|
"mean_token_accuracy": 0.9784669689834118, |
|
"num_tokens": 13056651.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 3.906024096385542, |
|
"grad_norm": 0.5994849742352796, |
|
"learning_rate": 5.192451998189392e-06, |
|
"loss": 0.0756, |
|
"mean_token_accuracy": 0.9767113514244556, |
|
"num_tokens": 13187723.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.906024096385542, |
|
"eval_loss": 0.4474319517612457, |
|
"eval_mean_token_accuracy": 0.8968760477048214, |
|
"eval_num_tokens": 13187723.0, |
|
"eval_runtime": 32.9071, |
|
"eval_samples_per_second": 25.982, |
|
"eval_steps_per_second": 3.252, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.944578313253012, |
|
"grad_norm": 0.5535712673806997, |
|
"learning_rate": 5.115489458265006e-06, |
|
"loss": 0.0742, |
|
"mean_token_accuracy": 0.976105697453022, |
|
"num_tokens": 13317686.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 3.983132530120482, |
|
"grad_norm": 0.5578205642046994, |
|
"learning_rate": 5.038499529530094e-06, |
|
"loss": 0.0706, |
|
"mean_token_accuracy": 0.9777835607528687, |
|
"num_tokens": 13447600.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 4.03855421686747, |
|
"grad_norm": 0.5165401701064283, |
|
"learning_rate": 4.961500470469908e-06, |
|
"loss": 0.0864, |
|
"mean_token_accuracy": 0.9837689340114594, |
|
"num_tokens": 13611440.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 4.0771084337349395, |
|
"grad_norm": 0.5606600779130405, |
|
"learning_rate": 4.8845105417349955e-06, |
|
"loss": 0.0514, |
|
"mean_token_accuracy": 0.9854289144277573, |
|
"num_tokens": 13740537.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 4.11566265060241, |
|
"grad_norm": 0.5032380365598611, |
|
"learning_rate": 4.807548001810611e-06, |
|
"loss": 0.054, |
|
"mean_token_accuracy": 0.984481867402792, |
|
"num_tokens": 13871609.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.15421686746988, |
|
"grad_norm": 0.5627408303007229, |
|
"learning_rate": 4.730631102686635e-06, |
|
"loss": 0.0551, |
|
"mean_token_accuracy": 0.9837490878999233, |
|
"num_tokens": 14002681.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 4.192771084337349, |
|
"grad_norm": 0.6584478813758422, |
|
"learning_rate": 4.653778085529043e-06, |
|
"loss": 0.0573, |
|
"mean_token_accuracy": 0.9833140000700951, |
|
"num_tokens": 14133753.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 4.231325301204819, |
|
"grad_norm": 0.6355929893733159, |
|
"learning_rate": 4.577007176353931e-06, |
|
"loss": 0.053, |
|
"mean_token_accuracy": 0.9844788759946823, |
|
"num_tokens": 14264314.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 4.2698795180722895, |
|
"grad_norm": 0.5944238331457758, |
|
"learning_rate": 4.5003365817051434e-06, |
|
"loss": 0.0495, |
|
"mean_token_accuracy": 0.9854665398597717, |
|
"num_tokens": 14395386.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 4.308433734939759, |
|
"grad_norm": 0.5486764850427266, |
|
"learning_rate": 4.4237844843365126e-06, |
|
"loss": 0.0548, |
|
"mean_token_accuracy": 0.9835506267845631, |
|
"num_tokens": 14526458.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.308433734939759, |
|
"eval_loss": 0.5238527059555054, |
|
"eval_mean_token_accuracy": 0.895290755222891, |
|
"eval_num_tokens": 14526458.0, |
|
"eval_runtime": 32.8995, |
|
"eval_samples_per_second": 25.988, |
|
"eval_steps_per_second": 3.252, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.346987951807229, |
|
"grad_norm": 0.5193566382330146, |
|
"learning_rate": 4.347369038899744e-06, |
|
"loss": 0.0512, |
|
"mean_token_accuracy": 0.9847192876040936, |
|
"num_tokens": 14656421.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 4.385542168674699, |
|
"grad_norm": 0.6081963974814661, |
|
"learning_rate": 4.271108367638959e-06, |
|
"loss": 0.0512, |
|
"mean_token_accuracy": 0.9849474877119064, |
|
"num_tokens": 14787493.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 4.424096385542168, |
|
"grad_norm": 0.5352653032319554, |
|
"learning_rate": 4.195020556092935e-06, |
|
"loss": 0.0508, |
|
"mean_token_accuracy": 0.984955120831728, |
|
"num_tokens": 14918565.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 4.462650602409639, |
|
"grad_norm": 0.5471233273653687, |
|
"learning_rate": 4.119123648806046e-06, |
|
"loss": 0.0524, |
|
"mean_token_accuracy": 0.9842299744486809, |
|
"num_tokens": 15049637.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 4.501204819277109, |
|
"grad_norm": 0.5761419852713972, |
|
"learning_rate": 4.043435645048911e-06, |
|
"loss": 0.0538, |
|
"mean_token_accuracy": 0.9835735261440277, |
|
"num_tokens": 15180709.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.539759036144578, |
|
"grad_norm": 0.5193378395323519, |
|
"learning_rate": 3.967974494549803e-06, |
|
"loss": 0.0527, |
|
"mean_token_accuracy": 0.9835735261440277, |
|
"num_tokens": 15311781.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 4.578313253012048, |
|
"grad_norm": 0.6838502479167475, |
|
"learning_rate": 3.892758093237788e-06, |
|
"loss": 0.0505, |
|
"mean_token_accuracy": 0.9845734648406506, |
|
"num_tokens": 15442853.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 4.6168674698795185, |
|
"grad_norm": 0.6161795707876571, |
|
"learning_rate": 3.8178042789986355e-06, |
|
"loss": 0.0597, |
|
"mean_token_accuracy": 0.9818713404238224, |
|
"num_tokens": 15573925.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 4.655421686746988, |
|
"grad_norm": 0.5673557334484985, |
|
"learning_rate": 3.743130827444487e-06, |
|
"loss": 0.0551, |
|
"mean_token_accuracy": 0.9827491492033005, |
|
"num_tokens": 15704997.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 4.693975903614458, |
|
"grad_norm": 0.605102586233592, |
|
"learning_rate": 3.6687554476982954e-06, |
|
"loss": 0.0516, |
|
"mean_token_accuracy": 0.9845963642001152, |
|
"num_tokens": 15836069.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.693975903614458, |
|
"eval_loss": 0.5330836176872253, |
|
"eval_mean_token_accuracy": 0.894900638366414, |
|
"eval_num_tokens": 15836069.0, |
|
"eval_runtime": 32.8994, |
|
"eval_samples_per_second": 25.988, |
|
"eval_steps_per_second": 3.252, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.732530120481927, |
|
"grad_norm": 0.6102630121983813, |
|
"learning_rate": 3.5946957781940296e-06, |
|
"loss": 0.0592, |
|
"mean_token_accuracy": 0.9820545352995396, |
|
"num_tokens": 15967141.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 4.771084337349397, |
|
"grad_norm": 0.5601462915097579, |
|
"learning_rate": 3.5209693824936486e-06, |
|
"loss": 0.049, |
|
"mean_token_accuracy": 0.9852528125047684, |
|
"num_tokens": 16098213.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 4.809638554216868, |
|
"grad_norm": 0.5729256978696017, |
|
"learning_rate": 3.4475937451218257e-06, |
|
"loss": 0.0534, |
|
"mean_token_accuracy": 0.9839246496558189, |
|
"num_tokens": 16229285.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 4.848192771084337, |
|
"grad_norm": 0.5576976700355626, |
|
"learning_rate": 3.3745862674194246e-06, |
|
"loss": 0.053, |
|
"mean_token_accuracy": 0.984092578291893, |
|
"num_tokens": 16360357.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 4.886746987951807, |
|
"grad_norm": 0.568301505042255, |
|
"learning_rate": 3.301964263416693e-06, |
|
"loss": 0.0501, |
|
"mean_token_accuracy": 0.984886422753334, |
|
"num_tokens": 16491429.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.925301204819277, |
|
"grad_norm": 0.506093754486047, |
|
"learning_rate": 3.2297449557271743e-06, |
|
"loss": 0.0503, |
|
"mean_token_accuracy": 0.9847882427275181, |
|
"num_tokens": 16621960.0, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 4.9638554216867465, |
|
"grad_norm": 0.5314105489580034, |
|
"learning_rate": 3.1579454714632853e-06, |
|
"loss": 0.0527, |
|
"mean_token_accuracy": 0.9841104596853256, |
|
"num_tokens": 16752156.0, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 5.019277108433735, |
|
"grad_norm": 0.8525557002128702, |
|
"learning_rate": 3.0865828381745515e-06, |
|
"loss": 0.0798, |
|
"mean_token_accuracy": 0.9847118079662323, |
|
"num_tokens": 16915558.0, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 5.057831325301205, |
|
"grad_norm": 0.4807676519731547, |
|
"learning_rate": 3.015673979809457e-06, |
|
"loss": 0.0433, |
|
"mean_token_accuracy": 0.9880142249166965, |
|
"num_tokens": 17045521.0, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 5.096385542168675, |
|
"grad_norm": 0.46177338047601046, |
|
"learning_rate": 2.9452357127018516e-06, |
|
"loss": 0.0409, |
|
"mean_token_accuracy": 0.9885750971734524, |
|
"num_tokens": 17175776.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.096385542168675, |
|
"eval_loss": 0.5568886995315552, |
|
"eval_mean_token_accuracy": 0.8947282572773015, |
|
"eval_num_tokens": 17175776.0, |
|
"eval_runtime": 32.8508, |
|
"eval_samples_per_second": 26.027, |
|
"eval_steps_per_second": 3.257, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.134939759036144, |
|
"grad_norm": 0.4354223067866842, |
|
"learning_rate": 2.8752847415828923e-06, |
|
"loss": 0.0378, |
|
"mean_token_accuracy": 0.9894357621669769, |
|
"num_tokens": 17306848.0, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 5.1734939759036145, |
|
"grad_norm": 0.46752372012241933, |
|
"learning_rate": 2.80583765561944e-06, |
|
"loss": 0.0369, |
|
"mean_token_accuracy": 0.989603690803051, |
|
"num_tokens": 17437920.0, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 5.212048192771085, |
|
"grad_norm": 0.495717384078659, |
|
"learning_rate": 2.736910924479881e-06, |
|
"loss": 0.0403, |
|
"mean_token_accuracy": 0.9886197298765182, |
|
"num_tokens": 17568116.0, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 5.250602409638554, |
|
"grad_norm": 0.5445603960561214, |
|
"learning_rate": 2.668520894428259e-06, |
|
"loss": 0.0396, |
|
"mean_token_accuracy": 0.9885961189866066, |
|
"num_tokens": 17699188.0, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 5.289156626506024, |
|
"grad_norm": 0.648469580339254, |
|
"learning_rate": 2.600683784447704e-06, |
|
"loss": 0.0398, |
|
"mean_token_accuracy": 0.9886266514658928, |
|
"num_tokens": 17830260.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.327710843373494, |
|
"grad_norm": 0.5496241291341013, |
|
"learning_rate": 2.5334156823940237e-06, |
|
"loss": 0.0434, |
|
"mean_token_accuracy": 0.9878862388432026, |
|
"num_tokens": 17961332.0, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 5.366265060240964, |
|
"grad_norm": 0.5345216492493586, |
|
"learning_rate": 2.466732541180404e-06, |
|
"loss": 0.0425, |
|
"mean_token_accuracy": 0.9877641089260578, |
|
"num_tokens": 18092404.0, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 5.404819277108434, |
|
"grad_norm": 0.48538502077655216, |
|
"learning_rate": 2.4006501749941097e-06, |
|
"loss": 0.04, |
|
"mean_token_accuracy": 0.9883747585117817, |
|
"num_tokens": 18223476.0, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 5.443373493975904, |
|
"grad_norm": 0.4611939418614471, |
|
"learning_rate": 2.335184255546083e-06, |
|
"loss": 0.0406, |
|
"mean_token_accuracy": 0.9881762973964214, |
|
"num_tokens": 18354548.0, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 5.481927710843373, |
|
"grad_norm": 0.5074811080316414, |
|
"learning_rate": 2.2703503083543288e-06, |
|
"loss": 0.0408, |
|
"mean_token_accuracy": 0.9885350540280342, |
|
"num_tokens": 18485620.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.481927710843373, |
|
"eval_loss": 0.5997635126113892, |
|
"eval_mean_token_accuracy": 0.893693176942451, |
|
"eval_num_tokens": 18485620.0, |
|
"eval_runtime": 32.9111, |
|
"eval_samples_per_second": 25.979, |
|
"eval_steps_per_second": 3.251, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.5204819277108435, |
|
"grad_norm": 0.4599599600334865, |
|
"learning_rate": 2.206163709061976e-06, |
|
"loss": 0.0411, |
|
"mean_token_accuracy": 0.988504521548748, |
|
"num_tokens": 18616692.0, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 5.559036144578314, |
|
"grad_norm": 0.45053903346967306, |
|
"learning_rate": 2.1426396797908764e-06, |
|
"loss": 0.0373, |
|
"mean_token_accuracy": 0.9894662946462631, |
|
"num_tokens": 18747764.0, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 5.597590361445783, |
|
"grad_norm": 0.5371349909088498, |
|
"learning_rate": 2.0797932855316183e-06, |
|
"loss": 0.0405, |
|
"mean_token_accuracy": 0.988229539245367, |
|
"num_tokens": 18877678.0, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 5.636144578313253, |
|
"grad_norm": 0.4593266181875638, |
|
"learning_rate": 2.017639430570794e-06, |
|
"loss": 0.0412, |
|
"mean_token_accuracy": 0.9881075993180275, |
|
"num_tokens": 19008750.0, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 5.674698795180722, |
|
"grad_norm": 0.48430078531022114, |
|
"learning_rate": 1.956192854956397e-06, |
|
"loss": 0.0415, |
|
"mean_token_accuracy": 0.9877486675977707, |
|
"num_tokens": 19139281.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 5.713253012048193, |
|
"grad_norm": 0.4989863180632785, |
|
"learning_rate": 1.8954681310021434e-06, |
|
"loss": 0.0424, |
|
"mean_token_accuracy": 0.9875351153314114, |
|
"num_tokens": 19270353.0, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 5.751807228915663, |
|
"grad_norm": 0.4623064292722575, |
|
"learning_rate": 1.8354796598315977e-06, |
|
"loss": 0.0407, |
|
"mean_token_accuracy": 0.9884434565901756, |
|
"num_tokens": 19401425.0, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 5.790361445783132, |
|
"grad_norm": 0.5014798249213933, |
|
"learning_rate": 1.7762416679628792e-06, |
|
"loss": 0.0394, |
|
"mean_token_accuracy": 0.9886740408837795, |
|
"num_tokens": 19532424.0, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 5.828915662650602, |
|
"grad_norm": 0.5392913823526073, |
|
"learning_rate": 1.7177682039347875e-06, |
|
"loss": 0.0408, |
|
"mean_token_accuracy": 0.9878557063639164, |
|
"num_tokens": 19663496.0, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 5.867469879518072, |
|
"grad_norm": 0.5613539090124229, |
|
"learning_rate": 1.6600731349751303e-06, |
|
"loss": 0.0396, |
|
"mean_token_accuracy": 0.988573219627142, |
|
"num_tokens": 19794568.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.867469879518072, |
|
"eval_loss": 0.6202346682548523, |
|
"eval_mean_token_accuracy": 0.8934637369396531, |
|
"eval_num_tokens": 19794568.0, |
|
"eval_runtime": 32.9094, |
|
"eval_samples_per_second": 25.98, |
|
"eval_steps_per_second": 3.251, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.906024096385542, |
|
"grad_norm": 0.6075220334585247, |
|
"learning_rate": 1.6031701437120512e-06, |
|
"loss": 0.0436, |
|
"mean_token_accuracy": 0.9871763586997986, |
|
"num_tokens": 19925640.0, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 5.944578313253012, |
|
"grad_norm": 0.478268700720953, |
|
"learning_rate": 1.5470727249291423e-06, |
|
"loss": 0.0392, |
|
"mean_token_accuracy": 0.9886266514658928, |
|
"num_tokens": 20056712.0, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 5.983132530120482, |
|
"grad_norm": 0.5299056008151652, |
|
"learning_rate": 1.4917941823650917e-06, |
|
"loss": 0.0393, |
|
"mean_token_accuracy": 0.9882984273135662, |
|
"num_tokens": 20187784.0, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 6.03855421686747, |
|
"grad_norm": 0.4515619693414329, |
|
"learning_rate": 1.4373476255586515e-06, |
|
"loss": 0.0539, |
|
"mean_token_accuracy": 0.9903883755207061, |
|
"num_tokens": 20351624.0, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 6.0771084337349395, |
|
"grad_norm": 0.4461331299908182, |
|
"learning_rate": 1.383745966739652e-06, |
|
"loss": 0.0377, |
|
"mean_token_accuracy": 0.9897639863193035, |
|
"num_tokens": 20482696.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.11566265060241, |
|
"grad_norm": 0.3811282983266442, |
|
"learning_rate": 1.3310019177668154e-06, |
|
"loss": 0.0335, |
|
"mean_token_accuracy": 0.9908020906150341, |
|
"num_tokens": 20613768.0, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 6.15421686746988, |
|
"grad_norm": 0.4003286418611843, |
|
"learning_rate": 1.2791279871130824e-06, |
|
"loss": 0.0315, |
|
"mean_token_accuracy": 0.9916570000350475, |
|
"num_tokens": 20744840.0, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 6.192771084337349, |
|
"grad_norm": 0.4136786703234865, |
|
"learning_rate": 1.2281364768991804e-06, |
|
"loss": 0.0357, |
|
"mean_token_accuracy": 0.990313570946455, |
|
"num_tokens": 20875912.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 6.231325301204819, |
|
"grad_norm": 0.45632226149343463, |
|
"learning_rate": 1.1780394799761163e-06, |
|
"loss": 0.0324, |
|
"mean_token_accuracy": 0.9911643601953983, |
|
"num_tokens": 21006167.0, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 6.2698795180722895, |
|
"grad_norm": 0.3996603424603635, |
|
"learning_rate": 1.1288488770573097e-06, |
|
"loss": 0.0305, |
|
"mean_token_accuracy": 0.9917750433087349, |
|
"num_tokens": 21135643.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.2698795180722895, |
|
"eval_loss": 0.6617009043693542, |
|
"eval_mean_token_accuracy": 0.8925447820503021, |
|
"eval_num_tokens": 21135643.0, |
|
"eval_runtime": 32.9359, |
|
"eval_samples_per_second": 25.96, |
|
"eval_steps_per_second": 3.249, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.308433734939759, |
|
"grad_norm": 0.4550106121984443, |
|
"learning_rate": 1.0805763339010329e-06, |
|
"loss": 0.0337, |
|
"mean_token_accuracy": 0.990831833332777, |
|
"num_tokens": 21266174.0, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 6.346987951807229, |
|
"grad_norm": 0.4540741073756364, |
|
"learning_rate": 1.0332332985438248e-06, |
|
"loss": 0.0348, |
|
"mean_token_accuracy": 0.990550197660923, |
|
"num_tokens": 21397246.0, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 6.385542168674699, |
|
"grad_norm": 0.47292748183552674, |
|
"learning_rate": 9.868309985855446e-07, |
|
"loss": 0.0326, |
|
"mean_token_accuracy": 0.9910158179700375, |
|
"num_tokens": 21528318.0, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 6.424096385542168, |
|
"grad_norm": 0.46337369369151543, |
|
"learning_rate": 9.41380438526694e-07, |
|
"loss": 0.0328, |
|
"mean_token_accuracy": 0.990702860057354, |
|
"num_tokens": 21659390.0, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 6.462650602409639, |
|
"grad_norm": 0.5096633620765385, |
|
"learning_rate": 8.968923971586596e-07, |
|
"loss": 0.0321, |
|
"mean_token_accuracy": 0.9912142790853977, |
|
"num_tokens": 21790462.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.501204819277109, |
|
"grad_norm": 0.4849844285466508, |
|
"learning_rate": 8.533774250074727e-07, |
|
"loss": 0.0307, |
|
"mean_token_accuracy": 0.991565402597189, |
|
"num_tokens": 21921534.0, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 6.539759036144578, |
|
"grad_norm": 0.47614922479928773, |
|
"learning_rate": 8.108458418317089e-07, |
|
"loss": 0.0329, |
|
"mean_token_accuracy": 0.9907412827014923, |
|
"num_tokens": 22051497.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 6.578313253012048, |
|
"grad_norm": 0.49852850537940474, |
|
"learning_rate": 7.693077341751138e-07, |
|
"loss": 0.0323, |
|
"mean_token_accuracy": 0.9908555224537849, |
|
"num_tokens": 22182569.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 6.6168674698795185, |
|
"grad_norm": 0.4273359220675533, |
|
"learning_rate": 7.287729529745386e-07, |
|
"loss": 0.0328, |
|
"mean_token_accuracy": 0.9911041483283043, |
|
"num_tokens": 22312765.0, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 6.655421686746988, |
|
"grad_norm": 0.4557886589116148, |
|
"learning_rate": 6.892511112237472e-07, |
|
"loss": 0.0344, |
|
"mean_token_accuracy": 0.9900464117527008, |
|
"num_tokens": 22443837.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.655421686746988, |
|
"eval_loss": 0.6737558841705322, |
|
"eval_mean_token_accuracy": 0.8922743284813711, |
|
"eval_num_tokens": 22443837.0, |
|
"eval_runtime": 32.8919, |
|
"eval_samples_per_second": 25.994, |
|
"eval_steps_per_second": 3.253, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.693975903614458, |
|
"grad_norm": 0.46620750191290955, |
|
"learning_rate": 6.507515816936538e-07, |
|
"loss": 0.0333, |
|
"mean_token_accuracy": 0.9908784218132496, |
|
"num_tokens": 22574909.0, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 6.732530120481927, |
|
"grad_norm": 0.4697001737130722, |
|
"learning_rate": 6.132834947095334e-07, |
|
"loss": 0.0328, |
|
"mean_token_accuracy": 0.9907486587762833, |
|
"num_tokens": 22705981.0, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 6.771084337349397, |
|
"grad_norm": 0.4430515622762748, |
|
"learning_rate": 5.768557359857241e-07, |
|
"loss": 0.0321, |
|
"mean_token_accuracy": 0.9912448115646839, |
|
"num_tokens": 22837053.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 6.809638554216868, |
|
"grad_norm": 0.4361690495303729, |
|
"learning_rate": 5.414769445183432e-07, |
|
"loss": 0.0328, |
|
"mean_token_accuracy": 0.9909089542925358, |
|
"num_tokens": 22968125.0, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 6.848192771084337, |
|
"grad_norm": 0.4200405293131056, |
|
"learning_rate": 5.071555105365156e-07, |
|
"loss": 0.0346, |
|
"mean_token_accuracy": 0.9901227429509163, |
|
"num_tokens": 23099197.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 6.886746987951807, |
|
"grad_norm": 0.47982475678883507, |
|
"learning_rate": 4.738995735125895e-07, |
|
"loss": 0.0358, |
|
"mean_token_accuracy": 0.9901380091905594, |
|
"num_tokens": 23230269.0, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 6.925301204819277, |
|
"grad_norm": 0.4588352996826278, |
|
"learning_rate": 4.4171702023183663e-07, |
|
"loss": 0.0345, |
|
"mean_token_accuracy": 0.9904306195676327, |
|
"num_tokens": 23361268.0, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 6.9638554216867465, |
|
"grad_norm": 0.49076163739088335, |
|
"learning_rate": 4.10615482922056e-07, |
|
"loss": 0.034, |
|
"mean_token_accuracy": 0.9904967658221722, |
|
"num_tokens": 23492340.0, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 7.019277108433735, |
|
"grad_norm": 0.7037303292281402, |
|
"learning_rate": 3.8060233744356634e-07, |
|
"loss": 0.0476, |
|
"mean_token_accuracy": 0.9915913552045822, |
|
"num_tokens": 23656180.0, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 7.057831325301205, |
|
"grad_norm": 0.46496724208097717, |
|
"learning_rate": 3.5168470153998937e-07, |
|
"loss": 0.0307, |
|
"mean_token_accuracy": 0.9916331619024277, |
|
"num_tokens": 23786143.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.057831325301205, |
|
"eval_loss": 0.6740805506706238, |
|
"eval_mean_token_accuracy": 0.8924460611610769, |
|
"eval_num_tokens": 23786143.0, |
|
"eval_runtime": 32.9163, |
|
"eval_samples_per_second": 25.975, |
|
"eval_steps_per_second": 3.251, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.096385542168675, |
|
"grad_norm": 0.43993509563002636, |
|
"learning_rate": 3.238694331502451e-07, |
|
"loss": 0.0299, |
|
"mean_token_accuracy": 0.9917791299521923, |
|
"num_tokens": 23917215.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 7.134939759036144, |
|
"grad_norm": 0.42297111455766123, |
|
"learning_rate": 2.9716312878216194e-07, |
|
"loss": 0.0295, |
|
"mean_token_accuracy": 0.992290548980236, |
|
"num_tokens": 24048287.0, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 7.1734939759036145, |
|
"grad_norm": 0.3740711012992464, |
|
"learning_rate": 2.71572121948091e-07, |
|
"loss": 0.0287, |
|
"mean_token_accuracy": 0.9923897795379162, |
|
"num_tokens": 24179359.0, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 7.212048192771085, |
|
"grad_norm": 0.40083480465861504, |
|
"learning_rate": 2.471024816628836e-07, |
|
"loss": 0.0309, |
|
"mean_token_accuracy": 0.9913974739611149, |
|
"num_tokens": 24310431.0, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 7.250602409638554, |
|
"grad_norm": 0.433443148283099, |
|
"learning_rate": 2.237600110046001e-07, |
|
"loss": 0.0325, |
|
"mean_token_accuracy": 0.9914738051593304, |
|
"num_tokens": 24441503.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.289156626506024, |
|
"grad_norm": 0.3913556728274319, |
|
"learning_rate": 2.0155024573828452e-07, |
|
"loss": 0.0309, |
|
"mean_token_accuracy": 0.9921867847442627, |
|
"num_tokens": 24572502.0, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 7.327710843373494, |
|
"grad_norm": 0.3931756889237697, |
|
"learning_rate": 1.8047845300313726e-07, |
|
"loss": 0.032, |
|
"mean_token_accuracy": 0.991260077804327, |
|
"num_tokens": 24703574.0, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 7.366265060240964, |
|
"grad_norm": 0.39372361510112325, |
|
"learning_rate": 1.6054963006338742e-07, |
|
"loss": 0.0307, |
|
"mean_token_accuracy": 0.9918172955513, |
|
"num_tokens": 24834646.0, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 7.404819277108434, |
|
"grad_norm": 0.4539746897925293, |
|
"learning_rate": 1.4176850312317246e-07, |
|
"loss": 0.0286, |
|
"mean_token_accuracy": 0.9920920878648758, |
|
"num_tokens": 24965718.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 7.443373493975904, |
|
"grad_norm": 0.3804093569086182, |
|
"learning_rate": 1.241395262056999e-07, |
|
"loss": 0.0282, |
|
"mean_token_accuracy": 0.9927180036902428, |
|
"num_tokens": 25096790.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.443373493975904, |
|
"eval_loss": 0.6863826513290405, |
|
"eval_mean_token_accuracy": 0.8920925671809188, |
|
"eval_num_tokens": 25096790.0, |
|
"eval_runtime": 32.9256, |
|
"eval_samples_per_second": 25.968, |
|
"eval_steps_per_second": 3.25, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.481927710843373, |
|
"grad_norm": 0.4086380049589114, |
|
"learning_rate": 1.0766688009695548e-07, |
|
"loss": 0.0296, |
|
"mean_token_accuracy": 0.9921302534639835, |
|
"num_tokens": 25227862.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 7.5204819277108435, |
|
"grad_norm": 0.42260905745056304, |
|
"learning_rate": 9.235447135421127e-08, |
|
"loss": 0.03, |
|
"mean_token_accuracy": 0.9916875325143337, |
|
"num_tokens": 25358934.0, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 7.559036144578314, |
|
"grad_norm": 0.4176362136377749, |
|
"learning_rate": 7.820593137957244e-08, |
|
"loss": 0.031, |
|
"mean_token_accuracy": 0.9918630942702293, |
|
"num_tokens": 25490006.0, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 7.597590361445783, |
|
"grad_norm": 0.45701242241975604, |
|
"learning_rate": 6.522461555877213e-08, |
|
"loss": 0.0349, |
|
"mean_token_accuracy": 0.9904204346239567, |
|
"num_tokens": 25621078.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 7.636144578313253, |
|
"grad_norm": 0.40879094458461246, |
|
"learning_rate": 5.341360246542804e-08, |
|
"loss": 0.0306, |
|
"mean_token_accuracy": 0.9920004904270172, |
|
"num_tokens": 25752150.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 7.674698795180722, |
|
"grad_norm": 0.43647524957553, |
|
"learning_rate": 4.2775693130948094e-08, |
|
"loss": 0.0317, |
|
"mean_token_accuracy": 0.9910463504493237, |
|
"num_tokens": 25883222.0, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 7.713253012048193, |
|
"grad_norm": 0.41444591834134303, |
|
"learning_rate": 3.3313410380250157e-08, |
|
"loss": 0.0312, |
|
"mean_token_accuracy": 0.9916607923805714, |
|
"num_tokens": 26012936.0, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 7.751807228915663, |
|
"grad_norm": 0.3803028538212038, |
|
"learning_rate": 2.5028998233467272e-08, |
|
"loss": 0.0306, |
|
"mean_token_accuracy": 0.9918249286711216, |
|
"num_tokens": 26144008.0, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 7.790361445783132, |
|
"grad_norm": 0.3600455476234704, |
|
"learning_rate": 1.7924421373766153e-08, |
|
"loss": 0.0292, |
|
"mean_token_accuracy": 0.9922676496207714, |
|
"num_tokens": 26275080.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 7.828915662650602, |
|
"grad_norm": 0.39321030231512055, |
|
"learning_rate": 1.200136468141544e-08, |
|
"loss": 0.0324, |
|
"mean_token_accuracy": 0.9913364090025425, |
|
"num_tokens": 26406152.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.828915662650602, |
|
"eval_loss": 0.6899031400680542, |
|
"eval_mean_token_accuracy": 0.892040293350398, |
|
"eval_num_tokens": 26406152.0, |
|
"eval_runtime": 32.9667, |
|
"eval_samples_per_second": 25.935, |
|
"eval_steps_per_second": 3.246, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.867469879518072, |
|
"grad_norm": 0.41654832211115, |
|
"learning_rate": 7.261232834209208e-09, |
|
"loss": 0.0292, |
|
"mean_token_accuracy": 0.9922164119780064, |
|
"num_tokens": 26536786.0, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 7.906024096385542, |
|
"grad_norm": 0.40099638349259836, |
|
"learning_rate": 3.705149974342348e-09, |
|
"loss": 0.0307, |
|
"mean_token_accuracy": 0.9913211427628994, |
|
"num_tokens": 26667858.0, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 7.944578313253012, |
|
"grad_norm": 0.42947674316382145, |
|
"learning_rate": 1.3339594418138036e-09, |
|
"loss": 0.0307, |
|
"mean_token_accuracy": 0.9916405789554119, |
|
"num_tokens": 26796896.0, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 7.983132530120482, |
|
"grad_norm": 0.41338624680752933, |
|
"learning_rate": 1.4822357442656475e-10, |
|
"loss": 0.0293, |
|
"mean_token_accuracy": 0.992038656026125, |
|
"num_tokens": 26927968.0, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 7.983132530120482, |
|
"step": 408, |
|
"total_flos": 39091383042048.0, |
|
"train_loss": 0.08385189656423879, |
|
"train_runtime": 3861.3463, |
|
"train_samples_per_second": 3.439, |
|
"train_steps_per_second": 0.106 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 408, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 39091383042048.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|