ebony59's picture
Model save
2ad8976 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.983132530120482,
"eval_steps": 20,
"global_step": 408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03855421686746988,
"grad_norm": 1.3406128184465234,
"learning_rate": 9.999851776425575e-06,
"loss": 0.2215,
"mean_token_accuracy": 0.9319889023900032,
"num_tokens": 131072.0,
"step": 2
},
{
"epoch": 0.07710843373493977,
"grad_norm": 1.0868872715450462,
"learning_rate": 9.998666040558187e-06,
"loss": 0.2017,
"mean_token_accuracy": 0.9355688355863094,
"num_tokens": 262144.0,
"step": 4
},
{
"epoch": 0.11566265060240964,
"grad_norm": 0.8046336700273372,
"learning_rate": 9.996294850025658e-06,
"loss": 0.1993,
"mean_token_accuracy": 0.9353551082313061,
"num_tokens": 393216.0,
"step": 6
},
{
"epoch": 0.15421686746987953,
"grad_norm": 0.6584361140812891,
"learning_rate": 9.992738767165791e-06,
"loss": 0.1778,
"mean_token_accuracy": 0.9412478767335415,
"num_tokens": 524288.0,
"step": 8
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.6774335920541141,
"learning_rate": 9.987998635318586e-06,
"loss": 0.1902,
"mean_token_accuracy": 0.9374810568988323,
"num_tokens": 654484.0,
"step": 10
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.6105295086669309,
"learning_rate": 9.982075578626235e-06,
"loss": 0.1966,
"mean_token_accuracy": 0.9350955821573734,
"num_tokens": 785556.0,
"step": 12
},
{
"epoch": 0.26987951807228916,
"grad_norm": 0.6597942712481647,
"learning_rate": 9.974971001766534e-06,
"loss": 0.1967,
"mean_token_accuracy": 0.9350586608052254,
"num_tokens": 915519.0,
"step": 14
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.6435049991004593,
"learning_rate": 9.96668658961975e-06,
"loss": 0.1941,
"mean_token_accuracy": 0.9362405501306057,
"num_tokens": 1046591.0,
"step": 16
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.5815050330318928,
"learning_rate": 9.957224306869053e-06,
"loss": 0.1978,
"mean_token_accuracy": 0.9344849325716496,
"num_tokens": 1177663.0,
"step": 18
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.6062885715121378,
"learning_rate": 9.946586397534572e-06,
"loss": 0.1901,
"mean_token_accuracy": 0.9375763460993767,
"num_tokens": 1308735.0,
"step": 20
},
{
"epoch": 0.3855421686746988,
"eval_loss": 0.3250451982021332,
"eval_mean_token_accuracy": 0.9015540636588479,
"eval_num_tokens": 1308735.0,
"eval_runtime": 32.894,
"eval_samples_per_second": 25.993,
"eval_steps_per_second": 3.253,
"step": 20
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.6159276714953087,
"learning_rate": 9.93477538444123e-06,
"loss": 0.1792,
"mean_token_accuracy": 0.9396754540503025,
"num_tokens": 1439807.0,
"step": 22
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.5991422890793899,
"learning_rate": 9.92179406862043e-06,
"loss": 0.1898,
"mean_token_accuracy": 0.9368689768016338,
"num_tokens": 1570062.0,
"step": 24
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.6010392974067564,
"learning_rate": 9.907645528645791e-06,
"loss": 0.1823,
"mean_token_accuracy": 0.9397899508476257,
"num_tokens": 1701134.0,
"step": 26
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.6463308628240576,
"learning_rate": 9.892333119903045e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.9400751106441021,
"num_tokens": 1832133.0,
"step": 28
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.5855780926666729,
"learning_rate": 9.875860473794302e-06,
"loss": 0.1887,
"mean_token_accuracy": 0.9366145730018616,
"num_tokens": 1963205.0,
"step": 30
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.5767450209509862,
"learning_rate": 9.85823149687683e-06,
"loss": 0.1806,
"mean_token_accuracy": 0.9403013698756695,
"num_tokens": 2094277.0,
"step": 32
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.6078338694602209,
"learning_rate": 9.839450369936615e-06,
"loss": 0.1804,
"mean_token_accuracy": 0.9400189444422722,
"num_tokens": 2225349.0,
"step": 34
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.5890734557721334,
"learning_rate": 9.819521546996864e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.9395788721740246,
"num_tokens": 2355263.0,
"step": 36
},
{
"epoch": 0.7325301204819277,
"grad_norm": 31.323491593405347,
"learning_rate": 9.798449754261716e-06,
"loss": 0.2134,
"mean_token_accuracy": 0.9329964704811573,
"num_tokens": 2486335.0,
"step": 38
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.668421005752788,
"learning_rate": 9.776239988995401e-06,
"loss": 0.1756,
"mean_token_accuracy": 0.9409272857010365,
"num_tokens": 2617407.0,
"step": 40
},
{
"epoch": 0.7710843373493976,
"eval_loss": 0.323132187128067,
"eval_mean_token_accuracy": 0.9019459669835099,
"eval_num_tokens": 2617407.0,
"eval_runtime": 32.8659,
"eval_samples_per_second": 26.015,
"eval_steps_per_second": 3.256,
"step": 40
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.6550725405812086,
"learning_rate": 9.752897518337117e-06,
"loss": 0.176,
"mean_token_accuracy": 0.9411181136965752,
"num_tokens": 2748479.0,
"step": 42
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.6735202725511205,
"learning_rate": 9.72842787805191e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.9372099563479424,
"num_tokens": 2879551.0,
"step": 44
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.6034625832801067,
"learning_rate": 9.702836871217838e-06,
"loss": 0.1783,
"mean_token_accuracy": 0.9399880617856979,
"num_tokens": 3010185.0,
"step": 46
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.5972724356437912,
"learning_rate": 9.676130566849757e-06,
"loss": 0.1878,
"mean_token_accuracy": 0.936996228992939,
"num_tokens": 3141257.0,
"step": 48
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.6007518988428301,
"learning_rate": 9.64831529846001e-06,
"loss": 0.1824,
"mean_token_accuracy": 0.9391285218298435,
"num_tokens": 3271788.0,
"step": 50
},
{
"epoch": 1.0192771084337349,
"grad_norm": 0.9787825024544956,
"learning_rate": 9.619397662556434e-06,
"loss": 0.2482,
"mean_token_accuracy": 0.9454934179782868,
"num_tokens": 3435628.0,
"step": 52
},
{
"epoch": 1.0578313253012048,
"grad_norm": 0.7523664235798198,
"learning_rate": 9.589384517077945e-06,
"loss": 0.1415,
"mean_token_accuracy": 0.9537432938814163,
"num_tokens": 3566700.0,
"step": 54
},
{
"epoch": 1.0963855421686748,
"grad_norm": 0.6132260326219375,
"learning_rate": 9.558282979768164e-06,
"loss": 0.1356,
"mean_token_accuracy": 0.9553309828042984,
"num_tokens": 3697772.0,
"step": 56
},
{
"epoch": 1.1349397590361445,
"grad_norm": 0.659449619068034,
"learning_rate": 9.52610042648741e-06,
"loss": 0.14,
"mean_token_accuracy": 0.9534532353281975,
"num_tokens": 3828844.0,
"step": 58
},
{
"epoch": 1.1734939759036145,
"grad_norm": 0.798456158560065,
"learning_rate": 9.492844489463486e-06,
"loss": 0.1402,
"mean_token_accuracy": 0.9540104530751705,
"num_tokens": 3959916.0,
"step": 60
},
{
"epoch": 1.1734939759036145,
"eval_loss": 0.3658570647239685,
"eval_mean_token_accuracy": 0.9004586478260076,
"eval_num_tokens": 3959916.0,
"eval_runtime": 32.9568,
"eval_samples_per_second": 25.943,
"eval_steps_per_second": 3.247,
"step": 60
},
{
"epoch": 1.2120481927710842,
"grad_norm": 0.7106569795117357,
"learning_rate": 9.458523055481658e-06,
"loss": 0.1328,
"mean_token_accuracy": 0.9561785310506821,
"num_tokens": 4089879.0,
"step": 62
},
{
"epoch": 1.2506024096385542,
"grad_norm": 0.622759262640549,
"learning_rate": 9.423144264014278e-06,
"loss": 0.1269,
"mean_token_accuracy": 0.9574377238750458,
"num_tokens": 4220951.0,
"step": 64
},
{
"epoch": 1.2891566265060241,
"grad_norm": 0.6303844538287657,
"learning_rate": 9.386716505290467e-06,
"loss": 0.1413,
"mean_token_accuracy": 0.9541173167526722,
"num_tokens": 4352023.0,
"step": 66
},
{
"epoch": 1.3277108433734939,
"grad_norm": 0.6507157381170863,
"learning_rate": 9.349248418306347e-06,
"loss": 0.1455,
"mean_token_accuracy": 0.9517892152070999,
"num_tokens": 4483095.0,
"step": 68
},
{
"epoch": 1.3662650602409638,
"grad_norm": 0.602439194952957,
"learning_rate": 9.310748888776254e-06,
"loss": 0.1309,
"mean_token_accuracy": 0.9569797366857529,
"num_tokens": 4614167.0,
"step": 70
},
{
"epoch": 1.4048192771084338,
"grad_norm": 0.603487797008034,
"learning_rate": 9.271227047025462e-06,
"loss": 0.1333,
"mean_token_accuracy": 0.9561706259846687,
"num_tokens": 4745239.0,
"step": 72
},
{
"epoch": 1.4433734939759035,
"grad_norm": 0.6146915599817614,
"learning_rate": 9.230692265824888e-06,
"loss": 0.1333,
"mean_token_accuracy": 0.9552088528871536,
"num_tokens": 4876311.0,
"step": 74
},
{
"epoch": 1.4819277108433735,
"grad_norm": 0.6588127818916016,
"learning_rate": 9.189154158168293e-06,
"loss": 0.1308,
"mean_token_accuracy": 0.9567278437316418,
"num_tokens": 5007383.0,
"step": 76
},
{
"epoch": 1.5204819277108435,
"grad_norm": 0.6120032825642568,
"learning_rate": 9.146622574992528e-06,
"loss": 0.1461,
"mean_token_accuracy": 0.9520487412810326,
"num_tokens": 5138455.0,
"step": 78
},
{
"epoch": 1.5590361445783132,
"grad_norm": 0.6485104767700068,
"learning_rate": 9.103107602841341e-06,
"loss": 0.1384,
"mean_token_accuracy": 0.9545066058635712,
"num_tokens": 5269527.0,
"step": 80
},
{
"epoch": 1.5590361445783132,
"eval_loss": 0.3478308916091919,
"eval_mean_token_accuracy": 0.9005198361717652,
"eval_num_tokens": 5269527.0,
"eval_runtime": 32.9211,
"eval_samples_per_second": 25.971,
"eval_steps_per_second": 3.25,
"step": 80
},
{
"epoch": 1.5975903614457831,
"grad_norm": 0.6036456378189476,
"learning_rate": 9.058619561473308e-06,
"loss": 0.1351,
"mean_token_accuracy": 0.9550928063690662,
"num_tokens": 5400161.0,
"step": 82
},
{
"epoch": 1.636144578313253,
"grad_norm": 0.5662228136563036,
"learning_rate": 9.013169001414458e-06,
"loss": 0.1389,
"mean_token_accuracy": 0.9542012810707092,
"num_tokens": 5531233.0,
"step": 84
},
{
"epoch": 1.6746987951807228,
"grad_norm": 0.668176616317804,
"learning_rate": 8.966766701456177e-06,
"loss": 0.1472,
"mean_token_accuracy": 0.9509572051465511,
"num_tokens": 5662305.0,
"step": 86
},
{
"epoch": 1.7132530120481928,
"grad_norm": 0.6659519140143432,
"learning_rate": 8.91942366609897e-06,
"loss": 0.1278,
"mean_token_accuracy": 0.9569377563893795,
"num_tokens": 5792219.0,
"step": 88
},
{
"epoch": 1.7518072289156628,
"grad_norm": 0.5730839087389755,
"learning_rate": 8.871151122942692e-06,
"loss": 0.1383,
"mean_token_accuracy": 0.9540486186742783,
"num_tokens": 5923291.0,
"step": 90
},
{
"epoch": 1.7903614457831325,
"grad_norm": 0.7339967333799423,
"learning_rate": 8.821960520023884e-06,
"loss": 0.1473,
"mean_token_accuracy": 0.9519094601273537,
"num_tokens": 6053822.0,
"step": 92
},
{
"epoch": 1.8289156626506025,
"grad_norm": 0.6406836390946679,
"learning_rate": 8.771863523100821e-06,
"loss": 0.1412,
"mean_token_accuracy": 0.9523235335946083,
"num_tokens": 6184894.0,
"step": 94
},
{
"epoch": 1.8674698795180724,
"grad_norm": 0.5600258442949189,
"learning_rate": 8.720872012886918e-06,
"loss": 0.1477,
"mean_token_accuracy": 0.9504381828010082,
"num_tokens": 6315149.0,
"step": 96
},
{
"epoch": 1.9060240963855422,
"grad_norm": 0.5933525309833521,
"learning_rate": 8.668998082233186e-06,
"loss": 0.1356,
"mean_token_accuracy": 0.9551424346864223,
"num_tokens": 6445345.0,
"step": 98
},
{
"epoch": 1.944578313253012,
"grad_norm": 0.6535174387269015,
"learning_rate": 8.616254033260351e-06,
"loss": 0.1365,
"mean_token_accuracy": 0.9537569470703602,
"num_tokens": 6576344.0,
"step": 100
},
{
"epoch": 1.944578313253012,
"eval_loss": 0.34725868701934814,
"eval_mean_token_accuracy": 0.9006962408529264,
"eval_num_tokens": 6576344.0,
"eval_runtime": 32.8904,
"eval_samples_per_second": 25.995,
"eval_steps_per_second": 3.253,
"step": 100
},
{
"epoch": 1.983132530120482,
"grad_norm": 0.660872027341127,
"learning_rate": 8.56265237444135e-06,
"loss": 0.1433,
"mean_token_accuracy": 0.9518960788846016,
"num_tokens": 6707416.0,
"step": 102
},
{
"epoch": 2.0385542168674697,
"grad_norm": 0.6405367004816473,
"learning_rate": 8.508205817634908e-06,
"loss": 0.1641,
"mean_token_accuracy": 0.9666707456111908,
"num_tokens": 6871256.0,
"step": 104
},
{
"epoch": 2.07710843373494,
"grad_norm": 0.6396095656248479,
"learning_rate": 8.452927275070858e-06,
"loss": 0.1026,
"mean_token_accuracy": 0.968053013086319,
"num_tokens": 7002255.0,
"step": 106
},
{
"epoch": 2.1156626506024097,
"grad_norm": 0.6728841385013591,
"learning_rate": 8.39682985628795e-06,
"loss": 0.1013,
"mean_token_accuracy": 0.9683454521000385,
"num_tokens": 7133327.0,
"step": 108
},
{
"epoch": 2.1542168674698794,
"grad_norm": 0.6356701289090003,
"learning_rate": 8.339926865024871e-06,
"loss": 0.1015,
"mean_token_accuracy": 0.9669992625713348,
"num_tokens": 7263523.0,
"step": 110
},
{
"epoch": 2.1927710843373496,
"grad_norm": 0.6909824113954701,
"learning_rate": 8.282231796065215e-06,
"loss": 0.0982,
"mean_token_accuracy": 0.9683454521000385,
"num_tokens": 7394595.0,
"step": 112
},
{
"epoch": 2.2313253012048193,
"grad_norm": 0.6949675615015238,
"learning_rate": 8.223758332037121e-06,
"loss": 0.0971,
"mean_token_accuracy": 0.9685210138559341,
"num_tokens": 7525667.0,
"step": 114
},
{
"epoch": 2.269879518072289,
"grad_norm": 0.5752622256768609,
"learning_rate": 8.164520340168404e-06,
"loss": 0.1046,
"mean_token_accuracy": 0.9659944511950016,
"num_tokens": 7656739.0,
"step": 116
},
{
"epoch": 2.3084337349397592,
"grad_norm": 0.6456834319746314,
"learning_rate": 8.104531868997858e-06,
"loss": 0.1,
"mean_token_accuracy": 0.9683759845793247,
"num_tokens": 7787811.0,
"step": 118
},
{
"epoch": 2.346987951807229,
"grad_norm": 0.6268887677724969,
"learning_rate": 8.043807145043604e-06,
"loss": 0.1089,
"mean_token_accuracy": 0.9643549546599388,
"num_tokens": 7916616.0,
"step": 120
},
{
"epoch": 2.346987951807229,
"eval_loss": 0.3908001780509949,
"eval_mean_token_accuracy": 0.8989515037180107,
"eval_num_tokens": 7916616.0,
"eval_runtime": 32.9535,
"eval_samples_per_second": 25.946,
"eval_steps_per_second": 3.247,
"step": 120
},
{
"epoch": 2.3855421686746987,
"grad_norm": 0.594448184955674,
"learning_rate": 7.982360569429206e-06,
"loss": 0.0919,
"mean_token_accuracy": 0.9701926670968533,
"num_tokens": 8047688.0,
"step": 122
},
{
"epoch": 2.4240963855421684,
"grad_norm": 0.6509482507816816,
"learning_rate": 7.920206714468383e-06,
"loss": 0.1,
"mean_token_accuracy": 0.9679943285882473,
"num_tokens": 8178760.0,
"step": 124
},
{
"epoch": 2.4626506024096386,
"grad_norm": 0.6114017480150069,
"learning_rate": 7.857360320209126e-06,
"loss": 0.1079,
"mean_token_accuracy": 0.9651777073740959,
"num_tokens": 8309832.0,
"step": 126
},
{
"epoch": 2.5012048192771084,
"grad_norm": 0.6679792242974475,
"learning_rate": 7.793836290938026e-06,
"loss": 0.0967,
"mean_token_accuracy": 0.9691850952804089,
"num_tokens": 8440904.0,
"step": 128
},
{
"epoch": 2.539759036144578,
"grad_norm": 0.5886521368506863,
"learning_rate": 7.729649691645673e-06,
"loss": 0.0991,
"mean_token_accuracy": 0.9681088253855705,
"num_tokens": 8571976.0,
"step": 130
},
{
"epoch": 2.5783132530120483,
"grad_norm": 0.6059968735763466,
"learning_rate": 7.664815744453918e-06,
"loss": 0.1017,
"mean_token_accuracy": 0.9668340943753719,
"num_tokens": 8703048.0,
"step": 132
},
{
"epoch": 2.616867469879518,
"grad_norm": 0.63703209859714,
"learning_rate": 7.599349825005892e-06,
"loss": 0.1013,
"mean_token_accuracy": 0.9671317860484123,
"num_tokens": 8834120.0,
"step": 134
},
{
"epoch": 2.6554216867469878,
"grad_norm": 0.6404376584659325,
"learning_rate": 7.533267458819597e-06,
"loss": 0.1081,
"mean_token_accuracy": 0.9647228345274925,
"num_tokens": 8964375.0,
"step": 136
},
{
"epoch": 2.693975903614458,
"grad_norm": 0.5731515889981167,
"learning_rate": 7.466584317605978e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9682356528937817,
"num_tokens": 9094906.0,
"step": 138
},
{
"epoch": 2.7325301204819277,
"grad_norm": 0.5848185748214219,
"learning_rate": 7.399316215552296e-06,
"loss": 0.1016,
"mean_token_accuracy": 0.9662845097482204,
"num_tokens": 9225978.0,
"step": 140
},
{
"epoch": 2.7325301204819277,
"eval_loss": 0.3856795132160187,
"eval_mean_token_accuracy": 0.8990027659407286,
"eval_num_tokens": 9225978.0,
"eval_runtime": 32.897,
"eval_samples_per_second": 25.99,
"eval_steps_per_second": 3.253,
"step": 140
},
{
"epoch": 2.7710843373493974,
"grad_norm": 0.6029958264244917,
"learning_rate": 7.33147910557174e-06,
"loss": 0.0958,
"mean_token_accuracy": 0.9687576405704021,
"num_tokens": 9357050.0,
"step": 142
},
{
"epoch": 2.8096385542168676,
"grad_norm": 0.6783830228161134,
"learning_rate": 7.26308907552012e-06,
"loss": 0.1078,
"mean_token_accuracy": 0.9650097787380219,
"num_tokens": 9488122.0,
"step": 144
},
{
"epoch": 2.8481927710843373,
"grad_norm": 0.5827966276168525,
"learning_rate": 7.194162344380561e-06,
"loss": 0.1031,
"mean_token_accuracy": 0.9667501300573349,
"num_tokens": 9619194.0,
"step": 146
},
{
"epoch": 2.886746987951807,
"grad_norm": 0.6450599697404067,
"learning_rate": 7.124715258417111e-06,
"loss": 0.1087,
"mean_token_accuracy": 0.9652311392128468,
"num_tokens": 9750266.0,
"step": 148
},
{
"epoch": 2.9253012048192772,
"grad_norm": 0.5260422478675805,
"learning_rate": 7.05476428729815e-06,
"loss": 0.0976,
"mean_token_accuracy": 0.9683836176991463,
"num_tokens": 9881338.0,
"step": 150
},
{
"epoch": 2.963855421686747,
"grad_norm": 0.6011167506104717,
"learning_rate": 6.984326020190544e-06,
"loss": 0.0996,
"mean_token_accuracy": 0.9671653471887112,
"num_tokens": 10011972.0,
"step": 152
},
{
"epoch": 3.019277108433735,
"grad_norm": 1.067138524628119,
"learning_rate": 6.913417161825449e-06,
"loss": 0.1396,
"mean_token_accuracy": 0.9718551605939865,
"num_tokens": 10175812.0,
"step": 154
},
{
"epoch": 3.057831325301205,
"grad_norm": 0.6433380773043862,
"learning_rate": 6.842054528536717e-06,
"loss": 0.0748,
"mean_token_accuracy": 0.9779708161950111,
"num_tokens": 10306884.0,
"step": 156
},
{
"epoch": 3.0963855421686746,
"grad_norm": 0.5518681598947663,
"learning_rate": 6.770255044272826e-06,
"loss": 0.0736,
"mean_token_accuracy": 0.9776161313056946,
"num_tokens": 10437883.0,
"step": 158
},
{
"epoch": 3.1349397590361447,
"grad_norm": 0.5716170366461122,
"learning_rate": 6.698035736583307e-06,
"loss": 0.0704,
"mean_token_accuracy": 0.9789860211312771,
"num_tokens": 10568955.0,
"step": 160
},
{
"epoch": 3.1349397590361447,
"eval_loss": 0.4618055820465088,
"eval_mean_token_accuracy": 0.8969022353118825,
"eval_num_tokens": 10568955.0,
"eval_runtime": 32.9015,
"eval_samples_per_second": 25.987,
"eval_steps_per_second": 3.252,
"step": 160
},
{
"epoch": 3.1734939759036145,
"grad_norm": 0.5660248629147534,
"learning_rate": 6.625413732580577e-06,
"loss": 0.0718,
"mean_token_accuracy": 0.9778792187571526,
"num_tokens": 10700027.0,
"step": 162
},
{
"epoch": 3.212048192771084,
"grad_norm": 0.6184270631568148,
"learning_rate": 6.552406254878175e-06,
"loss": 0.0708,
"mean_token_accuracy": 0.9779784493148327,
"num_tokens": 10831099.0,
"step": 164
},
{
"epoch": 3.2506024096385544,
"grad_norm": 0.7235816673040518,
"learning_rate": 6.4790306175063535e-06,
"loss": 0.0752,
"mean_token_accuracy": 0.9767876826226711,
"num_tokens": 10962171.0,
"step": 166
},
{
"epoch": 3.289156626506024,
"grad_norm": 0.6381234994007693,
"learning_rate": 6.405304221805972e-06,
"loss": 0.0713,
"mean_token_accuracy": 0.978339895606041,
"num_tokens": 11092367.0,
"step": 168
},
{
"epoch": 3.327710843373494,
"grad_norm": 0.625884194442065,
"learning_rate": 6.331244552301705e-06,
"loss": 0.0675,
"mean_token_accuracy": 0.9787493944168091,
"num_tokens": 11223439.0,
"step": 170
},
{
"epoch": 3.3662650602409636,
"grad_norm": 0.6347808690260591,
"learning_rate": 6.2568691725555144e-06,
"loss": 0.074,
"mean_token_accuracy": 0.9773525334894657,
"num_tokens": 11354511.0,
"step": 172
},
{
"epoch": 3.404819277108434,
"grad_norm": 0.5756416236482357,
"learning_rate": 6.182195721001366e-06,
"loss": 0.0804,
"mean_token_accuracy": 0.9753831885755062,
"num_tokens": 11485583.0,
"step": 174
},
{
"epoch": 3.4433734939759035,
"grad_norm": 0.5275947037323039,
"learning_rate": 6.107241906762214e-06,
"loss": 0.069,
"mean_token_accuracy": 0.9786049015820026,
"num_tokens": 11616217.0,
"step": 176
},
{
"epoch": 3.4819277108433733,
"grad_norm": 0.5665166566187169,
"learning_rate": 6.0320255054501985e-06,
"loss": 0.0677,
"mean_token_accuracy": 0.9791157841682434,
"num_tokens": 11747289.0,
"step": 178
},
{
"epoch": 3.5204819277108435,
"grad_norm": 0.6415976004443386,
"learning_rate": 5.956564354951091e-06,
"loss": 0.0704,
"mean_token_accuracy": 0.9784059040248394,
"num_tokens": 11878361.0,
"step": 180
},
{
"epoch": 3.5204819277108435,
"eval_loss": 0.45516592264175415,
"eval_mean_token_accuracy": 0.8970169062926391,
"eval_num_tokens": 11878361.0,
"eval_runtime": 32.9299,
"eval_samples_per_second": 25.964,
"eval_steps_per_second": 3.249,
"step": 180
},
{
"epoch": 3.559036144578313,
"grad_norm": 0.5965252456073897,
"learning_rate": 5.880876351193956e-06,
"loss": 0.0719,
"mean_token_accuracy": 0.9771998710930347,
"num_tokens": 12009433.0,
"step": 182
},
{
"epoch": 3.597590361445783,
"grad_norm": 0.5761543620705893,
"learning_rate": 5.804979443907065e-06,
"loss": 0.0768,
"mean_token_accuracy": 0.9757114127278328,
"num_tokens": 12140505.0,
"step": 184
},
{
"epoch": 3.636144578313253,
"grad_norm": 0.6381827427058839,
"learning_rate": 5.728891632361043e-06,
"loss": 0.072,
"mean_token_accuracy": 0.9778639525175095,
"num_tokens": 12271577.0,
"step": 186
},
{
"epoch": 3.674698795180723,
"grad_norm": 0.5760484373146149,
"learning_rate": 5.65263096110026e-06,
"loss": 0.0737,
"mean_token_accuracy": 0.9771769717335701,
"num_tokens": 12402649.0,
"step": 188
},
{
"epoch": 3.7132530120481926,
"grad_norm": 0.6007872958623798,
"learning_rate": 5.576215515663489e-06,
"loss": 0.0771,
"mean_token_accuracy": 0.9759022407233715,
"num_tokens": 12533721.0,
"step": 190
},
{
"epoch": 3.7518072289156628,
"grad_norm": 0.5865088698200008,
"learning_rate": 5.499663418294858e-06,
"loss": 0.0765,
"mean_token_accuracy": 0.9760418757796288,
"num_tokens": 12663435.0,
"step": 192
},
{
"epoch": 3.7903614457831325,
"grad_norm": 0.5869508573232244,
"learning_rate": 5.4229928236460705e-06,
"loss": 0.0688,
"mean_token_accuracy": 0.9785509333014488,
"num_tokens": 12794507.0,
"step": 194
},
{
"epoch": 3.8289156626506022,
"grad_norm": 0.6151614870939663,
"learning_rate": 5.346221914470959e-06,
"loss": 0.076,
"mean_token_accuracy": 0.9760396368801594,
"num_tokens": 12925579.0,
"step": 196
},
{
"epoch": 3.8674698795180724,
"grad_norm": 0.5387118456522078,
"learning_rate": 5.2693688973133675e-06,
"loss": 0.0705,
"mean_token_accuracy": 0.9784669689834118,
"num_tokens": 13056651.0,
"step": 198
},
{
"epoch": 3.906024096385542,
"grad_norm": 0.5994849742352796,
"learning_rate": 5.192451998189392e-06,
"loss": 0.0756,
"mean_token_accuracy": 0.9767113514244556,
"num_tokens": 13187723.0,
"step": 200
},
{
"epoch": 3.906024096385542,
"eval_loss": 0.4474319517612457,
"eval_mean_token_accuracy": 0.8968760477048214,
"eval_num_tokens": 13187723.0,
"eval_runtime": 32.9071,
"eval_samples_per_second": 25.982,
"eval_steps_per_second": 3.252,
"step": 200
},
{
"epoch": 3.944578313253012,
"grad_norm": 0.5535712673806997,
"learning_rate": 5.115489458265006e-06,
"loss": 0.0742,
"mean_token_accuracy": 0.976105697453022,
"num_tokens": 13317686.0,
"step": 202
},
{
"epoch": 3.983132530120482,
"grad_norm": 0.5578205642046994,
"learning_rate": 5.038499529530094e-06,
"loss": 0.0706,
"mean_token_accuracy": 0.9777835607528687,
"num_tokens": 13447600.0,
"step": 204
},
{
"epoch": 4.03855421686747,
"grad_norm": 0.5165401701064283,
"learning_rate": 4.961500470469908e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.9837689340114594,
"num_tokens": 13611440.0,
"step": 206
},
{
"epoch": 4.0771084337349395,
"grad_norm": 0.5606600779130405,
"learning_rate": 4.8845105417349955e-06,
"loss": 0.0514,
"mean_token_accuracy": 0.9854289144277573,
"num_tokens": 13740537.0,
"step": 208
},
{
"epoch": 4.11566265060241,
"grad_norm": 0.5032380365598611,
"learning_rate": 4.807548001810611e-06,
"loss": 0.054,
"mean_token_accuracy": 0.984481867402792,
"num_tokens": 13871609.0,
"step": 210
},
{
"epoch": 4.15421686746988,
"grad_norm": 0.5627408303007229,
"learning_rate": 4.730631102686635e-06,
"loss": 0.0551,
"mean_token_accuracy": 0.9837490878999233,
"num_tokens": 14002681.0,
"step": 212
},
{
"epoch": 4.192771084337349,
"grad_norm": 0.6584478813758422,
"learning_rate": 4.653778085529043e-06,
"loss": 0.0573,
"mean_token_accuracy": 0.9833140000700951,
"num_tokens": 14133753.0,
"step": 214
},
{
"epoch": 4.231325301204819,
"grad_norm": 0.6355929893733159,
"learning_rate": 4.577007176353931e-06,
"loss": 0.053,
"mean_token_accuracy": 0.9844788759946823,
"num_tokens": 14264314.0,
"step": 216
},
{
"epoch": 4.2698795180722895,
"grad_norm": 0.5944238331457758,
"learning_rate": 4.5003365817051434e-06,
"loss": 0.0495,
"mean_token_accuracy": 0.9854665398597717,
"num_tokens": 14395386.0,
"step": 218
},
{
"epoch": 4.308433734939759,
"grad_norm": 0.5486764850427266,
"learning_rate": 4.4237844843365126e-06,
"loss": 0.0548,
"mean_token_accuracy": 0.9835506267845631,
"num_tokens": 14526458.0,
"step": 220
},
{
"epoch": 4.308433734939759,
"eval_loss": 0.5238527059555054,
"eval_mean_token_accuracy": 0.895290755222891,
"eval_num_tokens": 14526458.0,
"eval_runtime": 32.8995,
"eval_samples_per_second": 25.988,
"eval_steps_per_second": 3.252,
"step": 220
},
{
"epoch": 4.346987951807229,
"grad_norm": 0.5193566382330146,
"learning_rate": 4.347369038899744e-06,
"loss": 0.0512,
"mean_token_accuracy": 0.9847192876040936,
"num_tokens": 14656421.0,
"step": 222
},
{
"epoch": 4.385542168674699,
"grad_norm": 0.6081963974814661,
"learning_rate": 4.271108367638959e-06,
"loss": 0.0512,
"mean_token_accuracy": 0.9849474877119064,
"num_tokens": 14787493.0,
"step": 224
},
{
"epoch": 4.424096385542168,
"grad_norm": 0.5352653032319554,
"learning_rate": 4.195020556092935e-06,
"loss": 0.0508,
"mean_token_accuracy": 0.984955120831728,
"num_tokens": 14918565.0,
"step": 226
},
{
"epoch": 4.462650602409639,
"grad_norm": 0.5471233273653687,
"learning_rate": 4.119123648806046e-06,
"loss": 0.0524,
"mean_token_accuracy": 0.9842299744486809,
"num_tokens": 15049637.0,
"step": 228
},
{
"epoch": 4.501204819277109,
"grad_norm": 0.5761419852713972,
"learning_rate": 4.043435645048911e-06,
"loss": 0.0538,
"mean_token_accuracy": 0.9835735261440277,
"num_tokens": 15180709.0,
"step": 230
},
{
"epoch": 4.539759036144578,
"grad_norm": 0.5193378395323519,
"learning_rate": 3.967974494549803e-06,
"loss": 0.0527,
"mean_token_accuracy": 0.9835735261440277,
"num_tokens": 15311781.0,
"step": 232
},
{
"epoch": 4.578313253012048,
"grad_norm": 0.6838502479167475,
"learning_rate": 3.892758093237788e-06,
"loss": 0.0505,
"mean_token_accuracy": 0.9845734648406506,
"num_tokens": 15442853.0,
"step": 234
},
{
"epoch": 4.6168674698795185,
"grad_norm": 0.6161795707876571,
"learning_rate": 3.8178042789986355e-06,
"loss": 0.0597,
"mean_token_accuracy": 0.9818713404238224,
"num_tokens": 15573925.0,
"step": 236
},
{
"epoch": 4.655421686746988,
"grad_norm": 0.5673557334484985,
"learning_rate": 3.743130827444487e-06,
"loss": 0.0551,
"mean_token_accuracy": 0.9827491492033005,
"num_tokens": 15704997.0,
"step": 238
},
{
"epoch": 4.693975903614458,
"grad_norm": 0.605102586233592,
"learning_rate": 3.6687554476982954e-06,
"loss": 0.0516,
"mean_token_accuracy": 0.9845963642001152,
"num_tokens": 15836069.0,
"step": 240
},
{
"epoch": 4.693975903614458,
"eval_loss": 0.5330836176872253,
"eval_mean_token_accuracy": 0.894900638366414,
"eval_num_tokens": 15836069.0,
"eval_runtime": 32.8994,
"eval_samples_per_second": 25.988,
"eval_steps_per_second": 3.252,
"step": 240
},
{
"epoch": 4.732530120481927,
"grad_norm": 0.6102630121983813,
"learning_rate": 3.5946957781940296e-06,
"loss": 0.0592,
"mean_token_accuracy": 0.9820545352995396,
"num_tokens": 15967141.0,
"step": 242
},
{
"epoch": 4.771084337349397,
"grad_norm": 0.5601462915097579,
"learning_rate": 3.5209693824936486e-06,
"loss": 0.049,
"mean_token_accuracy": 0.9852528125047684,
"num_tokens": 16098213.0,
"step": 244
},
{
"epoch": 4.809638554216868,
"grad_norm": 0.5729256978696017,
"learning_rate": 3.4475937451218257e-06,
"loss": 0.0534,
"mean_token_accuracy": 0.9839246496558189,
"num_tokens": 16229285.0,
"step": 246
},
{
"epoch": 4.848192771084337,
"grad_norm": 0.5576976700355626,
"learning_rate": 3.3745862674194246e-06,
"loss": 0.053,
"mean_token_accuracy": 0.984092578291893,
"num_tokens": 16360357.0,
"step": 248
},
{
"epoch": 4.886746987951807,
"grad_norm": 0.568301505042255,
"learning_rate": 3.301964263416693e-06,
"loss": 0.0501,
"mean_token_accuracy": 0.984886422753334,
"num_tokens": 16491429.0,
"step": 250
},
{
"epoch": 4.925301204819277,
"grad_norm": 0.506093754486047,
"learning_rate": 3.2297449557271743e-06,
"loss": 0.0503,
"mean_token_accuracy": 0.9847882427275181,
"num_tokens": 16621960.0,
"step": 252
},
{
"epoch": 4.9638554216867465,
"grad_norm": 0.5314105489580034,
"learning_rate": 3.1579454714632853e-06,
"loss": 0.0527,
"mean_token_accuracy": 0.9841104596853256,
"num_tokens": 16752156.0,
"step": 254
},
{
"epoch": 5.019277108433735,
"grad_norm": 0.8525557002128702,
"learning_rate": 3.0865828381745515e-06,
"loss": 0.0798,
"mean_token_accuracy": 0.9847118079662323,
"num_tokens": 16915558.0,
"step": 256
},
{
"epoch": 5.057831325301205,
"grad_norm": 0.4807676519731547,
"learning_rate": 3.015673979809457e-06,
"loss": 0.0433,
"mean_token_accuracy": 0.9880142249166965,
"num_tokens": 17045521.0,
"step": 258
},
{
"epoch": 5.096385542168675,
"grad_norm": 0.46177338047601046,
"learning_rate": 2.9452357127018516e-06,
"loss": 0.0409,
"mean_token_accuracy": 0.9885750971734524,
"num_tokens": 17175776.0,
"step": 260
},
{
"epoch": 5.096385542168675,
"eval_loss": 0.5568886995315552,
"eval_mean_token_accuracy": 0.8947282572773015,
"eval_num_tokens": 17175776.0,
"eval_runtime": 32.8508,
"eval_samples_per_second": 26.027,
"eval_steps_per_second": 3.257,
"step": 260
},
{
"epoch": 5.134939759036144,
"grad_norm": 0.4354223067866842,
"learning_rate": 2.8752847415828923e-06,
"loss": 0.0378,
"mean_token_accuracy": 0.9894357621669769,
"num_tokens": 17306848.0,
"step": 262
},
{
"epoch": 5.1734939759036145,
"grad_norm": 0.46752372012241933,
"learning_rate": 2.80583765561944e-06,
"loss": 0.0369,
"mean_token_accuracy": 0.989603690803051,
"num_tokens": 17437920.0,
"step": 264
},
{
"epoch": 5.212048192771085,
"grad_norm": 0.495717384078659,
"learning_rate": 2.736910924479881e-06,
"loss": 0.0403,
"mean_token_accuracy": 0.9886197298765182,
"num_tokens": 17568116.0,
"step": 266
},
{
"epoch": 5.250602409638554,
"grad_norm": 0.5445603960561214,
"learning_rate": 2.668520894428259e-06,
"loss": 0.0396,
"mean_token_accuracy": 0.9885961189866066,
"num_tokens": 17699188.0,
"step": 268
},
{
"epoch": 5.289156626506024,
"grad_norm": 0.648469580339254,
"learning_rate": 2.600683784447704e-06,
"loss": 0.0398,
"mean_token_accuracy": 0.9886266514658928,
"num_tokens": 17830260.0,
"step": 270
},
{
"epoch": 5.327710843373494,
"grad_norm": 0.5496241291341013,
"learning_rate": 2.5334156823940237e-06,
"loss": 0.0434,
"mean_token_accuracy": 0.9878862388432026,
"num_tokens": 17961332.0,
"step": 272
},
{
"epoch": 5.366265060240964,
"grad_norm": 0.5345216492493586,
"learning_rate": 2.466732541180404e-06,
"loss": 0.0425,
"mean_token_accuracy": 0.9877641089260578,
"num_tokens": 18092404.0,
"step": 274
},
{
"epoch": 5.404819277108434,
"grad_norm": 0.48538502077655216,
"learning_rate": 2.4006501749941097e-06,
"loss": 0.04,
"mean_token_accuracy": 0.9883747585117817,
"num_tokens": 18223476.0,
"step": 276
},
{
"epoch": 5.443373493975904,
"grad_norm": 0.4611939418614471,
"learning_rate": 2.335184255546083e-06,
"loss": 0.0406,
"mean_token_accuracy": 0.9881762973964214,
"num_tokens": 18354548.0,
"step": 278
},
{
"epoch": 5.481927710843373,
"grad_norm": 0.5074811080316414,
"learning_rate": 2.2703503083543288e-06,
"loss": 0.0408,
"mean_token_accuracy": 0.9885350540280342,
"num_tokens": 18485620.0,
"step": 280
},
{
"epoch": 5.481927710843373,
"eval_loss": 0.5997635126113892,
"eval_mean_token_accuracy": 0.893693176942451,
"eval_num_tokens": 18485620.0,
"eval_runtime": 32.9111,
"eval_samples_per_second": 25.979,
"eval_steps_per_second": 3.251,
"step": 280
},
{
"epoch": 5.5204819277108435,
"grad_norm": 0.4599599600334865,
"learning_rate": 2.206163709061976e-06,
"loss": 0.0411,
"mean_token_accuracy": 0.988504521548748,
"num_tokens": 18616692.0,
"step": 282
},
{
"epoch": 5.559036144578314,
"grad_norm": 0.45053903346967306,
"learning_rate": 2.1426396797908764e-06,
"loss": 0.0373,
"mean_token_accuracy": 0.9894662946462631,
"num_tokens": 18747764.0,
"step": 284
},
{
"epoch": 5.597590361445783,
"grad_norm": 0.5371349909088498,
"learning_rate": 2.0797932855316183e-06,
"loss": 0.0405,
"mean_token_accuracy": 0.988229539245367,
"num_tokens": 18877678.0,
"step": 286
},
{
"epoch": 5.636144578313253,
"grad_norm": 0.4593266181875638,
"learning_rate": 2.017639430570794e-06,
"loss": 0.0412,
"mean_token_accuracy": 0.9881075993180275,
"num_tokens": 19008750.0,
"step": 288
},
{
"epoch": 5.674698795180722,
"grad_norm": 0.48430078531022114,
"learning_rate": 1.956192854956397e-06,
"loss": 0.0415,
"mean_token_accuracy": 0.9877486675977707,
"num_tokens": 19139281.0,
"step": 290
},
{
"epoch": 5.713253012048193,
"grad_norm": 0.4989863180632785,
"learning_rate": 1.8954681310021434e-06,
"loss": 0.0424,
"mean_token_accuracy": 0.9875351153314114,
"num_tokens": 19270353.0,
"step": 292
},
{
"epoch": 5.751807228915663,
"grad_norm": 0.4623064292722575,
"learning_rate": 1.8354796598315977e-06,
"loss": 0.0407,
"mean_token_accuracy": 0.9884434565901756,
"num_tokens": 19401425.0,
"step": 294
},
{
"epoch": 5.790361445783132,
"grad_norm": 0.5014798249213933,
"learning_rate": 1.7762416679628792e-06,
"loss": 0.0394,
"mean_token_accuracy": 0.9886740408837795,
"num_tokens": 19532424.0,
"step": 296
},
{
"epoch": 5.828915662650602,
"grad_norm": 0.5392913823526073,
"learning_rate": 1.7177682039347875e-06,
"loss": 0.0408,
"mean_token_accuracy": 0.9878557063639164,
"num_tokens": 19663496.0,
"step": 298
},
{
"epoch": 5.867469879518072,
"grad_norm": 0.5613539090124229,
"learning_rate": 1.6600731349751303e-06,
"loss": 0.0396,
"mean_token_accuracy": 0.988573219627142,
"num_tokens": 19794568.0,
"step": 300
},
{
"epoch": 5.867469879518072,
"eval_loss": 0.6202346682548523,
"eval_mean_token_accuracy": 0.8934637369396531,
"eval_num_tokens": 19794568.0,
"eval_runtime": 32.9094,
"eval_samples_per_second": 25.98,
"eval_steps_per_second": 3.251,
"step": 300
},
{
"epoch": 5.906024096385542,
"grad_norm": 0.6075220334585247,
"learning_rate": 1.6031701437120512e-06,
"loss": 0.0436,
"mean_token_accuracy": 0.9871763586997986,
"num_tokens": 19925640.0,
"step": 302
},
{
"epoch": 5.944578313253012,
"grad_norm": 0.478268700720953,
"learning_rate": 1.5470727249291423e-06,
"loss": 0.0392,
"mean_token_accuracy": 0.9886266514658928,
"num_tokens": 20056712.0,
"step": 304
},
{
"epoch": 5.983132530120482,
"grad_norm": 0.5299056008151652,
"learning_rate": 1.4917941823650917e-06,
"loss": 0.0393,
"mean_token_accuracy": 0.9882984273135662,
"num_tokens": 20187784.0,
"step": 306
},
{
"epoch": 6.03855421686747,
"grad_norm": 0.4515619693414329,
"learning_rate": 1.4373476255586515e-06,
"loss": 0.0539,
"mean_token_accuracy": 0.9903883755207061,
"num_tokens": 20351624.0,
"step": 308
},
{
"epoch": 6.0771084337349395,
"grad_norm": 0.4461331299908182,
"learning_rate": 1.383745966739652e-06,
"loss": 0.0377,
"mean_token_accuracy": 0.9897639863193035,
"num_tokens": 20482696.0,
"step": 310
},
{
"epoch": 6.11566265060241,
"grad_norm": 0.3811282983266442,
"learning_rate": 1.3310019177668154e-06,
"loss": 0.0335,
"mean_token_accuracy": 0.9908020906150341,
"num_tokens": 20613768.0,
"step": 312
},
{
"epoch": 6.15421686746988,
"grad_norm": 0.4003286418611843,
"learning_rate": 1.2791279871130824e-06,
"loss": 0.0315,
"mean_token_accuracy": 0.9916570000350475,
"num_tokens": 20744840.0,
"step": 314
},
{
"epoch": 6.192771084337349,
"grad_norm": 0.4136786703234865,
"learning_rate": 1.2281364768991804e-06,
"loss": 0.0357,
"mean_token_accuracy": 0.990313570946455,
"num_tokens": 20875912.0,
"step": 316
},
{
"epoch": 6.231325301204819,
"grad_norm": 0.45632226149343463,
"learning_rate": 1.1780394799761163e-06,
"loss": 0.0324,
"mean_token_accuracy": 0.9911643601953983,
"num_tokens": 21006167.0,
"step": 318
},
{
"epoch": 6.2698795180722895,
"grad_norm": 0.3996603424603635,
"learning_rate": 1.1288488770573097e-06,
"loss": 0.0305,
"mean_token_accuracy": 0.9917750433087349,
"num_tokens": 21135643.0,
"step": 320
},
{
"epoch": 6.2698795180722895,
"eval_loss": 0.6617009043693542,
"eval_mean_token_accuracy": 0.8925447820503021,
"eval_num_tokens": 21135643.0,
"eval_runtime": 32.9359,
"eval_samples_per_second": 25.96,
"eval_steps_per_second": 3.249,
"step": 320
},
{
"epoch": 6.308433734939759,
"grad_norm": 0.4550106121984443,
"learning_rate": 1.0805763339010329e-06,
"loss": 0.0337,
"mean_token_accuracy": 0.990831833332777,
"num_tokens": 21266174.0,
"step": 322
},
{
"epoch": 6.346987951807229,
"grad_norm": 0.4540741073756364,
"learning_rate": 1.0332332985438248e-06,
"loss": 0.0348,
"mean_token_accuracy": 0.990550197660923,
"num_tokens": 21397246.0,
"step": 324
},
{
"epoch": 6.385542168674699,
"grad_norm": 0.47292748183552674,
"learning_rate": 9.868309985855446e-07,
"loss": 0.0326,
"mean_token_accuracy": 0.9910158179700375,
"num_tokens": 21528318.0,
"step": 326
},
{
"epoch": 6.424096385542168,
"grad_norm": 0.46337369369151543,
"learning_rate": 9.41380438526694e-07,
"loss": 0.0328,
"mean_token_accuracy": 0.990702860057354,
"num_tokens": 21659390.0,
"step": 328
},
{
"epoch": 6.462650602409639,
"grad_norm": 0.5096633620765385,
"learning_rate": 8.968923971586596e-07,
"loss": 0.0321,
"mean_token_accuracy": 0.9912142790853977,
"num_tokens": 21790462.0,
"step": 330
},
{
"epoch": 6.501204819277109,
"grad_norm": 0.4849844285466508,
"learning_rate": 8.533774250074727e-07,
"loss": 0.0307,
"mean_token_accuracy": 0.991565402597189,
"num_tokens": 21921534.0,
"step": 332
},
{
"epoch": 6.539759036144578,
"grad_norm": 0.47614922479928773,
"learning_rate": 8.108458418317089e-07,
"loss": 0.0329,
"mean_token_accuracy": 0.9907412827014923,
"num_tokens": 22051497.0,
"step": 334
},
{
"epoch": 6.578313253012048,
"grad_norm": 0.49852850537940474,
"learning_rate": 7.693077341751138e-07,
"loss": 0.0323,
"mean_token_accuracy": 0.9908555224537849,
"num_tokens": 22182569.0,
"step": 336
},
{
"epoch": 6.6168674698795185,
"grad_norm": 0.4273359220675533,
"learning_rate": 7.287729529745386e-07,
"loss": 0.0328,
"mean_token_accuracy": 0.9911041483283043,
"num_tokens": 22312765.0,
"step": 338
},
{
"epoch": 6.655421686746988,
"grad_norm": 0.4557886589116148,
"learning_rate": 6.892511112237472e-07,
"loss": 0.0344,
"mean_token_accuracy": 0.9900464117527008,
"num_tokens": 22443837.0,
"step": 340
},
{
"epoch": 6.655421686746988,
"eval_loss": 0.6737558841705322,
"eval_mean_token_accuracy": 0.8922743284813711,
"eval_num_tokens": 22443837.0,
"eval_runtime": 32.8919,
"eval_samples_per_second": 25.994,
"eval_steps_per_second": 3.253,
"step": 340
},
{
"epoch": 6.693975903614458,
"grad_norm": 0.46620750191290955,
"learning_rate": 6.507515816936538e-07,
"loss": 0.0333,
"mean_token_accuracy": 0.9908784218132496,
"num_tokens": 22574909.0,
"step": 342
},
{
"epoch": 6.732530120481927,
"grad_norm": 0.4697001737130722,
"learning_rate": 6.132834947095334e-07,
"loss": 0.0328,
"mean_token_accuracy": 0.9907486587762833,
"num_tokens": 22705981.0,
"step": 344
},
{
"epoch": 6.771084337349397,
"grad_norm": 0.4430515622762748,
"learning_rate": 5.768557359857241e-07,
"loss": 0.0321,
"mean_token_accuracy": 0.9912448115646839,
"num_tokens": 22837053.0,
"step": 346
},
{
"epoch": 6.809638554216868,
"grad_norm": 0.4361690495303729,
"learning_rate": 5.414769445183432e-07,
"loss": 0.0328,
"mean_token_accuracy": 0.9909089542925358,
"num_tokens": 22968125.0,
"step": 348
},
{
"epoch": 6.848192771084337,
"grad_norm": 0.4200405293131056,
"learning_rate": 5.071555105365156e-07,
"loss": 0.0346,
"mean_token_accuracy": 0.9901227429509163,
"num_tokens": 23099197.0,
"step": 350
},
{
"epoch": 6.886746987951807,
"grad_norm": 0.47982475678883507,
"learning_rate": 4.738995735125895e-07,
"loss": 0.0358,
"mean_token_accuracy": 0.9901380091905594,
"num_tokens": 23230269.0,
"step": 352
},
{
"epoch": 6.925301204819277,
"grad_norm": 0.4588352996826278,
"learning_rate": 4.4171702023183663e-07,
"loss": 0.0345,
"mean_token_accuracy": 0.9904306195676327,
"num_tokens": 23361268.0,
"step": 354
},
{
"epoch": 6.9638554216867465,
"grad_norm": 0.49076163739088335,
"learning_rate": 4.10615482922056e-07,
"loss": 0.034,
"mean_token_accuracy": 0.9904967658221722,
"num_tokens": 23492340.0,
"step": 356
},
{
"epoch": 7.019277108433735,
"grad_norm": 0.7037303292281402,
"learning_rate": 3.8060233744356634e-07,
"loss": 0.0476,
"mean_token_accuracy": 0.9915913552045822,
"num_tokens": 23656180.0,
"step": 358
},
{
"epoch": 7.057831325301205,
"grad_norm": 0.46496724208097717,
"learning_rate": 3.5168470153998937e-07,
"loss": 0.0307,
"mean_token_accuracy": 0.9916331619024277,
"num_tokens": 23786143.0,
"step": 360
},
{
"epoch": 7.057831325301205,
"eval_loss": 0.6740805506706238,
"eval_mean_token_accuracy": 0.8924460611610769,
"eval_num_tokens": 23786143.0,
"eval_runtime": 32.9163,
"eval_samples_per_second": 25.975,
"eval_steps_per_second": 3.251,
"step": 360
},
{
"epoch": 7.096385542168675,
"grad_norm": 0.43993509563002636,
"learning_rate": 3.238694331502451e-07,
"loss": 0.0299,
"mean_token_accuracy": 0.9917791299521923,
"num_tokens": 23917215.0,
"step": 362
},
{
"epoch": 7.134939759036144,
"grad_norm": 0.42297111455766123,
"learning_rate": 2.9716312878216194e-07,
"loss": 0.0295,
"mean_token_accuracy": 0.992290548980236,
"num_tokens": 24048287.0,
"step": 364
},
{
"epoch": 7.1734939759036145,
"grad_norm": 0.3740711012992464,
"learning_rate": 2.71572121948091e-07,
"loss": 0.0287,
"mean_token_accuracy": 0.9923897795379162,
"num_tokens": 24179359.0,
"step": 366
},
{
"epoch": 7.212048192771085,
"grad_norm": 0.40083480465861504,
"learning_rate": 2.471024816628836e-07,
"loss": 0.0309,
"mean_token_accuracy": 0.9913974739611149,
"num_tokens": 24310431.0,
"step": 368
},
{
"epoch": 7.250602409638554,
"grad_norm": 0.433443148283099,
"learning_rate": 2.237600110046001e-07,
"loss": 0.0325,
"mean_token_accuracy": 0.9914738051593304,
"num_tokens": 24441503.0,
"step": 370
},
{
"epoch": 7.289156626506024,
"grad_norm": 0.3913556728274319,
"learning_rate": 2.0155024573828452e-07,
"loss": 0.0309,
"mean_token_accuracy": 0.9921867847442627,
"num_tokens": 24572502.0,
"step": 372
},
{
"epoch": 7.327710843373494,
"grad_norm": 0.3931756889237697,
"learning_rate": 1.8047845300313726e-07,
"loss": 0.032,
"mean_token_accuracy": 0.991260077804327,
"num_tokens": 24703574.0,
"step": 374
},
{
"epoch": 7.366265060240964,
"grad_norm": 0.39372361510112325,
"learning_rate": 1.6054963006338742e-07,
"loss": 0.0307,
"mean_token_accuracy": 0.9918172955513,
"num_tokens": 24834646.0,
"step": 376
},
{
"epoch": 7.404819277108434,
"grad_norm": 0.4539746897925293,
"learning_rate": 1.4176850312317246e-07,
"loss": 0.0286,
"mean_token_accuracy": 0.9920920878648758,
"num_tokens": 24965718.0,
"step": 378
},
{
"epoch": 7.443373493975904,
"grad_norm": 0.3804093569086182,
"learning_rate": 1.241395262056999e-07,
"loss": 0.0282,
"mean_token_accuracy": 0.9927180036902428,
"num_tokens": 25096790.0,
"step": 380
},
{
"epoch": 7.443373493975904,
"eval_loss": 0.6863826513290405,
"eval_mean_token_accuracy": 0.8920925671809188,
"eval_num_tokens": 25096790.0,
"eval_runtime": 32.9256,
"eval_samples_per_second": 25.968,
"eval_steps_per_second": 3.25,
"step": 380
},
{
"epoch": 7.481927710843373,
"grad_norm": 0.4086380049589114,
"learning_rate": 1.0766688009695548e-07,
"loss": 0.0296,
"mean_token_accuracy": 0.9921302534639835,
"num_tokens": 25227862.0,
"step": 382
},
{
"epoch": 7.5204819277108435,
"grad_norm": 0.42260905745056304,
"learning_rate": 9.235447135421127e-08,
"loss": 0.03,
"mean_token_accuracy": 0.9916875325143337,
"num_tokens": 25358934.0,
"step": 384
},
{
"epoch": 7.559036144578314,
"grad_norm": 0.4176362136377749,
"learning_rate": 7.820593137957244e-08,
"loss": 0.031,
"mean_token_accuracy": 0.9918630942702293,
"num_tokens": 25490006.0,
"step": 386
},
{
"epoch": 7.597590361445783,
"grad_norm": 0.45701242241975604,
"learning_rate": 6.522461555877213e-08,
"loss": 0.0349,
"mean_token_accuracy": 0.9904204346239567,
"num_tokens": 25621078.0,
"step": 388
},
{
"epoch": 7.636144578313253,
"grad_norm": 0.40879094458461246,
"learning_rate": 5.341360246542804e-08,
"loss": 0.0306,
"mean_token_accuracy": 0.9920004904270172,
"num_tokens": 25752150.0,
"step": 390
},
{
"epoch": 7.674698795180722,
"grad_norm": 0.43647524957553,
"learning_rate": 4.2775693130948094e-08,
"loss": 0.0317,
"mean_token_accuracy": 0.9910463504493237,
"num_tokens": 25883222.0,
"step": 392
},
{
"epoch": 7.713253012048193,
"grad_norm": 0.41444591834134303,
"learning_rate": 3.3313410380250157e-08,
"loss": 0.0312,
"mean_token_accuracy": 0.9916607923805714,
"num_tokens": 26012936.0,
"step": 394
},
{
"epoch": 7.751807228915663,
"grad_norm": 0.3803028538212038,
"learning_rate": 2.5028998233467272e-08,
"loss": 0.0306,
"mean_token_accuracy": 0.9918249286711216,
"num_tokens": 26144008.0,
"step": 396
},
{
"epoch": 7.790361445783132,
"grad_norm": 0.3600455476234704,
"learning_rate": 1.7924421373766153e-08,
"loss": 0.0292,
"mean_token_accuracy": 0.9922676496207714,
"num_tokens": 26275080.0,
"step": 398
},
{
"epoch": 7.828915662650602,
"grad_norm": 0.39321030231512055,
"learning_rate": 1.200136468141544e-08,
"loss": 0.0324,
"mean_token_accuracy": 0.9913364090025425,
"num_tokens": 26406152.0,
"step": 400
},
{
"epoch": 7.828915662650602,
"eval_loss": 0.6899031400680542,
"eval_mean_token_accuracy": 0.892040293350398,
"eval_num_tokens": 26406152.0,
"eval_runtime": 32.9667,
"eval_samples_per_second": 25.935,
"eval_steps_per_second": 3.246,
"step": 400
},
{
"epoch": 7.867469879518072,
"grad_norm": 0.41654832211115,
"learning_rate": 7.261232834209208e-09,
"loss": 0.0292,
"mean_token_accuracy": 0.9922164119780064,
"num_tokens": 26536786.0,
"step": 402
},
{
"epoch": 7.906024096385542,
"grad_norm": 0.40099638349259836,
"learning_rate": 3.705149974342348e-09,
"loss": 0.0307,
"mean_token_accuracy": 0.9913211427628994,
"num_tokens": 26667858.0,
"step": 404
},
{
"epoch": 7.944578313253012,
"grad_norm": 0.42947674316382145,
"learning_rate": 1.3339594418138036e-09,
"loss": 0.0307,
"mean_token_accuracy": 0.9916405789554119,
"num_tokens": 26796896.0,
"step": 406
},
{
"epoch": 7.983132530120482,
"grad_norm": 0.41338624680752933,
"learning_rate": 1.4822357442656475e-10,
"loss": 0.0293,
"mean_token_accuracy": 0.992038656026125,
"num_tokens": 26927968.0,
"step": 408
},
{
"epoch": 7.983132530120482,
"step": 408,
"total_flos": 39091383042048.0,
"train_loss": 0.08385189656423879,
"train_runtime": 3861.3463,
"train_samples_per_second": 3.439,
"train_steps_per_second": 0.106
}
],
"logging_steps": 2,
"max_steps": 408,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 39091383042048.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}