review_evaluation_all / trainer_state.json
boda's picture
Model save
50923de verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 500,
"global_step": 4170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007194244604316547,
"grad_norm": 0.3506069883321638,
"learning_rate": 4.796163069544364e-07,
"loss": 1.8211,
"mean_token_accuracy": 0.6063699722290039,
"step": 1
},
{
"epoch": 0.03597122302158273,
"grad_norm": 0.3397364335493786,
"learning_rate": 2.3980815347721824e-06,
"loss": 1.8489,
"mean_token_accuracy": 0.6016613021492958,
"step": 5
},
{
"epoch": 0.07194244604316546,
"grad_norm": 0.36588885578293806,
"learning_rate": 4.796163069544365e-06,
"loss": 1.8553,
"mean_token_accuracy": 0.602922260761261,
"step": 10
},
{
"epoch": 0.1079136690647482,
"grad_norm": 0.38953277553950383,
"learning_rate": 7.1942446043165465e-06,
"loss": 1.854,
"mean_token_accuracy": 0.6022201240062713,
"step": 15
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.38828154068570925,
"learning_rate": 9.59232613908873e-06,
"loss": 1.8273,
"mean_token_accuracy": 0.6043285429477692,
"step": 20
},
{
"epoch": 0.17985611510791366,
"grad_norm": 0.4678851058069788,
"learning_rate": 1.1990407673860912e-05,
"loss": 1.797,
"mean_token_accuracy": 0.6082902371883392,
"step": 25
},
{
"epoch": 0.2158273381294964,
"grad_norm": 0.49705633435698987,
"learning_rate": 1.4388489208633093e-05,
"loss": 1.7648,
"mean_token_accuracy": 0.6104614853858947,
"step": 30
},
{
"epoch": 0.2517985611510791,
"grad_norm": 0.5253836453595289,
"learning_rate": 1.6786570743405277e-05,
"loss": 1.7535,
"mean_token_accuracy": 0.6107279539108277,
"step": 35
},
{
"epoch": 0.28776978417266186,
"grad_norm": 0.4197047432820652,
"learning_rate": 1.918465227817746e-05,
"loss": 1.6591,
"mean_token_accuracy": 0.6199684083461762,
"step": 40
},
{
"epoch": 0.3237410071942446,
"grad_norm": 0.2687351382925973,
"learning_rate": 2.1582733812949642e-05,
"loss": 1.6015,
"mean_token_accuracy": 0.6256727695465087,
"step": 45
},
{
"epoch": 0.3597122302158273,
"grad_norm": 0.2514281363945216,
"learning_rate": 2.3980815347721824e-05,
"loss": 1.5121,
"mean_token_accuracy": 0.6378357112407684,
"step": 50
},
{
"epoch": 0.39568345323741005,
"grad_norm": 0.27620691115174834,
"learning_rate": 2.637889688249401e-05,
"loss": 1.4599,
"mean_token_accuracy": 0.6464233458042145,
"step": 55
},
{
"epoch": 0.4316546762589928,
"grad_norm": 0.2747144748462002,
"learning_rate": 2.8776978417266186e-05,
"loss": 1.3595,
"mean_token_accuracy": 0.6629432022571564,
"step": 60
},
{
"epoch": 0.4676258992805755,
"grad_norm": 0.2803337874474452,
"learning_rate": 3.117505995203837e-05,
"loss": 1.2729,
"mean_token_accuracy": 0.6793328762054444,
"step": 65
},
{
"epoch": 0.5035971223021583,
"grad_norm": 0.3141630297057898,
"learning_rate": 3.3573141486810554e-05,
"loss": 1.1426,
"mean_token_accuracy": 0.7037691950798035,
"step": 70
},
{
"epoch": 0.539568345323741,
"grad_norm": 0.3554897054791459,
"learning_rate": 3.597122302158273e-05,
"loss": 0.9772,
"mean_token_accuracy": 0.7396033108234406,
"step": 75
},
{
"epoch": 0.5755395683453237,
"grad_norm": 0.3922829203034533,
"learning_rate": 3.836930455635492e-05,
"loss": 0.7946,
"mean_token_accuracy": 0.791416597366333,
"step": 80
},
{
"epoch": 0.6115107913669064,
"grad_norm": 0.4415520616858967,
"learning_rate": 4.0767386091127105e-05,
"loss": 0.5796,
"mean_token_accuracy": 0.852098262310028,
"step": 85
},
{
"epoch": 0.6474820143884892,
"grad_norm": 0.3221304026208011,
"learning_rate": 4.3165467625899284e-05,
"loss": 0.3595,
"mean_token_accuracy": 0.916002345085144,
"step": 90
},
{
"epoch": 0.6834532374100719,
"grad_norm": 0.2579065417189077,
"learning_rate": 4.556354916067146e-05,
"loss": 0.2257,
"mean_token_accuracy": 0.9520921051502228,
"step": 95
},
{
"epoch": 0.7194244604316546,
"grad_norm": 0.15356241858989592,
"learning_rate": 4.796163069544365e-05,
"loss": 0.1586,
"mean_token_accuracy": 0.9685133516788482,
"step": 100
},
{
"epoch": 0.7553956834532374,
"grad_norm": 0.12878276526429025,
"learning_rate": 5.035971223021583e-05,
"loss": 0.1404,
"mean_token_accuracy": 0.9713728368282318,
"step": 105
},
{
"epoch": 0.7913669064748201,
"grad_norm": 0.10471757647129615,
"learning_rate": 5.275779376498802e-05,
"loss": 0.1271,
"mean_token_accuracy": 0.9753898620605469,
"step": 110
},
{
"epoch": 0.8273381294964028,
"grad_norm": 0.09680394845041788,
"learning_rate": 5.515587529976019e-05,
"loss": 0.1277,
"mean_token_accuracy": 0.9750036299228668,
"step": 115
},
{
"epoch": 0.8633093525179856,
"grad_norm": 0.12123784922225729,
"learning_rate": 5.755395683453237e-05,
"loss": 0.1224,
"mean_token_accuracy": 0.9754109263420105,
"step": 120
},
{
"epoch": 0.8992805755395683,
"grad_norm": 0.11686026875002653,
"learning_rate": 5.9952038369304564e-05,
"loss": 0.1156,
"mean_token_accuracy": 0.9775736808776856,
"step": 125
},
{
"epoch": 0.935251798561151,
"grad_norm": 0.08598616604099492,
"learning_rate": 6.235011990407674e-05,
"loss": 0.1399,
"mean_token_accuracy": 0.9725452423095703,
"step": 130
},
{
"epoch": 0.9712230215827338,
"grad_norm": 0.1673532970509405,
"learning_rate": 6.474820143884892e-05,
"loss": 0.0929,
"mean_token_accuracy": 0.9821974813938141,
"step": 135
},
{
"epoch": 1.0,
"eval_loss": 0.12023145705461502,
"eval_mean_token_accuracy": 0.9781519497434298,
"eval_runtime": 20.7288,
"eval_samples_per_second": 5.886,
"eval_steps_per_second": 0.772,
"step": 139
},
{
"epoch": 1.0071942446043165,
"grad_norm": 0.08888350379847303,
"learning_rate": 6.714628297362111e-05,
"loss": 0.111,
"mean_token_accuracy": 0.9802520871162415,
"step": 140
},
{
"epoch": 1.0431654676258992,
"grad_norm": 0.0879355109627538,
"learning_rate": 6.954436450839329e-05,
"loss": 0.1106,
"mean_token_accuracy": 0.9783557474613189,
"step": 145
},
{
"epoch": 1.079136690647482,
"grad_norm": 0.07545083881475075,
"learning_rate": 7.194244604316547e-05,
"loss": 0.0989,
"mean_token_accuracy": 0.9803751826286315,
"step": 150
},
{
"epoch": 1.1151079136690647,
"grad_norm": 0.06702405978093251,
"learning_rate": 7.434052757793766e-05,
"loss": 0.0984,
"mean_token_accuracy": 0.980546236038208,
"step": 155
},
{
"epoch": 1.1510791366906474,
"grad_norm": 0.08746346415813978,
"learning_rate": 7.673860911270984e-05,
"loss": 0.0971,
"mean_token_accuracy": 0.980619478225708,
"step": 160
},
{
"epoch": 1.1870503597122302,
"grad_norm": 0.07148480917132531,
"learning_rate": 7.913669064748202e-05,
"loss": 0.0995,
"mean_token_accuracy": 0.9798974812030792,
"step": 165
},
{
"epoch": 1.223021582733813,
"grad_norm": 0.07231936051146864,
"learning_rate": 8.153477218225421e-05,
"loss": 0.1026,
"mean_token_accuracy": 0.979968684911728,
"step": 170
},
{
"epoch": 1.2589928057553956,
"grad_norm": 0.06885790662310835,
"learning_rate": 8.393285371702639e-05,
"loss": 0.0943,
"mean_token_accuracy": 0.9808494627475739,
"step": 175
},
{
"epoch": 1.2949640287769784,
"grad_norm": 0.08334798597727301,
"learning_rate": 8.633093525179857e-05,
"loss": 0.0925,
"mean_token_accuracy": 0.9816609919071198,
"step": 180
},
{
"epoch": 1.330935251798561,
"grad_norm": 0.09251301084879311,
"learning_rate": 8.872901678657075e-05,
"loss": 0.1132,
"mean_token_accuracy": 0.9775943398475647,
"step": 185
},
{
"epoch": 1.3669064748201438,
"grad_norm": 0.07084603124056196,
"learning_rate": 9.112709832134293e-05,
"loss": 0.0955,
"mean_token_accuracy": 0.9806205093860626,
"step": 190
},
{
"epoch": 1.4028776978417266,
"grad_norm": 0.0771787796949035,
"learning_rate": 9.35251798561151e-05,
"loss": 0.1044,
"mean_token_accuracy": 0.9783063352108001,
"step": 195
},
{
"epoch": 1.4388489208633093,
"grad_norm": 0.07306767327642648,
"learning_rate": 9.59232613908873e-05,
"loss": 0.0852,
"mean_token_accuracy": 0.9823802232742309,
"step": 200
},
{
"epoch": 1.474820143884892,
"grad_norm": 0.08702124943881479,
"learning_rate": 9.832134292565948e-05,
"loss": 0.0793,
"mean_token_accuracy": 0.9833337783813476,
"step": 205
},
{
"epoch": 1.5107913669064748,
"grad_norm": 0.09562766038385109,
"learning_rate": 0.00010071942446043166,
"loss": 0.0845,
"mean_token_accuracy": 0.982536792755127,
"step": 210
},
{
"epoch": 1.5467625899280577,
"grad_norm": 0.07345574083799765,
"learning_rate": 0.00010311750599520385,
"loss": 0.0698,
"mean_token_accuracy": 0.9853514194488525,
"step": 215
},
{
"epoch": 1.5827338129496402,
"grad_norm": 0.06101323873063209,
"learning_rate": 0.00010551558752997604,
"loss": 0.0818,
"mean_token_accuracy": 0.9826856195926666,
"step": 220
},
{
"epoch": 1.6187050359712232,
"grad_norm": 0.06705744022149719,
"learning_rate": 0.0001079136690647482,
"loss": 0.0901,
"mean_token_accuracy": 0.9815958976745606,
"step": 225
},
{
"epoch": 1.6546762589928057,
"grad_norm": 0.06132406862414683,
"learning_rate": 0.00011031175059952039,
"loss": 0.0855,
"mean_token_accuracy": 0.9825255811214447,
"step": 230
},
{
"epoch": 1.6906474820143886,
"grad_norm": 0.07399014413697551,
"learning_rate": 0.00011270983213429258,
"loss": 0.0788,
"mean_token_accuracy": 0.9834049463272094,
"step": 235
},
{
"epoch": 1.7266187050359711,
"grad_norm": 0.058894526105802536,
"learning_rate": 0.00011510791366906474,
"loss": 0.0704,
"mean_token_accuracy": 0.9853868961334229,
"step": 240
},
{
"epoch": 1.762589928057554,
"grad_norm": 0.08305627567650643,
"learning_rate": 0.00011750599520383694,
"loss": 0.0856,
"mean_token_accuracy": 0.9817408621311188,
"step": 245
},
{
"epoch": 1.7985611510791366,
"grad_norm": 0.05855661629998082,
"learning_rate": 0.00011990407673860913,
"loss": 0.0718,
"mean_token_accuracy": 0.9844718694686889,
"step": 250
},
{
"epoch": 1.8345323741007196,
"grad_norm": 0.0670672867431674,
"learning_rate": 0.0001223021582733813,
"loss": 0.0829,
"mean_token_accuracy": 0.9828297436237335,
"step": 255
},
{
"epoch": 1.870503597122302,
"grad_norm": 0.07172440002334786,
"learning_rate": 0.00012470023980815347,
"loss": 0.0712,
"mean_token_accuracy": 0.9848017036914826,
"step": 260
},
{
"epoch": 1.906474820143885,
"grad_norm": 0.08171945353658899,
"learning_rate": 0.00012709832134292568,
"loss": 0.0899,
"mean_token_accuracy": 0.9812785029411316,
"step": 265
},
{
"epoch": 1.9424460431654675,
"grad_norm": 0.09215495770516072,
"learning_rate": 0.00012949640287769783,
"loss": 0.0901,
"mean_token_accuracy": 0.9818152070045472,
"step": 270
},
{
"epoch": 1.9784172661870505,
"grad_norm": 0.05819449472830757,
"learning_rate": 0.00013189448441247004,
"loss": 0.0855,
"mean_token_accuracy": 0.9816466629505157,
"step": 275
},
{
"epoch": 2.0,
"eval_loss": 0.09057755023241043,
"eval_mean_token_accuracy": 0.9828948188911785,
"eval_runtime": 20.6375,
"eval_samples_per_second": 5.912,
"eval_steps_per_second": 0.775,
"step": 278
},
{
"epoch": 2.014388489208633,
"grad_norm": 0.0579264171607264,
"learning_rate": 0.00013429256594724222,
"loss": 0.0807,
"mean_token_accuracy": 0.9847154915332794,
"step": 280
},
{
"epoch": 2.050359712230216,
"grad_norm": 0.06381845611677527,
"learning_rate": 0.0001366906474820144,
"loss": 0.0721,
"mean_token_accuracy": 0.984616607427597,
"step": 285
},
{
"epoch": 2.0863309352517985,
"grad_norm": 0.07718475085953005,
"learning_rate": 0.00013908872901678657,
"loss": 0.0841,
"mean_token_accuracy": 0.9817797482013703,
"step": 290
},
{
"epoch": 2.1223021582733814,
"grad_norm": 0.05892985671753617,
"learning_rate": 0.00014148681055155878,
"loss": 0.0751,
"mean_token_accuracy": 0.9831727027893067,
"step": 295
},
{
"epoch": 2.158273381294964,
"grad_norm": 0.0804925115008608,
"learning_rate": 0.00014388489208633093,
"loss": 0.0749,
"mean_token_accuracy": 0.9842367172241211,
"step": 300
},
{
"epoch": 2.194244604316547,
"grad_norm": 0.05121626528606145,
"learning_rate": 0.0001462829736211031,
"loss": 0.0773,
"mean_token_accuracy": 0.9835640609264373,
"step": 305
},
{
"epoch": 2.2302158273381294,
"grad_norm": 0.08889974111718164,
"learning_rate": 0.00014868105515587532,
"loss": 0.0791,
"mean_token_accuracy": 0.9834680020809173,
"step": 310
},
{
"epoch": 2.2661870503597124,
"grad_norm": 0.053476424317901526,
"learning_rate": 0.00015107913669064747,
"loss": 0.077,
"mean_token_accuracy": 0.9838110446929932,
"step": 315
},
{
"epoch": 2.302158273381295,
"grad_norm": 0.05633921643284814,
"learning_rate": 0.00015347721822541968,
"loss": 0.0829,
"mean_token_accuracy": 0.982527244091034,
"step": 320
},
{
"epoch": 2.338129496402878,
"grad_norm": 0.056650154444109466,
"learning_rate": 0.00015587529976019186,
"loss": 0.0796,
"mean_token_accuracy": 0.9829414904117584,
"step": 325
},
{
"epoch": 2.3741007194244603,
"grad_norm": 0.06044924727673958,
"learning_rate": 0.00015827338129496403,
"loss": 0.0601,
"mean_token_accuracy": 0.9872002065181732,
"step": 330
},
{
"epoch": 2.4100719424460433,
"grad_norm": 0.05992425734936301,
"learning_rate": 0.0001606714628297362,
"loss": 0.0792,
"mean_token_accuracy": 0.9831002652645111,
"step": 335
},
{
"epoch": 2.446043165467626,
"grad_norm": 0.05470386798150016,
"learning_rate": 0.00016306954436450842,
"loss": 0.0623,
"mean_token_accuracy": 0.987056291103363,
"step": 340
},
{
"epoch": 2.4820143884892087,
"grad_norm": 0.059337571166361285,
"learning_rate": 0.00016546762589928057,
"loss": 0.08,
"mean_token_accuracy": 0.9831870436668396,
"step": 345
},
{
"epoch": 2.5179856115107913,
"grad_norm": 0.05942919896434834,
"learning_rate": 0.00016786570743405278,
"loss": 0.0853,
"mean_token_accuracy": 0.981755542755127,
"step": 350
},
{
"epoch": 2.553956834532374,
"grad_norm": 0.04624108736295381,
"learning_rate": 0.00017026378896882496,
"loss": 0.066,
"mean_token_accuracy": 0.9858887672424317,
"step": 355
},
{
"epoch": 2.5899280575539567,
"grad_norm": 0.06579321358044239,
"learning_rate": 0.00017266187050359714,
"loss": 0.0884,
"mean_token_accuracy": 0.9812662482261658,
"step": 360
},
{
"epoch": 2.6258992805755397,
"grad_norm": 0.06258890069214806,
"learning_rate": 0.00017505995203836931,
"loss": 0.0713,
"mean_token_accuracy": 0.984937310218811,
"step": 365
},
{
"epoch": 2.661870503597122,
"grad_norm": 0.06270259498254936,
"learning_rate": 0.0001774580335731415,
"loss": 0.073,
"mean_token_accuracy": 0.9842502534389496,
"step": 370
},
{
"epoch": 2.697841726618705,
"grad_norm": 0.05589997924614264,
"learning_rate": 0.00017985611510791367,
"loss": 0.0768,
"mean_token_accuracy": 0.983589482307434,
"step": 375
},
{
"epoch": 2.7338129496402876,
"grad_norm": 0.04009483221136256,
"learning_rate": 0.00018225419664268585,
"loss": 0.0751,
"mean_token_accuracy": 0.984445083141327,
"step": 380
},
{
"epoch": 2.7697841726618706,
"grad_norm": 0.05881218057232397,
"learning_rate": 0.00018465227817745806,
"loss": 0.0707,
"mean_token_accuracy": 0.9846773445606232,
"step": 385
},
{
"epoch": 2.805755395683453,
"grad_norm": 0.07312271736187839,
"learning_rate": 0.0001870503597122302,
"loss": 0.0903,
"mean_token_accuracy": 0.980737829208374,
"step": 390
},
{
"epoch": 2.841726618705036,
"grad_norm": 0.04533772120467666,
"learning_rate": 0.00018944844124700242,
"loss": 0.0548,
"mean_token_accuracy": 0.9884092271327972,
"step": 395
},
{
"epoch": 2.8776978417266186,
"grad_norm": 0.05840450449653284,
"learning_rate": 0.0001918465227817746,
"loss": 0.0676,
"mean_token_accuracy": 0.9858544588088989,
"step": 400
},
{
"epoch": 2.9136690647482015,
"grad_norm": 0.06171453893995398,
"learning_rate": 0.00019424460431654677,
"loss": 0.0817,
"mean_token_accuracy": 0.9826960742473603,
"step": 405
},
{
"epoch": 2.949640287769784,
"grad_norm": 0.0631522796745376,
"learning_rate": 0.00019664268585131895,
"loss": 0.0752,
"mean_token_accuracy": 0.9839196085929871,
"step": 410
},
{
"epoch": 2.985611510791367,
"grad_norm": 0.05036488138002462,
"learning_rate": 0.00019904076738609113,
"loss": 0.0823,
"mean_token_accuracy": 0.9825737118721009,
"step": 415
},
{
"epoch": 3.0,
"eval_loss": 0.08580321818590164,
"eval_mean_token_accuracy": 0.9844951361417771,
"eval_runtime": 20.7493,
"eval_samples_per_second": 5.88,
"eval_steps_per_second": 0.771,
"step": 417
},
{
"epoch": 3.0215827338129495,
"grad_norm": 0.0457372684064395,
"learning_rate": 0.0001999996846775429,
"loss": 0.0646,
"mean_token_accuracy": 0.9852441847324371,
"step": 420
},
{
"epoch": 3.0575539568345325,
"grad_norm": 0.04793056670224028,
"learning_rate": 0.000199997757714173,
"loss": 0.0729,
"mean_token_accuracy": 0.9836010575294495,
"step": 425
},
{
"epoch": 3.093525179856115,
"grad_norm": 0.06721942436030308,
"learning_rate": 0.00019999407900029147,
"loss": 0.0738,
"mean_token_accuracy": 0.9839203715324402,
"step": 430
},
{
"epoch": 3.129496402877698,
"grad_norm": 0.056660744728913394,
"learning_rate": 0.00019998864860034169,
"loss": 0.0757,
"mean_token_accuracy": 0.9841017842292785,
"step": 435
},
{
"epoch": 3.1654676258992804,
"grad_norm": 0.05761414694560119,
"learning_rate": 0.00019998146660945277,
"loss": 0.082,
"mean_token_accuracy": 0.982598501443863,
"step": 440
},
{
"epoch": 3.2014388489208634,
"grad_norm": 0.046839229541453344,
"learning_rate": 0.0001999725331534382,
"loss": 0.0681,
"mean_token_accuracy": 0.9851432383060456,
"step": 445
},
{
"epoch": 3.237410071942446,
"grad_norm": 0.05445851360485557,
"learning_rate": 0.00019996184838879326,
"loss": 0.0641,
"mean_token_accuracy": 0.9865113973617554,
"step": 450
},
{
"epoch": 3.273381294964029,
"grad_norm": 0.048523472160407664,
"learning_rate": 0.0001999494125026926,
"loss": 0.0672,
"mean_token_accuracy": 0.9852766156196594,
"step": 455
},
{
"epoch": 3.3093525179856114,
"grad_norm": 0.051936987103197454,
"learning_rate": 0.00019993522571298678,
"loss": 0.0654,
"mean_token_accuracy": 0.985963374376297,
"step": 460
},
{
"epoch": 3.3453237410071943,
"grad_norm": 0.04457189008558806,
"learning_rate": 0.00019991928826819857,
"loss": 0.0742,
"mean_token_accuracy": 0.9842129707336426,
"step": 465
},
{
"epoch": 3.381294964028777,
"grad_norm": 0.056266351400963775,
"learning_rate": 0.0001999016004475185,
"loss": 0.0755,
"mean_token_accuracy": 0.983711302280426,
"step": 470
},
{
"epoch": 3.41726618705036,
"grad_norm": 0.5220247379709618,
"learning_rate": 0.00019988216256079997,
"loss": 0.0722,
"mean_token_accuracy": 0.9841032028198242,
"step": 475
},
{
"epoch": 3.4532374100719423,
"grad_norm": 0.0729813271238147,
"learning_rate": 0.0001998609749485539,
"loss": 0.0916,
"mean_token_accuracy": 0.9794904887676239,
"step": 480
},
{
"epoch": 3.4892086330935252,
"grad_norm": 0.06612977773669373,
"learning_rate": 0.0001998380379819428,
"loss": 0.0636,
"mean_token_accuracy": 0.9862911105155945,
"step": 485
},
{
"epoch": 3.5251798561151078,
"grad_norm": 0.06217153246894537,
"learning_rate": 0.00019981335206277397,
"loss": 0.0741,
"mean_token_accuracy": 0.9842127680778503,
"step": 490
},
{
"epoch": 3.5611510791366907,
"grad_norm": 0.07400702775391514,
"learning_rate": 0.00019978691762349295,
"loss": 0.0687,
"mean_token_accuracy": 0.9851798236370086,
"step": 495
},
{
"epoch": 3.597122302158273,
"grad_norm": 0.08585874467498368,
"learning_rate": 0.00019975873512717546,
"loss": 0.0609,
"mean_token_accuracy": 0.986882072687149,
"step": 500
},
{
"epoch": 3.633093525179856,
"grad_norm": 0.051816554926674696,
"learning_rate": 0.00019972880506751968,
"loss": 0.0701,
"mean_token_accuracy": 0.9853014886379242,
"step": 505
},
{
"epoch": 3.6690647482014387,
"grad_norm": 0.05057892453950836,
"learning_rate": 0.00019969712796883725,
"loss": 0.0741,
"mean_token_accuracy": 0.9835891008377076,
"step": 510
},
{
"epoch": 3.7050359712230216,
"grad_norm": 0.07153654683802517,
"learning_rate": 0.0001996637043860444,
"loss": 0.0688,
"mean_token_accuracy": 0.9850581645965576,
"step": 515
},
{
"epoch": 3.741007194244604,
"grad_norm": 0.04708930317430444,
"learning_rate": 0.00019962853490465202,
"loss": 0.0661,
"mean_token_accuracy": 0.985362309217453,
"step": 520
},
{
"epoch": 3.776978417266187,
"grad_norm": 0.055807985616846,
"learning_rate": 0.00019959162014075553,
"loss": 0.0821,
"mean_token_accuracy": 0.9829040467739105,
"step": 525
},
{
"epoch": 3.81294964028777,
"grad_norm": 0.04505227199614646,
"learning_rate": 0.00019955296074102393,
"loss": 0.0741,
"mean_token_accuracy": 0.9845075249671936,
"step": 530
},
{
"epoch": 3.8489208633093526,
"grad_norm": 0.05335430120004925,
"learning_rate": 0.00019951255738268872,
"loss": 0.0737,
"mean_token_accuracy": 0.9842015564441681,
"step": 535
},
{
"epoch": 3.884892086330935,
"grad_norm": 0.05015874969380626,
"learning_rate": 0.00019947041077353177,
"loss": 0.0511,
"mean_token_accuracy": 0.9884456872940064,
"step": 540
},
{
"epoch": 3.920863309352518,
"grad_norm": 0.039523803165780566,
"learning_rate": 0.00019942652165187306,
"loss": 0.0526,
"mean_token_accuracy": 0.9887028813362122,
"step": 545
},
{
"epoch": 3.956834532374101,
"grad_norm": 0.033565888789523046,
"learning_rate": 0.00019938089078655775,
"loss": 0.0634,
"mean_token_accuracy": 0.9865010201930999,
"step": 550
},
{
"epoch": 3.9928057553956835,
"grad_norm": 0.0406257264738635,
"learning_rate": 0.0001993335189769427,
"loss": 0.0794,
"mean_token_accuracy": 0.982637244462967,
"step": 555
},
{
"epoch": 4.0,
"eval_loss": 0.08812480419874191,
"eval_mean_token_accuracy": 0.9846961365805732,
"eval_runtime": 20.6402,
"eval_samples_per_second": 5.911,
"eval_steps_per_second": 0.775,
"step": 556
},
{
"epoch": 4.028776978417266,
"grad_norm": 0.0543120656292955,
"learning_rate": 0.0001992844070528824,
"loss": 0.0608,
"mean_token_accuracy": 0.9861808717250824,
"step": 560
},
{
"epoch": 4.0647482014388485,
"grad_norm": 0.06445221295308218,
"learning_rate": 0.00019923355587471458,
"loss": 0.0763,
"mean_token_accuracy": 0.983160275220871,
"step": 565
},
{
"epoch": 4.100719424460432,
"grad_norm": 0.05078293574914197,
"learning_rate": 0.00019918096633324492,
"loss": 0.069,
"mean_token_accuracy": 0.9846292018890381,
"step": 570
},
{
"epoch": 4.136690647482014,
"grad_norm": 0.048929071374438124,
"learning_rate": 0.00019912663934973168,
"loss": 0.0667,
"mean_token_accuracy": 0.9851913154125214,
"step": 575
},
{
"epoch": 4.172661870503597,
"grad_norm": 0.05408191334830909,
"learning_rate": 0.0001990705758758694,
"loss": 0.0693,
"mean_token_accuracy": 0.9847879648208618,
"step": 580
},
{
"epoch": 4.2086330935251794,
"grad_norm": 0.05934948421112335,
"learning_rate": 0.0001990127768937723,
"loss": 0.0714,
"mean_token_accuracy": 0.9839065909385681,
"step": 585
},
{
"epoch": 4.244604316546763,
"grad_norm": 0.06248100052161056,
"learning_rate": 0.00019895324341595707,
"loss": 0.0649,
"mean_token_accuracy": 0.9853267908096314,
"step": 590
},
{
"epoch": 4.280575539568345,
"grad_norm": 0.058374434880137584,
"learning_rate": 0.00019889197648532503,
"loss": 0.071,
"mean_token_accuracy": 0.9845187664031982,
"step": 595
},
{
"epoch": 4.316546762589928,
"grad_norm": 0.07289571230193848,
"learning_rate": 0.00019882897717514407,
"loss": 0.0625,
"mean_token_accuracy": 0.9861088514328002,
"step": 600
},
{
"epoch": 4.35251798561151,
"grad_norm": 0.05591731428953037,
"learning_rate": 0.00019876424658902967,
"loss": 0.0701,
"mean_token_accuracy": 0.9845547020435333,
"step": 605
},
{
"epoch": 4.388489208633094,
"grad_norm": 0.05638213741724957,
"learning_rate": 0.00019869778586092564,
"loss": 0.0707,
"mean_token_accuracy": 0.9847763419151306,
"step": 610
},
{
"epoch": 4.424460431654676,
"grad_norm": 0.057841809730352224,
"learning_rate": 0.00019862959615508417,
"loss": 0.0608,
"mean_token_accuracy": 0.9867449104785919,
"step": 615
},
{
"epoch": 4.460431654676259,
"grad_norm": 0.053932576578369425,
"learning_rate": 0.00019855967866604562,
"loss": 0.0587,
"mean_token_accuracy": 0.9870499551296235,
"step": 620
},
{
"epoch": 4.496402877697841,
"grad_norm": 0.05211700106675136,
"learning_rate": 0.0001984880346186174,
"loss": 0.0534,
"mean_token_accuracy": 0.9879081964492797,
"step": 625
},
{
"epoch": 4.532374100719425,
"grad_norm": 0.05540373657902223,
"learning_rate": 0.00019841466526785266,
"loss": 0.0663,
"mean_token_accuracy": 0.9853027820587158,
"step": 630
},
{
"epoch": 4.568345323741007,
"grad_norm": 0.048602335259883014,
"learning_rate": 0.00019833957189902815,
"loss": 0.0603,
"mean_token_accuracy": 0.9864147365093231,
"step": 635
},
{
"epoch": 4.60431654676259,
"grad_norm": 0.05673454468520649,
"learning_rate": 0.00019826275582762186,
"loss": 0.0615,
"mean_token_accuracy": 0.9861698567867279,
"step": 640
},
{
"epoch": 4.640287769784173,
"grad_norm": 0.05852615284556405,
"learning_rate": 0.0001981842183992899,
"loss": 0.0624,
"mean_token_accuracy": 0.986009931564331,
"step": 645
},
{
"epoch": 4.676258992805756,
"grad_norm": 0.08431448411850327,
"learning_rate": 0.00019810396098984292,
"loss": 0.0572,
"mean_token_accuracy": 0.9874668717384338,
"step": 650
},
{
"epoch": 4.712230215827338,
"grad_norm": 0.06730656620028044,
"learning_rate": 0.00019802198500522197,
"loss": 0.0616,
"mean_token_accuracy": 0.9861456751823425,
"step": 655
},
{
"epoch": 4.748201438848921,
"grad_norm": 0.044974290832838465,
"learning_rate": 0.00019793829188147406,
"loss": 0.0574,
"mean_token_accuracy": 0.987455677986145,
"step": 660
},
{
"epoch": 4.784172661870503,
"grad_norm": 0.06716196494496443,
"learning_rate": 0.00019785288308472672,
"loss": 0.0814,
"mean_token_accuracy": 0.9825004875659943,
"step": 665
},
{
"epoch": 4.820143884892087,
"grad_norm": 0.054996115096736096,
"learning_rate": 0.00019776576011116263,
"loss": 0.0737,
"mean_token_accuracy": 0.9838329493999481,
"step": 670
},
{
"epoch": 4.856115107913669,
"grad_norm": 0.033705316368331954,
"learning_rate": 0.00019767692448699302,
"loss": 0.0502,
"mean_token_accuracy": 0.9890934944152832,
"step": 675
},
{
"epoch": 4.892086330935252,
"grad_norm": 0.05047378970674569,
"learning_rate": 0.00019758637776843137,
"loss": 0.0691,
"mean_token_accuracy": 0.9849341213703156,
"step": 680
},
{
"epoch": 4.928057553956835,
"grad_norm": 0.04984841000823012,
"learning_rate": 0.00019749412154166583,
"loss": 0.0589,
"mean_token_accuracy": 0.9870136559009552,
"step": 685
},
{
"epoch": 4.9640287769784175,
"grad_norm": 0.03930276013196912,
"learning_rate": 0.00019740015742283155,
"loss": 0.0554,
"mean_token_accuracy": 0.9878572285175323,
"step": 690
},
{
"epoch": 5.0,
"grad_norm": 0.045628151478910806,
"learning_rate": 0.00019730448705798239,
"loss": 0.0501,
"mean_token_accuracy": 0.9887760579586029,
"step": 695
},
{
"epoch": 5.0,
"eval_loss": 0.09487643092870712,
"eval_mean_token_accuracy": 0.9840419329702854,
"eval_runtime": 20.6735,
"eval_samples_per_second": 5.901,
"eval_steps_per_second": 0.774,
"step": 695
},
{
"epoch": 5.0359712230215825,
"grad_norm": 0.05493054119678511,
"learning_rate": 0.00019720711212306205,
"loss": 0.0597,
"mean_token_accuracy": 0.9867689490318299,
"step": 700
},
{
"epoch": 5.071942446043165,
"grad_norm": 0.04837069496624849,
"learning_rate": 0.00019710803432387465,
"loss": 0.0561,
"mean_token_accuracy": 0.9872341334819794,
"step": 705
},
{
"epoch": 5.107913669064748,
"grad_norm": 0.05589419149281416,
"learning_rate": 0.000197007255396055,
"loss": 0.0582,
"mean_token_accuracy": 0.9867084145545959,
"step": 710
},
{
"epoch": 5.143884892086331,
"grad_norm": 0.059477184547365045,
"learning_rate": 0.00019690477710503809,
"loss": 0.0581,
"mean_token_accuracy": 0.9864130139350891,
"step": 715
},
{
"epoch": 5.179856115107913,
"grad_norm": 0.051282761432200584,
"learning_rate": 0.00019680060124602808,
"loss": 0.044,
"mean_token_accuracy": 0.9898509323596955,
"step": 720
},
{
"epoch": 5.215827338129497,
"grad_norm": 0.08016188967120222,
"learning_rate": 0.00019669472964396712,
"loss": 0.053,
"mean_token_accuracy": 0.9872821033000946,
"step": 725
},
{
"epoch": 5.251798561151079,
"grad_norm": 0.05229073710194996,
"learning_rate": 0.0001965871641535031,
"loss": 0.0528,
"mean_token_accuracy": 0.9878568768501281,
"step": 730
},
{
"epoch": 5.287769784172662,
"grad_norm": 0.07418543392117145,
"learning_rate": 0.0001964779066589573,
"loss": 0.0532,
"mean_token_accuracy": 0.9879068970680237,
"step": 735
},
{
"epoch": 5.323741007194244,
"grad_norm": 0.05647478312480804,
"learning_rate": 0.00019636695907429132,
"loss": 0.06,
"mean_token_accuracy": 0.9861337542533875,
"step": 740
},
{
"epoch": 5.359712230215827,
"grad_norm": 0.08571837256821345,
"learning_rate": 0.00019625432334307368,
"loss": 0.0652,
"mean_token_accuracy": 0.9846034228801728,
"step": 745
},
{
"epoch": 5.39568345323741,
"grad_norm": 0.0792782233228753,
"learning_rate": 0.00019614000143844558,
"loss": 0.0641,
"mean_token_accuracy": 0.9854226410388947,
"step": 750
},
{
"epoch": 5.431654676258993,
"grad_norm": 0.058478799045197496,
"learning_rate": 0.0001960239953630865,
"loss": 0.0571,
"mean_token_accuracy": 0.9870614647865296,
"step": 755
},
{
"epoch": 5.467625899280575,
"grad_norm": 0.056558458972068175,
"learning_rate": 0.00019590630714917898,
"loss": 0.0595,
"mean_token_accuracy": 0.986426830291748,
"step": 760
},
{
"epoch": 5.503597122302159,
"grad_norm": 0.0692782763770465,
"learning_rate": 0.0001957869388583732,
"loss": 0.049,
"mean_token_accuracy": 0.9884204208850861,
"step": 765
},
{
"epoch": 5.539568345323741,
"grad_norm": 0.049674110177074314,
"learning_rate": 0.00019566589258175068,
"loss": 0.0534,
"mean_token_accuracy": 0.9881749093532562,
"step": 770
},
{
"epoch": 5.575539568345324,
"grad_norm": 0.04655468775322885,
"learning_rate": 0.00019554317043978773,
"loss": 0.0467,
"mean_token_accuracy": 0.9892040431499481,
"step": 775
},
{
"epoch": 5.611510791366906,
"grad_norm": 0.06639514118497526,
"learning_rate": 0.00019541877458231825,
"loss": 0.0571,
"mean_token_accuracy": 0.9866962909698487,
"step": 780
},
{
"epoch": 5.647482014388489,
"grad_norm": 0.07907920132092487,
"learning_rate": 0.00019529270718849625,
"loss": 0.0635,
"mean_token_accuracy": 0.9850185811519623,
"step": 785
},
{
"epoch": 5.683453237410072,
"grad_norm": 0.06387100290060817,
"learning_rate": 0.00019516497046675744,
"loss": 0.0569,
"mean_token_accuracy": 0.9872703731060029,
"step": 790
},
{
"epoch": 5.719424460431655,
"grad_norm": 0.07096878405082174,
"learning_rate": 0.00019503556665478067,
"loss": 0.0609,
"mean_token_accuracy": 0.9861226320266724,
"step": 795
},
{
"epoch": 5.755395683453237,
"grad_norm": 0.07451473740931176,
"learning_rate": 0.00019490449801944868,
"loss": 0.0533,
"mean_token_accuracy": 0.9878711819648742,
"step": 800
},
{
"epoch": 5.7913669064748206,
"grad_norm": 0.06410885313727609,
"learning_rate": 0.0001947717668568085,
"loss": 0.0488,
"mean_token_accuracy": 0.9891544997692108,
"step": 805
},
{
"epoch": 5.827338129496403,
"grad_norm": 0.053854419589313515,
"learning_rate": 0.00019463737549203105,
"loss": 0.0488,
"mean_token_accuracy": 0.9887990176677703,
"step": 810
},
{
"epoch": 5.863309352517986,
"grad_norm": 0.04561191580156929,
"learning_rate": 0.00019450132627937055,
"loss": 0.0644,
"mean_token_accuracy": 0.9854602158069611,
"step": 815
},
{
"epoch": 5.899280575539568,
"grad_norm": 0.04767754908778601,
"learning_rate": 0.0001943636216021232,
"loss": 0.0549,
"mean_token_accuracy": 0.9869880855083466,
"step": 820
},
{
"epoch": 5.935251798561151,
"grad_norm": 0.0669886262398955,
"learning_rate": 0.00019422426387258551,
"loss": 0.0641,
"mean_token_accuracy": 0.9850812613964081,
"step": 825
},
{
"epoch": 5.971223021582734,
"grad_norm": 0.057178276885445106,
"learning_rate": 0.00019408325553201192,
"loss": 0.0616,
"mean_token_accuracy": 0.9861096978187561,
"step": 830
},
{
"epoch": 6.0,
"eval_loss": 0.09532783925533295,
"eval_mean_token_accuracy": 0.9849477683504423,
"eval_runtime": 20.7734,
"eval_samples_per_second": 5.873,
"eval_steps_per_second": 0.77,
"step": 834
},
{
"epoch": 6.0071942446043165,
"grad_norm": 0.06761995605993293,
"learning_rate": 0.0001939405990505722,
"loss": 0.0573,
"mean_token_accuracy": 0.9845331013202667,
"step": 835
},
{
"epoch": 6.043165467625899,
"grad_norm": 0.07263548817088912,
"learning_rate": 0.00019379629692730798,
"loss": 0.0503,
"mean_token_accuracy": 0.9876116633415222,
"step": 840
},
{
"epoch": 6.079136690647482,
"grad_norm": 0.08479650809428431,
"learning_rate": 0.00019365035169008915,
"loss": 0.0427,
"mean_token_accuracy": 0.9894964694976807,
"step": 845
},
{
"epoch": 6.115107913669065,
"grad_norm": 0.06919827278420493,
"learning_rate": 0.00019350276589556948,
"loss": 0.0472,
"mean_token_accuracy": 0.9883952558040618,
"step": 850
},
{
"epoch": 6.151079136690647,
"grad_norm": 0.08264329920052639,
"learning_rate": 0.00019335354212914187,
"loss": 0.0496,
"mean_token_accuracy": 0.9882358908653259,
"step": 855
},
{
"epoch": 6.18705035971223,
"grad_norm": 0.06396607395380566,
"learning_rate": 0.00019320268300489297,
"loss": 0.0471,
"mean_token_accuracy": 0.9883708119392395,
"step": 860
},
{
"epoch": 6.223021582733813,
"grad_norm": 0.08316463171318977,
"learning_rate": 0.00019305019116555754,
"loss": 0.0384,
"mean_token_accuracy": 0.9907682836055756,
"step": 865
},
{
"epoch": 6.258992805755396,
"grad_norm": 0.07480912129949462,
"learning_rate": 0.00019289606928247208,
"loss": 0.0463,
"mean_token_accuracy": 0.9888597249984741,
"step": 870
},
{
"epoch": 6.294964028776978,
"grad_norm": 0.0663383121635371,
"learning_rate": 0.00019274032005552798,
"loss": 0.0384,
"mean_token_accuracy": 0.990657901763916,
"step": 875
},
{
"epoch": 6.330935251798561,
"grad_norm": 0.07501372798585075,
"learning_rate": 0.00019258294621312433,
"loss": 0.0528,
"mean_token_accuracy": 0.9871481537818909,
"step": 880
},
{
"epoch": 6.366906474820144,
"grad_norm": 0.07366099061163396,
"learning_rate": 0.00019242395051212,
"loss": 0.0499,
"mean_token_accuracy": 0.9882595360279083,
"step": 885
},
{
"epoch": 6.402877697841727,
"grad_norm": 0.06804867314458733,
"learning_rate": 0.00019226333573778544,
"loss": 0.046,
"mean_token_accuracy": 0.9889584600925445,
"step": 890
},
{
"epoch": 6.438848920863309,
"grad_norm": 0.06482541942067276,
"learning_rate": 0.00019210110470375394,
"loss": 0.0457,
"mean_token_accuracy": 0.9892277956008911,
"step": 895
},
{
"epoch": 6.474820143884892,
"grad_norm": 0.09362083699600474,
"learning_rate": 0.0001919372602519721,
"loss": 0.0479,
"mean_token_accuracy": 0.9887864112854003,
"step": 900
},
{
"epoch": 6.510791366906475,
"grad_norm": 0.07419422320428706,
"learning_rate": 0.00019177180525265037,
"loss": 0.0462,
"mean_token_accuracy": 0.988640570640564,
"step": 905
},
{
"epoch": 6.546762589928058,
"grad_norm": 0.0680933466552101,
"learning_rate": 0.0001916047426042125,
"loss": 0.0412,
"mean_token_accuracy": 0.9902673780918121,
"step": 910
},
{
"epoch": 6.58273381294964,
"grad_norm": 0.0753203749472904,
"learning_rate": 0.00019143607523324497,
"loss": 0.0409,
"mean_token_accuracy": 0.9900835871696472,
"step": 915
},
{
"epoch": 6.618705035971223,
"grad_norm": 0.09155392976849171,
"learning_rate": 0.00019126580609444549,
"loss": 0.0563,
"mean_token_accuracy": 0.986204868555069,
"step": 920
},
{
"epoch": 6.654676258992806,
"grad_norm": 0.08500902229953358,
"learning_rate": 0.00019109393817057148,
"loss": 0.0464,
"mean_token_accuracy": 0.9887993991374969,
"step": 925
},
{
"epoch": 6.690647482014389,
"grad_norm": 0.06130970774026331,
"learning_rate": 0.00019092047447238773,
"loss": 0.0463,
"mean_token_accuracy": 0.9888347625732422,
"step": 930
},
{
"epoch": 6.726618705035971,
"grad_norm": 0.08321729417279401,
"learning_rate": 0.0001907454180386135,
"loss": 0.0515,
"mean_token_accuracy": 0.9873551964759827,
"step": 935
},
{
"epoch": 6.762589928057554,
"grad_norm": 0.0788243708046946,
"learning_rate": 0.00019056877193586962,
"loss": 0.0552,
"mean_token_accuracy": 0.9864752233028412,
"step": 940
},
{
"epoch": 6.798561151079137,
"grad_norm": 0.09851923268411174,
"learning_rate": 0.00019039053925862443,
"loss": 0.0605,
"mean_token_accuracy": 0.9862433850765229,
"step": 945
},
{
"epoch": 6.83453237410072,
"grad_norm": 0.04852850455052362,
"learning_rate": 0.00019021072312913986,
"loss": 0.0402,
"mean_token_accuracy": 0.9904878795146942,
"step": 950
},
{
"epoch": 6.870503597122302,
"grad_norm": 0.07705035380290443,
"learning_rate": 0.00019002932669741639,
"loss": 0.0476,
"mean_token_accuracy": 0.9887258052825928,
"step": 955
},
{
"epoch": 6.906474820143885,
"grad_norm": 0.06935047132024741,
"learning_rate": 0.00018984635314113826,
"loss": 0.0458,
"mean_token_accuracy": 0.9895333528518677,
"step": 960
},
{
"epoch": 6.942446043165468,
"grad_norm": 0.07052799437742344,
"learning_rate": 0.00018966180566561757,
"loss": 0.0471,
"mean_token_accuracy": 0.9885306537151337,
"step": 965
},
{
"epoch": 6.9784172661870505,
"grad_norm": 0.07250750893823193,
"learning_rate": 0.0001894756875037381,
"loss": 0.0578,
"mean_token_accuracy": 0.9862792432308197,
"step": 970
},
{
"epoch": 7.0,
"eval_loss": 0.09820590913295746,
"eval_mean_token_accuracy": 0.9843102124604312,
"eval_runtime": 20.6203,
"eval_samples_per_second": 5.917,
"eval_steps_per_second": 0.776,
"step": 973
},
{
"epoch": 7.014388489208633,
"grad_norm": 0.08165511724370542,
"learning_rate": 0.0001892880019158988,
"loss": 0.0547,
"mean_token_accuracy": 0.9885966777801514,
"step": 975
},
{
"epoch": 7.0503597122302155,
"grad_norm": 0.09115471075741952,
"learning_rate": 0.0001890987521899567,
"loss": 0.0348,
"mean_token_accuracy": 0.991256856918335,
"step": 980
},
{
"epoch": 7.086330935251799,
"grad_norm": 0.18703400358025105,
"learning_rate": 0.0001889079416411692,
"loss": 0.0344,
"mean_token_accuracy": 0.9911470890045166,
"step": 985
},
{
"epoch": 7.122302158273381,
"grad_norm": 0.07593574468723076,
"learning_rate": 0.00018871557361213595,
"loss": 0.04,
"mean_token_accuracy": 0.9902300417423249,
"step": 990
},
{
"epoch": 7.158273381294964,
"grad_norm": 0.08163153615480963,
"learning_rate": 0.00018852165147274045,
"loss": 0.0344,
"mean_token_accuracy": 0.9915133118629456,
"step": 995
},
{
"epoch": 7.194244604316546,
"grad_norm": 0.08162384924322541,
"learning_rate": 0.00018832617862009097,
"loss": 0.0339,
"mean_token_accuracy": 0.9912963092327118,
"step": 1000
},
{
"epoch": 7.23021582733813,
"grad_norm": 0.06754095615055344,
"learning_rate": 0.00018812915847846097,
"loss": 0.0334,
"mean_token_accuracy": 0.9912936687469482,
"step": 1005
},
{
"epoch": 7.266187050359712,
"grad_norm": 0.07992585396768462,
"learning_rate": 0.0001879305944992292,
"loss": 0.0383,
"mean_token_accuracy": 0.990229606628418,
"step": 1010
},
{
"epoch": 7.302158273381295,
"grad_norm": 0.09213616209553331,
"learning_rate": 0.00018773049016081913,
"loss": 0.0457,
"mean_token_accuracy": 0.9886265099048615,
"step": 1015
},
{
"epoch": 7.338129496402877,
"grad_norm": 0.07024023343334314,
"learning_rate": 0.0001875288489686382,
"loss": 0.0367,
"mean_token_accuracy": 0.9905371308326721,
"step": 1020
},
{
"epoch": 7.374100719424461,
"grad_norm": 0.07286451277511494,
"learning_rate": 0.0001873256744550162,
"loss": 0.0347,
"mean_token_accuracy": 0.9913554310798645,
"step": 1025
},
{
"epoch": 7.410071942446043,
"grad_norm": 0.08298535555396302,
"learning_rate": 0.00018712097017914352,
"loss": 0.0388,
"mean_token_accuracy": 0.9905226647853851,
"step": 1030
},
{
"epoch": 7.446043165467626,
"grad_norm": 0.08830074749459958,
"learning_rate": 0.00018691473972700875,
"loss": 0.0445,
"mean_token_accuracy": 0.9889210820198059,
"step": 1035
},
{
"epoch": 7.482014388489208,
"grad_norm": 0.07217666187560311,
"learning_rate": 0.00018670698671133593,
"loss": 0.0452,
"mean_token_accuracy": 0.9885773658752441,
"step": 1040
},
{
"epoch": 7.517985611510792,
"grad_norm": 0.08661908711629725,
"learning_rate": 0.00018649771477152115,
"loss": 0.0339,
"mean_token_accuracy": 0.9911720871925354,
"step": 1045
},
{
"epoch": 7.553956834532374,
"grad_norm": 0.09371311177176188,
"learning_rate": 0.0001862869275735689,
"loss": 0.0367,
"mean_token_accuracy": 0.9905966579914093,
"step": 1050
},
{
"epoch": 7.589928057553957,
"grad_norm": 0.07707240942098416,
"learning_rate": 0.00018607462881002778,
"loss": 0.0343,
"mean_token_accuracy": 0.9915632963180542,
"step": 1055
},
{
"epoch": 7.625899280575539,
"grad_norm": 0.07730587819818967,
"learning_rate": 0.0001858608221999259,
"loss": 0.0383,
"mean_token_accuracy": 0.9904868125915527,
"step": 1060
},
{
"epoch": 7.661870503597123,
"grad_norm": 0.07304839741727129,
"learning_rate": 0.00018564551148870563,
"loss": 0.0439,
"mean_token_accuracy": 0.9891519188880921,
"step": 1065
},
{
"epoch": 7.697841726618705,
"grad_norm": 0.09016682635662701,
"learning_rate": 0.00018542870044815796,
"loss": 0.0425,
"mean_token_accuracy": 0.98941091299057,
"step": 1070
},
{
"epoch": 7.733812949640288,
"grad_norm": 0.07730069908696634,
"learning_rate": 0.0001852103928763566,
"loss": 0.0379,
"mean_token_accuracy": 0.9907430112361908,
"step": 1075
},
{
"epoch": 7.76978417266187,
"grad_norm": 0.07286962203888536,
"learning_rate": 0.0001849905925975914,
"loss": 0.0395,
"mean_token_accuracy": 0.9902792334556579,
"step": 1080
},
{
"epoch": 7.805755395683454,
"grad_norm": 0.12596219085722438,
"learning_rate": 0.00018476930346230107,
"loss": 0.043,
"mean_token_accuracy": 0.9893492221832275,
"step": 1085
},
{
"epoch": 7.841726618705036,
"grad_norm": 0.0721410843397686,
"learning_rate": 0.00018454652934700615,
"loss": 0.0337,
"mean_token_accuracy": 0.9913184523582459,
"step": 1090
},
{
"epoch": 7.877697841726619,
"grad_norm": 0.08734696713463556,
"learning_rate": 0.00018432227415424084,
"loss": 0.041,
"mean_token_accuracy": 0.9895088315010071,
"step": 1095
},
{
"epoch": 7.913669064748201,
"grad_norm": 0.08034908109385859,
"learning_rate": 0.00018409654181248474,
"loss": 0.0446,
"mean_token_accuracy": 0.988712877035141,
"step": 1100
},
{
"epoch": 7.9496402877697845,
"grad_norm": 0.0697845242925141,
"learning_rate": 0.00018386933627609394,
"loss": 0.0359,
"mean_token_accuracy": 0.9910129487514496,
"step": 1105
},
{
"epoch": 7.985611510791367,
"grad_norm": 0.07078505068848803,
"learning_rate": 0.00018364066152523183,
"loss": 0.0408,
"mean_token_accuracy": 0.9896426558494568,
"step": 1110
},
{
"epoch": 8.0,
"eval_loss": 0.1054563969373703,
"eval_mean_token_accuracy": 0.984645739197731,
"eval_runtime": 20.6985,
"eval_samples_per_second": 5.894,
"eval_steps_per_second": 0.773,
"step": 1112
},
{
"epoch": 8.02158273381295,
"grad_norm": 0.06608211950667531,
"learning_rate": 0.0001834105215657994,
"loss": 0.0311,
"mean_token_accuracy": 0.9939679900805155,
"step": 1115
},
{
"epoch": 8.057553956834532,
"grad_norm": 0.08564587725938204,
"learning_rate": 0.00018317892042936487,
"loss": 0.0267,
"mean_token_accuracy": 0.9928701162338257,
"step": 1120
},
{
"epoch": 8.093525179856115,
"grad_norm": 0.08996704309284011,
"learning_rate": 0.00018294586217309342,
"loss": 0.0302,
"mean_token_accuracy": 0.991721647977829,
"step": 1125
},
{
"epoch": 8.129496402877697,
"grad_norm": 0.10213993059199547,
"learning_rate": 0.00018271135087967574,
"loss": 0.0255,
"mean_token_accuracy": 0.9934465944766998,
"step": 1130
},
{
"epoch": 8.16546762589928,
"grad_norm": 0.10289029084415881,
"learning_rate": 0.0001824753906572567,
"loss": 0.0271,
"mean_token_accuracy": 0.9926867604255676,
"step": 1135
},
{
"epoch": 8.201438848920864,
"grad_norm": 0.07938513450083459,
"learning_rate": 0.00018223798563936344,
"loss": 0.0277,
"mean_token_accuracy": 0.9926994025707245,
"step": 1140
},
{
"epoch": 8.237410071942445,
"grad_norm": 0.0799335759541154,
"learning_rate": 0.00018199913998483282,
"loss": 0.0292,
"mean_token_accuracy": 0.9922228872776031,
"step": 1145
},
{
"epoch": 8.273381294964029,
"grad_norm": 0.07791297569908608,
"learning_rate": 0.0001817588578777386,
"loss": 0.0251,
"mean_token_accuracy": 0.9932994604110718,
"step": 1150
},
{
"epoch": 8.309352517985612,
"grad_norm": 0.10478924127717758,
"learning_rate": 0.00018151714352731822,
"loss": 0.0296,
"mean_token_accuracy": 0.9923690974712371,
"step": 1155
},
{
"epoch": 8.345323741007194,
"grad_norm": 0.05952264303244273,
"learning_rate": 0.000181274001167899,
"loss": 0.0259,
"mean_token_accuracy": 0.9932628035545349,
"step": 1160
},
{
"epoch": 8.381294964028777,
"grad_norm": 0.11638720739620267,
"learning_rate": 0.00018102943505882396,
"loss": 0.0311,
"mean_token_accuracy": 0.9920145153999329,
"step": 1165
},
{
"epoch": 8.417266187050359,
"grad_norm": 0.07862143397116596,
"learning_rate": 0.00018078344948437724,
"loss": 0.0233,
"mean_token_accuracy": 0.9941556990146637,
"step": 1170
},
{
"epoch": 8.453237410071942,
"grad_norm": 0.08087339161763747,
"learning_rate": 0.00018053604875370907,
"loss": 0.0265,
"mean_token_accuracy": 0.9931528508663178,
"step": 1175
},
{
"epoch": 8.489208633093526,
"grad_norm": 0.061976387703659395,
"learning_rate": 0.0001802872372007601,
"loss": 0.0281,
"mean_token_accuracy": 0.9925530850887299,
"step": 1180
},
{
"epoch": 8.525179856115107,
"grad_norm": 0.08968392584335196,
"learning_rate": 0.0001800370191841858,
"loss": 0.032,
"mean_token_accuracy": 0.9915622353553772,
"step": 1185
},
{
"epoch": 8.56115107913669,
"grad_norm": 0.09146240533508403,
"learning_rate": 0.0001797853990872798,
"loss": 0.0329,
"mean_token_accuracy": 0.9913170158863067,
"step": 1190
},
{
"epoch": 8.597122302158274,
"grad_norm": 0.10059791196991036,
"learning_rate": 0.0001795323813178973,
"loss": 0.0256,
"mean_token_accuracy": 0.9930787861347199,
"step": 1195
},
{
"epoch": 8.633093525179856,
"grad_norm": 0.07933964343809208,
"learning_rate": 0.00017927797030837768,
"loss": 0.0284,
"mean_token_accuracy": 0.9926510810852051,
"step": 1200
},
{
"epoch": 8.66906474820144,
"grad_norm": 0.10008206157504908,
"learning_rate": 0.00017902217051546715,
"loss": 0.0296,
"mean_token_accuracy": 0.9919540584087372,
"step": 1205
},
{
"epoch": 8.70503597122302,
"grad_norm": 0.07195996592535572,
"learning_rate": 0.00017876498642024026,
"loss": 0.0263,
"mean_token_accuracy": 0.993087249994278,
"step": 1210
},
{
"epoch": 8.741007194244604,
"grad_norm": 0.0840990736088915,
"learning_rate": 0.0001785064225280218,
"loss": 0.0331,
"mean_token_accuracy": 0.9914765417575836,
"step": 1215
},
{
"epoch": 8.776978417266188,
"grad_norm": 0.07556361151629382,
"learning_rate": 0.00017824648336830763,
"loss": 0.0239,
"mean_token_accuracy": 0.9935317218303681,
"step": 1220
},
{
"epoch": 8.81294964028777,
"grad_norm": 0.0817902776134609,
"learning_rate": 0.00017798517349468539,
"loss": 0.0293,
"mean_token_accuracy": 0.9924435615539551,
"step": 1225
},
{
"epoch": 8.848920863309353,
"grad_norm": 0.07844793746584716,
"learning_rate": 0.0001777224974847548,
"loss": 0.032,
"mean_token_accuracy": 0.9916129529476165,
"step": 1230
},
{
"epoch": 8.884892086330936,
"grad_norm": 0.09174283379497755,
"learning_rate": 0.0001774584599400474,
"loss": 0.0304,
"mean_token_accuracy": 0.9922227621078491,
"step": 1235
},
{
"epoch": 8.920863309352518,
"grad_norm": 0.08346812519931995,
"learning_rate": 0.0001771930654859459,
"loss": 0.0278,
"mean_token_accuracy": 0.9929319977760315,
"step": 1240
},
{
"epoch": 8.956834532374101,
"grad_norm": 0.09081059448512323,
"learning_rate": 0.00017692631877160326,
"loss": 0.0365,
"mean_token_accuracy": 0.9903396785259246,
"step": 1245
},
{
"epoch": 8.992805755395683,
"grad_norm": 0.0840058011137499,
"learning_rate": 0.0001766582244698612,
"loss": 0.0297,
"mean_token_accuracy": 0.9923931121826172,
"step": 1250
},
{
"epoch": 9.0,
"eval_loss": 0.10754524171352386,
"eval_mean_token_accuracy": 0.9839274419678582,
"eval_runtime": 20.8073,
"eval_samples_per_second": 5.863,
"eval_steps_per_second": 0.769,
"step": 1251
},
{
"epoch": 9.028776978417266,
"grad_norm": 0.06802116638648911,
"learning_rate": 0.00017638878727716838,
"loss": 0.0239,
"mean_token_accuracy": 0.994832769036293,
"step": 1255
},
{
"epoch": 9.06474820143885,
"grad_norm": 0.08131934072937834,
"learning_rate": 0.00017611801191349798,
"loss": 0.0177,
"mean_token_accuracy": 0.9950850903987885,
"step": 1260
},
{
"epoch": 9.100719424460431,
"grad_norm": 0.09962740909778638,
"learning_rate": 0.0001758459031222652,
"loss": 0.0169,
"mean_token_accuracy": 0.9952557981014252,
"step": 1265
},
{
"epoch": 9.136690647482014,
"grad_norm": 0.08910176909961409,
"learning_rate": 0.00017557246567024404,
"loss": 0.0193,
"mean_token_accuracy": 0.9950962662696838,
"step": 1270
},
{
"epoch": 9.172661870503598,
"grad_norm": 0.08896573436836375,
"learning_rate": 0.0001752977043474839,
"loss": 0.0185,
"mean_token_accuracy": 0.9951821863651276,
"step": 1275
},
{
"epoch": 9.20863309352518,
"grad_norm": 0.07069710110622436,
"learning_rate": 0.00017502162396722558,
"loss": 0.0182,
"mean_token_accuracy": 0.9950909554958344,
"step": 1280
},
{
"epoch": 9.244604316546763,
"grad_norm": 0.10794611681753156,
"learning_rate": 0.00017474422936581698,
"loss": 0.0204,
"mean_token_accuracy": 0.9944604396820068,
"step": 1285
},
{
"epoch": 9.280575539568344,
"grad_norm": 0.0964081310067874,
"learning_rate": 0.00017446552540262844,
"loss": 0.0193,
"mean_token_accuracy": 0.9947298228740692,
"step": 1290
},
{
"epoch": 9.316546762589928,
"grad_norm": 0.06694312069681227,
"learning_rate": 0.0001741855169599675,
"loss": 0.0182,
"mean_token_accuracy": 0.9948891222476959,
"step": 1295
},
{
"epoch": 9.352517985611511,
"grad_norm": 0.09194435151559001,
"learning_rate": 0.0001739042089429935,
"loss": 0.0211,
"mean_token_accuracy": 0.9945831596851349,
"step": 1300
},
{
"epoch": 9.388489208633093,
"grad_norm": 0.08485510859325882,
"learning_rate": 0.0001736216062796316,
"loss": 0.0178,
"mean_token_accuracy": 0.9953541696071625,
"step": 1305
},
{
"epoch": 9.424460431654676,
"grad_norm": 0.07658351486107501,
"learning_rate": 0.0001733377139204863,
"loss": 0.0176,
"mean_token_accuracy": 0.9950843632221222,
"step": 1310
},
{
"epoch": 9.46043165467626,
"grad_norm": 0.0851945396842124,
"learning_rate": 0.0001730525368387551,
"loss": 0.0176,
"mean_token_accuracy": 0.995317256450653,
"step": 1315
},
{
"epoch": 9.496402877697841,
"grad_norm": 0.07680564483723305,
"learning_rate": 0.0001727660800301409,
"loss": 0.0195,
"mean_token_accuracy": 0.9947294652462005,
"step": 1320
},
{
"epoch": 9.532374100719425,
"grad_norm": 0.06733986423413497,
"learning_rate": 0.00017247834851276492,
"loss": 0.0225,
"mean_token_accuracy": 0.9939347088336945,
"step": 1325
},
{
"epoch": 9.568345323741006,
"grad_norm": 0.12457969840303192,
"learning_rate": 0.00017218934732707842,
"loss": 0.0212,
"mean_token_accuracy": 0.9943628013134003,
"step": 1330
},
{
"epoch": 9.60431654676259,
"grad_norm": 0.06957276517390819,
"learning_rate": 0.00017189908153577473,
"loss": 0.0206,
"mean_token_accuracy": 0.9946195781230927,
"step": 1335
},
{
"epoch": 9.640287769784173,
"grad_norm": 0.09308663583934602,
"learning_rate": 0.00017160755622370032,
"loss": 0.0184,
"mean_token_accuracy": 0.9952435672283173,
"step": 1340
},
{
"epoch": 9.676258992805755,
"grad_norm": 0.07546127826289363,
"learning_rate": 0.00017131477649776587,
"loss": 0.0198,
"mean_token_accuracy": 0.9945826590061188,
"step": 1345
},
{
"epoch": 9.712230215827338,
"grad_norm": 0.06447487107416815,
"learning_rate": 0.00017102074748685673,
"loss": 0.0191,
"mean_token_accuracy": 0.9948029279708862,
"step": 1350
},
{
"epoch": 9.748201438848922,
"grad_norm": 0.10429555757378318,
"learning_rate": 0.00017072547434174304,
"loss": 0.0224,
"mean_token_accuracy": 0.9938852250576019,
"step": 1355
},
{
"epoch": 9.784172661870503,
"grad_norm": 0.10174525963107275,
"learning_rate": 0.0001704289622349897,
"loss": 0.0209,
"mean_token_accuracy": 0.9941792845726013,
"step": 1360
},
{
"epoch": 9.820143884892087,
"grad_norm": 0.06515111457479097,
"learning_rate": 0.0001701312163608655,
"loss": 0.0197,
"mean_token_accuracy": 0.9947053015232086,
"step": 1365
},
{
"epoch": 9.85611510791367,
"grad_norm": 0.0853360162922663,
"learning_rate": 0.0001698322419352522,
"loss": 0.026,
"mean_token_accuracy": 0.9930291116237641,
"step": 1370
},
{
"epoch": 9.892086330935252,
"grad_norm": 0.08349002733460555,
"learning_rate": 0.0001695320441955534,
"loss": 0.0223,
"mean_token_accuracy": 0.9938614785671234,
"step": 1375
},
{
"epoch": 9.928057553956835,
"grad_norm": 0.11836172608748735,
"learning_rate": 0.00016923062840060234,
"loss": 0.021,
"mean_token_accuracy": 0.9945221424102784,
"step": 1380
},
{
"epoch": 9.964028776978417,
"grad_norm": 0.0979581744574528,
"learning_rate": 0.0001689279998305702,
"loss": 0.0263,
"mean_token_accuracy": 0.9928580164909363,
"step": 1385
},
{
"epoch": 10.0,
"grad_norm": 0.07684774295930966,
"learning_rate": 0.0001686241637868734,
"loss": 0.0207,
"mean_token_accuracy": 0.9940943062305451,
"step": 1390
},
{
"epoch": 10.0,
"eval_loss": 0.11269818246364594,
"eval_mean_token_accuracy": 0.9830840341746807,
"eval_runtime": 20.6237,
"eval_samples_per_second": 5.916,
"eval_steps_per_second": 0.776,
"step": 1390
},
{
"epoch": 10.035971223021583,
"grad_norm": 0.08599421791676856,
"learning_rate": 0.00016831912559208063,
"loss": 0.0121,
"mean_token_accuracy": 0.9970287322998047,
"step": 1395
},
{
"epoch": 10.071942446043165,
"grad_norm": 0.0904666825072302,
"learning_rate": 0.00016801289058982,
"loss": 0.013,
"mean_token_accuracy": 0.99660022854805,
"step": 1400
},
{
"epoch": 10.107913669064748,
"grad_norm": 0.107927530031339,
"learning_rate": 0.00016770546414468488,
"loss": 0.015,
"mean_token_accuracy": 0.9960623264312745,
"step": 1405
},
{
"epoch": 10.14388489208633,
"grad_norm": 0.06925828681503625,
"learning_rate": 0.00016739685164214046,
"loss": 0.0122,
"mean_token_accuracy": 0.996869707107544,
"step": 1410
},
{
"epoch": 10.179856115107913,
"grad_norm": 0.08749007468566572,
"learning_rate": 0.00016708705848842898,
"loss": 0.014,
"mean_token_accuracy": 0.99650257229805,
"step": 1415
},
{
"epoch": 10.215827338129497,
"grad_norm": 0.11345976381463416,
"learning_rate": 0.00016677609011047533,
"loss": 0.0131,
"mean_token_accuracy": 0.9966128468513489,
"step": 1420
},
{
"epoch": 10.251798561151078,
"grad_norm": 0.0850375168432864,
"learning_rate": 0.00016646395195579178,
"loss": 0.0148,
"mean_token_accuracy": 0.9960009098052979,
"step": 1425
},
{
"epoch": 10.287769784172662,
"grad_norm": 0.07294058737884025,
"learning_rate": 0.00016615064949238267,
"loss": 0.0132,
"mean_token_accuracy": 0.9964902937412262,
"step": 1430
},
{
"epoch": 10.323741007194245,
"grad_norm": 0.07943531885485305,
"learning_rate": 0.00016583618820864858,
"loss": 0.0135,
"mean_token_accuracy": 0.9963561594486237,
"step": 1435
},
{
"epoch": 10.359712230215827,
"grad_norm": 0.09060949078579321,
"learning_rate": 0.0001655205736132902,
"loss": 0.012,
"mean_token_accuracy": 0.9970167279243469,
"step": 1440
},
{
"epoch": 10.39568345323741,
"grad_norm": 0.08743228431554707,
"learning_rate": 0.0001652038112352117,
"loss": 0.0158,
"mean_token_accuracy": 0.9957569420337677,
"step": 1445
},
{
"epoch": 10.431654676258994,
"grad_norm": 0.08434742513312765,
"learning_rate": 0.0001648859066234242,
"loss": 0.0127,
"mean_token_accuracy": 0.9967720329761505,
"step": 1450
},
{
"epoch": 10.467625899280575,
"grad_norm": 0.08534299510053663,
"learning_rate": 0.00016456686534694817,
"loss": 0.0124,
"mean_token_accuracy": 0.996967202425003,
"step": 1455
},
{
"epoch": 10.503597122302159,
"grad_norm": 0.07636341426608007,
"learning_rate": 0.00016424669299471614,
"loss": 0.0134,
"mean_token_accuracy": 0.9965148985385894,
"step": 1460
},
{
"epoch": 10.53956834532374,
"grad_norm": 0.08631265171713358,
"learning_rate": 0.0001639253951754747,
"loss": 0.0125,
"mean_token_accuracy": 0.996735155582428,
"step": 1465
},
{
"epoch": 10.575539568345324,
"grad_norm": 0.08200188375124749,
"learning_rate": 0.0001636029775176862,
"loss": 0.0113,
"mean_token_accuracy": 0.99694344997406,
"step": 1470
},
{
"epoch": 10.611510791366907,
"grad_norm": 0.07881864745220842,
"learning_rate": 0.00016327944566943035,
"loss": 0.0119,
"mean_token_accuracy": 0.9968697667121887,
"step": 1475
},
{
"epoch": 10.647482014388489,
"grad_norm": 0.08190061340848842,
"learning_rate": 0.00016295480529830494,
"loss": 0.0156,
"mean_token_accuracy": 0.9960256695747376,
"step": 1480
},
{
"epoch": 10.683453237410072,
"grad_norm": 0.10685678023770241,
"learning_rate": 0.00016262906209132692,
"loss": 0.0144,
"mean_token_accuracy": 0.9962826788425445,
"step": 1485
},
{
"epoch": 10.719424460431654,
"grad_norm": 0.07320426903216805,
"learning_rate": 0.0001623022217548325,
"loss": 0.0148,
"mean_token_accuracy": 0.9962579011917114,
"step": 1490
},
{
"epoch": 10.755395683453237,
"grad_norm": 0.0935421010172905,
"learning_rate": 0.00016197429001437735,
"loss": 0.0165,
"mean_token_accuracy": 0.995512044429779,
"step": 1495
},
{
"epoch": 10.79136690647482,
"grad_norm": 0.05923694064442418,
"learning_rate": 0.0001616452726146362,
"loss": 0.0162,
"mean_token_accuracy": 0.9955240964889527,
"step": 1500
},
{
"epoch": 10.827338129496402,
"grad_norm": 0.0902823909222933,
"learning_rate": 0.0001613151753193023,
"loss": 0.0122,
"mean_token_accuracy": 0.9967601776123047,
"step": 1505
},
{
"epoch": 10.863309352517986,
"grad_norm": 0.06620900034563318,
"learning_rate": 0.00016098400391098636,
"loss": 0.0146,
"mean_token_accuracy": 0.9960503220558167,
"step": 1510
},
{
"epoch": 10.899280575539569,
"grad_norm": 0.08083785941879719,
"learning_rate": 0.0001606517641911153,
"loss": 0.0125,
"mean_token_accuracy": 0.9967718720436096,
"step": 1515
},
{
"epoch": 10.93525179856115,
"grad_norm": 0.1222321673203135,
"learning_rate": 0.00016031846197983062,
"loss": 0.0139,
"mean_token_accuracy": 0.9963804185390472,
"step": 1520
},
{
"epoch": 10.971223021582734,
"grad_norm": 0.09011217682765804,
"learning_rate": 0.00015998410311588644,
"loss": 0.0151,
"mean_token_accuracy": 0.9960378229618072,
"step": 1525
},
{
"epoch": 11.0,
"eval_loss": 0.12537601590156555,
"eval_mean_token_accuracy": 0.9872708097100258,
"eval_runtime": 20.7748,
"eval_samples_per_second": 5.873,
"eval_steps_per_second": 0.77,
"step": 1529
},
{
"epoch": 11.007194244604317,
"grad_norm": 0.05145660527450271,
"learning_rate": 0.00015964869345654718,
"loss": 0.0118,
"mean_token_accuracy": 0.9978603720664978,
"step": 1530
},
{
"epoch": 11.043165467625899,
"grad_norm": 0.07821203281348997,
"learning_rate": 0.0001593122388774851,
"loss": 0.0085,
"mean_token_accuracy": 0.9977623283863067,
"step": 1535
},
{
"epoch": 11.079136690647482,
"grad_norm": 0.07234857181108979,
"learning_rate": 0.00015897474527267703,
"loss": 0.009,
"mean_token_accuracy": 0.9976400792598724,
"step": 1540
},
{
"epoch": 11.115107913669064,
"grad_norm": 0.04075447553316834,
"learning_rate": 0.00015863621855430159,
"loss": 0.0092,
"mean_token_accuracy": 0.9976687788963318,
"step": 1545
},
{
"epoch": 11.151079136690647,
"grad_norm": 0.05794021578435905,
"learning_rate": 0.00015829666465263525,
"loss": 0.0088,
"mean_token_accuracy": 0.9977623224258423,
"step": 1550
},
{
"epoch": 11.18705035971223,
"grad_norm": 0.07683795817076886,
"learning_rate": 0.00015795608951594859,
"loss": 0.0095,
"mean_token_accuracy": 0.997480845451355,
"step": 1555
},
{
"epoch": 11.223021582733812,
"grad_norm": 0.07115098159372155,
"learning_rate": 0.00015761449911040208,
"loss": 0.0101,
"mean_token_accuracy": 0.9975174725055694,
"step": 1560
},
{
"epoch": 11.258992805755396,
"grad_norm": 0.03884336408006673,
"learning_rate": 0.00015727189941994158,
"loss": 0.0093,
"mean_token_accuracy": 0.9976275801658631,
"step": 1565
},
{
"epoch": 11.29496402877698,
"grad_norm": 0.06656440131240968,
"learning_rate": 0.00015692829644619352,
"loss": 0.0082,
"mean_token_accuracy": 0.9979580223560334,
"step": 1570
},
{
"epoch": 11.33093525179856,
"grad_norm": 0.06686553477037634,
"learning_rate": 0.0001565836962083597,
"loss": 0.0084,
"mean_token_accuracy": 0.9977380752563476,
"step": 1575
},
{
"epoch": 11.366906474820144,
"grad_norm": 0.051925628479388856,
"learning_rate": 0.00015623810474311187,
"loss": 0.0099,
"mean_token_accuracy": 0.9973831713199616,
"step": 1580
},
{
"epoch": 11.402877697841726,
"grad_norm": 0.07626073368161976,
"learning_rate": 0.0001558915281044861,
"loss": 0.0097,
"mean_token_accuracy": 0.9975177109241485,
"step": 1585
},
{
"epoch": 11.43884892086331,
"grad_norm": 0.09353665419288143,
"learning_rate": 0.0001555439723637765,
"loss": 0.0098,
"mean_token_accuracy": 0.9974563598632813,
"step": 1590
},
{
"epoch": 11.474820143884893,
"grad_norm": 0.06026792088715974,
"learning_rate": 0.00015519544360942917,
"loss": 0.0099,
"mean_token_accuracy": 0.9973953664302826,
"step": 1595
},
{
"epoch": 11.510791366906474,
"grad_norm": 0.0680669683566074,
"learning_rate": 0.0001548459479469351,
"loss": 0.011,
"mean_token_accuracy": 0.9970895767211914,
"step": 1600
},
{
"epoch": 11.546762589928058,
"grad_norm": 0.07661464909353981,
"learning_rate": 0.00015449549149872376,
"loss": 0.0094,
"mean_token_accuracy": 0.9975910663604737,
"step": 1605
},
{
"epoch": 11.582733812949641,
"grad_norm": 0.06540550364929187,
"learning_rate": 0.00015414408040405537,
"loss": 0.0089,
"mean_token_accuracy": 0.9978111922740937,
"step": 1610
},
{
"epoch": 11.618705035971223,
"grad_norm": 0.05130373899495586,
"learning_rate": 0.0001537917208189136,
"loss": 0.0091,
"mean_token_accuracy": 0.9975790679454803,
"step": 1615
},
{
"epoch": 11.654676258992806,
"grad_norm": 0.06949815748126974,
"learning_rate": 0.00015343841891589776,
"loss": 0.0108,
"mean_token_accuracy": 0.9970408082008362,
"step": 1620
},
{
"epoch": 11.690647482014388,
"grad_norm": 0.07039422200836666,
"learning_rate": 0.00015308418088411444,
"loss": 0.0103,
"mean_token_accuracy": 0.997383177280426,
"step": 1625
},
{
"epoch": 11.726618705035971,
"grad_norm": 0.09950022146159282,
"learning_rate": 0.00015272901292906935,
"loss": 0.01,
"mean_token_accuracy": 0.9974565923213958,
"step": 1630
},
{
"epoch": 11.762589928057555,
"grad_norm": 0.07438295375677863,
"learning_rate": 0.00015237292127255852,
"loss": 0.0094,
"mean_token_accuracy": 0.9976524710655212,
"step": 1635
},
{
"epoch": 11.798561151079136,
"grad_norm": 0.06394419085018742,
"learning_rate": 0.00015201591215255916,
"loss": 0.0097,
"mean_token_accuracy": 0.9976644575595855,
"step": 1640
},
{
"epoch": 11.83453237410072,
"grad_norm": 0.07476579460405877,
"learning_rate": 0.00015165799182312062,
"loss": 0.0114,
"mean_token_accuracy": 0.9969670593738555,
"step": 1645
},
{
"epoch": 11.870503597122303,
"grad_norm": 0.0719444710852101,
"learning_rate": 0.00015129916655425468,
"loss": 0.0104,
"mean_token_accuracy": 0.9972853481769561,
"step": 1650
},
{
"epoch": 11.906474820143885,
"grad_norm": 0.078313419391383,
"learning_rate": 0.00015093944263182583,
"loss": 0.0118,
"mean_token_accuracy": 0.9968084037303925,
"step": 1655
},
{
"epoch": 11.942446043165468,
"grad_norm": 0.04178771006797904,
"learning_rate": 0.00015057882635744098,
"loss": 0.0098,
"mean_token_accuracy": 0.997468900680542,
"step": 1660
},
{
"epoch": 11.97841726618705,
"grad_norm": 0.06783244720344332,
"learning_rate": 0.0001502173240483392,
"loss": 0.0115,
"mean_token_accuracy": 0.9969798445701599,
"step": 1665
},
{
"epoch": 12.0,
"eval_loss": 0.12743568420410156,
"eval_mean_token_accuracy": 0.98657088117166,
"eval_runtime": 20.6428,
"eval_samples_per_second": 5.91,
"eval_steps_per_second": 0.775,
"step": 1668
},
{
"epoch": 12.014388489208633,
"grad_norm": 0.027342999643264056,
"learning_rate": 0.00014985494203728102,
"loss": 0.0103,
"mean_token_accuracy": 0.9981654733419418,
"step": 1670
},
{
"epoch": 12.050359712230216,
"grad_norm": 0.059963744431938956,
"learning_rate": 0.00014949168667243758,
"loss": 0.0072,
"mean_token_accuracy": 0.9981658458709717,
"step": 1675
},
{
"epoch": 12.086330935251798,
"grad_norm": 0.07816781926571713,
"learning_rate": 0.00014912756431727922,
"loss": 0.0069,
"mean_token_accuracy": 0.9983003556728363,
"step": 1680
},
{
"epoch": 12.122302158273381,
"grad_norm": 0.03878434979130554,
"learning_rate": 0.00014876258135046422,
"loss": 0.0077,
"mean_token_accuracy": 0.9979945898056031,
"step": 1685
},
{
"epoch": 12.158273381294965,
"grad_norm": 0.05123502981808833,
"learning_rate": 0.00014839674416572694,
"loss": 0.0062,
"mean_token_accuracy": 0.9983372211456298,
"step": 1690
},
{
"epoch": 12.194244604316546,
"grad_norm": 0.06471897367408934,
"learning_rate": 0.00014803005917176585,
"loss": 0.0068,
"mean_token_accuracy": 0.9983494818210602,
"step": 1695
},
{
"epoch": 12.23021582733813,
"grad_norm": 0.05910329781704424,
"learning_rate": 0.00014766253279213117,
"loss": 0.0076,
"mean_token_accuracy": 0.9981291174888611,
"step": 1700
},
{
"epoch": 12.266187050359711,
"grad_norm": 0.059920297250759154,
"learning_rate": 0.00014729417146511255,
"loss": 0.0081,
"mean_token_accuracy": 0.9980435788631439,
"step": 1705
},
{
"epoch": 12.302158273381295,
"grad_norm": 0.047092720411911114,
"learning_rate": 0.00014692498164362613,
"loss": 0.0083,
"mean_token_accuracy": 0.9978721857070922,
"step": 1710
},
{
"epoch": 12.338129496402878,
"grad_norm": 0.06484446628772765,
"learning_rate": 0.0001465549697951015,
"loss": 0.0081,
"mean_token_accuracy": 0.9979456484317779,
"step": 1715
},
{
"epoch": 12.37410071942446,
"grad_norm": 0.037828218939601234,
"learning_rate": 0.00014618414240136844,
"loss": 0.0074,
"mean_token_accuracy": 0.9981168389320374,
"step": 1720
},
{
"epoch": 12.410071942446043,
"grad_norm": 0.09326182092147371,
"learning_rate": 0.00014581250595854336,
"loss": 0.0079,
"mean_token_accuracy": 0.9980802178382874,
"step": 1725
},
{
"epoch": 12.446043165467627,
"grad_norm": 0.1146676492718464,
"learning_rate": 0.00014544006697691557,
"loss": 0.0089,
"mean_token_accuracy": 0.9978107392787934,
"step": 1730
},
{
"epoch": 12.482014388489208,
"grad_norm": 0.05343799953840085,
"learning_rate": 0.00014506683198083314,
"loss": 0.0084,
"mean_token_accuracy": 0.9978642165660858,
"step": 1735
},
{
"epoch": 12.517985611510792,
"grad_norm": 0.06686229156740404,
"learning_rate": 0.00014469280750858854,
"loss": 0.0074,
"mean_token_accuracy": 0.9980190098285675,
"step": 1740
},
{
"epoch": 12.553956834532373,
"grad_norm": 0.07182393374099447,
"learning_rate": 0.0001443180001123044,
"loss": 0.0078,
"mean_token_accuracy": 0.9979457080364227,
"step": 1745
},
{
"epoch": 12.589928057553957,
"grad_norm": 0.07145188250380506,
"learning_rate": 0.00014394241635781838,
"loss": 0.0073,
"mean_token_accuracy": 0.9980436384677887,
"step": 1750
},
{
"epoch": 12.62589928057554,
"grad_norm": 0.06052636573650373,
"learning_rate": 0.00014356606282456833,
"loss": 0.008,
"mean_token_accuracy": 0.9978723347187042,
"step": 1755
},
{
"epoch": 12.661870503597122,
"grad_norm": 0.051892477022163645,
"learning_rate": 0.00014318894610547707,
"loss": 0.0077,
"mean_token_accuracy": 0.9979701161384582,
"step": 1760
},
{
"epoch": 12.697841726618705,
"grad_norm": 0.06531042040712823,
"learning_rate": 0.00014281107280683677,
"loss": 0.0077,
"mean_token_accuracy": 0.9981413781642914,
"step": 1765
},
{
"epoch": 12.733812949640289,
"grad_norm": 0.05421707276721448,
"learning_rate": 0.00014243244954819328,
"loss": 0.0084,
"mean_token_accuracy": 0.9978357255458832,
"step": 1770
},
{
"epoch": 12.76978417266187,
"grad_norm": 0.05688320731946079,
"learning_rate": 0.00014205308296223024,
"loss": 0.0088,
"mean_token_accuracy": 0.9977129817008972,
"step": 1775
},
{
"epoch": 12.805755395683454,
"grad_norm": 0.047097464345664586,
"learning_rate": 0.0001416729796946527,
"loss": 0.0067,
"mean_token_accuracy": 0.9982883751392364,
"step": 1780
},
{
"epoch": 12.841726618705035,
"grad_norm": 0.04951718994376441,
"learning_rate": 0.00014129214640407102,
"loss": 0.0074,
"mean_token_accuracy": 0.9980681598186493,
"step": 1785
},
{
"epoch": 12.877697841726619,
"grad_norm": 0.03535068544285628,
"learning_rate": 0.0001409105897618838,
"loss": 0.0068,
"mean_token_accuracy": 0.9982638895511627,
"step": 1790
},
{
"epoch": 12.913669064748202,
"grad_norm": 0.05813590209574777,
"learning_rate": 0.0001405283164521614,
"loss": 0.0087,
"mean_token_accuracy": 0.9977501213550568,
"step": 1795
},
{
"epoch": 12.949640287769784,
"grad_norm": 0.08313242752328498,
"learning_rate": 0.0001401453331715286,
"loss": 0.0086,
"mean_token_accuracy": 0.9979334115982056,
"step": 1800
},
{
"epoch": 12.985611510791367,
"grad_norm": 0.06490143326945057,
"learning_rate": 0.00013976164662904745,
"loss": 0.0083,
"mean_token_accuracy": 0.997908991575241,
"step": 1805
},
{
"epoch": 13.0,
"eval_loss": 0.1358712911605835,
"eval_mean_token_accuracy": 0.9858476608991623,
"eval_runtime": 20.5836,
"eval_samples_per_second": 5.927,
"eval_steps_per_second": 0.777,
"step": 1807
},
{
"epoch": 13.02158273381295,
"grad_norm": 0.03861550158961351,
"learning_rate": 0.00013937726354609962,
"loss": 0.0074,
"mean_token_accuracy": 0.9983490506807963,
"step": 1810
},
{
"epoch": 13.057553956834532,
"grad_norm": 0.03224073083223682,
"learning_rate": 0.0001389921906562687,
"loss": 0.0062,
"mean_token_accuracy": 0.9983859360218048,
"step": 1815
},
{
"epoch": 13.093525179856115,
"grad_norm": 0.032898659395212033,
"learning_rate": 0.0001386064347052223,
"loss": 0.0066,
"mean_token_accuracy": 0.9982513129711151,
"step": 1820
},
{
"epoch": 13.129496402877697,
"grad_norm": 0.03628135892190089,
"learning_rate": 0.00013822000245059378,
"loss": 0.0067,
"mean_token_accuracy": 0.9982879996299744,
"step": 1825
},
{
"epoch": 13.16546762589928,
"grad_norm": 0.03828613461344576,
"learning_rate": 0.00013783290066186391,
"loss": 0.0053,
"mean_token_accuracy": 0.9985626757144928,
"step": 1830
},
{
"epoch": 13.201438848920864,
"grad_norm": 0.03628422925570306,
"learning_rate": 0.0001374451361202423,
"loss": 0.0066,
"mean_token_accuracy": 0.9981167852878571,
"step": 1835
},
{
"epoch": 13.237410071942445,
"grad_norm": 0.044986378614690334,
"learning_rate": 0.00013705671561854867,
"loss": 0.0068,
"mean_token_accuracy": 0.9982267796993256,
"step": 1840
},
{
"epoch": 13.273381294964029,
"grad_norm": 0.05267548343245477,
"learning_rate": 0.00013666764596109365,
"loss": 0.0064,
"mean_token_accuracy": 0.9983249008655548,
"step": 1845
},
{
"epoch": 13.309352517985612,
"grad_norm": 0.03226487542606456,
"learning_rate": 0.00013627793396355983,
"loss": 0.0064,
"mean_token_accuracy": 0.9984836876392365,
"step": 1850
},
{
"epoch": 13.345323741007194,
"grad_norm": 0.05115506820484576,
"learning_rate": 0.00013588758645288217,
"loss": 0.0061,
"mean_token_accuracy": 0.9983738422393799,
"step": 1855
},
{
"epoch": 13.381294964028777,
"grad_norm": 0.03667313092197686,
"learning_rate": 0.0001354966102671285,
"loss": 0.0062,
"mean_token_accuracy": 0.9983859896659851,
"step": 1860
},
{
"epoch": 13.417266187050359,
"grad_norm": 0.05542873770045281,
"learning_rate": 0.00013510501225537976,
"loss": 0.0068,
"mean_token_accuracy": 0.9980922400951385,
"step": 1865
},
{
"epoch": 13.453237410071942,
"grad_norm": 0.049749433850759105,
"learning_rate": 0.00013471279927760997,
"loss": 0.0066,
"mean_token_accuracy": 0.998239153623581,
"step": 1870
},
{
"epoch": 13.489208633093526,
"grad_norm": 0.04722428251226616,
"learning_rate": 0.00013431997820456592,
"loss": 0.0068,
"mean_token_accuracy": 0.9983492016792297,
"step": 1875
},
{
"epoch": 13.525179856115107,
"grad_norm": 0.06972025494758348,
"learning_rate": 0.00013392655591764723,
"loss": 0.0067,
"mean_token_accuracy": 0.9983003556728363,
"step": 1880
},
{
"epoch": 13.56115107913669,
"grad_norm": 0.04149046158387444,
"learning_rate": 0.00013353253930878525,
"loss": 0.006,
"mean_token_accuracy": 0.9984471023082733,
"step": 1885
},
{
"epoch": 13.597122302158274,
"grad_norm": 0.053027022615863284,
"learning_rate": 0.00013313793528032278,
"loss": 0.0066,
"mean_token_accuracy": 0.9981414675712585,
"step": 1890
},
{
"epoch": 13.633093525179856,
"grad_norm": 0.06016753855344703,
"learning_rate": 0.0001327427507448928,
"loss": 0.0058,
"mean_token_accuracy": 0.9983982980251312,
"step": 1895
},
{
"epoch": 13.66906474820144,
"grad_norm": 0.06568374751780803,
"learning_rate": 0.00013234699262529778,
"loss": 0.0063,
"mean_token_accuracy": 0.9984226942062377,
"step": 1900
},
{
"epoch": 13.70503597122302,
"grad_norm": 0.02812931422079084,
"learning_rate": 0.000131950667854388,
"loss": 0.0069,
"mean_token_accuracy": 0.9982512354850769,
"step": 1905
},
{
"epoch": 13.741007194244604,
"grad_norm": 0.04777322152627654,
"learning_rate": 0.00013155378337494035,
"loss": 0.0067,
"mean_token_accuracy": 0.9982635855674744,
"step": 1910
},
{
"epoch": 13.776978417266188,
"grad_norm": 0.043818355689989874,
"learning_rate": 0.00013115634613953663,
"loss": 0.007,
"mean_token_accuracy": 0.9982267916202545,
"step": 1915
},
{
"epoch": 13.81294964028777,
"grad_norm": 0.03349871508194793,
"learning_rate": 0.00013075836311044175,
"loss": 0.0069,
"mean_token_accuracy": 0.9982512712478637,
"step": 1920
},
{
"epoch": 13.848920863309353,
"grad_norm": 0.0382839105195503,
"learning_rate": 0.00013035984125948178,
"loss": 0.0065,
"mean_token_accuracy": 0.9983247220516205,
"step": 1925
},
{
"epoch": 13.884892086330936,
"grad_norm": 0.03717138922436437,
"learning_rate": 0.00012996078756792186,
"loss": 0.0067,
"mean_token_accuracy": 0.9981537342071534,
"step": 1930
},
{
"epoch": 13.920863309352518,
"grad_norm": 0.04286039147486022,
"learning_rate": 0.00012956120902634378,
"loss": 0.0065,
"mean_token_accuracy": 0.9982879340648652,
"step": 1935
},
{
"epoch": 13.956834532374101,
"grad_norm": 0.05579034741194405,
"learning_rate": 0.00012916111263452368,
"loss": 0.007,
"mean_token_accuracy": 0.9980191111564636,
"step": 1940
},
{
"epoch": 13.992805755395683,
"grad_norm": 0.04290302964746944,
"learning_rate": 0.00012876050540130927,
"loss": 0.0071,
"mean_token_accuracy": 0.998129004240036,
"step": 1945
},
{
"epoch": 14.0,
"eval_loss": 0.14095589518547058,
"eval_mean_token_accuracy": 0.9844649698999193,
"eval_runtime": 20.7085,
"eval_samples_per_second": 5.891,
"eval_steps_per_second": 0.773,
"step": 1946
},
{
"epoch": 14.028776978417266,
"grad_norm": 0.02728336439016839,
"learning_rate": 0.00012835939434449714,
"loss": 0.006,
"mean_token_accuracy": 0.9983949959278107,
"step": 1950
},
{
"epoch": 14.06474820143885,
"grad_norm": 0.028891124116282068,
"learning_rate": 0.00012795778649070993,
"loss": 0.0057,
"mean_token_accuracy": 0.9985325753688812,
"step": 1955
},
{
"epoch": 14.100719424460431,
"grad_norm": 0.04257323636970389,
"learning_rate": 0.00012755568887527297,
"loss": 0.0054,
"mean_token_accuracy": 0.9985634684562683,
"step": 1960
},
{
"epoch": 14.136690647482014,
"grad_norm": 0.06375421515135235,
"learning_rate": 0.00012715310854209124,
"loss": 0.0059,
"mean_token_accuracy": 0.9984101951122284,
"step": 1965
},
{
"epoch": 14.172661870503598,
"grad_norm": 0.02540027723004083,
"learning_rate": 0.00012675005254352594,
"loss": 0.0054,
"mean_token_accuracy": 0.998593783378601,
"step": 1970
},
{
"epoch": 14.20863309352518,
"grad_norm": 0.023921388854413334,
"learning_rate": 0.00012634652794027087,
"loss": 0.0062,
"mean_token_accuracy": 0.9983613193035126,
"step": 1975
},
{
"epoch": 14.244604316546763,
"grad_norm": 0.039564667117860226,
"learning_rate": 0.00012594254180122886,
"loss": 0.006,
"mean_token_accuracy": 0.9983247637748718,
"step": 1980
},
{
"epoch": 14.280575539568344,
"grad_norm": 0.03305788290352457,
"learning_rate": 0.00012553810120338786,
"loss": 0.0054,
"mean_token_accuracy": 0.9987037897109985,
"step": 1985
},
{
"epoch": 14.316546762589928,
"grad_norm": 0.059088222287712794,
"learning_rate": 0.000125133213231697,
"loss": 0.0053,
"mean_token_accuracy": 0.9985817670822144,
"step": 1990
},
{
"epoch": 14.352517985611511,
"grad_norm": 0.023509426202254203,
"learning_rate": 0.00012472788497894236,
"loss": 0.0054,
"mean_token_accuracy": 0.9986183822154999,
"step": 1995
},
{
"epoch": 14.388489208633093,
"grad_norm": 0.022890239589631503,
"learning_rate": 0.00012432212354562298,
"loss": 0.0057,
"mean_token_accuracy": 0.9984715104103088,
"step": 2000
},
{
"epoch": 14.424460431654676,
"grad_norm": 0.04733476932184841,
"learning_rate": 0.00012391593603982618,
"loss": 0.0056,
"mean_token_accuracy": 0.9984348475933075,
"step": 2005
},
{
"epoch": 14.46043165467626,
"grad_norm": 0.07353913032836777,
"learning_rate": 0.0001235093295771032,
"loss": 0.0066,
"mean_token_accuracy": 0.998398095369339,
"step": 2010
},
{
"epoch": 14.496402877697841,
"grad_norm": 0.037687053524519704,
"learning_rate": 0.00012310231128034464,
"loss": 0.0056,
"mean_token_accuracy": 0.9984593033790589,
"step": 2015
},
{
"epoch": 14.532374100719425,
"grad_norm": 0.04717362898069163,
"learning_rate": 0.00012269488827965536,
"loss": 0.0058,
"mean_token_accuracy": 0.9983981728553772,
"step": 2020
},
{
"epoch": 14.568345323741006,
"grad_norm": 0.03594233576079112,
"learning_rate": 0.00012228706771223,
"loss": 0.0056,
"mean_token_accuracy": 0.9984471380710602,
"step": 2025
},
{
"epoch": 14.60431654676259,
"grad_norm": 0.03917732285509177,
"learning_rate": 0.00012187885672222752,
"loss": 0.006,
"mean_token_accuracy": 0.9983980774879455,
"step": 2030
},
{
"epoch": 14.640287769784173,
"grad_norm": 0.02633180057530005,
"learning_rate": 0.00012147026246064644,
"loss": 0.0065,
"mean_token_accuracy": 0.9982512533664704,
"step": 2035
},
{
"epoch": 14.676258992805755,
"grad_norm": 0.0404471117892732,
"learning_rate": 0.00012106129208519934,
"loss": 0.0056,
"mean_token_accuracy": 0.9985327005386353,
"step": 2040
},
{
"epoch": 14.712230215827338,
"grad_norm": 0.06192069060353017,
"learning_rate": 0.00012065195276018746,
"loss": 0.0058,
"mean_token_accuracy": 0.9984227299690247,
"step": 2045
},
{
"epoch": 14.748201438848922,
"grad_norm": 0.04930101254092268,
"learning_rate": 0.00012024225165637531,
"loss": 0.0062,
"mean_token_accuracy": 0.9983979761600494,
"step": 2050
},
{
"epoch": 14.784172661870503,
"grad_norm": 0.024850771626895557,
"learning_rate": 0.00011983219595086506,
"loss": 0.0061,
"mean_token_accuracy": 0.998300313949585,
"step": 2055
},
{
"epoch": 14.820143884892087,
"grad_norm": 0.038918118854958376,
"learning_rate": 0.00011942179282697064,
"loss": 0.006,
"mean_token_accuracy": 0.9983371019363403,
"step": 2060
},
{
"epoch": 14.85611510791367,
"grad_norm": 0.0516184338246842,
"learning_rate": 0.00011901104947409212,
"loss": 0.0059,
"mean_token_accuracy": 0.9983981013298034,
"step": 2065
},
{
"epoch": 14.892086330935252,
"grad_norm": 0.10462590843104572,
"learning_rate": 0.00011859997308758959,
"loss": 0.0066,
"mean_token_accuracy": 0.9981902480125427,
"step": 2070
},
{
"epoch": 14.928057553956835,
"grad_norm": 0.09491933759966162,
"learning_rate": 0.00011818857086865725,
"loss": 0.0067,
"mean_token_accuracy": 0.9982022881507874,
"step": 2075
},
{
"epoch": 14.964028776978417,
"grad_norm": 0.022774250628572534,
"learning_rate": 0.00011777685002419717,
"loss": 0.0057,
"mean_token_accuracy": 0.9985937774181366,
"step": 2080
},
{
"epoch": 15.0,
"grad_norm": 0.034773500298789874,
"learning_rate": 0.00011736481776669306,
"loss": 0.006,
"mean_token_accuracy": 0.9984226584434509,
"step": 2085
},
{
"epoch": 15.0,
"eval_loss": 0.14427591860294342,
"eval_mean_token_accuracy": 0.9827392026782036,
"eval_runtime": 20.6283,
"eval_samples_per_second": 5.914,
"eval_steps_per_second": 0.776,
"step": 2085
},
{
"epoch": 15.035971223021583,
"grad_norm": 0.035104691061661225,
"learning_rate": 0.00011695248131408394,
"loss": 0.0052,
"mean_token_accuracy": 0.9986181318759918,
"step": 2090
},
{
"epoch": 15.071942446043165,
"grad_norm": 0.03200252235283823,
"learning_rate": 0.00011653984788963775,
"loss": 0.0046,
"mean_token_accuracy": 0.9987406134605408,
"step": 2095
},
{
"epoch": 15.107913669064748,
"grad_norm": 0.15594026975405056,
"learning_rate": 0.00011612692472182463,
"loss": 0.0051,
"mean_token_accuracy": 0.9986916542053222,
"step": 2100
},
{
"epoch": 15.14388489208633,
"grad_norm": 0.055765964735062255,
"learning_rate": 0.00011571371904419053,
"loss": 0.0053,
"mean_token_accuracy": 0.998593682050705,
"step": 2105
},
{
"epoch": 15.179856115107913,
"grad_norm": 0.029662600967569834,
"learning_rate": 0.0001153002380952303,
"loss": 0.0051,
"mean_token_accuracy": 0.9984715342521667,
"step": 2110
},
{
"epoch": 15.215827338129497,
"grad_norm": 0.033094414530212134,
"learning_rate": 0.00011488648911826099,
"loss": 0.0056,
"mean_token_accuracy": 0.9985202550888062,
"step": 2115
},
{
"epoch": 15.251798561151078,
"grad_norm": 0.05314626647068158,
"learning_rate": 0.00011447247936129497,
"loss": 0.0059,
"mean_token_accuracy": 0.9983490586280823,
"step": 2120
},
{
"epoch": 15.287769784172662,
"grad_norm": 0.045699141908757346,
"learning_rate": 0.00011405821607691287,
"loss": 0.0061,
"mean_token_accuracy": 0.9984403252601624,
"step": 2125
},
{
"epoch": 15.323741007194245,
"grad_norm": 0.022330079247343013,
"learning_rate": 0.00011364370652213665,
"loss": 0.0059,
"mean_token_accuracy": 0.9984836757183075,
"step": 2130
},
{
"epoch": 15.359712230215827,
"grad_norm": 0.04482842486338068,
"learning_rate": 0.00011322895795830237,
"loss": 0.0061,
"mean_token_accuracy": 0.9984592318534851,
"step": 2135
},
{
"epoch": 15.39568345323741,
"grad_norm": 0.035455200603189345,
"learning_rate": 0.00011281397765093301,
"loss": 0.0056,
"mean_token_accuracy": 0.9985081374645233,
"step": 2140
},
{
"epoch": 15.431654676258994,
"grad_norm": 0.03415653545073392,
"learning_rate": 0.00011239877286961122,
"loss": 0.0059,
"mean_token_accuracy": 0.9984224319458008,
"step": 2145
},
{
"epoch": 15.467625899280575,
"grad_norm": 0.022878497923323426,
"learning_rate": 0.000111983350887852,
"loss": 0.0048,
"mean_token_accuracy": 0.9985940217971802,
"step": 2150
},
{
"epoch": 15.503597122302159,
"grad_norm": 0.04143356216028894,
"learning_rate": 0.00011156771898297525,
"loss": 0.0061,
"mean_token_accuracy": 0.9983247220516205,
"step": 2155
},
{
"epoch": 15.53956834532374,
"grad_norm": 0.031009283054650786,
"learning_rate": 0.00011115188443597821,
"loss": 0.0054,
"mean_token_accuracy": 0.9984225571155548,
"step": 2160
},
{
"epoch": 15.575539568345324,
"grad_norm": 0.022394840577598808,
"learning_rate": 0.000110735854531408,
"loss": 0.0049,
"mean_token_accuracy": 0.9986917078495026,
"step": 2165
},
{
"epoch": 15.611510791366907,
"grad_norm": 0.02091443750785051,
"learning_rate": 0.00011031963655723407,
"loss": 0.0055,
"mean_token_accuracy": 0.9984104752540588,
"step": 2170
},
{
"epoch": 15.647482014388489,
"grad_norm": 0.02380856139135341,
"learning_rate": 0.00010990323780472041,
"loss": 0.0052,
"mean_token_accuracy": 0.9986672401428223,
"step": 2175
},
{
"epoch": 15.683453237410072,
"grad_norm": 0.033810504796281546,
"learning_rate": 0.00010948666556829781,
"loss": 0.0053,
"mean_token_accuracy": 0.9985450327396392,
"step": 2180
},
{
"epoch": 15.719424460431654,
"grad_norm": 0.03600666830072387,
"learning_rate": 0.0001090699271454362,
"loss": 0.0051,
"mean_token_accuracy": 0.9987039625644684,
"step": 2185
},
{
"epoch": 15.755395683453237,
"grad_norm": 0.03847446311106137,
"learning_rate": 0.00010865302983651673,
"loss": 0.0058,
"mean_token_accuracy": 0.9983490526676178,
"step": 2190
},
{
"epoch": 15.79136690647482,
"grad_norm": 0.04082747481136435,
"learning_rate": 0.00010823598094470393,
"loss": 0.0065,
"mean_token_accuracy": 0.9983490526676178,
"step": 2195
},
{
"epoch": 15.827338129496402,
"grad_norm": 0.03422471271789895,
"learning_rate": 0.00010781878777581771,
"loss": 0.0054,
"mean_token_accuracy": 0.9984959781169891,
"step": 2200
},
{
"epoch": 15.863309352517986,
"grad_norm": 0.04659967607202778,
"learning_rate": 0.00010740145763820532,
"loss": 0.0056,
"mean_token_accuracy": 0.9985326588153839,
"step": 2205
},
{
"epoch": 15.899280575539569,
"grad_norm": 0.03223979544330336,
"learning_rate": 0.00010698399784261366,
"loss": 0.0051,
"mean_token_accuracy": 0.9985695660114289,
"step": 2210
},
{
"epoch": 15.93525179856115,
"grad_norm": 0.021828243243609862,
"learning_rate": 0.0001065664157020607,
"loss": 0.0054,
"mean_token_accuracy": 0.998581486940384,
"step": 2215
},
{
"epoch": 15.971223021582734,
"grad_norm": 0.057542338253446325,
"learning_rate": 0.00010614871853170781,
"loss": 0.0054,
"mean_token_accuracy": 0.998410427570343,
"step": 2220
},
{
"epoch": 16.0,
"eval_loss": 0.1483013927936554,
"eval_mean_token_accuracy": 0.9880256205797195,
"eval_runtime": 20.8076,
"eval_samples_per_second": 5.863,
"eval_steps_per_second": 0.769,
"step": 2224
},
{
"epoch": 16.007194244604317,
"grad_norm": 0.02766002764688597,
"learning_rate": 0.00010573091364873132,
"loss": 0.005,
"mean_token_accuracy": 0.9988994002342224,
"step": 2225
},
{
"epoch": 16.0431654676259,
"grad_norm": 0.023692963524001718,
"learning_rate": 0.00010531300837219455,
"loss": 0.0048,
"mean_token_accuracy": 0.998691588640213,
"step": 2230
},
{
"epoch": 16.07913669064748,
"grad_norm": 0.029459937106562046,
"learning_rate": 0.00010489501002291952,
"loss": 0.0053,
"mean_token_accuracy": 0.998544842004776,
"step": 2235
},
{
"epoch": 16.115107913669064,
"grad_norm": 0.035502170379668525,
"learning_rate": 0.00010447692592335861,
"loss": 0.0047,
"mean_token_accuracy": 0.9986058592796325,
"step": 2240
},
{
"epoch": 16.151079136690647,
"grad_norm": 0.03791085471996512,
"learning_rate": 0.00010405876339746636,
"loss": 0.0041,
"mean_token_accuracy": 0.9988628804683686,
"step": 2245
},
{
"epoch": 16.18705035971223,
"grad_norm": 0.02626409405940899,
"learning_rate": 0.00010364052977057126,
"loss": 0.0051,
"mean_token_accuracy": 0.9985937297344207,
"step": 2250
},
{
"epoch": 16.223021582733814,
"grad_norm": 0.04031732388074853,
"learning_rate": 0.00010322223236924727,
"loss": 0.0049,
"mean_token_accuracy": 0.9987038433551788,
"step": 2255
},
{
"epoch": 16.258992805755394,
"grad_norm": 0.01795687371349505,
"learning_rate": 0.00010280387852118554,
"loss": 0.0049,
"mean_token_accuracy": 0.9986060202121735,
"step": 2260
},
{
"epoch": 16.294964028776977,
"grad_norm": 0.04430468807526254,
"learning_rate": 0.00010238547555506614,
"loss": 0.005,
"mean_token_accuracy": 0.9984959602355957,
"step": 2265
},
{
"epoch": 16.33093525179856,
"grad_norm": 0.05249900301460759,
"learning_rate": 0.00010196703080042946,
"loss": 0.0052,
"mean_token_accuracy": 0.9986181795597077,
"step": 2270
},
{
"epoch": 16.366906474820144,
"grad_norm": 0.021493191230659354,
"learning_rate": 0.00010154855158754805,
"loss": 0.0046,
"mean_token_accuracy": 0.998752897977829,
"step": 2275
},
{
"epoch": 16.402877697841728,
"grad_norm": 0.0833831847193212,
"learning_rate": 0.00010113004524729799,
"loss": 0.0057,
"mean_token_accuracy": 0.9984051644802093,
"step": 2280
},
{
"epoch": 16.43884892086331,
"grad_norm": 0.030622972707148307,
"learning_rate": 0.00010071151911103063,
"loss": 0.0055,
"mean_token_accuracy": 0.9984959185123443,
"step": 2285
},
{
"epoch": 16.47482014388489,
"grad_norm": 0.030014388256583948,
"learning_rate": 0.00010029298051044414,
"loss": 0.0049,
"mean_token_accuracy": 0.9984592616558075,
"step": 2290
},
{
"epoch": 16.510791366906474,
"grad_norm": 0.023782333462141935,
"learning_rate": 9.987443677745496e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9986916482448578,
"step": 2295
},
{
"epoch": 16.546762589928058,
"grad_norm": 0.0583624667571686,
"learning_rate": 9.945589524406951e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9984225928783417,
"step": 2300
},
{
"epoch": 16.58273381294964,
"grad_norm": 0.025457368768833585,
"learning_rate": 9.90373632422556e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9985692918300628,
"step": 2305
},
{
"epoch": 16.618705035971225,
"grad_norm": 0.027459291921463742,
"learning_rate": 9.861884810381417e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9986428856849671,
"step": 2310
},
{
"epoch": 16.654676258992804,
"grad_norm": 0.03495997309084576,
"learning_rate": 9.820035716025068e-05,
"loss": 0.005,
"mean_token_accuracy": 0.9984227001667023,
"step": 2315
},
{
"epoch": 16.690647482014388,
"grad_norm": 0.03329874416462551,
"learning_rate": 9.77818977426467e-05,
"loss": 0.0048,
"mean_token_accuracy": 0.998716139793396,
"step": 2320
},
{
"epoch": 16.72661870503597,
"grad_norm": 0.03740885190723497,
"learning_rate": 9.73634771815317e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9985325634479523,
"step": 2325
},
{
"epoch": 16.762589928057555,
"grad_norm": 0.027197902407817453,
"learning_rate": 9.694510280675423e-05,
"loss": 0.005,
"mean_token_accuracy": 0.998703807592392,
"step": 2330
},
{
"epoch": 16.798561151079138,
"grad_norm": 0.025041006989781844,
"learning_rate": 9.652678194735394e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9986181437969208,
"step": 2335
},
{
"epoch": 16.834532374100718,
"grad_norm": 0.024478935060570445,
"learning_rate": 9.610852193143299e-05,
"loss": 0.0053,
"mean_token_accuracy": 0.9984714210033416,
"step": 2340
},
{
"epoch": 16.8705035971223,
"grad_norm": 0.03976876347602272,
"learning_rate": 9.569033008602756e-05,
"loss": 0.0058,
"mean_token_accuracy": 0.9983245432376862,
"step": 2345
},
{
"epoch": 16.906474820143885,
"grad_norm": 0.028228360140161907,
"learning_rate": 9.527221373697973e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9986182987689972,
"step": 2350
},
{
"epoch": 16.942446043165468,
"grad_norm": 0.022048124762114728,
"learning_rate": 9.485418020880907e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9986796140670776,
"step": 2355
},
{
"epoch": 16.97841726618705,
"grad_norm": 0.026867898824077054,
"learning_rate": 9.44362368245842e-05,
"loss": 0.0053,
"mean_token_accuracy": 0.9984837353229523,
"step": 2360
},
{
"epoch": 17.0,
"eval_loss": 0.15214376151561737,
"eval_mean_token_accuracy": 0.986979441209273,
"eval_runtime": 20.6429,
"eval_samples_per_second": 5.91,
"eval_steps_per_second": 0.775,
"step": 2363
},
{
"epoch": 17.014388489208635,
"grad_norm": 0.022380133629246797,
"learning_rate": 9.401839090579462e-05,
"loss": 0.0048,
"mean_token_accuracy": 0.9988689571619034,
"step": 2365
},
{
"epoch": 17.050359712230215,
"grad_norm": 0.017392820309802014,
"learning_rate": 9.360064977222262e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9988016843795776,
"step": 2370
},
{
"epoch": 17.086330935251798,
"grad_norm": 0.028255077169732083,
"learning_rate": 9.31830207418146e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9988139510154724,
"step": 2375
},
{
"epoch": 17.12230215827338,
"grad_norm": 0.024878084296257166,
"learning_rate": 9.276551113055337e-05,
"loss": 0.0045,
"mean_token_accuracy": 0.9986426115036011,
"step": 2380
},
{
"epoch": 17.158273381294965,
"grad_norm": 0.020588745089007283,
"learning_rate": 9.23481282523296e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9987526893615722,
"step": 2385
},
{
"epoch": 17.194244604316548,
"grad_norm": 0.03258067411700634,
"learning_rate": 9.193087941881397e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988873302936554,
"step": 2390
},
{
"epoch": 17.230215827338128,
"grad_norm": 0.032706145299279836,
"learning_rate": 9.151377193932903e-05,
"loss": 0.0052,
"mean_token_accuracy": 0.9985202550888062,
"step": 2395
},
{
"epoch": 17.26618705035971,
"grad_norm": 0.023979072230274387,
"learning_rate": 9.109681312072091e-05,
"loss": 0.0045,
"mean_token_accuracy": 0.9987037956714631,
"step": 2400
},
{
"epoch": 17.302158273381295,
"grad_norm": 0.03210227027143969,
"learning_rate": 9.068001026723166e-05,
"loss": 0.005,
"mean_token_accuracy": 0.9985203862190246,
"step": 2405
},
{
"epoch": 17.33812949640288,
"grad_norm": 0.02958254695418276,
"learning_rate": 9.026337068037122e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9986793696880341,
"step": 2410
},
{
"epoch": 17.37410071942446,
"grad_norm": 0.02003759643098566,
"learning_rate": 8.984690165878921e-05,
"loss": 0.0048,
"mean_token_accuracy": 0.9985570669174194,
"step": 2415
},
{
"epoch": 17.41007194244604,
"grad_norm": 0.01976792000129784,
"learning_rate": 8.943061049814752e-05,
"loss": 0.0045,
"mean_token_accuracy": 0.998789393901825,
"step": 2420
},
{
"epoch": 17.446043165467625,
"grad_norm": 0.022841179893650605,
"learning_rate": 8.901450449099214e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9988627791404724,
"step": 2425
},
{
"epoch": 17.48201438848921,
"grad_norm": 0.02670252796794463,
"learning_rate": 8.859859092662563e-05,
"loss": 0.005,
"mean_token_accuracy": 0.9986181497573853,
"step": 2430
},
{
"epoch": 17.51798561151079,
"grad_norm": 0.025962870388272177,
"learning_rate": 8.818287709097947e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9987486064434051,
"step": 2435
},
{
"epoch": 17.553956834532375,
"grad_norm": 0.02094749161409735,
"learning_rate": 8.776737026648605e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9986182987689972,
"step": 2440
},
{
"epoch": 17.58992805755396,
"grad_norm": 0.02161896425598796,
"learning_rate": 8.735207773195156e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9986915528774262,
"step": 2445
},
{
"epoch": 17.62589928057554,
"grad_norm": 0.018472080207272413,
"learning_rate": 8.693700676242828e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9985081493854523,
"step": 2450
},
{
"epoch": 17.66187050359712,
"grad_norm": 0.02228454767779503,
"learning_rate": 8.652216462908698e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9986059486865997,
"step": 2455
},
{
"epoch": 17.697841726618705,
"grad_norm": 0.040612353279765694,
"learning_rate": 8.610755859908991e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.9985325336456299,
"step": 2460
},
{
"epoch": 17.73381294964029,
"grad_norm": 0.022409599302205964,
"learning_rate": 8.569319593546309e-05,
"loss": 0.0051,
"mean_token_accuracy": 0.9984713613986969,
"step": 2465
},
{
"epoch": 17.769784172661872,
"grad_norm": 0.022125836247721072,
"learning_rate": 8.527908389696936e-05,
"loss": 0.0053,
"mean_token_accuracy": 0.9985570132732391,
"step": 2470
},
{
"epoch": 17.805755395683452,
"grad_norm": 0.023112422737157953,
"learning_rate": 8.486522973798126e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9987773120403289,
"step": 2475
},
{
"epoch": 17.841726618705035,
"grad_norm": 0.019917824060406528,
"learning_rate": 8.445164070835357e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9987040340900422,
"step": 2480
},
{
"epoch": 17.87769784172662,
"grad_norm": 0.01946476215758075,
"learning_rate": 8.403832405329671e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9987283408641815,
"step": 2485
},
{
"epoch": 17.913669064748202,
"grad_norm": 0.0230992291308728,
"learning_rate": 8.362528701324976e-05,
"loss": 0.0054,
"mean_token_accuracy": 0.9984836399555206,
"step": 2490
},
{
"epoch": 17.949640287769785,
"grad_norm": 0.02120013351137524,
"learning_rate": 8.321253682375324e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9986916482448578,
"step": 2495
},
{
"epoch": 17.985611510791365,
"grad_norm": 0.02970778760107319,
"learning_rate": 8.2800080715323e-05,
"loss": 0.0048,
"mean_token_accuracy": 0.9986183524131775,
"step": 2500
},
{
"epoch": 18.0,
"eval_loss": 0.15446293354034424,
"eval_mean_token_accuracy": 0.9858783274888993,
"eval_runtime": 20.4074,
"eval_samples_per_second": 5.978,
"eval_steps_per_second": 0.784,
"step": 2502
},
{
"epoch": 18.02158273381295,
"grad_norm": 0.01959404120242462,
"learning_rate": 8.238792591332299e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.999062736829122,
"step": 2505
},
{
"epoch": 18.057553956834532,
"grad_norm": 0.021486196998924824,
"learning_rate": 8.197607963783889e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9987650036811828,
"step": 2510
},
{
"epoch": 18.093525179856115,
"grad_norm": 0.02817489227154816,
"learning_rate": 8.156454910355183e-05,
"loss": 0.0049,
"mean_token_accuracy": 0.9985775172710418,
"step": 2515
},
{
"epoch": 18.1294964028777,
"grad_norm": 0.026885737104364704,
"learning_rate": 8.115334151961158e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9987282276153564,
"step": 2520
},
{
"epoch": 18.165467625899282,
"grad_norm": 0.02789797766744541,
"learning_rate": 8.07424640895107e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9987159430980682,
"step": 2525
},
{
"epoch": 18.201438848920862,
"grad_norm": 0.025083417796611586,
"learning_rate": 8.033192401095808e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9987037479877472,
"step": 2530
},
{
"epoch": 18.237410071942445,
"grad_norm": 0.019413881714524635,
"learning_rate": 7.99217284757528e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9987403869628906,
"step": 2535
},
{
"epoch": 18.27338129496403,
"grad_norm": 0.024314824267057528,
"learning_rate": 7.951188466965848e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9986917316913605,
"step": 2540
},
{
"epoch": 18.309352517985612,
"grad_norm": 0.02065428863541667,
"learning_rate": 7.910239977227708e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9988263070583343,
"step": 2545
},
{
"epoch": 18.345323741007196,
"grad_norm": 0.022929150955455176,
"learning_rate": 7.869328095692312e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9987528324127197,
"step": 2550
},
{
"epoch": 18.381294964028775,
"grad_norm": 0.024841087612176065,
"learning_rate": 7.828453539049839e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9986672222614288,
"step": 2555
},
{
"epoch": 18.41726618705036,
"grad_norm": 0.02829228325153677,
"learning_rate": 7.787617023336583e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9987404704093933,
"step": 2560
},
{
"epoch": 18.453237410071942,
"grad_norm": 0.019138233811495497,
"learning_rate": 7.74681926392247e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9987406373023987,
"step": 2565
},
{
"epoch": 18.489208633093526,
"grad_norm": 0.02591841929384043,
"learning_rate": 7.706060975498486e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9985324561595916,
"step": 2570
},
{
"epoch": 18.52517985611511,
"grad_norm": 0.017988718910731748,
"learning_rate": 7.665342872064156e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9986671388149262,
"step": 2575
},
{
"epoch": 18.56115107913669,
"grad_norm": 0.022342004155552982,
"learning_rate": 7.624665666915068e-05,
"loss": 0.005,
"mean_token_accuracy": 0.9986057758331299,
"step": 2580
},
{
"epoch": 18.597122302158272,
"grad_norm": 0.02255746365433144,
"learning_rate": 7.584030072630351e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9988750219345093,
"step": 2585
},
{
"epoch": 18.633093525179856,
"grad_norm": 0.02826855123526112,
"learning_rate": 7.543436801060187e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9987161874771118,
"step": 2590
},
{
"epoch": 18.66906474820144,
"grad_norm": 0.027035933219826336,
"learning_rate": 7.502886563313376e-05,
"loss": 0.0046,
"mean_token_accuracy": 0.9986913681030274,
"step": 2595
},
{
"epoch": 18.705035971223023,
"grad_norm": 0.02493410112387194,
"learning_rate": 7.462380069744832e-05,
"loss": 0.0046,
"mean_token_accuracy": 0.9986426711082459,
"step": 2600
},
{
"epoch": 18.741007194244606,
"grad_norm": 0.0262740477326827,
"learning_rate": 7.421918029943181e-05,
"loss": 0.0053,
"mean_token_accuracy": 0.9984836339950561,
"step": 2605
},
{
"epoch": 18.776978417266186,
"grad_norm": 0.020297919413359653,
"learning_rate": 7.381501152718308e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9986794233322144,
"step": 2610
},
{
"epoch": 18.81294964028777,
"grad_norm": 0.024175582709317186,
"learning_rate": 7.341130146088935e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9986303865909576,
"step": 2615
},
{
"epoch": 18.848920863309353,
"grad_norm": 0.02486759536470699,
"learning_rate": 7.30080571727024e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9987650454044342,
"step": 2620
},
{
"epoch": 18.884892086330936,
"grad_norm": 0.026194580551628172,
"learning_rate": 7.26052857266145e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9988141477108001,
"step": 2625
},
{
"epoch": 18.92086330935252,
"grad_norm": 0.024554347584631382,
"learning_rate": 7.220299417833472e-05,
"loss": 0.0045,
"mean_token_accuracy": 0.9986916840076446,
"step": 2630
},
{
"epoch": 18.9568345323741,
"grad_norm": 0.02228534507545218,
"learning_rate": 7.180118957516533e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9986916720867157,
"step": 2635
},
{
"epoch": 18.992805755395683,
"grad_norm": 0.022918389634665453,
"learning_rate": 7.139987895587836e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9986428201198578,
"step": 2640
},
{
"epoch": 19.0,
"eval_loss": 0.1591736525297165,
"eval_mean_token_accuracy": 0.9844231969780393,
"eval_runtime": 20.6939,
"eval_samples_per_second": 5.895,
"eval_steps_per_second": 0.773,
"step": 2641
},
{
"epoch": 19.028776978417266,
"grad_norm": 0.02209360258119876,
"learning_rate": 7.099906935059229e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.998822808265686,
"step": 2645
},
{
"epoch": 19.06474820143885,
"grad_norm": 0.022577796363744688,
"learning_rate": 7.059876778064885e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9988506972789765,
"step": 2650
},
{
"epoch": 19.100719424460433,
"grad_norm": 0.017119283944210285,
"learning_rate": 7.019898125849004e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9987404644489288,
"step": 2655
},
{
"epoch": 19.136690647482013,
"grad_norm": 0.020443101276389687,
"learning_rate": 6.97997167875354e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9987649381160736,
"step": 2660
},
{
"epoch": 19.172661870503596,
"grad_norm": 0.026169139333505773,
"learning_rate": 6.940098136205917e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9988015532493592,
"step": 2665
},
{
"epoch": 19.20863309352518,
"grad_norm": 0.030602977469022324,
"learning_rate": 6.90027819670678e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9987772822380065,
"step": 2670
},
{
"epoch": 19.244604316546763,
"grad_norm": 0.027716828357940895,
"learning_rate": 6.860512557817767e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.998663604259491,
"step": 2675
},
{
"epoch": 19.280575539568346,
"grad_norm": 0.024284021779463878,
"learning_rate": 6.82080191614928e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9986548125743866,
"step": 2680
},
{
"epoch": 19.31654676258993,
"grad_norm": 0.02384668123159404,
"learning_rate": 6.781146967348284e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988385021686554,
"step": 2685
},
{
"epoch": 19.35251798561151,
"grad_norm": 0.023240608787474342,
"learning_rate": 6.741548406086126e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988384068012237,
"step": 2690
},
{
"epoch": 19.388489208633093,
"grad_norm": 0.02403795582857897,
"learning_rate": 6.70200692604636e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988505661487579,
"step": 2695
},
{
"epoch": 19.424460431654676,
"grad_norm": 0.021246091430060766,
"learning_rate": 6.662523219912595e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9987038612365723,
"step": 2700
},
{
"epoch": 19.46043165467626,
"grad_norm": 0.026758139834932266,
"learning_rate": 6.623097979356367e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.998716127872467,
"step": 2705
},
{
"epoch": 19.496402877697843,
"grad_norm": 0.023539086602775094,
"learning_rate": 6.583731895025014e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988018155097962,
"step": 2710
},
{
"epoch": 19.532374100719423,
"grad_norm": 0.023275067809463437,
"learning_rate": 6.544425656529582e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9988016784191132,
"step": 2715
},
{
"epoch": 19.568345323741006,
"grad_norm": 0.01761053119993567,
"learning_rate": 6.505179952432748e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9988506674766541,
"step": 2720
},
{
"epoch": 19.60431654676259,
"grad_norm": 0.01935988487682489,
"learning_rate": 6.465995470236743e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9986671209335327,
"step": 2725
},
{
"epoch": 19.640287769784173,
"grad_norm": 0.02366340939887518,
"learning_rate": 6.426872896371331e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9987650036811828,
"step": 2730
},
{
"epoch": 19.676258992805757,
"grad_norm": 0.02674071489195662,
"learning_rate": 6.387812916181772e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9988261640071869,
"step": 2735
},
{
"epoch": 19.71223021582734,
"grad_norm": 0.02265806365028318,
"learning_rate": 6.348816213916802e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9986304640769958,
"step": 2740
},
{
"epoch": 19.74820143884892,
"grad_norm": 0.017940892755793966,
"learning_rate": 6.309883472716677e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9988262236118317,
"step": 2745
},
{
"epoch": 19.784172661870503,
"grad_norm": 0.020099792386730327,
"learning_rate": 6.271015374601179e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9986548662185669,
"step": 2750
},
{
"epoch": 19.820143884892087,
"grad_norm": 0.0234673815288768,
"learning_rate": 6.232212600457684e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.998654842376709,
"step": 2755
},
{
"epoch": 19.85611510791367,
"grad_norm": 0.02264543742102836,
"learning_rate": 6.193475830029232e-05,
"loss": 0.0047,
"mean_token_accuracy": 0.9985569298267365,
"step": 2760
},
{
"epoch": 19.892086330935253,
"grad_norm": 0.03458114838915935,
"learning_rate": 6.154805741902608e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9987527251243591,
"step": 2765
},
{
"epoch": 19.928057553956833,
"grad_norm": 0.0232135058890561,
"learning_rate": 6.116203013496471e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9988996028900147,
"step": 2770
},
{
"epoch": 19.964028776978417,
"grad_norm": 0.020975513905473402,
"learning_rate": 6.0776683210494766e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9986794054508209,
"step": 2775
},
{
"epoch": 20.0,
"grad_norm": 0.02375239274771892,
"learning_rate": 6.039202339608432e-05,
"loss": 0.0046,
"mean_token_accuracy": 0.9986914992332458,
"step": 2780
},
{
"epoch": 20.0,
"eval_loss": 0.162822425365448,
"eval_mean_token_accuracy": 0.9826169647276402,
"eval_runtime": 20.7014,
"eval_samples_per_second": 5.893,
"eval_steps_per_second": 0.773,
"step": 2780
},
{
"epoch": 20.035971223021583,
"grad_norm": 0.020169789952639114,
"learning_rate": 6.0008057430164755e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9987894237041474,
"step": 2785
},
{
"epoch": 20.071942446043167,
"grad_norm": 0.024950995662450234,
"learning_rate": 5.9624792039012634e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988994717597961,
"step": 2790
},
{
"epoch": 20.107913669064747,
"grad_norm": 0.022289199748791136,
"learning_rate": 5.9242233936631974e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988993704319,
"step": 2795
},
{
"epoch": 20.14388489208633,
"grad_norm": 0.017031427967920152,
"learning_rate": 5.886038982463658e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9988994240760803,
"step": 2800
},
{
"epoch": 20.179856115107913,
"grad_norm": 0.02769921472621731,
"learning_rate": 5.847926639213259e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9989483714103699,
"step": 2805
},
{
"epoch": 20.215827338129497,
"grad_norm": 0.020218427009491082,
"learning_rate": 5.809887031560137e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9988261520862579,
"step": 2810
},
{
"epoch": 20.25179856115108,
"grad_norm": 0.023672867362183343,
"learning_rate": 5.771920825878268e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9987159848213196,
"step": 2815
},
{
"epoch": 20.28776978417266,
"grad_norm": 0.01825857572816428,
"learning_rate": 5.734028687255751e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9987772822380065,
"step": 2820
},
{
"epoch": 20.323741007194243,
"grad_norm": 0.019161583401344055,
"learning_rate": 5.6962112794832144e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9988016784191132,
"step": 2825
},
{
"epoch": 20.359712230215827,
"grad_norm": 0.020980808156140812,
"learning_rate": 5.65846926504214e-05,
"loss": 0.0042,
"mean_token_accuracy": 0.9987405419349671,
"step": 2830
},
{
"epoch": 20.39568345323741,
"grad_norm": 0.02213766137578922,
"learning_rate": 5.620803305093282e-05,
"loss": 0.004,
"mean_token_accuracy": 0.998777174949646,
"step": 2835
},
{
"epoch": 20.431654676258994,
"grad_norm": 0.020197374684862185,
"learning_rate": 5.583214059465094e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.998789495229721,
"step": 2840
},
{
"epoch": 20.467625899280577,
"grad_norm": 0.02848380661932676,
"learning_rate": 5.545702186642132e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9988626658916473,
"step": 2845
},
{
"epoch": 20.503597122302157,
"grad_norm": 0.021978948352305485,
"learning_rate": 5.5082683437535574e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.998862886428833,
"step": 2850
},
{
"epoch": 20.53956834532374,
"grad_norm": 0.02155398957358224,
"learning_rate": 5.470913186561616e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9987405002117157,
"step": 2855
},
{
"epoch": 20.575539568345324,
"grad_norm": 0.021479791644405853,
"learning_rate": 5.433637369450123e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9988384425640107,
"step": 2860
},
{
"epoch": 20.611510791366907,
"grad_norm": 0.020996011807413666,
"learning_rate": 5.39644154541305e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.998924195766449,
"step": 2865
},
{
"epoch": 20.64748201438849,
"grad_norm": 0.01978564420652077,
"learning_rate": 5.359326366043047e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9989973843097687,
"step": 2870
},
{
"epoch": 20.68345323741007,
"grad_norm": 0.02623546539682555,
"learning_rate": 5.322292481520027e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9986303389072418,
"step": 2875
},
{
"epoch": 20.719424460431654,
"grad_norm": 0.024843856079622802,
"learning_rate": 5.285340540599808e-05,
"loss": 0.0044,
"mean_token_accuracy": 0.9985324263572692,
"step": 2880
},
{
"epoch": 20.755395683453237,
"grad_norm": 0.024519174672566837,
"learning_rate": 5.2484711906027084e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9988139152526856,
"step": 2885
},
{
"epoch": 20.79136690647482,
"grad_norm": 0.027279076643368993,
"learning_rate": 5.211685077402246e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988231182098388,
"step": 2890
},
{
"epoch": 20.827338129496404,
"grad_norm": 0.0261791376934865,
"learning_rate": 5.1749828454137996e-05,
"loss": 0.0043,
"mean_token_accuracy": 0.9986548364162445,
"step": 2895
},
{
"epoch": 20.863309352517987,
"grad_norm": 0.025300978285089024,
"learning_rate": 5.138365137583314e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9987772285938263,
"step": 2900
},
{
"epoch": 20.899280575539567,
"grad_norm": 0.02099648760004841,
"learning_rate": 5.101832595376059e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988138735294342,
"step": 2905
},
{
"epoch": 20.93525179856115,
"grad_norm": 0.018269036560369226,
"learning_rate": 5.065385858765383e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.998667049407959,
"step": 2910
},
{
"epoch": 20.971223021582734,
"grad_norm": 0.021112292097246896,
"learning_rate": 5.0290255662214945e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.998618096113205,
"step": 2915
},
{
"epoch": 21.0,
"eval_loss": 0.16446451842784882,
"eval_mean_token_accuracy": 0.9880100190639496,
"eval_runtime": 20.8403,
"eval_samples_per_second": 5.854,
"eval_steps_per_second": 0.768,
"step": 2919
},
{
"epoch": 21.007194244604317,
"grad_norm": 0.022743077396830767,
"learning_rate": 4.992752354700292e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9993276000022888,
"step": 2920
},
{
"epoch": 21.0431654676259,
"grad_norm": 0.020371511578122142,
"learning_rate": 4.956566859632183e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9987894117832183,
"step": 2925
},
{
"epoch": 21.07913669064748,
"grad_norm": 0.016322090768065737,
"learning_rate": 4.920469714910982e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9991074562072754,
"step": 2930
},
{
"epoch": 21.115107913669064,
"grad_norm": 0.02086363099641971,
"learning_rate": 4.8844615528827874e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9989607691764831,
"step": 2935
},
{
"epoch": 21.151079136690647,
"grad_norm": 0.02198403181646824,
"learning_rate": 4.8485430043348955e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9988505244255066,
"step": 2940
},
{
"epoch": 21.18705035971223,
"grad_norm": 0.020254569354367712,
"learning_rate": 4.812714698484784e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.99900963306427,
"step": 2945
},
{
"epoch": 21.223021582733814,
"grad_norm": 0.02242033220917943,
"learning_rate": 4.776977262969057e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.99886274933815,
"step": 2950
},
{
"epoch": 21.258992805755394,
"grad_norm": 0.027896489445715905,
"learning_rate": 4.7413313238324556e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9988628089427948,
"step": 2955
},
{
"epoch": 21.294964028776977,
"grad_norm": 0.026693838170829185,
"learning_rate": 4.705777505516904e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988016128540039,
"step": 2960
},
{
"epoch": 21.33093525179856,
"grad_norm": 0.02617631180403082,
"learning_rate": 4.6703164308505634e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9987404823303223,
"step": 2965
},
{
"epoch": 21.366906474820144,
"grad_norm": 0.02094580431705213,
"learning_rate": 4.63494872103692e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988383114337921,
"step": 2970
},
{
"epoch": 21.402877697841728,
"grad_norm": 0.024972295644906873,
"learning_rate": 4.599674995643909e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9987526178359986,
"step": 2975
},
{
"epoch": 21.43884892086331,
"grad_norm": 0.026511984691810955,
"learning_rate": 4.564495872593041e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9986182391643524,
"step": 2980
},
{
"epoch": 21.47482014388489,
"grad_norm": 0.02770322638498023,
"learning_rate": 4.5294119681486066e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9988994836807251,
"step": 2985
},
{
"epoch": 21.510791366906474,
"grad_norm": 0.025625116896544668,
"learning_rate": 4.494423896906864e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9989360928535461,
"step": 2990
},
{
"epoch": 21.546762589928058,
"grad_norm": 0.026657201513430866,
"learning_rate": 4.459532271785273e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9986671328544616,
"step": 2995
},
{
"epoch": 21.58273381294964,
"grad_norm": 0.023164027774429576,
"learning_rate": 4.42473770401176e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9986792922019958,
"step": 3000
},
{
"epoch": 21.618705035971225,
"grad_norm": 0.01911923815995147,
"learning_rate": 4.390040803114015e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9990341305732727,
"step": 3005
},
{
"epoch": 21.654676258992804,
"grad_norm": 0.01988333819776402,
"learning_rate": 4.355442176908798e-05,
"loss": 0.0045,
"mean_token_accuracy": 0.9987158477306366,
"step": 3010
},
{
"epoch": 21.690647482014388,
"grad_norm": 0.024407661206665206,
"learning_rate": 4.3209424314913174e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9989240050315857,
"step": 3015
},
{
"epoch": 21.72661870503597,
"grad_norm": 0.021530015839347865,
"learning_rate": 4.286542171224589e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9987647831439972,
"step": 3020
},
{
"epoch": 21.762589928057555,
"grad_norm": 0.02141074113834186,
"learning_rate": 4.252241998728861e-05,
"loss": 0.0041,
"mean_token_accuracy": 0.9986670732498169,
"step": 3025
},
{
"epoch": 21.798561151079138,
"grad_norm": 0.02332310027280023,
"learning_rate": 4.218042514871058e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988139569759369,
"step": 3030
},
{
"epoch": 21.834532374100718,
"grad_norm": 0.02040834265816528,
"learning_rate": 4.183944318754238e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9987895727157593,
"step": 3035
},
{
"epoch": 21.8705035971223,
"grad_norm": 0.025359310719483122,
"learning_rate": 4.149948007707126e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.998899495601654,
"step": 3040
},
{
"epoch": 21.906474820143885,
"grad_norm": 0.02406626773998849,
"learning_rate": 4.116054177273627e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9986980199813843,
"step": 3045
},
{
"epoch": 21.942446043165468,
"grad_norm": 0.020726891185531986,
"learning_rate": 4.082263421202403e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9987283110618591,
"step": 3050
},
{
"epoch": 21.97841726618705,
"grad_norm": 0.019329853559948335,
"learning_rate": 4.0485763314364735e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988263309001922,
"step": 3055
},
{
"epoch": 22.0,
"eval_loss": 0.16608409583568573,
"eval_mean_token_accuracy": 0.987030259587548,
"eval_runtime": 20.6843,
"eval_samples_per_second": 5.898,
"eval_steps_per_second": 0.774,
"step": 3058
},
{
"epoch": 22.014388489208635,
"grad_norm": 0.021157088143697933,
"learning_rate": 4.0149934981028294e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9990525096654892,
"step": 3060
},
{
"epoch": 22.050359712230215,
"grad_norm": 0.02223133996568501,
"learning_rate": 3.9815155095021215e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9990095376968384,
"step": 3065
},
{
"epoch": 22.086330935251798,
"grad_norm": 0.022466272840321055,
"learning_rate": 3.948142952098336e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9988016068935395,
"step": 3070
},
{
"epoch": 22.12230215827338,
"grad_norm": 0.021919057069398156,
"learning_rate": 3.914876410508528e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988504886627197,
"step": 3075
},
{
"epoch": 22.158273381294965,
"grad_norm": 0.024330160771715296,
"learning_rate": 3.8817164674925766e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988466024398803,
"step": 3080
},
{
"epoch": 22.194244604316548,
"grad_norm": 0.02730664438385192,
"learning_rate": 3.848663703942981e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988750219345093,
"step": 3085
},
{
"epoch": 22.230215827338128,
"grad_norm": 0.02072812988318122,
"learning_rate": 3.815718698874672e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9990951418876648,
"step": 3090
},
{
"epoch": 22.26618705035971,
"grad_norm": 0.021412173412854167,
"learning_rate": 3.78288202941489e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9989484190940857,
"step": 3095
},
{
"epoch": 22.302158273381295,
"grad_norm": 0.017054242580422836,
"learning_rate": 3.750154270793058e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9989240109920502,
"step": 3100
},
{
"epoch": 22.33812949640288,
"grad_norm": 0.01980247762228368,
"learning_rate": 3.717535996330711e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.998972886800766,
"step": 3105
},
{
"epoch": 22.37410071942446,
"grad_norm": 0.02184699946449326,
"learning_rate": 3.6850277774314544e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9989607214927674,
"step": 3110
},
{
"epoch": 22.41007194244604,
"grad_norm": 0.013978977988707543,
"learning_rate": 3.652630183570941e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.999009644985199,
"step": 3115
},
{
"epoch": 22.446043165467625,
"grad_norm": 0.025100662338247242,
"learning_rate": 3.620343782286917e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988382995128632,
"step": 3120
},
{
"epoch": 22.48201438848921,
"grad_norm": 0.018821374531494295,
"learning_rate": 3.588169139169263e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9988874316215515,
"step": 3125
},
{
"epoch": 22.51798561151079,
"grad_norm": 0.023763269773152667,
"learning_rate": 3.5561068178500945e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9987893342971802,
"step": 3130
},
{
"epoch": 22.553956834532375,
"grad_norm": 0.021714269090932634,
"learning_rate": 3.524157379993882e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9989117383956909,
"step": 3135
},
{
"epoch": 22.58992805755396,
"grad_norm": 0.034796645480556165,
"learning_rate": 3.49232138528762e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9988261342048645,
"step": 3140
},
{
"epoch": 22.62589928057554,
"grad_norm": 0.02326412744594279,
"learning_rate": 3.460599391431008e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9989117622375489,
"step": 3145
},
{
"epoch": 22.66187050359712,
"grad_norm": 0.025486689306682863,
"learning_rate": 3.428991954126698e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9987648904323578,
"step": 3150
},
{
"epoch": 22.697841726618705,
"grad_norm": 0.020440416523992887,
"learning_rate": 3.397499627070552e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9989974737167359,
"step": 3155
},
{
"epoch": 22.73381294964029,
"grad_norm": 0.02978506453625392,
"learning_rate": 3.366122961941937e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9986671507358551,
"step": 3160
},
{
"epoch": 22.769784172661872,
"grad_norm": 0.03112304520385107,
"learning_rate": 3.3348625083940785e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988505899906158,
"step": 3165
},
{
"epoch": 22.805755395683452,
"grad_norm": 0.02655579795920087,
"learning_rate": 3.3037188140443995e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9989850640296936,
"step": 3170
},
{
"epoch": 22.841726618705035,
"grad_norm": 0.022226423897356053,
"learning_rate": 3.2726924244649636e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9986669540405273,
"step": 3175
},
{
"epoch": 22.87769784172662,
"grad_norm": 0.01836212562974761,
"learning_rate": 3.241783883172895e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988262414932251,
"step": 3180
},
{
"epoch": 22.913669064748202,
"grad_norm": 0.025618306976980545,
"learning_rate": 3.210993731620867e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9986548483371734,
"step": 3185
},
{
"epoch": 22.949640287769785,
"grad_norm": 0.02206393950967854,
"learning_rate": 3.180322509187612e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9987526178359986,
"step": 3190
},
{
"epoch": 22.985611510791365,
"grad_norm": 0.025248641779480812,
"learning_rate": 3.149770753168468e-05,
"loss": 0.004,
"mean_token_accuracy": 0.9987527012825013,
"step": 3195
},
{
"epoch": 23.0,
"eval_loss": 0.16851434111595154,
"eval_mean_token_accuracy": 0.9858345746994018,
"eval_runtime": 20.6794,
"eval_samples_per_second": 5.9,
"eval_steps_per_second": 0.774,
"step": 3197
},
{
"epoch": 23.02158273381295,
"grad_norm": 0.019134482337190666,
"learning_rate": 3.119338998765984e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9988585710525513,
"step": 3200
},
{
"epoch": 23.057553956834532,
"grad_norm": 0.017875463612263744,
"learning_rate": 3.089027779080522e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.998948335647583,
"step": 3205
},
{
"epoch": 23.093525179856115,
"grad_norm": 0.023438769430109335,
"learning_rate": 3.0588376251009386e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9990340173244476,
"step": 3210
},
{
"epoch": 23.1294964028777,
"grad_norm": 0.021359479774257485,
"learning_rate": 3.0287690656952673e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988463282585144,
"step": 3215
},
{
"epoch": 23.165467625899282,
"grad_norm": 0.01767366924264051,
"learning_rate": 2.9988226276014664e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9990708291530609,
"step": 3220
},
{
"epoch": 23.201438848920862,
"grad_norm": 0.024005445801463642,
"learning_rate": 2.968998835418174e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9989728808403016,
"step": 3225
},
{
"epoch": 23.237410071942445,
"grad_norm": 0.023067638497095064,
"learning_rate": 2.9392982115955414e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9990340113639832,
"step": 3230
},
{
"epoch": 23.27338129496403,
"grad_norm": 0.02597606132182271,
"learning_rate": 2.909721276426064e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9989606022834778,
"step": 3235
},
{
"epoch": 23.309352517985612,
"grad_norm": 0.029591792929044805,
"learning_rate": 2.880268548035473e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9990462243556977,
"step": 3240
},
{
"epoch": 23.345323741007196,
"grad_norm": 0.027328046647646854,
"learning_rate": 2.8509405423736603e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9987404048442841,
"step": 3245
},
{
"epoch": 23.381294964028775,
"grad_norm": 0.025252117211303413,
"learning_rate": 2.8217377732056304e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988750219345093,
"step": 3250
},
{
"epoch": 23.41726618705036,
"grad_norm": 0.02406714011699659,
"learning_rate": 2.792660752102514e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.998985105752945,
"step": 3255
},
{
"epoch": 23.453237410071942,
"grad_norm": 0.022721023494655553,
"learning_rate": 2.7637099884326e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9989973485469819,
"step": 3260
},
{
"epoch": 23.489208633093526,
"grad_norm": 0.02301823912808615,
"learning_rate": 2.7348859893524105e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988383531570435,
"step": 3265
},
{
"epoch": 23.52517985611511,
"grad_norm": 0.02705894303804802,
"learning_rate": 2.7061892597978177e-05,
"loss": 0.0037,
"mean_token_accuracy": 0.9987648069858551,
"step": 3270
},
{
"epoch": 23.56115107913669,
"grad_norm": 0.01942725691392434,
"learning_rate": 2.6776203024752055e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988993644714356,
"step": 3275
},
{
"epoch": 23.597122302158272,
"grad_norm": 0.027402506121057685,
"learning_rate": 2.6491796178526453e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9988141000270844,
"step": 3280
},
{
"epoch": 23.633093525179856,
"grad_norm": 0.025450594562071174,
"learning_rate": 2.6208677041511488e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9988994300365448,
"step": 3285
},
{
"epoch": 23.66906474820144,
"grad_norm": 0.023278496141033778,
"learning_rate": 2.5926850573359317e-05,
"loss": 0.0038,
"mean_token_accuracy": 0.9987892925739288,
"step": 3290
},
{
"epoch": 23.705035971223023,
"grad_norm": 0.01949407921179753,
"learning_rate": 2.5646321711077227e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9989118635654449,
"step": 3295
},
{
"epoch": 23.741007194244606,
"grad_norm": 0.022066203733673222,
"learning_rate": 2.536709536894123e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9988872468471527,
"step": 3300
},
{
"epoch": 23.776978417266186,
"grad_norm": 0.021463047376446793,
"learning_rate": 2.508917643840981e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9988751173019409,
"step": 3305
},
{
"epoch": 23.81294964028777,
"grad_norm": 0.025166598560956936,
"learning_rate": 2.4812569788038463e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988750696182251,
"step": 3310
},
{
"epoch": 23.848920863309353,
"grad_norm": 0.017097642676170137,
"learning_rate": 2.4537280263394258e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.998960679769516,
"step": 3315
},
{
"epoch": 23.884892086330936,
"grad_norm": 0.028106428762610353,
"learning_rate": 2.4263312686970986e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988138198852539,
"step": 3320
},
{
"epoch": 23.92086330935252,
"grad_norm": 0.04083138822929284,
"learning_rate": 2.3990671858104662e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9989116787910461,
"step": 3325
},
{
"epoch": 23.9568345323741,
"grad_norm": 0.022311007545414503,
"learning_rate": 2.3719362552889536e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988382458686829,
"step": 3330
},
{
"epoch": 23.992805755395683,
"grad_norm": 0.02668608739885611,
"learning_rate": 2.3449389524094266e-05,
"loss": 0.0039,
"mean_token_accuracy": 0.9985814273357392,
"step": 3335
},
{
"epoch": 24.0,
"eval_loss": 0.16874690353870392,
"eval_mean_token_accuracy": 0.9842985835340288,
"eval_runtime": 20.8056,
"eval_samples_per_second": 5.864,
"eval_steps_per_second": 0.769,
"step": 3336
},
{
"epoch": 24.028776978417266,
"grad_norm": 0.018047194943347125,
"learning_rate": 2.3180757501078843e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9989758655428886,
"step": 3340
},
{
"epoch": 24.06474820143885,
"grad_norm": 0.023899729196053907,
"learning_rate": 2.291347118971162e-05,
"loss": 0.0033,
"mean_token_accuracy": 0.9988993644714356,
"step": 3345
},
{
"epoch": 24.100719424460433,
"grad_norm": 0.018510819166514855,
"learning_rate": 2.2647535272286912e-05,
"loss": 0.003,
"mean_token_accuracy": 0.99908287525177,
"step": 3350
},
{
"epoch": 24.136690647482013,
"grad_norm": 0.02938331730686069,
"learning_rate": 2.2382954407443003e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9989606201648712,
"step": 3355
},
{
"epoch": 24.172661870503596,
"grad_norm": 0.020826402133998383,
"learning_rate": 2.2119733230080408e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9990829288959503,
"step": 3360
},
{
"epoch": 24.20863309352518,
"grad_norm": 0.0290831582379025,
"learning_rate": 2.185787635128086e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9988750517368317,
"step": 3365
},
{
"epoch": 24.244604316546763,
"grad_norm": 0.027772364062720812,
"learning_rate": 2.15973883582265e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9989484786987305,
"step": 3370
},
{
"epoch": 24.280575539568346,
"grad_norm": 0.025911646510706054,
"learning_rate": 2.1338273814119325e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9990463495254517,
"step": 3375
},
{
"epoch": 24.31654676258993,
"grad_norm": 0.031011036083864605,
"learning_rate": 2.1080537258101517e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988994300365448,
"step": 3380
},
{
"epoch": 24.35251798561151,
"grad_norm": 0.03048137643406368,
"learning_rate": 2.0824183205175706e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988261342048645,
"step": 3385
},
{
"epoch": 24.388489208633093,
"grad_norm": 0.02484851509688683,
"learning_rate": 2.0569216146126014e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.998923909664154,
"step": 3390
},
{
"epoch": 24.424460431654676,
"grad_norm": 0.019607288520278966,
"learning_rate": 2.031564054743943e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9990830242633819,
"step": 3395
},
{
"epoch": 24.46043165467626,
"grad_norm": 0.023840339055394528,
"learning_rate": 2.0063460851227345e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9991563498973847,
"step": 3400
},
{
"epoch": 24.496402877697843,
"grad_norm": 0.027657508791596168,
"learning_rate": 1.9812681475147942e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.998960655927658,
"step": 3405
},
{
"epoch": 24.532374100719423,
"grad_norm": 0.025748643261759012,
"learning_rate": 1.9563306812328763e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988259911537171,
"step": 3410
},
{
"epoch": 24.568345323741006,
"grad_norm": 0.020585045016937135,
"learning_rate": 1.931534123128965e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9988994836807251,
"step": 3415
},
{
"epoch": 24.60431654676259,
"grad_norm": 0.024469587326783445,
"learning_rate": 1.9068789075866355e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9990095853805542,
"step": 3420
},
{
"epoch": 24.640287769784173,
"grad_norm": 0.021728578764992675,
"learning_rate": 1.882365466513437e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9988505184650421,
"step": 3425
},
{
"epoch": 24.676258992805757,
"grad_norm": 0.019942244796565842,
"learning_rate": 1.8579942293333286e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9990096926689148,
"step": 3430
},
{
"epoch": 24.71223021582734,
"grad_norm": 0.02821479641357632,
"learning_rate": 1.8337656229791577e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9990096509456634,
"step": 3435
},
{
"epoch": 24.74820143884892,
"grad_norm": 0.02455993351782378,
"learning_rate": 1.8096800718851705e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9989973723888397,
"step": 3440
},
{
"epoch": 24.784172661870503,
"grad_norm": 0.028363038053404677,
"learning_rate": 1.785737997979594e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9989820778369903,
"step": 3445
},
{
"epoch": 24.820143884892087,
"grad_norm": 0.02773422789629047,
"learning_rate": 1.761939820677241e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9987894833087921,
"step": 3450
},
{
"epoch": 24.85611510791367,
"grad_norm": 0.026309951879648234,
"learning_rate": 1.7382859568721465e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.999070692062378,
"step": 3455
},
{
"epoch": 24.892086330935253,
"grad_norm": 0.022282492985843325,
"learning_rate": 1.714776820930283e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988258957862854,
"step": 3460
},
{
"epoch": 24.928057553956833,
"grad_norm": 0.024990648055085517,
"learning_rate": 1.691412824682297e-05,
"loss": 0.0036,
"mean_token_accuracy": 0.9988260388374328,
"step": 3465
},
{
"epoch": 24.964028776978417,
"grad_norm": 0.029113720763185476,
"learning_rate": 1.6681943774162823e-05,
"loss": 0.0034,
"mean_token_accuracy": 0.9988504767417907,
"step": 3470
},
{
"epoch": 25.0,
"grad_norm": 0.029421600632342313,
"learning_rate": 1.6451218858706374e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9988382577896118,
"step": 3475
},
{
"epoch": 25.0,
"eval_loss": 0.17007607221603394,
"eval_mean_token_accuracy": 0.9825018458068371,
"eval_runtime": 20.6952,
"eval_samples_per_second": 5.895,
"eval_steps_per_second": 0.773,
"step": 3475
},
{
"epoch": 25.035971223021583,
"grad_norm": 0.02049594957895056,
"learning_rate": 1.622195754226906e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9991562008857727,
"step": 3480
},
{
"epoch": 25.071942446043167,
"grad_norm": 0.018619055614502295,
"learning_rate": 1.5994163841027266e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9991196513175964,
"step": 3485
},
{
"epoch": 25.107913669064747,
"grad_norm": 0.022576514305950958,
"learning_rate": 1.57678417454478e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9988995909690856,
"step": 3490
},
{
"epoch": 25.14388489208633,
"grad_norm": 0.02334590282868171,
"learning_rate": 1.554299522021796e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9989116668701172,
"step": 3495
},
{
"epoch": 25.179856115107913,
"grad_norm": 0.02392645596443554,
"learning_rate": 1.5319628204176307e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9991563737392426,
"step": 3500
},
{
"epoch": 25.215827338129497,
"grad_norm": 0.02789413194367586,
"learning_rate": 1.5097744610243403e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.999180793762207,
"step": 3505
},
{
"epoch": 25.25179856115108,
"grad_norm": 0.018264027514364508,
"learning_rate": 1.4877348325353368e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9989115953445434,
"step": 3510
},
{
"epoch": 25.28776978417266,
"grad_norm": 0.026930678697230023,
"learning_rate": 1.4658443210385863e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9990339398384094,
"step": 3515
},
{
"epoch": 25.323741007194243,
"grad_norm": 0.023975938540105726,
"learning_rate": 1.44410331000983e-05,
"loss": 0.0025,
"mean_token_accuracy": 0.9990951836109161,
"step": 3520
},
{
"epoch": 25.359712230215827,
"grad_norm": 0.025955619076464948,
"learning_rate": 1.4225121803058794e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9990216612815856,
"step": 3525
},
{
"epoch": 25.39568345323741,
"grad_norm": 0.02577267491308173,
"learning_rate": 1.4010713101579486e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9990583598613739,
"step": 3530
},
{
"epoch": 25.431654676258994,
"grad_norm": 0.023780752055292367,
"learning_rate": 1.3797810751650032e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9988504767417907,
"step": 3535
},
{
"epoch": 25.467625899280577,
"grad_norm": 0.026803889925486716,
"learning_rate": 1.35864184828721e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9988995373249054,
"step": 3540
},
{
"epoch": 25.503597122302157,
"grad_norm": 0.023040368859518934,
"learning_rate": 1.33765399983939e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9989809155464172,
"step": 3545
},
{
"epoch": 25.53956834532374,
"grad_norm": 0.025198060678819276,
"learning_rate": 1.3168178974845225e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.9990095555782318,
"step": 3550
},
{
"epoch": 25.575539568345324,
"grad_norm": 0.02916409633819172,
"learning_rate": 1.2961339062273314e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9990462839603425,
"step": 3555
},
{
"epoch": 25.611510791366907,
"grad_norm": 0.0278874264136872,
"learning_rate": 1.275602388407856e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9989850282669067,
"step": 3560
},
{
"epoch": 25.64748201438849,
"grad_norm": 0.027646524866529763,
"learning_rate": 1.255223703695132e-05,
"loss": 0.003,
"mean_token_accuracy": 0.999070692062378,
"step": 3565
},
{
"epoch": 25.68345323741007,
"grad_norm": 0.025857527943575487,
"learning_rate": 1.2349982090808821e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9990462481975555,
"step": 3570
},
{
"epoch": 25.719424460431654,
"grad_norm": 0.02286550422259087,
"learning_rate": 1.214926258873247e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.9988381743431092,
"step": 3575
},
{
"epoch": 25.755395683453237,
"grad_norm": 0.027097388349150865,
"learning_rate": 1.1950082046906086e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9989361703395844,
"step": 3580
},
{
"epoch": 25.79136690647482,
"grad_norm": 0.02303610795273582,
"learning_rate": 1.1752443954554082e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9990462839603425,
"step": 3585
},
{
"epoch": 25.827338129496404,
"grad_norm": 0.02608991544129121,
"learning_rate": 1.1556351773880337e-05,
"loss": 0.0032,
"mean_token_accuracy": 0.99886274933815,
"step": 3590
},
{
"epoch": 25.863309352517987,
"grad_norm": 0.023156647103821718,
"learning_rate": 1.1361808940007668e-05,
"loss": 0.0029,
"mean_token_accuracy": 0.9989973545074463,
"step": 3595
},
{
"epoch": 25.899280575539567,
"grad_norm": 0.0243464595811653,
"learning_rate": 1.1168818860917574e-05,
"loss": 0.0031,
"mean_token_accuracy": 0.9989239156246186,
"step": 3600
},
{
"epoch": 25.93525179856115,
"grad_norm": 0.02981427171507079,
"learning_rate": 1.0977384917390576e-05,
"loss": 0.0027,
"mean_token_accuracy": 0.9990585505962372,
"step": 3605
},
{
"epoch": 25.971223021582734,
"grad_norm": 0.032037378944090256,
"learning_rate": 1.078751046294697e-05,
"loss": 0.0035,
"mean_token_accuracy": 0.9986914873123169,
"step": 3610
},
{
"epoch": 26.0,
"eval_loss": 0.17206676304340363,
"eval_mean_token_accuracy": 0.9879742885629336,
"eval_runtime": 20.7606,
"eval_samples_per_second": 5.877,
"eval_steps_per_second": 0.771,
"step": 3614
},
{
"epoch": 26.007194244604317,
"grad_norm": 0.0213461708648987,
"learning_rate": 1.0599198823788025e-05,
"loss": 0.003,
"mean_token_accuracy": 0.9992053210735321,
"step": 3615
},
{
"epoch": 26.0431654676259,
"grad_norm": 0.023428034716649632,
"learning_rate": 1.0412453298737823e-05,
"loss": 0.0027,
"mean_token_accuracy": 0.9991684496402741,
"step": 3620
},
{
"epoch": 26.07913669064748,
"grad_norm": 0.023348574386097033,
"learning_rate": 1.0227277159185422e-05,
"loss": 0.0026,
"mean_token_accuracy": 0.9990829169750214,
"step": 3625
},
{
"epoch": 26.115107913669064,
"grad_norm": 0.021739767766830953,
"learning_rate": 1.0043673649027518e-05,
"loss": 0.0028,
"mean_token_accuracy": 0.999131840467453,
"step": 3630
},
{
"epoch": 26.151079136690647,
"grad_norm": 0.019442840163705226,
"learning_rate": 9.861645984611678e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9991685032844544,
"step": 3635
},
{
"epoch": 26.18705035971223,
"grad_norm": 0.019504618622678677,
"learning_rate": 9.681197354679949e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9990584969520568,
"step": 3640
},
{
"epoch": 26.223021582733814,
"grad_norm": 0.021803811310150526,
"learning_rate": 9.502330920312974e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9989483237266541,
"step": 3645
},
{
"epoch": 26.258992805755394,
"grad_norm": 0.0290446844890043,
"learning_rate": 9.325049814874732e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9990217745304107,
"step": 3650
},
{
"epoch": 26.294964028776977,
"grad_norm": 0.02127288345338018,
"learning_rate": 9.149357143957471e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9991196155548095,
"step": 3655
},
{
"epoch": 26.33093525179856,
"grad_norm": 0.026538142153862753,
"learning_rate": 8.975255985327524e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.999070692062378,
"step": 3660
},
{
"epoch": 26.366906474820144,
"grad_norm": 0.023809198775522854,
"learning_rate": 8.802749388871224e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990461349487305,
"step": 3665
},
{
"epoch": 26.402877697841728,
"grad_norm": 0.03172209954402754,
"learning_rate": 8.631840376541457e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990951240062713,
"step": 3670
},
{
"epoch": 26.43884892086331,
"grad_norm": 0.025297799542257252,
"learning_rate": 8.462531942304896e-06,
"loss": 0.003,
"mean_token_accuracy": 0.999021691083908,
"step": 3675
},
{
"epoch": 26.47482014388489,
"grad_norm": 0.026186042332624757,
"learning_rate": 8.294827052089393e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9989819586277008,
"step": 3680
},
{
"epoch": 26.510791366906474,
"grad_norm": 0.02062785999993999,
"learning_rate": 8.128728643732108e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9989850223064423,
"step": 3685
},
{
"epoch": 26.546762589928058,
"grad_norm": 0.025889886568629488,
"learning_rate": 7.964239626927994e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.999095219373703,
"step": 3690
},
{
"epoch": 26.58273381294964,
"grad_norm": 0.022489670251659637,
"learning_rate": 7.801362883178876e-06,
"loss": 0.0024,
"mean_token_accuracy": 0.9991442322731018,
"step": 3695
},
{
"epoch": 26.618705035971225,
"grad_norm": 0.024715714144679234,
"learning_rate": 7.640101265742883e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9989606618881226,
"step": 3700
},
{
"epoch": 26.654676258992804,
"grad_norm": 0.026901869610419675,
"learning_rate": 7.480457599584601e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9989850759506226,
"step": 3705
},
{
"epoch": 26.690647482014388,
"grad_norm": 0.025055977716386093,
"learning_rate": 7.3224346813254626e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9990584552288055,
"step": 3710
},
{
"epoch": 26.72661870503597,
"grad_norm": 0.03167579002764372,
"learning_rate": 7.166035279194816e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9991685152053833,
"step": 3715
},
{
"epoch": 26.762589928057555,
"grad_norm": 0.02242351163674594,
"learning_rate": 7.011262132981456e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9989973664283752,
"step": 3720
},
{
"epoch": 26.798561151079138,
"grad_norm": 0.02247426874697418,
"learning_rate": 6.85811795398551e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9991073191165925,
"step": 3725
},
{
"epoch": 26.834532374100718,
"grad_norm": 0.03351273807743497,
"learning_rate": 6.706605424971091e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9990706741809845,
"step": 3730
},
{
"epoch": 26.8705035971223,
"grad_norm": 0.028717775489285478,
"learning_rate": 6.556727200119217e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990462124347687,
"step": 3735
},
{
"epoch": 26.906474820143885,
"grad_norm": 0.030401223292522324,
"learning_rate": 6.408485904981332e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9988750994205475,
"step": 3740
},
{
"epoch": 26.942446043165468,
"grad_norm": 0.029390890433714404,
"learning_rate": 6.261884136433327e-06,
"loss": 0.0032,
"mean_token_accuracy": 0.9988627254962921,
"step": 3745
},
{
"epoch": 26.97841726618705,
"grad_norm": 0.027232194452777886,
"learning_rate": 6.116924462629992e-06,
"loss": 0.0031,
"mean_token_accuracy": 0.998874968290329,
"step": 3750
},
{
"epoch": 27.0,
"eval_loss": 0.17284731566905975,
"eval_mean_token_accuracy": 0.9869382300160148,
"eval_runtime": 20.6664,
"eval_samples_per_second": 5.903,
"eval_steps_per_second": 0.774,
"step": 3753
},
{
"epoch": 27.014388489208635,
"grad_norm": 0.016100025377121675,
"learning_rate": 5.973609422960103e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9992051720619202,
"step": 3755
},
{
"epoch": 27.050359712230215,
"grad_norm": 0.023343201820734655,
"learning_rate": 5.831941528001894e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9990951895713807,
"step": 3760
},
{
"epoch": 27.086330935251798,
"grad_norm": 0.0248367264595005,
"learning_rate": 5.691923259479093e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9990583717823028,
"step": 3765
},
{
"epoch": 27.12230215827338,
"grad_norm": 0.024747413248307617,
"learning_rate": 5.55355707021743e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9989116430282593,
"step": 3770
},
{
"epoch": 27.158273381294965,
"grad_norm": 0.02630731839366515,
"learning_rate": 5.416845384101699e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9991073429584503,
"step": 3775
},
{
"epoch": 27.194244604316548,
"grad_norm": 0.025754165173891827,
"learning_rate": 5.281790596033232e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9992175042629242,
"step": 3780
},
{
"epoch": 27.230215827338128,
"grad_norm": 0.026574099762379837,
"learning_rate": 5.1483950718880456e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.999156379699707,
"step": 3785
},
{
"epoch": 27.26618705035971,
"grad_norm": 0.02248640885779222,
"learning_rate": 5.016661148475299e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9989850997924805,
"step": 3790
},
{
"epoch": 27.302158273381295,
"grad_norm": 0.022399236095728896,
"learning_rate": 4.8865911334964094e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990584075450897,
"step": 3795
},
{
"epoch": 27.33812949640288,
"grad_norm": 0.027779653089896033,
"learning_rate": 4.758187305504658e-06,
"loss": 0.0024,
"mean_token_accuracy": 0.9991685688495636,
"step": 3800
},
{
"epoch": 27.37410071942446,
"grad_norm": 0.02964208013830329,
"learning_rate": 4.6314519138651594e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9990707218647004,
"step": 3805
},
{
"epoch": 27.41007194244604,
"grad_norm": 0.02712978542770664,
"learning_rate": 4.506387178715565e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9989605605602264,
"step": 3810
},
{
"epoch": 27.446043165467625,
"grad_norm": 0.03281882059446763,
"learning_rate": 4.382995290927161e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9991195619106292,
"step": 3815
},
{
"epoch": 27.48201438848921,
"grad_norm": 0.03285754084626906,
"learning_rate": 4.261278412066427e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9989972472190857,
"step": 3820
},
{
"epoch": 27.51798561151079,
"grad_norm": 0.02770507620979567,
"learning_rate": 4.141238674357217e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9991528451442718,
"step": 3825
},
{
"epoch": 27.553956834532375,
"grad_norm": 0.027323468101626835,
"learning_rate": 4.022878180643441e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9990583479404449,
"step": 3830
},
{
"epoch": 27.58992805755396,
"grad_norm": 0.026165409160412198,
"learning_rate": 3.906199004352085e-06,
"loss": 0.0023,
"mean_token_accuracy": 0.9992541253566742,
"step": 3835
},
{
"epoch": 27.62589928057554,
"grad_norm": 0.02995860073308557,
"learning_rate": 3.791203189457093e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.99905846118927,
"step": 3840
},
{
"epoch": 27.66187050359712,
"grad_norm": 0.025049536857449636,
"learning_rate": 3.67789275044339e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9990216970443726,
"step": 3845
},
{
"epoch": 27.697841726618705,
"grad_norm": 0.02435227347466978,
"learning_rate": 3.5662696722716936e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990707516670227,
"step": 3850
},
{
"epoch": 27.73381294964029,
"grad_norm": 0.026934231649150823,
"learning_rate": 3.4563359103436886e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9989239752292634,
"step": 3855
},
{
"epoch": 27.769784172661872,
"grad_norm": 0.026039766458009594,
"learning_rate": 3.348093390467788e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9991317570209504,
"step": 3860
},
{
"epoch": 27.805755395683452,
"grad_norm": 0.028279821541101296,
"learning_rate": 3.2415440088254033e-06,
"loss": 0.0022,
"mean_token_accuracy": 0.9991562783718109,
"step": 3865
},
{
"epoch": 27.841726618705035,
"grad_norm": 0.02903501608211124,
"learning_rate": 3.1366896319377283e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9990950644016265,
"step": 3870
},
{
"epoch": 27.87769784172662,
"grad_norm": 0.029246402437814508,
"learning_rate": 3.0335320966330405e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9991073131561279,
"step": 3875
},
{
"epoch": 27.913669064748202,
"grad_norm": 0.024885738033322623,
"learning_rate": 2.932073210014519e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9991195976734162,
"step": 3880
},
{
"epoch": 27.949640287769785,
"grad_norm": 0.02744346407474506,
"learning_rate": 2.832314749428555e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990218222141266,
"step": 3885
},
{
"epoch": 27.985611510791365,
"grad_norm": 0.036648097310783105,
"learning_rate": 2.734258462433692e-06,
"loss": 0.003,
"mean_token_accuracy": 0.9989727795124054,
"step": 3890
},
{
"epoch": 28.0,
"eval_loss": 0.1739717721939087,
"eval_mean_token_accuracy": 0.9858080625534058,
"eval_runtime": 20.6535,
"eval_samples_per_second": 5.907,
"eval_steps_per_second": 0.775,
"step": 3892
},
{
"epoch": 28.02158273381295,
"grad_norm": 0.022935392530238134,
"learning_rate": 2.6379060667699686e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9989197750886282,
"step": 3895
},
{
"epoch": 28.057553956834532,
"grad_norm": 0.028305752750639582,
"learning_rate": 2.5432592503288e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9991684257984161,
"step": 3900
},
{
"epoch": 28.093525179856115,
"grad_norm": 0.02473903295003507,
"learning_rate": 2.4503196711234576e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9991562008857727,
"step": 3905
},
{
"epoch": 28.1294964028777,
"grad_norm": 0.020068405620154862,
"learning_rate": 2.3590889572600138e-06,
"loss": 0.0023,
"mean_token_accuracy": 0.9991807460784912,
"step": 3910
},
{
"epoch": 28.165467625899282,
"grad_norm": 0.023023534500934043,
"learning_rate": 2.2695687069087868e-06,
"loss": 0.0023,
"mean_token_accuracy": 0.9991318583488464,
"step": 3915
},
{
"epoch": 28.201438848920862,
"grad_norm": 0.02063534437749836,
"learning_rate": 2.1817604882763854e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9990828394889831,
"step": 3920
},
{
"epoch": 28.237410071942445,
"grad_norm": 0.021379802805438913,
"learning_rate": 2.0956658395782202e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9990951299667359,
"step": 3925
},
{
"epoch": 28.27338129496403,
"grad_norm": 0.021192132406956426,
"learning_rate": 2.01128626901157e-06,
"loss": 0.0024,
"mean_token_accuracy": 0.9991563200950623,
"step": 3930
},
{
"epoch": 28.309352517985612,
"grad_norm": 0.02399906239916882,
"learning_rate": 1.928623254729134e-06,
"loss": 0.0026,
"mean_token_accuracy": 0.9991318345069885,
"step": 3935
},
{
"epoch": 28.345323741007196,
"grad_norm": 0.025361576061434472,
"learning_rate": 1.8476782448131446e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9992174208164215,
"step": 3940
},
{
"epoch": 28.381294964028775,
"grad_norm": 0.026511459482348218,
"learning_rate": 1.7684526572500416e-06,
"loss": 0.0029,
"mean_token_accuracy": 0.9989605724811554,
"step": 3945
},
{
"epoch": 28.41726618705036,
"grad_norm": 0.028448573855638794,
"learning_rate": 1.6909478799055578e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9992173910140991,
"step": 3950
},
{
"epoch": 28.453237410071942,
"grad_norm": 0.021467267529428444,
"learning_rate": 1.615165270500485e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9991808295249939,
"step": 3955
},
{
"epoch": 28.489208633093526,
"grad_norm": 0.023006055777401,
"learning_rate": 1.5411061565868467e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.999119633436203,
"step": 3960
},
{
"epoch": 28.52517985611511,
"grad_norm": 0.02585586195710277,
"learning_rate": 1.4687718355246294e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990950226783752,
"step": 3965
},
{
"epoch": 28.56115107913669,
"grad_norm": 0.026300449454115953,
"learning_rate": 1.3981635744590883e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990339398384094,
"step": 3970
},
{
"epoch": 28.597122302158272,
"grad_norm": 0.021202713897712756,
"learning_rate": 1.3292826102985212e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9990707278251648,
"step": 3975
},
{
"epoch": 28.633093525179856,
"grad_norm": 0.033199600046098315,
"learning_rate": 1.2621301496926419e-06,
"loss": 0.0024,
"mean_token_accuracy": 0.9991685271263122,
"step": 3980
},
{
"epoch": 28.66906474820144,
"grad_norm": 0.029822323609474746,
"learning_rate": 1.196707369011396e-06,
"loss": 0.0028,
"mean_token_accuracy": 0.9990427136421204,
"step": 3985
},
{
"epoch": 28.705035971223023,
"grad_norm": 0.021907570953874314,
"learning_rate": 1.1330154143243787e-06,
"loss": 0.0025,
"mean_token_accuracy": 0.9991563141345978,
"step": 3990
},
{
"epoch": 28.741007194244606,
"grad_norm": 0.031723215800157904,
"learning_rate": 1.0710554013807495e-06,
"loss": 0.0027,
"mean_token_accuracy": 0.9990951001644135,
"step": 3995
},
{
"epoch": 28.776978417266186,
"grad_norm": 0.02188892324131467,
"learning_rate": 1.0108284155896819e-06,
"loss": 0.0024,
"mean_token_accuracy": 0.9993274867534637,
"step": 4000
},
{
"epoch": 28.81294964028777,
"grad_norm": 0.027770981302162885,
"learning_rate": 9.523355120013677e-07,
"loss": 0.0022,
"mean_token_accuracy": 0.9991930305957795,
"step": 4005
},
{
"epoch": 28.848920863309353,
"grad_norm": 0.03088762830415268,
"learning_rate": 8.955777152885314e-07,
"loss": 0.0027,
"mean_token_accuracy": 0.9990583479404449,
"step": 4010
},
{
"epoch": 28.884892086330936,
"grad_norm": 0.02070344017616713,
"learning_rate": 8.405560197284557e-07,
"loss": 0.0028,
"mean_token_accuracy": 0.9990951836109161,
"step": 4015
},
{
"epoch": 28.92086330935252,
"grad_norm": 0.025134927148283456,
"learning_rate": 7.872713891855843e-07,
"loss": 0.0029,
"mean_token_accuracy": 0.9990340828895569,
"step": 4020
},
{
"epoch": 28.9568345323741,
"grad_norm": 0.03278076527634437,
"learning_rate": 7.357247570946357e-07,
"loss": 0.0028,
"mean_token_accuracy": 0.9990216612815856,
"step": 4025
},
{
"epoch": 28.992805755395683,
"grad_norm": 0.02754408645957575,
"learning_rate": 6.859170264442605e-07,
"loss": 0.0027,
"mean_token_accuracy": 0.9989728093147278,
"step": 4030
},
{
"epoch": 29.0,
"eval_loss": 0.1744653284549713,
"eval_mean_token_accuracy": 0.9842423597971598,
"eval_runtime": 20.7969,
"eval_samples_per_second": 5.866,
"eval_steps_per_second": 0.769,
"step": 4031
},
{
"epoch": 29.028776978417266,
"grad_norm": 0.02324930062427535,
"learning_rate": 6.378490697611761e-07,
"loss": 0.0026,
"mean_token_accuracy": 0.9992052540183067,
"step": 4035
},
{
"epoch": 29.06474820143885,
"grad_norm": 0.02432217426882393,
"learning_rate": 5.915217290949571e-07,
"loss": 0.0027,
"mean_token_accuracy": 0.9990829706192017,
"step": 4040
},
{
"epoch": 29.100719424460433,
"grad_norm": 0.028049045181475652,
"learning_rate": 5.469358160032356e-07,
"loss": 0.0026,
"mean_token_accuracy": 0.9991195380687714,
"step": 4045
},
{
"epoch": 29.136690647482013,
"grad_norm": 0.021910112528279284,
"learning_rate": 5.040921115374686e-07,
"loss": 0.0027,
"mean_token_accuracy": 0.9991283357143402,
"step": 4050
},
{
"epoch": 29.172661870503596,
"grad_norm": 0.023718091323160405,
"learning_rate": 4.6299136622929285e-07,
"loss": 0.0024,
"mean_token_accuracy": 0.9992173552513123,
"step": 4055
},
{
"epoch": 29.20863309352518,
"grad_norm": 0.02590581189187291,
"learning_rate": 4.2363430007740237e-07,
"loss": 0.0025,
"mean_token_accuracy": 0.9991929352283477,
"step": 4060
},
{
"epoch": 29.244604316546763,
"grad_norm": 0.030650626796343103,
"learning_rate": 3.860216025348251e-07,
"loss": 0.0029,
"mean_token_accuracy": 0.9990461230278015,
"step": 4065
},
{
"epoch": 29.280575539568346,
"grad_norm": 0.030720762355050432,
"learning_rate": 3.5015393249698824e-07,
"loss": 0.0025,
"mean_token_accuracy": 0.9990951597690583,
"step": 4070
},
{
"epoch": 29.31654676258993,
"grad_norm": 0.02506864688330541,
"learning_rate": 3.160319182900495e-07,
"loss": 0.0025,
"mean_token_accuracy": 0.9991685032844544,
"step": 4075
},
{
"epoch": 29.35251798561151,
"grad_norm": 0.02852340035502043,
"learning_rate": 2.836561576599839e-07,
"loss": 0.0028,
"mean_token_accuracy": 0.999033921957016,
"step": 4080
},
{
"epoch": 29.388489208633093,
"grad_norm": 0.029973206482145683,
"learning_rate": 2.530272177620585e-07,
"loss": 0.0027,
"mean_token_accuracy": 0.9990583717823028,
"step": 4085
},
{
"epoch": 29.424460431654676,
"grad_norm": 0.022189267720462317,
"learning_rate": 2.241456351509186e-07,
"loss": 0.0023,
"mean_token_accuracy": 0.9991807758808136,
"step": 4090
},
{
"epoch": 29.46043165467626,
"grad_norm": 0.01944218340517776,
"learning_rate": 1.9701191577117252e-07,
"loss": 0.0024,
"mean_token_accuracy": 0.9990707099437713,
"step": 4095
},
{
"epoch": 29.496402877697843,
"grad_norm": 0.01990659572163372,
"learning_rate": 1.7162653494855462e-07,
"loss": 0.0026,
"mean_token_accuracy": 0.9991929590702057,
"step": 4100
},
{
"epoch": 29.532374100719423,
"grad_norm": 0.023361693235018466,
"learning_rate": 1.4798993738156518e-07,
"loss": 0.0023,
"mean_token_accuracy": 0.9991930842399597,
"step": 4105
},
{
"epoch": 29.568345323741006,
"grad_norm": 0.03204420748570329,
"learning_rate": 1.26102537133721e-07,
"loss": 0.0024,
"mean_token_accuracy": 0.9991929471492768,
"step": 4110
},
{
"epoch": 29.60431654676259,
"grad_norm": 0.020309140360128874,
"learning_rate": 1.0596471762626126e-07,
"loss": 0.0029,
"mean_token_accuracy": 0.9989850461483002,
"step": 4115
},
{
"epoch": 29.640287769784173,
"grad_norm": 0.02215835825264975,
"learning_rate": 8.757683163144182e-08,
"loss": 0.0024,
"mean_token_accuracy": 0.99908287525177,
"step": 4120
},
{
"epoch": 29.676258992805757,
"grad_norm": 0.027609261852714354,
"learning_rate": 7.093920126638454e-08,
"loss": 0.0025,
"mean_token_accuracy": 0.9991684675216674,
"step": 4125
},
{
"epoch": 29.71223021582734,
"grad_norm": 0.026309926413359074,
"learning_rate": 5.605211798738186e-08,
"loss": 0.0024,
"mean_token_accuracy": 0.9993030488491058,
"step": 4130
},
{
"epoch": 29.74820143884892,
"grad_norm": 0.02776420304640643,
"learning_rate": 4.291584258486747e-08,
"loss": 0.0025,
"mean_token_accuracy": 0.9992418825626374,
"step": 4135
},
{
"epoch": 29.784172661870503,
"grad_norm": 0.03297468936183595,
"learning_rate": 3.153060517874229e-08,
"loss": 0.003,
"mean_token_accuracy": 0.9990706086158753,
"step": 4140
},
{
"epoch": 29.820143884892087,
"grad_norm": 0.021453260733662226,
"learning_rate": 2.1896605214455356e-08,
"loss": 0.0023,
"mean_token_accuracy": 0.9992419958114624,
"step": 4145
},
{
"epoch": 29.85611510791367,
"grad_norm": 0.02545565832007103,
"learning_rate": 1.4014011459428933e-08,
"loss": 0.0025,
"mean_token_accuracy": 0.9990583419799804,
"step": 4150
},
{
"epoch": 29.892086330935253,
"grad_norm": 0.02683900491178024,
"learning_rate": 7.882962000138605e-09,
"loss": 0.0024,
"mean_token_accuracy": 0.9991440534591675,
"step": 4155
},
{
"epoch": 29.928057553956833,
"grad_norm": 0.025734296592890643,
"learning_rate": 3.503564239670798e-09,
"loss": 0.0027,
"mean_token_accuracy": 0.9990827918052674,
"step": 4160
},
{
"epoch": 29.964028776978417,
"grad_norm": 0.025706770914082706,
"learning_rate": 8.75894895879803e-10,
"loss": 0.0025,
"mean_token_accuracy": 0.9992418885231018,
"step": 4165
},
{
"epoch": 30.0,
"grad_norm": 0.020417514095723455,
"learning_rate": 0.0,
"loss": 0.0025,
"mean_token_accuracy": 0.9991196155548095,
"step": 4170
},
{
"epoch": 30.0,
"eval_loss": 0.17444376647472382,
"eval_mean_token_accuracy": 0.9824385866522789,
"eval_runtime": 20.2577,
"eval_samples_per_second": 6.022,
"eval_steps_per_second": 0.79,
"step": 4170
},
{
"epoch": 30.0,
"step": 4170,
"total_flos": 1.3300619754508124e+18,
"train_loss": 0.05172654809675914,
"train_runtime": 17966.4012,
"train_samples_per_second": 1.855,
"train_steps_per_second": 0.232
}
],
"logging_steps": 5,
"max_steps": 4170,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3300619754508124e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}