error577's picture
Training in progress, step 300, checkpoint
223b421 verified
{
"best_metric": 0.4757327735424042,
"best_model_checkpoint": "miner_id_24/checkpoint-300",
"epoch": 0.09686989162680874,
"eval_steps": 50,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00032289963875602916,
"grad_norm": 0.8824386596679688,
"learning_rate": 2.9999999999999997e-05,
"loss": 0.9502,
"step": 1
},
{
"epoch": 0.00032289963875602916,
"eval_loss": 1.3877121210098267,
"eval_runtime": 93.1564,
"eval_samples_per_second": 2.673,
"eval_steps_per_second": 2.673,
"step": 1
},
{
"epoch": 0.0006457992775120583,
"grad_norm": 0.9450064897537231,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.0243,
"step": 2
},
{
"epoch": 0.0009686989162680874,
"grad_norm": 0.8851878046989441,
"learning_rate": 8.999999999999999e-05,
"loss": 1.0441,
"step": 3
},
{
"epoch": 0.0012915985550241166,
"grad_norm": 0.9229115843772888,
"learning_rate": 0.00011999999999999999,
"loss": 0.9838,
"step": 4
},
{
"epoch": 0.0016144981937801458,
"grad_norm": 0.8223733901977539,
"learning_rate": 0.00015,
"loss": 0.9295,
"step": 5
},
{
"epoch": 0.0019373978325361748,
"grad_norm": 0.608881950378418,
"learning_rate": 0.00017999999999999998,
"loss": 0.8621,
"step": 6
},
{
"epoch": 0.002260297471292204,
"grad_norm": 0.5287642478942871,
"learning_rate": 0.00020999999999999998,
"loss": 0.7054,
"step": 7
},
{
"epoch": 0.0025831971100482333,
"grad_norm": 0.5346184372901917,
"learning_rate": 0.00023999999999999998,
"loss": 0.7059,
"step": 8
},
{
"epoch": 0.0029060967488042625,
"grad_norm": 0.5089557766914368,
"learning_rate": 0.00027,
"loss": 0.697,
"step": 9
},
{
"epoch": 0.0032289963875602916,
"grad_norm": 0.5970950722694397,
"learning_rate": 0.0003,
"loss": 0.7296,
"step": 10
},
{
"epoch": 0.0035518960263163203,
"grad_norm": 0.5149356722831726,
"learning_rate": 0.0002999911984174669,
"loss": 0.652,
"step": 11
},
{
"epoch": 0.0038747956650723495,
"grad_norm": 0.5151774287223816,
"learning_rate": 0.0002999647947027726,
"loss": 0.5781,
"step": 12
},
{
"epoch": 0.004197695303828379,
"grad_norm": 0.8652517795562744,
"learning_rate": 0.0002999207919545099,
"loss": 0.6031,
"step": 13
},
{
"epoch": 0.004520594942584408,
"grad_norm": 0.5327635407447815,
"learning_rate": 0.0002998591953365965,
"loss": 0.5459,
"step": 14
},
{
"epoch": 0.004843494581340437,
"grad_norm": 0.6640207171440125,
"learning_rate": 0.00029978001207766854,
"loss": 0.6103,
"step": 15
},
{
"epoch": 0.005166394220096467,
"grad_norm": 0.5555705428123474,
"learning_rate": 0.00029968325147023263,
"loss": 0.5229,
"step": 16
},
{
"epoch": 0.005489293858852495,
"grad_norm": 0.5247602462768555,
"learning_rate": 0.000299568924869575,
"loss": 0.5202,
"step": 17
},
{
"epoch": 0.005812193497608525,
"grad_norm": 0.42580658197402954,
"learning_rate": 0.00029943704569242917,
"loss": 0.5354,
"step": 18
},
{
"epoch": 0.006135093136364554,
"grad_norm": 0.43852224946022034,
"learning_rate": 0.0002992876294154013,
"loss": 0.5391,
"step": 19
},
{
"epoch": 0.006457992775120583,
"grad_norm": 0.5213896632194519,
"learning_rate": 0.00029912069357315393,
"loss": 0.514,
"step": 20
},
{
"epoch": 0.006780892413876612,
"grad_norm": 0.4253011643886566,
"learning_rate": 0.00029893625775634835,
"loss": 0.5246,
"step": 21
},
{
"epoch": 0.007103792052632641,
"grad_norm": 0.6179479956626892,
"learning_rate": 0.0002987343436093454,
"loss": 0.5259,
"step": 22
},
{
"epoch": 0.00742669169138867,
"grad_norm": 0.4869435429573059,
"learning_rate": 0.00029851497482766547,
"loss": 0.4949,
"step": 23
},
{
"epoch": 0.007749591330144699,
"grad_norm": 0.46306025981903076,
"learning_rate": 0.00029827817715520773,
"loss": 0.5344,
"step": 24
},
{
"epoch": 0.008072490968900729,
"grad_norm": 0.5446106195449829,
"learning_rate": 0.0002980239783812289,
"loss": 0.6442,
"step": 25
},
{
"epoch": 0.008395390607656757,
"grad_norm": 0.4747227430343628,
"learning_rate": 0.0002977524083370822,
"loss": 0.5129,
"step": 26
},
{
"epoch": 0.008718290246412786,
"grad_norm": 0.4436034560203552,
"learning_rate": 0.00029746349889271645,
"loss": 0.5176,
"step": 27
},
{
"epoch": 0.009041189885168817,
"grad_norm": 0.458401620388031,
"learning_rate": 0.0002971572839529358,
"loss": 0.5984,
"step": 28
},
{
"epoch": 0.009364089523924845,
"grad_norm": 0.41444727778434753,
"learning_rate": 0.00029683379945342125,
"loss": 0.5384,
"step": 29
},
{
"epoch": 0.009686989162680874,
"grad_norm": 0.4278819262981415,
"learning_rate": 0.000296493083356513,
"loss": 0.4995,
"step": 30
},
{
"epoch": 0.010009888801436903,
"grad_norm": 0.5811014175415039,
"learning_rate": 0.00029613517564675565,
"loss": 0.5204,
"step": 31
},
{
"epoch": 0.010332788440192933,
"grad_norm": 0.43319976329803467,
"learning_rate": 0.0002957601183262058,
"loss": 0.5376,
"step": 32
},
{
"epoch": 0.010655688078948962,
"grad_norm": 0.5323597192764282,
"learning_rate": 0.000295367955409503,
"loss": 0.5722,
"step": 33
},
{
"epoch": 0.01097858771770499,
"grad_norm": 0.49440231919288635,
"learning_rate": 0.00029495873291870436,
"loss": 0.519,
"step": 34
},
{
"epoch": 0.01130148735646102,
"grad_norm": 0.585383951663971,
"learning_rate": 0.0002945324988778834,
"loss": 0.5643,
"step": 35
},
{
"epoch": 0.01162438699521705,
"grad_norm": 0.6207253336906433,
"learning_rate": 0.00029408930330749477,
"loss": 0.5647,
"step": 36
},
{
"epoch": 0.011947286633973079,
"grad_norm": 0.5334969162940979,
"learning_rate": 0.0002936291982185036,
"loss": 0.6132,
"step": 37
},
{
"epoch": 0.012270186272729107,
"grad_norm": 0.5715814828872681,
"learning_rate": 0.00029315223760628217,
"loss": 0.5453,
"step": 38
},
{
"epoch": 0.012593085911485136,
"grad_norm": 0.5439808964729309,
"learning_rate": 0.00029265847744427303,
"loss": 0.6189,
"step": 39
},
{
"epoch": 0.012915985550241166,
"grad_norm": 0.4831181466579437,
"learning_rate": 0.00029214797567742035,
"loss": 0.558,
"step": 40
},
{
"epoch": 0.013238885188997195,
"grad_norm": 0.44526898860931396,
"learning_rate": 0.00029162079221537,
"loss": 0.5486,
"step": 41
},
{
"epoch": 0.013561784827753224,
"grad_norm": 0.4262256324291229,
"learning_rate": 0.0002910769889254386,
"loss": 0.5953,
"step": 42
},
{
"epoch": 0.013884684466509253,
"grad_norm": 1.0412601232528687,
"learning_rate": 0.0002905166296253533,
"loss": 0.6746,
"step": 43
},
{
"epoch": 0.014207584105265281,
"grad_norm": 0.4333534836769104,
"learning_rate": 0.0002899397800757626,
"loss": 0.5598,
"step": 44
},
{
"epoch": 0.014530483744021312,
"grad_norm": 0.44549164175987244,
"learning_rate": 0.0002893465079725187,
"loss": 0.5453,
"step": 45
},
{
"epoch": 0.01485338338277734,
"grad_norm": 0.4520653486251831,
"learning_rate": 0.0002887368829387333,
"loss": 0.55,
"step": 46
},
{
"epoch": 0.01517628302153337,
"grad_norm": 0.4351714849472046,
"learning_rate": 0.0002881109765166071,
"loss": 0.5768,
"step": 47
},
{
"epoch": 0.015499182660289398,
"grad_norm": 0.49298447370529175,
"learning_rate": 0.00028746886215903387,
"loss": 0.5117,
"step": 48
},
{
"epoch": 0.01582208229904543,
"grad_norm": 0.5363653302192688,
"learning_rate": 0.00028681061522098047,
"loss": 0.6614,
"step": 49
},
{
"epoch": 0.016144981937801457,
"grad_norm": 0.6216686367988586,
"learning_rate": 0.0002861363129506435,
"loss": 0.6039,
"step": 50
},
{
"epoch": 0.016144981937801457,
"eval_loss": 0.5689713358879089,
"eval_runtime": 93.1562,
"eval_samples_per_second": 2.673,
"eval_steps_per_second": 2.673,
"step": 50
},
{
"epoch": 0.016467881576557486,
"grad_norm": 0.4990479052066803,
"learning_rate": 0.0002854460344803842,
"loss": 0.6526,
"step": 51
},
{
"epoch": 0.016790781215313515,
"grad_norm": 0.45471203327178955,
"learning_rate": 0.00028473986081744163,
"loss": 0.5901,
"step": 52
},
{
"epoch": 0.017113680854069543,
"grad_norm": 0.3418448567390442,
"learning_rate": 0.000284017874834426,
"loss": 0.5179,
"step": 53
},
{
"epoch": 0.017436580492825572,
"grad_norm": 0.5214569568634033,
"learning_rate": 0.0002832801612595937,
"loss": 0.531,
"step": 54
},
{
"epoch": 0.017759480131581604,
"grad_norm": 0.4000888764858246,
"learning_rate": 0.0002825268066669034,
"loss": 0.5312,
"step": 55
},
{
"epoch": 0.018082379770337633,
"grad_norm": 0.3511790335178375,
"learning_rate": 0.00028175789946585693,
"loss": 0.5187,
"step": 56
},
{
"epoch": 0.018405279409093662,
"grad_norm": 0.40245896577835083,
"learning_rate": 0.0002809735298911234,
"loss": 0.5141,
"step": 57
},
{
"epoch": 0.01872817904784969,
"grad_norm": 0.3479350805282593,
"learning_rate": 0.00028017378999195015,
"loss": 0.5353,
"step": 58
},
{
"epoch": 0.01905107868660572,
"grad_norm": 0.3354577124118805,
"learning_rate": 0.0002793587736213603,
"loss": 0.5223,
"step": 59
},
{
"epoch": 0.019373978325361748,
"grad_norm": 0.36033692955970764,
"learning_rate": 0.00027852857642513836,
"loss": 0.535,
"step": 60
},
{
"epoch": 0.019696877964117777,
"grad_norm": 0.40051642060279846,
"learning_rate": 0.00027768329583060635,
"loss": 0.4658,
"step": 61
},
{
"epoch": 0.020019777602873805,
"grad_norm": 0.41961464285850525,
"learning_rate": 0.00027682303103518976,
"loss": 0.5517,
"step": 62
},
{
"epoch": 0.020342677241629838,
"grad_norm": 0.4324147403240204,
"learning_rate": 0.00027594788299477655,
"loss": 0.5352,
"step": 63
},
{
"epoch": 0.020665576880385866,
"grad_norm": 0.39792558550834656,
"learning_rate": 0.0002750579544118695,
"loss": 0.5369,
"step": 64
},
{
"epoch": 0.020988476519141895,
"grad_norm": 0.4185834228992462,
"learning_rate": 0.00027415334972353357,
"loss": 0.5323,
"step": 65
},
{
"epoch": 0.021311376157897924,
"grad_norm": 0.36977943778038025,
"learning_rate": 0.0002732341750891397,
"loss": 0.4811,
"step": 66
},
{
"epoch": 0.021634275796653953,
"grad_norm": 0.38211461901664734,
"learning_rate": 0.00027230053837790666,
"loss": 0.508,
"step": 67
},
{
"epoch": 0.02195717543540998,
"grad_norm": 0.3872841000556946,
"learning_rate": 0.0002713525491562421,
"loss": 0.4985,
"step": 68
},
{
"epoch": 0.02228007507416601,
"grad_norm": 0.3474493622779846,
"learning_rate": 0.0002703903186748843,
"loss": 0.4411,
"step": 69
},
{
"epoch": 0.02260297471292204,
"grad_norm": 0.3461940586566925,
"learning_rate": 0.00026941395985584653,
"loss": 0.4987,
"step": 70
},
{
"epoch": 0.022925874351678067,
"grad_norm": 0.3902309238910675,
"learning_rate": 0.00026842358727916524,
"loss": 0.5151,
"step": 71
},
{
"epoch": 0.0232487739904341,
"grad_norm": 0.34906888008117676,
"learning_rate": 0.0002674193171694533,
"loss": 0.5087,
"step": 72
},
{
"epoch": 0.02357167362919013,
"grad_norm": 0.40804773569107056,
"learning_rate": 0.0002664012673822609,
"loss": 0.5772,
"step": 73
},
{
"epoch": 0.023894573267946157,
"grad_norm": 0.3935260474681854,
"learning_rate": 0.0002653695573902443,
"loss": 0.4854,
"step": 74
},
{
"epoch": 0.024217472906702186,
"grad_norm": 0.3976927101612091,
"learning_rate": 0.0002643243082691454,
"loss": 0.4943,
"step": 75
},
{
"epoch": 0.024540372545458215,
"grad_norm": 0.4370949864387512,
"learning_rate": 0.0002632656426835831,
"loss": 0.5562,
"step": 76
},
{
"epoch": 0.024863272184214243,
"grad_norm": 0.37720787525177,
"learning_rate": 0.00026219368487265753,
"loss": 0.4861,
"step": 77
},
{
"epoch": 0.025186171822970272,
"grad_norm": 0.38937804102897644,
"learning_rate": 0.00026110856063537083,
"loss": 0.4428,
"step": 78
},
{
"epoch": 0.0255090714617263,
"grad_norm": 0.41925427317619324,
"learning_rate": 0.00026001039731586334,
"loss": 0.5127,
"step": 79
},
{
"epoch": 0.025831971100482333,
"grad_norm": 0.4304789900779724,
"learning_rate": 0.0002588993237884696,
"loss": 0.4446,
"step": 80
},
{
"epoch": 0.02615487073923836,
"grad_norm": 0.4581892490386963,
"learning_rate": 0.00025777547044259435,
"loss": 0.5083,
"step": 81
},
{
"epoch": 0.02647777037799439,
"grad_norm": 0.40676695108413696,
"learning_rate": 0.0002566389691674106,
"loss": 0.4466,
"step": 82
},
{
"epoch": 0.02680067001675042,
"grad_norm": 0.4280904531478882,
"learning_rate": 0.00025548995333638197,
"loss": 0.5397,
"step": 83
},
{
"epoch": 0.027123569655506448,
"grad_norm": 0.500979483127594,
"learning_rate": 0.00025432855779161076,
"loss": 0.5162,
"step": 84
},
{
"epoch": 0.027446469294262477,
"grad_norm": 0.46338576078414917,
"learning_rate": 0.00025315491882801347,
"loss": 0.52,
"step": 85
},
{
"epoch": 0.027769368933018505,
"grad_norm": 0.4408024251461029,
"learning_rate": 0.00025196917417732615,
"loss": 0.5225,
"step": 86
},
{
"epoch": 0.028092268571774534,
"grad_norm": 0.5076299905776978,
"learning_rate": 0.0002507714629919409,
"loss": 0.5791,
"step": 87
},
{
"epoch": 0.028415168210530563,
"grad_norm": 0.4352111220359802,
"learning_rate": 0.0002495619258285757,
"loss": 0.5098,
"step": 88
},
{
"epoch": 0.028738067849286595,
"grad_norm": 0.5041966438293457,
"learning_rate": 0.0002483407046317794,
"loss": 0.5932,
"step": 89
},
{
"epoch": 0.029060967488042624,
"grad_norm": 0.452606737613678,
"learning_rate": 0.00024710794271727413,
"loss": 0.606,
"step": 90
},
{
"epoch": 0.029383867126798652,
"grad_norm": 0.40284180641174316,
"learning_rate": 0.0002458637847551364,
"loss": 0.537,
"step": 91
},
{
"epoch": 0.02970676676555468,
"grad_norm": 0.4412550628185272,
"learning_rate": 0.00024460837675281926,
"loss": 0.487,
"step": 92
},
{
"epoch": 0.03002966640431071,
"grad_norm": 0.4605487287044525,
"learning_rate": 0.00024334186603801807,
"loss": 0.5168,
"step": 93
},
{
"epoch": 0.03035256604306674,
"grad_norm": 0.4492901861667633,
"learning_rate": 0.00024206440124138062,
"loss": 0.5853,
"step": 94
},
{
"epoch": 0.030675465681822767,
"grad_norm": 0.4852229356765747,
"learning_rate": 0.0002407761322790648,
"loss": 0.5914,
"step": 95
},
{
"epoch": 0.030998365320578796,
"grad_norm": 0.436937153339386,
"learning_rate": 0.00023947721033514512,
"loss": 0.5557,
"step": 96
},
{
"epoch": 0.03132126495933483,
"grad_norm": 0.49399349093437195,
"learning_rate": 0.00023816778784387094,
"loss": 0.5293,
"step": 97
},
{
"epoch": 0.03164416459809086,
"grad_norm": 0.4873245358467102,
"learning_rate": 0.0002368480184717773,
"loss": 0.4905,
"step": 98
},
{
"epoch": 0.031967064236846886,
"grad_norm": 0.5261349678039551,
"learning_rate": 0.00023551805709965147,
"loss": 0.512,
"step": 99
},
{
"epoch": 0.032289963875602914,
"grad_norm": 0.6633840799331665,
"learning_rate": 0.00023417805980435736,
"loss": 0.5687,
"step": 100
},
{
"epoch": 0.032289963875602914,
"eval_loss": 0.549869179725647,
"eval_runtime": 93.2109,
"eval_samples_per_second": 2.671,
"eval_steps_per_second": 2.671,
"step": 100
},
{
"epoch": 0.03261286351435894,
"grad_norm": 0.5682958960533142,
"learning_rate": 0.00023282818384051866,
"loss": 0.6406,
"step": 101
},
{
"epoch": 0.03293576315311497,
"grad_norm": 0.4537736773490906,
"learning_rate": 0.00023146858762206489,
"loss": 0.5788,
"step": 102
},
{
"epoch": 0.033258662791871,
"grad_norm": 0.3675607442855835,
"learning_rate": 0.00023009943070364044,
"loss": 0.5094,
"step": 103
},
{
"epoch": 0.03358156243062703,
"grad_norm": 0.30980873107910156,
"learning_rate": 0.0002287208737618801,
"loss": 0.5131,
"step": 104
},
{
"epoch": 0.03390446206938306,
"grad_norm": 0.31538596749305725,
"learning_rate": 0.00022733307857655325,
"loss": 0.4996,
"step": 105
},
{
"epoch": 0.03422736170813909,
"grad_norm": 0.3792458474636078,
"learning_rate": 0.00022593620801157808,
"loss": 0.5272,
"step": 106
},
{
"epoch": 0.034550261346895116,
"grad_norm": 0.3597868084907532,
"learning_rate": 0.00022453042599590882,
"loss": 0.5219,
"step": 107
},
{
"epoch": 0.034873160985651144,
"grad_norm": 0.3305104970932007,
"learning_rate": 0.00022311589750429787,
"loss": 0.4561,
"step": 108
},
{
"epoch": 0.03519606062440717,
"grad_norm": 0.30127620697021484,
"learning_rate": 0.00022169278853793545,
"loss": 0.4988,
"step": 109
},
{
"epoch": 0.03551896026316321,
"grad_norm": 0.34320175647735596,
"learning_rate": 0.00022026126610496852,
"loss": 0.5181,
"step": 110
},
{
"epoch": 0.03584185990191924,
"grad_norm": 0.3284643292427063,
"learning_rate": 0.0002188214982009016,
"loss": 0.5342,
"step": 111
},
{
"epoch": 0.036164759540675266,
"grad_norm": 0.3455963730812073,
"learning_rate": 0.00021737365378888187,
"loss": 0.4768,
"step": 112
},
{
"epoch": 0.036487659179431295,
"grad_norm": 0.3220086097717285,
"learning_rate": 0.00021591790277987043,
"loss": 0.4888,
"step": 113
},
{
"epoch": 0.036810558818187324,
"grad_norm": 0.3551287353038788,
"learning_rate": 0.00021445441601270276,
"loss": 0.4567,
"step": 114
},
{
"epoch": 0.03713345845694335,
"grad_norm": 0.35259413719177246,
"learning_rate": 0.00021298336523403968,
"loss": 0.4779,
"step": 115
},
{
"epoch": 0.03745635809569938,
"grad_norm": 0.3786124587059021,
"learning_rate": 0.0002115049230782124,
"loss": 0.4885,
"step": 116
},
{
"epoch": 0.03777925773445541,
"grad_norm": 0.3437775671482086,
"learning_rate": 0.00021001926304696296,
"loss": 0.4335,
"step": 117
},
{
"epoch": 0.03810215737321144,
"grad_norm": 0.3531520962715149,
"learning_rate": 0.00020852655948908316,
"loss": 0.4604,
"step": 118
},
{
"epoch": 0.03842505701196747,
"grad_norm": 0.374508261680603,
"learning_rate": 0.0002070269875799538,
"loss": 0.4603,
"step": 119
},
{
"epoch": 0.038747956650723496,
"grad_norm": 0.3716915249824524,
"learning_rate": 0.00020552072330098716,
"loss": 0.4878,
"step": 120
},
{
"epoch": 0.039070856289479525,
"grad_norm": 0.41532278060913086,
"learning_rate": 0.0002040079434189748,
"loss": 0.5234,
"step": 121
},
{
"epoch": 0.03939375592823555,
"grad_norm": 0.3630126416683197,
"learning_rate": 0.00020248882546534326,
"loss": 0.4509,
"step": 122
},
{
"epoch": 0.03971665556699158,
"grad_norm": 0.3788343071937561,
"learning_rate": 0.00020096354771531976,
"loss": 0.4989,
"step": 123
},
{
"epoch": 0.04003955520574761,
"grad_norm": 0.34830769896507263,
"learning_rate": 0.00019943228916701104,
"loss": 0.459,
"step": 124
},
{
"epoch": 0.04036245484450364,
"grad_norm": 0.3595748543739319,
"learning_rate": 0.00019789522952039695,
"loss": 0.4525,
"step": 125
},
{
"epoch": 0.040685354483259675,
"grad_norm": 0.3879833519458771,
"learning_rate": 0.0001963525491562421,
"loss": 0.483,
"step": 126
},
{
"epoch": 0.041008254122015704,
"grad_norm": 0.44336938858032227,
"learning_rate": 0.00019480442911492702,
"loss": 0.5148,
"step": 127
},
{
"epoch": 0.04133115376077173,
"grad_norm": 0.4268869161605835,
"learning_rate": 0.00019325105107520263,
"loss": 0.5186,
"step": 128
},
{
"epoch": 0.04165405339952776,
"grad_norm": 0.40015068650245667,
"learning_rate": 0.00019169259733286913,
"loss": 0.4856,
"step": 129
},
{
"epoch": 0.04197695303828379,
"grad_norm": 0.3713054656982422,
"learning_rate": 0.00019012925077938314,
"loss": 0.4546,
"step": 130
},
{
"epoch": 0.04229985267703982,
"grad_norm": 0.45219311118125916,
"learning_rate": 0.0001885611948803941,
"loss": 0.4606,
"step": 131
},
{
"epoch": 0.04262275231579585,
"grad_norm": 0.3732009828090668,
"learning_rate": 0.0001869886136542143,
"loss": 0.502,
"step": 132
},
{
"epoch": 0.042945651954551876,
"grad_norm": 0.5309696793556213,
"learning_rate": 0.00018541169165022298,
"loss": 0.591,
"step": 133
},
{
"epoch": 0.043268551593307905,
"grad_norm": 0.4451289772987366,
"learning_rate": 0.00018383061392720913,
"loss": 0.5503,
"step": 134
},
{
"epoch": 0.043591451232063934,
"grad_norm": 0.49492791295051575,
"learning_rate": 0.0001822455660316536,
"loss": 0.6013,
"step": 135
},
{
"epoch": 0.04391435087081996,
"grad_norm": 0.4255249500274658,
"learning_rate": 0.00018065673397595473,
"loss": 0.5237,
"step": 136
},
{
"epoch": 0.04423725050957599,
"grad_norm": 0.35570111870765686,
"learning_rate": 0.00017906430421659876,
"loss": 0.4749,
"step": 137
},
{
"epoch": 0.04456015014833202,
"grad_norm": 0.4393974840641022,
"learning_rate": 0.00017746846363227842,
"loss": 0.5613,
"step": 138
},
{
"epoch": 0.04488304978708805,
"grad_norm": 0.4163194000720978,
"learning_rate": 0.00017586939950196186,
"loss": 0.5197,
"step": 139
},
{
"epoch": 0.04520594942584408,
"grad_norm": 0.40177619457244873,
"learning_rate": 0.00017426729948291474,
"loss": 0.5775,
"step": 140
},
{
"epoch": 0.045528849064600106,
"grad_norm": 0.38539931178092957,
"learning_rate": 0.00017266235158867752,
"loss": 0.5486,
"step": 141
},
{
"epoch": 0.045851748703356135,
"grad_norm": 0.3907434642314911,
"learning_rate": 0.00017105474416700164,
"loss": 0.52,
"step": 142
},
{
"epoch": 0.04617464834211217,
"grad_norm": 0.45413726568222046,
"learning_rate": 0.0001694446658777458,
"loss": 0.5052,
"step": 143
},
{
"epoch": 0.0464975479808682,
"grad_norm": 0.3584253489971161,
"learning_rate": 0.00016783230567073596,
"loss": 0.4945,
"step": 144
},
{
"epoch": 0.04682044761962423,
"grad_norm": 0.38787075877189636,
"learning_rate": 0.00016621785276359127,
"loss": 0.5507,
"step": 145
},
{
"epoch": 0.04714334725838026,
"grad_norm": 0.4273373484611511,
"learning_rate": 0.0001646014966195185,
"loss": 0.5588,
"step": 146
},
{
"epoch": 0.047466246897136286,
"grad_norm": 0.3610289692878723,
"learning_rate": 0.00016298342692507763,
"loss": 0.5056,
"step": 147
},
{
"epoch": 0.047789146535892314,
"grad_norm": 0.4982016682624817,
"learning_rate": 0.00016136383356792156,
"loss": 0.6056,
"step": 148
},
{
"epoch": 0.04811204617464834,
"grad_norm": 0.48546895384788513,
"learning_rate": 0.0001597429066145116,
"loss": 0.6287,
"step": 149
},
{
"epoch": 0.04843494581340437,
"grad_norm": 0.4680643379688263,
"learning_rate": 0.0001581208362878126,
"loss": 0.5451,
"step": 150
},
{
"epoch": 0.04843494581340437,
"eval_loss": 0.5126909613609314,
"eval_runtime": 93.2305,
"eval_samples_per_second": 2.671,
"eval_steps_per_second": 2.671,
"step": 150
},
{
"epoch": 0.0487578454521604,
"grad_norm": 0.4066923260688782,
"learning_rate": 0.00015649781294496933,
"loss": 0.5668,
"step": 151
},
{
"epoch": 0.04908074509091643,
"grad_norm": 0.4061080813407898,
"learning_rate": 0.00015487402705496707,
"loss": 0.5461,
"step": 152
},
{
"epoch": 0.04940364472967246,
"grad_norm": 0.33173874020576477,
"learning_rate": 0.0001532496691762796,
"loss": 0.4937,
"step": 153
},
{
"epoch": 0.04972654436842849,
"grad_norm": 0.3234449625015259,
"learning_rate": 0.00015162492993450597,
"loss": 0.5055,
"step": 154
},
{
"epoch": 0.050049444007184515,
"grad_norm": 0.26888391375541687,
"learning_rate": 0.00015,
"loss": 0.4302,
"step": 155
},
{
"epoch": 0.050372343645940544,
"grad_norm": 0.2688259482383728,
"learning_rate": 0.00014837507006549403,
"loss": 0.4681,
"step": 156
},
{
"epoch": 0.05069524328469657,
"grad_norm": 0.29257914423942566,
"learning_rate": 0.00014675033082372038,
"loss": 0.4916,
"step": 157
},
{
"epoch": 0.0510181429234526,
"grad_norm": 0.3311769664287567,
"learning_rate": 0.00014512597294503293,
"loss": 0.4852,
"step": 158
},
{
"epoch": 0.05134104256220863,
"grad_norm": 0.3170052468776703,
"learning_rate": 0.00014350218705503067,
"loss": 0.4772,
"step": 159
},
{
"epoch": 0.051663942200964666,
"grad_norm": 0.34309279918670654,
"learning_rate": 0.00014187916371218736,
"loss": 0.4622,
"step": 160
},
{
"epoch": 0.051986841839720695,
"grad_norm": 0.32580074667930603,
"learning_rate": 0.00014025709338548836,
"loss": 0.4839,
"step": 161
},
{
"epoch": 0.05230974147847672,
"grad_norm": 0.317793071269989,
"learning_rate": 0.00013863616643207844,
"loss": 0.4923,
"step": 162
},
{
"epoch": 0.05263264111723275,
"grad_norm": 0.30965691804885864,
"learning_rate": 0.00013701657307492235,
"loss": 0.4769,
"step": 163
},
{
"epoch": 0.05295554075598878,
"grad_norm": 0.3157210052013397,
"learning_rate": 0.00013539850338048154,
"loss": 0.4798,
"step": 164
},
{
"epoch": 0.05327844039474481,
"grad_norm": 0.35382428765296936,
"learning_rate": 0.00013378214723640876,
"loss": 0.5212,
"step": 165
},
{
"epoch": 0.05360134003350084,
"grad_norm": 0.29083287715911865,
"learning_rate": 0.00013216769432926404,
"loss": 0.4257,
"step": 166
},
{
"epoch": 0.05392423967225687,
"grad_norm": 0.3195495009422302,
"learning_rate": 0.00013055533412225422,
"loss": 0.4177,
"step": 167
},
{
"epoch": 0.054247139311012896,
"grad_norm": 0.29507699608802795,
"learning_rate": 0.00012894525583299833,
"loss": 0.4311,
"step": 168
},
{
"epoch": 0.054570038949768924,
"grad_norm": 0.2950059175491333,
"learning_rate": 0.0001273376484113225,
"loss": 0.4188,
"step": 169
},
{
"epoch": 0.05489293858852495,
"grad_norm": 0.34078386425971985,
"learning_rate": 0.0001257327005170853,
"loss": 0.4737,
"step": 170
},
{
"epoch": 0.05521583822728098,
"grad_norm": 0.3855750262737274,
"learning_rate": 0.00012413060049803814,
"loss": 0.455,
"step": 171
},
{
"epoch": 0.05553873786603701,
"grad_norm": 0.34931278228759766,
"learning_rate": 0.00012253153636772156,
"loss": 0.4584,
"step": 172
},
{
"epoch": 0.05586163750479304,
"grad_norm": 0.3456253707408905,
"learning_rate": 0.00012093569578340124,
"loss": 0.4152,
"step": 173
},
{
"epoch": 0.05618453714354907,
"grad_norm": 0.3462797999382019,
"learning_rate": 0.00011934326602404528,
"loss": 0.4644,
"step": 174
},
{
"epoch": 0.0565074367823051,
"grad_norm": 0.3225034475326538,
"learning_rate": 0.00011775443396834638,
"loss": 0.4438,
"step": 175
},
{
"epoch": 0.056830336421061126,
"grad_norm": 0.3485172986984253,
"learning_rate": 0.00011616938607279086,
"loss": 0.4167,
"step": 176
},
{
"epoch": 0.05715323605981716,
"grad_norm": 0.36885136365890503,
"learning_rate": 0.00011458830834977698,
"loss": 0.4494,
"step": 177
},
{
"epoch": 0.05747613569857319,
"grad_norm": 0.40458542108535767,
"learning_rate": 0.0001130113863457857,
"loss": 0.4847,
"step": 178
},
{
"epoch": 0.05779903533732922,
"grad_norm": 0.3624725043773651,
"learning_rate": 0.00011143880511960584,
"loss": 0.4958,
"step": 179
},
{
"epoch": 0.05812193497608525,
"grad_norm": 0.3824242949485779,
"learning_rate": 0.00010987074922061689,
"loss": 0.4564,
"step": 180
},
{
"epoch": 0.058444834614841276,
"grad_norm": 0.3851178288459778,
"learning_rate": 0.00010830740266713087,
"loss": 0.4651,
"step": 181
},
{
"epoch": 0.058767734253597305,
"grad_norm": 0.43144652247428894,
"learning_rate": 0.00010674894892479738,
"loss": 0.4815,
"step": 182
},
{
"epoch": 0.059090633892353334,
"grad_norm": 0.389303982257843,
"learning_rate": 0.00010519557088507298,
"loss": 0.5031,
"step": 183
},
{
"epoch": 0.05941353353110936,
"grad_norm": 0.37136152386665344,
"learning_rate": 0.0001036474508437579,
"loss": 0.4521,
"step": 184
},
{
"epoch": 0.05973643316986539,
"grad_norm": 0.3901714086532593,
"learning_rate": 0.00010210477047960302,
"loss": 0.4977,
"step": 185
},
{
"epoch": 0.06005933280862142,
"grad_norm": 0.4063364863395691,
"learning_rate": 0.00010056771083298893,
"loss": 0.4808,
"step": 186
},
{
"epoch": 0.06038223244737745,
"grad_norm": 0.408845454454422,
"learning_rate": 9.903645228468024e-05,
"loss": 0.4782,
"step": 187
},
{
"epoch": 0.06070513208613348,
"grad_norm": 0.3464532792568207,
"learning_rate": 9.751117453465673e-05,
"loss": 0.4462,
"step": 188
},
{
"epoch": 0.061028031724889506,
"grad_norm": 0.41235268115997314,
"learning_rate": 9.59920565810252e-05,
"loss": 0.4636,
"step": 189
},
{
"epoch": 0.061350931363645535,
"grad_norm": 0.3754219710826874,
"learning_rate": 9.447927669901282e-05,
"loss": 0.5001,
"step": 190
},
{
"epoch": 0.06167383100240156,
"grad_norm": 0.39120209217071533,
"learning_rate": 9.297301242004618e-05,
"loss": 0.5631,
"step": 191
},
{
"epoch": 0.06199673064115759,
"grad_norm": 0.47471514344215393,
"learning_rate": 9.14734405109168e-05,
"loss": 0.5029,
"step": 192
},
{
"epoch": 0.06231963027991362,
"grad_norm": 0.3913878798484802,
"learning_rate": 8.998073695303701e-05,
"loss": 0.5068,
"step": 193
},
{
"epoch": 0.06264252991866966,
"grad_norm": 0.4407348334789276,
"learning_rate": 8.849507692178758e-05,
"loss": 0.4856,
"step": 194
},
{
"epoch": 0.06296542955742568,
"grad_norm": 0.41722989082336426,
"learning_rate": 8.70166347659603e-05,
"loss": 0.5372,
"step": 195
},
{
"epoch": 0.06328832919618171,
"grad_norm": 0.35007795691490173,
"learning_rate": 8.554558398729725e-05,
"loss": 0.4814,
"step": 196
},
{
"epoch": 0.06361122883493774,
"grad_norm": 0.43563127517700195,
"learning_rate": 8.408209722012956e-05,
"loss": 0.5617,
"step": 197
},
{
"epoch": 0.06393412847369377,
"grad_norm": 0.5308802723884583,
"learning_rate": 8.262634621111818e-05,
"loss": 0.5746,
"step": 198
},
{
"epoch": 0.0642570281124498,
"grad_norm": 0.5026018023490906,
"learning_rate": 8.117850179909842e-05,
"loss": 0.6231,
"step": 199
},
{
"epoch": 0.06457992775120583,
"grad_norm": 0.5310789346694946,
"learning_rate": 7.973873389503149e-05,
"loss": 0.6351,
"step": 200
},
{
"epoch": 0.06457992775120583,
"eval_loss": 0.4887339770793915,
"eval_runtime": 92.9821,
"eval_samples_per_second": 2.678,
"eval_steps_per_second": 2.678,
"step": 200
},
{
"epoch": 0.06490282738996185,
"grad_norm": 0.318142294883728,
"learning_rate": 7.830721146206451e-05,
"loss": 0.5384,
"step": 201
},
{
"epoch": 0.06522572702871789,
"grad_norm": 0.288631409406662,
"learning_rate": 7.688410249570214e-05,
"loss": 0.5078,
"step": 202
},
{
"epoch": 0.06554862666747392,
"grad_norm": 0.280100554227829,
"learning_rate": 7.54695740040912e-05,
"loss": 0.4788,
"step": 203
},
{
"epoch": 0.06587152630622994,
"grad_norm": 0.279681533575058,
"learning_rate": 7.406379198842189e-05,
"loss": 0.4447,
"step": 204
},
{
"epoch": 0.06619442594498598,
"grad_norm": 0.2892783284187317,
"learning_rate": 7.266692142344672e-05,
"loss": 0.4932,
"step": 205
},
{
"epoch": 0.066517325583742,
"grad_norm": 0.2658500075340271,
"learning_rate": 7.127912623811993e-05,
"loss": 0.4682,
"step": 206
},
{
"epoch": 0.06684022522249804,
"grad_norm": 0.2946866452693939,
"learning_rate": 6.990056929635957e-05,
"loss": 0.4838,
"step": 207
},
{
"epoch": 0.06716312486125406,
"grad_norm": 0.2683822214603424,
"learning_rate": 6.853141237793506e-05,
"loss": 0.4408,
"step": 208
},
{
"epoch": 0.0674860245000101,
"grad_norm": 0.3225007653236389,
"learning_rate": 6.717181615948126e-05,
"loss": 0.4949,
"step": 209
},
{
"epoch": 0.06780892413876612,
"grad_norm": 0.25332513451576233,
"learning_rate": 6.582194019564266e-05,
"loss": 0.4141,
"step": 210
},
{
"epoch": 0.06813182377752215,
"grad_norm": 0.2799530625343323,
"learning_rate": 6.448194290034848e-05,
"loss": 0.4445,
"step": 211
},
{
"epoch": 0.06845472341627817,
"grad_norm": 0.27327555418014526,
"learning_rate": 6.315198152822272e-05,
"loss": 0.4138,
"step": 212
},
{
"epoch": 0.06877762305503421,
"grad_norm": 0.3778553903102875,
"learning_rate": 6.183221215612904e-05,
"loss": 0.4804,
"step": 213
},
{
"epoch": 0.06910052269379023,
"grad_norm": 0.3077884614467621,
"learning_rate": 6.052278966485491e-05,
"loss": 0.4657,
"step": 214
},
{
"epoch": 0.06942342233254627,
"grad_norm": 0.29660362005233765,
"learning_rate": 5.922386772093526e-05,
"loss": 0.4297,
"step": 215
},
{
"epoch": 0.06974632197130229,
"grad_norm": 0.3540116548538208,
"learning_rate": 5.793559875861938e-05,
"loss": 0.466,
"step": 216
},
{
"epoch": 0.07006922161005832,
"grad_norm": 0.2957676351070404,
"learning_rate": 5.6658133961981894e-05,
"loss": 0.4421,
"step": 217
},
{
"epoch": 0.07039212124881435,
"grad_norm": 0.3042965233325958,
"learning_rate": 5.5391623247180744e-05,
"loss": 0.441,
"step": 218
},
{
"epoch": 0.07071502088757038,
"grad_norm": 0.36982765793800354,
"learning_rate": 5.413621524486363e-05,
"loss": 0.4114,
"step": 219
},
{
"epoch": 0.07103792052632642,
"grad_norm": 0.3452307879924774,
"learning_rate": 5.289205728272586e-05,
"loss": 0.4562,
"step": 220
},
{
"epoch": 0.07136082016508244,
"grad_norm": 0.3854043483734131,
"learning_rate": 5.165929536822059e-05,
"loss": 0.5003,
"step": 221
},
{
"epoch": 0.07168371980383847,
"grad_norm": 0.3237496018409729,
"learning_rate": 5.043807417142436e-05,
"loss": 0.4592,
"step": 222
},
{
"epoch": 0.0720066194425945,
"grad_norm": 0.32223159074783325,
"learning_rate": 4.922853700805909e-05,
"loss": 0.4553,
"step": 223
},
{
"epoch": 0.07232951908135053,
"grad_norm": 0.40129488706588745,
"learning_rate": 4.8030825822673814e-05,
"loss": 0.4276,
"step": 224
},
{
"epoch": 0.07265241872010655,
"grad_norm": 0.34809187054634094,
"learning_rate": 4.684508117198648e-05,
"loss": 0.4856,
"step": 225
},
{
"epoch": 0.07297531835886259,
"grad_norm": 0.3367185592651367,
"learning_rate": 4.567144220838923e-05,
"loss": 0.4555,
"step": 226
},
{
"epoch": 0.07329821799761861,
"grad_norm": 0.35933539271354675,
"learning_rate": 4.4510046663617996e-05,
"loss": 0.4837,
"step": 227
},
{
"epoch": 0.07362111763637465,
"grad_norm": 0.3718101382255554,
"learning_rate": 4.336103083258942e-05,
"loss": 0.4789,
"step": 228
},
{
"epoch": 0.07394401727513067,
"grad_norm": 0.3542415201663971,
"learning_rate": 4.2224529557405645e-05,
"loss": 0.5075,
"step": 229
},
{
"epoch": 0.0742669169138867,
"grad_norm": 0.3407626748085022,
"learning_rate": 4.1100676211530404e-05,
"loss": 0.4803,
"step": 230
},
{
"epoch": 0.07458981655264273,
"grad_norm": 0.39396294951438904,
"learning_rate": 3.998960268413666e-05,
"loss": 0.5117,
"step": 231
},
{
"epoch": 0.07491271619139876,
"grad_norm": 0.3785285949707031,
"learning_rate": 3.889143936462914e-05,
"loss": 0.4925,
"step": 232
},
{
"epoch": 0.07523561583015478,
"grad_norm": 0.36613747477531433,
"learning_rate": 3.780631512734241e-05,
"loss": 0.4434,
"step": 233
},
{
"epoch": 0.07555851546891082,
"grad_norm": 0.3978104591369629,
"learning_rate": 3.673435731641691e-05,
"loss": 0.4613,
"step": 234
},
{
"epoch": 0.07588141510766684,
"grad_norm": 0.43552708625793457,
"learning_rate": 3.567569173085454e-05,
"loss": 0.4177,
"step": 235
},
{
"epoch": 0.07620431474642288,
"grad_norm": 0.3718654215335846,
"learning_rate": 3.463044260975566e-05,
"loss": 0.4611,
"step": 236
},
{
"epoch": 0.07652721438517891,
"grad_norm": 0.41485676169395447,
"learning_rate": 3.3598732617739036e-05,
"loss": 0.5586,
"step": 237
},
{
"epoch": 0.07685011402393493,
"grad_norm": 0.37860673666000366,
"learning_rate": 3.258068283054666e-05,
"loss": 0.4256,
"step": 238
},
{
"epoch": 0.07717301366269097,
"grad_norm": 0.4362449645996094,
"learning_rate": 3.1576412720834746e-05,
"loss": 0.5763,
"step": 239
},
{
"epoch": 0.07749591330144699,
"grad_norm": 0.3914451003074646,
"learning_rate": 3.058604014415343e-05,
"loss": 0.4739,
"step": 240
},
{
"epoch": 0.07781881294020303,
"grad_norm": 0.3677349388599396,
"learning_rate": 2.960968132511567e-05,
"loss": 0.4716,
"step": 241
},
{
"epoch": 0.07814171257895905,
"grad_norm": 0.3888345956802368,
"learning_rate": 2.8647450843757897e-05,
"loss": 0.5218,
"step": 242
},
{
"epoch": 0.07846461221771509,
"grad_norm": 0.37700045108795166,
"learning_rate": 2.7699461622093304e-05,
"loss": 0.4978,
"step": 243
},
{
"epoch": 0.0787875118564711,
"grad_norm": 0.41537439823150635,
"learning_rate": 2.67658249108603e-05,
"loss": 0.4907,
"step": 244
},
{
"epoch": 0.07911041149522714,
"grad_norm": 0.40000054240226746,
"learning_rate": 2.584665027646643e-05,
"loss": 0.488,
"step": 245
},
{
"epoch": 0.07943331113398316,
"grad_norm": 0.395548552274704,
"learning_rate": 2.49420455881305e-05,
"loss": 0.4847,
"step": 246
},
{
"epoch": 0.0797562107727392,
"grad_norm": 0.4183206558227539,
"learning_rate": 2.4052117005223455e-05,
"loss": 0.5261,
"step": 247
},
{
"epoch": 0.08007911041149522,
"grad_norm": 0.37241002917289734,
"learning_rate": 2.317696896481024e-05,
"loss": 0.499,
"step": 248
},
{
"epoch": 0.08040201005025126,
"grad_norm": 0.4700750410556793,
"learning_rate": 2.231670416939364e-05,
"loss": 0.435,
"step": 249
},
{
"epoch": 0.08072490968900728,
"grad_norm": 0.47890686988830566,
"learning_rate": 2.147142357486164e-05,
"loss": 0.6928,
"step": 250
},
{
"epoch": 0.08072490968900728,
"eval_loss": 0.4805048406124115,
"eval_runtime": 93.118,
"eval_samples_per_second": 2.674,
"eval_steps_per_second": 2.674,
"step": 250
},
{
"epoch": 0.08104780932776331,
"grad_norm": 0.3123357892036438,
"learning_rate": 2.0641226378639715e-05,
"loss": 0.5109,
"step": 251
},
{
"epoch": 0.08137070896651935,
"grad_norm": 0.30325785279273987,
"learning_rate": 1.9826210008049785e-05,
"loss": 0.498,
"step": 252
},
{
"epoch": 0.08169360860527537,
"grad_norm": 0.2983933389186859,
"learning_rate": 1.902647010887655e-05,
"loss": 0.508,
"step": 253
},
{
"epoch": 0.08201650824403141,
"grad_norm": 0.29377394914627075,
"learning_rate": 1.8242100534143062e-05,
"loss": 0.486,
"step": 254
},
{
"epoch": 0.08233940788278743,
"grad_norm": 0.28709226846694946,
"learning_rate": 1.7473193333096575e-05,
"loss": 0.4685,
"step": 255
},
{
"epoch": 0.08266230752154347,
"grad_norm": 0.2827620804309845,
"learning_rate": 1.671983874040631e-05,
"loss": 0.4801,
"step": 256
},
{
"epoch": 0.08298520716029949,
"grad_norm": 0.3168405294418335,
"learning_rate": 1.598212516557394e-05,
"loss": 0.4902,
"step": 257
},
{
"epoch": 0.08330810679905552,
"grad_norm": 0.3135143518447876,
"learning_rate": 1.526013918255836e-05,
"loss": 0.5243,
"step": 258
},
{
"epoch": 0.08363100643781154,
"grad_norm": 0.25695309042930603,
"learning_rate": 1.4553965519615723e-05,
"loss": 0.4216,
"step": 259
},
{
"epoch": 0.08395390607656758,
"grad_norm": 0.2938316762447357,
"learning_rate": 1.3863687049356464e-05,
"loss": 0.4577,
"step": 260
},
{
"epoch": 0.0842768057153236,
"grad_norm": 0.2999093234539032,
"learning_rate": 1.3189384779019535e-05,
"loss": 0.4935,
"step": 261
},
{
"epoch": 0.08459970535407964,
"grad_norm": 0.3224240839481354,
"learning_rate": 1.25311378409661e-05,
"loss": 0.4744,
"step": 262
},
{
"epoch": 0.08492260499283566,
"grad_norm": 0.29576462507247925,
"learning_rate": 1.1889023483392879e-05,
"loss": 0.4506,
"step": 263
},
{
"epoch": 0.0852455046315917,
"grad_norm": 0.2991703450679779,
"learning_rate": 1.1263117061266675e-05,
"loss": 0.4842,
"step": 264
},
{
"epoch": 0.08556840427034772,
"grad_norm": 0.3080856502056122,
"learning_rate": 1.0653492027481286e-05,
"loss": 0.4486,
"step": 265
},
{
"epoch": 0.08589130390910375,
"grad_norm": 0.2527904510498047,
"learning_rate": 1.0060219924237379e-05,
"loss": 0.3798,
"step": 266
},
{
"epoch": 0.08621420354785977,
"grad_norm": 0.2680191397666931,
"learning_rate": 9.48337037464666e-06,
"loss": 0.4122,
"step": 267
},
{
"epoch": 0.08653710318661581,
"grad_norm": 0.29812344908714294,
"learning_rate": 8.923011074561404e-06,
"loss": 0.4546,
"step": 268
},
{
"epoch": 0.08686000282537185,
"grad_norm": 0.3110487163066864,
"learning_rate": 8.379207784630004e-06,
"loss": 0.4445,
"step": 269
},
{
"epoch": 0.08718290246412787,
"grad_norm": 0.32935261726379395,
"learning_rate": 7.852024322579648e-06,
"loss": 0.482,
"step": 270
},
{
"epoch": 0.0875058021028839,
"grad_norm": 0.30921775102615356,
"learning_rate": 7.34152255572697e-06,
"loss": 0.4362,
"step": 271
},
{
"epoch": 0.08782870174163993,
"grad_norm": 0.3837946951389313,
"learning_rate": 6.847762393717782e-06,
"loss": 0.433,
"step": 272
},
{
"epoch": 0.08815160138039596,
"grad_norm": 0.2926897406578064,
"learning_rate": 6.370801781496326e-06,
"loss": 0.4659,
"step": 273
},
{
"epoch": 0.08847450101915198,
"grad_norm": 0.35898199677467346,
"learning_rate": 5.910696692505201e-06,
"loss": 0.506,
"step": 274
},
{
"epoch": 0.08879740065790802,
"grad_norm": 0.3298279345035553,
"learning_rate": 5.467501122116563e-06,
"loss": 0.5052,
"step": 275
},
{
"epoch": 0.08912030029666404,
"grad_norm": 0.34559693932533264,
"learning_rate": 5.0412670812956465e-06,
"loss": 0.4997,
"step": 276
},
{
"epoch": 0.08944319993542008,
"grad_norm": 0.2868078947067261,
"learning_rate": 4.6320445904969475e-06,
"loss": 0.4047,
"step": 277
},
{
"epoch": 0.0897660995741761,
"grad_norm": 0.3573528528213501,
"learning_rate": 4.239881673794165e-06,
"loss": 0.481,
"step": 278
},
{
"epoch": 0.09008899921293213,
"grad_norm": 0.3438877463340759,
"learning_rate": 3.864824353244367e-06,
"loss": 0.5199,
"step": 279
},
{
"epoch": 0.09041189885168815,
"grad_norm": 0.3259707987308502,
"learning_rate": 3.506916643487001e-06,
"loss": 0.4441,
"step": 280
},
{
"epoch": 0.09073479849044419,
"grad_norm": 0.36126869916915894,
"learning_rate": 3.166200546578718e-06,
"loss": 0.4598,
"step": 281
},
{
"epoch": 0.09105769812920021,
"grad_norm": 0.29352861642837524,
"learning_rate": 2.8427160470641253e-06,
"loss": 0.4116,
"step": 282
},
{
"epoch": 0.09138059776795625,
"grad_norm": 0.390318363904953,
"learning_rate": 2.5365011072835117e-06,
"loss": 0.457,
"step": 283
},
{
"epoch": 0.09170349740671227,
"grad_norm": 0.34145522117614746,
"learning_rate": 2.2475916629177415e-06,
"loss": 0.4275,
"step": 284
},
{
"epoch": 0.0920263970454683,
"grad_norm": 0.3860124945640564,
"learning_rate": 1.9760216187710787e-06,
"loss": 0.5023,
"step": 285
},
{
"epoch": 0.09234929668422434,
"grad_norm": 0.36518341302871704,
"learning_rate": 1.7218228447922867e-06,
"loss": 0.4925,
"step": 286
},
{
"epoch": 0.09267219632298036,
"grad_norm": 0.3913903832435608,
"learning_rate": 1.4850251723345196e-06,
"loss": 0.4858,
"step": 287
},
{
"epoch": 0.0929950959617364,
"grad_norm": 0.35096660256385803,
"learning_rate": 1.2656563906545902e-06,
"loss": 0.4196,
"step": 288
},
{
"epoch": 0.09331799560049242,
"grad_norm": 0.4638069272041321,
"learning_rate": 1.0637422436516274e-06,
"loss": 0.5741,
"step": 289
},
{
"epoch": 0.09364089523924846,
"grad_norm": 0.40387284755706787,
"learning_rate": 8.793064268460604e-07,
"loss": 0.4867,
"step": 290
},
{
"epoch": 0.09396379487800448,
"grad_norm": 0.39819347858428955,
"learning_rate": 7.123705845987093e-07,
"loss": 0.4803,
"step": 291
},
{
"epoch": 0.09428669451676051,
"grad_norm": 0.3998761773109436,
"learning_rate": 5.629543075708176e-07,
"loss": 0.4755,
"step": 292
},
{
"epoch": 0.09460959415551654,
"grad_norm": 0.3553345799446106,
"learning_rate": 4.310751304249738e-07,
"loss": 0.5079,
"step": 293
},
{
"epoch": 0.09493249379427257,
"grad_norm": 0.35981830954551697,
"learning_rate": 3.167485297673411e-07,
"loss": 0.4487,
"step": 294
},
{
"epoch": 0.09525539343302859,
"grad_norm": 0.38175123929977417,
"learning_rate": 2.1998792233142714e-07,
"loss": 0.5537,
"step": 295
},
{
"epoch": 0.09557829307178463,
"grad_norm": 0.42106011509895325,
"learning_rate": 1.4080466340349316e-07,
"loss": 0.4318,
"step": 296
},
{
"epoch": 0.09590119271054065,
"grad_norm": 0.37637245655059814,
"learning_rate": 7.92080454900701e-08,
"loss": 0.5632,
"step": 297
},
{
"epoch": 0.09622409234929669,
"grad_norm": 0.3730914890766144,
"learning_rate": 3.5205297227380855e-08,
"loss": 0.4611,
"step": 298
},
{
"epoch": 0.09654699198805271,
"grad_norm": 0.5511401891708374,
"learning_rate": 8.801582533035644e-09,
"loss": 0.5325,
"step": 299
},
{
"epoch": 0.09686989162680874,
"grad_norm": 0.5235540270805359,
"learning_rate": 0.0,
"loss": 0.6263,
"step": 300
},
{
"epoch": 0.09686989162680874,
"eval_loss": 0.4757327735424042,
"eval_runtime": 92.9521,
"eval_samples_per_second": 2.679,
"eval_steps_per_second": 2.679,
"step": 300
}
],
"logging_steps": 1,
"max_steps": 300,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7681879996301312e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}