m7n's picture
Upload folder using huggingface_hub
ef2960e verified
{
"best_metric": 0.4268312156200409,
"best_model_checkpoint": "gte-modernbert-philosophy-v1-1-autotr/checkpoint-11543",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 11543,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021658147795200556,
"grad_norm": 9.93041706085205,
"learning_rate": 6.493506493506493e-07,
"loss": 1.2002,
"step": 25
},
{
"epoch": 0.004331629559040111,
"grad_norm": 10.093696594238281,
"learning_rate": 1.2987012987012986e-06,
"loss": 1.1623,
"step": 50
},
{
"epoch": 0.006497444338560167,
"grad_norm": 9.860174179077148,
"learning_rate": 1.948051948051948e-06,
"loss": 1.2012,
"step": 75
},
{
"epoch": 0.008663259118080222,
"grad_norm": 8.943851470947266,
"learning_rate": 2.597402597402597e-06,
"loss": 1.1853,
"step": 100
},
{
"epoch": 0.010829073897600277,
"grad_norm": 5.130438327789307,
"learning_rate": 3.246753246753247e-06,
"loss": 0.9767,
"step": 125
},
{
"epoch": 0.012994888677120333,
"grad_norm": 8.450475692749023,
"learning_rate": 3.896103896103896e-06,
"loss": 0.865,
"step": 150
},
{
"epoch": 0.015160703456640388,
"grad_norm": 3.7100517749786377,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.7733,
"step": 175
},
{
"epoch": 0.017326518236160444,
"grad_norm": 15.069323539733887,
"learning_rate": 5.194805194805194e-06,
"loss": 0.9545,
"step": 200
},
{
"epoch": 0.0194923330156805,
"grad_norm": 9.745051383972168,
"learning_rate": 5.8181818181818185e-06,
"loss": 0.8309,
"step": 225
},
{
"epoch": 0.021658147795200554,
"grad_norm": 7.115631580352783,
"learning_rate": 6.467532467532467e-06,
"loss": 0.7514,
"step": 250
},
{
"epoch": 0.02382396257472061,
"grad_norm": 15.047070503234863,
"learning_rate": 7.116883116883117e-06,
"loss": 0.5555,
"step": 275
},
{
"epoch": 0.025989777354240667,
"grad_norm": 14.228421211242676,
"learning_rate": 7.766233766233767e-06,
"loss": 0.563,
"step": 300
},
{
"epoch": 0.02815559213376072,
"grad_norm": 8.77304458618164,
"learning_rate": 8.415584415584416e-06,
"loss": 0.618,
"step": 325
},
{
"epoch": 0.030321406913280776,
"grad_norm": 4.78096866607666,
"learning_rate": 9.064935064935066e-06,
"loss": 0.6538,
"step": 350
},
{
"epoch": 0.032487221692800834,
"grad_norm": 9.978753089904785,
"learning_rate": 9.714285714285715e-06,
"loss": 0.5802,
"step": 375
},
{
"epoch": 0.03465303647232089,
"grad_norm": 15.071464538574219,
"learning_rate": 1.0363636363636364e-05,
"loss": 0.6568,
"step": 400
},
{
"epoch": 0.036818851251840944,
"grad_norm": 12.352958679199219,
"learning_rate": 1.1012987012987013e-05,
"loss": 0.4934,
"step": 425
},
{
"epoch": 0.038984666031361,
"grad_norm": 8.754806518554688,
"learning_rate": 1.1662337662337662e-05,
"loss": 0.597,
"step": 450
},
{
"epoch": 0.04115048081088105,
"grad_norm": 2.432300090789795,
"learning_rate": 1.2311688311688312e-05,
"loss": 0.3812,
"step": 475
},
{
"epoch": 0.04331629559040111,
"grad_norm": 7.439950466156006,
"learning_rate": 1.2961038961038961e-05,
"loss": 0.482,
"step": 500
},
{
"epoch": 0.04548211036992116,
"grad_norm": 14.198251724243164,
"learning_rate": 1.361038961038961e-05,
"loss": 0.5347,
"step": 525
},
{
"epoch": 0.04764792514944122,
"grad_norm": 5.91489839553833,
"learning_rate": 1.425974025974026e-05,
"loss": 0.5012,
"step": 550
},
{
"epoch": 0.04981373992896128,
"grad_norm": 14.394525527954102,
"learning_rate": 1.490909090909091e-05,
"loss": 0.5765,
"step": 575
},
{
"epoch": 0.05197955470848133,
"grad_norm": 16.823543548583984,
"learning_rate": 1.555844155844156e-05,
"loss": 0.4286,
"step": 600
},
{
"epoch": 0.05414536948800139,
"grad_norm": 4.72399377822876,
"learning_rate": 1.6207792207792207e-05,
"loss": 0.5167,
"step": 625
},
{
"epoch": 0.05631118426752144,
"grad_norm": 18.0063419342041,
"learning_rate": 1.6857142857142858e-05,
"loss": 0.4791,
"step": 650
},
{
"epoch": 0.0584769990470415,
"grad_norm": 11.925456047058105,
"learning_rate": 1.750649350649351e-05,
"loss": 0.5022,
"step": 675
},
{
"epoch": 0.06064281382656155,
"grad_norm": 2.7437996864318848,
"learning_rate": 1.8155844155844156e-05,
"loss": 0.438,
"step": 700
},
{
"epoch": 0.0628086286060816,
"grad_norm": 1.8270901441574097,
"learning_rate": 1.8805194805194806e-05,
"loss": 0.3995,
"step": 725
},
{
"epoch": 0.06497444338560167,
"grad_norm": 4.187374591827393,
"learning_rate": 1.9454545454545453e-05,
"loss": 0.2924,
"step": 750
},
{
"epoch": 0.06714025816512172,
"grad_norm": 12.709814071655273,
"learning_rate": 2.0103896103896104e-05,
"loss": 0.4391,
"step": 775
},
{
"epoch": 0.06930607294464178,
"grad_norm": 8.789942741394043,
"learning_rate": 2.0753246753246755e-05,
"loss": 0.4328,
"step": 800
},
{
"epoch": 0.07147188772416183,
"grad_norm": 10.182008743286133,
"learning_rate": 2.137662337662338e-05,
"loss": 0.5658,
"step": 825
},
{
"epoch": 0.07363770250368189,
"grad_norm": 3.5178301334381104,
"learning_rate": 2.2025974025974026e-05,
"loss": 0.4541,
"step": 850
},
{
"epoch": 0.07580351728320193,
"grad_norm": 8.124090194702148,
"learning_rate": 2.2675324675324676e-05,
"loss": 0.5381,
"step": 875
},
{
"epoch": 0.077969332062722,
"grad_norm": 11.69704532623291,
"learning_rate": 2.3324675324675324e-05,
"loss": 0.4523,
"step": 900
},
{
"epoch": 0.08013514684224206,
"grad_norm": 19.822145462036133,
"learning_rate": 2.3974025974025974e-05,
"loss": 0.3522,
"step": 925
},
{
"epoch": 0.0823009616217621,
"grad_norm": 8.31993579864502,
"learning_rate": 2.4623376623376625e-05,
"loss": 0.4475,
"step": 950
},
{
"epoch": 0.08446677640128217,
"grad_norm": 5.60876989364624,
"learning_rate": 2.5246753246753246e-05,
"loss": 0.4448,
"step": 975
},
{
"epoch": 0.08663259118080222,
"grad_norm": 9.872743606567383,
"learning_rate": 2.5896103896103896e-05,
"loss": 0.407,
"step": 1000
},
{
"epoch": 0.08879840596032228,
"grad_norm": 7.193666458129883,
"learning_rate": 2.6545454545454547e-05,
"loss": 0.4616,
"step": 1025
},
{
"epoch": 0.09096422073984232,
"grad_norm": 17.595991134643555,
"learning_rate": 2.7194805194805194e-05,
"loss": 0.4213,
"step": 1050
},
{
"epoch": 0.09313003551936239,
"grad_norm": 3.281184196472168,
"learning_rate": 2.7844155844155844e-05,
"loss": 0.465,
"step": 1075
},
{
"epoch": 0.09529585029888243,
"grad_norm": 7.671459197998047,
"learning_rate": 2.849350649350649e-05,
"loss": 0.2964,
"step": 1100
},
{
"epoch": 0.0974616650784025,
"grad_norm": 7.963995933532715,
"learning_rate": 2.9142857142857142e-05,
"loss": 0.4414,
"step": 1125
},
{
"epoch": 0.09962747985792256,
"grad_norm": 1.8723474740982056,
"learning_rate": 2.9792207792207793e-05,
"loss": 0.3508,
"step": 1150
},
{
"epoch": 0.1017932946374426,
"grad_norm": 5.1907877922058105,
"learning_rate": 2.995090489025799e-05,
"loss": 0.3362,
"step": 1175
},
{
"epoch": 0.10395910941696267,
"grad_norm": 5.219175815582275,
"learning_rate": 2.9878706199460916e-05,
"loss": 0.4953,
"step": 1200
},
{
"epoch": 0.10612492419648271,
"grad_norm": 15.204286575317383,
"learning_rate": 2.9806507508663843e-05,
"loss": 0.4041,
"step": 1225
},
{
"epoch": 0.10829073897600278,
"grad_norm": 5.872297286987305,
"learning_rate": 2.973430881786677e-05,
"loss": 0.3773,
"step": 1250
},
{
"epoch": 0.11045655375552282,
"grad_norm": 7.201790809631348,
"learning_rate": 2.9662110127069697e-05,
"loss": 0.3574,
"step": 1275
},
{
"epoch": 0.11262236853504289,
"grad_norm": 2.872793674468994,
"learning_rate": 2.9589911436272623e-05,
"loss": 0.642,
"step": 1300
},
{
"epoch": 0.11478818331456293,
"grad_norm": 10.854488372802734,
"learning_rate": 2.951771274547555e-05,
"loss": 0.3783,
"step": 1325
},
{
"epoch": 0.116953998094083,
"grad_norm": 2.162464141845703,
"learning_rate": 2.9445514054678477e-05,
"loss": 0.4905,
"step": 1350
},
{
"epoch": 0.11911981287360304,
"grad_norm": 14.541825294494629,
"learning_rate": 2.9373315363881403e-05,
"loss": 0.3937,
"step": 1375
},
{
"epoch": 0.1212856276531231,
"grad_norm": 1.6897481679916382,
"learning_rate": 2.9301116673084327e-05,
"loss": 0.4245,
"step": 1400
},
{
"epoch": 0.12345144243264317,
"grad_norm": 9.359882354736328,
"learning_rate": 2.9228917982287253e-05,
"loss": 0.4139,
"step": 1425
},
{
"epoch": 0.1256172572121632,
"grad_norm": 39.94605255126953,
"learning_rate": 2.915671929149018e-05,
"loss": 0.4305,
"step": 1450
},
{
"epoch": 0.12778307199168326,
"grad_norm": 10.268132209777832,
"learning_rate": 2.908452060069311e-05,
"loss": 0.675,
"step": 1475
},
{
"epoch": 0.12994888677120334,
"grad_norm": 1.7209604978561401,
"learning_rate": 2.9012321909896037e-05,
"loss": 0.55,
"step": 1500
},
{
"epoch": 0.13211470155072338,
"grad_norm": 8.541482925415039,
"learning_rate": 2.894012321909896e-05,
"loss": 0.4033,
"step": 1525
},
{
"epoch": 0.13428051633024343,
"grad_norm": 10.4110107421875,
"learning_rate": 2.8867924528301887e-05,
"loss": 0.4167,
"step": 1550
},
{
"epoch": 0.13644633110976348,
"grad_norm": 10.823756217956543,
"learning_rate": 2.8795725837504814e-05,
"loss": 0.3814,
"step": 1575
},
{
"epoch": 0.13861214588928356,
"grad_norm": 0.6896539926528931,
"learning_rate": 2.872352714670774e-05,
"loss": 0.5183,
"step": 1600
},
{
"epoch": 0.1407779606688036,
"grad_norm": 4.357579231262207,
"learning_rate": 2.8651328455910667e-05,
"loss": 0.3343,
"step": 1625
},
{
"epoch": 0.14294377544832365,
"grad_norm": 12.074344635009766,
"learning_rate": 2.857912976511359e-05,
"loss": 0.4212,
"step": 1650
},
{
"epoch": 0.14510959022784373,
"grad_norm": 11.660531997680664,
"learning_rate": 2.850693107431652e-05,
"loss": 0.4737,
"step": 1675
},
{
"epoch": 0.14727540500736377,
"grad_norm": 15.467144966125488,
"learning_rate": 2.8434732383519447e-05,
"loss": 0.4563,
"step": 1700
},
{
"epoch": 0.14944121978688382,
"grad_norm": 9.277994155883789,
"learning_rate": 2.8362533692722374e-05,
"loss": 0.4251,
"step": 1725
},
{
"epoch": 0.15160703456640387,
"grad_norm": 3.6043941974639893,
"learning_rate": 2.82903350019253e-05,
"loss": 0.3497,
"step": 1750
},
{
"epoch": 0.15377284934592395,
"grad_norm": 3.933353900909424,
"learning_rate": 2.8218136311128224e-05,
"loss": 0.3753,
"step": 1775
},
{
"epoch": 0.155938664125444,
"grad_norm": 3.8728222846984863,
"learning_rate": 2.814593762033115e-05,
"loss": 0.4031,
"step": 1800
},
{
"epoch": 0.15810447890496404,
"grad_norm": 8.067976951599121,
"learning_rate": 2.8073738929534077e-05,
"loss": 0.4037,
"step": 1825
},
{
"epoch": 0.16027029368448412,
"grad_norm": 9.141134262084961,
"learning_rate": 2.8001540238737004e-05,
"loss": 0.4114,
"step": 1850
},
{
"epoch": 0.16243610846400416,
"grad_norm": 1.8272747993469238,
"learning_rate": 2.7929341547939934e-05,
"loss": 0.3848,
"step": 1875
},
{
"epoch": 0.1646019232435242,
"grad_norm": 0.4890976846218109,
"learning_rate": 2.7857142857142858e-05,
"loss": 0.5088,
"step": 1900
},
{
"epoch": 0.16676773802304426,
"grad_norm": 9.043623924255371,
"learning_rate": 2.7784944166345784e-05,
"loss": 0.4032,
"step": 1925
},
{
"epoch": 0.16893355280256434,
"grad_norm": 9.092608451843262,
"learning_rate": 2.771274547554871e-05,
"loss": 0.3354,
"step": 1950
},
{
"epoch": 0.17109936758208438,
"grad_norm": 6.121222972869873,
"learning_rate": 2.7640546784751638e-05,
"loss": 0.4163,
"step": 1975
},
{
"epoch": 0.17326518236160443,
"grad_norm": 1.539663314819336,
"learning_rate": 2.7568348093954564e-05,
"loss": 0.3715,
"step": 2000
},
{
"epoch": 0.17543099714112448,
"grad_norm": 16.089406967163086,
"learning_rate": 2.7496149403157488e-05,
"loss": 0.3424,
"step": 2025
},
{
"epoch": 0.17759681192064455,
"grad_norm": 12.510934829711914,
"learning_rate": 2.7423950712360414e-05,
"loss": 0.3311,
"step": 2050
},
{
"epoch": 0.1797626267001646,
"grad_norm": 2.823338508605957,
"learning_rate": 2.7351752021563345e-05,
"loss": 0.4362,
"step": 2075
},
{
"epoch": 0.18192844147968465,
"grad_norm": 6.191600322723389,
"learning_rate": 2.727955333076627e-05,
"loss": 0.4441,
"step": 2100
},
{
"epoch": 0.18409425625920472,
"grad_norm": 4.86907434463501,
"learning_rate": 2.7207354639969198e-05,
"loss": 0.3122,
"step": 2125
},
{
"epoch": 0.18626007103872477,
"grad_norm": 7.323814868927002,
"learning_rate": 2.713515594917212e-05,
"loss": 0.3717,
"step": 2150
},
{
"epoch": 0.18842588581824482,
"grad_norm": 10.09737491607666,
"learning_rate": 2.7062957258375048e-05,
"loss": 0.3461,
"step": 2175
},
{
"epoch": 0.19059170059776487,
"grad_norm": 8.536800384521484,
"learning_rate": 2.6990758567577975e-05,
"loss": 0.4816,
"step": 2200
},
{
"epoch": 0.19275751537728494,
"grad_norm": 5.237682819366455,
"learning_rate": 2.69185598767809e-05,
"loss": 0.4784,
"step": 2225
},
{
"epoch": 0.194923330156805,
"grad_norm": 10.763497352600098,
"learning_rate": 2.6846361185983828e-05,
"loss": 0.4334,
"step": 2250
},
{
"epoch": 0.19708914493632504,
"grad_norm": 0.7019050121307373,
"learning_rate": 2.6774162495186755e-05,
"loss": 0.3437,
"step": 2275
},
{
"epoch": 0.19925495971584511,
"grad_norm": 8.020634651184082,
"learning_rate": 2.670196380438968e-05,
"loss": 0.4333,
"step": 2300
},
{
"epoch": 0.20142077449536516,
"grad_norm": 10.549779891967773,
"learning_rate": 2.662976511359261e-05,
"loss": 0.3609,
"step": 2325
},
{
"epoch": 0.2035865892748852,
"grad_norm": 5.6236677169799805,
"learning_rate": 2.6557566422795535e-05,
"loss": 0.3437,
"step": 2350
},
{
"epoch": 0.20575240405440526,
"grad_norm": 1.4388600587844849,
"learning_rate": 2.648536773199846e-05,
"loss": 0.4911,
"step": 2375
},
{
"epoch": 0.20791821883392533,
"grad_norm": 4.445183277130127,
"learning_rate": 2.6413169041201385e-05,
"loss": 0.3872,
"step": 2400
},
{
"epoch": 0.21008403361344538,
"grad_norm": 9.076152801513672,
"learning_rate": 2.6340970350404312e-05,
"loss": 0.276,
"step": 2425
},
{
"epoch": 0.21224984839296543,
"grad_norm": 5.573355197906494,
"learning_rate": 2.6268771659607242e-05,
"loss": 0.3318,
"step": 2450
},
{
"epoch": 0.21441566317248548,
"grad_norm": 5.015573024749756,
"learning_rate": 2.619657296881017e-05,
"loss": 0.4833,
"step": 2475
},
{
"epoch": 0.21658147795200555,
"grad_norm": 3.9038755893707275,
"learning_rate": 2.6124374278013092e-05,
"loss": 0.4656,
"step": 2500
},
{
"epoch": 0.2187472927315256,
"grad_norm": 2.66627836227417,
"learning_rate": 2.605217558721602e-05,
"loss": 0.4232,
"step": 2525
},
{
"epoch": 0.22091310751104565,
"grad_norm": 8.859906196594238,
"learning_rate": 2.5979976896418945e-05,
"loss": 0.434,
"step": 2550
},
{
"epoch": 0.22307892229056572,
"grad_norm": 3.2811522483825684,
"learning_rate": 2.5907778205621872e-05,
"loss": 0.2479,
"step": 2575
},
{
"epoch": 0.22524473707008577,
"grad_norm": 8.53447437286377,
"learning_rate": 2.58355795148248e-05,
"loss": 0.4656,
"step": 2600
},
{
"epoch": 0.22741055184960582,
"grad_norm": 6.359921455383301,
"learning_rate": 2.5763380824027722e-05,
"loss": 0.3881,
"step": 2625
},
{
"epoch": 0.22957636662912587,
"grad_norm": 6.196253776550293,
"learning_rate": 2.5691182133230652e-05,
"loss": 0.3637,
"step": 2650
},
{
"epoch": 0.23174218140864594,
"grad_norm": 7.805304050445557,
"learning_rate": 2.561898344243358e-05,
"loss": 0.3099,
"step": 2675
},
{
"epoch": 0.233907996188166,
"grad_norm": 4.51755428314209,
"learning_rate": 2.5546784751636506e-05,
"loss": 0.3933,
"step": 2700
},
{
"epoch": 0.23607381096768604,
"grad_norm": 5.72914981842041,
"learning_rate": 2.5474586060839432e-05,
"loss": 0.3789,
"step": 2725
},
{
"epoch": 0.23823962574720609,
"grad_norm": 2.4809954166412354,
"learning_rate": 2.5402387370042356e-05,
"loss": 0.4056,
"step": 2750
},
{
"epoch": 0.24040544052672616,
"grad_norm": 1.940656065940857,
"learning_rate": 2.5330188679245282e-05,
"loss": 0.4132,
"step": 2775
},
{
"epoch": 0.2425712553062462,
"grad_norm": 3.452242851257324,
"learning_rate": 2.525798998844821e-05,
"loss": 0.375,
"step": 2800
},
{
"epoch": 0.24473707008576626,
"grad_norm": 9.220993041992188,
"learning_rate": 2.5185791297651136e-05,
"loss": 0.3026,
"step": 2825
},
{
"epoch": 0.24690288486528633,
"grad_norm": 10.027073860168457,
"learning_rate": 2.5113592606854066e-05,
"loss": 0.5372,
"step": 2850
},
{
"epoch": 0.24906869964480638,
"grad_norm": 2.228799819946289,
"learning_rate": 2.504139391605699e-05,
"loss": 0.4233,
"step": 2875
},
{
"epoch": 0.2512345144243264,
"grad_norm": 7.281198978424072,
"learning_rate": 2.4969195225259916e-05,
"loss": 0.2945,
"step": 2900
},
{
"epoch": 0.2534003292038465,
"grad_norm": 1.4160314798355103,
"learning_rate": 2.4896996534462843e-05,
"loss": 0.2916,
"step": 2925
},
{
"epoch": 0.2555661439833665,
"grad_norm": 4.095098972320557,
"learning_rate": 2.482479784366577e-05,
"loss": 0.3536,
"step": 2950
},
{
"epoch": 0.25773195876288657,
"grad_norm": 1.413552165031433,
"learning_rate": 2.4752599152868696e-05,
"loss": 0.3246,
"step": 2975
},
{
"epoch": 0.2598977735424067,
"grad_norm": 3.3196184635162354,
"learning_rate": 2.468040046207162e-05,
"loss": 0.4236,
"step": 3000
},
{
"epoch": 0.2620635883219267,
"grad_norm": 11.855537414550781,
"learning_rate": 2.4608201771274546e-05,
"loss": 0.4088,
"step": 3025
},
{
"epoch": 0.26422940310144677,
"grad_norm": 9.322809219360352,
"learning_rate": 2.4536003080477476e-05,
"loss": 0.4522,
"step": 3050
},
{
"epoch": 0.2663952178809668,
"grad_norm": 7.581571578979492,
"learning_rate": 2.4463804389680403e-05,
"loss": 0.3445,
"step": 3075
},
{
"epoch": 0.26856103266048686,
"grad_norm": 2.6131093502044678,
"learning_rate": 2.439160569888333e-05,
"loss": 0.3575,
"step": 3100
},
{
"epoch": 0.2707268474400069,
"grad_norm": 3.68662166595459,
"learning_rate": 2.4319407008086253e-05,
"loss": 0.3809,
"step": 3125
},
{
"epoch": 0.27289266221952696,
"grad_norm": 2.3688032627105713,
"learning_rate": 2.424720831728918e-05,
"loss": 0.3364,
"step": 3150
},
{
"epoch": 0.27505847699904706,
"grad_norm": 1.155315637588501,
"learning_rate": 2.4175009626492106e-05,
"loss": 0.4103,
"step": 3175
},
{
"epoch": 0.2772242917785671,
"grad_norm": 35.7138671875,
"learning_rate": 2.4102810935695033e-05,
"loss": 0.3502,
"step": 3200
},
{
"epoch": 0.27939010655808716,
"grad_norm": 6.429433822631836,
"learning_rate": 2.403061224489796e-05,
"loss": 0.2632,
"step": 3225
},
{
"epoch": 0.2815559213376072,
"grad_norm": 9.816515922546387,
"learning_rate": 2.3958413554100887e-05,
"loss": 0.406,
"step": 3250
},
{
"epoch": 0.28372173611712725,
"grad_norm": 1.9653140306472778,
"learning_rate": 2.3886214863303813e-05,
"loss": 0.4363,
"step": 3275
},
{
"epoch": 0.2858875508966473,
"grad_norm": 9.559599876403809,
"learning_rate": 2.381401617250674e-05,
"loss": 0.2819,
"step": 3300
},
{
"epoch": 0.28805336567616735,
"grad_norm": 10.623549461364746,
"learning_rate": 2.3741817481709667e-05,
"loss": 0.3421,
"step": 3325
},
{
"epoch": 0.29021918045568745,
"grad_norm": 2.4988913536071777,
"learning_rate": 2.366961879091259e-05,
"loss": 0.269,
"step": 3350
},
{
"epoch": 0.2923849952352075,
"grad_norm": 4.704137802124023,
"learning_rate": 2.3597420100115517e-05,
"loss": 0.2902,
"step": 3375
},
{
"epoch": 0.29455081001472755,
"grad_norm": 9.48901653289795,
"learning_rate": 2.3525221409318443e-05,
"loss": 0.3548,
"step": 3400
},
{
"epoch": 0.2967166247942476,
"grad_norm": 0.5201269388198853,
"learning_rate": 2.3453022718521374e-05,
"loss": 0.4575,
"step": 3425
},
{
"epoch": 0.29888243957376764,
"grad_norm": 8.074861526489258,
"learning_rate": 2.33808240277243e-05,
"loss": 0.3942,
"step": 3450
},
{
"epoch": 0.3010482543532877,
"grad_norm": 8.45334243774414,
"learning_rate": 2.3308625336927224e-05,
"loss": 0.3537,
"step": 3475
},
{
"epoch": 0.30321406913280774,
"grad_norm": 2.7069313526153564,
"learning_rate": 2.323642664613015e-05,
"loss": 0.3672,
"step": 3500
},
{
"epoch": 0.30537988391232784,
"grad_norm": 13.849508285522461,
"learning_rate": 2.3164227955333077e-05,
"loss": 0.3502,
"step": 3525
},
{
"epoch": 0.3075456986918479,
"grad_norm": 4.5892462730407715,
"learning_rate": 2.3092029264536004e-05,
"loss": 0.2545,
"step": 3550
},
{
"epoch": 0.30971151347136794,
"grad_norm": 1.035447120666504,
"learning_rate": 2.301983057373893e-05,
"loss": 0.2544,
"step": 3575
},
{
"epoch": 0.311877328250888,
"grad_norm": 5.170057773590088,
"learning_rate": 2.2947631882941854e-05,
"loss": 0.3443,
"step": 3600
},
{
"epoch": 0.31404314303040803,
"grad_norm": 2.908191204071045,
"learning_rate": 2.2875433192144784e-05,
"loss": 0.3784,
"step": 3625
},
{
"epoch": 0.3162089578099281,
"grad_norm": 9.946891784667969,
"learning_rate": 2.280323450134771e-05,
"loss": 0.3828,
"step": 3650
},
{
"epoch": 0.31837477258944813,
"grad_norm": 10.337167739868164,
"learning_rate": 2.2731035810550637e-05,
"loss": 0.4032,
"step": 3675
},
{
"epoch": 0.32054058736896823,
"grad_norm": 10.093758583068848,
"learning_rate": 2.2658837119753564e-05,
"loss": 0.2556,
"step": 3700
},
{
"epoch": 0.3227064021484883,
"grad_norm": 7.309471130371094,
"learning_rate": 2.2586638428956487e-05,
"loss": 0.3352,
"step": 3725
},
{
"epoch": 0.32487221692800833,
"grad_norm": 10.050370216369629,
"learning_rate": 2.2514439738159414e-05,
"loss": 0.4054,
"step": 3750
},
{
"epoch": 0.3270380317075284,
"grad_norm": 3.858546733856201,
"learning_rate": 2.244224104736234e-05,
"loss": 0.3049,
"step": 3775
},
{
"epoch": 0.3292038464870484,
"grad_norm": 5.640537261962891,
"learning_rate": 2.2370042356565267e-05,
"loss": 0.2223,
"step": 3800
},
{
"epoch": 0.33136966126656847,
"grad_norm": 5.106541633605957,
"learning_rate": 2.2297843665768198e-05,
"loss": 0.4878,
"step": 3825
},
{
"epoch": 0.3335354760460885,
"grad_norm": 7.738224029541016,
"learning_rate": 2.222564497497112e-05,
"loss": 0.3015,
"step": 3850
},
{
"epoch": 0.33570129082560857,
"grad_norm": 12.313666343688965,
"learning_rate": 2.2153446284174048e-05,
"loss": 0.3816,
"step": 3875
},
{
"epoch": 0.33786710560512867,
"grad_norm": 0.9929437041282654,
"learning_rate": 2.2081247593376974e-05,
"loss": 0.3334,
"step": 3900
},
{
"epoch": 0.3400329203846487,
"grad_norm": 5.753032207489014,
"learning_rate": 2.20090489025799e-05,
"loss": 0.3724,
"step": 3925
},
{
"epoch": 0.34219873516416877,
"grad_norm": 8.37396240234375,
"learning_rate": 2.1936850211782828e-05,
"loss": 0.4217,
"step": 3950
},
{
"epoch": 0.3443645499436888,
"grad_norm": 7.365005016326904,
"learning_rate": 2.186465152098575e-05,
"loss": 0.4339,
"step": 3975
},
{
"epoch": 0.34653036472320886,
"grad_norm": 1.91083824634552,
"learning_rate": 2.1792452830188678e-05,
"loss": 0.3642,
"step": 4000
},
{
"epoch": 0.3486961795027289,
"grad_norm": 3.0427494049072266,
"learning_rate": 2.1720254139391608e-05,
"loss": 0.3819,
"step": 4025
},
{
"epoch": 0.35086199428224896,
"grad_norm": 1.176952838897705,
"learning_rate": 2.1648055448594535e-05,
"loss": 0.2796,
"step": 4050
},
{
"epoch": 0.35302780906176906,
"grad_norm": 1.0579583644866943,
"learning_rate": 2.157585675779746e-05,
"loss": 0.4277,
"step": 4075
},
{
"epoch": 0.3551936238412891,
"grad_norm": 11.798035621643066,
"learning_rate": 2.1503658067000385e-05,
"loss": 0.3407,
"step": 4100
},
{
"epoch": 0.35735943862080916,
"grad_norm": 15.57787036895752,
"learning_rate": 2.143145937620331e-05,
"loss": 0.2781,
"step": 4125
},
{
"epoch": 0.3595252534003292,
"grad_norm": 8.533368110656738,
"learning_rate": 2.1359260685406238e-05,
"loss": 0.4274,
"step": 4150
},
{
"epoch": 0.36169106817984925,
"grad_norm": 8.470250129699707,
"learning_rate": 2.1287061994609165e-05,
"loss": 0.3609,
"step": 4175
},
{
"epoch": 0.3638568829593693,
"grad_norm": 6.417985439300537,
"learning_rate": 2.121486330381209e-05,
"loss": 0.3476,
"step": 4200
},
{
"epoch": 0.36602269773888935,
"grad_norm": 8.685192108154297,
"learning_rate": 2.1142664613015018e-05,
"loss": 0.41,
"step": 4225
},
{
"epoch": 0.36818851251840945,
"grad_norm": 7.082727432250977,
"learning_rate": 2.1070465922217945e-05,
"loss": 0.4003,
"step": 4250
},
{
"epoch": 0.3703543272979295,
"grad_norm": 4.621776103973389,
"learning_rate": 2.099826723142087e-05,
"loss": 0.306,
"step": 4275
},
{
"epoch": 0.37252014207744955,
"grad_norm": 3.1071817874908447,
"learning_rate": 2.09260685406238e-05,
"loss": 0.2335,
"step": 4300
},
{
"epoch": 0.3746859568569696,
"grad_norm": 7.23638916015625,
"learning_rate": 2.085386984982672e-05,
"loss": 0.2733,
"step": 4325
},
{
"epoch": 0.37685177163648964,
"grad_norm": 6.893523693084717,
"learning_rate": 2.078167115902965e-05,
"loss": 0.3007,
"step": 4350
},
{
"epoch": 0.3790175864160097,
"grad_norm": 5.9917073249816895,
"learning_rate": 2.0709472468232575e-05,
"loss": 0.3086,
"step": 4375
},
{
"epoch": 0.38118340119552974,
"grad_norm": 6.596795558929443,
"learning_rate": 2.0637273777435502e-05,
"loss": 0.365,
"step": 4400
},
{
"epoch": 0.38334921597504984,
"grad_norm": 9.045963287353516,
"learning_rate": 2.0565075086638432e-05,
"loss": 0.3255,
"step": 4425
},
{
"epoch": 0.3855150307545699,
"grad_norm": 6.755446434020996,
"learning_rate": 2.0492876395841355e-05,
"loss": 0.3765,
"step": 4450
},
{
"epoch": 0.38768084553408994,
"grad_norm": 11.626537322998047,
"learning_rate": 2.0420677705044282e-05,
"loss": 0.2946,
"step": 4475
},
{
"epoch": 0.38984666031361,
"grad_norm": 4.125662326812744,
"learning_rate": 2.034847901424721e-05,
"loss": 0.3298,
"step": 4500
},
{
"epoch": 0.39201247509313003,
"grad_norm": 1.2437127828598022,
"learning_rate": 2.0276280323450135e-05,
"loss": 0.3645,
"step": 4525
},
{
"epoch": 0.3941782898726501,
"grad_norm": 10.272943496704102,
"learning_rate": 2.0204081632653062e-05,
"loss": 0.2403,
"step": 4550
},
{
"epoch": 0.3963441046521701,
"grad_norm": 2.164606809616089,
"learning_rate": 2.0131882941855985e-05,
"loss": 0.28,
"step": 4575
},
{
"epoch": 0.39850991943169023,
"grad_norm": 9.157061576843262,
"learning_rate": 2.0059684251058916e-05,
"loss": 0.3814,
"step": 4600
},
{
"epoch": 0.4006757342112103,
"grad_norm": 4.034579277038574,
"learning_rate": 1.9987485560261842e-05,
"loss": 0.3419,
"step": 4625
},
{
"epoch": 0.4028415489907303,
"grad_norm": 2.5503344535827637,
"learning_rate": 1.991528686946477e-05,
"loss": 0.3374,
"step": 4650
},
{
"epoch": 0.4050073637702504,
"grad_norm": 4.660188674926758,
"learning_rate": 1.9843088178667696e-05,
"loss": 0.3511,
"step": 4675
},
{
"epoch": 0.4071731785497704,
"grad_norm": 7.020951747894287,
"learning_rate": 1.977088948787062e-05,
"loss": 0.4339,
"step": 4700
},
{
"epoch": 0.40933899332929047,
"grad_norm": 5.4507269859313965,
"learning_rate": 1.9698690797073546e-05,
"loss": 0.3441,
"step": 4725
},
{
"epoch": 0.4115048081088105,
"grad_norm": 11.266243934631348,
"learning_rate": 1.9626492106276472e-05,
"loss": 0.346,
"step": 4750
},
{
"epoch": 0.41367062288833056,
"grad_norm": 2.1191511154174805,
"learning_rate": 1.95542934154794e-05,
"loss": 0.3723,
"step": 4775
},
{
"epoch": 0.41583643766785067,
"grad_norm": 1.9068052768707275,
"learning_rate": 1.948209472468233e-05,
"loss": 0.2075,
"step": 4800
},
{
"epoch": 0.4180022524473707,
"grad_norm": 0.36394304037094116,
"learning_rate": 1.9409896033885253e-05,
"loss": 0.2431,
"step": 4825
},
{
"epoch": 0.42016806722689076,
"grad_norm": 6.177628993988037,
"learning_rate": 1.933769734308818e-05,
"loss": 0.2642,
"step": 4850
},
{
"epoch": 0.4223338820064108,
"grad_norm": 3.6669273376464844,
"learning_rate": 1.9265498652291106e-05,
"loss": 0.1763,
"step": 4875
},
{
"epoch": 0.42449969678593086,
"grad_norm": 7.836557865142822,
"learning_rate": 1.9193299961494033e-05,
"loss": 0.3862,
"step": 4900
},
{
"epoch": 0.4266655115654509,
"grad_norm": 10.140486717224121,
"learning_rate": 1.912110127069696e-05,
"loss": 0.3053,
"step": 4925
},
{
"epoch": 0.42883132634497095,
"grad_norm": 2.8873586654663086,
"learning_rate": 1.9048902579899883e-05,
"loss": 0.3162,
"step": 4950
},
{
"epoch": 0.43099714112449106,
"grad_norm": 1.758466362953186,
"learning_rate": 1.897670388910281e-05,
"loss": 0.3178,
"step": 4975
},
{
"epoch": 0.4331629559040111,
"grad_norm": 7.523651599884033,
"learning_rate": 1.890450519830574e-05,
"loss": 0.2789,
"step": 5000
},
{
"epoch": 0.43532877068353115,
"grad_norm": 5.955496311187744,
"learning_rate": 1.8832306507508666e-05,
"loss": 0.1777,
"step": 5025
},
{
"epoch": 0.4374945854630512,
"grad_norm": 9.068547248840332,
"learning_rate": 1.8760107816711593e-05,
"loss": 0.4155,
"step": 5050
},
{
"epoch": 0.43966040024257125,
"grad_norm": 4.900373458862305,
"learning_rate": 1.8687909125914516e-05,
"loss": 0.2983,
"step": 5075
},
{
"epoch": 0.4418262150220913,
"grad_norm": 3.5501790046691895,
"learning_rate": 1.8615710435117443e-05,
"loss": 0.3687,
"step": 5100
},
{
"epoch": 0.44399202980161134,
"grad_norm": 1.0216822624206543,
"learning_rate": 1.854351174432037e-05,
"loss": 0.2428,
"step": 5125
},
{
"epoch": 0.44615784458113145,
"grad_norm": 7.637403964996338,
"learning_rate": 1.8471313053523296e-05,
"loss": 0.3071,
"step": 5150
},
{
"epoch": 0.4483236593606515,
"grad_norm": 9.478981018066406,
"learning_rate": 1.8399114362726223e-05,
"loss": 0.2911,
"step": 5175
},
{
"epoch": 0.45048947414017154,
"grad_norm": 3.875411033630371,
"learning_rate": 1.832691567192915e-05,
"loss": 0.3152,
"step": 5200
},
{
"epoch": 0.4526552889196916,
"grad_norm": 1.1700037717819214,
"learning_rate": 1.8254716981132077e-05,
"loss": 0.2776,
"step": 5225
},
{
"epoch": 0.45482110369921164,
"grad_norm": 4.037864685058594,
"learning_rate": 1.8182518290335003e-05,
"loss": 0.2674,
"step": 5250
},
{
"epoch": 0.4569869184787317,
"grad_norm": 2.6295673847198486,
"learning_rate": 1.811031959953793e-05,
"loss": 0.3035,
"step": 5275
},
{
"epoch": 0.45915273325825173,
"grad_norm": 9.654006004333496,
"learning_rate": 1.8038120908740853e-05,
"loss": 0.3352,
"step": 5300
},
{
"epoch": 0.46131854803777184,
"grad_norm": 7.339272975921631,
"learning_rate": 1.796592221794378e-05,
"loss": 0.3879,
"step": 5325
},
{
"epoch": 0.4634843628172919,
"grad_norm": 5.668703079223633,
"learning_rate": 1.7893723527146707e-05,
"loss": 0.3828,
"step": 5350
},
{
"epoch": 0.46565017759681193,
"grad_norm": 11.843222618103027,
"learning_rate": 1.7821524836349633e-05,
"loss": 0.2797,
"step": 5375
},
{
"epoch": 0.467815992376332,
"grad_norm": 3.3071844577789307,
"learning_rate": 1.7749326145552564e-05,
"loss": 0.3492,
"step": 5400
},
{
"epoch": 0.469981807155852,
"grad_norm": 11.303645133972168,
"learning_rate": 1.7677127454755487e-05,
"loss": 0.5,
"step": 5425
},
{
"epoch": 0.4721476219353721,
"grad_norm": 1.1275362968444824,
"learning_rate": 1.7604928763958414e-05,
"loss": 0.2317,
"step": 5450
},
{
"epoch": 0.4743134367148921,
"grad_norm": 11.97022533416748,
"learning_rate": 1.753273007316134e-05,
"loss": 0.2411,
"step": 5475
},
{
"epoch": 0.47647925149441217,
"grad_norm": 2.9647443294525146,
"learning_rate": 1.7460531382364267e-05,
"loss": 0.277,
"step": 5500
},
{
"epoch": 0.4786450662739323,
"grad_norm": 5.046292781829834,
"learning_rate": 1.7388332691567194e-05,
"loss": 0.4112,
"step": 5525
},
{
"epoch": 0.4808108810534523,
"grad_norm": 8.11351203918457,
"learning_rate": 1.7316134000770117e-05,
"loss": 0.5116,
"step": 5550
},
{
"epoch": 0.48297669583297237,
"grad_norm": 1.0861672163009644,
"learning_rate": 1.7243935309973047e-05,
"loss": 0.3264,
"step": 5575
},
{
"epoch": 0.4851425106124924,
"grad_norm": 2.311553955078125,
"learning_rate": 1.7171736619175974e-05,
"loss": 0.3688,
"step": 5600
},
{
"epoch": 0.48730832539201246,
"grad_norm": 2.371721029281616,
"learning_rate": 1.70995379283789e-05,
"loss": 0.3224,
"step": 5625
},
{
"epoch": 0.4894741401715325,
"grad_norm": 7.7612714767456055,
"learning_rate": 1.7027339237581827e-05,
"loss": 0.3778,
"step": 5650
},
{
"epoch": 0.49163995495105256,
"grad_norm": 7.416019916534424,
"learning_rate": 1.695514054678475e-05,
"loss": 0.3671,
"step": 5675
},
{
"epoch": 0.49380576973057266,
"grad_norm": 7.0320940017700195,
"learning_rate": 1.6882941855987677e-05,
"loss": 0.3331,
"step": 5700
},
{
"epoch": 0.4959715845100927,
"grad_norm": 0.8671308159828186,
"learning_rate": 1.6810743165190604e-05,
"loss": 0.3426,
"step": 5725
},
{
"epoch": 0.49813739928961276,
"grad_norm": 6.607793807983398,
"learning_rate": 1.673854447439353e-05,
"loss": 0.2863,
"step": 5750
},
{
"epoch": 0.5003032140691328,
"grad_norm": 10.399803161621094,
"learning_rate": 1.666634578359646e-05,
"loss": 0.5822,
"step": 5775
},
{
"epoch": 0.5024690288486529,
"grad_norm": 2.4261348247528076,
"learning_rate": 1.6594147092799384e-05,
"loss": 0.2687,
"step": 5800
},
{
"epoch": 0.5046348436281729,
"grad_norm": 0.30012401938438416,
"learning_rate": 1.652194840200231e-05,
"loss": 0.3365,
"step": 5825
},
{
"epoch": 0.506800658407693,
"grad_norm": 8.255668640136719,
"learning_rate": 1.6449749711205238e-05,
"loss": 0.4609,
"step": 5850
},
{
"epoch": 0.508966473187213,
"grad_norm": 6.495670795440674,
"learning_rate": 1.6377551020408164e-05,
"loss": 0.3127,
"step": 5875
},
{
"epoch": 0.511132287966733,
"grad_norm": 4.311783790588379,
"learning_rate": 1.630535232961109e-05,
"loss": 0.2705,
"step": 5900
},
{
"epoch": 0.5132981027462531,
"grad_norm": 7.5022430419921875,
"learning_rate": 1.6233153638814014e-05,
"loss": 0.3089,
"step": 5925
},
{
"epoch": 0.5154639175257731,
"grad_norm": 9.813260078430176,
"learning_rate": 1.616095494801694e-05,
"loss": 0.3386,
"step": 5950
},
{
"epoch": 0.5176297323052933,
"grad_norm": 8.11892318725586,
"learning_rate": 1.608875625721987e-05,
"loss": 0.3796,
"step": 5975
},
{
"epoch": 0.5197955470848133,
"grad_norm": 8.750290870666504,
"learning_rate": 1.6016557566422798e-05,
"loss": 0.4231,
"step": 6000
},
{
"epoch": 0.5219613618643334,
"grad_norm": 8.316088676452637,
"learning_rate": 1.5944358875625725e-05,
"loss": 0.3922,
"step": 6025
},
{
"epoch": 0.5241271766438534,
"grad_norm": 4.458547592163086,
"learning_rate": 1.5872160184828648e-05,
"loss": 0.3138,
"step": 6050
},
{
"epoch": 0.5262929914233735,
"grad_norm": 4.100847244262695,
"learning_rate": 1.5799961494031575e-05,
"loss": 0.3106,
"step": 6075
},
{
"epoch": 0.5284588062028935,
"grad_norm": 3.5927000045776367,
"learning_rate": 1.57277628032345e-05,
"loss": 0.188,
"step": 6100
},
{
"epoch": 0.5306246209824136,
"grad_norm": 0.6444216370582581,
"learning_rate": 1.5655564112437428e-05,
"loss": 0.209,
"step": 6125
},
{
"epoch": 0.5327904357619336,
"grad_norm": 6.649785041809082,
"learning_rate": 1.5583365421640355e-05,
"loss": 0.2617,
"step": 6150
},
{
"epoch": 0.5349562505414537,
"grad_norm": 8.491826057434082,
"learning_rate": 1.551116673084328e-05,
"loss": 0.3059,
"step": 6175
},
{
"epoch": 0.5371220653209737,
"grad_norm": 22.71511459350586,
"learning_rate": 1.5438968040046208e-05,
"loss": 0.2764,
"step": 6200
},
{
"epoch": 0.5392878801004938,
"grad_norm": 6.877171516418457,
"learning_rate": 1.5366769349249135e-05,
"loss": 0.2801,
"step": 6225
},
{
"epoch": 0.5414536948800138,
"grad_norm": 0.46479833126068115,
"learning_rate": 1.529457065845206e-05,
"loss": 0.3744,
"step": 6250
},
{
"epoch": 0.5436195096595339,
"grad_norm": 7.200215816497803,
"learning_rate": 1.5222371967654987e-05,
"loss": 0.3067,
"step": 6275
},
{
"epoch": 0.5457853244390539,
"grad_norm": 6.230359077453613,
"learning_rate": 1.5150173276857913e-05,
"loss": 0.3305,
"step": 6300
},
{
"epoch": 0.5479511392185741,
"grad_norm": 3.2241950035095215,
"learning_rate": 1.5077974586060838e-05,
"loss": 0.2827,
"step": 6325
},
{
"epoch": 0.5501169539980941,
"grad_norm": 10.813590049743652,
"learning_rate": 1.5005775895263765e-05,
"loss": 0.2712,
"step": 6350
},
{
"epoch": 0.5522827687776142,
"grad_norm": 3.5207877159118652,
"learning_rate": 1.4933577204466692e-05,
"loss": 0.2677,
"step": 6375
},
{
"epoch": 0.5544485835571342,
"grad_norm": 6.884098529815674,
"learning_rate": 1.4861378513669619e-05,
"loss": 0.4269,
"step": 6400
},
{
"epoch": 0.5566143983366543,
"grad_norm": 12.490416526794434,
"learning_rate": 1.4789179822872547e-05,
"loss": 0.3834,
"step": 6425
},
{
"epoch": 0.5587802131161743,
"grad_norm": 6.844019889831543,
"learning_rate": 1.4716981132075472e-05,
"loss": 0.4177,
"step": 6450
},
{
"epoch": 0.5609460278956944,
"grad_norm": 2.4574711322784424,
"learning_rate": 1.4644782441278399e-05,
"loss": 0.2457,
"step": 6475
},
{
"epoch": 0.5631118426752144,
"grad_norm": 4.939560413360596,
"learning_rate": 1.4572583750481324e-05,
"loss": 0.348,
"step": 6500
},
{
"epoch": 0.5652776574547345,
"grad_norm": 11.443745613098145,
"learning_rate": 1.4500385059684252e-05,
"loss": 0.3035,
"step": 6525
},
{
"epoch": 0.5674434722342545,
"grad_norm": 5.136826515197754,
"learning_rate": 1.4428186368887177e-05,
"loss": 0.39,
"step": 6550
},
{
"epoch": 0.5696092870137746,
"grad_norm": 8.772330284118652,
"learning_rate": 1.4355987678090104e-05,
"loss": 0.366,
"step": 6575
},
{
"epoch": 0.5717751017932946,
"grad_norm": 0.46080633997917175,
"learning_rate": 1.428378898729303e-05,
"loss": 0.2299,
"step": 6600
},
{
"epoch": 0.5739409165728147,
"grad_norm": 5.478773593902588,
"learning_rate": 1.4211590296495957e-05,
"loss": 0.1737,
"step": 6625
},
{
"epoch": 0.5761067313523347,
"grad_norm": 11.235420227050781,
"learning_rate": 1.4139391605698884e-05,
"loss": 0.3773,
"step": 6650
},
{
"epoch": 0.5782725461318549,
"grad_norm": 7.810971260070801,
"learning_rate": 1.4067192914901809e-05,
"loss": 0.3409,
"step": 6675
},
{
"epoch": 0.5804383609113749,
"grad_norm": 2.817094087600708,
"learning_rate": 1.3994994224104737e-05,
"loss": 0.1739,
"step": 6700
},
{
"epoch": 0.582604175690895,
"grad_norm": 0.4941748082637787,
"learning_rate": 1.3922795533307664e-05,
"loss": 0.3462,
"step": 6725
},
{
"epoch": 0.584769990470415,
"grad_norm": 1.5013363361358643,
"learning_rate": 1.3850596842510589e-05,
"loss": 0.2976,
"step": 6750
},
{
"epoch": 0.586935805249935,
"grad_norm": 4.63820219039917,
"learning_rate": 1.3778398151713516e-05,
"loss": 0.3246,
"step": 6775
},
{
"epoch": 0.5891016200294551,
"grad_norm": 0.6134036779403687,
"learning_rate": 1.3706199460916443e-05,
"loss": 0.3808,
"step": 6800
},
{
"epoch": 0.5912674348089751,
"grad_norm": 9.693577766418457,
"learning_rate": 1.363400077011937e-05,
"loss": 0.2926,
"step": 6825
},
{
"epoch": 0.5934332495884952,
"grad_norm": 8.138602256774902,
"learning_rate": 1.3561802079322296e-05,
"loss": 0.2709,
"step": 6850
},
{
"epoch": 0.5955990643680152,
"grad_norm": 5.065515041351318,
"learning_rate": 1.3489603388525221e-05,
"loss": 0.3777,
"step": 6875
},
{
"epoch": 0.5977648791475353,
"grad_norm": 6.169302463531494,
"learning_rate": 1.341740469772815e-05,
"loss": 0.2834,
"step": 6900
},
{
"epoch": 0.5999306939270553,
"grad_norm": 1.4236884117126465,
"learning_rate": 1.3345206006931074e-05,
"loss": 0.2965,
"step": 6925
},
{
"epoch": 0.6020965087065754,
"grad_norm": 4.954479217529297,
"learning_rate": 1.3273007316134001e-05,
"loss": 0.2399,
"step": 6950
},
{
"epoch": 0.6042623234860954,
"grad_norm": 1.1738444566726685,
"learning_rate": 1.3200808625336928e-05,
"loss": 0.2936,
"step": 6975
},
{
"epoch": 0.6064281382656155,
"grad_norm": 6.822793006896973,
"learning_rate": 1.3128609934539855e-05,
"loss": 0.2674,
"step": 7000
},
{
"epoch": 0.6085939530451355,
"grad_norm": 9.408463478088379,
"learning_rate": 1.3056411243742781e-05,
"loss": 0.265,
"step": 7025
},
{
"epoch": 0.6107597678246557,
"grad_norm": 24.97877311706543,
"learning_rate": 1.2984212552945706e-05,
"loss": 0.3257,
"step": 7050
},
{
"epoch": 0.6129255826041757,
"grad_norm": 2.854039192199707,
"learning_rate": 1.2912013862148633e-05,
"loss": 0.3504,
"step": 7075
},
{
"epoch": 0.6150913973836958,
"grad_norm": 0.40900859236717224,
"learning_rate": 1.283981517135156e-05,
"loss": 0.1485,
"step": 7100
},
{
"epoch": 0.6172572121632158,
"grad_norm": 5.776600360870361,
"learning_rate": 1.2767616480554486e-05,
"loss": 0.2598,
"step": 7125
},
{
"epoch": 0.6194230269427359,
"grad_norm": 1.7507195472717285,
"learning_rate": 1.2695417789757413e-05,
"loss": 0.2838,
"step": 7150
},
{
"epoch": 0.6215888417222559,
"grad_norm": 7.723363399505615,
"learning_rate": 1.2623219098960338e-05,
"loss": 0.3391,
"step": 7175
},
{
"epoch": 0.623754656501776,
"grad_norm": 6.485815048217773,
"learning_rate": 1.2551020408163267e-05,
"loss": 0.3568,
"step": 7200
},
{
"epoch": 0.625920471281296,
"grad_norm": 0.392874151468277,
"learning_rate": 1.2481709664998075e-05,
"loss": 0.3001,
"step": 7225
},
{
"epoch": 0.6280862860608161,
"grad_norm": 1.3930811882019043,
"learning_rate": 1.2409510974201001e-05,
"loss": 0.2613,
"step": 7250
},
{
"epoch": 0.6302521008403361,
"grad_norm": 0.3461158275604248,
"learning_rate": 1.2337312283403928e-05,
"loss": 0.3379,
"step": 7275
},
{
"epoch": 0.6324179156198562,
"grad_norm": 3.489888906478882,
"learning_rate": 1.2265113592606855e-05,
"loss": 0.3347,
"step": 7300
},
{
"epoch": 0.6345837303993762,
"grad_norm": 2.3235511779785156,
"learning_rate": 1.219291490180978e-05,
"loss": 0.242,
"step": 7325
},
{
"epoch": 0.6367495451788963,
"grad_norm": 10.576093673706055,
"learning_rate": 1.2120716211012708e-05,
"loss": 0.3076,
"step": 7350
},
{
"epoch": 0.6389153599584163,
"grad_norm": 4.862971305847168,
"learning_rate": 1.2048517520215633e-05,
"loss": 0.3055,
"step": 7375
},
{
"epoch": 0.6410811747379365,
"grad_norm": 4.282524108886719,
"learning_rate": 1.197631882941856e-05,
"loss": 0.4014,
"step": 7400
},
{
"epoch": 0.6432469895174565,
"grad_norm": 1.2869305610656738,
"learning_rate": 1.1904120138621487e-05,
"loss": 0.3723,
"step": 7425
},
{
"epoch": 0.6454128042969766,
"grad_norm": 8.37488842010498,
"learning_rate": 1.1831921447824414e-05,
"loss": 0.3421,
"step": 7450
},
{
"epoch": 0.6475786190764966,
"grad_norm": 8.292667388916016,
"learning_rate": 1.175972275702734e-05,
"loss": 0.4306,
"step": 7475
},
{
"epoch": 0.6497444338560167,
"grad_norm": 7.678843975067139,
"learning_rate": 1.1687524066230265e-05,
"loss": 0.2536,
"step": 7500
},
{
"epoch": 0.6519102486355367,
"grad_norm": 1.5608030557632446,
"learning_rate": 1.1615325375433192e-05,
"loss": 0.264,
"step": 7525
},
{
"epoch": 0.6540760634150568,
"grad_norm": 7.649046897888184,
"learning_rate": 1.1543126684636119e-05,
"loss": 0.1767,
"step": 7550
},
{
"epoch": 0.6562418781945768,
"grad_norm": 4.701557636260986,
"learning_rate": 1.1470927993839045e-05,
"loss": 0.259,
"step": 7575
},
{
"epoch": 0.6584076929740968,
"grad_norm": 14.77114200592041,
"learning_rate": 1.1398729303041972e-05,
"loss": 0.2761,
"step": 7600
},
{
"epoch": 0.6605735077536169,
"grad_norm": 0.08189712464809418,
"learning_rate": 1.1326530612244897e-05,
"loss": 0.2934,
"step": 7625
},
{
"epoch": 0.6627393225331369,
"grad_norm": 8.246410369873047,
"learning_rate": 1.1254331921447826e-05,
"loss": 0.3055,
"step": 7650
},
{
"epoch": 0.664905137312657,
"grad_norm": 2.8091800212860107,
"learning_rate": 1.118213323065075e-05,
"loss": 0.2532,
"step": 7675
},
{
"epoch": 0.667070952092177,
"grad_norm": 8.43855094909668,
"learning_rate": 1.1109934539853677e-05,
"loss": 0.2942,
"step": 7700
},
{
"epoch": 0.6692367668716971,
"grad_norm": 2.259917974472046,
"learning_rate": 1.1037735849056604e-05,
"loss": 0.2048,
"step": 7725
},
{
"epoch": 0.6714025816512171,
"grad_norm": 13.296177864074707,
"learning_rate": 1.096553715825953e-05,
"loss": 0.2884,
"step": 7750
},
{
"epoch": 0.6735683964307373,
"grad_norm": 7.745298862457275,
"learning_rate": 1.0893338467462457e-05,
"loss": 0.3598,
"step": 7775
},
{
"epoch": 0.6757342112102573,
"grad_norm": 1.932173490524292,
"learning_rate": 1.0821139776665382e-05,
"loss": 0.3318,
"step": 7800
},
{
"epoch": 0.6779000259897774,
"grad_norm": 7.833034515380859,
"learning_rate": 1.0748941085868309e-05,
"loss": 0.3058,
"step": 7825
},
{
"epoch": 0.6800658407692974,
"grad_norm": 8.620037078857422,
"learning_rate": 1.0676742395071238e-05,
"loss": 0.3395,
"step": 7850
},
{
"epoch": 0.6822316555488175,
"grad_norm": 8.948209762573242,
"learning_rate": 1.0604543704274163e-05,
"loss": 0.2973,
"step": 7875
},
{
"epoch": 0.6843974703283375,
"grad_norm": 5.001883506774902,
"learning_rate": 1.053234501347709e-05,
"loss": 0.2741,
"step": 7900
},
{
"epoch": 0.6865632851078576,
"grad_norm": 10.376258850097656,
"learning_rate": 1.0460146322680016e-05,
"loss": 0.2493,
"step": 7925
},
{
"epoch": 0.6887290998873776,
"grad_norm": 9.021862030029297,
"learning_rate": 1.0387947631882943e-05,
"loss": 0.2966,
"step": 7950
},
{
"epoch": 0.6908949146668977,
"grad_norm": 12.025108337402344,
"learning_rate": 1.0315748941085868e-05,
"loss": 0.3207,
"step": 7975
},
{
"epoch": 0.6930607294464177,
"grad_norm": 0.8383066058158875,
"learning_rate": 1.0243550250288794e-05,
"loss": 0.2501,
"step": 8000
},
{
"epoch": 0.6952265442259378,
"grad_norm": 2.6812140941619873,
"learning_rate": 1.0171351559491723e-05,
"loss": 0.4028,
"step": 8025
},
{
"epoch": 0.6973923590054578,
"grad_norm": 11.301798820495605,
"learning_rate": 1.0099152868694648e-05,
"loss": 0.3549,
"step": 8050
},
{
"epoch": 0.6995581737849779,
"grad_norm": 8.55245304107666,
"learning_rate": 1.0026954177897575e-05,
"loss": 0.3805,
"step": 8075
},
{
"epoch": 0.7017239885644979,
"grad_norm": 1.9036015272140503,
"learning_rate": 9.9547554871005e-06,
"loss": 0.353,
"step": 8100
},
{
"epoch": 0.7038898033440181,
"grad_norm": 1.0196151733398438,
"learning_rate": 9.882556796303428e-06,
"loss": 0.3569,
"step": 8125
},
{
"epoch": 0.7060556181235381,
"grad_norm": 2.688908338546753,
"learning_rate": 9.810358105506355e-06,
"loss": 0.2588,
"step": 8150
},
{
"epoch": 0.7082214329030582,
"grad_norm": 0.6335782408714294,
"learning_rate": 9.73815941470928e-06,
"loss": 0.2252,
"step": 8175
},
{
"epoch": 0.7103872476825782,
"grad_norm": 4.539221286773682,
"learning_rate": 9.665960723912206e-06,
"loss": 0.2747,
"step": 8200
},
{
"epoch": 0.7125530624620983,
"grad_norm": 8.757186889648438,
"learning_rate": 9.593762033115133e-06,
"loss": 0.3239,
"step": 8225
},
{
"epoch": 0.7147188772416183,
"grad_norm": 1.7275235652923584,
"learning_rate": 9.52156334231806e-06,
"loss": 0.2954,
"step": 8250
},
{
"epoch": 0.7168846920211384,
"grad_norm": 6.338670253753662,
"learning_rate": 9.449364651520987e-06,
"loss": 0.3749,
"step": 8275
},
{
"epoch": 0.7190505068006584,
"grad_norm": 1.565496563911438,
"learning_rate": 9.377165960723912e-06,
"loss": 0.2757,
"step": 8300
},
{
"epoch": 0.7212163215801785,
"grad_norm": 0.0664602667093277,
"learning_rate": 9.30496726992684e-06,
"loss": 0.3012,
"step": 8325
},
{
"epoch": 0.7233821363596985,
"grad_norm": 10.375814437866211,
"learning_rate": 9.232768579129765e-06,
"loss": 0.2985,
"step": 8350
},
{
"epoch": 0.7255479511392185,
"grad_norm": 16.607072830200195,
"learning_rate": 9.160569888332692e-06,
"loss": 0.2656,
"step": 8375
},
{
"epoch": 0.7277137659187386,
"grad_norm": 0.6724597811698914,
"learning_rate": 9.088371197535618e-06,
"loss": 0.2007,
"step": 8400
},
{
"epoch": 0.7298795806982586,
"grad_norm": 2.3397414684295654,
"learning_rate": 9.016172506738545e-06,
"loss": 0.2402,
"step": 8425
},
{
"epoch": 0.7320453954777787,
"grad_norm": 11.172548294067383,
"learning_rate": 8.943973815941472e-06,
"loss": 0.3434,
"step": 8450
},
{
"epoch": 0.7342112102572987,
"grad_norm": 12.031539916992188,
"learning_rate": 8.871775125144397e-06,
"loss": 0.2628,
"step": 8475
},
{
"epoch": 0.7363770250368189,
"grad_norm": 0.37211769819259644,
"learning_rate": 8.799576434347324e-06,
"loss": 0.265,
"step": 8500
},
{
"epoch": 0.738542839816339,
"grad_norm": 6.181528568267822,
"learning_rate": 8.72737774355025e-06,
"loss": 0.3748,
"step": 8525
},
{
"epoch": 0.740708654595859,
"grad_norm": 2.7227742671966553,
"learning_rate": 8.655179052753177e-06,
"loss": 0.249,
"step": 8550
},
{
"epoch": 0.742874469375379,
"grad_norm": 7.977476596832275,
"learning_rate": 8.582980361956104e-06,
"loss": 0.3375,
"step": 8575
},
{
"epoch": 0.7450402841548991,
"grad_norm": 11.404130935668945,
"learning_rate": 8.510781671159029e-06,
"loss": 0.3336,
"step": 8600
},
{
"epoch": 0.7472060989344191,
"grad_norm": 0.4421218931674957,
"learning_rate": 8.438582980361957e-06,
"loss": 0.3702,
"step": 8625
},
{
"epoch": 0.7493719137139392,
"grad_norm": 4.386607646942139,
"learning_rate": 8.366384289564882e-06,
"loss": 0.3494,
"step": 8650
},
{
"epoch": 0.7515377284934592,
"grad_norm": 5.428525924682617,
"learning_rate": 8.294185598767809e-06,
"loss": 0.2996,
"step": 8675
},
{
"epoch": 0.7537035432729793,
"grad_norm": 0.3034394085407257,
"learning_rate": 8.221986907970736e-06,
"loss": 0.2433,
"step": 8700
},
{
"epoch": 0.7558693580524993,
"grad_norm": 3.75878643989563,
"learning_rate": 8.149788217173662e-06,
"loss": 0.3027,
"step": 8725
},
{
"epoch": 0.7580351728320194,
"grad_norm": 9.965909004211426,
"learning_rate": 8.077589526376589e-06,
"loss": 0.382,
"step": 8750
},
{
"epoch": 0.7602009876115394,
"grad_norm": 7.314566135406494,
"learning_rate": 8.005390835579514e-06,
"loss": 0.2874,
"step": 8775
},
{
"epoch": 0.7623668023910595,
"grad_norm": 8.704547882080078,
"learning_rate": 7.93319214478244e-06,
"loss": 0.2737,
"step": 8800
},
{
"epoch": 0.7645326171705795,
"grad_norm": 10.275945663452148,
"learning_rate": 7.86099345398537e-06,
"loss": 0.3212,
"step": 8825
},
{
"epoch": 0.7666984319500997,
"grad_norm": 4.1912641525268555,
"learning_rate": 7.788794763188294e-06,
"loss": 0.3475,
"step": 8850
},
{
"epoch": 0.7688642467296197,
"grad_norm": 10.281148910522461,
"learning_rate": 7.716596072391221e-06,
"loss": 0.221,
"step": 8875
},
{
"epoch": 0.7710300615091398,
"grad_norm": 9.613810539245605,
"learning_rate": 7.644397381594146e-06,
"loss": 0.2587,
"step": 8900
},
{
"epoch": 0.7731958762886598,
"grad_norm": 1.2200976610183716,
"learning_rate": 7.572198690797074e-06,
"loss": 0.2852,
"step": 8925
},
{
"epoch": 0.7753616910681799,
"grad_norm": 2.445672035217285,
"learning_rate": 7.5e-06,
"loss": 0.3837,
"step": 8950
},
{
"epoch": 0.7775275058476999,
"grad_norm": 13.744851112365723,
"learning_rate": 7.427801309202927e-06,
"loss": 0.2333,
"step": 8975
},
{
"epoch": 0.77969332062722,
"grad_norm": 4.426064968109131,
"learning_rate": 7.355602618405853e-06,
"loss": 0.3036,
"step": 9000
},
{
"epoch": 0.78185913540674,
"grad_norm": 8.329988479614258,
"learning_rate": 7.28340392760878e-06,
"loss": 0.3287,
"step": 9025
},
{
"epoch": 0.7840249501862601,
"grad_norm": 4.122848987579346,
"learning_rate": 7.211205236811706e-06,
"loss": 0.3248,
"step": 9050
},
{
"epoch": 0.7861907649657801,
"grad_norm": 6.127285480499268,
"learning_rate": 7.139006546014633e-06,
"loss": 0.2395,
"step": 9075
},
{
"epoch": 0.7883565797453002,
"grad_norm": 1.6887600421905518,
"learning_rate": 7.066807855217559e-06,
"loss": 0.2647,
"step": 9100
},
{
"epoch": 0.7905223945248202,
"grad_norm": 1.4300670623779297,
"learning_rate": 6.9946091644204855e-06,
"loss": 0.3345,
"step": 9125
},
{
"epoch": 0.7926882093043403,
"grad_norm": 9.334101676940918,
"learning_rate": 6.922410473623411e-06,
"loss": 0.3421,
"step": 9150
},
{
"epoch": 0.7948540240838603,
"grad_norm": 6.996714115142822,
"learning_rate": 6.850211782826339e-06,
"loss": 0.3496,
"step": 9175
},
{
"epoch": 0.7970198388633805,
"grad_norm": 8.47280216217041,
"learning_rate": 6.778013092029265e-06,
"loss": 0.253,
"step": 9200
},
{
"epoch": 0.7991856536429005,
"grad_norm": 3.239483118057251,
"learning_rate": 6.708702348864075e-06,
"loss": 0.3462,
"step": 9225
},
{
"epoch": 0.8013514684224206,
"grad_norm": 1.6153030395507812,
"learning_rate": 6.6365036580670006e-06,
"loss": 0.2688,
"step": 9250
},
{
"epoch": 0.8035172832019406,
"grad_norm": 5.316878795623779,
"learning_rate": 6.564304967269927e-06,
"loss": 0.3301,
"step": 9275
},
{
"epoch": 0.8056830979814606,
"grad_norm": 8.06822395324707,
"learning_rate": 6.492106276472853e-06,
"loss": 0.3382,
"step": 9300
},
{
"epoch": 0.8078489127609807,
"grad_norm": 2.8038644790649414,
"learning_rate": 6.41990758567578e-06,
"loss": 0.2219,
"step": 9325
},
{
"epoch": 0.8100147275405007,
"grad_norm": 5.063823223114014,
"learning_rate": 6.3477088948787066e-06,
"loss": 0.278,
"step": 9350
},
{
"epoch": 0.8121805423200208,
"grad_norm": 6.974782466888428,
"learning_rate": 6.275510204081633e-06,
"loss": 0.2338,
"step": 9375
},
{
"epoch": 0.8143463570995408,
"grad_norm": 2.8085834980010986,
"learning_rate": 6.203311513284559e-06,
"loss": 0.2732,
"step": 9400
},
{
"epoch": 0.8165121718790609,
"grad_norm": 12.976601600646973,
"learning_rate": 6.131112822487486e-06,
"loss": 0.2973,
"step": 9425
},
{
"epoch": 0.8186779866585809,
"grad_norm": 2.7448630332946777,
"learning_rate": 6.058914131690412e-06,
"loss": 0.2783,
"step": 9450
},
{
"epoch": 0.820843801438101,
"grad_norm": 2.347792387008667,
"learning_rate": 5.986715440893339e-06,
"loss": 0.2418,
"step": 9475
},
{
"epoch": 0.823009616217621,
"grad_norm": 2.851559638977051,
"learning_rate": 5.914516750096265e-06,
"loss": 0.2603,
"step": 9500
},
{
"epoch": 0.8251754309971411,
"grad_norm": 6.941406726837158,
"learning_rate": 5.842318059299192e-06,
"loss": 0.1888,
"step": 9525
},
{
"epoch": 0.8273412457766611,
"grad_norm": 4.45375394821167,
"learning_rate": 5.770119368502118e-06,
"loss": 0.2581,
"step": 9550
},
{
"epoch": 0.8295070605561813,
"grad_norm": 5.2709641456604,
"learning_rate": 5.6979206777050444e-06,
"loss": 0.2742,
"step": 9575
},
{
"epoch": 0.8316728753357013,
"grad_norm": 2.6814463138580322,
"learning_rate": 5.62572198690797e-06,
"loss": 0.2156,
"step": 9600
},
{
"epoch": 0.8338386901152214,
"grad_norm": 0.12416364997625351,
"learning_rate": 5.553523296110898e-06,
"loss": 0.3317,
"step": 9625
},
{
"epoch": 0.8360045048947414,
"grad_norm": 5.639218807220459,
"learning_rate": 5.481324605313824e-06,
"loss": 0.1967,
"step": 9650
},
{
"epoch": 0.8381703196742615,
"grad_norm": 0.8800064921379089,
"learning_rate": 5.4091259145167504e-06,
"loss": 0.1701,
"step": 9675
},
{
"epoch": 0.8403361344537815,
"grad_norm": 2.7125442028045654,
"learning_rate": 5.336927223719676e-06,
"loss": 0.3064,
"step": 9700
},
{
"epoch": 0.8425019492333016,
"grad_norm": 3.1365272998809814,
"learning_rate": 5.264728532922603e-06,
"loss": 0.3511,
"step": 9725
},
{
"epoch": 0.8446677640128216,
"grad_norm": 10.584244728088379,
"learning_rate": 5.19252984212553e-06,
"loss": 0.2461,
"step": 9750
},
{
"epoch": 0.8468335787923417,
"grad_norm": 0.7926290035247803,
"learning_rate": 5.1203311513284565e-06,
"loss": 0.3047,
"step": 9775
},
{
"epoch": 0.8489993935718617,
"grad_norm": 10.744616508483887,
"learning_rate": 5.048132460531382e-06,
"loss": 0.3234,
"step": 9800
},
{
"epoch": 0.8511652083513818,
"grad_norm": 3.9436535835266113,
"learning_rate": 4.975933769734309e-06,
"loss": 0.2843,
"step": 9825
},
{
"epoch": 0.8533310231309018,
"grad_norm": 0.2785266637802124,
"learning_rate": 4.903735078937235e-06,
"loss": 0.3365,
"step": 9850
},
{
"epoch": 0.8554968379104219,
"grad_norm": 7.446309566497803,
"learning_rate": 4.831536388140162e-06,
"loss": 0.3802,
"step": 9875
},
{
"epoch": 0.8576626526899419,
"grad_norm": 9.687524795532227,
"learning_rate": 4.759337697343088e-06,
"loss": 0.2587,
"step": 9900
},
{
"epoch": 0.8598284674694621,
"grad_norm": 0.4837453067302704,
"learning_rate": 4.687139006546015e-06,
"loss": 0.2367,
"step": 9925
},
{
"epoch": 0.8619942822489821,
"grad_norm": 0.7170611023902893,
"learning_rate": 4.614940315748941e-06,
"loss": 0.2971,
"step": 9950
},
{
"epoch": 0.8641600970285022,
"grad_norm": 16.417407989501953,
"learning_rate": 4.542741624951868e-06,
"loss": 0.2884,
"step": 9975
},
{
"epoch": 0.8663259118080222,
"grad_norm": 7.771174430847168,
"learning_rate": 4.4705429341547935e-06,
"loss": 0.2296,
"step": 10000
},
{
"epoch": 0.8684917265875423,
"grad_norm": 1.540907859802246,
"learning_rate": 4.398344243357721e-06,
"loss": 0.3145,
"step": 10025
},
{
"epoch": 0.8706575413670623,
"grad_norm": 1.4157791137695312,
"learning_rate": 4.326145552560647e-06,
"loss": 0.178,
"step": 10050
},
{
"epoch": 0.8728233561465824,
"grad_norm": 4.707205295562744,
"learning_rate": 4.253946861763574e-06,
"loss": 0.2681,
"step": 10075
},
{
"epoch": 0.8749891709261024,
"grad_norm": 3.7186520099639893,
"learning_rate": 4.1817481709664995e-06,
"loss": 0.3191,
"step": 10100
},
{
"epoch": 0.8771549857056224,
"grad_norm": 1.6584956645965576,
"learning_rate": 4.109549480169426e-06,
"loss": 0.2544,
"step": 10125
},
{
"epoch": 0.8793208004851425,
"grad_norm": 9.22360610961914,
"learning_rate": 4.037350789372352e-06,
"loss": 0.2965,
"step": 10150
},
{
"epoch": 0.8814866152646625,
"grad_norm": 3.5934746265411377,
"learning_rate": 3.96515209857528e-06,
"loss": 0.317,
"step": 10175
},
{
"epoch": 0.8836524300441826,
"grad_norm": 1.5978528261184692,
"learning_rate": 3.892953407778206e-06,
"loss": 0.2149,
"step": 10200
},
{
"epoch": 0.8858182448237026,
"grad_norm": 4.726417064666748,
"learning_rate": 3.820754716981132e-06,
"loss": 0.4876,
"step": 10225
},
{
"epoch": 0.8879840596032227,
"grad_norm": 7.836237907409668,
"learning_rate": 3.7485560261840585e-06,
"loss": 0.2984,
"step": 10250
},
{
"epoch": 0.8901498743827427,
"grad_norm": 6.5479912757873535,
"learning_rate": 3.676357335386985e-06,
"loss": 0.3024,
"step": 10275
},
{
"epoch": 0.8923156891622629,
"grad_norm": 1.180179476737976,
"learning_rate": 3.6041586445899115e-06,
"loss": 0.2447,
"step": 10300
},
{
"epoch": 0.8944815039417829,
"grad_norm": 5.868828773498535,
"learning_rate": 3.5319599537928378e-06,
"loss": 0.2684,
"step": 10325
},
{
"epoch": 0.896647318721303,
"grad_norm": 6.2655816078186035,
"learning_rate": 3.4597612629957645e-06,
"loss": 0.1714,
"step": 10350
},
{
"epoch": 0.898813133500823,
"grad_norm": 6.3384270668029785,
"learning_rate": 3.387562572198691e-06,
"loss": 0.2776,
"step": 10375
},
{
"epoch": 0.9009789482803431,
"grad_norm": 6.097102165222168,
"learning_rate": 3.315363881401617e-06,
"loss": 0.2745,
"step": 10400
},
{
"epoch": 0.9031447630598631,
"grad_norm": 7.250086784362793,
"learning_rate": 3.243165190604544e-06,
"loss": 0.3299,
"step": 10425
},
{
"epoch": 0.9053105778393832,
"grad_norm": 9.260988235473633,
"learning_rate": 3.17096649980747e-06,
"loss": 0.2629,
"step": 10450
},
{
"epoch": 0.9074763926189032,
"grad_norm": 8.009949684143066,
"learning_rate": 3.0987678090103964e-06,
"loss": 0.3627,
"step": 10475
},
{
"epoch": 0.9096422073984233,
"grad_norm": 1.247878074645996,
"learning_rate": 3.026569118213323e-06,
"loss": 0.2236,
"step": 10500
},
{
"epoch": 0.9118080221779433,
"grad_norm": 6.759634971618652,
"learning_rate": 2.9543704274162494e-06,
"loss": 0.2819,
"step": 10525
},
{
"epoch": 0.9139738369574634,
"grad_norm": 0.09837600588798523,
"learning_rate": 2.882171736619176e-06,
"loss": 0.3129,
"step": 10550
},
{
"epoch": 0.9161396517369834,
"grad_norm": 6.850848197937012,
"learning_rate": 2.8099730458221024e-06,
"loss": 0.3051,
"step": 10575
},
{
"epoch": 0.9183054665165035,
"grad_norm": 8.94210147857666,
"learning_rate": 2.7377743550250287e-06,
"loss": 0.3955,
"step": 10600
},
{
"epoch": 0.9204712812960235,
"grad_norm": 8.595787048339844,
"learning_rate": 2.6655756642279554e-06,
"loss": 0.2493,
"step": 10625
},
{
"epoch": 0.9226370960755437,
"grad_norm": 7.062394618988037,
"learning_rate": 2.5933769734308817e-06,
"loss": 0.2543,
"step": 10650
},
{
"epoch": 0.9248029108550637,
"grad_norm": 3.371393918991089,
"learning_rate": 2.521178282633808e-06,
"loss": 0.2222,
"step": 10675
},
{
"epoch": 0.9269687256345838,
"grad_norm": 1.3468866348266602,
"learning_rate": 2.4489795918367347e-06,
"loss": 0.2823,
"step": 10700
},
{
"epoch": 0.9291345404141038,
"grad_norm": 15.475239753723145,
"learning_rate": 2.376780901039661e-06,
"loss": 0.3098,
"step": 10725
},
{
"epoch": 0.9313003551936239,
"grad_norm": 6.605096340179443,
"learning_rate": 2.3045822102425877e-06,
"loss": 0.3009,
"step": 10750
},
{
"epoch": 0.9334661699731439,
"grad_norm": 3.2146847248077393,
"learning_rate": 2.2323835194455144e-06,
"loss": 0.2623,
"step": 10775
},
{
"epoch": 0.935631984752664,
"grad_norm": 2.727200508117676,
"learning_rate": 2.1601848286484407e-06,
"loss": 0.1952,
"step": 10800
},
{
"epoch": 0.937797799532184,
"grad_norm": 2.7418553829193115,
"learning_rate": 2.0879861378513674e-06,
"loss": 0.4527,
"step": 10825
},
{
"epoch": 0.939963614311704,
"grad_norm": 8.577201843261719,
"learning_rate": 2.0157874470542937e-06,
"loss": 0.2323,
"step": 10850
},
{
"epoch": 0.9421294290912241,
"grad_norm": 4.514817237854004,
"learning_rate": 1.94358875625722e-06,
"loss": 0.3109,
"step": 10875
},
{
"epoch": 0.9442952438707441,
"grad_norm": 10.761394500732422,
"learning_rate": 1.8713900654601463e-06,
"loss": 0.3335,
"step": 10900
},
{
"epoch": 0.9464610586502642,
"grad_norm": 8.004775047302246,
"learning_rate": 1.7991913746630728e-06,
"loss": 0.2862,
"step": 10925
},
{
"epoch": 0.9486268734297842,
"grad_norm": 7.491416931152344,
"learning_rate": 1.7269926838659993e-06,
"loss": 0.4005,
"step": 10950
},
{
"epoch": 0.9507926882093043,
"grad_norm": 6.168478488922119,
"learning_rate": 1.6547939930689255e-06,
"loss": 0.2815,
"step": 10975
},
{
"epoch": 0.9529585029888243,
"grad_norm": 7.221772193908691,
"learning_rate": 1.582595302271852e-06,
"loss": 0.2157,
"step": 11000
},
{
"epoch": 0.9551243177683445,
"grad_norm": 5.9744086265563965,
"learning_rate": 1.5103966114747788e-06,
"loss": 0.3733,
"step": 11025
},
{
"epoch": 0.9572901325478645,
"grad_norm": 5.776475429534912,
"learning_rate": 1.4381979206777053e-06,
"loss": 0.2843,
"step": 11050
},
{
"epoch": 0.9594559473273846,
"grad_norm": 1.3870640993118286,
"learning_rate": 1.3659992298806316e-06,
"loss": 0.1963,
"step": 11075
},
{
"epoch": 0.9616217621069046,
"grad_norm": 7.3776535987854,
"learning_rate": 1.293800539083558e-06,
"loss": 0.3081,
"step": 11100
},
{
"epoch": 0.9637875768864247,
"grad_norm": 11.289216995239258,
"learning_rate": 1.2216018482864846e-06,
"loss": 0.2317,
"step": 11125
},
{
"epoch": 0.9659533916659447,
"grad_norm": 11.621864318847656,
"learning_rate": 1.1494031574894108e-06,
"loss": 0.3027,
"step": 11150
},
{
"epoch": 0.9681192064454648,
"grad_norm": 11.617834091186523,
"learning_rate": 1.0772044666923373e-06,
"loss": 0.3581,
"step": 11175
},
{
"epoch": 0.9702850212249848,
"grad_norm": 5.500637531280518,
"learning_rate": 1.0050057758952638e-06,
"loss": 0.3,
"step": 11200
},
{
"epoch": 0.9724508360045049,
"grad_norm": 3.552578926086426,
"learning_rate": 9.328070850981902e-07,
"loss": 0.2797,
"step": 11225
},
{
"epoch": 0.9746166507840249,
"grad_norm": 1.074208378791809,
"learning_rate": 8.606083943011167e-07,
"loss": 0.2918,
"step": 11250
},
{
"epoch": 0.976782465563545,
"grad_norm": 11.449936866760254,
"learning_rate": 7.884097035040431e-07,
"loss": 0.2519,
"step": 11275
},
{
"epoch": 0.978948280343065,
"grad_norm": 2.988003730773926,
"learning_rate": 7.162110127069696e-07,
"loss": 0.2183,
"step": 11300
},
{
"epoch": 0.9811140951225851,
"grad_norm": 2.9280929565429688,
"learning_rate": 6.44012321909896e-07,
"loss": 0.2764,
"step": 11325
},
{
"epoch": 0.9832799099021051,
"grad_norm": 3.2279105186462402,
"learning_rate": 5.718136311128224e-07,
"loss": 0.4107,
"step": 11350
},
{
"epoch": 0.9854457246816253,
"grad_norm": 2.54160737991333,
"learning_rate": 4.996149403157489e-07,
"loss": 0.3135,
"step": 11375
},
{
"epoch": 0.9876115394611453,
"grad_norm": 1.3068925142288208,
"learning_rate": 4.2741624951867543e-07,
"loss": 0.2138,
"step": 11400
},
{
"epoch": 0.9897773542406654,
"grad_norm": 8.606940269470215,
"learning_rate": 3.5521755872160183e-07,
"loss": 0.2984,
"step": 11425
},
{
"epoch": 0.9919431690201854,
"grad_norm": 1.2513303756713867,
"learning_rate": 2.830188679245283e-07,
"loss": 0.2407,
"step": 11450
},
{
"epoch": 0.9941089837997055,
"grad_norm": 11.340466499328613,
"learning_rate": 2.1082017712745478e-07,
"loss": 0.2449,
"step": 11475
},
{
"epoch": 0.9962747985792255,
"grad_norm": 6.166193008422852,
"learning_rate": 1.386214863303812e-07,
"loss": 0.2629,
"step": 11500
},
{
"epoch": 0.9984406133587456,
"grad_norm": 4.004662990570068,
"learning_rate": 6.642279553330766e-08,
"loss": 0.3488,
"step": 11525
},
{
"epoch": 1.0,
"eval_cosine_accuracy": 0.9693415637860082,
"eval_loss": 0.4268312156200409,
"eval_runtime": 50.4023,
"eval_samples_per_second": 96.424,
"eval_steps_per_second": 6.031,
"step": 11543
}
],
"logging_steps": 25,
"max_steps": 11543,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}