{ "best_metric": 0.4268312156200409, "best_model_checkpoint": "gte-modernbert-philosophy-v1-1-autotr/checkpoint-11543", "epoch": 1.0, "eval_steps": 500, "global_step": 11543, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021658147795200556, "grad_norm": 9.93041706085205, "learning_rate": 6.493506493506493e-07, "loss": 1.2002, "step": 25 }, { "epoch": 0.004331629559040111, "grad_norm": 10.093696594238281, "learning_rate": 1.2987012987012986e-06, "loss": 1.1623, "step": 50 }, { "epoch": 0.006497444338560167, "grad_norm": 9.860174179077148, "learning_rate": 1.948051948051948e-06, "loss": 1.2012, "step": 75 }, { "epoch": 0.008663259118080222, "grad_norm": 8.943851470947266, "learning_rate": 2.597402597402597e-06, "loss": 1.1853, "step": 100 }, { "epoch": 0.010829073897600277, "grad_norm": 5.130438327789307, "learning_rate": 3.246753246753247e-06, "loss": 0.9767, "step": 125 }, { "epoch": 0.012994888677120333, "grad_norm": 8.450475692749023, "learning_rate": 3.896103896103896e-06, "loss": 0.865, "step": 150 }, { "epoch": 0.015160703456640388, "grad_norm": 3.7100517749786377, "learning_rate": 4.5454545454545455e-06, "loss": 0.7733, "step": 175 }, { "epoch": 0.017326518236160444, "grad_norm": 15.069323539733887, "learning_rate": 5.194805194805194e-06, "loss": 0.9545, "step": 200 }, { "epoch": 0.0194923330156805, "grad_norm": 9.745051383972168, "learning_rate": 5.8181818181818185e-06, "loss": 0.8309, "step": 225 }, { "epoch": 0.021658147795200554, "grad_norm": 7.115631580352783, "learning_rate": 6.467532467532467e-06, "loss": 0.7514, "step": 250 }, { "epoch": 0.02382396257472061, "grad_norm": 15.047070503234863, "learning_rate": 7.116883116883117e-06, "loss": 0.5555, "step": 275 }, { "epoch": 0.025989777354240667, "grad_norm": 14.228421211242676, "learning_rate": 7.766233766233767e-06, "loss": 0.563, "step": 300 }, { "epoch": 0.02815559213376072, "grad_norm": 8.77304458618164, "learning_rate": 8.415584415584416e-06, "loss": 0.618, "step": 325 }, { "epoch": 0.030321406913280776, "grad_norm": 4.78096866607666, "learning_rate": 9.064935064935066e-06, "loss": 0.6538, "step": 350 }, { "epoch": 0.032487221692800834, "grad_norm": 9.978753089904785, "learning_rate": 9.714285714285715e-06, "loss": 0.5802, "step": 375 }, { "epoch": 0.03465303647232089, "grad_norm": 15.071464538574219, "learning_rate": 1.0363636363636364e-05, "loss": 0.6568, "step": 400 }, { "epoch": 0.036818851251840944, "grad_norm": 12.352958679199219, "learning_rate": 1.1012987012987013e-05, "loss": 0.4934, "step": 425 }, { "epoch": 0.038984666031361, "grad_norm": 8.754806518554688, "learning_rate": 1.1662337662337662e-05, "loss": 0.597, "step": 450 }, { "epoch": 0.04115048081088105, "grad_norm": 2.432300090789795, "learning_rate": 1.2311688311688312e-05, "loss": 0.3812, "step": 475 }, { "epoch": 0.04331629559040111, "grad_norm": 7.439950466156006, "learning_rate": 1.2961038961038961e-05, "loss": 0.482, "step": 500 }, { "epoch": 0.04548211036992116, "grad_norm": 14.198251724243164, "learning_rate": 1.361038961038961e-05, "loss": 0.5347, "step": 525 }, { "epoch": 0.04764792514944122, "grad_norm": 5.91489839553833, "learning_rate": 1.425974025974026e-05, "loss": 0.5012, "step": 550 }, { "epoch": 0.04981373992896128, "grad_norm": 14.394525527954102, "learning_rate": 1.490909090909091e-05, "loss": 0.5765, "step": 575 }, { "epoch": 0.05197955470848133, "grad_norm": 16.823543548583984, "learning_rate": 1.555844155844156e-05, "loss": 0.4286, "step": 600 }, { "epoch": 0.05414536948800139, "grad_norm": 4.72399377822876, "learning_rate": 1.6207792207792207e-05, "loss": 0.5167, "step": 625 }, { "epoch": 0.05631118426752144, "grad_norm": 18.0063419342041, "learning_rate": 1.6857142857142858e-05, "loss": 0.4791, "step": 650 }, { "epoch": 0.0584769990470415, "grad_norm": 11.925456047058105, "learning_rate": 1.750649350649351e-05, "loss": 0.5022, "step": 675 }, { "epoch": 0.06064281382656155, "grad_norm": 2.7437996864318848, "learning_rate": 1.8155844155844156e-05, "loss": 0.438, "step": 700 }, { "epoch": 0.0628086286060816, "grad_norm": 1.8270901441574097, "learning_rate": 1.8805194805194806e-05, "loss": 0.3995, "step": 725 }, { "epoch": 0.06497444338560167, "grad_norm": 4.187374591827393, "learning_rate": 1.9454545454545453e-05, "loss": 0.2924, "step": 750 }, { "epoch": 0.06714025816512172, "grad_norm": 12.709814071655273, "learning_rate": 2.0103896103896104e-05, "loss": 0.4391, "step": 775 }, { "epoch": 0.06930607294464178, "grad_norm": 8.789942741394043, "learning_rate": 2.0753246753246755e-05, "loss": 0.4328, "step": 800 }, { "epoch": 0.07147188772416183, "grad_norm": 10.182008743286133, "learning_rate": 2.137662337662338e-05, "loss": 0.5658, "step": 825 }, { "epoch": 0.07363770250368189, "grad_norm": 3.5178301334381104, "learning_rate": 2.2025974025974026e-05, "loss": 0.4541, "step": 850 }, { "epoch": 0.07580351728320193, "grad_norm": 8.124090194702148, "learning_rate": 2.2675324675324676e-05, "loss": 0.5381, "step": 875 }, { "epoch": 0.077969332062722, "grad_norm": 11.69704532623291, "learning_rate": 2.3324675324675324e-05, "loss": 0.4523, "step": 900 }, { "epoch": 0.08013514684224206, "grad_norm": 19.822145462036133, "learning_rate": 2.3974025974025974e-05, "loss": 0.3522, "step": 925 }, { "epoch": 0.0823009616217621, "grad_norm": 8.31993579864502, "learning_rate": 2.4623376623376625e-05, "loss": 0.4475, "step": 950 }, { "epoch": 0.08446677640128217, "grad_norm": 5.60876989364624, "learning_rate": 2.5246753246753246e-05, "loss": 0.4448, "step": 975 }, { "epoch": 0.08663259118080222, "grad_norm": 9.872743606567383, "learning_rate": 2.5896103896103896e-05, "loss": 0.407, "step": 1000 }, { "epoch": 0.08879840596032228, "grad_norm": 7.193666458129883, "learning_rate": 2.6545454545454547e-05, "loss": 0.4616, "step": 1025 }, { "epoch": 0.09096422073984232, "grad_norm": 17.595991134643555, "learning_rate": 2.7194805194805194e-05, "loss": 0.4213, "step": 1050 }, { "epoch": 0.09313003551936239, "grad_norm": 3.281184196472168, "learning_rate": 2.7844155844155844e-05, "loss": 0.465, "step": 1075 }, { "epoch": 0.09529585029888243, "grad_norm": 7.671459197998047, "learning_rate": 2.849350649350649e-05, "loss": 0.2964, "step": 1100 }, { "epoch": 0.0974616650784025, "grad_norm": 7.963995933532715, "learning_rate": 2.9142857142857142e-05, "loss": 0.4414, "step": 1125 }, { "epoch": 0.09962747985792256, "grad_norm": 1.8723474740982056, "learning_rate": 2.9792207792207793e-05, "loss": 0.3508, "step": 1150 }, { "epoch": 0.1017932946374426, "grad_norm": 5.1907877922058105, "learning_rate": 2.995090489025799e-05, "loss": 0.3362, "step": 1175 }, { "epoch": 0.10395910941696267, "grad_norm": 5.219175815582275, "learning_rate": 2.9878706199460916e-05, "loss": 0.4953, "step": 1200 }, { "epoch": 0.10612492419648271, "grad_norm": 15.204286575317383, "learning_rate": 2.9806507508663843e-05, "loss": 0.4041, "step": 1225 }, { "epoch": 0.10829073897600278, "grad_norm": 5.872297286987305, "learning_rate": 2.973430881786677e-05, "loss": 0.3773, "step": 1250 }, { "epoch": 0.11045655375552282, "grad_norm": 7.201790809631348, "learning_rate": 2.9662110127069697e-05, "loss": 0.3574, "step": 1275 }, { "epoch": 0.11262236853504289, "grad_norm": 2.872793674468994, "learning_rate": 2.9589911436272623e-05, "loss": 0.642, "step": 1300 }, { "epoch": 0.11478818331456293, "grad_norm": 10.854488372802734, "learning_rate": 2.951771274547555e-05, "loss": 0.3783, "step": 1325 }, { "epoch": 0.116953998094083, "grad_norm": 2.162464141845703, "learning_rate": 2.9445514054678477e-05, "loss": 0.4905, "step": 1350 }, { "epoch": 0.11911981287360304, "grad_norm": 14.541825294494629, "learning_rate": 2.9373315363881403e-05, "loss": 0.3937, "step": 1375 }, { "epoch": 0.1212856276531231, "grad_norm": 1.6897481679916382, "learning_rate": 2.9301116673084327e-05, "loss": 0.4245, "step": 1400 }, { "epoch": 0.12345144243264317, "grad_norm": 9.359882354736328, "learning_rate": 2.9228917982287253e-05, "loss": 0.4139, "step": 1425 }, { "epoch": 0.1256172572121632, "grad_norm": 39.94605255126953, "learning_rate": 2.915671929149018e-05, "loss": 0.4305, "step": 1450 }, { "epoch": 0.12778307199168326, "grad_norm": 10.268132209777832, "learning_rate": 2.908452060069311e-05, "loss": 0.675, "step": 1475 }, { "epoch": 0.12994888677120334, "grad_norm": 1.7209604978561401, "learning_rate": 2.9012321909896037e-05, "loss": 0.55, "step": 1500 }, { "epoch": 0.13211470155072338, "grad_norm": 8.541482925415039, "learning_rate": 2.894012321909896e-05, "loss": 0.4033, "step": 1525 }, { "epoch": 0.13428051633024343, "grad_norm": 10.4110107421875, "learning_rate": 2.8867924528301887e-05, "loss": 0.4167, "step": 1550 }, { "epoch": 0.13644633110976348, "grad_norm": 10.823756217956543, "learning_rate": 2.8795725837504814e-05, "loss": 0.3814, "step": 1575 }, { "epoch": 0.13861214588928356, "grad_norm": 0.6896539926528931, "learning_rate": 2.872352714670774e-05, "loss": 0.5183, "step": 1600 }, { "epoch": 0.1407779606688036, "grad_norm": 4.357579231262207, "learning_rate": 2.8651328455910667e-05, "loss": 0.3343, "step": 1625 }, { "epoch": 0.14294377544832365, "grad_norm": 12.074344635009766, "learning_rate": 2.857912976511359e-05, "loss": 0.4212, "step": 1650 }, { "epoch": 0.14510959022784373, "grad_norm": 11.660531997680664, "learning_rate": 2.850693107431652e-05, "loss": 0.4737, "step": 1675 }, { "epoch": 0.14727540500736377, "grad_norm": 15.467144966125488, "learning_rate": 2.8434732383519447e-05, "loss": 0.4563, "step": 1700 }, { "epoch": 0.14944121978688382, "grad_norm": 9.277994155883789, "learning_rate": 2.8362533692722374e-05, "loss": 0.4251, "step": 1725 }, { "epoch": 0.15160703456640387, "grad_norm": 3.6043941974639893, "learning_rate": 2.82903350019253e-05, "loss": 0.3497, "step": 1750 }, { "epoch": 0.15377284934592395, "grad_norm": 3.933353900909424, "learning_rate": 2.8218136311128224e-05, "loss": 0.3753, "step": 1775 }, { "epoch": 0.155938664125444, "grad_norm": 3.8728222846984863, "learning_rate": 2.814593762033115e-05, "loss": 0.4031, "step": 1800 }, { "epoch": 0.15810447890496404, "grad_norm": 8.067976951599121, "learning_rate": 2.8073738929534077e-05, "loss": 0.4037, "step": 1825 }, { "epoch": 0.16027029368448412, "grad_norm": 9.141134262084961, "learning_rate": 2.8001540238737004e-05, "loss": 0.4114, "step": 1850 }, { "epoch": 0.16243610846400416, "grad_norm": 1.8272747993469238, "learning_rate": 2.7929341547939934e-05, "loss": 0.3848, "step": 1875 }, { "epoch": 0.1646019232435242, "grad_norm": 0.4890976846218109, "learning_rate": 2.7857142857142858e-05, "loss": 0.5088, "step": 1900 }, { "epoch": 0.16676773802304426, "grad_norm": 9.043623924255371, "learning_rate": 2.7784944166345784e-05, "loss": 0.4032, "step": 1925 }, { "epoch": 0.16893355280256434, "grad_norm": 9.092608451843262, "learning_rate": 2.771274547554871e-05, "loss": 0.3354, "step": 1950 }, { "epoch": 0.17109936758208438, "grad_norm": 6.121222972869873, "learning_rate": 2.7640546784751638e-05, "loss": 0.4163, "step": 1975 }, { "epoch": 0.17326518236160443, "grad_norm": 1.539663314819336, "learning_rate": 2.7568348093954564e-05, "loss": 0.3715, "step": 2000 }, { "epoch": 0.17543099714112448, "grad_norm": 16.089406967163086, "learning_rate": 2.7496149403157488e-05, "loss": 0.3424, "step": 2025 }, { "epoch": 0.17759681192064455, "grad_norm": 12.510934829711914, "learning_rate": 2.7423950712360414e-05, "loss": 0.3311, "step": 2050 }, { "epoch": 0.1797626267001646, "grad_norm": 2.823338508605957, "learning_rate": 2.7351752021563345e-05, "loss": 0.4362, "step": 2075 }, { "epoch": 0.18192844147968465, "grad_norm": 6.191600322723389, "learning_rate": 2.727955333076627e-05, "loss": 0.4441, "step": 2100 }, { "epoch": 0.18409425625920472, "grad_norm": 4.86907434463501, "learning_rate": 2.7207354639969198e-05, "loss": 0.3122, "step": 2125 }, { "epoch": 0.18626007103872477, "grad_norm": 7.323814868927002, "learning_rate": 2.713515594917212e-05, "loss": 0.3717, "step": 2150 }, { "epoch": 0.18842588581824482, "grad_norm": 10.09737491607666, "learning_rate": 2.7062957258375048e-05, "loss": 0.3461, "step": 2175 }, { "epoch": 0.19059170059776487, "grad_norm": 8.536800384521484, "learning_rate": 2.6990758567577975e-05, "loss": 0.4816, "step": 2200 }, { "epoch": 0.19275751537728494, "grad_norm": 5.237682819366455, "learning_rate": 2.69185598767809e-05, "loss": 0.4784, "step": 2225 }, { "epoch": 0.194923330156805, "grad_norm": 10.763497352600098, "learning_rate": 2.6846361185983828e-05, "loss": 0.4334, "step": 2250 }, { "epoch": 0.19708914493632504, "grad_norm": 0.7019050121307373, "learning_rate": 2.6774162495186755e-05, "loss": 0.3437, "step": 2275 }, { "epoch": 0.19925495971584511, "grad_norm": 8.020634651184082, "learning_rate": 2.670196380438968e-05, "loss": 0.4333, "step": 2300 }, { "epoch": 0.20142077449536516, "grad_norm": 10.549779891967773, "learning_rate": 2.662976511359261e-05, "loss": 0.3609, "step": 2325 }, { "epoch": 0.2035865892748852, "grad_norm": 5.6236677169799805, "learning_rate": 2.6557566422795535e-05, "loss": 0.3437, "step": 2350 }, { "epoch": 0.20575240405440526, "grad_norm": 1.4388600587844849, "learning_rate": 2.648536773199846e-05, "loss": 0.4911, "step": 2375 }, { "epoch": 0.20791821883392533, "grad_norm": 4.445183277130127, "learning_rate": 2.6413169041201385e-05, "loss": 0.3872, "step": 2400 }, { "epoch": 0.21008403361344538, "grad_norm": 9.076152801513672, "learning_rate": 2.6340970350404312e-05, "loss": 0.276, "step": 2425 }, { "epoch": 0.21224984839296543, "grad_norm": 5.573355197906494, "learning_rate": 2.6268771659607242e-05, "loss": 0.3318, "step": 2450 }, { "epoch": 0.21441566317248548, "grad_norm": 5.015573024749756, "learning_rate": 2.619657296881017e-05, "loss": 0.4833, "step": 2475 }, { "epoch": 0.21658147795200555, "grad_norm": 3.9038755893707275, "learning_rate": 2.6124374278013092e-05, "loss": 0.4656, "step": 2500 }, { "epoch": 0.2187472927315256, "grad_norm": 2.66627836227417, "learning_rate": 2.605217558721602e-05, "loss": 0.4232, "step": 2525 }, { "epoch": 0.22091310751104565, "grad_norm": 8.859906196594238, "learning_rate": 2.5979976896418945e-05, "loss": 0.434, "step": 2550 }, { "epoch": 0.22307892229056572, "grad_norm": 3.2811522483825684, "learning_rate": 2.5907778205621872e-05, "loss": 0.2479, "step": 2575 }, { "epoch": 0.22524473707008577, "grad_norm": 8.53447437286377, "learning_rate": 2.58355795148248e-05, "loss": 0.4656, "step": 2600 }, { "epoch": 0.22741055184960582, "grad_norm": 6.359921455383301, "learning_rate": 2.5763380824027722e-05, "loss": 0.3881, "step": 2625 }, { "epoch": 0.22957636662912587, "grad_norm": 6.196253776550293, "learning_rate": 2.5691182133230652e-05, "loss": 0.3637, "step": 2650 }, { "epoch": 0.23174218140864594, "grad_norm": 7.805304050445557, "learning_rate": 2.561898344243358e-05, "loss": 0.3099, "step": 2675 }, { "epoch": 0.233907996188166, "grad_norm": 4.51755428314209, "learning_rate": 2.5546784751636506e-05, "loss": 0.3933, "step": 2700 }, { "epoch": 0.23607381096768604, "grad_norm": 5.72914981842041, "learning_rate": 2.5474586060839432e-05, "loss": 0.3789, "step": 2725 }, { "epoch": 0.23823962574720609, "grad_norm": 2.4809954166412354, "learning_rate": 2.5402387370042356e-05, "loss": 0.4056, "step": 2750 }, { "epoch": 0.24040544052672616, "grad_norm": 1.940656065940857, "learning_rate": 2.5330188679245282e-05, "loss": 0.4132, "step": 2775 }, { "epoch": 0.2425712553062462, "grad_norm": 3.452242851257324, "learning_rate": 2.525798998844821e-05, "loss": 0.375, "step": 2800 }, { "epoch": 0.24473707008576626, "grad_norm": 9.220993041992188, "learning_rate": 2.5185791297651136e-05, "loss": 0.3026, "step": 2825 }, { "epoch": 0.24690288486528633, "grad_norm": 10.027073860168457, "learning_rate": 2.5113592606854066e-05, "loss": 0.5372, "step": 2850 }, { "epoch": 0.24906869964480638, "grad_norm": 2.228799819946289, "learning_rate": 2.504139391605699e-05, "loss": 0.4233, "step": 2875 }, { "epoch": 0.2512345144243264, "grad_norm": 7.281198978424072, "learning_rate": 2.4969195225259916e-05, "loss": 0.2945, "step": 2900 }, { "epoch": 0.2534003292038465, "grad_norm": 1.4160314798355103, "learning_rate": 2.4896996534462843e-05, "loss": 0.2916, "step": 2925 }, { "epoch": 0.2555661439833665, "grad_norm": 4.095098972320557, "learning_rate": 2.482479784366577e-05, "loss": 0.3536, "step": 2950 }, { "epoch": 0.25773195876288657, "grad_norm": 1.413552165031433, "learning_rate": 2.4752599152868696e-05, "loss": 0.3246, "step": 2975 }, { "epoch": 0.2598977735424067, "grad_norm": 3.3196184635162354, "learning_rate": 2.468040046207162e-05, "loss": 0.4236, "step": 3000 }, { "epoch": 0.2620635883219267, "grad_norm": 11.855537414550781, "learning_rate": 2.4608201771274546e-05, "loss": 0.4088, "step": 3025 }, { "epoch": 0.26422940310144677, "grad_norm": 9.322809219360352, "learning_rate": 2.4536003080477476e-05, "loss": 0.4522, "step": 3050 }, { "epoch": 0.2663952178809668, "grad_norm": 7.581571578979492, "learning_rate": 2.4463804389680403e-05, "loss": 0.3445, "step": 3075 }, { "epoch": 0.26856103266048686, "grad_norm": 2.6131093502044678, "learning_rate": 2.439160569888333e-05, "loss": 0.3575, "step": 3100 }, { "epoch": 0.2707268474400069, "grad_norm": 3.68662166595459, "learning_rate": 2.4319407008086253e-05, "loss": 0.3809, "step": 3125 }, { "epoch": 0.27289266221952696, "grad_norm": 2.3688032627105713, "learning_rate": 2.424720831728918e-05, "loss": 0.3364, "step": 3150 }, { "epoch": 0.27505847699904706, "grad_norm": 1.155315637588501, "learning_rate": 2.4175009626492106e-05, "loss": 0.4103, "step": 3175 }, { "epoch": 0.2772242917785671, "grad_norm": 35.7138671875, "learning_rate": 2.4102810935695033e-05, "loss": 0.3502, "step": 3200 }, { "epoch": 0.27939010655808716, "grad_norm": 6.429433822631836, "learning_rate": 2.403061224489796e-05, "loss": 0.2632, "step": 3225 }, { "epoch": 0.2815559213376072, "grad_norm": 9.816515922546387, "learning_rate": 2.3958413554100887e-05, "loss": 0.406, "step": 3250 }, { "epoch": 0.28372173611712725, "grad_norm": 1.9653140306472778, "learning_rate": 2.3886214863303813e-05, "loss": 0.4363, "step": 3275 }, { "epoch": 0.2858875508966473, "grad_norm": 9.559599876403809, "learning_rate": 2.381401617250674e-05, "loss": 0.2819, "step": 3300 }, { "epoch": 0.28805336567616735, "grad_norm": 10.623549461364746, "learning_rate": 2.3741817481709667e-05, "loss": 0.3421, "step": 3325 }, { "epoch": 0.29021918045568745, "grad_norm": 2.4988913536071777, "learning_rate": 2.366961879091259e-05, "loss": 0.269, "step": 3350 }, { "epoch": 0.2923849952352075, "grad_norm": 4.704137802124023, "learning_rate": 2.3597420100115517e-05, "loss": 0.2902, "step": 3375 }, { "epoch": 0.29455081001472755, "grad_norm": 9.48901653289795, "learning_rate": 2.3525221409318443e-05, "loss": 0.3548, "step": 3400 }, { "epoch": 0.2967166247942476, "grad_norm": 0.5201269388198853, "learning_rate": 2.3453022718521374e-05, "loss": 0.4575, "step": 3425 }, { "epoch": 0.29888243957376764, "grad_norm": 8.074861526489258, "learning_rate": 2.33808240277243e-05, "loss": 0.3942, "step": 3450 }, { "epoch": 0.3010482543532877, "grad_norm": 8.45334243774414, "learning_rate": 2.3308625336927224e-05, "loss": 0.3537, "step": 3475 }, { "epoch": 0.30321406913280774, "grad_norm": 2.7069313526153564, "learning_rate": 2.323642664613015e-05, "loss": 0.3672, "step": 3500 }, { "epoch": 0.30537988391232784, "grad_norm": 13.849508285522461, "learning_rate": 2.3164227955333077e-05, "loss": 0.3502, "step": 3525 }, { "epoch": 0.3075456986918479, "grad_norm": 4.5892462730407715, "learning_rate": 2.3092029264536004e-05, "loss": 0.2545, "step": 3550 }, { "epoch": 0.30971151347136794, "grad_norm": 1.035447120666504, "learning_rate": 2.301983057373893e-05, "loss": 0.2544, "step": 3575 }, { "epoch": 0.311877328250888, "grad_norm": 5.170057773590088, "learning_rate": 2.2947631882941854e-05, "loss": 0.3443, "step": 3600 }, { "epoch": 0.31404314303040803, "grad_norm": 2.908191204071045, "learning_rate": 2.2875433192144784e-05, "loss": 0.3784, "step": 3625 }, { "epoch": 0.3162089578099281, "grad_norm": 9.946891784667969, "learning_rate": 2.280323450134771e-05, "loss": 0.3828, "step": 3650 }, { "epoch": 0.31837477258944813, "grad_norm": 10.337167739868164, "learning_rate": 2.2731035810550637e-05, "loss": 0.4032, "step": 3675 }, { "epoch": 0.32054058736896823, "grad_norm": 10.093758583068848, "learning_rate": 2.2658837119753564e-05, "loss": 0.2556, "step": 3700 }, { "epoch": 0.3227064021484883, "grad_norm": 7.309471130371094, "learning_rate": 2.2586638428956487e-05, "loss": 0.3352, "step": 3725 }, { "epoch": 0.32487221692800833, "grad_norm": 10.050370216369629, "learning_rate": 2.2514439738159414e-05, "loss": 0.4054, "step": 3750 }, { "epoch": 0.3270380317075284, "grad_norm": 3.858546733856201, "learning_rate": 2.244224104736234e-05, "loss": 0.3049, "step": 3775 }, { "epoch": 0.3292038464870484, "grad_norm": 5.640537261962891, "learning_rate": 2.2370042356565267e-05, "loss": 0.2223, "step": 3800 }, { "epoch": 0.33136966126656847, "grad_norm": 5.106541633605957, "learning_rate": 2.2297843665768198e-05, "loss": 0.4878, "step": 3825 }, { "epoch": 0.3335354760460885, "grad_norm": 7.738224029541016, "learning_rate": 2.222564497497112e-05, "loss": 0.3015, "step": 3850 }, { "epoch": 0.33570129082560857, "grad_norm": 12.313666343688965, "learning_rate": 2.2153446284174048e-05, "loss": 0.3816, "step": 3875 }, { "epoch": 0.33786710560512867, "grad_norm": 0.9929437041282654, "learning_rate": 2.2081247593376974e-05, "loss": 0.3334, "step": 3900 }, { "epoch": 0.3400329203846487, "grad_norm": 5.753032207489014, "learning_rate": 2.20090489025799e-05, "loss": 0.3724, "step": 3925 }, { "epoch": 0.34219873516416877, "grad_norm": 8.37396240234375, "learning_rate": 2.1936850211782828e-05, "loss": 0.4217, "step": 3950 }, { "epoch": 0.3443645499436888, "grad_norm": 7.365005016326904, "learning_rate": 2.186465152098575e-05, "loss": 0.4339, "step": 3975 }, { "epoch": 0.34653036472320886, "grad_norm": 1.91083824634552, "learning_rate": 2.1792452830188678e-05, "loss": 0.3642, "step": 4000 }, { "epoch": 0.3486961795027289, "grad_norm": 3.0427494049072266, "learning_rate": 2.1720254139391608e-05, "loss": 0.3819, "step": 4025 }, { "epoch": 0.35086199428224896, "grad_norm": 1.176952838897705, "learning_rate": 2.1648055448594535e-05, "loss": 0.2796, "step": 4050 }, { "epoch": 0.35302780906176906, "grad_norm": 1.0579583644866943, "learning_rate": 2.157585675779746e-05, "loss": 0.4277, "step": 4075 }, { "epoch": 0.3551936238412891, "grad_norm": 11.798035621643066, "learning_rate": 2.1503658067000385e-05, "loss": 0.3407, "step": 4100 }, { "epoch": 0.35735943862080916, "grad_norm": 15.57787036895752, "learning_rate": 2.143145937620331e-05, "loss": 0.2781, "step": 4125 }, { "epoch": 0.3595252534003292, "grad_norm": 8.533368110656738, "learning_rate": 2.1359260685406238e-05, "loss": 0.4274, "step": 4150 }, { "epoch": 0.36169106817984925, "grad_norm": 8.470250129699707, "learning_rate": 2.1287061994609165e-05, "loss": 0.3609, "step": 4175 }, { "epoch": 0.3638568829593693, "grad_norm": 6.417985439300537, "learning_rate": 2.121486330381209e-05, "loss": 0.3476, "step": 4200 }, { "epoch": 0.36602269773888935, "grad_norm": 8.685192108154297, "learning_rate": 2.1142664613015018e-05, "loss": 0.41, "step": 4225 }, { "epoch": 0.36818851251840945, "grad_norm": 7.082727432250977, "learning_rate": 2.1070465922217945e-05, "loss": 0.4003, "step": 4250 }, { "epoch": 0.3703543272979295, "grad_norm": 4.621776103973389, "learning_rate": 2.099826723142087e-05, "loss": 0.306, "step": 4275 }, { "epoch": 0.37252014207744955, "grad_norm": 3.1071817874908447, "learning_rate": 2.09260685406238e-05, "loss": 0.2335, "step": 4300 }, { "epoch": 0.3746859568569696, "grad_norm": 7.23638916015625, "learning_rate": 2.085386984982672e-05, "loss": 0.2733, "step": 4325 }, { "epoch": 0.37685177163648964, "grad_norm": 6.893523693084717, "learning_rate": 2.078167115902965e-05, "loss": 0.3007, "step": 4350 }, { "epoch": 0.3790175864160097, "grad_norm": 5.9917073249816895, "learning_rate": 2.0709472468232575e-05, "loss": 0.3086, "step": 4375 }, { "epoch": 0.38118340119552974, "grad_norm": 6.596795558929443, "learning_rate": 2.0637273777435502e-05, "loss": 0.365, "step": 4400 }, { "epoch": 0.38334921597504984, "grad_norm": 9.045963287353516, "learning_rate": 2.0565075086638432e-05, "loss": 0.3255, "step": 4425 }, { "epoch": 0.3855150307545699, "grad_norm": 6.755446434020996, "learning_rate": 2.0492876395841355e-05, "loss": 0.3765, "step": 4450 }, { "epoch": 0.38768084553408994, "grad_norm": 11.626537322998047, "learning_rate": 2.0420677705044282e-05, "loss": 0.2946, "step": 4475 }, { "epoch": 0.38984666031361, "grad_norm": 4.125662326812744, "learning_rate": 2.034847901424721e-05, "loss": 0.3298, "step": 4500 }, { "epoch": 0.39201247509313003, "grad_norm": 1.2437127828598022, "learning_rate": 2.0276280323450135e-05, "loss": 0.3645, "step": 4525 }, { "epoch": 0.3941782898726501, "grad_norm": 10.272943496704102, "learning_rate": 2.0204081632653062e-05, "loss": 0.2403, "step": 4550 }, { "epoch": 0.3963441046521701, "grad_norm": 2.164606809616089, "learning_rate": 2.0131882941855985e-05, "loss": 0.28, "step": 4575 }, { "epoch": 0.39850991943169023, "grad_norm": 9.157061576843262, "learning_rate": 2.0059684251058916e-05, "loss": 0.3814, "step": 4600 }, { "epoch": 0.4006757342112103, "grad_norm": 4.034579277038574, "learning_rate": 1.9987485560261842e-05, "loss": 0.3419, "step": 4625 }, { "epoch": 0.4028415489907303, "grad_norm": 2.5503344535827637, "learning_rate": 1.991528686946477e-05, "loss": 0.3374, "step": 4650 }, { "epoch": 0.4050073637702504, "grad_norm": 4.660188674926758, "learning_rate": 1.9843088178667696e-05, "loss": 0.3511, "step": 4675 }, { "epoch": 0.4071731785497704, "grad_norm": 7.020951747894287, "learning_rate": 1.977088948787062e-05, "loss": 0.4339, "step": 4700 }, { "epoch": 0.40933899332929047, "grad_norm": 5.4507269859313965, "learning_rate": 1.9698690797073546e-05, "loss": 0.3441, "step": 4725 }, { "epoch": 0.4115048081088105, "grad_norm": 11.266243934631348, "learning_rate": 1.9626492106276472e-05, "loss": 0.346, "step": 4750 }, { "epoch": 0.41367062288833056, "grad_norm": 2.1191511154174805, "learning_rate": 1.95542934154794e-05, "loss": 0.3723, "step": 4775 }, { "epoch": 0.41583643766785067, "grad_norm": 1.9068052768707275, "learning_rate": 1.948209472468233e-05, "loss": 0.2075, "step": 4800 }, { "epoch": 0.4180022524473707, "grad_norm": 0.36394304037094116, "learning_rate": 1.9409896033885253e-05, "loss": 0.2431, "step": 4825 }, { "epoch": 0.42016806722689076, "grad_norm": 6.177628993988037, "learning_rate": 1.933769734308818e-05, "loss": 0.2642, "step": 4850 }, { "epoch": 0.4223338820064108, "grad_norm": 3.6669273376464844, "learning_rate": 1.9265498652291106e-05, "loss": 0.1763, "step": 4875 }, { "epoch": 0.42449969678593086, "grad_norm": 7.836557865142822, "learning_rate": 1.9193299961494033e-05, "loss": 0.3862, "step": 4900 }, { "epoch": 0.4266655115654509, "grad_norm": 10.140486717224121, "learning_rate": 1.912110127069696e-05, "loss": 0.3053, "step": 4925 }, { "epoch": 0.42883132634497095, "grad_norm": 2.8873586654663086, "learning_rate": 1.9048902579899883e-05, "loss": 0.3162, "step": 4950 }, { "epoch": 0.43099714112449106, "grad_norm": 1.758466362953186, "learning_rate": 1.897670388910281e-05, "loss": 0.3178, "step": 4975 }, { "epoch": 0.4331629559040111, "grad_norm": 7.523651599884033, "learning_rate": 1.890450519830574e-05, "loss": 0.2789, "step": 5000 }, { "epoch": 0.43532877068353115, "grad_norm": 5.955496311187744, "learning_rate": 1.8832306507508666e-05, "loss": 0.1777, "step": 5025 }, { "epoch": 0.4374945854630512, "grad_norm": 9.068547248840332, "learning_rate": 1.8760107816711593e-05, "loss": 0.4155, "step": 5050 }, { "epoch": 0.43966040024257125, "grad_norm": 4.900373458862305, "learning_rate": 1.8687909125914516e-05, "loss": 0.2983, "step": 5075 }, { "epoch": 0.4418262150220913, "grad_norm": 3.5501790046691895, "learning_rate": 1.8615710435117443e-05, "loss": 0.3687, "step": 5100 }, { "epoch": 0.44399202980161134, "grad_norm": 1.0216822624206543, "learning_rate": 1.854351174432037e-05, "loss": 0.2428, "step": 5125 }, { "epoch": 0.44615784458113145, "grad_norm": 7.637403964996338, "learning_rate": 1.8471313053523296e-05, "loss": 0.3071, "step": 5150 }, { "epoch": 0.4483236593606515, "grad_norm": 9.478981018066406, "learning_rate": 1.8399114362726223e-05, "loss": 0.2911, "step": 5175 }, { "epoch": 0.45048947414017154, "grad_norm": 3.875411033630371, "learning_rate": 1.832691567192915e-05, "loss": 0.3152, "step": 5200 }, { "epoch": 0.4526552889196916, "grad_norm": 1.1700037717819214, "learning_rate": 1.8254716981132077e-05, "loss": 0.2776, "step": 5225 }, { "epoch": 0.45482110369921164, "grad_norm": 4.037864685058594, "learning_rate": 1.8182518290335003e-05, "loss": 0.2674, "step": 5250 }, { "epoch": 0.4569869184787317, "grad_norm": 2.6295673847198486, "learning_rate": 1.811031959953793e-05, "loss": 0.3035, "step": 5275 }, { "epoch": 0.45915273325825173, "grad_norm": 9.654006004333496, "learning_rate": 1.8038120908740853e-05, "loss": 0.3352, "step": 5300 }, { "epoch": 0.46131854803777184, "grad_norm": 7.339272975921631, "learning_rate": 1.796592221794378e-05, "loss": 0.3879, "step": 5325 }, { "epoch": 0.4634843628172919, "grad_norm": 5.668703079223633, "learning_rate": 1.7893723527146707e-05, "loss": 0.3828, "step": 5350 }, { "epoch": 0.46565017759681193, "grad_norm": 11.843222618103027, "learning_rate": 1.7821524836349633e-05, "loss": 0.2797, "step": 5375 }, { "epoch": 0.467815992376332, "grad_norm": 3.3071844577789307, "learning_rate": 1.7749326145552564e-05, "loss": 0.3492, "step": 5400 }, { "epoch": 0.469981807155852, "grad_norm": 11.303645133972168, "learning_rate": 1.7677127454755487e-05, "loss": 0.5, "step": 5425 }, { "epoch": 0.4721476219353721, "grad_norm": 1.1275362968444824, "learning_rate": 1.7604928763958414e-05, "loss": 0.2317, "step": 5450 }, { "epoch": 0.4743134367148921, "grad_norm": 11.97022533416748, "learning_rate": 1.753273007316134e-05, "loss": 0.2411, "step": 5475 }, { "epoch": 0.47647925149441217, "grad_norm": 2.9647443294525146, "learning_rate": 1.7460531382364267e-05, "loss": 0.277, "step": 5500 }, { "epoch": 0.4786450662739323, "grad_norm": 5.046292781829834, "learning_rate": 1.7388332691567194e-05, "loss": 0.4112, "step": 5525 }, { "epoch": 0.4808108810534523, "grad_norm": 8.11351203918457, "learning_rate": 1.7316134000770117e-05, "loss": 0.5116, "step": 5550 }, { "epoch": 0.48297669583297237, "grad_norm": 1.0861672163009644, "learning_rate": 1.7243935309973047e-05, "loss": 0.3264, "step": 5575 }, { "epoch": 0.4851425106124924, "grad_norm": 2.311553955078125, "learning_rate": 1.7171736619175974e-05, "loss": 0.3688, "step": 5600 }, { "epoch": 0.48730832539201246, "grad_norm": 2.371721029281616, "learning_rate": 1.70995379283789e-05, "loss": 0.3224, "step": 5625 }, { "epoch": 0.4894741401715325, "grad_norm": 7.7612714767456055, "learning_rate": 1.7027339237581827e-05, "loss": 0.3778, "step": 5650 }, { "epoch": 0.49163995495105256, "grad_norm": 7.416019916534424, "learning_rate": 1.695514054678475e-05, "loss": 0.3671, "step": 5675 }, { "epoch": 0.49380576973057266, "grad_norm": 7.0320940017700195, "learning_rate": 1.6882941855987677e-05, "loss": 0.3331, "step": 5700 }, { "epoch": 0.4959715845100927, "grad_norm": 0.8671308159828186, "learning_rate": 1.6810743165190604e-05, "loss": 0.3426, "step": 5725 }, { "epoch": 0.49813739928961276, "grad_norm": 6.607793807983398, "learning_rate": 1.673854447439353e-05, "loss": 0.2863, "step": 5750 }, { "epoch": 0.5003032140691328, "grad_norm": 10.399803161621094, "learning_rate": 1.666634578359646e-05, "loss": 0.5822, "step": 5775 }, { "epoch": 0.5024690288486529, "grad_norm": 2.4261348247528076, "learning_rate": 1.6594147092799384e-05, "loss": 0.2687, "step": 5800 }, { "epoch": 0.5046348436281729, "grad_norm": 0.30012401938438416, "learning_rate": 1.652194840200231e-05, "loss": 0.3365, "step": 5825 }, { "epoch": 0.506800658407693, "grad_norm": 8.255668640136719, "learning_rate": 1.6449749711205238e-05, "loss": 0.4609, "step": 5850 }, { "epoch": 0.508966473187213, "grad_norm": 6.495670795440674, "learning_rate": 1.6377551020408164e-05, "loss": 0.3127, "step": 5875 }, { "epoch": 0.511132287966733, "grad_norm": 4.311783790588379, "learning_rate": 1.630535232961109e-05, "loss": 0.2705, "step": 5900 }, { "epoch": 0.5132981027462531, "grad_norm": 7.5022430419921875, "learning_rate": 1.6233153638814014e-05, "loss": 0.3089, "step": 5925 }, { "epoch": 0.5154639175257731, "grad_norm": 9.813260078430176, "learning_rate": 1.616095494801694e-05, "loss": 0.3386, "step": 5950 }, { "epoch": 0.5176297323052933, "grad_norm": 8.11892318725586, "learning_rate": 1.608875625721987e-05, "loss": 0.3796, "step": 5975 }, { "epoch": 0.5197955470848133, "grad_norm": 8.750290870666504, "learning_rate": 1.6016557566422798e-05, "loss": 0.4231, "step": 6000 }, { "epoch": 0.5219613618643334, "grad_norm": 8.316088676452637, "learning_rate": 1.5944358875625725e-05, "loss": 0.3922, "step": 6025 }, { "epoch": 0.5241271766438534, "grad_norm": 4.458547592163086, "learning_rate": 1.5872160184828648e-05, "loss": 0.3138, "step": 6050 }, { "epoch": 0.5262929914233735, "grad_norm": 4.100847244262695, "learning_rate": 1.5799961494031575e-05, "loss": 0.3106, "step": 6075 }, { "epoch": 0.5284588062028935, "grad_norm": 3.5927000045776367, "learning_rate": 1.57277628032345e-05, "loss": 0.188, "step": 6100 }, { "epoch": 0.5306246209824136, "grad_norm": 0.6444216370582581, "learning_rate": 1.5655564112437428e-05, "loss": 0.209, "step": 6125 }, { "epoch": 0.5327904357619336, "grad_norm": 6.649785041809082, "learning_rate": 1.5583365421640355e-05, "loss": 0.2617, "step": 6150 }, { "epoch": 0.5349562505414537, "grad_norm": 8.491826057434082, "learning_rate": 1.551116673084328e-05, "loss": 0.3059, "step": 6175 }, { "epoch": 0.5371220653209737, "grad_norm": 22.71511459350586, "learning_rate": 1.5438968040046208e-05, "loss": 0.2764, "step": 6200 }, { "epoch": 0.5392878801004938, "grad_norm": 6.877171516418457, "learning_rate": 1.5366769349249135e-05, "loss": 0.2801, "step": 6225 }, { "epoch": 0.5414536948800138, "grad_norm": 0.46479833126068115, "learning_rate": 1.529457065845206e-05, "loss": 0.3744, "step": 6250 }, { "epoch": 0.5436195096595339, "grad_norm": 7.200215816497803, "learning_rate": 1.5222371967654987e-05, "loss": 0.3067, "step": 6275 }, { "epoch": 0.5457853244390539, "grad_norm": 6.230359077453613, "learning_rate": 1.5150173276857913e-05, "loss": 0.3305, "step": 6300 }, { "epoch": 0.5479511392185741, "grad_norm": 3.2241950035095215, "learning_rate": 1.5077974586060838e-05, "loss": 0.2827, "step": 6325 }, { "epoch": 0.5501169539980941, "grad_norm": 10.813590049743652, "learning_rate": 1.5005775895263765e-05, "loss": 0.2712, "step": 6350 }, { "epoch": 0.5522827687776142, "grad_norm": 3.5207877159118652, "learning_rate": 1.4933577204466692e-05, "loss": 0.2677, "step": 6375 }, { "epoch": 0.5544485835571342, "grad_norm": 6.884098529815674, "learning_rate": 1.4861378513669619e-05, "loss": 0.4269, "step": 6400 }, { "epoch": 0.5566143983366543, "grad_norm": 12.490416526794434, "learning_rate": 1.4789179822872547e-05, "loss": 0.3834, "step": 6425 }, { "epoch": 0.5587802131161743, "grad_norm": 6.844019889831543, "learning_rate": 1.4716981132075472e-05, "loss": 0.4177, "step": 6450 }, { "epoch": 0.5609460278956944, "grad_norm": 2.4574711322784424, "learning_rate": 1.4644782441278399e-05, "loss": 0.2457, "step": 6475 }, { "epoch": 0.5631118426752144, "grad_norm": 4.939560413360596, "learning_rate": 1.4572583750481324e-05, "loss": 0.348, "step": 6500 }, { "epoch": 0.5652776574547345, "grad_norm": 11.443745613098145, "learning_rate": 1.4500385059684252e-05, "loss": 0.3035, "step": 6525 }, { "epoch": 0.5674434722342545, "grad_norm": 5.136826515197754, "learning_rate": 1.4428186368887177e-05, "loss": 0.39, "step": 6550 }, { "epoch": 0.5696092870137746, "grad_norm": 8.772330284118652, "learning_rate": 1.4355987678090104e-05, "loss": 0.366, "step": 6575 }, { "epoch": 0.5717751017932946, "grad_norm": 0.46080633997917175, "learning_rate": 1.428378898729303e-05, "loss": 0.2299, "step": 6600 }, { "epoch": 0.5739409165728147, "grad_norm": 5.478773593902588, "learning_rate": 1.4211590296495957e-05, "loss": 0.1737, "step": 6625 }, { "epoch": 0.5761067313523347, "grad_norm": 11.235420227050781, "learning_rate": 1.4139391605698884e-05, "loss": 0.3773, "step": 6650 }, { "epoch": 0.5782725461318549, "grad_norm": 7.810971260070801, "learning_rate": 1.4067192914901809e-05, "loss": 0.3409, "step": 6675 }, { "epoch": 0.5804383609113749, "grad_norm": 2.817094087600708, "learning_rate": 1.3994994224104737e-05, "loss": 0.1739, "step": 6700 }, { "epoch": 0.582604175690895, "grad_norm": 0.4941748082637787, "learning_rate": 1.3922795533307664e-05, "loss": 0.3462, "step": 6725 }, { "epoch": 0.584769990470415, "grad_norm": 1.5013363361358643, "learning_rate": 1.3850596842510589e-05, "loss": 0.2976, "step": 6750 }, { "epoch": 0.586935805249935, "grad_norm": 4.63820219039917, "learning_rate": 1.3778398151713516e-05, "loss": 0.3246, "step": 6775 }, { "epoch": 0.5891016200294551, "grad_norm": 0.6134036779403687, "learning_rate": 1.3706199460916443e-05, "loss": 0.3808, "step": 6800 }, { "epoch": 0.5912674348089751, "grad_norm": 9.693577766418457, "learning_rate": 1.363400077011937e-05, "loss": 0.2926, "step": 6825 }, { "epoch": 0.5934332495884952, "grad_norm": 8.138602256774902, "learning_rate": 1.3561802079322296e-05, "loss": 0.2709, "step": 6850 }, { "epoch": 0.5955990643680152, "grad_norm": 5.065515041351318, "learning_rate": 1.3489603388525221e-05, "loss": 0.3777, "step": 6875 }, { "epoch": 0.5977648791475353, "grad_norm": 6.169302463531494, "learning_rate": 1.341740469772815e-05, "loss": 0.2834, "step": 6900 }, { "epoch": 0.5999306939270553, "grad_norm": 1.4236884117126465, "learning_rate": 1.3345206006931074e-05, "loss": 0.2965, "step": 6925 }, { "epoch": 0.6020965087065754, "grad_norm": 4.954479217529297, "learning_rate": 1.3273007316134001e-05, "loss": 0.2399, "step": 6950 }, { "epoch": 0.6042623234860954, "grad_norm": 1.1738444566726685, "learning_rate": 1.3200808625336928e-05, "loss": 0.2936, "step": 6975 }, { "epoch": 0.6064281382656155, "grad_norm": 6.822793006896973, "learning_rate": 1.3128609934539855e-05, "loss": 0.2674, "step": 7000 }, { "epoch": 0.6085939530451355, "grad_norm": 9.408463478088379, "learning_rate": 1.3056411243742781e-05, "loss": 0.265, "step": 7025 }, { "epoch": 0.6107597678246557, "grad_norm": 24.97877311706543, "learning_rate": 1.2984212552945706e-05, "loss": 0.3257, "step": 7050 }, { "epoch": 0.6129255826041757, "grad_norm": 2.854039192199707, "learning_rate": 1.2912013862148633e-05, "loss": 0.3504, "step": 7075 }, { "epoch": 0.6150913973836958, "grad_norm": 0.40900859236717224, "learning_rate": 1.283981517135156e-05, "loss": 0.1485, "step": 7100 }, { "epoch": 0.6172572121632158, "grad_norm": 5.776600360870361, "learning_rate": 1.2767616480554486e-05, "loss": 0.2598, "step": 7125 }, { "epoch": 0.6194230269427359, "grad_norm": 1.7507195472717285, "learning_rate": 1.2695417789757413e-05, "loss": 0.2838, "step": 7150 }, { "epoch": 0.6215888417222559, "grad_norm": 7.723363399505615, "learning_rate": 1.2623219098960338e-05, "loss": 0.3391, "step": 7175 }, { "epoch": 0.623754656501776, "grad_norm": 6.485815048217773, "learning_rate": 1.2551020408163267e-05, "loss": 0.3568, "step": 7200 }, { "epoch": 0.625920471281296, "grad_norm": 0.392874151468277, "learning_rate": 1.2481709664998075e-05, "loss": 0.3001, "step": 7225 }, { "epoch": 0.6280862860608161, "grad_norm": 1.3930811882019043, "learning_rate": 1.2409510974201001e-05, "loss": 0.2613, "step": 7250 }, { "epoch": 0.6302521008403361, "grad_norm": 0.3461158275604248, "learning_rate": 1.2337312283403928e-05, "loss": 0.3379, "step": 7275 }, { "epoch": 0.6324179156198562, "grad_norm": 3.489888906478882, "learning_rate": 1.2265113592606855e-05, "loss": 0.3347, "step": 7300 }, { "epoch": 0.6345837303993762, "grad_norm": 2.3235511779785156, "learning_rate": 1.219291490180978e-05, "loss": 0.242, "step": 7325 }, { "epoch": 0.6367495451788963, "grad_norm": 10.576093673706055, "learning_rate": 1.2120716211012708e-05, "loss": 0.3076, "step": 7350 }, { "epoch": 0.6389153599584163, "grad_norm": 4.862971305847168, "learning_rate": 1.2048517520215633e-05, "loss": 0.3055, "step": 7375 }, { "epoch": 0.6410811747379365, "grad_norm": 4.282524108886719, "learning_rate": 1.197631882941856e-05, "loss": 0.4014, "step": 7400 }, { "epoch": 0.6432469895174565, "grad_norm": 1.2869305610656738, "learning_rate": 1.1904120138621487e-05, "loss": 0.3723, "step": 7425 }, { "epoch": 0.6454128042969766, "grad_norm": 8.37488842010498, "learning_rate": 1.1831921447824414e-05, "loss": 0.3421, "step": 7450 }, { "epoch": 0.6475786190764966, "grad_norm": 8.292667388916016, "learning_rate": 1.175972275702734e-05, "loss": 0.4306, "step": 7475 }, { "epoch": 0.6497444338560167, "grad_norm": 7.678843975067139, "learning_rate": 1.1687524066230265e-05, "loss": 0.2536, "step": 7500 }, { "epoch": 0.6519102486355367, "grad_norm": 1.5608030557632446, "learning_rate": 1.1615325375433192e-05, "loss": 0.264, "step": 7525 }, { "epoch": 0.6540760634150568, "grad_norm": 7.649046897888184, "learning_rate": 1.1543126684636119e-05, "loss": 0.1767, "step": 7550 }, { "epoch": 0.6562418781945768, "grad_norm": 4.701557636260986, "learning_rate": 1.1470927993839045e-05, "loss": 0.259, "step": 7575 }, { "epoch": 0.6584076929740968, "grad_norm": 14.77114200592041, "learning_rate": 1.1398729303041972e-05, "loss": 0.2761, "step": 7600 }, { "epoch": 0.6605735077536169, "grad_norm": 0.08189712464809418, "learning_rate": 1.1326530612244897e-05, "loss": 0.2934, "step": 7625 }, { "epoch": 0.6627393225331369, "grad_norm": 8.246410369873047, "learning_rate": 1.1254331921447826e-05, "loss": 0.3055, "step": 7650 }, { "epoch": 0.664905137312657, "grad_norm": 2.8091800212860107, "learning_rate": 1.118213323065075e-05, "loss": 0.2532, "step": 7675 }, { "epoch": 0.667070952092177, "grad_norm": 8.43855094909668, "learning_rate": 1.1109934539853677e-05, "loss": 0.2942, "step": 7700 }, { "epoch": 0.6692367668716971, "grad_norm": 2.259917974472046, "learning_rate": 1.1037735849056604e-05, "loss": 0.2048, "step": 7725 }, { "epoch": 0.6714025816512171, "grad_norm": 13.296177864074707, "learning_rate": 1.096553715825953e-05, "loss": 0.2884, "step": 7750 }, { "epoch": 0.6735683964307373, "grad_norm": 7.745298862457275, "learning_rate": 1.0893338467462457e-05, "loss": 0.3598, "step": 7775 }, { "epoch": 0.6757342112102573, "grad_norm": 1.932173490524292, "learning_rate": 1.0821139776665382e-05, "loss": 0.3318, "step": 7800 }, { "epoch": 0.6779000259897774, "grad_norm": 7.833034515380859, "learning_rate": 1.0748941085868309e-05, "loss": 0.3058, "step": 7825 }, { "epoch": 0.6800658407692974, "grad_norm": 8.620037078857422, "learning_rate": 1.0676742395071238e-05, "loss": 0.3395, "step": 7850 }, { "epoch": 0.6822316555488175, "grad_norm": 8.948209762573242, "learning_rate": 1.0604543704274163e-05, "loss": 0.2973, "step": 7875 }, { "epoch": 0.6843974703283375, "grad_norm": 5.001883506774902, "learning_rate": 1.053234501347709e-05, "loss": 0.2741, "step": 7900 }, { "epoch": 0.6865632851078576, "grad_norm": 10.376258850097656, "learning_rate": 1.0460146322680016e-05, "loss": 0.2493, "step": 7925 }, { "epoch": 0.6887290998873776, "grad_norm": 9.021862030029297, "learning_rate": 1.0387947631882943e-05, "loss": 0.2966, "step": 7950 }, { "epoch": 0.6908949146668977, "grad_norm": 12.025108337402344, "learning_rate": 1.0315748941085868e-05, "loss": 0.3207, "step": 7975 }, { "epoch": 0.6930607294464177, "grad_norm": 0.8383066058158875, "learning_rate": 1.0243550250288794e-05, "loss": 0.2501, "step": 8000 }, { "epoch": 0.6952265442259378, "grad_norm": 2.6812140941619873, "learning_rate": 1.0171351559491723e-05, "loss": 0.4028, "step": 8025 }, { "epoch": 0.6973923590054578, "grad_norm": 11.301798820495605, "learning_rate": 1.0099152868694648e-05, "loss": 0.3549, "step": 8050 }, { "epoch": 0.6995581737849779, "grad_norm": 8.55245304107666, "learning_rate": 1.0026954177897575e-05, "loss": 0.3805, "step": 8075 }, { "epoch": 0.7017239885644979, "grad_norm": 1.9036015272140503, "learning_rate": 9.9547554871005e-06, "loss": 0.353, "step": 8100 }, { "epoch": 0.7038898033440181, "grad_norm": 1.0196151733398438, "learning_rate": 9.882556796303428e-06, "loss": 0.3569, "step": 8125 }, { "epoch": 0.7060556181235381, "grad_norm": 2.688908338546753, "learning_rate": 9.810358105506355e-06, "loss": 0.2588, "step": 8150 }, { "epoch": 0.7082214329030582, "grad_norm": 0.6335782408714294, "learning_rate": 9.73815941470928e-06, "loss": 0.2252, "step": 8175 }, { "epoch": 0.7103872476825782, "grad_norm": 4.539221286773682, "learning_rate": 9.665960723912206e-06, "loss": 0.2747, "step": 8200 }, { "epoch": 0.7125530624620983, "grad_norm": 8.757186889648438, "learning_rate": 9.593762033115133e-06, "loss": 0.3239, "step": 8225 }, { "epoch": 0.7147188772416183, "grad_norm": 1.7275235652923584, "learning_rate": 9.52156334231806e-06, "loss": 0.2954, "step": 8250 }, { "epoch": 0.7168846920211384, "grad_norm": 6.338670253753662, "learning_rate": 9.449364651520987e-06, "loss": 0.3749, "step": 8275 }, { "epoch": 0.7190505068006584, "grad_norm": 1.565496563911438, "learning_rate": 9.377165960723912e-06, "loss": 0.2757, "step": 8300 }, { "epoch": 0.7212163215801785, "grad_norm": 0.0664602667093277, "learning_rate": 9.30496726992684e-06, "loss": 0.3012, "step": 8325 }, { "epoch": 0.7233821363596985, "grad_norm": 10.375814437866211, "learning_rate": 9.232768579129765e-06, "loss": 0.2985, "step": 8350 }, { "epoch": 0.7255479511392185, "grad_norm": 16.607072830200195, "learning_rate": 9.160569888332692e-06, "loss": 0.2656, "step": 8375 }, { "epoch": 0.7277137659187386, "grad_norm": 0.6724597811698914, "learning_rate": 9.088371197535618e-06, "loss": 0.2007, "step": 8400 }, { "epoch": 0.7298795806982586, "grad_norm": 2.3397414684295654, "learning_rate": 9.016172506738545e-06, "loss": 0.2402, "step": 8425 }, { "epoch": 0.7320453954777787, "grad_norm": 11.172548294067383, "learning_rate": 8.943973815941472e-06, "loss": 0.3434, "step": 8450 }, { "epoch": 0.7342112102572987, "grad_norm": 12.031539916992188, "learning_rate": 8.871775125144397e-06, "loss": 0.2628, "step": 8475 }, { "epoch": 0.7363770250368189, "grad_norm": 0.37211769819259644, "learning_rate": 8.799576434347324e-06, "loss": 0.265, "step": 8500 }, { "epoch": 0.738542839816339, "grad_norm": 6.181528568267822, "learning_rate": 8.72737774355025e-06, "loss": 0.3748, "step": 8525 }, { "epoch": 0.740708654595859, "grad_norm": 2.7227742671966553, "learning_rate": 8.655179052753177e-06, "loss": 0.249, "step": 8550 }, { "epoch": 0.742874469375379, "grad_norm": 7.977476596832275, "learning_rate": 8.582980361956104e-06, "loss": 0.3375, "step": 8575 }, { "epoch": 0.7450402841548991, "grad_norm": 11.404130935668945, "learning_rate": 8.510781671159029e-06, "loss": 0.3336, "step": 8600 }, { "epoch": 0.7472060989344191, "grad_norm": 0.4421218931674957, "learning_rate": 8.438582980361957e-06, "loss": 0.3702, "step": 8625 }, { "epoch": 0.7493719137139392, "grad_norm": 4.386607646942139, "learning_rate": 8.366384289564882e-06, "loss": 0.3494, "step": 8650 }, { "epoch": 0.7515377284934592, "grad_norm": 5.428525924682617, "learning_rate": 8.294185598767809e-06, "loss": 0.2996, "step": 8675 }, { "epoch": 0.7537035432729793, "grad_norm": 0.3034394085407257, "learning_rate": 8.221986907970736e-06, "loss": 0.2433, "step": 8700 }, { "epoch": 0.7558693580524993, "grad_norm": 3.75878643989563, "learning_rate": 8.149788217173662e-06, "loss": 0.3027, "step": 8725 }, { "epoch": 0.7580351728320194, "grad_norm": 9.965909004211426, "learning_rate": 8.077589526376589e-06, "loss": 0.382, "step": 8750 }, { "epoch": 0.7602009876115394, "grad_norm": 7.314566135406494, "learning_rate": 8.005390835579514e-06, "loss": 0.2874, "step": 8775 }, { "epoch": 0.7623668023910595, "grad_norm": 8.704547882080078, "learning_rate": 7.93319214478244e-06, "loss": 0.2737, "step": 8800 }, { "epoch": 0.7645326171705795, "grad_norm": 10.275945663452148, "learning_rate": 7.86099345398537e-06, "loss": 0.3212, "step": 8825 }, { "epoch": 0.7666984319500997, "grad_norm": 4.1912641525268555, "learning_rate": 7.788794763188294e-06, "loss": 0.3475, "step": 8850 }, { "epoch": 0.7688642467296197, "grad_norm": 10.281148910522461, "learning_rate": 7.716596072391221e-06, "loss": 0.221, "step": 8875 }, { "epoch": 0.7710300615091398, "grad_norm": 9.613810539245605, "learning_rate": 7.644397381594146e-06, "loss": 0.2587, "step": 8900 }, { "epoch": 0.7731958762886598, "grad_norm": 1.2200976610183716, "learning_rate": 7.572198690797074e-06, "loss": 0.2852, "step": 8925 }, { "epoch": 0.7753616910681799, "grad_norm": 2.445672035217285, "learning_rate": 7.5e-06, "loss": 0.3837, "step": 8950 }, { "epoch": 0.7775275058476999, "grad_norm": 13.744851112365723, "learning_rate": 7.427801309202927e-06, "loss": 0.2333, "step": 8975 }, { "epoch": 0.77969332062722, "grad_norm": 4.426064968109131, "learning_rate": 7.355602618405853e-06, "loss": 0.3036, "step": 9000 }, { "epoch": 0.78185913540674, "grad_norm": 8.329988479614258, "learning_rate": 7.28340392760878e-06, "loss": 0.3287, "step": 9025 }, { "epoch": 0.7840249501862601, "grad_norm": 4.122848987579346, "learning_rate": 7.211205236811706e-06, "loss": 0.3248, "step": 9050 }, { "epoch": 0.7861907649657801, "grad_norm": 6.127285480499268, "learning_rate": 7.139006546014633e-06, "loss": 0.2395, "step": 9075 }, { "epoch": 0.7883565797453002, "grad_norm": 1.6887600421905518, "learning_rate": 7.066807855217559e-06, "loss": 0.2647, "step": 9100 }, { "epoch": 0.7905223945248202, "grad_norm": 1.4300670623779297, "learning_rate": 6.9946091644204855e-06, "loss": 0.3345, "step": 9125 }, { "epoch": 0.7926882093043403, "grad_norm": 9.334101676940918, "learning_rate": 6.922410473623411e-06, "loss": 0.3421, "step": 9150 }, { "epoch": 0.7948540240838603, "grad_norm": 6.996714115142822, "learning_rate": 6.850211782826339e-06, "loss": 0.3496, "step": 9175 }, { "epoch": 0.7970198388633805, "grad_norm": 8.47280216217041, "learning_rate": 6.778013092029265e-06, "loss": 0.253, "step": 9200 }, { "epoch": 0.7991856536429005, "grad_norm": 3.239483118057251, "learning_rate": 6.708702348864075e-06, "loss": 0.3462, "step": 9225 }, { "epoch": 0.8013514684224206, "grad_norm": 1.6153030395507812, "learning_rate": 6.6365036580670006e-06, "loss": 0.2688, "step": 9250 }, { "epoch": 0.8035172832019406, "grad_norm": 5.316878795623779, "learning_rate": 6.564304967269927e-06, "loss": 0.3301, "step": 9275 }, { "epoch": 0.8056830979814606, "grad_norm": 8.06822395324707, "learning_rate": 6.492106276472853e-06, "loss": 0.3382, "step": 9300 }, { "epoch": 0.8078489127609807, "grad_norm": 2.8038644790649414, "learning_rate": 6.41990758567578e-06, "loss": 0.2219, "step": 9325 }, { "epoch": 0.8100147275405007, "grad_norm": 5.063823223114014, "learning_rate": 6.3477088948787066e-06, "loss": 0.278, "step": 9350 }, { "epoch": 0.8121805423200208, "grad_norm": 6.974782466888428, "learning_rate": 6.275510204081633e-06, "loss": 0.2338, "step": 9375 }, { "epoch": 0.8143463570995408, "grad_norm": 2.8085834980010986, "learning_rate": 6.203311513284559e-06, "loss": 0.2732, "step": 9400 }, { "epoch": 0.8165121718790609, "grad_norm": 12.976601600646973, "learning_rate": 6.131112822487486e-06, "loss": 0.2973, "step": 9425 }, { "epoch": 0.8186779866585809, "grad_norm": 2.7448630332946777, "learning_rate": 6.058914131690412e-06, "loss": 0.2783, "step": 9450 }, { "epoch": 0.820843801438101, "grad_norm": 2.347792387008667, "learning_rate": 5.986715440893339e-06, "loss": 0.2418, "step": 9475 }, { "epoch": 0.823009616217621, "grad_norm": 2.851559638977051, "learning_rate": 5.914516750096265e-06, "loss": 0.2603, "step": 9500 }, { "epoch": 0.8251754309971411, "grad_norm": 6.941406726837158, "learning_rate": 5.842318059299192e-06, "loss": 0.1888, "step": 9525 }, { "epoch": 0.8273412457766611, "grad_norm": 4.45375394821167, "learning_rate": 5.770119368502118e-06, "loss": 0.2581, "step": 9550 }, { "epoch": 0.8295070605561813, "grad_norm": 5.2709641456604, "learning_rate": 5.6979206777050444e-06, "loss": 0.2742, "step": 9575 }, { "epoch": 0.8316728753357013, "grad_norm": 2.6814463138580322, "learning_rate": 5.62572198690797e-06, "loss": 0.2156, "step": 9600 }, { "epoch": 0.8338386901152214, "grad_norm": 0.12416364997625351, "learning_rate": 5.553523296110898e-06, "loss": 0.3317, "step": 9625 }, { "epoch": 0.8360045048947414, "grad_norm": 5.639218807220459, "learning_rate": 5.481324605313824e-06, "loss": 0.1967, "step": 9650 }, { "epoch": 0.8381703196742615, "grad_norm": 0.8800064921379089, "learning_rate": 5.4091259145167504e-06, "loss": 0.1701, "step": 9675 }, { "epoch": 0.8403361344537815, "grad_norm": 2.7125442028045654, "learning_rate": 5.336927223719676e-06, "loss": 0.3064, "step": 9700 }, { "epoch": 0.8425019492333016, "grad_norm": 3.1365272998809814, "learning_rate": 5.264728532922603e-06, "loss": 0.3511, "step": 9725 }, { "epoch": 0.8446677640128216, "grad_norm": 10.584244728088379, "learning_rate": 5.19252984212553e-06, "loss": 0.2461, "step": 9750 }, { "epoch": 0.8468335787923417, "grad_norm": 0.7926290035247803, "learning_rate": 5.1203311513284565e-06, "loss": 0.3047, "step": 9775 }, { "epoch": 0.8489993935718617, "grad_norm": 10.744616508483887, "learning_rate": 5.048132460531382e-06, "loss": 0.3234, "step": 9800 }, { "epoch": 0.8511652083513818, "grad_norm": 3.9436535835266113, "learning_rate": 4.975933769734309e-06, "loss": 0.2843, "step": 9825 }, { "epoch": 0.8533310231309018, "grad_norm": 0.2785266637802124, "learning_rate": 4.903735078937235e-06, "loss": 0.3365, "step": 9850 }, { "epoch": 0.8554968379104219, "grad_norm": 7.446309566497803, "learning_rate": 4.831536388140162e-06, "loss": 0.3802, "step": 9875 }, { "epoch": 0.8576626526899419, "grad_norm": 9.687524795532227, "learning_rate": 4.759337697343088e-06, "loss": 0.2587, "step": 9900 }, { "epoch": 0.8598284674694621, "grad_norm": 0.4837453067302704, "learning_rate": 4.687139006546015e-06, "loss": 0.2367, "step": 9925 }, { "epoch": 0.8619942822489821, "grad_norm": 0.7170611023902893, "learning_rate": 4.614940315748941e-06, "loss": 0.2971, "step": 9950 }, { "epoch": 0.8641600970285022, "grad_norm": 16.417407989501953, "learning_rate": 4.542741624951868e-06, "loss": 0.2884, "step": 9975 }, { "epoch": 0.8663259118080222, "grad_norm": 7.771174430847168, "learning_rate": 4.4705429341547935e-06, "loss": 0.2296, "step": 10000 }, { "epoch": 0.8684917265875423, "grad_norm": 1.540907859802246, "learning_rate": 4.398344243357721e-06, "loss": 0.3145, "step": 10025 }, { "epoch": 0.8706575413670623, "grad_norm": 1.4157791137695312, "learning_rate": 4.326145552560647e-06, "loss": 0.178, "step": 10050 }, { "epoch": 0.8728233561465824, "grad_norm": 4.707205295562744, "learning_rate": 4.253946861763574e-06, "loss": 0.2681, "step": 10075 }, { "epoch": 0.8749891709261024, "grad_norm": 3.7186520099639893, "learning_rate": 4.1817481709664995e-06, "loss": 0.3191, "step": 10100 }, { "epoch": 0.8771549857056224, "grad_norm": 1.6584956645965576, "learning_rate": 4.109549480169426e-06, "loss": 0.2544, "step": 10125 }, { "epoch": 0.8793208004851425, "grad_norm": 9.22360610961914, "learning_rate": 4.037350789372352e-06, "loss": 0.2965, "step": 10150 }, { "epoch": 0.8814866152646625, "grad_norm": 3.5934746265411377, "learning_rate": 3.96515209857528e-06, "loss": 0.317, "step": 10175 }, { "epoch": 0.8836524300441826, "grad_norm": 1.5978528261184692, "learning_rate": 3.892953407778206e-06, "loss": 0.2149, "step": 10200 }, { "epoch": 0.8858182448237026, "grad_norm": 4.726417064666748, "learning_rate": 3.820754716981132e-06, "loss": 0.4876, "step": 10225 }, { "epoch": 0.8879840596032227, "grad_norm": 7.836237907409668, "learning_rate": 3.7485560261840585e-06, "loss": 0.2984, "step": 10250 }, { "epoch": 0.8901498743827427, "grad_norm": 6.5479912757873535, "learning_rate": 3.676357335386985e-06, "loss": 0.3024, "step": 10275 }, { "epoch": 0.8923156891622629, "grad_norm": 1.180179476737976, "learning_rate": 3.6041586445899115e-06, "loss": 0.2447, "step": 10300 }, { "epoch": 0.8944815039417829, "grad_norm": 5.868828773498535, "learning_rate": 3.5319599537928378e-06, "loss": 0.2684, "step": 10325 }, { "epoch": 0.896647318721303, "grad_norm": 6.2655816078186035, "learning_rate": 3.4597612629957645e-06, "loss": 0.1714, "step": 10350 }, { "epoch": 0.898813133500823, "grad_norm": 6.3384270668029785, "learning_rate": 3.387562572198691e-06, "loss": 0.2776, "step": 10375 }, { "epoch": 0.9009789482803431, "grad_norm": 6.097102165222168, "learning_rate": 3.315363881401617e-06, "loss": 0.2745, "step": 10400 }, { "epoch": 0.9031447630598631, "grad_norm": 7.250086784362793, "learning_rate": 3.243165190604544e-06, "loss": 0.3299, "step": 10425 }, { "epoch": 0.9053105778393832, "grad_norm": 9.260988235473633, "learning_rate": 3.17096649980747e-06, "loss": 0.2629, "step": 10450 }, { "epoch": 0.9074763926189032, "grad_norm": 8.009949684143066, "learning_rate": 3.0987678090103964e-06, "loss": 0.3627, "step": 10475 }, { "epoch": 0.9096422073984233, "grad_norm": 1.247878074645996, "learning_rate": 3.026569118213323e-06, "loss": 0.2236, "step": 10500 }, { "epoch": 0.9118080221779433, "grad_norm": 6.759634971618652, "learning_rate": 2.9543704274162494e-06, "loss": 0.2819, "step": 10525 }, { "epoch": 0.9139738369574634, "grad_norm": 0.09837600588798523, "learning_rate": 2.882171736619176e-06, "loss": 0.3129, "step": 10550 }, { "epoch": 0.9161396517369834, "grad_norm": 6.850848197937012, "learning_rate": 2.8099730458221024e-06, "loss": 0.3051, "step": 10575 }, { "epoch": 0.9183054665165035, "grad_norm": 8.94210147857666, "learning_rate": 2.7377743550250287e-06, "loss": 0.3955, "step": 10600 }, { "epoch": 0.9204712812960235, "grad_norm": 8.595787048339844, "learning_rate": 2.6655756642279554e-06, "loss": 0.2493, "step": 10625 }, { "epoch": 0.9226370960755437, "grad_norm": 7.062394618988037, "learning_rate": 2.5933769734308817e-06, "loss": 0.2543, "step": 10650 }, { "epoch": 0.9248029108550637, "grad_norm": 3.371393918991089, "learning_rate": 2.521178282633808e-06, "loss": 0.2222, "step": 10675 }, { "epoch": 0.9269687256345838, "grad_norm": 1.3468866348266602, "learning_rate": 2.4489795918367347e-06, "loss": 0.2823, "step": 10700 }, { "epoch": 0.9291345404141038, "grad_norm": 15.475239753723145, "learning_rate": 2.376780901039661e-06, "loss": 0.3098, "step": 10725 }, { "epoch": 0.9313003551936239, "grad_norm": 6.605096340179443, "learning_rate": 2.3045822102425877e-06, "loss": 0.3009, "step": 10750 }, { "epoch": 0.9334661699731439, "grad_norm": 3.2146847248077393, "learning_rate": 2.2323835194455144e-06, "loss": 0.2623, "step": 10775 }, { "epoch": 0.935631984752664, "grad_norm": 2.727200508117676, "learning_rate": 2.1601848286484407e-06, "loss": 0.1952, "step": 10800 }, { "epoch": 0.937797799532184, "grad_norm": 2.7418553829193115, "learning_rate": 2.0879861378513674e-06, "loss": 0.4527, "step": 10825 }, { "epoch": 0.939963614311704, "grad_norm": 8.577201843261719, "learning_rate": 2.0157874470542937e-06, "loss": 0.2323, "step": 10850 }, { "epoch": 0.9421294290912241, "grad_norm": 4.514817237854004, "learning_rate": 1.94358875625722e-06, "loss": 0.3109, "step": 10875 }, { "epoch": 0.9442952438707441, "grad_norm": 10.761394500732422, "learning_rate": 1.8713900654601463e-06, "loss": 0.3335, "step": 10900 }, { "epoch": 0.9464610586502642, "grad_norm": 8.004775047302246, "learning_rate": 1.7991913746630728e-06, "loss": 0.2862, "step": 10925 }, { "epoch": 0.9486268734297842, "grad_norm": 7.491416931152344, "learning_rate": 1.7269926838659993e-06, "loss": 0.4005, "step": 10950 }, { "epoch": 0.9507926882093043, "grad_norm": 6.168478488922119, "learning_rate": 1.6547939930689255e-06, "loss": 0.2815, "step": 10975 }, { "epoch": 0.9529585029888243, "grad_norm": 7.221772193908691, "learning_rate": 1.582595302271852e-06, "loss": 0.2157, "step": 11000 }, { "epoch": 0.9551243177683445, "grad_norm": 5.9744086265563965, "learning_rate": 1.5103966114747788e-06, "loss": 0.3733, "step": 11025 }, { "epoch": 0.9572901325478645, "grad_norm": 5.776475429534912, "learning_rate": 1.4381979206777053e-06, "loss": 0.2843, "step": 11050 }, { "epoch": 0.9594559473273846, "grad_norm": 1.3870640993118286, "learning_rate": 1.3659992298806316e-06, "loss": 0.1963, "step": 11075 }, { "epoch": 0.9616217621069046, "grad_norm": 7.3776535987854, "learning_rate": 1.293800539083558e-06, "loss": 0.3081, "step": 11100 }, { "epoch": 0.9637875768864247, "grad_norm": 11.289216995239258, "learning_rate": 1.2216018482864846e-06, "loss": 0.2317, "step": 11125 }, { "epoch": 0.9659533916659447, "grad_norm": 11.621864318847656, "learning_rate": 1.1494031574894108e-06, "loss": 0.3027, "step": 11150 }, { "epoch": 0.9681192064454648, "grad_norm": 11.617834091186523, "learning_rate": 1.0772044666923373e-06, "loss": 0.3581, "step": 11175 }, { "epoch": 0.9702850212249848, "grad_norm": 5.500637531280518, "learning_rate": 1.0050057758952638e-06, "loss": 0.3, "step": 11200 }, { "epoch": 0.9724508360045049, "grad_norm": 3.552578926086426, "learning_rate": 9.328070850981902e-07, "loss": 0.2797, "step": 11225 }, { "epoch": 0.9746166507840249, "grad_norm": 1.074208378791809, "learning_rate": 8.606083943011167e-07, "loss": 0.2918, "step": 11250 }, { "epoch": 0.976782465563545, "grad_norm": 11.449936866760254, "learning_rate": 7.884097035040431e-07, "loss": 0.2519, "step": 11275 }, { "epoch": 0.978948280343065, "grad_norm": 2.988003730773926, "learning_rate": 7.162110127069696e-07, "loss": 0.2183, "step": 11300 }, { "epoch": 0.9811140951225851, "grad_norm": 2.9280929565429688, "learning_rate": 6.44012321909896e-07, "loss": 0.2764, "step": 11325 }, { "epoch": 0.9832799099021051, "grad_norm": 3.2279105186462402, "learning_rate": 5.718136311128224e-07, "loss": 0.4107, "step": 11350 }, { "epoch": 0.9854457246816253, "grad_norm": 2.54160737991333, "learning_rate": 4.996149403157489e-07, "loss": 0.3135, "step": 11375 }, { "epoch": 0.9876115394611453, "grad_norm": 1.3068925142288208, "learning_rate": 4.2741624951867543e-07, "loss": 0.2138, "step": 11400 }, { "epoch": 0.9897773542406654, "grad_norm": 8.606940269470215, "learning_rate": 3.5521755872160183e-07, "loss": 0.2984, "step": 11425 }, { "epoch": 0.9919431690201854, "grad_norm": 1.2513303756713867, "learning_rate": 2.830188679245283e-07, "loss": 0.2407, "step": 11450 }, { "epoch": 0.9941089837997055, "grad_norm": 11.340466499328613, "learning_rate": 2.1082017712745478e-07, "loss": 0.2449, "step": 11475 }, { "epoch": 0.9962747985792255, "grad_norm": 6.166193008422852, "learning_rate": 1.386214863303812e-07, "loss": 0.2629, "step": 11500 }, { "epoch": 0.9984406133587456, "grad_norm": 4.004662990570068, "learning_rate": 6.642279553330766e-08, "loss": 0.3488, "step": 11525 }, { "epoch": 1.0, "eval_cosine_accuracy": 0.9693415637860082, "eval_loss": 0.4268312156200409, "eval_runtime": 50.4023, "eval_samples_per_second": 96.424, "eval_steps_per_second": 6.031, "step": 11543 } ], "logging_steps": 25, "max_steps": 11543, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }