{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9998919736415686, "eval_steps": 500, "global_step": 6942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014403514457527638, "grad_norm": 16.586680451608224, "learning_rate": 1.1510791366906476e-07, "loss": 1.7625, "step": 5 }, { "epoch": 0.0028807028915055276, "grad_norm": 16.23021243769238, "learning_rate": 2.589928057553957e-07, "loss": 1.754, "step": 10 }, { "epoch": 0.004321054337258291, "grad_norm": 13.550599381632765, "learning_rate": 4.0287769784172663e-07, "loss": 1.7397, "step": 15 }, { "epoch": 0.005761405783011055, "grad_norm": 11.888221853811155, "learning_rate": 5.467625899280576e-07, "loss": 1.6888, "step": 20 }, { "epoch": 0.007201757228763818, "grad_norm": 9.60550960713175, "learning_rate": 6.906474820143885e-07, "loss": 1.5985, "step": 25 }, { "epoch": 0.008642108674516582, "grad_norm": 9.038730172783534, "learning_rate": 8.345323741007196e-07, "loss": 1.4653, "step": 30 }, { "epoch": 0.010082460120269346, "grad_norm": 8.758156216249105, "learning_rate": 9.784172661870505e-07, "loss": 1.2571, "step": 35 }, { "epoch": 0.01152281156602211, "grad_norm": 8.491557806539111, "learning_rate": 1.1223021582733814e-06, "loss": 0.9934, "step": 40 }, { "epoch": 0.012963163011774873, "grad_norm": 7.84797759833516, "learning_rate": 1.2661870503597123e-06, "loss": 0.8012, "step": 45 }, { "epoch": 0.014403514457527637, "grad_norm": 2.323360571149784, "learning_rate": 1.4100719424460432e-06, "loss": 0.5458, "step": 50 }, { "epoch": 0.0158438659032804, "grad_norm": 1.8779227048258833, "learning_rate": 1.5539568345323742e-06, "loss": 0.5126, "step": 55 }, { "epoch": 0.017284217349033165, "grad_norm": 1.7665084648486797, "learning_rate": 1.6978417266187053e-06, "loss": 0.472, "step": 60 }, { "epoch": 0.01872456879478593, "grad_norm": 2.538885827255077, "learning_rate": 1.8417266187050362e-06, "loss": 0.4457, "step": 65 }, { "epoch": 0.020164920240538693, "grad_norm": 2.6827209141200647, "learning_rate": 1.985611510791367e-06, "loss": 0.4296, "step": 70 }, { "epoch": 0.021605271686291457, "grad_norm": 2.0305361884485555, "learning_rate": 2.129496402877698e-06, "loss": 0.4221, "step": 75 }, { "epoch": 0.02304562313204422, "grad_norm": 2.6997984040664833, "learning_rate": 2.273381294964029e-06, "loss": 0.4028, "step": 80 }, { "epoch": 0.02448597457779698, "grad_norm": 1.2504187205349406, "learning_rate": 2.41726618705036e-06, "loss": 0.3801, "step": 85 }, { "epoch": 0.025926326023549745, "grad_norm": 1.3474376559842136, "learning_rate": 2.5611510791366906e-06, "loss": 0.3694, "step": 90 }, { "epoch": 0.02736667746930251, "grad_norm": 1.4308843552665633, "learning_rate": 2.7050359712230217e-06, "loss": 0.3708, "step": 95 }, { "epoch": 0.028807028915055273, "grad_norm": 1.3683344004037181, "learning_rate": 2.848920863309353e-06, "loss": 0.3384, "step": 100 }, { "epoch": 0.030247380360808037, "grad_norm": 1.3015919647993486, "learning_rate": 2.9928057553956836e-06, "loss": 0.331, "step": 105 }, { "epoch": 0.0316877318065608, "grad_norm": 1.9912468429786234, "learning_rate": 3.1366906474820147e-06, "loss": 0.3178, "step": 110 }, { "epoch": 0.03312808325231356, "grad_norm": 1.2757927978871475, "learning_rate": 3.280575539568346e-06, "loss": 0.3208, "step": 115 }, { "epoch": 0.03456843469806633, "grad_norm": 1.1146556109378103, "learning_rate": 3.4244604316546766e-06, "loss": 0.3143, "step": 120 }, { "epoch": 0.03600878614381909, "grad_norm": 1.254484216831056, "learning_rate": 3.5683453237410077e-06, "loss": 0.3176, "step": 125 }, { "epoch": 0.03744913758957186, "grad_norm": 1.1698368725906516, "learning_rate": 3.7122302158273384e-06, "loss": 0.3146, "step": 130 }, { "epoch": 0.03888948903532462, "grad_norm": 1.1437180666843907, "learning_rate": 3.856115107913669e-06, "loss": 0.3059, "step": 135 }, { "epoch": 0.040329840481077385, "grad_norm": 1.1343184831908104, "learning_rate": 4.000000000000001e-06, "loss": 0.3002, "step": 140 }, { "epoch": 0.041770191926830146, "grad_norm": 1.1952316457960082, "learning_rate": 4.143884892086331e-06, "loss": 0.2924, "step": 145 }, { "epoch": 0.04321054337258291, "grad_norm": 1.347560615976888, "learning_rate": 4.287769784172662e-06, "loss": 0.2856, "step": 150 }, { "epoch": 0.044650894818335674, "grad_norm": 1.2457364588537787, "learning_rate": 4.431654676258993e-06, "loss": 0.2815, "step": 155 }, { "epoch": 0.04609124626408844, "grad_norm": 1.0098986875238485, "learning_rate": 4.575539568345324e-06, "loss": 0.262, "step": 160 }, { "epoch": 0.0475315977098412, "grad_norm": 1.0632076910755275, "learning_rate": 4.719424460431655e-06, "loss": 0.2831, "step": 165 }, { "epoch": 0.04897194915559396, "grad_norm": 1.0001553412785371, "learning_rate": 4.863309352517986e-06, "loss": 0.2692, "step": 170 }, { "epoch": 0.05041230060134673, "grad_norm": 0.9548653221886261, "learning_rate": 5.0071942446043165e-06, "loss": 0.2526, "step": 175 }, { "epoch": 0.05185265204709949, "grad_norm": 0.8527895969241093, "learning_rate": 5.151079136690648e-06, "loss": 0.2578, "step": 180 }, { "epoch": 0.05329300349285226, "grad_norm": 0.8301269546841742, "learning_rate": 5.294964028776979e-06, "loss": 0.2612, "step": 185 }, { "epoch": 0.05473335493860502, "grad_norm": 0.8696014983595155, "learning_rate": 5.43884892086331e-06, "loss": 0.2587, "step": 190 }, { "epoch": 0.056173706384357786, "grad_norm": 0.7739875909178445, "learning_rate": 5.582733812949641e-06, "loss": 0.246, "step": 195 }, { "epoch": 0.057614057830110546, "grad_norm": 0.7654383803539627, "learning_rate": 5.726618705035971e-06, "loss": 0.2596, "step": 200 }, { "epoch": 0.059054409275863314, "grad_norm": 0.8726355233974463, "learning_rate": 5.8705035971223024e-06, "loss": 0.2396, "step": 205 }, { "epoch": 0.060494760721616074, "grad_norm": 0.7952385048612362, "learning_rate": 6.014388489208633e-06, "loss": 0.2489, "step": 210 }, { "epoch": 0.061935112167368835, "grad_norm": 0.8979351693559371, "learning_rate": 6.158273381294965e-06, "loss": 0.2393, "step": 215 }, { "epoch": 0.0633754636131216, "grad_norm": 0.6684273499878575, "learning_rate": 6.302158273381295e-06, "loss": 0.2471, "step": 220 }, { "epoch": 0.06481581505887436, "grad_norm": 0.7298265883042253, "learning_rate": 6.446043165467626e-06, "loss": 0.2421, "step": 225 }, { "epoch": 0.06625616650462712, "grad_norm": 0.6283034910513979, "learning_rate": 6.589928057553957e-06, "loss": 0.2347, "step": 230 }, { "epoch": 0.0676965179503799, "grad_norm": 2.019647888568607, "learning_rate": 6.733812949640288e-06, "loss": 0.2438, "step": 235 }, { "epoch": 0.06913686939613266, "grad_norm": 0.7723184818645342, "learning_rate": 6.877697841726619e-06, "loss": 0.247, "step": 240 }, { "epoch": 0.07057722084188542, "grad_norm": 0.6213657978770711, "learning_rate": 7.021582733812951e-06, "loss": 0.2378, "step": 245 }, { "epoch": 0.07201757228763818, "grad_norm": 0.7770361009794773, "learning_rate": 7.165467625899281e-06, "loss": 0.2393, "step": 250 }, { "epoch": 0.07345792373339095, "grad_norm": 0.6688025297223997, "learning_rate": 7.309352517985612e-06, "loss": 0.243, "step": 255 }, { "epoch": 0.07489827517914371, "grad_norm": 0.7154762192358323, "learning_rate": 7.453237410071943e-06, "loss": 0.2384, "step": 260 }, { "epoch": 0.07633862662489647, "grad_norm": 0.6519073456752323, "learning_rate": 7.597122302158274e-06, "loss": 0.2285, "step": 265 }, { "epoch": 0.07777897807064924, "grad_norm": 0.6216449718793182, "learning_rate": 7.741007194244606e-06, "loss": 0.2209, "step": 270 }, { "epoch": 0.079219329516402, "grad_norm": 0.5890271039585314, "learning_rate": 7.884892086330936e-06, "loss": 0.2295, "step": 275 }, { "epoch": 0.08065968096215477, "grad_norm": 0.5765383492022604, "learning_rate": 8.028776978417266e-06, "loss": 0.2347, "step": 280 }, { "epoch": 0.08210003240790753, "grad_norm": 0.6882797441460431, "learning_rate": 8.172661870503597e-06, "loss": 0.2295, "step": 285 }, { "epoch": 0.08354038385366029, "grad_norm": 0.557535624999968, "learning_rate": 8.316546762589929e-06, "loss": 0.2275, "step": 290 }, { "epoch": 0.08498073529941305, "grad_norm": 0.6508761757468999, "learning_rate": 8.46043165467626e-06, "loss": 0.2315, "step": 295 }, { "epoch": 0.08642108674516583, "grad_norm": 0.6758956838113382, "learning_rate": 8.604316546762592e-06, "loss": 0.231, "step": 300 }, { "epoch": 0.08786143819091859, "grad_norm": 0.578932856654019, "learning_rate": 8.748201438848922e-06, "loss": 0.2389, "step": 305 }, { "epoch": 0.08930178963667135, "grad_norm": 0.6066986114882191, "learning_rate": 8.892086330935252e-06, "loss": 0.241, "step": 310 }, { "epoch": 0.09074214108242411, "grad_norm": 0.615624208891137, "learning_rate": 9.035971223021583e-06, "loss": 0.2167, "step": 315 }, { "epoch": 0.09218249252817688, "grad_norm": 0.5484255060272044, "learning_rate": 9.179856115107915e-06, "loss": 0.2337, "step": 320 }, { "epoch": 0.09362284397392964, "grad_norm": 0.657391795292415, "learning_rate": 9.323741007194246e-06, "loss": 0.2274, "step": 325 }, { "epoch": 0.0950631954196824, "grad_norm": 0.5704279837914004, "learning_rate": 9.467625899280576e-06, "loss": 0.2203, "step": 330 }, { "epoch": 0.09650354686543516, "grad_norm": 0.540539465103558, "learning_rate": 9.611510791366908e-06, "loss": 0.2202, "step": 335 }, { "epoch": 0.09794389831118792, "grad_norm": 0.5694667381563077, "learning_rate": 9.755395683453238e-06, "loss": 0.2185, "step": 340 }, { "epoch": 0.0993842497569407, "grad_norm": 0.6987105858587492, "learning_rate": 9.899280575539569e-06, "loss": 0.223, "step": 345 }, { "epoch": 0.10082460120269346, "grad_norm": 0.6273823448614299, "learning_rate": 1.0043165467625899e-05, "loss": 0.2303, "step": 350 }, { "epoch": 0.10226495264844622, "grad_norm": 0.6219303498898151, "learning_rate": 1.0187050359712232e-05, "loss": 0.206, "step": 355 }, { "epoch": 0.10370530409419898, "grad_norm": 0.5674007048209457, "learning_rate": 1.0330935251798562e-05, "loss": 0.2209, "step": 360 }, { "epoch": 0.10514565553995175, "grad_norm": 0.5817546836599868, "learning_rate": 1.0474820143884894e-05, "loss": 0.2267, "step": 365 }, { "epoch": 0.10658600698570452, "grad_norm": 0.5560607592408621, "learning_rate": 1.0618705035971223e-05, "loss": 0.2279, "step": 370 }, { "epoch": 0.10802635843145728, "grad_norm": 0.6513825672726103, "learning_rate": 1.0762589928057553e-05, "loss": 0.2141, "step": 375 }, { "epoch": 0.10946670987721004, "grad_norm": 0.5936098114546975, "learning_rate": 1.0906474820143887e-05, "loss": 0.2178, "step": 380 }, { "epoch": 0.1109070613229628, "grad_norm": 0.5490902284633686, "learning_rate": 1.1050359712230216e-05, "loss": 0.2195, "step": 385 }, { "epoch": 0.11234741276871557, "grad_norm": 0.5995043966161002, "learning_rate": 1.1194244604316548e-05, "loss": 0.2216, "step": 390 }, { "epoch": 0.11378776421446833, "grad_norm": 0.5725529495218363, "learning_rate": 1.133812949640288e-05, "loss": 0.2234, "step": 395 }, { "epoch": 0.11522811566022109, "grad_norm": 0.5561807013548232, "learning_rate": 1.148201438848921e-05, "loss": 0.2138, "step": 400 }, { "epoch": 0.11666846710597385, "grad_norm": 0.575827707581297, "learning_rate": 1.1625899280575541e-05, "loss": 0.2189, "step": 405 }, { "epoch": 0.11810881855172663, "grad_norm": 0.8535370905580235, "learning_rate": 1.176978417266187e-05, "loss": 0.205, "step": 410 }, { "epoch": 0.11954916999747939, "grad_norm": 0.5195864367586588, "learning_rate": 1.1913669064748204e-05, "loss": 0.2317, "step": 415 }, { "epoch": 0.12098952144323215, "grad_norm": 0.5585128655255315, "learning_rate": 1.2057553956834534e-05, "loss": 0.2242, "step": 420 }, { "epoch": 0.12242987288898491, "grad_norm": 0.5627890703738564, "learning_rate": 1.2201438848920864e-05, "loss": 0.2275, "step": 425 }, { "epoch": 0.12387022433473767, "grad_norm": 0.5525711309273611, "learning_rate": 1.2345323741007195e-05, "loss": 0.2212, "step": 430 }, { "epoch": 0.12531057578049043, "grad_norm": 0.5512498249317371, "learning_rate": 1.2489208633093525e-05, "loss": 0.2075, "step": 435 }, { "epoch": 0.1267509272262432, "grad_norm": 0.527963361870513, "learning_rate": 1.2633093525179858e-05, "loss": 0.2207, "step": 440 }, { "epoch": 0.12819127867199598, "grad_norm": 0.66444135628081, "learning_rate": 1.2776978417266188e-05, "loss": 0.2176, "step": 445 }, { "epoch": 0.12963163011774873, "grad_norm": 0.6375370649966602, "learning_rate": 1.2920863309352518e-05, "loss": 0.218, "step": 450 }, { "epoch": 0.1310719815635015, "grad_norm": 0.5294066493342842, "learning_rate": 1.306474820143885e-05, "loss": 0.2075, "step": 455 }, { "epoch": 0.13251233300925425, "grad_norm": 0.5717532515073108, "learning_rate": 1.3208633093525181e-05, "loss": 0.2151, "step": 460 }, { "epoch": 0.13395268445500702, "grad_norm": 0.5571385430026796, "learning_rate": 1.3352517985611513e-05, "loss": 0.2062, "step": 465 }, { "epoch": 0.1353930359007598, "grad_norm": 0.6911923317118861, "learning_rate": 1.3496402877697843e-05, "loss": 0.2001, "step": 470 }, { "epoch": 0.13683338734651254, "grad_norm": 0.4888695446827276, "learning_rate": 1.3640287769784173e-05, "loss": 0.2054, "step": 475 }, { "epoch": 0.13827373879226532, "grad_norm": 0.5791087898343821, "learning_rate": 1.3784172661870506e-05, "loss": 0.2096, "step": 480 }, { "epoch": 0.13971409023801806, "grad_norm": 0.5461601934980354, "learning_rate": 1.3928057553956836e-05, "loss": 0.2186, "step": 485 }, { "epoch": 0.14115444168377084, "grad_norm": 0.5278105788434795, "learning_rate": 1.4071942446043167e-05, "loss": 0.2064, "step": 490 }, { "epoch": 0.1425947931295236, "grad_norm": 0.572815440882881, "learning_rate": 1.4215827338129497e-05, "loss": 0.23, "step": 495 }, { "epoch": 0.14403514457527636, "grad_norm": 0.5825481697550493, "learning_rate": 1.4359712230215827e-05, "loss": 0.2113, "step": 500 }, { "epoch": 0.14403514457527636, "eval_loss": 0.21517202258110046, "eval_runtime": 192.1896, "eval_samples_per_second": 9.387, "eval_steps_per_second": 2.347, "step": 500 }, { "epoch": 0.14547549602102913, "grad_norm": 0.5450285278535744, "learning_rate": 1.450359712230216e-05, "loss": 0.2176, "step": 505 }, { "epoch": 0.1469158474667819, "grad_norm": 0.5617983236242802, "learning_rate": 1.464748201438849e-05, "loss": 0.2263, "step": 510 }, { "epoch": 0.14835619891253465, "grad_norm": 0.5107925009512722, "learning_rate": 1.4791366906474822e-05, "loss": 0.2093, "step": 515 }, { "epoch": 0.14979655035828743, "grad_norm": 0.5095844104524674, "learning_rate": 1.4935251798561152e-05, "loss": 0.2076, "step": 520 }, { "epoch": 0.15123690180404017, "grad_norm": 0.4681970484619367, "learning_rate": 1.5079136690647483e-05, "loss": 0.205, "step": 525 }, { "epoch": 0.15267725324979295, "grad_norm": 0.483511636229593, "learning_rate": 1.5223021582733815e-05, "loss": 0.2039, "step": 530 }, { "epoch": 0.15411760469554572, "grad_norm": 0.5952140053203314, "learning_rate": 1.5366906474820144e-05, "loss": 0.2048, "step": 535 }, { "epoch": 0.15555795614129847, "grad_norm": 0.5355425751788454, "learning_rate": 1.5510791366906476e-05, "loss": 0.1994, "step": 540 }, { "epoch": 0.15699830758705124, "grad_norm": 0.5186977018893049, "learning_rate": 1.5654676258992808e-05, "loss": 0.2191, "step": 545 }, { "epoch": 0.158438659032804, "grad_norm": 0.5096765096533113, "learning_rate": 1.5798561151079136e-05, "loss": 0.2037, "step": 550 }, { "epoch": 0.15987901047855677, "grad_norm": 0.49643477985240453, "learning_rate": 1.594244604316547e-05, "loss": 0.2111, "step": 555 }, { "epoch": 0.16131936192430954, "grad_norm": 0.527798431687936, "learning_rate": 1.60863309352518e-05, "loss": 0.194, "step": 560 }, { "epoch": 0.1627597133700623, "grad_norm": 0.4992686472592364, "learning_rate": 1.623021582733813e-05, "loss": 0.1967, "step": 565 }, { "epoch": 0.16420006481581506, "grad_norm": 0.4982521591402399, "learning_rate": 1.6374100719424462e-05, "loss": 0.217, "step": 570 }, { "epoch": 0.16564041626156784, "grad_norm": 0.464665175723593, "learning_rate": 1.651798561151079e-05, "loss": 0.1957, "step": 575 }, { "epoch": 0.16708076770732058, "grad_norm": 0.45188523938453024, "learning_rate": 1.6661870503597125e-05, "loss": 0.2087, "step": 580 }, { "epoch": 0.16852111915307336, "grad_norm": 0.5395237904716252, "learning_rate": 1.6805755395683453e-05, "loss": 0.2159, "step": 585 }, { "epoch": 0.1699614705988261, "grad_norm": 0.49617157293448155, "learning_rate": 1.6949640287769785e-05, "loss": 0.2048, "step": 590 }, { "epoch": 0.17140182204457888, "grad_norm": 0.45357867780884986, "learning_rate": 1.7093525179856116e-05, "loss": 0.2085, "step": 595 }, { "epoch": 0.17284217349033165, "grad_norm": 0.4561385049934112, "learning_rate": 1.7237410071942448e-05, "loss": 0.2071, "step": 600 }, { "epoch": 0.1742825249360844, "grad_norm": 0.4743223833758313, "learning_rate": 1.738129496402878e-05, "loss": 0.2004, "step": 605 }, { "epoch": 0.17572287638183717, "grad_norm": 0.5210203708061392, "learning_rate": 1.7525179856115108e-05, "loss": 0.2119, "step": 610 }, { "epoch": 0.17716322782758992, "grad_norm": 0.5230318528080634, "learning_rate": 1.7669064748201443e-05, "loss": 0.2063, "step": 615 }, { "epoch": 0.1786035792733427, "grad_norm": 0.5438502195561348, "learning_rate": 1.781294964028777e-05, "loss": 0.2111, "step": 620 }, { "epoch": 0.18004393071909547, "grad_norm": 0.5574590918705871, "learning_rate": 1.7956834532374102e-05, "loss": 0.2063, "step": 625 }, { "epoch": 0.18148428216484822, "grad_norm": 0.5531167510606824, "learning_rate": 1.8100719424460434e-05, "loss": 0.2047, "step": 630 }, { "epoch": 0.182924633610601, "grad_norm": 0.4941091683759115, "learning_rate": 1.8244604316546762e-05, "loss": 0.209, "step": 635 }, { "epoch": 0.18436498505635376, "grad_norm": 0.49923381402425615, "learning_rate": 1.8388489208633097e-05, "loss": 0.2008, "step": 640 }, { "epoch": 0.1858053365021065, "grad_norm": 0.5294742227793021, "learning_rate": 1.8532374100719425e-05, "loss": 0.2093, "step": 645 }, { "epoch": 0.18724568794785929, "grad_norm": 0.730229027488376, "learning_rate": 1.8676258992805757e-05, "loss": 0.1919, "step": 650 }, { "epoch": 0.18868603939361203, "grad_norm": 0.5108094713116142, "learning_rate": 1.8820143884892088e-05, "loss": 0.2125, "step": 655 }, { "epoch": 0.1901263908393648, "grad_norm": 0.4842966623253086, "learning_rate": 1.8964028776978416e-05, "loss": 0.2146, "step": 660 }, { "epoch": 0.19156674228511758, "grad_norm": 0.49057011382124954, "learning_rate": 1.910791366906475e-05, "loss": 0.2205, "step": 665 }, { "epoch": 0.19300709373087033, "grad_norm": 0.48392895116401746, "learning_rate": 1.925179856115108e-05, "loss": 0.2173, "step": 670 }, { "epoch": 0.1944474451766231, "grad_norm": 0.4857633076018165, "learning_rate": 1.939568345323741e-05, "loss": 0.1981, "step": 675 }, { "epoch": 0.19588779662237585, "grad_norm": 0.530480064500977, "learning_rate": 1.9539568345323743e-05, "loss": 0.21, "step": 680 }, { "epoch": 0.19732814806812862, "grad_norm": 0.42587073729316316, "learning_rate": 1.9683453237410074e-05, "loss": 0.1915, "step": 685 }, { "epoch": 0.1987684995138814, "grad_norm": 0.4666628821103525, "learning_rate": 1.9827338129496406e-05, "loss": 0.2021, "step": 690 }, { "epoch": 0.20020885095963414, "grad_norm": 0.5396149739812561, "learning_rate": 1.9971223021582734e-05, "loss": 0.2054, "step": 695 }, { "epoch": 0.20164920240538692, "grad_norm": 0.49716967053304606, "learning_rate": 1.99999797676386e-05, "loss": 0.2115, "step": 700 }, { "epoch": 0.20308955385113966, "grad_norm": 0.4390495467259597, "learning_rate": 1.9999897573810713e-05, "loss": 0.2082, "step": 705 }, { "epoch": 0.20452990529689244, "grad_norm": 0.500587930536983, "learning_rate": 1.9999752154513036e-05, "loss": 0.2039, "step": 710 }, { "epoch": 0.2059702567426452, "grad_norm": 0.4561308107687448, "learning_rate": 1.9999543510665e-05, "loss": 0.212, "step": 715 }, { "epoch": 0.20741060818839796, "grad_norm": 0.4425298382894157, "learning_rate": 1.9999271643585775e-05, "loss": 0.2032, "step": 720 }, { "epoch": 0.20885095963415073, "grad_norm": 0.48158901469019905, "learning_rate": 1.9998936554994277e-05, "loss": 0.2033, "step": 725 }, { "epoch": 0.2102913110799035, "grad_norm": 0.45355370323735367, "learning_rate": 1.9998538247009135e-05, "loss": 0.2015, "step": 730 }, { "epoch": 0.21173166252565626, "grad_norm": 0.45663265150871324, "learning_rate": 1.99980767221487e-05, "loss": 0.2067, "step": 735 }, { "epoch": 0.21317201397140903, "grad_norm": 0.5282768185639742, "learning_rate": 1.999755198333101e-05, "loss": 0.1984, "step": 740 }, { "epoch": 0.21461236541716178, "grad_norm": 0.5312593562636294, "learning_rate": 1.999696403387379e-05, "loss": 0.1834, "step": 745 }, { "epoch": 0.21605271686291455, "grad_norm": 0.4522583632651766, "learning_rate": 1.9996312877494413e-05, "loss": 0.2074, "step": 750 }, { "epoch": 0.21749306830866733, "grad_norm": 0.45521876727857974, "learning_rate": 1.9995598518309886e-05, "loss": 0.2118, "step": 755 }, { "epoch": 0.21893341975442007, "grad_norm": 0.4661350669834831, "learning_rate": 1.999482096083683e-05, "loss": 0.2026, "step": 760 }, { "epoch": 0.22037377120017285, "grad_norm": 0.42027669464187223, "learning_rate": 1.9993980209991435e-05, "loss": 0.2053, "step": 765 }, { "epoch": 0.2218141226459256, "grad_norm": 0.4889241347368223, "learning_rate": 1.9993076271089443e-05, "loss": 0.2007, "step": 770 }, { "epoch": 0.22325447409167837, "grad_norm": 0.44479392282113467, "learning_rate": 1.999210914984611e-05, "loss": 0.1886, "step": 775 }, { "epoch": 0.22469482553743114, "grad_norm": 0.5309438649358437, "learning_rate": 1.999107885237617e-05, "loss": 0.1957, "step": 780 }, { "epoch": 0.2261351769831839, "grad_norm": 0.50630514648079, "learning_rate": 1.9989985385193788e-05, "loss": 0.2054, "step": 785 }, { "epoch": 0.22757552842893666, "grad_norm": 0.5119561463816542, "learning_rate": 1.9988828755212533e-05, "loss": 0.2153, "step": 790 }, { "epoch": 0.22901587987468944, "grad_norm": 0.5058068636508265, "learning_rate": 1.9987608969745338e-05, "loss": 0.2021, "step": 795 }, { "epoch": 0.23045623132044218, "grad_norm": 0.5387219159158904, "learning_rate": 1.998632603650442e-05, "loss": 0.2013, "step": 800 }, { "epoch": 0.23189658276619496, "grad_norm": 0.534399378765923, "learning_rate": 1.998497996360127e-05, "loss": 0.2029, "step": 805 }, { "epoch": 0.2333369342119477, "grad_norm": 0.48567637464880914, "learning_rate": 1.998357075954659e-05, "loss": 0.2176, "step": 810 }, { "epoch": 0.23477728565770048, "grad_norm": 0.4150282760082826, "learning_rate": 1.998209843325023e-05, "loss": 0.2032, "step": 815 }, { "epoch": 0.23621763710345325, "grad_norm": 0.41070942542681976, "learning_rate": 1.9980562994021132e-05, "loss": 0.1923, "step": 820 }, { "epoch": 0.237657988549206, "grad_norm": 0.4109088959795977, "learning_rate": 1.9978964451567285e-05, "loss": 0.2108, "step": 825 }, { "epoch": 0.23909833999495878, "grad_norm": 0.4527005874298046, "learning_rate": 1.997730281599565e-05, "loss": 0.2139, "step": 830 }, { "epoch": 0.24053869144071152, "grad_norm": 0.4639405357645518, "learning_rate": 1.9975578097812108e-05, "loss": 0.1936, "step": 835 }, { "epoch": 0.2419790428864643, "grad_norm": 0.4277370943857234, "learning_rate": 1.997379030792138e-05, "loss": 0.1904, "step": 840 }, { "epoch": 0.24341939433221707, "grad_norm": 0.4168082342597945, "learning_rate": 1.9971939457626966e-05, "loss": 0.2007, "step": 845 }, { "epoch": 0.24485974577796982, "grad_norm": 0.39209563109237133, "learning_rate": 1.9970025558631075e-05, "loss": 0.2038, "step": 850 }, { "epoch": 0.2463000972237226, "grad_norm": 0.40504280128109277, "learning_rate": 1.9968048623034546e-05, "loss": 0.1927, "step": 855 }, { "epoch": 0.24774044866947534, "grad_norm": 0.49852597790981823, "learning_rate": 1.996600866333678e-05, "loss": 0.1951, "step": 860 }, { "epoch": 0.2491808001152281, "grad_norm": 0.49051458590903607, "learning_rate": 1.9963905692435642e-05, "loss": 0.1984, "step": 865 }, { "epoch": 0.25062115156098086, "grad_norm": 0.4132497703343479, "learning_rate": 1.9961739723627412e-05, "loss": 0.195, "step": 870 }, { "epoch": 0.25206150300673363, "grad_norm": 0.4435532753877853, "learning_rate": 1.9959510770606657e-05, "loss": 0.1908, "step": 875 }, { "epoch": 0.2535018544524864, "grad_norm": 0.48838619445736153, "learning_rate": 1.9957218847466193e-05, "loss": 0.2173, "step": 880 }, { "epoch": 0.2549422058982392, "grad_norm": 0.41570171710878423, "learning_rate": 1.995486396869695e-05, "loss": 0.2109, "step": 885 }, { "epoch": 0.25638255734399196, "grad_norm": 0.4888694180605261, "learning_rate": 1.995244614918792e-05, "loss": 0.1953, "step": 890 }, { "epoch": 0.2578229087897447, "grad_norm": 0.40020516792336286, "learning_rate": 1.994996540422603e-05, "loss": 0.1909, "step": 895 }, { "epoch": 0.25926326023549745, "grad_norm": 0.3812000551763114, "learning_rate": 1.9947421749496076e-05, "loss": 0.1919, "step": 900 }, { "epoch": 0.2607036116812502, "grad_norm": 0.43266733746493274, "learning_rate": 1.9944815201080594e-05, "loss": 0.193, "step": 905 }, { "epoch": 0.262143963127003, "grad_norm": 0.47929032918009606, "learning_rate": 1.9942145775459774e-05, "loss": 0.1971, "step": 910 }, { "epoch": 0.2635843145727558, "grad_norm": 0.4817650251465911, "learning_rate": 1.9939413489511365e-05, "loss": 0.2002, "step": 915 }, { "epoch": 0.2650246660185085, "grad_norm": 0.46730468361587124, "learning_rate": 1.9936618360510545e-05, "loss": 0.1884, "step": 920 }, { "epoch": 0.26646501746426127, "grad_norm": 0.3967806288180982, "learning_rate": 1.9933760406129834e-05, "loss": 0.1897, "step": 925 }, { "epoch": 0.26790536891001404, "grad_norm": 0.4034947494394968, "learning_rate": 1.9930839644438966e-05, "loss": 0.2021, "step": 930 }, { "epoch": 0.2693457203557668, "grad_norm": 0.43359445557784915, "learning_rate": 1.992785609390478e-05, "loss": 0.1891, "step": 935 }, { "epoch": 0.2707860718015196, "grad_norm": 0.4619161295977017, "learning_rate": 1.992480977339111e-05, "loss": 0.1937, "step": 940 }, { "epoch": 0.2722264232472723, "grad_norm": 0.4703530373966522, "learning_rate": 1.9921700702158657e-05, "loss": 0.1839, "step": 945 }, { "epoch": 0.2736667746930251, "grad_norm": 0.4173755704909504, "learning_rate": 1.9918528899864875e-05, "loss": 0.1811, "step": 950 }, { "epoch": 0.27510712613877786, "grad_norm": 0.39226526189502814, "learning_rate": 1.9915294386563834e-05, "loss": 0.1929, "step": 955 }, { "epoch": 0.27654747758453063, "grad_norm": 0.3798320414179301, "learning_rate": 1.9911997182706108e-05, "loss": 0.2024, "step": 960 }, { "epoch": 0.2779878290302834, "grad_norm": 0.4351779082897012, "learning_rate": 1.9908637309138636e-05, "loss": 0.1884, "step": 965 }, { "epoch": 0.2794281804760361, "grad_norm": 0.46467414117322164, "learning_rate": 1.9905214787104592e-05, "loss": 0.1989, "step": 970 }, { "epoch": 0.2808685319217889, "grad_norm": 0.47966903762788865, "learning_rate": 1.990172963824326e-05, "loss": 0.1984, "step": 975 }, { "epoch": 0.2823088833675417, "grad_norm": 0.40419487334235976, "learning_rate": 1.9898181884589877e-05, "loss": 0.1995, "step": 980 }, { "epoch": 0.28374923481329445, "grad_norm": 0.4140452081019953, "learning_rate": 1.9894571548575516e-05, "loss": 0.2038, "step": 985 }, { "epoch": 0.2851895862590472, "grad_norm": 0.4735712012361699, "learning_rate": 1.9890898653026926e-05, "loss": 0.1991, "step": 990 }, { "epoch": 0.2866299377048, "grad_norm": 0.4817269779622208, "learning_rate": 1.9887163221166405e-05, "loss": 0.2039, "step": 995 }, { "epoch": 0.2880702891505527, "grad_norm": 0.4052308677688318, "learning_rate": 1.9883365276611634e-05, "loss": 0.2023, "step": 1000 }, { "epoch": 0.2880702891505527, "eval_loss": 0.19591009616851807, "eval_runtime": 181.0536, "eval_samples_per_second": 9.964, "eval_steps_per_second": 2.491, "step": 1000 }, { "epoch": 0.2895106405963055, "grad_norm": 0.41953136423806925, "learning_rate": 1.987950484337554e-05, "loss": 0.1839, "step": 1005 }, { "epoch": 0.29095099204205827, "grad_norm": 0.46884000577230694, "learning_rate": 1.987558194586615e-05, "loss": 0.1827, "step": 1010 }, { "epoch": 0.29239134348781104, "grad_norm": 0.5492195075262948, "learning_rate": 1.9871596608886416e-05, "loss": 0.2055, "step": 1015 }, { "epoch": 0.2938316949335638, "grad_norm": 0.4125424201488764, "learning_rate": 1.9867548857634077e-05, "loss": 0.1991, "step": 1020 }, { "epoch": 0.29527204637931653, "grad_norm": 0.3811786481459084, "learning_rate": 1.9863438717701497e-05, "loss": 0.194, "step": 1025 }, { "epoch": 0.2967123978250693, "grad_norm": 0.4304143670583801, "learning_rate": 1.985926621507549e-05, "loss": 0.2051, "step": 1030 }, { "epoch": 0.2981527492708221, "grad_norm": 0.4060031359968526, "learning_rate": 1.9855031376137174e-05, "loss": 0.1923, "step": 1035 }, { "epoch": 0.29959310071657486, "grad_norm": 0.43396401122257877, "learning_rate": 1.985073422766179e-05, "loss": 0.2047, "step": 1040 }, { "epoch": 0.30103345216232763, "grad_norm": 0.43955835043716046, "learning_rate": 1.9846374796818536e-05, "loss": 0.1966, "step": 1045 }, { "epoch": 0.30247380360808035, "grad_norm": 0.38189862082042997, "learning_rate": 1.9841953111170407e-05, "loss": 0.1753, "step": 1050 }, { "epoch": 0.3039141550538331, "grad_norm": 0.42163518938375355, "learning_rate": 1.9837469198673996e-05, "loss": 0.2136, "step": 1055 }, { "epoch": 0.3053545064995859, "grad_norm": 0.41241261767807075, "learning_rate": 1.9832923087679352e-05, "loss": 0.1977, "step": 1060 }, { "epoch": 0.3067948579453387, "grad_norm": 0.41901389346526713, "learning_rate": 1.9828314806929762e-05, "loss": 0.1887, "step": 1065 }, { "epoch": 0.30823520939109145, "grad_norm": 0.4500388667621992, "learning_rate": 1.9823644385561596e-05, "loss": 0.2098, "step": 1070 }, { "epoch": 0.30967556083684417, "grad_norm": 0.4039289925589606, "learning_rate": 1.9818911853104118e-05, "loss": 0.1939, "step": 1075 }, { "epoch": 0.31111591228259694, "grad_norm": 0.46025963744196113, "learning_rate": 1.981411723947929e-05, "loss": 0.1923, "step": 1080 }, { "epoch": 0.3125562637283497, "grad_norm": 0.4080631379080584, "learning_rate": 1.9809260575001595e-05, "loss": 0.1859, "step": 1085 }, { "epoch": 0.3139966151741025, "grad_norm": 0.5000635476212351, "learning_rate": 1.980434189037784e-05, "loss": 0.176, "step": 1090 }, { "epoch": 0.31543696661985526, "grad_norm": 0.4846532765583438, "learning_rate": 1.9799361216706948e-05, "loss": 0.1889, "step": 1095 }, { "epoch": 0.316877318065608, "grad_norm": 0.42730669186483955, "learning_rate": 1.9794318585479795e-05, "loss": 0.1962, "step": 1100 }, { "epoch": 0.31831766951136076, "grad_norm": 0.4400767346219366, "learning_rate": 1.9789214028578978e-05, "loss": 0.1979, "step": 1105 }, { "epoch": 0.31975802095711353, "grad_norm": 0.3670443287370546, "learning_rate": 1.9784047578278623e-05, "loss": 0.1995, "step": 1110 }, { "epoch": 0.3211983724028663, "grad_norm": 0.4279589568148661, "learning_rate": 1.9778819267244197e-05, "loss": 0.1829, "step": 1115 }, { "epoch": 0.3226387238486191, "grad_norm": 0.3918422995120454, "learning_rate": 1.9773529128532275e-05, "loss": 0.1892, "step": 1120 }, { "epoch": 0.32407907529437185, "grad_norm": 0.449542089139243, "learning_rate": 1.9768177195590352e-05, "loss": 0.184, "step": 1125 }, { "epoch": 0.3255194267401246, "grad_norm": 0.4363100481377313, "learning_rate": 1.9762763502256625e-05, "loss": 0.1946, "step": 1130 }, { "epoch": 0.32695977818587735, "grad_norm": 0.38895014838177233, "learning_rate": 1.9757288082759766e-05, "loss": 0.189, "step": 1135 }, { "epoch": 0.3284001296316301, "grad_norm": 0.39937918835152797, "learning_rate": 1.9751750971718734e-05, "loss": 0.1965, "step": 1140 }, { "epoch": 0.3298404810773829, "grad_norm": 0.37444043134636334, "learning_rate": 1.9746152204142536e-05, "loss": 0.1842, "step": 1145 }, { "epoch": 0.33128083252313567, "grad_norm": 0.407018722514467, "learning_rate": 1.9740491815429996e-05, "loss": 0.1771, "step": 1150 }, { "epoch": 0.3327211839688884, "grad_norm": 0.38945712491554313, "learning_rate": 1.973476984136956e-05, "loss": 0.1808, "step": 1155 }, { "epoch": 0.33416153541464116, "grad_norm": 0.38168932221231533, "learning_rate": 1.9728986318139048e-05, "loss": 0.1927, "step": 1160 }, { "epoch": 0.33560188686039394, "grad_norm": 0.40270685047202076, "learning_rate": 1.9723141282305432e-05, "loss": 0.1914, "step": 1165 }, { "epoch": 0.3370422383061467, "grad_norm": 0.38062787430312417, "learning_rate": 1.9717234770824598e-05, "loss": 0.2004, "step": 1170 }, { "epoch": 0.3384825897518995, "grad_norm": 0.37505399884013846, "learning_rate": 1.9711266821041134e-05, "loss": 0.1869, "step": 1175 }, { "epoch": 0.3399229411976522, "grad_norm": 0.38856749140181485, "learning_rate": 1.9705237470688064e-05, "loss": 0.1723, "step": 1180 }, { "epoch": 0.341363292643405, "grad_norm": 0.48828707239384744, "learning_rate": 1.969914675788663e-05, "loss": 0.1878, "step": 1185 }, { "epoch": 0.34280364408915776, "grad_norm": 0.3994017025051837, "learning_rate": 1.969299472114605e-05, "loss": 0.1972, "step": 1190 }, { "epoch": 0.34424399553491053, "grad_norm": 0.39594128020669767, "learning_rate": 1.9686781399363252e-05, "loss": 0.1771, "step": 1195 }, { "epoch": 0.3456843469806633, "grad_norm": 0.37998283976311237, "learning_rate": 1.9680506831822667e-05, "loss": 0.1957, "step": 1200 }, { "epoch": 0.347124698426416, "grad_norm": 0.43308160608261626, "learning_rate": 1.9674171058195947e-05, "loss": 0.1842, "step": 1205 }, { "epoch": 0.3485650498721688, "grad_norm": 0.5263932390604856, "learning_rate": 1.9667774118541726e-05, "loss": 0.1806, "step": 1210 }, { "epoch": 0.35000540131792157, "grad_norm": 0.46521786049734537, "learning_rate": 1.9661316053305374e-05, "loss": 0.1937, "step": 1215 }, { "epoch": 0.35144575276367435, "grad_norm": 0.3676290666077626, "learning_rate": 1.9654796903318726e-05, "loss": 0.1861, "step": 1220 }, { "epoch": 0.3528861042094271, "grad_norm": 0.397145673105087, "learning_rate": 1.9648216709799837e-05, "loss": 0.1914, "step": 1225 }, { "epoch": 0.35432645565517984, "grad_norm": 0.4689947572382222, "learning_rate": 1.9641575514352717e-05, "loss": 0.1974, "step": 1230 }, { "epoch": 0.3557668071009326, "grad_norm": 0.3911963183342496, "learning_rate": 1.9634873358967068e-05, "loss": 0.1799, "step": 1235 }, { "epoch": 0.3572071585466854, "grad_norm": 0.42683909988407304, "learning_rate": 1.9628110286018015e-05, "loss": 0.1914, "step": 1240 }, { "epoch": 0.35864750999243816, "grad_norm": 0.4135664643510884, "learning_rate": 1.9621286338265836e-05, "loss": 0.1881, "step": 1245 }, { "epoch": 0.36008786143819094, "grad_norm": 0.39390745705939456, "learning_rate": 1.9614401558855712e-05, "loss": 0.1804, "step": 1250 }, { "epoch": 0.36152821288394366, "grad_norm": 0.3450938944444097, "learning_rate": 1.9607455991317432e-05, "loss": 0.1659, "step": 1255 }, { "epoch": 0.36296856432969643, "grad_norm": 0.3860649499749823, "learning_rate": 1.9600449679565115e-05, "loss": 0.2026, "step": 1260 }, { "epoch": 0.3644089157754492, "grad_norm": 0.3705839431178819, "learning_rate": 1.9593382667896953e-05, "loss": 0.1853, "step": 1265 }, { "epoch": 0.365849267221202, "grad_norm": 0.3863708731489008, "learning_rate": 1.9586255000994914e-05, "loss": 0.1841, "step": 1270 }, { "epoch": 0.36728961866695475, "grad_norm": 0.36464576680841765, "learning_rate": 1.957906672392447e-05, "loss": 0.1924, "step": 1275 }, { "epoch": 0.36872997011270753, "grad_norm": 0.37171858096860977, "learning_rate": 1.9571817882134316e-05, "loss": 0.1917, "step": 1280 }, { "epoch": 0.37017032155846025, "grad_norm": 0.3686315996537359, "learning_rate": 1.9564508521456048e-05, "loss": 0.1908, "step": 1285 }, { "epoch": 0.371610673004213, "grad_norm": 0.4189046777798967, "learning_rate": 1.9557138688103925e-05, "loss": 0.1942, "step": 1290 }, { "epoch": 0.3730510244499658, "grad_norm": 0.37142444736388497, "learning_rate": 1.9549708428674537e-05, "loss": 0.1899, "step": 1295 }, { "epoch": 0.37449137589571857, "grad_norm": 0.3986545412554622, "learning_rate": 1.9542217790146537e-05, "loss": 0.2, "step": 1300 }, { "epoch": 0.37593172734147134, "grad_norm": 0.42032853779043916, "learning_rate": 1.953466681988032e-05, "loss": 0.1964, "step": 1305 }, { "epoch": 0.37737207878722406, "grad_norm": 0.36898012006756564, "learning_rate": 1.9527055565617735e-05, "loss": 0.1849, "step": 1310 }, { "epoch": 0.37881243023297684, "grad_norm": 0.37522686829642604, "learning_rate": 1.9519384075481794e-05, "loss": 0.1804, "step": 1315 }, { "epoch": 0.3802527816787296, "grad_norm": 0.39969390457539616, "learning_rate": 1.9511652397976347e-05, "loss": 0.1951, "step": 1320 }, { "epoch": 0.3816931331244824, "grad_norm": 0.38791503198824945, "learning_rate": 1.950386058198579e-05, "loss": 0.1829, "step": 1325 }, { "epoch": 0.38313348457023516, "grad_norm": 0.44412581422573094, "learning_rate": 1.949600867677475e-05, "loss": 0.2033, "step": 1330 }, { "epoch": 0.3845738360159879, "grad_norm": 0.4122212943592675, "learning_rate": 1.9488096731987773e-05, "loss": 0.1786, "step": 1335 }, { "epoch": 0.38601418746174065, "grad_norm": 0.425017653459728, "learning_rate": 1.948012479764902e-05, "loss": 0.1916, "step": 1340 }, { "epoch": 0.38745453890749343, "grad_norm": 0.4337062414117548, "learning_rate": 1.9472092924161932e-05, "loss": 0.1823, "step": 1345 }, { "epoch": 0.3888948903532462, "grad_norm": 0.3805737492676703, "learning_rate": 1.9464001162308926e-05, "loss": 0.1789, "step": 1350 }, { "epoch": 0.390335241798999, "grad_norm": 0.36167363167725614, "learning_rate": 1.945584956325107e-05, "loss": 0.1898, "step": 1355 }, { "epoch": 0.3917755932447517, "grad_norm": 0.40446782411211757, "learning_rate": 1.9447638178527766e-05, "loss": 0.1874, "step": 1360 }, { "epoch": 0.39321594469050447, "grad_norm": 0.43931132257252586, "learning_rate": 1.9439367060056403e-05, "loss": 0.1844, "step": 1365 }, { "epoch": 0.39465629613625725, "grad_norm": 0.360584577289545, "learning_rate": 1.943103626013206e-05, "loss": 0.1784, "step": 1370 }, { "epoch": 0.39609664758201, "grad_norm": 0.3838200412167434, "learning_rate": 1.9422645831427144e-05, "loss": 0.1977, "step": 1375 }, { "epoch": 0.3975369990277628, "grad_norm": 0.41400589883857647, "learning_rate": 1.941419582699108e-05, "loss": 0.1786, "step": 1380 }, { "epoch": 0.3989773504735155, "grad_norm": 0.44021507039755364, "learning_rate": 1.940568630024997e-05, "loss": 0.1864, "step": 1385 }, { "epoch": 0.4004177019192683, "grad_norm": 0.4076609951453182, "learning_rate": 1.9397117305006238e-05, "loss": 0.1924, "step": 1390 }, { "epoch": 0.40185805336502106, "grad_norm": 0.3448331981439342, "learning_rate": 1.9388488895438322e-05, "loss": 0.1727, "step": 1395 }, { "epoch": 0.40329840481077384, "grad_norm": 0.37971856194171055, "learning_rate": 1.9379801126100305e-05, "loss": 0.1908, "step": 1400 }, { "epoch": 0.4047387562565266, "grad_norm": 0.4063644753493077, "learning_rate": 1.937105405192157e-05, "loss": 0.1959, "step": 1405 }, { "epoch": 0.40617910770227933, "grad_norm": 0.4240507739263381, "learning_rate": 1.9362247728206484e-05, "loss": 0.1847, "step": 1410 }, { "epoch": 0.4076194591480321, "grad_norm": 0.35602981769186853, "learning_rate": 1.9353382210634005e-05, "loss": 0.1765, "step": 1415 }, { "epoch": 0.4090598105937849, "grad_norm": 0.3890399540852024, "learning_rate": 1.934445755525736e-05, "loss": 0.1833, "step": 1420 }, { "epoch": 0.41050016203953765, "grad_norm": 0.3680499036431358, "learning_rate": 1.9335473818503683e-05, "loss": 0.1845, "step": 1425 }, { "epoch": 0.4119405134852904, "grad_norm": 0.41759109180555776, "learning_rate": 1.932643105717365e-05, "loss": 0.1974, "step": 1430 }, { "epoch": 0.4133808649310432, "grad_norm": 0.37662092517599377, "learning_rate": 1.9317329328441126e-05, "loss": 0.1915, "step": 1435 }, { "epoch": 0.4148212163767959, "grad_norm": 0.3544406104660179, "learning_rate": 1.9308168689852816e-05, "loss": 0.1824, "step": 1440 }, { "epoch": 0.4162615678225487, "grad_norm": 0.40679163344205593, "learning_rate": 1.929894919932788e-05, "loss": 0.1936, "step": 1445 }, { "epoch": 0.41770191926830147, "grad_norm": 0.37090593229503027, "learning_rate": 1.928967091515757e-05, "loss": 0.1863, "step": 1450 }, { "epoch": 0.41914227071405424, "grad_norm": 0.3358948102639496, "learning_rate": 1.928033389600488e-05, "loss": 0.1791, "step": 1455 }, { "epoch": 0.420582622159807, "grad_norm": 0.39320319005496884, "learning_rate": 1.927093820090416e-05, "loss": 0.1967, "step": 1460 }, { "epoch": 0.42202297360555974, "grad_norm": 0.39872720769164977, "learning_rate": 1.9261483889260733e-05, "loss": 0.1804, "step": 1465 }, { "epoch": 0.4234633250513125, "grad_norm": 0.37282558197697574, "learning_rate": 1.9251971020850545e-05, "loss": 0.1799, "step": 1470 }, { "epoch": 0.4249036764970653, "grad_norm": 0.37924447925156907, "learning_rate": 1.9242399655819777e-05, "loss": 0.178, "step": 1475 }, { "epoch": 0.42634402794281806, "grad_norm": 0.41752825519108605, "learning_rate": 1.923276985468444e-05, "loss": 0.1863, "step": 1480 }, { "epoch": 0.42778437938857083, "grad_norm": 0.5167921346970157, "learning_rate": 1.922308167833004e-05, "loss": 0.187, "step": 1485 }, { "epoch": 0.42922473083432355, "grad_norm": 0.40873239691837054, "learning_rate": 1.921333518801115e-05, "loss": 0.1936, "step": 1490 }, { "epoch": 0.43066508228007633, "grad_norm": 0.40907971709968743, "learning_rate": 1.9203530445351037e-05, "loss": 0.1782, "step": 1495 }, { "epoch": 0.4321054337258291, "grad_norm": 0.38770424167315554, "learning_rate": 1.9193667512341294e-05, "loss": 0.1868, "step": 1500 }, { "epoch": 0.4321054337258291, "eval_loss": 0.18722842633724213, "eval_runtime": 179.8721, "eval_samples_per_second": 10.029, "eval_steps_per_second": 2.507, "step": 1500 }, { "epoch": 0.4335457851715819, "grad_norm": 0.4127505427391907, "learning_rate": 1.918374645134141e-05, "loss": 0.1891, "step": 1505 }, { "epoch": 0.43498613661733465, "grad_norm": 0.4075820439736581, "learning_rate": 1.9173767325078403e-05, "loss": 0.1836, "step": 1510 }, { "epoch": 0.43642648806308737, "grad_norm": 0.41104823750430075, "learning_rate": 1.9163730196646416e-05, "loss": 0.1873, "step": 1515 }, { "epoch": 0.43786683950884014, "grad_norm": 0.3793283659275875, "learning_rate": 1.915363512950631e-05, "loss": 0.1766, "step": 1520 }, { "epoch": 0.4393071909545929, "grad_norm": 0.39712080280256257, "learning_rate": 1.9143482187485283e-05, "loss": 0.1903, "step": 1525 }, { "epoch": 0.4407475424003457, "grad_norm": 0.38704548175577747, "learning_rate": 1.9133271434776438e-05, "loss": 0.1836, "step": 1530 }, { "epoch": 0.44218789384609847, "grad_norm": 0.32778859492202395, "learning_rate": 1.9123002935938405e-05, "loss": 0.1796, "step": 1535 }, { "epoch": 0.4436282452918512, "grad_norm": 0.41828874008537575, "learning_rate": 1.911267675589491e-05, "loss": 0.1875, "step": 1540 }, { "epoch": 0.44506859673760396, "grad_norm": 0.3559379941636415, "learning_rate": 1.9102292959934385e-05, "loss": 0.1794, "step": 1545 }, { "epoch": 0.44650894818335674, "grad_norm": 0.4087026290282012, "learning_rate": 1.9091851613709538e-05, "loss": 0.1847, "step": 1550 }, { "epoch": 0.4479492996291095, "grad_norm": 0.40953363707420476, "learning_rate": 1.9081352783236945e-05, "loss": 0.1933, "step": 1555 }, { "epoch": 0.4493896510748623, "grad_norm": 0.36898746212883354, "learning_rate": 1.9070796534896644e-05, "loss": 0.1768, "step": 1560 }, { "epoch": 0.450830002520615, "grad_norm": 0.41334025634238375, "learning_rate": 1.9060182935431682e-05, "loss": 0.1829, "step": 1565 }, { "epoch": 0.4522703539663678, "grad_norm": 0.44332370723191467, "learning_rate": 1.9049512051947735e-05, "loss": 0.1901, "step": 1570 }, { "epoch": 0.45371070541212055, "grad_norm": 0.4004111280630263, "learning_rate": 1.9038783951912653e-05, "loss": 0.1889, "step": 1575 }, { "epoch": 0.4551510568578733, "grad_norm": 0.371651865125121, "learning_rate": 1.9027998703156055e-05, "loss": 0.1812, "step": 1580 }, { "epoch": 0.4565914083036261, "grad_norm": 0.40485070841566645, "learning_rate": 1.901715637386887e-05, "loss": 0.1776, "step": 1585 }, { "epoch": 0.4580317597493789, "grad_norm": 0.48633370085572336, "learning_rate": 1.9006257032602942e-05, "loss": 0.1789, "step": 1590 }, { "epoch": 0.4594721111951316, "grad_norm": 0.3966239241276605, "learning_rate": 1.8995300748270577e-05, "loss": 0.1729, "step": 1595 }, { "epoch": 0.46091246264088437, "grad_norm": 0.3327027752839659, "learning_rate": 1.8984287590144102e-05, "loss": 0.1734, "step": 1600 }, { "epoch": 0.46235281408663714, "grad_norm": 0.4317078665592322, "learning_rate": 1.897321762785544e-05, "loss": 0.1783, "step": 1605 }, { "epoch": 0.4637931655323899, "grad_norm": 0.4235966917899685, "learning_rate": 1.896209093139567e-05, "loss": 0.1767, "step": 1610 }, { "epoch": 0.4652335169781427, "grad_norm": 0.36113982142860973, "learning_rate": 1.8950907571114568e-05, "loss": 0.1794, "step": 1615 }, { "epoch": 0.4666738684238954, "grad_norm": 0.3810588912729469, "learning_rate": 1.893966761772018e-05, "loss": 0.1836, "step": 1620 }, { "epoch": 0.4681142198696482, "grad_norm": 0.4196558725198718, "learning_rate": 1.8928371142278368e-05, "loss": 0.1866, "step": 1625 }, { "epoch": 0.46955457131540096, "grad_norm": 0.43666303493670905, "learning_rate": 1.891701821621236e-05, "loss": 0.1901, "step": 1630 }, { "epoch": 0.47099492276115373, "grad_norm": 0.3803714847945443, "learning_rate": 1.8905608911302303e-05, "loss": 0.1862, "step": 1635 }, { "epoch": 0.4724352742069065, "grad_norm": 0.3757549530860626, "learning_rate": 1.8894143299684797e-05, "loss": 0.1961, "step": 1640 }, { "epoch": 0.4738756256526592, "grad_norm": 0.4066659146296864, "learning_rate": 1.8882621453852456e-05, "loss": 0.1699, "step": 1645 }, { "epoch": 0.475315977098412, "grad_norm": 0.3712534952216303, "learning_rate": 1.8871043446653436e-05, "loss": 0.1931, "step": 1650 }, { "epoch": 0.4767563285441648, "grad_norm": 0.4190807324107653, "learning_rate": 1.885940935129098e-05, "loss": 0.1858, "step": 1655 }, { "epoch": 0.47819667998991755, "grad_norm": 0.36972687977137836, "learning_rate": 1.884771924132296e-05, "loss": 0.1857, "step": 1660 }, { "epoch": 0.4796370314356703, "grad_norm": 0.3607936830171753, "learning_rate": 1.8835973190661397e-05, "loss": 0.1825, "step": 1665 }, { "epoch": 0.48107738288142304, "grad_norm": 0.39734635206783725, "learning_rate": 1.8824171273572017e-05, "loss": 0.1846, "step": 1670 }, { "epoch": 0.4825177343271758, "grad_norm": 0.38332143199966057, "learning_rate": 1.881231356467375e-05, "loss": 0.1916, "step": 1675 }, { "epoch": 0.4839580857729286, "grad_norm": 0.35427455618937453, "learning_rate": 1.8800400138938293e-05, "loss": 0.1763, "step": 1680 }, { "epoch": 0.48539843721868137, "grad_norm": 0.3865305729090607, "learning_rate": 1.8788431071689605e-05, "loss": 0.1771, "step": 1685 }, { "epoch": 0.48683878866443414, "grad_norm": 0.34295884844513114, "learning_rate": 1.8776406438603457e-05, "loss": 0.1691, "step": 1690 }, { "epoch": 0.48827914011018686, "grad_norm": 0.3776721425978484, "learning_rate": 1.876432631570693e-05, "loss": 0.1788, "step": 1695 }, { "epoch": 0.48971949155593963, "grad_norm": 0.3861886847035672, "learning_rate": 1.8752190779377958e-05, "loss": 0.1882, "step": 1700 }, { "epoch": 0.4911598430016924, "grad_norm": 0.3634727635182055, "learning_rate": 1.8739999906344817e-05, "loss": 0.1836, "step": 1705 }, { "epoch": 0.4926001944474452, "grad_norm": 0.43152804810108353, "learning_rate": 1.872775377368567e-05, "loss": 0.1935, "step": 1710 }, { "epoch": 0.49404054589319796, "grad_norm": 0.3655986706714032, "learning_rate": 1.8715452458828057e-05, "loss": 0.1813, "step": 1715 }, { "epoch": 0.4954808973389507, "grad_norm": 0.35970766776797, "learning_rate": 1.8703096039548415e-05, "loss": 0.1931, "step": 1720 }, { "epoch": 0.49692124878470345, "grad_norm": 0.3935269406537452, "learning_rate": 1.869068459397159e-05, "loss": 0.175, "step": 1725 }, { "epoch": 0.4983616002304562, "grad_norm": 0.3695776720546673, "learning_rate": 1.8678218200570327e-05, "loss": 0.1821, "step": 1730 }, { "epoch": 0.499801951676209, "grad_norm": 0.3691159723569882, "learning_rate": 1.866569693816479e-05, "loss": 0.1884, "step": 1735 }, { "epoch": 0.5012423031219617, "grad_norm": 0.38398657899121097, "learning_rate": 1.865312088592207e-05, "loss": 0.1818, "step": 1740 }, { "epoch": 0.5026826545677145, "grad_norm": 0.4032929747939036, "learning_rate": 1.8640490123355656e-05, "loss": 0.1793, "step": 1745 }, { "epoch": 0.5041230060134673, "grad_norm": 0.38471076664611525, "learning_rate": 1.8627804730324955e-05, "loss": 0.1848, "step": 1750 }, { "epoch": 0.5055633574592201, "grad_norm": 0.3512566796696949, "learning_rate": 1.8615064787034784e-05, "loss": 0.1933, "step": 1755 }, { "epoch": 0.5070037089049728, "grad_norm": 0.3736665521850479, "learning_rate": 1.8602270374034853e-05, "loss": 0.183, "step": 1760 }, { "epoch": 0.5084440603507255, "grad_norm": 0.3796593083345244, "learning_rate": 1.8589421572219277e-05, "loss": 0.1743, "step": 1765 }, { "epoch": 0.5098844117964784, "grad_norm": 0.3688833908332777, "learning_rate": 1.8576518462826033e-05, "loss": 0.1784, "step": 1770 }, { "epoch": 0.5113247632422311, "grad_norm": 0.3549233461196651, "learning_rate": 1.8563561127436472e-05, "loss": 0.1826, "step": 1775 }, { "epoch": 0.5127651146879839, "grad_norm": 0.36567662921445526, "learning_rate": 1.8550549647974803e-05, "loss": 0.1782, "step": 1780 }, { "epoch": 0.5142054661337366, "grad_norm": 0.3939614446337141, "learning_rate": 1.8537484106707553e-05, "loss": 0.177, "step": 1785 }, { "epoch": 0.5156458175794894, "grad_norm": 0.36187155938329685, "learning_rate": 1.8524364586243063e-05, "loss": 0.1721, "step": 1790 }, { "epoch": 0.5170861690252422, "grad_norm": 0.346859682925655, "learning_rate": 1.8511191169530977e-05, "loss": 0.1756, "step": 1795 }, { "epoch": 0.5185265204709949, "grad_norm": 0.314365239889427, "learning_rate": 1.8497963939861684e-05, "loss": 0.1662, "step": 1800 }, { "epoch": 0.5199668719167477, "grad_norm": 0.35335427903783484, "learning_rate": 1.8484682980865827e-05, "loss": 0.1736, "step": 1805 }, { "epoch": 0.5214072233625004, "grad_norm": 0.3219984035719524, "learning_rate": 1.8471348376513753e-05, "loss": 0.1696, "step": 1810 }, { "epoch": 0.5228475748082532, "grad_norm": 0.33373306734235164, "learning_rate": 1.845796021111499e-05, "loss": 0.1884, "step": 1815 }, { "epoch": 0.524287926254006, "grad_norm": 0.35546415706338036, "learning_rate": 1.8444518569317704e-05, "loss": 0.1695, "step": 1820 }, { "epoch": 0.5257282776997587, "grad_norm": 0.37012004054869435, "learning_rate": 1.8431023536108175e-05, "loss": 0.1697, "step": 1825 }, { "epoch": 0.5271686291455115, "grad_norm": 0.37288255802693826, "learning_rate": 1.841747519681027e-05, "loss": 0.1737, "step": 1830 }, { "epoch": 0.5286089805912643, "grad_norm": 0.37080812218300224, "learning_rate": 1.8403873637084872e-05, "loss": 0.1796, "step": 1835 }, { "epoch": 0.530049332037017, "grad_norm": 0.3478775229906149, "learning_rate": 1.839021894292936e-05, "loss": 0.176, "step": 1840 }, { "epoch": 0.5314896834827698, "grad_norm": 0.372540490899064, "learning_rate": 1.8376511200677067e-05, "loss": 0.2007, "step": 1845 }, { "epoch": 0.5329300349285225, "grad_norm": 0.36586791336161834, "learning_rate": 1.836275049699672e-05, "loss": 0.1751, "step": 1850 }, { "epoch": 0.5343703863742754, "grad_norm": 0.3448429738318597, "learning_rate": 1.834893691889191e-05, "loss": 0.1805, "step": 1855 }, { "epoch": 0.5358107378200281, "grad_norm": 0.3311960167279875, "learning_rate": 1.8335070553700533e-05, "loss": 0.1681, "step": 1860 }, { "epoch": 0.5372510892657808, "grad_norm": 0.35282815078601876, "learning_rate": 1.832115148909422e-05, "loss": 0.173, "step": 1865 }, { "epoch": 0.5386914407115336, "grad_norm": 0.3063521260958916, "learning_rate": 1.830717981307782e-05, "loss": 0.1663, "step": 1870 }, { "epoch": 0.5401317921572864, "grad_norm": 0.3463171525987883, "learning_rate": 1.8293155613988816e-05, "loss": 0.1893, "step": 1875 }, { "epoch": 0.5415721436030392, "grad_norm": 0.3279453628244872, "learning_rate": 1.827907898049677e-05, "loss": 0.1733, "step": 1880 }, { "epoch": 0.5430124950487919, "grad_norm": 0.3520187313077995, "learning_rate": 1.8264950001602778e-05, "loss": 0.1825, "step": 1885 }, { "epoch": 0.5444528464945446, "grad_norm": 0.3565061630561884, "learning_rate": 1.825076876663888e-05, "loss": 0.1848, "step": 1890 }, { "epoch": 0.5458931979402974, "grad_norm": 0.378084862994715, "learning_rate": 1.823653536526752e-05, "loss": 0.1808, "step": 1895 }, { "epoch": 0.5473335493860502, "grad_norm": 0.4212362648494665, "learning_rate": 1.8222249887480966e-05, "loss": 0.1874, "step": 1900 }, { "epoch": 0.548773900831803, "grad_norm": 0.37361394437125983, "learning_rate": 1.8207912423600755e-05, "loss": 0.1807, "step": 1905 }, { "epoch": 0.5502142522775557, "grad_norm": 0.4422231855974053, "learning_rate": 1.8193523064277103e-05, "loss": 0.1774, "step": 1910 }, { "epoch": 0.5516546037233084, "grad_norm": 0.37677879394634756, "learning_rate": 1.8179081900488337e-05, "loss": 0.1868, "step": 1915 }, { "epoch": 0.5530949551690613, "grad_norm": 0.32468876912478695, "learning_rate": 1.8164589023540332e-05, "loss": 0.1739, "step": 1920 }, { "epoch": 0.554535306614814, "grad_norm": 0.3939834452367248, "learning_rate": 1.815004452506592e-05, "loss": 0.1807, "step": 1925 }, { "epoch": 0.5559756580605668, "grad_norm": 0.36557739763760627, "learning_rate": 1.813544849702432e-05, "loss": 0.1637, "step": 1930 }, { "epoch": 0.5574160095063195, "grad_norm": 0.3818470475052624, "learning_rate": 1.812080103170055e-05, "loss": 0.1783, "step": 1935 }, { "epoch": 0.5588563609520723, "grad_norm": 0.3951616694877707, "learning_rate": 1.8106102221704848e-05, "loss": 0.1843, "step": 1940 }, { "epoch": 0.5602967123978251, "grad_norm": 0.36826181964798843, "learning_rate": 1.809135215997208e-05, "loss": 0.1829, "step": 1945 }, { "epoch": 0.5617370638435778, "grad_norm": 0.39404420593554734, "learning_rate": 1.8076550939761156e-05, "loss": 0.1763, "step": 1950 }, { "epoch": 0.5631774152893306, "grad_norm": 0.4062269110223546, "learning_rate": 1.806169865465445e-05, "loss": 0.1855, "step": 1955 }, { "epoch": 0.5646177667350833, "grad_norm": 0.4304330078698439, "learning_rate": 1.8046795398557192e-05, "loss": 0.1897, "step": 1960 }, { "epoch": 0.5660581181808362, "grad_norm": 0.37064207532734933, "learning_rate": 1.8031841265696886e-05, "loss": 0.1912, "step": 1965 }, { "epoch": 0.5674984696265889, "grad_norm": 0.40700194898232867, "learning_rate": 1.8016836350622707e-05, "loss": 0.1891, "step": 1970 }, { "epoch": 0.5689388210723416, "grad_norm": 0.3565593238088361, "learning_rate": 1.8001780748204907e-05, "loss": 0.1794, "step": 1975 }, { "epoch": 0.5703791725180944, "grad_norm": 0.3436335567049151, "learning_rate": 1.7986674553634213e-05, "loss": 0.1726, "step": 1980 }, { "epoch": 0.5718195239638472, "grad_norm": 0.3546880736906482, "learning_rate": 1.7971517862421227e-05, "loss": 0.1723, "step": 1985 }, { "epoch": 0.5732598754096, "grad_norm": 0.37327906152208173, "learning_rate": 1.795631077039583e-05, "loss": 0.1902, "step": 1990 }, { "epoch": 0.5747002268553527, "grad_norm": 0.4331333954155759, "learning_rate": 1.794105337370655e-05, "loss": 0.1779, "step": 1995 }, { "epoch": 0.5761405783011054, "grad_norm": 0.3401961795693695, "learning_rate": 1.7925745768819995e-05, "loss": 0.1766, "step": 2000 }, { "epoch": 0.5761405783011054, "eval_loss": 0.1823691725730896, "eval_runtime": 179.75, "eval_samples_per_second": 10.036, "eval_steps_per_second": 2.509, "step": 2000 }, { "epoch": 0.5775809297468583, "grad_norm": 0.34051110754364844, "learning_rate": 1.7910388052520198e-05, "loss": 0.1699, "step": 2005 }, { "epoch": 0.579021281192611, "grad_norm": 0.3904395823768888, "learning_rate": 1.7894980321908037e-05, "loss": 0.1887, "step": 2010 }, { "epoch": 0.5804616326383638, "grad_norm": 0.35479628226475546, "learning_rate": 1.7879522674400616e-05, "loss": 0.1729, "step": 2015 }, { "epoch": 0.5819019840841165, "grad_norm": 0.36745109167309065, "learning_rate": 1.786401520773063e-05, "loss": 0.1823, "step": 2020 }, { "epoch": 0.5833423355298692, "grad_norm": 0.34440430468545136, "learning_rate": 1.7848458019945778e-05, "loss": 0.1806, "step": 2025 }, { "epoch": 0.5847826869756221, "grad_norm": 0.3436234879041042, "learning_rate": 1.7832851209408116e-05, "loss": 0.1711, "step": 2030 }, { "epoch": 0.5862230384213748, "grad_norm": 0.44158762702669296, "learning_rate": 1.7817194874793446e-05, "loss": 0.1983, "step": 2035 }, { "epoch": 0.5876633898671276, "grad_norm": 0.3547851651042487, "learning_rate": 1.780148911509069e-05, "loss": 0.1761, "step": 2040 }, { "epoch": 0.5891037413128803, "grad_norm": 0.3802065495740333, "learning_rate": 1.7785734029601275e-05, "loss": 0.1781, "step": 2045 }, { "epoch": 0.5905440927586331, "grad_norm": 0.3402917536980965, "learning_rate": 1.7769929717938485e-05, "loss": 0.1732, "step": 2050 }, { "epoch": 0.5919844442043859, "grad_norm": 0.33348260099352367, "learning_rate": 1.775407628002685e-05, "loss": 0.1845, "step": 2055 }, { "epoch": 0.5934247956501386, "grad_norm": 0.3559404326820407, "learning_rate": 1.77381738161015e-05, "loss": 0.1809, "step": 2060 }, { "epoch": 0.5948651470958914, "grad_norm": 0.3493274389579037, "learning_rate": 1.772222242670754e-05, "loss": 0.1975, "step": 2065 }, { "epoch": 0.5963054985416442, "grad_norm": 0.35347058284663757, "learning_rate": 1.7706222212699413e-05, "loss": 0.1845, "step": 2070 }, { "epoch": 0.5977458499873969, "grad_norm": 0.419314654327899, "learning_rate": 1.7690173275240258e-05, "loss": 0.1835, "step": 2075 }, { "epoch": 0.5991862014331497, "grad_norm": 0.3304240143474617, "learning_rate": 1.767407571580128e-05, "loss": 0.1779, "step": 2080 }, { "epoch": 0.6006265528789024, "grad_norm": 0.3697054200801788, "learning_rate": 1.765792963616109e-05, "loss": 0.1903, "step": 2085 }, { "epoch": 0.6020669043246553, "grad_norm": 0.3736233059239093, "learning_rate": 1.764173513840509e-05, "loss": 0.1927, "step": 2090 }, { "epoch": 0.603507255770408, "grad_norm": 0.33836118548178384, "learning_rate": 1.7625492324924794e-05, "loss": 0.1934, "step": 2095 }, { "epoch": 0.6049476072161607, "grad_norm": 0.3385963747126986, "learning_rate": 1.7609201298417205e-05, "loss": 0.1819, "step": 2100 }, { "epoch": 0.6063879586619135, "grad_norm": 0.339416908126632, "learning_rate": 1.7592862161884166e-05, "loss": 0.1798, "step": 2105 }, { "epoch": 0.6078283101076662, "grad_norm": 0.36861625743367654, "learning_rate": 1.7576475018631684e-05, "loss": 0.1779, "step": 2110 }, { "epoch": 0.6092686615534191, "grad_norm": 0.3704231845054649, "learning_rate": 1.756003997226931e-05, "loss": 0.1875, "step": 2115 }, { "epoch": 0.6107090129991718, "grad_norm": 0.38079919821116043, "learning_rate": 1.754355712670946e-05, "loss": 0.1803, "step": 2120 }, { "epoch": 0.6121493644449245, "grad_norm": 0.3859155909423547, "learning_rate": 1.7527026586166767e-05, "loss": 0.1885, "step": 2125 }, { "epoch": 0.6135897158906773, "grad_norm": 0.4008091703421957, "learning_rate": 1.7510448455157415e-05, "loss": 0.1849, "step": 2130 }, { "epoch": 0.6150300673364301, "grad_norm": 0.39230107396255565, "learning_rate": 1.7493822838498496e-05, "loss": 0.1888, "step": 2135 }, { "epoch": 0.6164704187821829, "grad_norm": 0.3603073643543367, "learning_rate": 1.747714984130733e-05, "loss": 0.1715, "step": 2140 }, { "epoch": 0.6179107702279356, "grad_norm": 0.39800892388698866, "learning_rate": 1.74604295690008e-05, "loss": 0.1685, "step": 2145 }, { "epoch": 0.6193511216736883, "grad_norm": 0.32131963215807496, "learning_rate": 1.7443662127294696e-05, "loss": 0.1745, "step": 2150 }, { "epoch": 0.6207914731194412, "grad_norm": 0.36459623655858153, "learning_rate": 1.7426847622203043e-05, "loss": 0.1706, "step": 2155 }, { "epoch": 0.6222318245651939, "grad_norm": 0.38227127635284325, "learning_rate": 1.7409986160037432e-05, "loss": 0.1736, "step": 2160 }, { "epoch": 0.6236721760109467, "grad_norm": 0.33802946607680817, "learning_rate": 1.7393077847406338e-05, "loss": 0.1796, "step": 2165 }, { "epoch": 0.6251125274566994, "grad_norm": 0.34668747024243696, "learning_rate": 1.7376122791214457e-05, "loss": 0.1727, "step": 2170 }, { "epoch": 0.6265528789024521, "grad_norm": 0.35080021260147937, "learning_rate": 1.7359121098662027e-05, "loss": 0.1719, "step": 2175 }, { "epoch": 0.627993230348205, "grad_norm": 0.3375939993470441, "learning_rate": 1.734207287724415e-05, "loss": 0.1686, "step": 2180 }, { "epoch": 0.6294335817939577, "grad_norm": 0.3721850489546093, "learning_rate": 1.732497823475011e-05, "loss": 0.175, "step": 2185 }, { "epoch": 0.6308739332397105, "grad_norm": 0.34307632301934204, "learning_rate": 1.7307837279262692e-05, "loss": 0.1665, "step": 2190 }, { "epoch": 0.6323142846854632, "grad_norm": 0.37051543804913684, "learning_rate": 1.7290650119157505e-05, "loss": 0.1716, "step": 2195 }, { "epoch": 0.633754636131216, "grad_norm": 0.34975756767684907, "learning_rate": 1.7273416863102287e-05, "loss": 0.1817, "step": 2200 }, { "epoch": 0.6351949875769688, "grad_norm": 0.3749976162171925, "learning_rate": 1.725613762005623e-05, "loss": 0.1843, "step": 2205 }, { "epoch": 0.6366353390227215, "grad_norm": 0.36815036243031424, "learning_rate": 1.7238812499269274e-05, "loss": 0.1694, "step": 2210 }, { "epoch": 0.6380756904684743, "grad_norm": 0.3117804955490591, "learning_rate": 1.7221441610281434e-05, "loss": 0.1708, "step": 2215 }, { "epoch": 0.6395160419142271, "grad_norm": 0.3604043748062189, "learning_rate": 1.720402506292209e-05, "loss": 0.1761, "step": 2220 }, { "epoch": 0.6409563933599798, "grad_norm": 0.3298130218644663, "learning_rate": 1.718656296730932e-05, "loss": 0.179, "step": 2225 }, { "epoch": 0.6423967448057326, "grad_norm": 0.3826862512207884, "learning_rate": 1.7169055433849166e-05, "loss": 0.1712, "step": 2230 }, { "epoch": 0.6438370962514853, "grad_norm": 0.3542349813736183, "learning_rate": 1.7151502573234967e-05, "loss": 0.1761, "step": 2235 }, { "epoch": 0.6452774476972382, "grad_norm": 0.34843337457451795, "learning_rate": 1.7133904496446647e-05, "loss": 0.1704, "step": 2240 }, { "epoch": 0.6467177991429909, "grad_norm": 0.3820580792053532, "learning_rate": 1.711626131475001e-05, "loss": 0.1781, "step": 2245 }, { "epoch": 0.6481581505887437, "grad_norm": 0.3626263301136655, "learning_rate": 1.709857313969605e-05, "loss": 0.1818, "step": 2250 }, { "epoch": 0.6495985020344964, "grad_norm": 0.3602584360955885, "learning_rate": 1.708084008312022e-05, "loss": 0.1887, "step": 2255 }, { "epoch": 0.6510388534802491, "grad_norm": 0.34881350230579233, "learning_rate": 1.7063062257141766e-05, "loss": 0.1827, "step": 2260 }, { "epoch": 0.652479204926002, "grad_norm": 0.31830926244321406, "learning_rate": 1.704523977416296e-05, "loss": 0.1843, "step": 2265 }, { "epoch": 0.6539195563717547, "grad_norm": 0.37964043159017924, "learning_rate": 1.702737274686846e-05, "loss": 0.1683, "step": 2270 }, { "epoch": 0.6553599078175075, "grad_norm": 0.32018925541820387, "learning_rate": 1.7009461288224533e-05, "loss": 0.1757, "step": 2275 }, { "epoch": 0.6568002592632602, "grad_norm": 0.3177150747359183, "learning_rate": 1.699150551147838e-05, "loss": 0.1788, "step": 2280 }, { "epoch": 0.658240610709013, "grad_norm": 0.34466027988896136, "learning_rate": 1.697350553015741e-05, "loss": 0.1748, "step": 2285 }, { "epoch": 0.6596809621547658, "grad_norm": 0.34944690536487255, "learning_rate": 1.6955461458068507e-05, "loss": 0.1772, "step": 2290 }, { "epoch": 0.6611213136005185, "grad_norm": 0.3634337752114522, "learning_rate": 1.6937373409297336e-05, "loss": 0.1697, "step": 2295 }, { "epoch": 0.6625616650462713, "grad_norm": 0.3479574597357066, "learning_rate": 1.6919241498207613e-05, "loss": 0.1758, "step": 2300 }, { "epoch": 0.6640020164920241, "grad_norm": 0.3337542016890648, "learning_rate": 1.6901065839440365e-05, "loss": 0.1678, "step": 2305 }, { "epoch": 0.6654423679377768, "grad_norm": 0.342411033930506, "learning_rate": 1.688284654791323e-05, "loss": 0.1661, "step": 2310 }, { "epoch": 0.6668827193835296, "grad_norm": 0.3536792365079188, "learning_rate": 1.6864583738819712e-05, "loss": 0.1845, "step": 2315 }, { "epoch": 0.6683230708292823, "grad_norm": 0.35537832797085916, "learning_rate": 1.6846277527628463e-05, "loss": 0.167, "step": 2320 }, { "epoch": 0.6697634222750352, "grad_norm": 0.3828398429805593, "learning_rate": 1.6827928030082546e-05, "loss": 0.1815, "step": 2325 }, { "epoch": 0.6712037737207879, "grad_norm": 0.33156685434051136, "learning_rate": 1.6809535362198713e-05, "loss": 0.1875, "step": 2330 }, { "epoch": 0.6726441251665406, "grad_norm": 0.30886048275734546, "learning_rate": 1.679109964026666e-05, "loss": 0.1741, "step": 2335 }, { "epoch": 0.6740844766122934, "grad_norm": 0.38213092526719655, "learning_rate": 1.67726209808483e-05, "loss": 0.1657, "step": 2340 }, { "epoch": 0.6755248280580461, "grad_norm": 0.34931015409841865, "learning_rate": 1.6754099500777025e-05, "loss": 0.1703, "step": 2345 }, { "epoch": 0.676965179503799, "grad_norm": 0.4311198591898118, "learning_rate": 1.6735535317156957e-05, "loss": 0.1696, "step": 2350 }, { "epoch": 0.6784055309495517, "grad_norm": 0.32753418004216717, "learning_rate": 1.671692854736222e-05, "loss": 0.1727, "step": 2355 }, { "epoch": 0.6798458823953044, "grad_norm": 0.3205363210915123, "learning_rate": 1.66982793090362e-05, "loss": 0.1704, "step": 2360 }, { "epoch": 0.6812862338410572, "grad_norm": 0.3228782244582717, "learning_rate": 1.6679587720090792e-05, "loss": 0.1791, "step": 2365 }, { "epoch": 0.68272658528681, "grad_norm": 0.38827253301199693, "learning_rate": 1.666085389870565e-05, "loss": 0.1756, "step": 2370 }, { "epoch": 0.6841669367325628, "grad_norm": 0.361153122747486, "learning_rate": 1.664207796332746e-05, "loss": 0.1694, "step": 2375 }, { "epoch": 0.6856072881783155, "grad_norm": 0.38186177109604, "learning_rate": 1.662326003266916e-05, "loss": 0.1865, "step": 2380 }, { "epoch": 0.6870476396240682, "grad_norm": 0.3230350966256622, "learning_rate": 1.660440022570923e-05, "loss": 0.1697, "step": 2385 }, { "epoch": 0.6884879910698211, "grad_norm": 0.3281847066113813, "learning_rate": 1.6585498661690897e-05, "loss": 0.1816, "step": 2390 }, { "epoch": 0.6899283425155738, "grad_norm": 0.37849511355850496, "learning_rate": 1.6566555460121424e-05, "loss": 0.172, "step": 2395 }, { "epoch": 0.6913686939613266, "grad_norm": 0.4496749879940468, "learning_rate": 1.654757074077131e-05, "loss": 0.1776, "step": 2400 }, { "epoch": 0.6928090454070793, "grad_norm": 0.3276397468021885, "learning_rate": 1.6528544623673567e-05, "loss": 0.1641, "step": 2405 }, { "epoch": 0.694249396852832, "grad_norm": 0.3540543579094072, "learning_rate": 1.650947722912295e-05, "loss": 0.172, "step": 2410 }, { "epoch": 0.6956897482985849, "grad_norm": 0.3282349985410633, "learning_rate": 1.6490368677675187e-05, "loss": 0.1727, "step": 2415 }, { "epoch": 0.6971300997443376, "grad_norm": 0.33846062010566547, "learning_rate": 1.647121909014623e-05, "loss": 0.1827, "step": 2420 }, { "epoch": 0.6985704511900904, "grad_norm": 0.36878805858295277, "learning_rate": 1.645202858761149e-05, "loss": 0.1678, "step": 2425 }, { "epoch": 0.7000108026358431, "grad_norm": 0.3241659934094576, "learning_rate": 1.6432797291405055e-05, "loss": 0.1829, "step": 2430 }, { "epoch": 0.7014511540815959, "grad_norm": 0.3323977392588174, "learning_rate": 1.6413525323118956e-05, "loss": 0.1866, "step": 2435 }, { "epoch": 0.7028915055273487, "grad_norm": 0.38740911056410215, "learning_rate": 1.6394212804602356e-05, "loss": 0.1859, "step": 2440 }, { "epoch": 0.7043318569731014, "grad_norm": 0.35020569775025157, "learning_rate": 1.6374859857960813e-05, "loss": 0.1713, "step": 2445 }, { "epoch": 0.7057722084188542, "grad_norm": 0.32892434146707544, "learning_rate": 1.6355466605555502e-05, "loss": 0.1785, "step": 2450 }, { "epoch": 0.707212559864607, "grad_norm": 0.35551264606759814, "learning_rate": 1.633603317000242e-05, "loss": 0.1749, "step": 2455 }, { "epoch": 0.7086529113103597, "grad_norm": 0.4393019056382748, "learning_rate": 1.6316559674171636e-05, "loss": 0.1758, "step": 2460 }, { "epoch": 0.7100932627561125, "grad_norm": 0.4685944922688227, "learning_rate": 1.629704624118651e-05, "loss": 0.1744, "step": 2465 }, { "epoch": 0.7115336142018652, "grad_norm": 0.35581074458621115, "learning_rate": 1.6277492994422893e-05, "loss": 0.1756, "step": 2470 }, { "epoch": 0.7129739656476181, "grad_norm": 0.33856885661174746, "learning_rate": 1.625790005750838e-05, "loss": 0.1705, "step": 2475 }, { "epoch": 0.7144143170933708, "grad_norm": 0.3856019975908658, "learning_rate": 1.62382675543215e-05, "loss": 0.176, "step": 2480 }, { "epoch": 0.7158546685391235, "grad_norm": 0.3832177698765323, "learning_rate": 1.621859560899095e-05, "loss": 0.1824, "step": 2485 }, { "epoch": 0.7172950199848763, "grad_norm": 0.3405022015623911, "learning_rate": 1.6198884345894803e-05, "loss": 0.1879, "step": 2490 }, { "epoch": 0.718735371430629, "grad_norm": 0.34846873026652925, "learning_rate": 1.6179133889659714e-05, "loss": 0.1731, "step": 2495 }, { "epoch": 0.7201757228763819, "grad_norm": 0.391933717466078, "learning_rate": 1.6159344365160162e-05, "loss": 0.1773, "step": 2500 }, { "epoch": 0.7201757228763819, "eval_loss": 0.17813709378242493, "eval_runtime": 178.3665, "eval_samples_per_second": 10.114, "eval_steps_per_second": 2.529, "step": 2500 }, { "epoch": 0.7216160743221346, "grad_norm": 0.30920679982637206, "learning_rate": 1.613951589751762e-05, "loss": 0.1653, "step": 2505 }, { "epoch": 0.7230564257678873, "grad_norm": 0.37229773310756054, "learning_rate": 1.6119648612099793e-05, "loss": 0.1789, "step": 2510 }, { "epoch": 0.7244967772136401, "grad_norm": 0.369107759253664, "learning_rate": 1.609974263451981e-05, "loss": 0.1782, "step": 2515 }, { "epoch": 0.7259371286593929, "grad_norm": 0.3094009207730617, "learning_rate": 1.6079798090635442e-05, "loss": 0.1788, "step": 2520 }, { "epoch": 0.7273774801051457, "grad_norm": 0.34727782391218126, "learning_rate": 1.6059815106548294e-05, "loss": 0.1927, "step": 2525 }, { "epoch": 0.7288178315508984, "grad_norm": 0.3392015861778497, "learning_rate": 1.6039793808603014e-05, "loss": 0.1573, "step": 2530 }, { "epoch": 0.7302581829966511, "grad_norm": 0.3414009389947609, "learning_rate": 1.60197343233865e-05, "loss": 0.1668, "step": 2535 }, { "epoch": 0.731698534442404, "grad_norm": 0.35361147490247585, "learning_rate": 1.5999636777727085e-05, "loss": 0.1743, "step": 2540 }, { "epoch": 0.7331388858881567, "grad_norm": 0.3274226857520657, "learning_rate": 1.5979501298693752e-05, "loss": 0.1758, "step": 2545 }, { "epoch": 0.7345792373339095, "grad_norm": 0.3621973295683448, "learning_rate": 1.595932801359531e-05, "loss": 0.1837, "step": 2550 }, { "epoch": 0.7360195887796622, "grad_norm": 0.3606995367356055, "learning_rate": 1.5939117049979614e-05, "loss": 0.1791, "step": 2555 }, { "epoch": 0.7374599402254151, "grad_norm": 0.33373830456128545, "learning_rate": 1.5918868535632736e-05, "loss": 0.1802, "step": 2560 }, { "epoch": 0.7389002916711678, "grad_norm": 0.3390372078416868, "learning_rate": 1.589858259857817e-05, "loss": 0.187, "step": 2565 }, { "epoch": 0.7403406431169205, "grad_norm": 0.34412888867250124, "learning_rate": 1.5878259367076027e-05, "loss": 0.1911, "step": 2570 }, { "epoch": 0.7417809945626733, "grad_norm": 0.33009136167107, "learning_rate": 1.5857898969622204e-05, "loss": 0.183, "step": 2575 }, { "epoch": 0.743221346008426, "grad_norm": 0.36557215152582107, "learning_rate": 1.5837501534947586e-05, "loss": 0.1758, "step": 2580 }, { "epoch": 0.7446616974541789, "grad_norm": 0.34247088555723115, "learning_rate": 1.5817067192017234e-05, "loss": 0.1828, "step": 2585 }, { "epoch": 0.7461020488999316, "grad_norm": 0.40218176116646415, "learning_rate": 1.579659607002957e-05, "loss": 0.187, "step": 2590 }, { "epoch": 0.7475424003456843, "grad_norm": 0.34125168730229577, "learning_rate": 1.5776088298415545e-05, "loss": 0.183, "step": 2595 }, { "epoch": 0.7489827517914371, "grad_norm": 0.33299022807385764, "learning_rate": 1.575554400683784e-05, "loss": 0.1684, "step": 2600 }, { "epoch": 0.7504231032371899, "grad_norm": 0.3367346205260822, "learning_rate": 1.5734963325190026e-05, "loss": 0.1742, "step": 2605 }, { "epoch": 0.7518634546829427, "grad_norm": 0.3757675002933998, "learning_rate": 1.5714346383595776e-05, "loss": 0.1792, "step": 2610 }, { "epoch": 0.7533038061286954, "grad_norm": 0.36883835440832763, "learning_rate": 1.5693693312407997e-05, "loss": 0.1741, "step": 2615 }, { "epoch": 0.7547441575744481, "grad_norm": 0.3085265660673038, "learning_rate": 1.567300424220804e-05, "loss": 0.1777, "step": 2620 }, { "epoch": 0.756184509020201, "grad_norm": 0.34234678068991636, "learning_rate": 1.565227930380487e-05, "loss": 0.178, "step": 2625 }, { "epoch": 0.7576248604659537, "grad_norm": 0.37484403928305354, "learning_rate": 1.5631518628234217e-05, "loss": 0.1752, "step": 2630 }, { "epoch": 0.7590652119117065, "grad_norm": 0.309840359340079, "learning_rate": 1.5610722346757775e-05, "loss": 0.1697, "step": 2635 }, { "epoch": 0.7605055633574592, "grad_norm": 0.3440804282261142, "learning_rate": 1.558989059086236e-05, "loss": 0.175, "step": 2640 }, { "epoch": 0.7619459148032119, "grad_norm": 0.34552977330495505, "learning_rate": 1.556902349225907e-05, "loss": 0.1687, "step": 2645 }, { "epoch": 0.7633862662489648, "grad_norm": 0.3054063669302173, "learning_rate": 1.554812118288248e-05, "loss": 0.1669, "step": 2650 }, { "epoch": 0.7648266176947175, "grad_norm": 0.35428566557970353, "learning_rate": 1.5527183794889765e-05, "loss": 0.1683, "step": 2655 }, { "epoch": 0.7662669691404703, "grad_norm": 0.3569130747650239, "learning_rate": 1.5506211460659906e-05, "loss": 0.1729, "step": 2660 }, { "epoch": 0.767707320586223, "grad_norm": 0.31980284497508643, "learning_rate": 1.5485204312792824e-05, "loss": 0.1788, "step": 2665 }, { "epoch": 0.7691476720319758, "grad_norm": 0.3182129504042558, "learning_rate": 1.546416248410857e-05, "loss": 0.1675, "step": 2670 }, { "epoch": 0.7705880234777286, "grad_norm": 0.3420940121534491, "learning_rate": 1.544308610764644e-05, "loss": 0.1679, "step": 2675 }, { "epoch": 0.7720283749234813, "grad_norm": 0.35404502346417216, "learning_rate": 1.542197531666419e-05, "loss": 0.1723, "step": 2680 }, { "epoch": 0.7734687263692341, "grad_norm": 0.33613773212219045, "learning_rate": 1.5400830244637158e-05, "loss": 0.1682, "step": 2685 }, { "epoch": 0.7749090778149869, "grad_norm": 0.3163583277670731, "learning_rate": 1.5379651025257415e-05, "loss": 0.1651, "step": 2690 }, { "epoch": 0.7763494292607396, "grad_norm": 0.35141493314032013, "learning_rate": 1.5358437792432952e-05, "loss": 0.1797, "step": 2695 }, { "epoch": 0.7777897807064924, "grad_norm": 0.3462527106786627, "learning_rate": 1.5337190680286796e-05, "loss": 0.1692, "step": 2700 }, { "epoch": 0.7792301321522451, "grad_norm": 0.36237551860243844, "learning_rate": 1.531590982315619e-05, "loss": 0.1682, "step": 2705 }, { "epoch": 0.780670483597998, "grad_norm": 0.35779509635155105, "learning_rate": 1.5294595355591737e-05, "loss": 0.1818, "step": 2710 }, { "epoch": 0.7821108350437507, "grad_norm": 0.3383432292049812, "learning_rate": 1.527324741235653e-05, "loss": 0.1847, "step": 2715 }, { "epoch": 0.7835511864895034, "grad_norm": 0.4033785323831944, "learning_rate": 1.525186612842533e-05, "loss": 0.1724, "step": 2720 }, { "epoch": 0.7849915379352562, "grad_norm": 0.37448292353996054, "learning_rate": 1.5230451638983699e-05, "loss": 0.1832, "step": 2725 }, { "epoch": 0.7864318893810089, "grad_norm": 0.3294476080971381, "learning_rate": 1.5209004079427132e-05, "loss": 0.1671, "step": 2730 }, { "epoch": 0.7878722408267618, "grad_norm": 0.33174892241685555, "learning_rate": 1.518752358536022e-05, "loss": 0.1672, "step": 2735 }, { "epoch": 0.7893125922725145, "grad_norm": 0.3625241110832361, "learning_rate": 1.5166010292595794e-05, "loss": 0.1729, "step": 2740 }, { "epoch": 0.7907529437182672, "grad_norm": 0.3172553909416287, "learning_rate": 1.5144464337154045e-05, "loss": 0.1712, "step": 2745 }, { "epoch": 0.79219329516402, "grad_norm": 0.3752235550380414, "learning_rate": 1.5122885855261687e-05, "loss": 0.175, "step": 2750 }, { "epoch": 0.7936336466097728, "grad_norm": 0.39890985335500706, "learning_rate": 1.5101274983351082e-05, "loss": 0.1707, "step": 2755 }, { "epoch": 0.7950739980555256, "grad_norm": 0.35249780009984283, "learning_rate": 1.5079631858059385e-05, "loss": 0.1619, "step": 2760 }, { "epoch": 0.7965143495012783, "grad_norm": 0.3677203486485534, "learning_rate": 1.5057956616227669e-05, "loss": 0.1727, "step": 2765 }, { "epoch": 0.797954700947031, "grad_norm": 0.33164847451796536, "learning_rate": 1.5036249394900073e-05, "loss": 0.1602, "step": 2770 }, { "epoch": 0.7993950523927839, "grad_norm": 0.33110969305592347, "learning_rate": 1.5014510331322935e-05, "loss": 0.1767, "step": 2775 }, { "epoch": 0.8008354038385366, "grad_norm": 0.3357704192473928, "learning_rate": 1.499273956294391e-05, "loss": 0.1758, "step": 2780 }, { "epoch": 0.8022757552842894, "grad_norm": 0.3203731450233889, "learning_rate": 1.4970937227411113e-05, "loss": 0.1707, "step": 2785 }, { "epoch": 0.8037161067300421, "grad_norm": 0.35078849531073075, "learning_rate": 1.4949103462572247e-05, "loss": 0.1716, "step": 2790 }, { "epoch": 0.8051564581757948, "grad_norm": 0.3438642759457224, "learning_rate": 1.4927238406473734e-05, "loss": 0.1818, "step": 2795 }, { "epoch": 0.8065968096215477, "grad_norm": 0.3045848895679209, "learning_rate": 1.4905342197359826e-05, "loss": 0.1632, "step": 2800 }, { "epoch": 0.8080371610673004, "grad_norm": 0.3359619222321305, "learning_rate": 1.4883414973671758e-05, "loss": 0.1793, "step": 2805 }, { "epoch": 0.8094775125130532, "grad_norm": 0.34845042237143453, "learning_rate": 1.4861456874046849e-05, "loss": 0.1804, "step": 2810 }, { "epoch": 0.8109178639588059, "grad_norm": 0.3638246314263187, "learning_rate": 1.483946803731764e-05, "loss": 0.1786, "step": 2815 }, { "epoch": 0.8123582154045587, "grad_norm": 0.338860960844331, "learning_rate": 1.4817448602511008e-05, "loss": 0.172, "step": 2820 }, { "epoch": 0.8137985668503115, "grad_norm": 0.3273375870728084, "learning_rate": 1.4795398708847288e-05, "loss": 0.172, "step": 2825 }, { "epoch": 0.8152389182960642, "grad_norm": 0.3675294921196136, "learning_rate": 1.4773318495739399e-05, "loss": 0.1686, "step": 2830 }, { "epoch": 0.816679269741817, "grad_norm": 0.3629448732145989, "learning_rate": 1.4751208102791953e-05, "loss": 0.1649, "step": 2835 }, { "epoch": 0.8181196211875698, "grad_norm": 0.3532170948727624, "learning_rate": 1.4729067669800379e-05, "loss": 0.1731, "step": 2840 }, { "epoch": 0.8195599726333225, "grad_norm": 0.37425170739303987, "learning_rate": 1.4706897336750045e-05, "loss": 0.1801, "step": 2845 }, { "epoch": 0.8210003240790753, "grad_norm": 0.33497558634653835, "learning_rate": 1.4684697243815353e-05, "loss": 0.1796, "step": 2850 }, { "epoch": 0.822440675524828, "grad_norm": 0.32112128012699964, "learning_rate": 1.466246753135887e-05, "loss": 0.1695, "step": 2855 }, { "epoch": 0.8238810269705809, "grad_norm": 0.3329436646941721, "learning_rate": 1.4640208339930442e-05, "loss": 0.1792, "step": 2860 }, { "epoch": 0.8253213784163336, "grad_norm": 0.3463503675900774, "learning_rate": 1.4617919810266293e-05, "loss": 0.1661, "step": 2865 }, { "epoch": 0.8267617298620864, "grad_norm": 0.3586194073001657, "learning_rate": 1.459560208328814e-05, "loss": 0.1712, "step": 2870 }, { "epoch": 0.8282020813078391, "grad_norm": 0.3616414921071555, "learning_rate": 1.4573255300102306e-05, "loss": 0.1647, "step": 2875 }, { "epoch": 0.8296424327535918, "grad_norm": 0.3382993481908728, "learning_rate": 1.4550879601998829e-05, "loss": 0.1817, "step": 2880 }, { "epoch": 0.8310827841993447, "grad_norm": 0.3291217731024657, "learning_rate": 1.4528475130450555e-05, "loss": 0.1583, "step": 2885 }, { "epoch": 0.8325231356450974, "grad_norm": 0.3802083802739792, "learning_rate": 1.4506042027112259e-05, "loss": 0.1667, "step": 2890 }, { "epoch": 0.8339634870908502, "grad_norm": 0.3027853759942923, "learning_rate": 1.4483580433819747e-05, "loss": 0.1639, "step": 2895 }, { "epoch": 0.8354038385366029, "grad_norm": 0.3186470887578327, "learning_rate": 1.446109049258895e-05, "loss": 0.1662, "step": 2900 }, { "epoch": 0.8368441899823557, "grad_norm": 0.32594383112579883, "learning_rate": 1.4438572345615036e-05, "loss": 0.1718, "step": 2905 }, { "epoch": 0.8382845414281085, "grad_norm": 0.33772511433449404, "learning_rate": 1.4416026135271502e-05, "loss": 0.1748, "step": 2910 }, { "epoch": 0.8397248928738612, "grad_norm": 0.3630128403423318, "learning_rate": 1.4393452004109288e-05, "loss": 0.1753, "step": 2915 }, { "epoch": 0.841165244319614, "grad_norm": 0.37176315359169365, "learning_rate": 1.4370850094855855e-05, "loss": 0.1688, "step": 2920 }, { "epoch": 0.8426055957653668, "grad_norm": 0.325548664105803, "learning_rate": 1.4348220550414305e-05, "loss": 0.1669, "step": 2925 }, { "epoch": 0.8440459472111195, "grad_norm": 0.3034699571801956, "learning_rate": 1.4325563513862456e-05, "loss": 0.169, "step": 2930 }, { "epoch": 0.8454862986568723, "grad_norm": 0.3624727001828696, "learning_rate": 1.4302879128451956e-05, "loss": 0.1799, "step": 2935 }, { "epoch": 0.846926650102625, "grad_norm": 0.3456854159772283, "learning_rate": 1.428016753760737e-05, "loss": 0.1747, "step": 2940 }, { "epoch": 0.8483670015483779, "grad_norm": 0.3935855355179201, "learning_rate": 1.425742888492526e-05, "loss": 0.1831, "step": 2945 }, { "epoch": 0.8498073529941306, "grad_norm": 0.39615158777629134, "learning_rate": 1.4234663314173307e-05, "loss": 0.1795, "step": 2950 }, { "epoch": 0.8512477044398833, "grad_norm": 0.3427325406907131, "learning_rate": 1.421187096928937e-05, "loss": 0.1686, "step": 2955 }, { "epoch": 0.8526880558856361, "grad_norm": 0.3650047955860237, "learning_rate": 1.41890519943806e-05, "loss": 0.1745, "step": 2960 }, { "epoch": 0.8541284073313888, "grad_norm": 0.3821350839884672, "learning_rate": 1.4166206533722517e-05, "loss": 0.1637, "step": 2965 }, { "epoch": 0.8555687587771417, "grad_norm": 0.30931762511103533, "learning_rate": 1.4143334731758094e-05, "loss": 0.172, "step": 2970 }, { "epoch": 0.8570091102228944, "grad_norm": 0.3421291900106858, "learning_rate": 1.4120436733096855e-05, "loss": 0.1757, "step": 2975 }, { "epoch": 0.8584494616686471, "grad_norm": 0.551760437183804, "learning_rate": 1.4097512682513958e-05, "loss": 0.1843, "step": 2980 }, { "epoch": 0.8598898131143999, "grad_norm": 1.5660769145066697, "learning_rate": 1.4074562724949274e-05, "loss": 0.1986, "step": 2985 }, { "epoch": 0.8613301645601527, "grad_norm": 3.6907279321244477, "learning_rate": 1.4051587005506474e-05, "loss": 0.2213, "step": 2990 }, { "epoch": 0.8627705160059055, "grad_norm": 0.47087515557365756, "learning_rate": 1.4028585669452111e-05, "loss": 0.1803, "step": 2995 }, { "epoch": 0.8642108674516582, "grad_norm": 0.3672313808361774, "learning_rate": 1.40055588622147e-05, "loss": 0.1852, "step": 3000 }, { "epoch": 0.8642108674516582, "eval_loss": 0.17705903947353363, "eval_runtime": 179.9737, "eval_samples_per_second": 10.024, "eval_steps_per_second": 2.506, "step": 3000 }, { "epoch": 0.8656512188974109, "grad_norm": 0.326486381519465, "learning_rate": 1.3982506729383805e-05, "loss": 0.1731, "step": 3005 }, { "epoch": 0.8670915703431638, "grad_norm": 0.32065558593854615, "learning_rate": 1.3959429416709112e-05, "loss": 0.1697, "step": 3010 }, { "epoch": 0.8685319217889165, "grad_norm": 0.404356115517127, "learning_rate": 1.393632707009951e-05, "loss": 0.1832, "step": 3015 }, { "epoch": 0.8699722732346693, "grad_norm": 0.31904774410454045, "learning_rate": 1.3913199835622165e-05, "loss": 0.1695, "step": 3020 }, { "epoch": 0.871412624680422, "grad_norm": 0.45454954577102474, "learning_rate": 1.38900478595016e-05, "loss": 0.1824, "step": 3025 }, { "epoch": 0.8728529761261747, "grad_norm": 0.36657761636760966, "learning_rate": 1.3866871288118772e-05, "loss": 0.1729, "step": 3030 }, { "epoch": 0.8742933275719276, "grad_norm": 0.29971196823956686, "learning_rate": 1.384367026801015e-05, "loss": 0.1688, "step": 3035 }, { "epoch": 0.8757336790176803, "grad_norm": 0.33635749644358365, "learning_rate": 1.3820444945866765e-05, "loss": 0.1705, "step": 3040 }, { "epoch": 0.8771740304634331, "grad_norm": 0.36596078522080977, "learning_rate": 1.3797195468533316e-05, "loss": 0.1691, "step": 3045 }, { "epoch": 0.8786143819091858, "grad_norm": 0.3474031868244395, "learning_rate": 1.3773921983007224e-05, "loss": 0.1666, "step": 3050 }, { "epoch": 0.8800547333549386, "grad_norm": 0.3465786978708465, "learning_rate": 1.37506246364377e-05, "loss": 0.1695, "step": 3055 }, { "epoch": 0.8814950848006914, "grad_norm": 0.3178525376840447, "learning_rate": 1.3727303576124817e-05, "loss": 0.1658, "step": 3060 }, { "epoch": 0.8829354362464441, "grad_norm": 0.3774091077871316, "learning_rate": 1.370395894951859e-05, "loss": 0.1656, "step": 3065 }, { "epoch": 0.8843757876921969, "grad_norm": 0.3665050342290737, "learning_rate": 1.3680590904218032e-05, "loss": 0.1777, "step": 3070 }, { "epoch": 0.8858161391379497, "grad_norm": 0.3248751131099162, "learning_rate": 1.3657199587970212e-05, "loss": 0.1733, "step": 3075 }, { "epoch": 0.8872564905837024, "grad_norm": 0.3560181873382442, "learning_rate": 1.3633785148669343e-05, "loss": 0.1662, "step": 3080 }, { "epoch": 0.8886968420294552, "grad_norm": 0.36371643455151664, "learning_rate": 1.3610347734355838e-05, "loss": 0.1885, "step": 3085 }, { "epoch": 0.8901371934752079, "grad_norm": 0.31964376113184734, "learning_rate": 1.3586887493215364e-05, "loss": 0.1798, "step": 3090 }, { "epoch": 0.8915775449209608, "grad_norm": 0.34388240218522587, "learning_rate": 1.3563404573577919e-05, "loss": 0.1645, "step": 3095 }, { "epoch": 0.8930178963667135, "grad_norm": 0.32094313709171657, "learning_rate": 1.3539899123916884e-05, "loss": 0.1709, "step": 3100 }, { "epoch": 0.8944582478124662, "grad_norm": 0.35489801575784186, "learning_rate": 1.3516371292848098e-05, "loss": 0.1759, "step": 3105 }, { "epoch": 0.895898599258219, "grad_norm": 0.3435136193030186, "learning_rate": 1.3492821229128892e-05, "loss": 0.1801, "step": 3110 }, { "epoch": 0.8973389507039717, "grad_norm": 0.3405482581189107, "learning_rate": 1.3469249081657178e-05, "loss": 0.1621, "step": 3115 }, { "epoch": 0.8987793021497246, "grad_norm": 0.36475355151619254, "learning_rate": 1.34456549994705e-05, "loss": 0.1767, "step": 3120 }, { "epoch": 0.9002196535954773, "grad_norm": 0.3185028661890843, "learning_rate": 1.3422039131745073e-05, "loss": 0.1718, "step": 3125 }, { "epoch": 0.90166000504123, "grad_norm": 0.3501109132647978, "learning_rate": 1.3398401627794855e-05, "loss": 0.164, "step": 3130 }, { "epoch": 0.9031003564869828, "grad_norm": 0.34377330815664026, "learning_rate": 1.3374742637070612e-05, "loss": 0.1705, "step": 3135 }, { "epoch": 0.9045407079327356, "grad_norm": 0.33654711284735916, "learning_rate": 1.335106230915896e-05, "loss": 0.1609, "step": 3140 }, { "epoch": 0.9059810593784884, "grad_norm": 0.3245646527085092, "learning_rate": 1.3327360793781408e-05, "loss": 0.158, "step": 3145 }, { "epoch": 0.9074214108242411, "grad_norm": 0.3351308270482288, "learning_rate": 1.3303638240793442e-05, "loss": 0.1678, "step": 3150 }, { "epoch": 0.9088617622699939, "grad_norm": 0.32107454775466887, "learning_rate": 1.3279894800183555e-05, "loss": 0.1563, "step": 3155 }, { "epoch": 0.9103021137157467, "grad_norm": 0.34245383003412766, "learning_rate": 1.3256130622072301e-05, "loss": 0.1766, "step": 3160 }, { "epoch": 0.9117424651614994, "grad_norm": 0.3274886424161308, "learning_rate": 1.323234585671135e-05, "loss": 0.1715, "step": 3165 }, { "epoch": 0.9131828166072522, "grad_norm": 0.32664224598446223, "learning_rate": 1.3208540654482543e-05, "loss": 0.1797, "step": 3170 }, { "epoch": 0.9146231680530049, "grad_norm": 0.3570503032198834, "learning_rate": 1.3184715165896924e-05, "loss": 0.1684, "step": 3175 }, { "epoch": 0.9160635194987578, "grad_norm": 0.3041353628175197, "learning_rate": 1.3160869541593815e-05, "loss": 0.1644, "step": 3180 }, { "epoch": 0.9175038709445105, "grad_norm": 0.3384280077903303, "learning_rate": 1.3137003932339834e-05, "loss": 0.1655, "step": 3185 }, { "epoch": 0.9189442223902632, "grad_norm": 0.32882081126553514, "learning_rate": 1.3113118489027968e-05, "loss": 0.1639, "step": 3190 }, { "epoch": 0.920384573836016, "grad_norm": 0.33419046612443065, "learning_rate": 1.3089213362676595e-05, "loss": 0.169, "step": 3195 }, { "epoch": 0.9218249252817687, "grad_norm": 0.32575685656168196, "learning_rate": 1.306528870442855e-05, "loss": 0.1673, "step": 3200 }, { "epoch": 0.9232652767275216, "grad_norm": 0.3802101667394028, "learning_rate": 1.304134466555016e-05, "loss": 0.1672, "step": 3205 }, { "epoch": 0.9247056281732743, "grad_norm": 0.3637602408564527, "learning_rate": 1.3017381397430285e-05, "loss": 0.1721, "step": 3210 }, { "epoch": 0.926145979619027, "grad_norm": 0.32095803714990945, "learning_rate": 1.2993399051579365e-05, "loss": 0.1759, "step": 3215 }, { "epoch": 0.9275863310647798, "grad_norm": 0.33727876451711586, "learning_rate": 1.2969397779628459e-05, "loss": 0.1691, "step": 3220 }, { "epoch": 0.9290266825105326, "grad_norm": 0.40867319701914406, "learning_rate": 1.2945377733328297e-05, "loss": 0.1775, "step": 3225 }, { "epoch": 0.9304670339562854, "grad_norm": 0.32340952324251065, "learning_rate": 1.29213390645483e-05, "loss": 0.1668, "step": 3230 }, { "epoch": 0.9319073854020381, "grad_norm": 0.3659132059806781, "learning_rate": 1.289728192527564e-05, "loss": 0.165, "step": 3235 }, { "epoch": 0.9333477368477908, "grad_norm": 0.3285554990119345, "learning_rate": 1.2873206467614268e-05, "loss": 0.1651, "step": 3240 }, { "epoch": 0.9347880882935437, "grad_norm": 0.33834559931171954, "learning_rate": 1.2849112843783952e-05, "loss": 0.1683, "step": 3245 }, { "epoch": 0.9362284397392964, "grad_norm": 0.3134919137585381, "learning_rate": 1.2825001206119328e-05, "loss": 0.1743, "step": 3250 }, { "epoch": 0.9376687911850492, "grad_norm": 0.3616806939948535, "learning_rate": 1.2800871707068913e-05, "loss": 0.1782, "step": 3255 }, { "epoch": 0.9391091426308019, "grad_norm": 0.38816909872964706, "learning_rate": 1.2776724499194165e-05, "loss": 0.1589, "step": 3260 }, { "epoch": 0.9405494940765546, "grad_norm": 0.38618952505233706, "learning_rate": 1.27525597351685e-05, "loss": 0.1762, "step": 3265 }, { "epoch": 0.9419898455223075, "grad_norm": 0.3602161530621059, "learning_rate": 1.272837756777634e-05, "loss": 0.1866, "step": 3270 }, { "epoch": 0.9434301969680602, "grad_norm": 0.3441209052655488, "learning_rate": 1.2704178149912142e-05, "loss": 0.1707, "step": 3275 }, { "epoch": 0.944870548413813, "grad_norm": 0.3575347569438798, "learning_rate": 1.2679961634579429e-05, "loss": 0.1816, "step": 3280 }, { "epoch": 0.9463108998595657, "grad_norm": 0.32619834091130173, "learning_rate": 1.2655728174889823e-05, "loss": 0.1596, "step": 3285 }, { "epoch": 0.9477512513053185, "grad_norm": 0.30450616004896364, "learning_rate": 1.2631477924062086e-05, "loss": 0.1687, "step": 3290 }, { "epoch": 0.9491916027510713, "grad_norm": 0.39194493113328666, "learning_rate": 1.2607211035421134e-05, "loss": 0.1798, "step": 3295 }, { "epoch": 0.950631954196824, "grad_norm": 0.33488364978285323, "learning_rate": 1.258292766239708e-05, "loss": 0.164, "step": 3300 }, { "epoch": 0.9520723056425768, "grad_norm": 0.33449050395875485, "learning_rate": 1.255862795852427e-05, "loss": 0.1838, "step": 3305 }, { "epoch": 0.9535126570883296, "grad_norm": 0.3224448291423571, "learning_rate": 1.2534312077440291e-05, "loss": 0.159, "step": 3310 }, { "epoch": 0.9549530085340823, "grad_norm": 0.34250880685460666, "learning_rate": 1.250998017288502e-05, "loss": 0.1732, "step": 3315 }, { "epoch": 0.9563933599798351, "grad_norm": 0.325026715867963, "learning_rate": 1.2485632398699644e-05, "loss": 0.1655, "step": 3320 }, { "epoch": 0.9578337114255878, "grad_norm": 0.34476248704682083, "learning_rate": 1.2461268908825686e-05, "loss": 0.1752, "step": 3325 }, { "epoch": 0.9592740628713406, "grad_norm": 0.3442273279650722, "learning_rate": 1.2436889857304031e-05, "loss": 0.157, "step": 3330 }, { "epoch": 0.9607144143170934, "grad_norm": 0.4303337353194812, "learning_rate": 1.2412495398273956e-05, "loss": 0.1728, "step": 3335 }, { "epoch": 0.9621547657628461, "grad_norm": 0.365900582909615, "learning_rate": 1.2388085685972155e-05, "loss": 0.1712, "step": 3340 }, { "epoch": 0.9635951172085989, "grad_norm": 0.3728395806507143, "learning_rate": 1.2363660874731767e-05, "loss": 0.1682, "step": 3345 }, { "epoch": 0.9650354686543516, "grad_norm": 0.35386731676038286, "learning_rate": 1.233922111898138e-05, "loss": 0.1724, "step": 3350 }, { "epoch": 0.9664758201001045, "grad_norm": 0.30172215095317684, "learning_rate": 1.2314766573244085e-05, "loss": 0.1581, "step": 3355 }, { "epoch": 0.9679161715458572, "grad_norm": 0.3559005601073562, "learning_rate": 1.2290297392136483e-05, "loss": 0.175, "step": 3360 }, { "epoch": 0.9693565229916099, "grad_norm": 0.3349516882886285, "learning_rate": 1.2265813730367704e-05, "loss": 0.1726, "step": 3365 }, { "epoch": 0.9707968744373627, "grad_norm": 0.3234492836179018, "learning_rate": 1.2241315742738431e-05, "loss": 0.1797, "step": 3370 }, { "epoch": 0.9722372258831155, "grad_norm": 0.35736598159988336, "learning_rate": 1.2216803584139936e-05, "loss": 0.1741, "step": 3375 }, { "epoch": 0.9736775773288683, "grad_norm": 0.3173229290519434, "learning_rate": 1.2192277409553075e-05, "loss": 0.1728, "step": 3380 }, { "epoch": 0.975117928774621, "grad_norm": 0.35907434868148436, "learning_rate": 1.2167737374047329e-05, "loss": 0.1619, "step": 3385 }, { "epoch": 0.9765582802203737, "grad_norm": 0.31746492403546855, "learning_rate": 1.2143183632779812e-05, "loss": 0.1606, "step": 3390 }, { "epoch": 0.9779986316661265, "grad_norm": 0.29969257567145113, "learning_rate": 1.2118616340994302e-05, "loss": 0.176, "step": 3395 }, { "epoch": 0.9794389831118793, "grad_norm": 0.34191493614948304, "learning_rate": 1.2094035654020245e-05, "loss": 0.1771, "step": 3400 }, { "epoch": 0.9808793345576321, "grad_norm": 0.336306850022089, "learning_rate": 1.2069441727271776e-05, "loss": 0.1725, "step": 3405 }, { "epoch": 0.9823196860033848, "grad_norm": 0.3488759742550077, "learning_rate": 1.2044834716246752e-05, "loss": 0.1664, "step": 3410 }, { "epoch": 0.9837600374491375, "grad_norm": 0.29434687938442566, "learning_rate": 1.2020214776525746e-05, "loss": 0.1665, "step": 3415 }, { "epoch": 0.9852003888948904, "grad_norm": 0.32154231494880775, "learning_rate": 1.1995582063771076e-05, "loss": 0.1605, "step": 3420 }, { "epoch": 0.9866407403406431, "grad_norm": 0.3216164557661555, "learning_rate": 1.1970936733725822e-05, "loss": 0.1649, "step": 3425 }, { "epoch": 0.9880810917863959, "grad_norm": 0.3984614491878183, "learning_rate": 1.1946278942212841e-05, "loss": 0.1627, "step": 3430 }, { "epoch": 0.9895214432321486, "grad_norm": 0.3084357411827215, "learning_rate": 1.1921608845133774e-05, "loss": 0.1791, "step": 3435 }, { "epoch": 0.9909617946779014, "grad_norm": 0.3461006191407041, "learning_rate": 1.1896926598468062e-05, "loss": 0.1608, "step": 3440 }, { "epoch": 0.9924021461236542, "grad_norm": 0.3242641159641522, "learning_rate": 1.187223235827197e-05, "loss": 0.1675, "step": 3445 }, { "epoch": 0.9938424975694069, "grad_norm": 0.31853075587857466, "learning_rate": 1.1847526280677592e-05, "loss": 0.159, "step": 3450 }, { "epoch": 0.9952828490151597, "grad_norm": 0.36031480976740865, "learning_rate": 1.1822808521891864e-05, "loss": 0.1709, "step": 3455 }, { "epoch": 0.9967232004609125, "grad_norm": 0.34851124123259397, "learning_rate": 1.1798079238195574e-05, "loss": 0.1693, "step": 3460 }, { "epoch": 0.9981635519066653, "grad_norm": 0.38061386776708506, "learning_rate": 1.1773338585942389e-05, "loss": 0.1662, "step": 3465 }, { "epoch": 0.999603903352418, "grad_norm": 0.3643047405112528, "learning_rate": 1.1748586721557842e-05, "loss": 0.1691, "step": 3470 }, { "epoch": 1.0011522811566023, "grad_norm": 2.3721237897739833, "learning_rate": 1.1723823801538361e-05, "loss": 0.5506, "step": 3475 }, { "epoch": 1.0025926326023549, "grad_norm": 2.341945780620137, "learning_rate": 1.169904998245028e-05, "loss": 0.1436, "step": 3480 }, { "epoch": 1.0040329840481077, "grad_norm": 2.7593761138904846, "learning_rate": 1.1674265420928827e-05, "loss": 0.138, "step": 3485 }, { "epoch": 1.0054733354938605, "grad_norm": 2.3938411855023687, "learning_rate": 1.1649470273677178e-05, "loss": 0.1604, "step": 3490 }, { "epoch": 1.0069136869396134, "grad_norm": 0.30894809227179215, "learning_rate": 1.1624664697465406e-05, "loss": 0.1361, "step": 3495 }, { "epoch": 1.008354038385366, "grad_norm": 0.8469245650916747, "learning_rate": 1.1599848849129549e-05, "loss": 0.1264, "step": 3500 }, { "epoch": 1.008354038385366, "eval_loss": 0.14734847843647003, "eval_runtime": 187.1808, "eval_samples_per_second": 9.638, "eval_steps_per_second": 2.409, "step": 3500 }, { "epoch": 1.0097943898311188, "grad_norm": 0.2863610834780281, "learning_rate": 1.157502288557058e-05, "loss": 0.1397, "step": 3505 }, { "epoch": 1.0112347412768716, "grad_norm": 0.27170412433783897, "learning_rate": 1.155018696375342e-05, "loss": 0.1398, "step": 3510 }, { "epoch": 1.0126750927226242, "grad_norm": 0.26430273342487837, "learning_rate": 1.1525341240705967e-05, "loss": 0.1308, "step": 3515 }, { "epoch": 1.014115444168377, "grad_norm": 0.2856254131463934, "learning_rate": 1.1500485873518079e-05, "loss": 0.1466, "step": 3520 }, { "epoch": 1.01555579561413, "grad_norm": 0.28388915331696574, "learning_rate": 1.1475621019340594e-05, "loss": 0.1363, "step": 3525 }, { "epoch": 1.0169961470598825, "grad_norm": 0.27929551536804725, "learning_rate": 1.145074683538433e-05, "loss": 0.1281, "step": 3530 }, { "epoch": 1.0184364985056353, "grad_norm": 0.2877130736841538, "learning_rate": 1.1425863478919092e-05, "loss": 0.1261, "step": 3535 }, { "epoch": 1.0198768499513882, "grad_norm": 0.2845024477187945, "learning_rate": 1.1400971107272685e-05, "loss": 0.1394, "step": 3540 }, { "epoch": 1.021317201397141, "grad_norm": 0.28240198605290473, "learning_rate": 1.137606987782991e-05, "loss": 0.1295, "step": 3545 }, { "epoch": 1.0227575528428936, "grad_norm": 0.29550483906146596, "learning_rate": 1.1351159948031572e-05, "loss": 0.136, "step": 3550 }, { "epoch": 1.0241979042886464, "grad_norm": 0.26147300326314377, "learning_rate": 1.1326241475373483e-05, "loss": 0.1347, "step": 3555 }, { "epoch": 1.0256382557343993, "grad_norm": 0.2847825316184351, "learning_rate": 1.1301314617405473e-05, "loss": 0.1339, "step": 3560 }, { "epoch": 1.0270786071801519, "grad_norm": 0.26880663609447486, "learning_rate": 1.1276379531730386e-05, "loss": 0.1293, "step": 3565 }, { "epoch": 1.0285189586259047, "grad_norm": 0.30390938449587174, "learning_rate": 1.1251436376003091e-05, "loss": 0.1356, "step": 3570 }, { "epoch": 1.0299593100716575, "grad_norm": 0.309678056558676, "learning_rate": 1.122648530792947e-05, "loss": 0.1306, "step": 3575 }, { "epoch": 1.0313996615174101, "grad_norm": 0.2733158298216727, "learning_rate": 1.1201526485265449e-05, "loss": 0.1359, "step": 3580 }, { "epoch": 1.032840012963163, "grad_norm": 0.2897804991588295, "learning_rate": 1.1176560065815962e-05, "loss": 0.1276, "step": 3585 }, { "epoch": 1.0342803644089158, "grad_norm": 0.2871445202236724, "learning_rate": 1.1151586207433993e-05, "loss": 0.1336, "step": 3590 }, { "epoch": 1.0357207158546686, "grad_norm": 0.304554126712303, "learning_rate": 1.112660506801955e-05, "loss": 0.124, "step": 3595 }, { "epoch": 1.0371610673004212, "grad_norm": 0.28174097838281875, "learning_rate": 1.1101616805518678e-05, "loss": 0.1319, "step": 3600 }, { "epoch": 1.038601418746174, "grad_norm": 0.28561395901703596, "learning_rate": 1.1076621577922461e-05, "loss": 0.1276, "step": 3605 }, { "epoch": 1.040041770191927, "grad_norm": 0.2592443629276645, "learning_rate": 1.1051619543266017e-05, "loss": 0.131, "step": 3610 }, { "epoch": 1.0414821216376795, "grad_norm": 0.29120003001124567, "learning_rate": 1.1026610859627502e-05, "loss": 0.1432, "step": 3615 }, { "epoch": 1.0429224730834323, "grad_norm": 0.299864421583837, "learning_rate": 1.1001595685127117e-05, "loss": 0.1332, "step": 3620 }, { "epoch": 1.0443628245291852, "grad_norm": 0.29923938520475035, "learning_rate": 1.097657417792609e-05, "loss": 0.1293, "step": 3625 }, { "epoch": 1.045803175974938, "grad_norm": 0.2716661652311505, "learning_rate": 1.0951546496225705e-05, "loss": 0.1297, "step": 3630 }, { "epoch": 1.0472435274206906, "grad_norm": 0.3202993091637731, "learning_rate": 1.0926512798266273e-05, "loss": 0.1316, "step": 3635 }, { "epoch": 1.0486838788664434, "grad_norm": 0.2933354486531652, "learning_rate": 1.0901473242326148e-05, "loss": 0.1379, "step": 3640 }, { "epoch": 1.0501242303121963, "grad_norm": 0.2827070705191196, "learning_rate": 1.0876427986720715e-05, "loss": 0.1373, "step": 3645 }, { "epoch": 1.0515645817579489, "grad_norm": 0.3053591398371649, "learning_rate": 1.0851377189801406e-05, "loss": 0.1366, "step": 3650 }, { "epoch": 1.0530049332037017, "grad_norm": 0.3254925917237289, "learning_rate": 1.0826321009954683e-05, "loss": 0.1421, "step": 3655 }, { "epoch": 1.0544452846494545, "grad_norm": 0.2964349375907128, "learning_rate": 1.0801259605601043e-05, "loss": 0.143, "step": 3660 }, { "epoch": 1.0558856360952071, "grad_norm": 0.31104349771963047, "learning_rate": 1.077619313519401e-05, "loss": 0.1372, "step": 3665 }, { "epoch": 1.05732598754096, "grad_norm": 0.27745539952677795, "learning_rate": 1.0751121757219154e-05, "loss": 0.1405, "step": 3670 }, { "epoch": 1.0587663389867128, "grad_norm": 0.2846782984236109, "learning_rate": 1.0726045630193057e-05, "loss": 0.1301, "step": 3675 }, { "epoch": 1.0602066904324656, "grad_norm": 0.3088319093039246, "learning_rate": 1.070096491266233e-05, "loss": 0.1356, "step": 3680 }, { "epoch": 1.0616470418782182, "grad_norm": 0.2682708073426634, "learning_rate": 1.0675879763202623e-05, "loss": 0.1317, "step": 3685 }, { "epoch": 1.063087393323971, "grad_norm": 0.2671845851053607, "learning_rate": 1.0650790340417592e-05, "loss": 0.1337, "step": 3690 }, { "epoch": 1.064527744769724, "grad_norm": 0.2778331787589554, "learning_rate": 1.0625696802937911e-05, "loss": 0.1377, "step": 3695 }, { "epoch": 1.0659680962154765, "grad_norm": 0.30753300441114967, "learning_rate": 1.0600599309420279e-05, "loss": 0.1374, "step": 3700 }, { "epoch": 1.0674084476612293, "grad_norm": 0.2775830751816375, "learning_rate": 1.0575498018546407e-05, "loss": 0.1307, "step": 3705 }, { "epoch": 1.0688487991069822, "grad_norm": 0.32475842682289247, "learning_rate": 1.0550393089022001e-05, "loss": 0.1284, "step": 3710 }, { "epoch": 1.0702891505527348, "grad_norm": 0.2927503470442227, "learning_rate": 1.052528467957579e-05, "loss": 0.1255, "step": 3715 }, { "epoch": 1.0717295019984876, "grad_norm": 0.2918443033872336, "learning_rate": 1.0500172948958502e-05, "loss": 0.1296, "step": 3720 }, { "epoch": 1.0731698534442404, "grad_norm": 0.2733071927339924, "learning_rate": 1.0475058055941856e-05, "loss": 0.1354, "step": 3725 }, { "epoch": 1.0746102048899933, "grad_norm": 0.32091404766902804, "learning_rate": 1.0449940159317564e-05, "loss": 0.1403, "step": 3730 }, { "epoch": 1.0760505563357459, "grad_norm": 0.27309247291871086, "learning_rate": 1.042481941789634e-05, "loss": 0.1299, "step": 3735 }, { "epoch": 1.0774909077814987, "grad_norm": 0.291429030269511, "learning_rate": 1.0399695990506877e-05, "loss": 0.1349, "step": 3740 }, { "epoch": 1.0789312592272515, "grad_norm": 0.2794660700760565, "learning_rate": 1.0374570035994855e-05, "loss": 0.1267, "step": 3745 }, { "epoch": 1.0803716106730041, "grad_norm": 0.2915027456228963, "learning_rate": 1.0349441713221923e-05, "loss": 0.1332, "step": 3750 }, { "epoch": 1.081811962118757, "grad_norm": 0.30229082944109603, "learning_rate": 1.0324311181064714e-05, "loss": 0.1284, "step": 3755 }, { "epoch": 1.0832523135645098, "grad_norm": 0.28544390636563377, "learning_rate": 1.0299178598413828e-05, "loss": 0.1395, "step": 3760 }, { "epoch": 1.0846926650102624, "grad_norm": 0.3197660163913866, "learning_rate": 1.0274044124172817e-05, "loss": 0.1296, "step": 3765 }, { "epoch": 1.0861330164560152, "grad_norm": 0.28278629655789606, "learning_rate": 1.0248907917257213e-05, "loss": 0.1238, "step": 3770 }, { "epoch": 1.087573367901768, "grad_norm": 0.2980589878724013, "learning_rate": 1.022377013659349e-05, "loss": 0.1383, "step": 3775 }, { "epoch": 1.089013719347521, "grad_norm": 0.29273436019807486, "learning_rate": 1.0198630941118075e-05, "loss": 0.1299, "step": 3780 }, { "epoch": 1.0904540707932735, "grad_norm": 0.2611335142282758, "learning_rate": 1.0173490489776337e-05, "loss": 0.1194, "step": 3785 }, { "epoch": 1.0918944222390263, "grad_norm": 0.27401911211382207, "learning_rate": 1.0148348941521596e-05, "loss": 0.1283, "step": 3790 }, { "epoch": 1.0933347736847792, "grad_norm": 0.27569212910222046, "learning_rate": 1.012320645531409e-05, "loss": 0.1357, "step": 3795 }, { "epoch": 1.0947751251305318, "grad_norm": 0.31773489006970385, "learning_rate": 1.0098063190120009e-05, "loss": 0.1388, "step": 3800 }, { "epoch": 1.0962154765762846, "grad_norm": 0.27529632570187174, "learning_rate": 1.0072919304910446e-05, "loss": 0.1348, "step": 3805 }, { "epoch": 1.0976558280220374, "grad_norm": 0.32498147518056825, "learning_rate": 1.0047774958660432e-05, "loss": 0.1272, "step": 3810 }, { "epoch": 1.0990961794677903, "grad_norm": 0.2774029992998093, "learning_rate": 1.0022630310347905e-05, "loss": 0.1235, "step": 3815 }, { "epoch": 1.1005365309135429, "grad_norm": 0.2614075435164931, "learning_rate": 9.99748551895271e-06, "loss": 0.1296, "step": 3820 }, { "epoch": 1.1019768823592957, "grad_norm": 0.3087699168742396, "learning_rate": 9.972340743455606e-06, "loss": 0.1298, "step": 3825 }, { "epoch": 1.1034172338050485, "grad_norm": 0.26898843049721016, "learning_rate": 9.947196142837237e-06, "loss": 0.1261, "step": 3830 }, { "epoch": 1.1048575852508011, "grad_norm": 0.2857369504598532, "learning_rate": 9.922051876077157e-06, "loss": 0.1373, "step": 3835 }, { "epoch": 1.106297936696554, "grad_norm": 0.27007478399517304, "learning_rate": 9.8969081021528e-06, "loss": 0.1244, "step": 3840 }, { "epoch": 1.1077382881423068, "grad_norm": 0.28803665926227207, "learning_rate": 9.871764980038491e-06, "loss": 0.133, "step": 3845 }, { "epoch": 1.1091786395880594, "grad_norm": 0.29489016085930986, "learning_rate": 9.846622668704421e-06, "loss": 0.1342, "step": 3850 }, { "epoch": 1.1106189910338122, "grad_norm": 0.30476944622519747, "learning_rate": 9.821481327115665e-06, "loss": 0.1288, "step": 3855 }, { "epoch": 1.112059342479565, "grad_norm": 0.3092718795766665, "learning_rate": 9.796341114231168e-06, "loss": 0.1246, "step": 3860 }, { "epoch": 1.1134996939253177, "grad_norm": 0.28805538998371455, "learning_rate": 9.771202189002732e-06, "loss": 0.1303, "step": 3865 }, { "epoch": 1.1149400453710705, "grad_norm": 0.26965101242879297, "learning_rate": 9.74606471037402e-06, "loss": 0.1251, "step": 3870 }, { "epoch": 1.1163803968168233, "grad_norm": 0.2524930013344495, "learning_rate": 9.720928837279555e-06, "loss": 0.1331, "step": 3875 }, { "epoch": 1.1178207482625762, "grad_norm": 0.27639450425848394, "learning_rate": 9.6957947286437e-06, "loss": 0.1326, "step": 3880 }, { "epoch": 1.1192610997083288, "grad_norm": 0.2950953833829538, "learning_rate": 9.67066254337966e-06, "loss": 0.1396, "step": 3885 }, { "epoch": 1.1207014511540816, "grad_norm": 0.3332689077004191, "learning_rate": 9.645532440388491e-06, "loss": 0.1395, "step": 3890 }, { "epoch": 1.1221418025998344, "grad_norm": 0.3045207707948191, "learning_rate": 9.620404578558078e-06, "loss": 0.1332, "step": 3895 }, { "epoch": 1.123582154045587, "grad_norm": 0.27543129852179515, "learning_rate": 9.59527911676213e-06, "loss": 0.1312, "step": 3900 }, { "epoch": 1.1250225054913399, "grad_norm": 0.322786857182541, "learning_rate": 9.570156213859188e-06, "loss": 0.1363, "step": 3905 }, { "epoch": 1.1264628569370927, "grad_norm": 0.3332907289414307, "learning_rate": 9.545036028691618e-06, "loss": 0.1452, "step": 3910 }, { "epoch": 1.1279032083828455, "grad_norm": 0.2964745281324652, "learning_rate": 9.519918720084595e-06, "loss": 0.1255, "step": 3915 }, { "epoch": 1.1293435598285981, "grad_norm": 0.29380526786178957, "learning_rate": 9.494804446845105e-06, "loss": 0.1353, "step": 3920 }, { "epoch": 1.130783911274351, "grad_norm": 0.3124984711938421, "learning_rate": 9.46969336776095e-06, "loss": 0.1194, "step": 3925 }, { "epoch": 1.1322242627201038, "grad_norm": 0.2784062232835093, "learning_rate": 9.444585641599736e-06, "loss": 0.1267, "step": 3930 }, { "epoch": 1.1336646141658564, "grad_norm": 0.2960281672476851, "learning_rate": 9.41948142710786e-06, "loss": 0.1288, "step": 3935 }, { "epoch": 1.1351049656116092, "grad_norm": 0.3319737170484304, "learning_rate": 9.394380883009528e-06, "loss": 0.134, "step": 3940 }, { "epoch": 1.136545317057362, "grad_norm": 0.2676639926944894, "learning_rate": 9.369284168005739e-06, "loss": 0.1302, "step": 3945 }, { "epoch": 1.1379856685031147, "grad_norm": 0.29304917330014446, "learning_rate": 9.344191440773269e-06, "loss": 0.1398, "step": 3950 }, { "epoch": 1.1394260199488675, "grad_norm": 0.2834441331194175, "learning_rate": 9.3191028599637e-06, "loss": 0.1366, "step": 3955 }, { "epoch": 1.1408663713946203, "grad_norm": 0.2875227981738716, "learning_rate": 9.294018584202378e-06, "loss": 0.1275, "step": 3960 }, { "epoch": 1.142306722840373, "grad_norm": 0.295392566822095, "learning_rate": 9.268938772087444e-06, "loss": 0.134, "step": 3965 }, { "epoch": 1.1437470742861258, "grad_norm": 0.3031248184368795, "learning_rate": 9.24386358218881e-06, "loss": 0.1328, "step": 3970 }, { "epoch": 1.1451874257318786, "grad_norm": 0.281546642383125, "learning_rate": 9.218793173047167e-06, "loss": 0.126, "step": 3975 }, { "epoch": 1.1466277771776314, "grad_norm": 0.2820813141995356, "learning_rate": 9.19372770317298e-06, "loss": 0.1232, "step": 3980 }, { "epoch": 1.148068128623384, "grad_norm": 0.2733287557965743, "learning_rate": 9.168667331045482e-06, "loss": 0.1356, "step": 3985 }, { "epoch": 1.1495084800691369, "grad_norm": 0.3190121141129414, "learning_rate": 9.143612215111679e-06, "loss": 0.1453, "step": 3990 }, { "epoch": 1.1509488315148897, "grad_norm": 0.3111192035405633, "learning_rate": 9.118562513785334e-06, "loss": 0.1425, "step": 3995 }, { "epoch": 1.1523891829606423, "grad_norm": 0.3137082396200987, "learning_rate": 9.093518385445988e-06, "loss": 0.1377, "step": 4000 }, { "epoch": 1.1523891829606423, "eval_loss": 0.14474257826805115, "eval_runtime": 185.5439, "eval_samples_per_second": 9.723, "eval_steps_per_second": 2.431, "step": 4000 }, { "epoch": 1.1538295344063951, "grad_norm": 0.28258671327645435, "learning_rate": 9.06847998843794e-06, "loss": 0.1334, "step": 4005 }, { "epoch": 1.155269885852148, "grad_norm": 0.29926251581651486, "learning_rate": 9.04344748106925e-06, "loss": 0.1341, "step": 4010 }, { "epoch": 1.1567102372979008, "grad_norm": 0.2704908892813702, "learning_rate": 9.018421021610747e-06, "loss": 0.1328, "step": 4015 }, { "epoch": 1.1581505887436534, "grad_norm": 0.32713139197229957, "learning_rate": 8.993400768295014e-06, "loss": 0.1308, "step": 4020 }, { "epoch": 1.1595909401894062, "grad_norm": 0.2766299542206814, "learning_rate": 8.968386879315404e-06, "loss": 0.1248, "step": 4025 }, { "epoch": 1.161031291635159, "grad_norm": 0.28383009419843636, "learning_rate": 8.94337951282502e-06, "loss": 0.1305, "step": 4030 }, { "epoch": 1.1624716430809117, "grad_norm": 0.29371071990613795, "learning_rate": 8.918378826935731e-06, "loss": 0.1382, "step": 4035 }, { "epoch": 1.1639119945266645, "grad_norm": 0.3150456872535722, "learning_rate": 8.893384979717165e-06, "loss": 0.1359, "step": 4040 }, { "epoch": 1.1653523459724173, "grad_norm": 0.27842595651311847, "learning_rate": 8.86839812919572e-06, "loss": 0.1325, "step": 4045 }, { "epoch": 1.16679269741817, "grad_norm": 0.28767571922945206, "learning_rate": 8.843418433353548e-06, "loss": 0.129, "step": 4050 }, { "epoch": 1.1682330488639228, "grad_norm": 0.29442793959753094, "learning_rate": 8.818446050127565e-06, "loss": 0.132, "step": 4055 }, { "epoch": 1.1696734003096756, "grad_norm": 0.28879468366985844, "learning_rate": 8.793481137408457e-06, "loss": 0.1303, "step": 4060 }, { "epoch": 1.1711137517554282, "grad_norm": 0.27722520036794807, "learning_rate": 8.768523853039675e-06, "loss": 0.1242, "step": 4065 }, { "epoch": 1.172554103201181, "grad_norm": 0.2870500423966342, "learning_rate": 8.743574354816433e-06, "loss": 0.1273, "step": 4070 }, { "epoch": 1.1739944546469339, "grad_norm": 0.29838754312619825, "learning_rate": 8.718632800484725e-06, "loss": 0.134, "step": 4075 }, { "epoch": 1.1754348060926867, "grad_norm": 0.2792449091382121, "learning_rate": 8.693699347740315e-06, "loss": 0.1232, "step": 4080 }, { "epoch": 1.1768751575384393, "grad_norm": 0.27360997864384, "learning_rate": 8.668774154227745e-06, "loss": 0.1262, "step": 4085 }, { "epoch": 1.1783155089841921, "grad_norm": 0.311899339054919, "learning_rate": 8.643857377539333e-06, "loss": 0.1405, "step": 4090 }, { "epoch": 1.179755860429945, "grad_norm": 0.2949013259318118, "learning_rate": 8.618949175214187e-06, "loss": 0.1318, "step": 4095 }, { "epoch": 1.1811962118756978, "grad_norm": 0.27956199010868105, "learning_rate": 8.594049704737199e-06, "loss": 0.134, "step": 4100 }, { "epoch": 1.1826365633214504, "grad_norm": 0.29596036435161904, "learning_rate": 8.569159123538053e-06, "loss": 0.1273, "step": 4105 }, { "epoch": 1.1840769147672032, "grad_norm": 0.3191390201358693, "learning_rate": 8.544277588990226e-06, "loss": 0.1335, "step": 4110 }, { "epoch": 1.185517266212956, "grad_norm": 0.28318994572027134, "learning_rate": 8.519405258410007e-06, "loss": 0.1308, "step": 4115 }, { "epoch": 1.1869576176587087, "grad_norm": 0.27647236193756786, "learning_rate": 8.49454228905548e-06, "loss": 0.1338, "step": 4120 }, { "epoch": 1.1883979691044615, "grad_norm": 0.2885923108928197, "learning_rate": 8.469688838125549e-06, "loss": 0.1274, "step": 4125 }, { "epoch": 1.1898383205502143, "grad_norm": 0.27744666931948775, "learning_rate": 8.444845062758937e-06, "loss": 0.1313, "step": 4130 }, { "epoch": 1.191278671995967, "grad_norm": 0.30589668436211354, "learning_rate": 8.420011120033185e-06, "loss": 0.1246, "step": 4135 }, { "epoch": 1.1927190234417198, "grad_norm": 0.3233342541589803, "learning_rate": 8.395187166963677e-06, "loss": 0.1382, "step": 4140 }, { "epoch": 1.1941593748874726, "grad_norm": 0.3011225199281378, "learning_rate": 8.370373360502621e-06, "loss": 0.1364, "step": 4145 }, { "epoch": 1.1955997263332252, "grad_norm": 0.2699583457721809, "learning_rate": 8.345569857538089e-06, "loss": 0.1301, "step": 4150 }, { "epoch": 1.197040077778978, "grad_norm": 0.29161496467052717, "learning_rate": 8.320776814892996e-06, "loss": 0.1337, "step": 4155 }, { "epoch": 1.1984804292247309, "grad_norm": 0.2989838376182308, "learning_rate": 8.295994389324125e-06, "loss": 0.1329, "step": 4160 }, { "epoch": 1.1999207806704837, "grad_norm": 0.2987527432419416, "learning_rate": 8.271222737521135e-06, "loss": 0.1343, "step": 4165 }, { "epoch": 1.2013611321162363, "grad_norm": 0.3073154082480416, "learning_rate": 8.246462016105561e-06, "loss": 0.1294, "step": 4170 }, { "epoch": 1.2028014835619891, "grad_norm": 0.2918080099388789, "learning_rate": 8.221712381629824e-06, "loss": 0.1253, "step": 4175 }, { "epoch": 1.204241835007742, "grad_norm": 0.2941283674055012, "learning_rate": 8.196973990576259e-06, "loss": 0.1375, "step": 4180 }, { "epoch": 1.2056821864534946, "grad_norm": 0.3010648090781424, "learning_rate": 8.172246999356109e-06, "loss": 0.1291, "step": 4185 }, { "epoch": 1.2071225378992474, "grad_norm": 0.3112057276433019, "learning_rate": 8.147531564308534e-06, "loss": 0.1316, "step": 4190 }, { "epoch": 1.2085628893450002, "grad_norm": 0.3229481450208124, "learning_rate": 8.122827841699638e-06, "loss": 0.1429, "step": 4195 }, { "epoch": 1.210003240790753, "grad_norm": 0.2858141473020997, "learning_rate": 8.09813598772147e-06, "loss": 0.1281, "step": 4200 }, { "epoch": 1.2114435922365057, "grad_norm": 0.2829245856140391, "learning_rate": 8.07345615849103e-06, "loss": 0.1267, "step": 4205 }, { "epoch": 1.2128839436822585, "grad_norm": 0.3112793144040732, "learning_rate": 8.0487885100493e-06, "loss": 0.1262, "step": 4210 }, { "epoch": 1.2143242951280113, "grad_norm": 0.28202317022242396, "learning_rate": 8.02413319836024e-06, "loss": 0.1329, "step": 4215 }, { "epoch": 1.215764646573764, "grad_norm": 0.25601822143551783, "learning_rate": 7.999490379309815e-06, "loss": 0.1272, "step": 4220 }, { "epoch": 1.2172049980195168, "grad_norm": 0.28229846356369415, "learning_rate": 7.974860208705003e-06, "loss": 0.1334, "step": 4225 }, { "epoch": 1.2186453494652696, "grad_norm": 0.29081914492163424, "learning_rate": 7.950242842272805e-06, "loss": 0.1311, "step": 4230 }, { "epoch": 1.2200857009110222, "grad_norm": 0.29767304550969986, "learning_rate": 7.92563843565928e-06, "loss": 0.1303, "step": 4235 }, { "epoch": 1.221526052356775, "grad_norm": 0.279147638085936, "learning_rate": 7.90104714442853e-06, "loss": 0.132, "step": 4240 }, { "epoch": 1.2229664038025279, "grad_norm": 0.2949168996509113, "learning_rate": 7.876469124061748e-06, "loss": 0.1268, "step": 4245 }, { "epoch": 1.2244067552482805, "grad_norm": 0.2620835993688827, "learning_rate": 7.851904529956207e-06, "loss": 0.1316, "step": 4250 }, { "epoch": 1.2258471066940333, "grad_norm": 0.28815330072715767, "learning_rate": 7.827353517424303e-06, "loss": 0.1319, "step": 4255 }, { "epoch": 1.2272874581397861, "grad_norm": 0.27770301658683055, "learning_rate": 7.802816241692554e-06, "loss": 0.1279, "step": 4260 }, { "epoch": 1.228727809585539, "grad_norm": 0.2754011061921349, "learning_rate": 7.778292857900627e-06, "loss": 0.1336, "step": 4265 }, { "epoch": 1.2301681610312916, "grad_norm": 0.306991072991392, "learning_rate": 7.753783521100362e-06, "loss": 0.1389, "step": 4270 }, { "epoch": 1.2316085124770444, "grad_norm": 0.28053697792633275, "learning_rate": 7.72928838625477e-06, "loss": 0.1275, "step": 4275 }, { "epoch": 1.2330488639227972, "grad_norm": 0.30305612446281116, "learning_rate": 7.704807608237089e-06, "loss": 0.1295, "step": 4280 }, { "epoch": 1.2344892153685498, "grad_norm": 0.2864190950078543, "learning_rate": 7.680341341829765e-06, "loss": 0.1331, "step": 4285 }, { "epoch": 1.2359295668143027, "grad_norm": 0.3084452021982498, "learning_rate": 7.655889741723503e-06, "loss": 0.1291, "step": 4290 }, { "epoch": 1.2373699182600555, "grad_norm": 0.3005817324491606, "learning_rate": 7.631452962516278e-06, "loss": 0.1356, "step": 4295 }, { "epoch": 1.2388102697058083, "grad_norm": 0.2917648957325195, "learning_rate": 7.6070311587123555e-06, "loss": 0.1297, "step": 4300 }, { "epoch": 1.240250621151561, "grad_norm": 0.30176130506840737, "learning_rate": 7.5826244847213234e-06, "loss": 0.1265, "step": 4305 }, { "epoch": 1.2416909725973138, "grad_norm": 0.2526507461560052, "learning_rate": 7.558233094857101e-06, "loss": 0.1278, "step": 4310 }, { "epoch": 1.2431313240430666, "grad_norm": 0.28625993806129957, "learning_rate": 7.533857143336976e-06, "loss": 0.1238, "step": 4315 }, { "epoch": 1.2445716754888192, "grad_norm": 0.3146356292248194, "learning_rate": 7.50949678428063e-06, "loss": 0.137, "step": 4320 }, { "epoch": 1.246012026934572, "grad_norm": 0.31206200245191695, "learning_rate": 7.485152171709151e-06, "loss": 0.1319, "step": 4325 }, { "epoch": 1.2474523783803249, "grad_norm": 0.2860758480463773, "learning_rate": 7.460823459544072e-06, "loss": 0.1386, "step": 4330 }, { "epoch": 1.2488927298260775, "grad_norm": 0.2771762439164134, "learning_rate": 7.4365108016063955e-06, "loss": 0.1264, "step": 4335 }, { "epoch": 1.2503330812718303, "grad_norm": 0.30029450340343883, "learning_rate": 7.4122143516156185e-06, "loss": 0.1538, "step": 4340 }, { "epoch": 1.2517734327175831, "grad_norm": 0.30376043235244393, "learning_rate": 7.38793426318876e-06, "loss": 0.1309, "step": 4345 }, { "epoch": 1.2532137841633357, "grad_norm": 0.2848974573813015, "learning_rate": 7.363670689839392e-06, "loss": 0.1271, "step": 4350 }, { "epoch": 1.2546541356090886, "grad_norm": 0.3274964514141692, "learning_rate": 7.339423784976672e-06, "loss": 0.1347, "step": 4355 }, { "epoch": 1.2560944870548414, "grad_norm": 0.2808970287278979, "learning_rate": 7.315193701904361e-06, "loss": 0.1338, "step": 4360 }, { "epoch": 1.2575348385005942, "grad_norm": 0.26647645243293355, "learning_rate": 7.290980593819866e-06, "loss": 0.1206, "step": 4365 }, { "epoch": 1.2589751899463468, "grad_norm": 0.299200085436933, "learning_rate": 7.266784613813268e-06, "loss": 0.1282, "step": 4370 }, { "epoch": 1.2604155413920997, "grad_norm": 0.2769069945827787, "learning_rate": 7.24260591486636e-06, "loss": 0.1356, "step": 4375 }, { "epoch": 1.2618558928378525, "grad_norm": 0.27637014074328664, "learning_rate": 7.218444649851661e-06, "loss": 0.1359, "step": 4380 }, { "epoch": 1.2632962442836053, "grad_norm": 0.31996200647691136, "learning_rate": 7.194300971531473e-06, "loss": 0.14, "step": 4385 }, { "epoch": 1.264736595729358, "grad_norm": 0.31623231666179996, "learning_rate": 7.170175032556902e-06, "loss": 0.1283, "step": 4390 }, { "epoch": 1.2661769471751108, "grad_norm": 0.30271632929723064, "learning_rate": 7.146066985466889e-06, "loss": 0.1285, "step": 4395 }, { "epoch": 1.2676172986208636, "grad_norm": 0.2737967446190047, "learning_rate": 7.121976982687253e-06, "loss": 0.1271, "step": 4400 }, { "epoch": 1.2690576500666162, "grad_norm": 0.27977893252007474, "learning_rate": 7.097905176529734e-06, "loss": 0.1258, "step": 4405 }, { "epoch": 1.270498001512369, "grad_norm": 0.30126818356616486, "learning_rate": 7.073851719191014e-06, "loss": 0.131, "step": 4410 }, { "epoch": 1.2719383529581219, "grad_norm": 0.299507259922042, "learning_rate": 7.049816762751762e-06, "loss": 0.1308, "step": 4415 }, { "epoch": 1.2733787044038745, "grad_norm": 0.2649336740662209, "learning_rate": 7.02580045917568e-06, "loss": 0.13, "step": 4420 }, { "epoch": 1.2748190558496273, "grad_norm": 0.2564044244734405, "learning_rate": 7.001802960308534e-06, "loss": 0.1211, "step": 4425 }, { "epoch": 1.2762594072953801, "grad_norm": 0.2977518133910194, "learning_rate": 6.977824417877183e-06, "loss": 0.1347, "step": 4430 }, { "epoch": 1.2776997587411327, "grad_norm": 0.37240677712373393, "learning_rate": 6.953864983488646e-06, "loss": 0.1356, "step": 4435 }, { "epoch": 1.2791401101868856, "grad_norm": 0.2965189745890561, "learning_rate": 6.929924808629122e-06, "loss": 0.1265, "step": 4440 }, { "epoch": 1.2805804616326384, "grad_norm": 0.2752308024638383, "learning_rate": 6.906004044663046e-06, "loss": 0.1251, "step": 4445 }, { "epoch": 1.282020813078391, "grad_norm": 0.30571802447619534, "learning_rate": 6.882102842832115e-06, "loss": 0.1332, "step": 4450 }, { "epoch": 1.2834611645241438, "grad_norm": 0.2842226334713274, "learning_rate": 6.858221354254352e-06, "loss": 0.1363, "step": 4455 }, { "epoch": 1.2849015159698967, "grad_norm": 0.2979873943753214, "learning_rate": 6.834359729923138e-06, "loss": 0.1298, "step": 4460 }, { "epoch": 1.2863418674156495, "grad_norm": 0.30630908938424195, "learning_rate": 6.81051812070626e-06, "loss": 0.1351, "step": 4465 }, { "epoch": 1.2877822188614023, "grad_norm": 0.28492055381385706, "learning_rate": 6.786696677344949e-06, "loss": 0.1208, "step": 4470 }, { "epoch": 1.289222570307155, "grad_norm": 0.28384613081056187, "learning_rate": 6.762895550452948e-06, "loss": 0.1395, "step": 4475 }, { "epoch": 1.2906629217529078, "grad_norm": 0.324228616513587, "learning_rate": 6.739114890515542e-06, "loss": 0.1378, "step": 4480 }, { "epoch": 1.2921032731986606, "grad_norm": 0.2853156077237628, "learning_rate": 6.715354847888607e-06, "loss": 0.1226, "step": 4485 }, { "epoch": 1.2935436246444132, "grad_norm": 0.31375254697639315, "learning_rate": 6.691615572797672e-06, "loss": 0.1392, "step": 4490 }, { "epoch": 1.294983976090166, "grad_norm": 0.2883946445076774, "learning_rate": 6.667897215336954e-06, "loss": 0.1217, "step": 4495 }, { "epoch": 1.2964243275359189, "grad_norm": 0.2895452595912213, "learning_rate": 6.64419992546842e-06, "loss": 0.1331, "step": 4500 }, { "epoch": 1.2964243275359189, "eval_loss": 0.14379242062568665, "eval_runtime": 184.0705, "eval_samples_per_second": 9.801, "eval_steps_per_second": 2.45, "step": 4500 }, { "epoch": 1.2978646789816715, "grad_norm": 0.28665306269625207, "learning_rate": 6.620523853020828e-06, "loss": 0.1293, "step": 4505 }, { "epoch": 1.2993050304274243, "grad_norm": 0.3087468378172715, "learning_rate": 6.596869147688796e-06, "loss": 0.1359, "step": 4510 }, { "epoch": 1.3007453818731771, "grad_norm": 0.31040503221184323, "learning_rate": 6.5732359590318405e-06, "loss": 0.1289, "step": 4515 }, { "epoch": 1.3021857333189297, "grad_norm": 0.2614034619917938, "learning_rate": 6.549624436473437e-06, "loss": 0.1317, "step": 4520 }, { "epoch": 1.3036260847646826, "grad_norm": 0.29279408085324476, "learning_rate": 6.526034729300077e-06, "loss": 0.1302, "step": 4525 }, { "epoch": 1.3050664362104354, "grad_norm": 0.27391338017046146, "learning_rate": 6.502466986660318e-06, "loss": 0.1237, "step": 4530 }, { "epoch": 1.306506787656188, "grad_norm": 0.2577932055235253, "learning_rate": 6.478921357563852e-06, "loss": 0.133, "step": 4535 }, { "epoch": 1.3079471391019408, "grad_norm": 0.29115643412884196, "learning_rate": 6.4553979908805405e-06, "loss": 0.1264, "step": 4540 }, { "epoch": 1.3093874905476937, "grad_norm": 0.2805429545326651, "learning_rate": 6.4318970353395015e-06, "loss": 0.126, "step": 4545 }, { "epoch": 1.3108278419934463, "grad_norm": 0.28048701005423443, "learning_rate": 6.408418639528155e-06, "loss": 0.1304, "step": 4550 }, { "epoch": 1.312268193439199, "grad_norm": 0.31036490760140995, "learning_rate": 6.38496295189128e-06, "loss": 0.1299, "step": 4555 }, { "epoch": 1.313708544884952, "grad_norm": 0.27781555493965154, "learning_rate": 6.361530120730084e-06, "loss": 0.1283, "step": 4560 }, { "epoch": 1.3151488963307048, "grad_norm": 0.28470922038550206, "learning_rate": 6.338120294201257e-06, "loss": 0.1273, "step": 4565 }, { "epoch": 1.3165892477764576, "grad_norm": 0.2812452734554949, "learning_rate": 6.314733620316047e-06, "loss": 0.1225, "step": 4570 }, { "epoch": 1.3180295992222102, "grad_norm": 0.29152015278801036, "learning_rate": 6.291370246939312e-06, "loss": 0.132, "step": 4575 }, { "epoch": 1.319469950667963, "grad_norm": 0.28443383104443637, "learning_rate": 6.268030321788589e-06, "loss": 0.1293, "step": 4580 }, { "epoch": 1.3209103021137159, "grad_norm": 0.2959906683847514, "learning_rate": 6.244713992433164e-06, "loss": 0.1335, "step": 4585 }, { "epoch": 1.3223506535594685, "grad_norm": 0.3040944482039468, "learning_rate": 6.221421406293131e-06, "loss": 0.1273, "step": 4590 }, { "epoch": 1.3237910050052213, "grad_norm": 0.30882724239677245, "learning_rate": 6.1981527106384765e-06, "loss": 0.1191, "step": 4595 }, { "epoch": 1.3252313564509741, "grad_norm": 0.302671709123605, "learning_rate": 6.17490805258812e-06, "loss": 0.1365, "step": 4600 }, { "epoch": 1.3266717078967267, "grad_norm": 0.29010412797273544, "learning_rate": 6.151687579109015e-06, "loss": 0.1402, "step": 4605 }, { "epoch": 1.3281120593424796, "grad_norm": 0.2638211025663234, "learning_rate": 6.128491437015202e-06, "loss": 0.122, "step": 4610 }, { "epoch": 1.3295524107882324, "grad_norm": 0.3317741888903263, "learning_rate": 6.1053197729668745e-06, "loss": 0.1234, "step": 4615 }, { "epoch": 1.330992762233985, "grad_norm": 0.305170078612422, "learning_rate": 6.082172733469469e-06, "loss": 0.1316, "step": 4620 }, { "epoch": 1.3324331136797378, "grad_norm": 0.28817033083195814, "learning_rate": 6.059050464872731e-06, "loss": 0.1366, "step": 4625 }, { "epoch": 1.3338734651254907, "grad_norm": 0.2953828272280745, "learning_rate": 6.03595311336979e-06, "loss": 0.1272, "step": 4630 }, { "epoch": 1.3353138165712433, "grad_norm": 0.2830751051303079, "learning_rate": 6.0128808249962255e-06, "loss": 0.1404, "step": 4635 }, { "epoch": 1.336754168016996, "grad_norm": 0.3252605933332986, "learning_rate": 5.989833745629163e-06, "loss": 0.129, "step": 4640 }, { "epoch": 1.338194519462749, "grad_norm": 0.30213564618018374, "learning_rate": 5.966812020986341e-06, "loss": 0.13, "step": 4645 }, { "epoch": 1.3396348709085018, "grad_norm": 0.2883204719629323, "learning_rate": 5.943815796625179e-06, "loss": 0.1253, "step": 4650 }, { "epoch": 1.3410752223542544, "grad_norm": 0.2959117376319711, "learning_rate": 5.920845217941874e-06, "loss": 0.1327, "step": 4655 }, { "epoch": 1.3425155738000072, "grad_norm": 0.31232438887774006, "learning_rate": 5.8979004301704814e-06, "loss": 0.1383, "step": 4660 }, { "epoch": 1.34395592524576, "grad_norm": 0.29382396748754, "learning_rate": 5.874981578381985e-06, "loss": 0.133, "step": 4665 }, { "epoch": 1.3453962766915128, "grad_norm": 0.30443621369119456, "learning_rate": 5.852088807483385e-06, "loss": 0.1303, "step": 4670 }, { "epoch": 1.3468366281372655, "grad_norm": 0.2807950559344455, "learning_rate": 5.829222262216783e-06, "loss": 0.1327, "step": 4675 }, { "epoch": 1.3482769795830183, "grad_norm": 0.28883011845316237, "learning_rate": 5.80638208715847e-06, "loss": 0.1321, "step": 4680 }, { "epoch": 1.3497173310287711, "grad_norm": 0.2822159218958781, "learning_rate": 5.783568426718001e-06, "loss": 0.1243, "step": 4685 }, { "epoch": 1.3511576824745237, "grad_norm": 0.2856294468316457, "learning_rate": 5.76078142513729e-06, "loss": 0.1264, "step": 4690 }, { "epoch": 1.3525980339202766, "grad_norm": 0.2833850663671519, "learning_rate": 5.738021226489711e-06, "loss": 0.1249, "step": 4695 }, { "epoch": 1.3540383853660294, "grad_norm": 0.3098314509041874, "learning_rate": 5.715287974679156e-06, "loss": 0.1317, "step": 4700 }, { "epoch": 1.355478736811782, "grad_norm": 0.2509107454497361, "learning_rate": 5.692581813439147e-06, "loss": 0.1207, "step": 4705 }, { "epoch": 1.3569190882575348, "grad_norm": 0.288399827316461, "learning_rate": 5.669902886331935e-06, "loss": 0.1313, "step": 4710 }, { "epoch": 1.3583594397032877, "grad_norm": 0.2899923984835422, "learning_rate": 5.647251336747565e-06, "loss": 0.1353, "step": 4715 }, { "epoch": 1.3597997911490403, "grad_norm": 0.30442118246180094, "learning_rate": 5.62462730790299e-06, "loss": 0.1242, "step": 4720 }, { "epoch": 1.361240142594793, "grad_norm": 0.3136171534592957, "learning_rate": 5.602030942841161e-06, "loss": 0.1249, "step": 4725 }, { "epoch": 1.362680494040546, "grad_norm": 0.29860304044620467, "learning_rate": 5.579462384430123e-06, "loss": 0.1283, "step": 4730 }, { "epoch": 1.3641208454862985, "grad_norm": 0.31046611461157403, "learning_rate": 5.556921775362101e-06, "loss": 0.135, "step": 4735 }, { "epoch": 1.3655611969320514, "grad_norm": 0.28704908174570976, "learning_rate": 5.5344092581526246e-06, "loss": 0.1387, "step": 4740 }, { "epoch": 1.3670015483778042, "grad_norm": 0.3341532812497149, "learning_rate": 5.5119249751395955e-06, "loss": 0.143, "step": 4745 }, { "epoch": 1.368441899823557, "grad_norm": 0.29001862112776317, "learning_rate": 5.489469068482399e-06, "loss": 0.1461, "step": 4750 }, { "epoch": 1.3698822512693098, "grad_norm": 0.2834403752479179, "learning_rate": 5.467041680161029e-06, "loss": 0.1233, "step": 4755 }, { "epoch": 1.3713226027150625, "grad_norm": 0.33143013156488327, "learning_rate": 5.444642951975137e-06, "loss": 0.1272, "step": 4760 }, { "epoch": 1.3727629541608153, "grad_norm": 0.28777376307497, "learning_rate": 5.422273025543197e-06, "loss": 0.1289, "step": 4765 }, { "epoch": 1.3742033056065681, "grad_norm": 0.28509358403097473, "learning_rate": 5.399932042301565e-06, "loss": 0.1359, "step": 4770 }, { "epoch": 1.3756436570523207, "grad_norm": 0.31777606236187494, "learning_rate": 5.377620143503598e-06, "loss": 0.1364, "step": 4775 }, { "epoch": 1.3770840084980736, "grad_norm": 0.2892592873947313, "learning_rate": 5.355337470218778e-06, "loss": 0.1325, "step": 4780 }, { "epoch": 1.3785243599438264, "grad_norm": 0.27422137870653046, "learning_rate": 5.333084163331794e-06, "loss": 0.1298, "step": 4785 }, { "epoch": 1.379964711389579, "grad_norm": 0.3146053844222587, "learning_rate": 5.3108603635416654e-06, "loss": 0.1434, "step": 4790 }, { "epoch": 1.3814050628353318, "grad_norm": 0.27997320753339555, "learning_rate": 5.288666211360848e-06, "loss": 0.1342, "step": 4795 }, { "epoch": 1.3828454142810847, "grad_norm": 0.2984800479383985, "learning_rate": 5.266501847114349e-06, "loss": 0.1346, "step": 4800 }, { "epoch": 1.3842857657268373, "grad_norm": 0.29201935536901064, "learning_rate": 5.2443674109388355e-06, "loss": 0.1264, "step": 4805 }, { "epoch": 1.38572611717259, "grad_norm": 0.27719720949263027, "learning_rate": 5.222263042781761e-06, "loss": 0.1301, "step": 4810 }, { "epoch": 1.387166468618343, "grad_norm": 0.3042597202236721, "learning_rate": 5.200188882400458e-06, "loss": 0.1291, "step": 4815 }, { "epoch": 1.3886068200640955, "grad_norm": 0.30302490372496094, "learning_rate": 5.178145069361269e-06, "loss": 0.1387, "step": 4820 }, { "epoch": 1.3900471715098484, "grad_norm": 0.28176863882814285, "learning_rate": 5.156131743038672e-06, "loss": 0.1292, "step": 4825 }, { "epoch": 1.3914875229556012, "grad_norm": 0.3058547991836209, "learning_rate": 5.134149042614381e-06, "loss": 0.1376, "step": 4830 }, { "epoch": 1.3929278744013538, "grad_norm": 0.30393509196007185, "learning_rate": 5.112197107076473e-06, "loss": 0.1355, "step": 4835 }, { "epoch": 1.3943682258471066, "grad_norm": 0.26425351590901147, "learning_rate": 5.090276075218516e-06, "loss": 0.1253, "step": 4840 }, { "epoch": 1.3958085772928595, "grad_norm": 0.2789170911602346, "learning_rate": 5.0683860856386805e-06, "loss": 0.1226, "step": 4845 }, { "epoch": 1.3972489287386123, "grad_norm": 0.3084946604950202, "learning_rate": 5.046527276738869e-06, "loss": 0.1403, "step": 4850 }, { "epoch": 1.3986892801843651, "grad_norm": 0.2817833925106797, "learning_rate": 5.02469978672385e-06, "loss": 0.1286, "step": 4855 }, { "epoch": 1.4001296316301177, "grad_norm": 0.31976795477023545, "learning_rate": 5.002903753600368e-06, "loss": 0.1388, "step": 4860 }, { "epoch": 1.4015699830758706, "grad_norm": 0.31706703406909087, "learning_rate": 4.981139315176272e-06, "loss": 0.1256, "step": 4865 }, { "epoch": 1.4030103345216234, "grad_norm": 0.29670136774417283, "learning_rate": 4.959406609059661e-06, "loss": 0.1305, "step": 4870 }, { "epoch": 1.404450685967376, "grad_norm": 0.3093837514114273, "learning_rate": 4.937705772657992e-06, "loss": 0.1279, "step": 4875 }, { "epoch": 1.4058910374131288, "grad_norm": 0.26356143472517385, "learning_rate": 4.916036943177235e-06, "loss": 0.1349, "step": 4880 }, { "epoch": 1.4073313888588816, "grad_norm": 0.3339315239470743, "learning_rate": 4.894400257620982e-06, "loss": 0.1361, "step": 4885 }, { "epoch": 1.4087717403046343, "grad_norm": 0.32122936240066957, "learning_rate": 4.872795852789592e-06, "loss": 0.1306, "step": 4890 }, { "epoch": 1.410212091750387, "grad_norm": 0.28913451492903725, "learning_rate": 4.851223865279336e-06, "loss": 0.1356, "step": 4895 }, { "epoch": 1.41165244319614, "grad_norm": 0.30914198884665406, "learning_rate": 4.829684431481516e-06, "loss": 0.1299, "step": 4900 }, { "epoch": 1.4130927946418925, "grad_norm": 0.2733782589344944, "learning_rate": 4.8081776875815966e-06, "loss": 0.1301, "step": 4905 }, { "epoch": 1.4145331460876454, "grad_norm": 0.3485745526900565, "learning_rate": 4.786703769558382e-06, "loss": 0.1253, "step": 4910 }, { "epoch": 1.4159734975333982, "grad_norm": 0.311015844030605, "learning_rate": 4.765262813183112e-06, "loss": 0.1243, "step": 4915 }, { "epoch": 1.4174138489791508, "grad_norm": 0.2642965069547113, "learning_rate": 4.743854954018628e-06, "loss": 0.1195, "step": 4920 }, { "epoch": 1.4188542004249036, "grad_norm": 0.272979546582442, "learning_rate": 4.7224803274185185e-06, "loss": 0.1212, "step": 4925 }, { "epoch": 1.4202945518706565, "grad_norm": 0.3038792772484129, "learning_rate": 4.701139068526243e-06, "loss": 0.1338, "step": 4930 }, { "epoch": 1.4217349033164093, "grad_norm": 0.2767357670064067, "learning_rate": 4.679831312274298e-06, "loss": 0.1255, "step": 4935 }, { "epoch": 1.423175254762162, "grad_norm": 0.27710201889151737, "learning_rate": 4.658557193383352e-06, "loss": 0.1357, "step": 4940 }, { "epoch": 1.4246156062079147, "grad_norm": 0.3096121537912325, "learning_rate": 4.637316846361395e-06, "loss": 0.1293, "step": 4945 }, { "epoch": 1.4260559576536675, "grad_norm": 0.29924550507004927, "learning_rate": 4.616110405502903e-06, "loss": 0.1367, "step": 4950 }, { "epoch": 1.4274963090994204, "grad_norm": 0.3084219881809167, "learning_rate": 4.594938004887963e-06, "loss": 0.1268, "step": 4955 }, { "epoch": 1.428936660545173, "grad_norm": 0.30194233813871496, "learning_rate": 4.57379977838144e-06, "loss": 0.1386, "step": 4960 }, { "epoch": 1.4303770119909258, "grad_norm": 0.2924906618475438, "learning_rate": 4.5526958596321415e-06, "loss": 0.1326, "step": 4965 }, { "epoch": 1.4318173634366786, "grad_norm": 0.2928117826425519, "learning_rate": 4.531626382071947e-06, "loss": 0.1337, "step": 4970 }, { "epoch": 1.4332577148824313, "grad_norm": 0.2890472449363505, "learning_rate": 4.510591478914984e-06, "loss": 0.1326, "step": 4975 }, { "epoch": 1.434698066328184, "grad_norm": 0.2816438145391565, "learning_rate": 4.489591283156778e-06, "loss": 0.1298, "step": 4980 }, { "epoch": 1.436138417773937, "grad_norm": 0.2632230619903958, "learning_rate": 4.468625927573411e-06, "loss": 0.1263, "step": 4985 }, { "epoch": 1.4375787692196895, "grad_norm": 0.29380703917060536, "learning_rate": 4.447695544720685e-06, "loss": 0.1385, "step": 4990 }, { "epoch": 1.4390191206654424, "grad_norm": 0.28185514155481595, "learning_rate": 4.426800266933291e-06, "loss": 0.1306, "step": 4995 }, { "epoch": 1.4404594721111952, "grad_norm": 0.2874696989018966, "learning_rate": 4.405940226323953e-06, "loss": 0.1311, "step": 5000 }, { "epoch": 1.4404594721111952, "eval_loss": 0.14263266324996948, "eval_runtime": 182.4281, "eval_samples_per_second": 9.889, "eval_steps_per_second": 2.472, "step": 5000 }, { "epoch": 1.4418998235569478, "grad_norm": 0.29035247275746917, "learning_rate": 4.385115554782608e-06, "loss": 0.1284, "step": 5005 }, { "epoch": 1.4433401750027006, "grad_norm": 0.2979091755169656, "learning_rate": 4.364326383975576e-06, "loss": 0.1393, "step": 5010 }, { "epoch": 1.4447805264484535, "grad_norm": 0.2851008473192101, "learning_rate": 4.343572845344699e-06, "loss": 0.1302, "step": 5015 }, { "epoch": 1.446220877894206, "grad_norm": 0.29510410689584077, "learning_rate": 4.3228550701065555e-06, "loss": 0.1295, "step": 5020 }, { "epoch": 1.4476612293399589, "grad_norm": 0.27703300197438524, "learning_rate": 4.302173189251592e-06, "loss": 0.1277, "step": 5025 }, { "epoch": 1.4491015807857117, "grad_norm": 0.2762512635985658, "learning_rate": 4.281527333543304e-06, "loss": 0.135, "step": 5030 }, { "epoch": 1.4505419322314645, "grad_norm": 0.3041165339805363, "learning_rate": 4.260917633517432e-06, "loss": 0.1243, "step": 5035 }, { "epoch": 1.4519822836772174, "grad_norm": 0.3060868972240773, "learning_rate": 4.2403442194811015e-06, "loss": 0.1298, "step": 5040 }, { "epoch": 1.45342263512297, "grad_norm": 0.2893090691141548, "learning_rate": 4.2198072215120234e-06, "loss": 0.1364, "step": 5045 }, { "epoch": 1.4548629865687228, "grad_norm": 0.31038762857518265, "learning_rate": 4.1993067694576604e-06, "loss": 0.1319, "step": 5050 }, { "epoch": 1.4563033380144756, "grad_norm": 0.2812918503365721, "learning_rate": 4.178842992934412e-06, "loss": 0.1262, "step": 5055 }, { "epoch": 1.4577436894602283, "grad_norm": 0.2850284783778579, "learning_rate": 4.158416021326787e-06, "loss": 0.1296, "step": 5060 }, { "epoch": 1.459184040905981, "grad_norm": 0.32065754223579734, "learning_rate": 4.138025983786606e-06, "loss": 0.1328, "step": 5065 }, { "epoch": 1.460624392351734, "grad_norm": 0.34520060614106257, "learning_rate": 4.117673009232155e-06, "loss": 0.1323, "step": 5070 }, { "epoch": 1.4620647437974865, "grad_norm": 0.28913376259082474, "learning_rate": 4.097357226347385e-06, "loss": 0.122, "step": 5075 }, { "epoch": 1.4635050952432394, "grad_norm": 0.3380735998941071, "learning_rate": 4.077078763581112e-06, "loss": 0.1331, "step": 5080 }, { "epoch": 1.4649454466889922, "grad_norm": 0.32045923135465526, "learning_rate": 4.056837749146176e-06, "loss": 0.13, "step": 5085 }, { "epoch": 1.4663857981347448, "grad_norm": 0.3204047195205385, "learning_rate": 4.036634311018657e-06, "loss": 0.1271, "step": 5090 }, { "epoch": 1.4678261495804976, "grad_norm": 0.3177103619589229, "learning_rate": 4.016468576937048e-06, "loss": 0.1313, "step": 5095 }, { "epoch": 1.4692665010262504, "grad_norm": 0.3170687328843362, "learning_rate": 3.996340674401452e-06, "loss": 0.143, "step": 5100 }, { "epoch": 1.470706852472003, "grad_norm": 0.31482701115246225, "learning_rate": 3.976250730672789e-06, "loss": 0.1267, "step": 5105 }, { "epoch": 1.4721472039177559, "grad_norm": 0.29749419127247917, "learning_rate": 3.95619887277197e-06, "loss": 0.1355, "step": 5110 }, { "epoch": 1.4735875553635087, "grad_norm": 0.2721513816837507, "learning_rate": 3.936185227479104e-06, "loss": 0.1262, "step": 5115 }, { "epoch": 1.4750279068092613, "grad_norm": 0.28848818134175824, "learning_rate": 3.91620992133271e-06, "loss": 0.14, "step": 5120 }, { "epoch": 1.4764682582550142, "grad_norm": 0.2878594948643242, "learning_rate": 3.896273080628881e-06, "loss": 0.1256, "step": 5125 }, { "epoch": 1.477908609700767, "grad_norm": 0.295688557911149, "learning_rate": 3.876374831420523e-06, "loss": 0.1326, "step": 5130 }, { "epoch": 1.4793489611465198, "grad_norm": 0.29623849761201715, "learning_rate": 3.856515299516545e-06, "loss": 0.132, "step": 5135 }, { "epoch": 1.4807893125922726, "grad_norm": 0.30600381790452613, "learning_rate": 3.8366946104810535e-06, "loss": 0.1319, "step": 5140 }, { "epoch": 1.4822296640380253, "grad_norm": 0.2829908397887632, "learning_rate": 3.816912889632567e-06, "loss": 0.1304, "step": 5145 }, { "epoch": 1.483670015483778, "grad_norm": 0.2859310181485027, "learning_rate": 3.7971702620432306e-06, "loss": 0.1291, "step": 5150 }, { "epoch": 1.485110366929531, "grad_norm": 0.2903900767146654, "learning_rate": 3.777466852538012e-06, "loss": 0.1269, "step": 5155 }, { "epoch": 1.4865507183752835, "grad_norm": 0.30693062898146356, "learning_rate": 3.757802785693919e-06, "loss": 0.1227, "step": 5160 }, { "epoch": 1.4879910698210363, "grad_norm": 0.2920336924699934, "learning_rate": 3.738178185839212e-06, "loss": 0.1298, "step": 5165 }, { "epoch": 1.4894314212667892, "grad_norm": 0.2945995741156779, "learning_rate": 3.718593177052611e-06, "loss": 0.1296, "step": 5170 }, { "epoch": 1.4908717727125418, "grad_norm": 0.281519581967037, "learning_rate": 3.699047883162531e-06, "loss": 0.1348, "step": 5175 }, { "epoch": 1.4923121241582946, "grad_norm": 0.2942529713423248, "learning_rate": 3.679542427746272e-06, "loss": 0.1285, "step": 5180 }, { "epoch": 1.4937524756040474, "grad_norm": 0.2590754218015783, "learning_rate": 3.660076934129253e-06, "loss": 0.1198, "step": 5185 }, { "epoch": 1.4951928270498, "grad_norm": 0.3110962189352441, "learning_rate": 3.6406515253842433e-06, "loss": 0.1329, "step": 5190 }, { "epoch": 1.4966331784955529, "grad_norm": 0.3012107480072597, "learning_rate": 3.621266324330548e-06, "loss": 0.1269, "step": 5195 }, { "epoch": 1.4980735299413057, "grad_norm": 0.27989842275949034, "learning_rate": 3.601921453533269e-06, "loss": 0.1264, "step": 5200 }, { "epoch": 1.4995138813870583, "grad_norm": 0.27816202646485955, "learning_rate": 3.582617035302519e-06, "loss": 0.1353, "step": 5205 }, { "epoch": 1.5009542328328112, "grad_norm": 0.28449873805759873, "learning_rate": 3.5633531916926355e-06, "loss": 0.1401, "step": 5210 }, { "epoch": 1.502394584278564, "grad_norm": 0.2939328466462327, "learning_rate": 3.5441300445014204e-06, "loss": 0.1309, "step": 5215 }, { "epoch": 1.5038349357243166, "grad_norm": 0.27676106704556663, "learning_rate": 3.5249477152693746e-06, "loss": 0.1255, "step": 5220 }, { "epoch": 1.5052752871700696, "grad_norm": 0.3091437198894899, "learning_rate": 3.5058063252789164e-06, "loss": 0.1337, "step": 5225 }, { "epoch": 1.5067156386158223, "grad_norm": 0.29089963493957577, "learning_rate": 3.486705995553623e-06, "loss": 0.1179, "step": 5230 }, { "epoch": 1.5081559900615749, "grad_norm": 0.28563278658009045, "learning_rate": 3.467646846857462e-06, "loss": 0.1324, "step": 5235 }, { "epoch": 1.509596341507328, "grad_norm": 0.27308010285162093, "learning_rate": 3.448628999694028e-06, "loss": 0.131, "step": 5240 }, { "epoch": 1.5110366929530805, "grad_norm": 0.2692168630506388, "learning_rate": 3.4296525743057917e-06, "loss": 0.1245, "step": 5245 }, { "epoch": 1.5124770443988333, "grad_norm": 0.28589432120186975, "learning_rate": 3.4107176906733186e-06, "loss": 0.1395, "step": 5250 }, { "epoch": 1.5139173958445862, "grad_norm": 0.3230293467940769, "learning_rate": 3.3918244685145273e-06, "loss": 0.1239, "step": 5255 }, { "epoch": 1.5153577472903388, "grad_norm": 0.2724298060757966, "learning_rate": 3.3729730272839236e-06, "loss": 0.1243, "step": 5260 }, { "epoch": 1.5167980987360916, "grad_norm": 0.2875130873464737, "learning_rate": 3.3541634861718586e-06, "loss": 0.1267, "step": 5265 }, { "epoch": 1.5182384501818444, "grad_norm": 0.2545525280708859, "learning_rate": 3.335395964103746e-06, "loss": 0.1257, "step": 5270 }, { "epoch": 1.519678801627597, "grad_norm": 0.28758741388989995, "learning_rate": 3.3166705797393505e-06, "loss": 0.1218, "step": 5275 }, { "epoch": 1.5211191530733499, "grad_norm": 0.28405695915470375, "learning_rate": 3.2979874514720044e-06, "loss": 0.1304, "step": 5280 }, { "epoch": 1.5225595045191027, "grad_norm": 0.30359930663025747, "learning_rate": 3.2793466974278698e-06, "loss": 0.1312, "step": 5285 }, { "epoch": 1.5239998559648553, "grad_norm": 0.2739393344439081, "learning_rate": 3.2607484354652053e-06, "loss": 0.1266, "step": 5290 }, { "epoch": 1.5254402074106082, "grad_norm": 0.29944033968813727, "learning_rate": 3.2421927831735946e-06, "loss": 0.1272, "step": 5295 }, { "epoch": 1.526880558856361, "grad_norm": 0.2954139006807455, "learning_rate": 3.2236798578732243e-06, "loss": 0.124, "step": 5300 }, { "epoch": 1.5283209103021136, "grad_norm": 0.29210672998811726, "learning_rate": 3.2052097766141333e-06, "loss": 0.1318, "step": 5305 }, { "epoch": 1.5297612617478666, "grad_norm": 0.2600604904032675, "learning_rate": 3.1867826561754734e-06, "loss": 0.1249, "step": 5310 }, { "epoch": 1.5312016131936192, "grad_norm": 0.30502890550059597, "learning_rate": 3.168398613064769e-06, "loss": 0.1331, "step": 5315 }, { "epoch": 1.5326419646393719, "grad_norm": 0.2927272597507059, "learning_rate": 3.150057763517195e-06, "loss": 0.1398, "step": 5320 }, { "epoch": 1.534082316085125, "grad_norm": 0.27938831535091735, "learning_rate": 3.1317602234948176e-06, "loss": 0.1263, "step": 5325 }, { "epoch": 1.5355226675308775, "grad_norm": 0.3173797555345694, "learning_rate": 3.1135061086858744e-06, "loss": 0.1331, "step": 5330 }, { "epoch": 1.5369630189766303, "grad_norm": 0.28629187116885524, "learning_rate": 3.0952955345040536e-06, "loss": 0.1232, "step": 5335 }, { "epoch": 1.5384033704223832, "grad_norm": 0.315475203948924, "learning_rate": 3.0771286160877422e-06, "loss": 0.1361, "step": 5340 }, { "epoch": 1.5398437218681358, "grad_norm": 0.3090927948177035, "learning_rate": 3.0590054682993107e-06, "loss": 0.1329, "step": 5345 }, { "epoch": 1.5412840733138886, "grad_norm": 0.2952878570632112, "learning_rate": 3.0409262057243873e-06, "loss": 0.1307, "step": 5350 }, { "epoch": 1.5427244247596414, "grad_norm": 0.292962868707295, "learning_rate": 3.022890942671126e-06, "loss": 0.1223, "step": 5355 }, { "epoch": 1.544164776205394, "grad_norm": 0.268435660253563, "learning_rate": 3.004899793169499e-06, "loss": 0.1219, "step": 5360 }, { "epoch": 1.5456051276511469, "grad_norm": 0.2756367576477082, "learning_rate": 2.986952870970555e-06, "loss": 0.1326, "step": 5365 }, { "epoch": 1.5470454790968997, "grad_norm": 0.28750683938474214, "learning_rate": 2.969050289545714e-06, "loss": 0.1314, "step": 5370 }, { "epoch": 1.5484858305426523, "grad_norm": 0.27186607640486593, "learning_rate": 2.9511921620860564e-06, "loss": 0.1265, "step": 5375 }, { "epoch": 1.5499261819884051, "grad_norm": 0.28628288268866514, "learning_rate": 2.9333786015015785e-06, "loss": 0.1355, "step": 5380 }, { "epoch": 1.551366533434158, "grad_norm": 0.31905004510449597, "learning_rate": 2.9156097204205067e-06, "loss": 0.1322, "step": 5385 }, { "epoch": 1.5528068848799106, "grad_norm": 0.26846823715130697, "learning_rate": 2.897885631188585e-06, "loss": 0.1318, "step": 5390 }, { "epoch": 1.5542472363256634, "grad_norm": 0.2910887451967936, "learning_rate": 2.8802064458683455e-06, "loss": 0.1295, "step": 5395 }, { "epoch": 1.5556875877714162, "grad_norm": 0.29876503104887947, "learning_rate": 2.862572276238407e-06, "loss": 0.1326, "step": 5400 }, { "epoch": 1.5571279392171689, "grad_norm": 0.2747178203316531, "learning_rate": 2.844983233792785e-06, "loss": 0.1233, "step": 5405 }, { "epoch": 1.558568290662922, "grad_norm": 0.2860909643835414, "learning_rate": 2.827439429740164e-06, "loss": 0.1256, "step": 5410 }, { "epoch": 1.5600086421086745, "grad_norm": 0.2731315827900148, "learning_rate": 2.8099409750032035e-06, "loss": 0.131, "step": 5415 }, { "epoch": 1.5614489935544271, "grad_norm": 0.28706730422047516, "learning_rate": 2.7924879802178395e-06, "loss": 0.1277, "step": 5420 }, { "epoch": 1.5628893450001802, "grad_norm": 0.3052258135371807, "learning_rate": 2.77508055573258e-06, "loss": 0.1207, "step": 5425 }, { "epoch": 1.5643296964459328, "grad_norm": 0.2851069995923877, "learning_rate": 2.7577188116078148e-06, "loss": 0.1299, "step": 5430 }, { "epoch": 1.5657700478916856, "grad_norm": 0.2951839634863107, "learning_rate": 2.74040285761511e-06, "loss": 0.1304, "step": 5435 }, { "epoch": 1.5672103993374384, "grad_norm": 0.26286219302398806, "learning_rate": 2.723132803236517e-06, "loss": 0.1235, "step": 5440 }, { "epoch": 1.568650750783191, "grad_norm": 0.2979441766077297, "learning_rate": 2.7059087576638876e-06, "loss": 0.1256, "step": 5445 }, { "epoch": 1.5700911022289439, "grad_norm": 0.3205557150437196, "learning_rate": 2.6887308297981775e-06, "loss": 0.1371, "step": 5450 }, { "epoch": 1.5715314536746967, "grad_norm": 0.2721271632458455, "learning_rate": 2.6715991282487454e-06, "loss": 0.1332, "step": 5455 }, { "epoch": 1.5729718051204493, "grad_norm": 0.27278176079279876, "learning_rate": 2.6545137613326968e-06, "loss": 0.1276, "step": 5460 }, { "epoch": 1.5744121565662021, "grad_norm": 0.29700660323704126, "learning_rate": 2.63747483707417e-06, "loss": 0.1233, "step": 5465 }, { "epoch": 1.575852508011955, "grad_norm": 0.2825088591042487, "learning_rate": 2.620482463203665e-06, "loss": 0.1384, "step": 5470 }, { "epoch": 1.5772928594577076, "grad_norm": 0.29095414150571464, "learning_rate": 2.6035367471573712e-06, "loss": 0.1296, "step": 5475 }, { "epoch": 1.5787332109034604, "grad_norm": 0.2663708625782063, "learning_rate": 2.586637796076468e-06, "loss": 0.1266, "step": 5480 }, { "epoch": 1.5801735623492132, "grad_norm": 0.3045977656337426, "learning_rate": 2.569785716806462e-06, "loss": 0.1296, "step": 5485 }, { "epoch": 1.5816139137949659, "grad_norm": 0.31566523598693275, "learning_rate": 2.5529806158965065e-06, "loss": 0.1342, "step": 5490 }, { "epoch": 1.5830542652407187, "grad_norm": 0.30588183705349636, "learning_rate": 2.5362225995987277e-06, "loss": 0.1319, "step": 5495 }, { "epoch": 1.5844946166864715, "grad_norm": 0.29519798602614045, "learning_rate": 2.5195117738675625e-06, "loss": 0.1321, "step": 5500 }, { "epoch": 1.5844946166864715, "eval_loss": 0.14166609942913055, "eval_runtime": 183.2221, "eval_samples_per_second": 9.846, "eval_steps_per_second": 2.461, "step": 5500 }, { "epoch": 1.5859349681322241, "grad_norm": 0.2909461656220088, "learning_rate": 2.502848244359071e-06, "loss": 0.1286, "step": 5505 }, { "epoch": 1.5873753195779772, "grad_norm": 0.2898578050354809, "learning_rate": 2.486232116430275e-06, "loss": 0.1342, "step": 5510 }, { "epoch": 1.5888156710237298, "grad_norm": 0.3086150696182557, "learning_rate": 2.469663495138509e-06, "loss": 0.1295, "step": 5515 }, { "epoch": 1.5902560224694824, "grad_norm": 0.30421519295681765, "learning_rate": 2.4531424852407316e-06, "loss": 0.1335, "step": 5520 }, { "epoch": 1.5916963739152354, "grad_norm": 0.288178893935303, "learning_rate": 2.436669191192864e-06, "loss": 0.1272, "step": 5525 }, { "epoch": 1.593136725360988, "grad_norm": 0.2860066401966028, "learning_rate": 2.420243717149159e-06, "loss": 0.1333, "step": 5530 }, { "epoch": 1.5945770768067409, "grad_norm": 0.29160475999502244, "learning_rate": 2.403866166961507e-06, "loss": 0.1267, "step": 5535 }, { "epoch": 1.5960174282524937, "grad_norm": 0.2755214718192039, "learning_rate": 2.3875366441787984e-06, "loss": 0.121, "step": 5540 }, { "epoch": 1.5974577796982463, "grad_norm": 0.2779639622212633, "learning_rate": 2.3712552520462683e-06, "loss": 0.1269, "step": 5545 }, { "epoch": 1.5988981311439991, "grad_norm": 0.26597538696786716, "learning_rate": 2.3550220935048375e-06, "loss": 0.1223, "step": 5550 }, { "epoch": 1.600338482589752, "grad_norm": 0.2865145501913996, "learning_rate": 2.338837271190464e-06, "loss": 0.1266, "step": 5555 }, { "epoch": 1.6017788340355046, "grad_norm": 0.27749201843913596, "learning_rate": 2.3227008874334943e-06, "loss": 0.1255, "step": 5560 }, { "epoch": 1.6032191854812574, "grad_norm": 0.2772893573934755, "learning_rate": 2.306613044258017e-06, "loss": 0.1277, "step": 5565 }, { "epoch": 1.6046595369270102, "grad_norm": 0.28121537003540437, "learning_rate": 2.290573843381222e-06, "loss": 0.1307, "step": 5570 }, { "epoch": 1.6060998883727629, "grad_norm": 0.28006234124775475, "learning_rate": 2.2745833862127466e-06, "loss": 0.1265, "step": 5575 }, { "epoch": 1.6075402398185157, "grad_norm": 0.28516117847488853, "learning_rate": 2.258641773854041e-06, "loss": 0.1279, "step": 5580 }, { "epoch": 1.6089805912642685, "grad_norm": 0.29225443516823396, "learning_rate": 2.242749107097736e-06, "loss": 0.1198, "step": 5585 }, { "epoch": 1.6104209427100211, "grad_norm": 0.29078063045804087, "learning_rate": 2.226905486426989e-06, "loss": 0.1254, "step": 5590 }, { "epoch": 1.6118612941557742, "grad_norm": 0.3159257252559826, "learning_rate": 2.2111110120148638e-06, "loss": 0.1338, "step": 5595 }, { "epoch": 1.6133016456015268, "grad_norm": 0.27166305277438857, "learning_rate": 2.1953657837236887e-06, "loss": 0.14, "step": 5600 }, { "epoch": 1.6147419970472794, "grad_norm": 0.3188347122514508, "learning_rate": 2.17966990110443e-06, "loss": 0.1397, "step": 5605 }, { "epoch": 1.6161823484930324, "grad_norm": 0.3145543667358082, "learning_rate": 2.1640234633960544e-06, "loss": 0.1295, "step": 5610 }, { "epoch": 1.617622699938785, "grad_norm": 0.2893812485783845, "learning_rate": 2.1484265695249205e-06, "loss": 0.1224, "step": 5615 }, { "epoch": 1.6190630513845379, "grad_norm": 0.3286526294166698, "learning_rate": 2.1328793181041284e-06, "loss": 0.129, "step": 5620 }, { "epoch": 1.6205034028302907, "grad_norm": 0.30207325112357286, "learning_rate": 2.11738180743291e-06, "loss": 0.1291, "step": 5625 }, { "epoch": 1.6219437542760433, "grad_norm": 0.3098575022621966, "learning_rate": 2.101934135496018e-06, "loss": 0.1395, "step": 5630 }, { "epoch": 1.6233841057217961, "grad_norm": 0.30020432664082614, "learning_rate": 2.0865363999630704e-06, "loss": 0.1259, "step": 5635 }, { "epoch": 1.624824457167549, "grad_norm": 0.2862022859486779, "learning_rate": 2.0711886981879812e-06, "loss": 0.1293, "step": 5640 }, { "epoch": 1.6262648086133016, "grad_norm": 0.2869456622784833, "learning_rate": 2.055891127208306e-06, "loss": 0.1234, "step": 5645 }, { "epoch": 1.6277051600590544, "grad_norm": 0.3025687824497439, "learning_rate": 2.0406437837446446e-06, "loss": 0.1317, "step": 5650 }, { "epoch": 1.6291455115048072, "grad_norm": 0.31367247268097065, "learning_rate": 2.025446764200034e-06, "loss": 0.1197, "step": 5655 }, { "epoch": 1.6305858629505599, "grad_norm": 0.3081066945029777, "learning_rate": 2.0103001646593277e-06, "loss": 0.1355, "step": 5660 }, { "epoch": 1.6320262143963127, "grad_norm": 0.30372574948208725, "learning_rate": 1.995204080888592e-06, "loss": 0.1273, "step": 5665 }, { "epoch": 1.6334665658420655, "grad_norm": 0.2773828728322217, "learning_rate": 1.980158608334504e-06, "loss": 0.1253, "step": 5670 }, { "epoch": 1.6349069172878181, "grad_norm": 0.2685031281995709, "learning_rate": 1.965163842123745e-06, "loss": 0.1213, "step": 5675 }, { "epoch": 1.636347268733571, "grad_norm": 0.2772769131129277, "learning_rate": 1.950219877062397e-06, "loss": 0.1311, "step": 5680 }, { "epoch": 1.6377876201793238, "grad_norm": 0.27691142659726703, "learning_rate": 1.935326807635355e-06, "loss": 0.1234, "step": 5685 }, { "epoch": 1.6392279716250764, "grad_norm": 0.27592948769412723, "learning_rate": 1.9204847280057117e-06, "loss": 0.1309, "step": 5690 }, { "epoch": 1.6406683230708294, "grad_norm": 0.30816287352415167, "learning_rate": 1.90569373201417e-06, "loss": 0.1247, "step": 5695 }, { "epoch": 1.642108674516582, "grad_norm": 0.2945379079952402, "learning_rate": 1.8909539131784616e-06, "loss": 0.1304, "step": 5700 }, { "epoch": 1.6435490259623347, "grad_norm": 0.29639169392730347, "learning_rate": 1.8762653646927354e-06, "loss": 0.1305, "step": 5705 }, { "epoch": 1.6449893774080877, "grad_norm": 0.3005355791166683, "learning_rate": 1.8616281794269797e-06, "loss": 0.1311, "step": 5710 }, { "epoch": 1.6464297288538403, "grad_norm": 0.3066605889100896, "learning_rate": 1.847042449926435e-06, "loss": 0.1292, "step": 5715 }, { "epoch": 1.6478700802995931, "grad_norm": 0.29468421114638166, "learning_rate": 1.8325082684110017e-06, "loss": 0.1339, "step": 5720 }, { "epoch": 1.649310431745346, "grad_norm": 0.278781498205819, "learning_rate": 1.8180257267746726e-06, "loss": 0.1321, "step": 5725 }, { "epoch": 1.6507507831910986, "grad_norm": 0.2948317736555441, "learning_rate": 1.8035949165849332e-06, "loss": 0.1288, "step": 5730 }, { "epoch": 1.6521911346368514, "grad_norm": 0.2726210628368952, "learning_rate": 1.7892159290821931e-06, "loss": 0.1205, "step": 5735 }, { "epoch": 1.6536314860826042, "grad_norm": 0.29140658348322845, "learning_rate": 1.7748888551792077e-06, "loss": 0.1367, "step": 5740 }, { "epoch": 1.6550718375283568, "grad_norm": 0.28048412965835157, "learning_rate": 1.760613785460501e-06, "loss": 0.1263, "step": 5745 }, { "epoch": 1.6565121889741097, "grad_norm": 0.28351884101089136, "learning_rate": 1.7463908101817962e-06, "loss": 0.1254, "step": 5750 }, { "epoch": 1.6579525404198625, "grad_norm": 0.27922439198383014, "learning_rate": 1.7322200192694471e-06, "loss": 0.1182, "step": 5755 }, { "epoch": 1.6593928918656151, "grad_norm": 0.2819390130377161, "learning_rate": 1.718101502319861e-06, "loss": 0.1268, "step": 5760 }, { "epoch": 1.660833243311368, "grad_norm": 0.3055312304432241, "learning_rate": 1.704035348598937e-06, "loss": 0.1271, "step": 5765 }, { "epoch": 1.6622735947571208, "grad_norm": 0.2634592274743398, "learning_rate": 1.6900216470415076e-06, "loss": 0.1215, "step": 5770 }, { "epoch": 1.6637139462028734, "grad_norm": 0.30635125652537465, "learning_rate": 1.6760604862507645e-06, "loss": 0.1304, "step": 5775 }, { "epoch": 1.6651542976486262, "grad_norm": 0.283548657450537, "learning_rate": 1.6621519544977072e-06, "loss": 0.1279, "step": 5780 }, { "epoch": 1.666594649094379, "grad_norm": 0.29058908783134074, "learning_rate": 1.648296139720581e-06, "loss": 0.1298, "step": 5785 }, { "epoch": 1.6680350005401317, "grad_norm": 0.28167987115880083, "learning_rate": 1.634493129524325e-06, "loss": 0.1239, "step": 5790 }, { "epoch": 1.6694753519858847, "grad_norm": 0.2676116536078819, "learning_rate": 1.6207430111800081e-06, "loss": 0.1324, "step": 5795 }, { "epoch": 1.6709157034316373, "grad_norm": 0.2954262440265811, "learning_rate": 1.6070458716242977e-06, "loss": 0.1274, "step": 5800 }, { "epoch": 1.67235605487739, "grad_norm": 0.30524434113838955, "learning_rate": 1.5934017974588845e-06, "loss": 0.1332, "step": 5805 }, { "epoch": 1.673796406323143, "grad_norm": 0.2723087735878449, "learning_rate": 1.5798108749499542e-06, "loss": 0.1309, "step": 5810 }, { "epoch": 1.6752367577688956, "grad_norm": 0.276644502192923, "learning_rate": 1.5662731900276307e-06, "loss": 0.1288, "step": 5815 }, { "epoch": 1.6766771092146484, "grad_norm": 0.2843596250126526, "learning_rate": 1.5527888282854386e-06, "loss": 0.1271, "step": 5820 }, { "epoch": 1.6781174606604012, "grad_norm": 0.3050371869749407, "learning_rate": 1.5393578749797667e-06, "loss": 0.1277, "step": 5825 }, { "epoch": 1.6795578121061538, "grad_norm": 0.30443412637747763, "learning_rate": 1.5259804150293144e-06, "loss": 0.1264, "step": 5830 }, { "epoch": 1.6809981635519067, "grad_norm": 0.28129272818555606, "learning_rate": 1.512656533014566e-06, "loss": 0.131, "step": 5835 }, { "epoch": 1.6824385149976595, "grad_norm": 0.2744057680465857, "learning_rate": 1.499386313177258e-06, "loss": 0.1326, "step": 5840 }, { "epoch": 1.6838788664434121, "grad_norm": 0.31846251340497744, "learning_rate": 1.4861698394198366e-06, "loss": 0.1331, "step": 5845 }, { "epoch": 1.685319217889165, "grad_norm": 0.30051590436367076, "learning_rate": 1.473007195304934e-06, "loss": 0.1263, "step": 5850 }, { "epoch": 1.6867595693349178, "grad_norm": 0.2793350788441144, "learning_rate": 1.4598984640548375e-06, "loss": 0.1345, "step": 5855 }, { "epoch": 1.6881999207806704, "grad_norm": 0.2707364156107711, "learning_rate": 1.4468437285509652e-06, "loss": 0.1284, "step": 5860 }, { "epoch": 1.6896402722264232, "grad_norm": 0.314258875396424, "learning_rate": 1.4338430713333397e-06, "loss": 0.1282, "step": 5865 }, { "epoch": 1.691080623672176, "grad_norm": 0.3003554107965295, "learning_rate": 1.4208965746000725e-06, "loss": 0.1321, "step": 5870 }, { "epoch": 1.6925209751179286, "grad_norm": 0.2953755650564155, "learning_rate": 1.408004320206835e-06, "loss": 0.1287, "step": 5875 }, { "epoch": 1.6939613265636817, "grad_norm": 0.26321096545532147, "learning_rate": 1.3951663896663426e-06, "loss": 0.1215, "step": 5880 }, { "epoch": 1.6954016780094343, "grad_norm": 0.29649122509259834, "learning_rate": 1.3823828641478532e-06, "loss": 0.1288, "step": 5885 }, { "epoch": 1.696842029455187, "grad_norm": 0.2990214517225168, "learning_rate": 1.3696538244766256e-06, "loss": 0.1279, "step": 5890 }, { "epoch": 1.69828238090094, "grad_norm": 0.31449726804552053, "learning_rate": 1.3569793511334416e-06, "loss": 0.1412, "step": 5895 }, { "epoch": 1.6997227323466926, "grad_norm": 0.31375582105120253, "learning_rate": 1.3443595242540753e-06, "loss": 0.1355, "step": 5900 }, { "epoch": 1.7011630837924454, "grad_norm": 0.28341031790692495, "learning_rate": 1.3317944236287882e-06, "loss": 0.1214, "step": 5905 }, { "epoch": 1.7026034352381982, "grad_norm": 0.2899396718777641, "learning_rate": 1.3192841287018376e-06, "loss": 0.1301, "step": 5910 }, { "epoch": 1.7040437866839508, "grad_norm": 0.2735047037867141, "learning_rate": 1.3068287185709584e-06, "loss": 0.1287, "step": 5915 }, { "epoch": 1.7054841381297037, "grad_norm": 0.2763878558281379, "learning_rate": 1.2944282719868739e-06, "loss": 0.1299, "step": 5920 }, { "epoch": 1.7069244895754565, "grad_norm": 0.29905907907114176, "learning_rate": 1.282082867352794e-06, "loss": 0.1285, "step": 5925 }, { "epoch": 1.7083648410212091, "grad_norm": 0.2768076492722104, "learning_rate": 1.2697925827239166e-06, "loss": 0.1301, "step": 5930 }, { "epoch": 1.709805192466962, "grad_norm": 0.27653267521768804, "learning_rate": 1.2575574958069392e-06, "loss": 0.1212, "step": 5935 }, { "epoch": 1.7112455439127148, "grad_norm": 0.2764018528709046, "learning_rate": 1.24537768395957e-06, "loss": 0.1233, "step": 5940 }, { "epoch": 1.7126858953584674, "grad_norm": 0.28421347311440187, "learning_rate": 1.2332532241900275e-06, "loss": 0.133, "step": 5945 }, { "epoch": 1.7141262468042202, "grad_norm": 0.2889271677627283, "learning_rate": 1.2211841931565615e-06, "loss": 0.1321, "step": 5950 }, { "epoch": 1.715566598249973, "grad_norm": 0.3030014581222401, "learning_rate": 1.2091706671669746e-06, "loss": 0.1381, "step": 5955 }, { "epoch": 1.7170069496957256, "grad_norm": 0.27428273725744723, "learning_rate": 1.1972127221781238e-06, "loss": 0.1285, "step": 5960 }, { "epoch": 1.7184473011414785, "grad_norm": 0.3008427383898093, "learning_rate": 1.1853104337954535e-06, "loss": 0.1306, "step": 5965 }, { "epoch": 1.7198876525872313, "grad_norm": 0.3229248441404968, "learning_rate": 1.1734638772725104e-06, "loss": 0.1334, "step": 5970 }, { "epoch": 1.721328004032984, "grad_norm": 0.30334834034021635, "learning_rate": 1.161673127510472e-06, "loss": 0.1266, "step": 5975 }, { "epoch": 1.722768355478737, "grad_norm": 0.2852528997722053, "learning_rate": 1.1499382590576736e-06, "loss": 0.1276, "step": 5980 }, { "epoch": 1.7242087069244896, "grad_norm": 0.2895562296977625, "learning_rate": 1.1382593461091308e-06, "loss": 0.1319, "step": 5985 }, { "epoch": 1.7256490583702422, "grad_norm": 0.2941343826737447, "learning_rate": 1.1266364625060722e-06, "loss": 0.1324, "step": 5990 }, { "epoch": 1.7270894098159952, "grad_norm": 0.2735135243510112, "learning_rate": 1.1150696817354867e-06, "loss": 0.1318, "step": 5995 }, { "epoch": 1.7285297612617478, "grad_norm": 0.2762792761084767, "learning_rate": 1.1035590769296313e-06, "loss": 0.1354, "step": 6000 }, { "epoch": 1.7285297612617478, "eval_loss": 0.14099709689617157, "eval_runtime": 186.76, "eval_samples_per_second": 9.659, "eval_steps_per_second": 2.415, "step": 6000 }, { "epoch": 1.7299701127075007, "grad_norm": 0.2822759449386405, "learning_rate": 1.09210472086559e-06, "loss": 0.1262, "step": 6005 }, { "epoch": 1.7314104641532535, "grad_norm": 0.2852954331322426, "learning_rate": 1.080706685964814e-06, "loss": 0.1291, "step": 6010 }, { "epoch": 1.7328508155990061, "grad_norm": 0.283633303505763, "learning_rate": 1.0693650442926496e-06, "loss": 0.1299, "step": 6015 }, { "epoch": 1.734291167044759, "grad_norm": 0.2893275038552144, "learning_rate": 1.058079867557893e-06, "loss": 0.1224, "step": 6020 }, { "epoch": 1.7357315184905118, "grad_norm": 0.2929228007525251, "learning_rate": 1.0468512271123376e-06, "loss": 0.1212, "step": 6025 }, { "epoch": 1.7371718699362644, "grad_norm": 0.276901291045506, "learning_rate": 1.0356791939503164e-06, "loss": 0.1277, "step": 6030 }, { "epoch": 1.7386122213820172, "grad_norm": 0.3125904701701172, "learning_rate": 1.0245638387082578e-06, "loss": 0.1264, "step": 6035 }, { "epoch": 1.74005257282777, "grad_norm": 0.2741364320107291, "learning_rate": 1.0135052316642358e-06, "loss": 0.1251, "step": 6040 }, { "epoch": 1.7414929242735226, "grad_norm": 0.3041429091229379, "learning_rate": 1.002503442737527e-06, "loss": 0.1314, "step": 6045 }, { "epoch": 1.7429332757192755, "grad_norm": 0.3070386341868555, "learning_rate": 9.915585414881767e-07, "loss": 0.1336, "step": 6050 }, { "epoch": 1.7443736271650283, "grad_norm": 0.2861168629205116, "learning_rate": 9.806705971165443e-07, "loss": 0.1294, "step": 6055 }, { "epoch": 1.745813978610781, "grad_norm": 0.28061574721986277, "learning_rate": 9.698396784628704e-07, "loss": 0.1249, "step": 6060 }, { "epoch": 1.7472543300565337, "grad_norm": 0.32122825311739844, "learning_rate": 9.590658540068564e-07, "loss": 0.1275, "step": 6065 }, { "epoch": 1.7486946815022866, "grad_norm": 0.3039376380929849, "learning_rate": 9.48349191867205e-07, "loss": 0.1318, "step": 6070 }, { "epoch": 1.7501350329480392, "grad_norm": 0.27031359733052707, "learning_rate": 9.376897598012102e-07, "loss": 0.1296, "step": 6075 }, { "epoch": 1.7515753843937922, "grad_norm": 0.27603252423746855, "learning_rate": 9.270876252043249e-07, "loss": 0.1255, "step": 6080 }, { "epoch": 1.7530157358395448, "grad_norm": 0.28168370555571337, "learning_rate": 9.165428551097288e-07, "loss": 0.1227, "step": 6085 }, { "epoch": 1.7544560872852974, "grad_norm": 0.29637741640626303, "learning_rate": 9.060555161879069e-07, "loss": 0.1345, "step": 6090 }, { "epoch": 1.7558964387310505, "grad_norm": 0.2912099853152085, "learning_rate": 8.956256747462367e-07, "loss": 0.1253, "step": 6095 }, { "epoch": 1.757336790176803, "grad_norm": 0.27901921713470756, "learning_rate": 8.852533967285515e-07, "loss": 0.1154, "step": 6100 }, { "epoch": 1.758777141622556, "grad_norm": 0.27957055801860914, "learning_rate": 8.749387477147408e-07, "loss": 0.1224, "step": 6105 }, { "epoch": 1.7602174930683088, "grad_norm": 0.27918555052505345, "learning_rate": 8.646817929203233e-07, "loss": 0.1251, "step": 6110 }, { "epoch": 1.7616578445140614, "grad_norm": 0.30402797081062277, "learning_rate": 8.544825971960402e-07, "loss": 0.1351, "step": 6115 }, { "epoch": 1.7630981959598142, "grad_norm": 0.27127335623081955, "learning_rate": 8.443412250274519e-07, "loss": 0.1314, "step": 6120 }, { "epoch": 1.764538547405567, "grad_norm": 0.28386937418963853, "learning_rate": 8.342577405345132e-07, "loss": 0.1185, "step": 6125 }, { "epoch": 1.7659788988513196, "grad_norm": 0.30462689876606047, "learning_rate": 8.242322074711806e-07, "loss": 0.128, "step": 6130 }, { "epoch": 1.7674192502970725, "grad_norm": 0.2679545912074383, "learning_rate": 8.142646892250106e-07, "loss": 0.1208, "step": 6135 }, { "epoch": 1.7688596017428253, "grad_norm": 0.30324932458216164, "learning_rate": 8.043552488167505e-07, "loss": 0.1253, "step": 6140 }, { "epoch": 1.770299953188578, "grad_norm": 0.2809028272767657, "learning_rate": 7.945039488999396e-07, "loss": 0.1365, "step": 6145 }, { "epoch": 1.7717403046343307, "grad_norm": 0.2921736188573899, "learning_rate": 7.847108517605284e-07, "loss": 0.1198, "step": 6150 }, { "epoch": 1.7731806560800836, "grad_norm": 0.3076568585771632, "learning_rate": 7.749760193164657e-07, "loss": 0.1252, "step": 6155 }, { "epoch": 1.7746210075258362, "grad_norm": 0.28028733764115277, "learning_rate": 7.652995131173146e-07, "loss": 0.1202, "step": 6160 }, { "epoch": 1.7760613589715892, "grad_norm": 0.27140081979265973, "learning_rate": 7.556813943438712e-07, "loss": 0.1278, "step": 6165 }, { "epoch": 1.7775017104173418, "grad_norm": 0.28397278127939163, "learning_rate": 7.461217238077656e-07, "loss": 0.1279, "step": 6170 }, { "epoch": 1.7789420618630944, "grad_norm": 0.29162913313509997, "learning_rate": 7.366205619510803e-07, "loss": 0.1222, "step": 6175 }, { "epoch": 1.7803824133088475, "grad_norm": 0.2831971876270899, "learning_rate": 7.271779688459746e-07, "loss": 0.1306, "step": 6180 }, { "epoch": 1.7818227647546, "grad_norm": 0.31140814074989936, "learning_rate": 7.177940041942965e-07, "loss": 0.1232, "step": 6185 }, { "epoch": 1.783263116200353, "grad_norm": 0.2961845757723407, "learning_rate": 7.084687273272139e-07, "loss": 0.1384, "step": 6190 }, { "epoch": 1.7847034676461058, "grad_norm": 0.2909880963891404, "learning_rate": 6.992021972048312e-07, "loss": 0.1358, "step": 6195 }, { "epoch": 1.7861438190918584, "grad_norm": 0.2847134635831081, "learning_rate": 6.899944724158192e-07, "loss": 0.1311, "step": 6200 }, { "epoch": 1.7875841705376112, "grad_norm": 0.30563787075961574, "learning_rate": 6.808456111770467e-07, "loss": 0.1181, "step": 6205 }, { "epoch": 1.789024521983364, "grad_norm": 0.28983189625700895, "learning_rate": 6.717556713332129e-07, "loss": 0.1291, "step": 6210 }, { "epoch": 1.7904648734291166, "grad_norm": 0.29330483143372066, "learning_rate": 6.627247103564771e-07, "loss": 0.1356, "step": 6215 }, { "epoch": 1.7919052248748695, "grad_norm": 0.3032075996039303, "learning_rate": 6.53752785346099e-07, "loss": 0.1252, "step": 6220 }, { "epoch": 1.7933455763206223, "grad_norm": 0.2872904446310524, "learning_rate": 6.44839953028078e-07, "loss": 0.1353, "step": 6225 }, { "epoch": 1.794785927766375, "grad_norm": 0.2749000307290741, "learning_rate": 6.359862697547891e-07, "loss": 0.1325, "step": 6230 }, { "epoch": 1.7962262792121277, "grad_norm": 0.29018671520624423, "learning_rate": 6.271917915046388e-07, "loss": 0.1334, "step": 6235 }, { "epoch": 1.7976666306578806, "grad_norm": 0.29181272959005167, "learning_rate": 6.184565738816961e-07, "loss": 0.1417, "step": 6240 }, { "epoch": 1.7991069821036332, "grad_norm": 0.2860722043841982, "learning_rate": 6.097806721153498e-07, "loss": 0.1251, "step": 6245 }, { "epoch": 1.800547333549386, "grad_norm": 0.2928724802398806, "learning_rate": 6.011641410599611e-07, "loss": 0.1281, "step": 6250 }, { "epoch": 1.8019876849951388, "grad_norm": 0.29674996515037555, "learning_rate": 5.926070351945079e-07, "loss": 0.1306, "step": 6255 }, { "epoch": 1.8034280364408914, "grad_norm": 0.29284969211694145, "learning_rate": 5.841094086222465e-07, "loss": 0.1228, "step": 6260 }, { "epoch": 1.8048683878866445, "grad_norm": 0.28914358868452567, "learning_rate": 5.756713150703752e-07, "loss": 0.1281, "step": 6265 }, { "epoch": 1.806308739332397, "grad_norm": 0.2784100141416387, "learning_rate": 5.6729280788968e-07, "loss": 0.1169, "step": 6270 }, { "epoch": 1.8077490907781497, "grad_norm": 0.26766994019437396, "learning_rate": 5.589739400542071e-07, "loss": 0.1226, "step": 6275 }, { "epoch": 1.8091894422239028, "grad_norm": 0.30757543672267906, "learning_rate": 5.507147641609334e-07, "loss": 0.1287, "step": 6280 }, { "epoch": 1.8106297936696554, "grad_norm": 0.2913516026723883, "learning_rate": 5.425153324294175e-07, "loss": 0.1259, "step": 6285 }, { "epoch": 1.8120701451154082, "grad_norm": 0.2833838801982532, "learning_rate": 5.343756967014846e-07, "loss": 0.1316, "step": 6290 }, { "epoch": 1.813510496561161, "grad_norm": 0.28126356941560415, "learning_rate": 5.262959084408891e-07, "loss": 0.1262, "step": 6295 }, { "epoch": 1.8149508480069136, "grad_norm": 0.2791890015076149, "learning_rate": 5.182760187329949e-07, "loss": 0.1253, "step": 6300 }, { "epoch": 1.8163911994526665, "grad_norm": 0.28325951768891716, "learning_rate": 5.103160782844541e-07, "loss": 0.1184, "step": 6305 }, { "epoch": 1.8178315508984193, "grad_norm": 0.28791275189471405, "learning_rate": 5.024161374228765e-07, "loss": 0.1378, "step": 6310 }, { "epoch": 1.819271902344172, "grad_norm": 0.29622226881158076, "learning_rate": 4.945762460965209e-07, "loss": 0.126, "step": 6315 }, { "epoch": 1.8207122537899247, "grad_norm": 0.30683589530685085, "learning_rate": 4.8679645387398e-07, "loss": 0.1371, "step": 6320 }, { "epoch": 1.8221526052356776, "grad_norm": 0.3082445698857907, "learning_rate": 4.790768099438558e-07, "loss": 0.1268, "step": 6325 }, { "epoch": 1.8235929566814302, "grad_norm": 0.31230886419869525, "learning_rate": 4.714173631144592e-07, "loss": 0.1333, "step": 6330 }, { "epoch": 1.825033308127183, "grad_norm": 0.2902616527080115, "learning_rate": 4.638181618135007e-07, "loss": 0.128, "step": 6335 }, { "epoch": 1.8264736595729358, "grad_norm": 0.29991008830228766, "learning_rate": 4.562792540877792e-07, "loss": 0.125, "step": 6340 }, { "epoch": 1.8279140110186884, "grad_norm": 0.3053693304563772, "learning_rate": 4.488006876028805e-07, "loss": 0.1251, "step": 6345 }, { "epoch": 1.8293543624644413, "grad_norm": 0.2841149117151109, "learning_rate": 4.413825096428781e-07, "loss": 0.1338, "step": 6350 }, { "epoch": 1.830794713910194, "grad_norm": 0.27769192328457887, "learning_rate": 4.3402476711002947e-07, "loss": 0.1236, "step": 6355 }, { "epoch": 1.8322350653559467, "grad_norm": 0.30137959777639806, "learning_rate": 4.2672750652448467e-07, "loss": 0.1348, "step": 6360 }, { "epoch": 1.8336754168016998, "grad_norm": 0.30779020386772094, "learning_rate": 4.1949077402399063e-07, "loss": 0.1289, "step": 6365 }, { "epoch": 1.8351157682474524, "grad_norm": 0.2932190435782668, "learning_rate": 4.1231461536359374e-07, "loss": 0.1244, "step": 6370 }, { "epoch": 1.836556119693205, "grad_norm": 0.2997160829285179, "learning_rate": 4.051990759153612e-07, "loss": 0.1246, "step": 6375 }, { "epoch": 1.837996471138958, "grad_norm": 0.2837660942585995, "learning_rate": 3.981442006680869e-07, "loss": 0.1305, "step": 6380 }, { "epoch": 1.8394368225847106, "grad_norm": 0.3161468834055083, "learning_rate": 3.911500342270058e-07, "loss": 0.128, "step": 6385 }, { "epoch": 1.8408771740304635, "grad_norm": 0.30244059578998417, "learning_rate": 3.842166208135201e-07, "loss": 0.1247, "step": 6390 }, { "epoch": 1.8423175254762163, "grad_norm": 0.27806142338453754, "learning_rate": 3.77344004264909e-07, "loss": 0.1143, "step": 6395 }, { "epoch": 1.843757876921969, "grad_norm": 0.30174417783454494, "learning_rate": 3.705322280340562e-07, "loss": 0.1236, "step": 6400 }, { "epoch": 1.8451982283677217, "grad_norm": 0.30013465943928136, "learning_rate": 3.637813351891806e-07, "loss": 0.1335, "step": 6405 }, { "epoch": 1.8466385798134746, "grad_norm": 0.2819689346445666, "learning_rate": 3.5709136841355686e-07, "loss": 0.1228, "step": 6410 }, { "epoch": 1.8480789312592272, "grad_norm": 0.2946026656198203, "learning_rate": 3.504623700052456e-07, "loss": 0.138, "step": 6415 }, { "epoch": 1.84951928270498, "grad_norm": 0.290038702050758, "learning_rate": 3.4389438187683146e-07, "loss": 0.1261, "step": 6420 }, { "epoch": 1.8509596341507328, "grad_norm": 0.2869715781228218, "learning_rate": 3.37387445555154e-07, "loss": 0.1223, "step": 6425 }, { "epoch": 1.8523999855964854, "grad_norm": 0.2833834600270098, "learning_rate": 3.30941602181043e-07, "loss": 0.121, "step": 6430 }, { "epoch": 1.8538403370422383, "grad_norm": 0.2850728665063182, "learning_rate": 3.2455689250906584e-07, "loss": 0.1258, "step": 6435 }, { "epoch": 1.855280688487991, "grad_norm": 0.29666657702209753, "learning_rate": 3.1823335690725933e-07, "loss": 0.1294, "step": 6440 }, { "epoch": 1.8567210399337437, "grad_norm": 0.26968192494466475, "learning_rate": 3.119710353568872e-07, "loss": 0.1165, "step": 6445 }, { "epoch": 1.8581613913794968, "grad_norm": 0.2996210613153865, "learning_rate": 3.0576996745217637e-07, "loss": 0.134, "step": 6450 }, { "epoch": 1.8596017428252494, "grad_norm": 0.293024903599958, "learning_rate": 2.996301924000711e-07, "loss": 0.1261, "step": 6455 }, { "epoch": 1.861042094271002, "grad_norm": 0.2766190559075753, "learning_rate": 2.935517490199857e-07, "loss": 0.123, "step": 6460 }, { "epoch": 1.862482445716755, "grad_norm": 0.2853304131204647, "learning_rate": 2.8753467574355707e-07, "loss": 0.1195, "step": 6465 }, { "epoch": 1.8639227971625076, "grad_norm": 0.3036274997665907, "learning_rate": 2.815790106144045e-07, "loss": 0.1327, "step": 6470 }, { "epoch": 1.8653631486082605, "grad_norm": 0.2899082654759326, "learning_rate": 2.756847912878846e-07, "loss": 0.1258, "step": 6475 }, { "epoch": 1.8668035000540133, "grad_norm": 0.269201950199654, "learning_rate": 2.698520550308581e-07, "loss": 0.1216, "step": 6480 }, { "epoch": 1.868243851499766, "grad_norm": 0.3061531908691431, "learning_rate": 2.640808387214522e-07, "loss": 0.1271, "step": 6485 }, { "epoch": 1.8696842029455187, "grad_norm": 0.3114060303595798, "learning_rate": 2.5837117884882743e-07, "loss": 0.1254, "step": 6490 }, { "epoch": 1.8711245543912716, "grad_norm": 0.2992475437515773, "learning_rate": 2.527231115129458e-07, "loss": 0.1258, "step": 6495 }, { "epoch": 1.8725649058370242, "grad_norm": 0.2830997835854172, "learning_rate": 2.4713667242434294e-07, "loss": 0.1316, "step": 6500 }, { "epoch": 1.8725649058370242, "eval_loss": 0.14066626131534576, "eval_runtime": 180.7398, "eval_samples_per_second": 9.981, "eval_steps_per_second": 2.495, "step": 6500 }, { "epoch": 1.874005257282777, "grad_norm": 0.30147030501531397, "learning_rate": 2.416118969039061e-07, "loss": 0.1276, "step": 6505 }, { "epoch": 1.8754456087285298, "grad_norm": 0.28892524030979483, "learning_rate": 2.361488198826445e-07, "loss": 0.127, "step": 6510 }, { "epoch": 1.8768859601742824, "grad_norm": 0.2953532817059129, "learning_rate": 2.3074747590147384e-07, "loss": 0.137, "step": 6515 }, { "epoch": 1.8783263116200353, "grad_norm": 0.25078807633095623, "learning_rate": 2.2540789911099536e-07, "loss": 0.1225, "step": 6520 }, { "epoch": 1.879766663065788, "grad_norm": 0.28249677086004654, "learning_rate": 2.2013012327127826e-07, "loss": 0.1253, "step": 6525 }, { "epoch": 1.8812070145115407, "grad_norm": 0.2850391716271125, "learning_rate": 2.1491418175165202e-07, "loss": 0.133, "step": 6530 }, { "epoch": 1.8826473659572935, "grad_norm": 0.28209719167913044, "learning_rate": 2.097601075304878e-07, "loss": 0.1243, "step": 6535 }, { "epoch": 1.8840877174030464, "grad_norm": 0.2877245048633624, "learning_rate": 2.0466793319499856e-07, "loss": 0.1344, "step": 6540 }, { "epoch": 1.885528068848799, "grad_norm": 0.283418662494498, "learning_rate": 1.9963769094102247e-07, "loss": 0.1317, "step": 6545 }, { "epoch": 1.886968420294552, "grad_norm": 0.2932428710876708, "learning_rate": 1.946694125728299e-07, "loss": 0.1319, "step": 6550 }, { "epoch": 1.8884087717403046, "grad_norm": 0.30112745695158777, "learning_rate": 1.8976312950291453e-07, "loss": 0.1256, "step": 6555 }, { "epoch": 1.8898491231860572, "grad_norm": 0.27640553764468206, "learning_rate": 1.8491887275180143e-07, "loss": 0.125, "step": 6560 }, { "epoch": 1.8912894746318103, "grad_norm": 0.2940276593318057, "learning_rate": 1.8013667294784376e-07, "loss": 0.1289, "step": 6565 }, { "epoch": 1.892729826077563, "grad_norm": 0.27144519568474473, "learning_rate": 1.7541656032703413e-07, "loss": 0.1354, "step": 6570 }, { "epoch": 1.8941701775233157, "grad_norm": 0.275764857359935, "learning_rate": 1.707585647328136e-07, "loss": 0.1292, "step": 6575 }, { "epoch": 1.8956105289690686, "grad_norm": 0.27755806619480555, "learning_rate": 1.6616271561587737e-07, "loss": 0.1188, "step": 6580 }, { "epoch": 1.8970508804148212, "grad_norm": 0.2997586600616719, "learning_rate": 1.6162904203399722e-07, "loss": 0.1381, "step": 6585 }, { "epoch": 1.898491231860574, "grad_norm": 0.2941056183108044, "learning_rate": 1.571575726518293e-07, "loss": 0.1308, "step": 6590 }, { "epoch": 1.8999315833063268, "grad_norm": 0.31046657242771797, "learning_rate": 1.5274833574073887e-07, "loss": 0.1371, "step": 6595 }, { "epoch": 1.9013719347520794, "grad_norm": 0.31288177847066073, "learning_rate": 1.4840135917862041e-07, "loss": 0.1297, "step": 6600 }, { "epoch": 1.9028122861978323, "grad_norm": 0.29822816494380705, "learning_rate": 1.4411667044971657e-07, "loss": 0.1347, "step": 6605 }, { "epoch": 1.904252637643585, "grad_norm": 0.2850936706817707, "learning_rate": 1.3989429664445275e-07, "loss": 0.1332, "step": 6610 }, { "epoch": 1.9056929890893377, "grad_norm": 0.26285478488340075, "learning_rate": 1.3573426445925853e-07, "loss": 0.1226, "step": 6615 }, { "epoch": 1.9071333405350905, "grad_norm": 0.32854759121769145, "learning_rate": 1.316366001964009e-07, "loss": 0.1281, "step": 6620 }, { "epoch": 1.9085736919808434, "grad_norm": 0.30927391667980875, "learning_rate": 1.2760132976382123e-07, "loss": 0.1282, "step": 6625 }, { "epoch": 1.910014043426596, "grad_norm": 0.2910109063295465, "learning_rate": 1.2362847867496754e-07, "loss": 0.1238, "step": 6630 }, { "epoch": 1.9114543948723488, "grad_norm": 0.2962860334445714, "learning_rate": 1.197180720486346e-07, "loss": 0.1378, "step": 6635 }, { "epoch": 1.9128947463181016, "grad_norm": 0.2887305818228257, "learning_rate": 1.1587013460880537e-07, "loss": 0.1239, "step": 6640 }, { "epoch": 1.9143350977638542, "grad_norm": 0.31793728948458577, "learning_rate": 1.1208469068449413e-07, "loss": 0.124, "step": 6645 }, { "epoch": 1.9157754492096073, "grad_norm": 0.29221711966306396, "learning_rate": 1.0836176420959354e-07, "loss": 0.1311, "step": 6650 }, { "epoch": 1.91721580065536, "grad_norm": 0.31385454606410323, "learning_rate": 1.0470137872272246e-07, "loss": 0.1357, "step": 6655 }, { "epoch": 1.9186561521011125, "grad_norm": 0.31438512993189466, "learning_rate": 1.01103557367076e-07, "loss": 0.1416, "step": 6660 }, { "epoch": 1.9200965035468656, "grad_norm": 0.29829830683355085, "learning_rate": 9.756832289028239e-08, "loss": 0.1306, "step": 6665 }, { "epoch": 1.9215368549926182, "grad_norm": 0.260802342295974, "learning_rate": 9.40956976442564e-08, "loss": 0.1277, "step": 6670 }, { "epoch": 1.922977206438371, "grad_norm": 0.3197171414870384, "learning_rate": 9.068570358506058e-08, "loss": 0.1314, "step": 6675 }, { "epoch": 1.9244175578841238, "grad_norm": 0.2894154530923691, "learning_rate": 8.733836227276082e-08, "loss": 0.1284, "step": 6680 }, { "epoch": 1.9258579093298764, "grad_norm": 0.29695374382019035, "learning_rate": 8.405369487129889e-08, "loss": 0.1329, "step": 6685 }, { "epoch": 1.9272982607756293, "grad_norm": 0.30960960633140455, "learning_rate": 8.083172214835011e-08, "loss": 0.1365, "step": 6690 }, { "epoch": 1.928738612221382, "grad_norm": 0.2971288891300368, "learning_rate": 7.767246447519694e-08, "loss": 0.126, "step": 6695 }, { "epoch": 1.9301789636671347, "grad_norm": 0.2789858562034354, "learning_rate": 7.457594182660011e-08, "loss": 0.125, "step": 6700 }, { "epoch": 1.9316193151128875, "grad_norm": 0.29826867052107653, "learning_rate": 7.154217378066875e-08, "loss": 0.135, "step": 6705 }, { "epoch": 1.9330596665586404, "grad_norm": 0.32154445451796954, "learning_rate": 6.85711795187416e-08, "loss": 0.1306, "step": 6710 }, { "epoch": 1.934500018004393, "grad_norm": 0.2816182285382255, "learning_rate": 6.566297782526155e-08, "loss": 0.1338, "step": 6715 }, { "epoch": 1.9359403694501458, "grad_norm": 0.2805436494834369, "learning_rate": 6.281758708765796e-08, "loss": 0.1399, "step": 6720 }, { "epoch": 1.9373807208958986, "grad_norm": 0.2871447896608813, "learning_rate": 6.00350252962334e-08, "loss": 0.1326, "step": 6725 }, { "epoch": 1.9388210723416512, "grad_norm": 0.2865634190488767, "learning_rate": 5.731531004404378e-08, "loss": 0.1348, "step": 6730 }, { "epoch": 1.940261423787404, "grad_norm": 0.3073170789090603, "learning_rate": 5.465845852679397e-08, "loss": 0.1265, "step": 6735 }, { "epoch": 1.941701775233157, "grad_norm": 0.27714379568487674, "learning_rate": 5.206448754272342e-08, "loss": 0.1226, "step": 6740 }, { "epoch": 1.9431421266789095, "grad_norm": 0.2986995263002976, "learning_rate": 4.9533413492504065e-08, "loss": 0.1284, "step": 6745 }, { "epoch": 1.9445824781246626, "grad_norm": 0.31721814876106047, "learning_rate": 4.706525237913595e-08, "loss": 0.1192, "step": 6750 }, { "epoch": 1.9460228295704152, "grad_norm": 0.2909997426660172, "learning_rate": 4.466001980784063e-08, "loss": 0.1317, "step": 6755 }, { "epoch": 1.947463181016168, "grad_norm": 0.28400270417549767, "learning_rate": 4.231773098597236e-08, "loss": 0.1338, "step": 6760 }, { "epoch": 1.9489035324619208, "grad_norm": 0.2743912511811287, "learning_rate": 4.0038400722911544e-08, "loss": 0.1303, "step": 6765 }, { "epoch": 1.9503438839076734, "grad_norm": 0.2815930208370952, "learning_rate": 3.7822043429980304e-08, "loss": 0.1333, "step": 6770 }, { "epoch": 1.9517842353534263, "grad_norm": 0.29321178019359334, "learning_rate": 3.566867312034483e-08, "loss": 0.1248, "step": 6775 }, { "epoch": 1.953224586799179, "grad_norm": 0.28463818504050836, "learning_rate": 3.357830340892987e-08, "loss": 0.134, "step": 6780 }, { "epoch": 1.9546649382449317, "grad_norm": 0.3052187447654327, "learning_rate": 3.155094751233101e-08, "loss": 0.1256, "step": 6785 }, { "epoch": 1.9561052896906845, "grad_norm": 0.28659122590048935, "learning_rate": 2.9586618248731436e-08, "loss": 0.1226, "step": 6790 }, { "epoch": 1.9575456411364374, "grad_norm": 0.282038441472891, "learning_rate": 2.768532803782531e-08, "loss": 0.1347, "step": 6795 }, { "epoch": 1.95898599258219, "grad_norm": 0.2947473056279094, "learning_rate": 2.5847088900728955e-08, "loss": 0.1307, "step": 6800 }, { "epoch": 1.9604263440279428, "grad_norm": 0.28871021497286375, "learning_rate": 2.407191245991758e-08, "loss": 0.1257, "step": 6805 }, { "epoch": 1.9618666954736956, "grad_norm": 0.2793833356124683, "learning_rate": 2.2359809939139775e-08, "loss": 0.1199, "step": 6810 }, { "epoch": 1.9633070469194482, "grad_norm": 0.28471879043256865, "learning_rate": 2.0710792163357586e-08, "loss": 0.1301, "step": 6815 }, { "epoch": 1.964747398365201, "grad_norm": 0.326540718273098, "learning_rate": 1.912486955866988e-08, "loss": 0.1281, "step": 6820 }, { "epoch": 1.966187749810954, "grad_norm": 0.2945065457407077, "learning_rate": 1.7602052152247973e-08, "loss": 0.1299, "step": 6825 }, { "epoch": 1.9676281012567065, "grad_norm": 0.2860442540390112, "learning_rate": 1.6142349572275674e-08, "loss": 0.1331, "step": 6830 }, { "epoch": 1.9690684527024596, "grad_norm": 0.28188944868108917, "learning_rate": 1.4745771047887104e-08, "loss": 0.1338, "step": 6835 }, { "epoch": 1.9705088041482122, "grad_norm": 0.28844347874593, "learning_rate": 1.3412325409103421e-08, "loss": 0.1247, "step": 6840 }, { "epoch": 1.9719491555939648, "grad_norm": 0.27691996340853114, "learning_rate": 1.2142021086786194e-08, "loss": 0.1243, "step": 6845 }, { "epoch": 1.9733895070397178, "grad_norm": 0.30470759791737545, "learning_rate": 1.0934866112575215e-08, "loss": 0.1343, "step": 6850 }, { "epoch": 1.9748298584854704, "grad_norm": 0.2704989184353482, "learning_rate": 9.790868118843e-09, "loss": 0.1281, "step": 6855 }, { "epoch": 1.9762702099312233, "grad_norm": 0.2936626817128071, "learning_rate": 8.710034338643702e-09, "loss": 0.1293, "step": 6860 }, { "epoch": 1.977710561376976, "grad_norm": 0.28036256864259923, "learning_rate": 7.692371605670935e-09, "loss": 0.1299, "step": 6865 }, { "epoch": 1.9791509128227287, "grad_norm": 0.29270530929271654, "learning_rate": 6.737886354211132e-09, "loss": 0.1327, "step": 6870 }, { "epoch": 1.9805912642684815, "grad_norm": 0.31142795102161236, "learning_rate": 5.84658461910359e-09, "loss": 0.1301, "step": 6875 }, { "epoch": 1.9820316157142344, "grad_norm": 0.28078438046285364, "learning_rate": 5.018472035701605e-09, "loss": 0.1375, "step": 6880 }, { "epoch": 1.983471967159987, "grad_norm": 0.3015462902715081, "learning_rate": 4.253553839842495e-09, "loss": 0.1306, "step": 6885 }, { "epoch": 1.9849123186057398, "grad_norm": 0.29201527254897053, "learning_rate": 3.5518348678043046e-09, "loss": 0.1264, "step": 6890 }, { "epoch": 1.9863526700514926, "grad_norm": 0.297518857741422, "learning_rate": 2.9133195562847106e-09, "loss": 0.1278, "step": 6895 }, { "epoch": 1.9877930214972452, "grad_norm": 0.2899898155130717, "learning_rate": 2.338011942368823e-09, "loss": 0.1287, "step": 6900 }, { "epoch": 1.989233372942998, "grad_norm": 0.29940028846058375, "learning_rate": 1.8259156635025422e-09, "loss": 0.1323, "step": 6905 }, { "epoch": 1.990673724388751, "grad_norm": 0.307580959013968, "learning_rate": 1.3770339574714631e-09, "loss": 0.1359, "step": 6910 }, { "epoch": 1.9921140758345035, "grad_norm": 0.2926247859880895, "learning_rate": 9.913696623808922e-10, "loss": 0.1277, "step": 6915 }, { "epoch": 1.9935544272802563, "grad_norm": 0.24746513443996024, "learning_rate": 6.68925216636973e-10, "loss": 0.1232, "step": 6920 }, { "epoch": 1.9949947787260092, "grad_norm": 0.31522655128809673, "learning_rate": 4.0970265892892327e-10, "loss": 0.127, "step": 6925 }, { "epoch": 1.9964351301717618, "grad_norm": 0.2858966249788635, "learning_rate": 2.1370362822237256e-10, "loss": 0.1273, "step": 6930 }, { "epoch": 1.9978754816175148, "grad_norm": 0.30208855377084065, "learning_rate": 8.092936374159977e-11, "loss": 0.1299, "step": 6935 }, { "epoch": 1.9993158330632674, "grad_norm": 0.27290887764863414, "learning_rate": 1.1380704968422251e-11, "loss": 0.1311, "step": 6940 }, { "epoch": 1.9998919736415686, "step": 6942, "total_flos": 2.1613118366416896e+16, "train_loss": 0.0653228780393709, "train_runtime": 98246.8444, "train_samples_per_second": 4.523, "train_steps_per_second": 0.071 } ], "logging_steps": 5, "max_steps": 6942, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1613118366416896e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }