diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9847 +2,4926 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.9998919736415686, + "epoch": 0.9998197093715069, "eval_steps": 500, - "global_step": 6942, + "global_step": 3466, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0014403514457527638, - "grad_norm": 16.586680451608224, - "learning_rate": 1.1510791366906476e-07, - "loss": 1.7625, + "epoch": 0.0014423250279450475, + "grad_norm": 23.09968734754774, + "learning_rate": 2.3054755043227666e-07, + "loss": 12.1657, "step": 5 }, { - "epoch": 0.0028807028915055276, - "grad_norm": 16.23021243769238, - "learning_rate": 2.589928057553957e-07, - "loss": 1.754, + "epoch": 0.002884650055890095, + "grad_norm": 23.350567085111635, + "learning_rate": 5.187319884726226e-07, + "loss": 12.1499, "step": 10 }, { - "epoch": 0.004321054337258291, - "grad_norm": 13.550599381632765, - "learning_rate": 4.0287769784172663e-07, - "loss": 1.7397, + "epoch": 0.004326975083835142, + "grad_norm": 22.840877913954497, + "learning_rate": 8.069164265129684e-07, + "loss": 12.0857, "step": 15 }, { - "epoch": 0.005761405783011055, - "grad_norm": 11.888221853811155, - "learning_rate": 5.467625899280576e-07, - "loss": 1.6888, + "epoch": 0.00576930011178019, + "grad_norm": 21.40321138460624, + "learning_rate": 1.0951008645533142e-06, + "loss": 11.8028, "step": 20 }, { - "epoch": 0.007201757228763818, - "grad_norm": 9.60550960713175, - "learning_rate": 6.906474820143885e-07, - "loss": 1.5985, + "epoch": 0.007211625139725237, + "grad_norm": 18.192353108517974, + "learning_rate": 1.3832853025936602e-06, + "loss": 11.3384, "step": 25 }, { - "epoch": 0.008642108674516582, - "grad_norm": 9.038730172783534, - "learning_rate": 8.345323741007196e-07, - "loss": 1.4653, + "epoch": 0.008653950167670284, + "grad_norm": 18.559232783911973, + "learning_rate": 1.6714697406340058e-06, + "loss": 10.3127, "step": 30 }, { - "epoch": 0.010082460120269346, - "grad_norm": 8.758156216249105, - "learning_rate": 9.784172661870505e-07, - "loss": 1.2571, + "epoch": 0.010096275195615331, + "grad_norm": 37.79150391064707, + "learning_rate": 1.959654178674352e-06, + "loss": 9.0664, "step": 35 }, { - "epoch": 0.01152281156602211, - "grad_norm": 8.491557806539111, - "learning_rate": 1.1223021582733814e-06, - "loss": 0.9934, + "epoch": 0.01153860022356038, + "grad_norm": 33.772043740311254, + "learning_rate": 2.247838616714698e-06, + "loss": 7.409, "step": 40 }, { - "epoch": 0.012963163011774873, - "grad_norm": 7.84797759833516, - "learning_rate": 1.2661870503597123e-06, - "loss": 0.8012, + "epoch": 0.012980925251505427, + "grad_norm": 23.04632172544007, + "learning_rate": 2.5360230547550434e-06, + "loss": 6.3338, "step": 45 }, { - "epoch": 0.014403514457527637, - "grad_norm": 2.323360571149784, - "learning_rate": 1.4100719424460432e-06, - "loss": 0.5458, + "epoch": 0.014423250279450473, + "grad_norm": 25.32559722397877, + "learning_rate": 2.8242074927953894e-06, + "loss": 4.4908, "step": 50 }, { - "epoch": 0.0158438659032804, - "grad_norm": 1.8779227048258833, - "learning_rate": 1.5539568345323742e-06, - "loss": 0.5126, + "epoch": 0.015865575307395522, + "grad_norm": 9.143968031022688, + "learning_rate": 3.1123919308357354e-06, + "loss": 3.2978, "step": 55 }, { - "epoch": 0.017284217349033165, - "grad_norm": 1.7665084648486797, - "learning_rate": 1.6978417266187053e-06, - "loss": 0.472, + "epoch": 0.01730790033534057, + "grad_norm": 2.3359297684099745, + "learning_rate": 3.400576368876081e-06, + "loss": 2.6887, "step": 60 }, { - "epoch": 0.01872456879478593, - "grad_norm": 2.538885827255077, - "learning_rate": 1.8417266187050362e-06, - "loss": 0.4457, + "epoch": 0.018750225363285616, + "grad_norm": 1.5235792893524585, + "learning_rate": 3.6887608069164266e-06, + "loss": 2.6051, "step": 65 }, { - "epoch": 0.020164920240538693, - "grad_norm": 2.6827209141200647, - "learning_rate": 1.985611510791367e-06, - "loss": 0.4296, + "epoch": 0.020192550391230663, + "grad_norm": 1.6452371227737381, + "learning_rate": 3.976945244956772e-06, + "loss": 2.5288, "step": 70 }, { - "epoch": 0.021605271686291457, - "grad_norm": 2.0305361884485555, - "learning_rate": 2.129496402877698e-06, - "loss": 0.4221, + "epoch": 0.021634875419175713, + "grad_norm": 2.3877151673363133, + "learning_rate": 4.265129682997119e-06, + "loss": 2.4368, "step": 75 }, { - "epoch": 0.02304562313204422, - "grad_norm": 2.6997984040664833, - "learning_rate": 2.273381294964029e-06, - "loss": 0.4028, + "epoch": 0.02307720044712076, + "grad_norm": 3.5448230000902283, + "learning_rate": 4.553314121037464e-06, + "loss": 2.2394, "step": 80 }, { - "epoch": 0.02448597457779698, - "grad_norm": 1.2504187205349406, - "learning_rate": 2.41726618705036e-06, - "loss": 0.3801, + "epoch": 0.024519525475065806, + "grad_norm": 3.998099329525319, + "learning_rate": 4.84149855907781e-06, + "loss": 2.0687, "step": 85 }, { - "epoch": 0.025926326023549745, - "grad_norm": 1.3474376559842136, - "learning_rate": 2.5611510791366906e-06, - "loss": 0.3694, + "epoch": 0.025961850503010853, + "grad_norm": 5.3900301279889025, + "learning_rate": 5.129682997118156e-06, + "loss": 2.0427, "step": 90 }, { - "epoch": 0.02736667746930251, - "grad_norm": 1.4308843552665633, - "learning_rate": 2.7050359712230217e-06, - "loss": 0.3708, + "epoch": 0.0274041755309559, + "grad_norm": 7.2317244995568215, + "learning_rate": 5.417867435158502e-06, + "loss": 1.9167, "step": 95 }, { - "epoch": 0.028807028915055273, - "grad_norm": 1.3683344004037181, - "learning_rate": 2.848920863309353e-06, - "loss": 0.3384, + "epoch": 0.028846500558900947, + "grad_norm": 4.190169407947923, + "learning_rate": 5.706051873198848e-06, + "loss": 1.8528, "step": 100 }, { - "epoch": 0.030247380360808037, - "grad_norm": 1.3015919647993486, - "learning_rate": 2.9928057553956836e-06, - "loss": 0.331, + "epoch": 0.030288825586845997, + "grad_norm": 5.165106554451897, + "learning_rate": 5.994236311239193e-06, + "loss": 1.8751, "step": 105 }, { - "epoch": 0.0316877318065608, - "grad_norm": 1.9912468429786234, - "learning_rate": 3.1366906474820147e-06, - "loss": 0.3178, + "epoch": 0.031731150614791044, + "grad_norm": 3.2421300129897426, + "learning_rate": 6.2824207492795395e-06, + "loss": 1.7973, "step": 110 }, { - "epoch": 0.03312808325231356, - "grad_norm": 1.2757927978871475, - "learning_rate": 3.280575539568346e-06, - "loss": 0.3208, + "epoch": 0.03317347564273609, + "grad_norm": 4.460292781455887, + "learning_rate": 6.570605187319885e-06, + "loss": 1.6292, "step": 115 }, { - "epoch": 0.03456843469806633, - "grad_norm": 1.1146556109378103, - "learning_rate": 3.4244604316546766e-06, - "loss": 0.3143, + "epoch": 0.03461580067068114, + "grad_norm": 4.913131259117871, + "learning_rate": 6.8587896253602315e-06, + "loss": 1.655, "step": 120 }, { - "epoch": 0.03600878614381909, - "grad_norm": 1.254484216831056, - "learning_rate": 3.5683453237410077e-06, - "loss": 0.3176, + "epoch": 0.03605812569862619, + "grad_norm": 4.1881116653103945, + "learning_rate": 7.146974063400577e-06, + "loss": 1.664, "step": 125 }, { - "epoch": 0.03744913758957186, - "grad_norm": 1.1698368725906516, - "learning_rate": 3.7122302158273384e-06, - "loss": 0.3146, + "epoch": 0.03750045072657123, + "grad_norm": 5.723431293294362, + "learning_rate": 7.4351585014409235e-06, + "loss": 1.6202, "step": 130 }, { - "epoch": 0.03888948903532462, - "grad_norm": 1.1437180666843907, - "learning_rate": 3.856115107913669e-06, - "loss": 0.3059, + "epoch": 0.03894277575451628, + "grad_norm": 4.909602119186479, + "learning_rate": 7.723342939481268e-06, + "loss": 1.5486, "step": 135 }, { - "epoch": 0.040329840481077385, - "grad_norm": 1.1343184831908104, - "learning_rate": 4.000000000000001e-06, - "loss": 0.3002, + "epoch": 0.040385100782461325, + "grad_norm": 5.928676345818394, + "learning_rate": 8.011527377521614e-06, + "loss": 1.4965, "step": 140 }, { - "epoch": 0.041770191926830146, - "grad_norm": 1.1952316457960082, - "learning_rate": 4.143884892086331e-06, - "loss": 0.2924, + "epoch": 0.041827425810406375, + "grad_norm": 5.5830317263384845, + "learning_rate": 8.299711815561961e-06, + "loss": 1.4195, "step": 145 }, { - "epoch": 0.04321054337258291, - "grad_norm": 1.347560615976888, - "learning_rate": 4.287769784172662e-06, - "loss": 0.2856, + "epoch": 0.043269750838351426, + "grad_norm": 5.587820490379444, + "learning_rate": 8.587896253602305e-06, + "loss": 1.3894, "step": 150 }, { - "epoch": 0.044650894818335674, - "grad_norm": 1.2457364588537787, - "learning_rate": 4.431654676258993e-06, - "loss": 0.2815, + "epoch": 0.04471207586629647, + "grad_norm": 3.5851612990900836, + "learning_rate": 8.876080691642652e-06, + "loss": 1.4654, "step": 155 }, { - "epoch": 0.04609124626408844, - "grad_norm": 1.0098986875238485, - "learning_rate": 4.575539568345324e-06, - "loss": 0.262, + "epoch": 0.04615440089424152, + "grad_norm": 4.792344497245253, + "learning_rate": 9.164265129682998e-06, + "loss": 1.3801, "step": 160 }, { - "epoch": 0.0475315977098412, - "grad_norm": 1.0632076910755275, - "learning_rate": 4.719424460431655e-06, - "loss": 0.2831, + "epoch": 0.04759672592218656, + "grad_norm": 3.5644574463856387, + "learning_rate": 9.452449567723344e-06, + "loss": 1.3527, "step": 165 }, { - "epoch": 0.04897194915559396, - "grad_norm": 1.0001553412785371, - "learning_rate": 4.863309352517986e-06, - "loss": 0.2692, + "epoch": 0.04903905095013161, + "grad_norm": 4.245088356022904, + "learning_rate": 9.740634005763689e-06, + "loss": 1.3465, "step": 170 }, { - "epoch": 0.05041230060134673, - "grad_norm": 0.9548653221886261, - "learning_rate": 5.0071942446043165e-06, - "loss": 0.2526, + "epoch": 0.050481375978076656, + "grad_norm": 4.623244884122231, + "learning_rate": 1.0028818443804036e-05, + "loss": 1.3647, "step": 175 }, { - "epoch": 0.05185265204709949, - "grad_norm": 0.8527895969241093, - "learning_rate": 5.151079136690648e-06, - "loss": 0.2578, + "epoch": 0.05192370100602171, + "grad_norm": 3.5591972450196043, + "learning_rate": 1.031700288184438e-05, + "loss": 1.261, "step": 180 }, { - "epoch": 0.05329300349285226, - "grad_norm": 0.8301269546841742, - "learning_rate": 5.294964028776979e-06, - "loss": 0.2612, + "epoch": 0.05336602603396676, + "grad_norm": 3.6288737317693243, + "learning_rate": 1.0605187319884726e-05, + "loss": 1.2178, "step": 185 }, { - "epoch": 0.05473335493860502, - "grad_norm": 0.8696014983595155, - "learning_rate": 5.43884892086331e-06, - "loss": 0.2587, + "epoch": 0.0548083510619118, + "grad_norm": 5.472679192029011, + "learning_rate": 1.0893371757925073e-05, + "loss": 1.2372, "step": 190 }, { - "epoch": 0.056173706384357786, - "grad_norm": 0.7739875909178445, - "learning_rate": 5.582733812949641e-06, - "loss": 0.246, + "epoch": 0.05625067608985685, + "grad_norm": 2.987171924181164, + "learning_rate": 1.1181556195965419e-05, + "loss": 1.1878, "step": 195 }, { - "epoch": 0.057614057830110546, - "grad_norm": 0.7654383803539627, - "learning_rate": 5.726618705035971e-06, - "loss": 0.2596, + "epoch": 0.057693001117801894, + "grad_norm": 3.633711033064426, + "learning_rate": 1.1469740634005764e-05, + "loss": 1.1895, "step": 200 }, { - "epoch": 0.059054409275863314, - "grad_norm": 0.8726355233974463, - "learning_rate": 5.8705035971223024e-06, - "loss": 0.2396, + "epoch": 0.059135326145746944, + "grad_norm": 3.9402926571067978, + "learning_rate": 1.175792507204611e-05, + "loss": 1.1368, "step": 205 }, { - "epoch": 0.060494760721616074, - "grad_norm": 0.7952385048612362, - "learning_rate": 6.014388489208633e-06, - "loss": 0.2489, + "epoch": 0.060577651173691995, + "grad_norm": 3.527134311033913, + "learning_rate": 1.2046109510086457e-05, + "loss": 1.1306, "step": 210 }, { - "epoch": 0.061935112167368835, - "grad_norm": 0.8979351693559371, - "learning_rate": 6.158273381294965e-06, - "loss": 0.2393, + "epoch": 0.06201997620163704, + "grad_norm": 3.679407663475352, + "learning_rate": 1.2334293948126803e-05, + "loss": 1.0846, "step": 215 }, { - "epoch": 0.0633754636131216, - "grad_norm": 0.6684273499878575, - "learning_rate": 6.302158273381295e-06, - "loss": 0.2471, + "epoch": 0.06346230122958209, + "grad_norm": 3.1104059182965047, + "learning_rate": 1.2622478386167147e-05, + "loss": 1.1201, "step": 220 }, { - "epoch": 0.06481581505887436, - "grad_norm": 0.7298265883042253, - "learning_rate": 6.446043165467626e-06, - "loss": 0.2421, + "epoch": 0.06490462625752713, + "grad_norm": 4.203869282005421, + "learning_rate": 1.2910662824207494e-05, + "loss": 1.0694, "step": 225 }, { - "epoch": 0.06625616650462712, - "grad_norm": 0.6283034910513979, - "learning_rate": 6.589928057553957e-06, - "loss": 0.2347, + "epoch": 0.06634695128547217, + "grad_norm": 3.936128919901792, + "learning_rate": 1.319884726224784e-05, + "loss": 1.0191, "step": 230 }, { - "epoch": 0.0676965179503799, - "grad_norm": 2.019647888568607, - "learning_rate": 6.733812949640288e-06, - "loss": 0.2438, + "epoch": 0.06778927631341723, + "grad_norm": 2.2362445033305804, + "learning_rate": 1.3487031700288185e-05, + "loss": 0.9774, "step": 235 }, { - "epoch": 0.06913686939613266, - "grad_norm": 0.7723184818645342, - "learning_rate": 6.877697841726619e-06, - "loss": 0.247, + "epoch": 0.06923160134136228, + "grad_norm": 2.757438827888907, + "learning_rate": 1.377521613832853e-05, + "loss": 1.0124, "step": 240 }, { - "epoch": 0.07057722084188542, - "grad_norm": 0.6213657978770711, - "learning_rate": 7.021582733812951e-06, - "loss": 0.2378, + "epoch": 0.07067392636930732, + "grad_norm": 3.4599226565163783, + "learning_rate": 1.4063400576368878e-05, + "loss": 0.9295, "step": 245 }, { - "epoch": 0.07201757228763818, - "grad_norm": 0.7770361009794773, - "learning_rate": 7.165467625899281e-06, - "loss": 0.2393, + "epoch": 0.07211625139725238, + "grad_norm": 2.0262096895794963, + "learning_rate": 1.4351585014409224e-05, + "loss": 0.9118, "step": 250 }, { - "epoch": 0.07345792373339095, - "grad_norm": 0.6688025297223997, - "learning_rate": 7.309352517985612e-06, - "loss": 0.243, + "epoch": 0.07355857642519742, + "grad_norm": 2.487400868386021, + "learning_rate": 1.4639769452449568e-05, + "loss": 0.9409, "step": 255 }, { - "epoch": 0.07489827517914371, - "grad_norm": 0.7154762192358323, - "learning_rate": 7.453237410071943e-06, - "loss": 0.2384, + "epoch": 0.07500090145314246, + "grad_norm": 1.9303088742335475, + "learning_rate": 1.4927953890489915e-05, + "loss": 0.9211, "step": 260 }, { - "epoch": 0.07633862662489647, - "grad_norm": 0.6519073456752323, - "learning_rate": 7.597122302158274e-06, - "loss": 0.2285, + "epoch": 0.0764432264810875, + "grad_norm": 2.175412817851971, + "learning_rate": 1.521613832853026e-05, + "loss": 0.9168, "step": 265 }, { - "epoch": 0.07777897807064924, - "grad_norm": 0.6216449718793182, - "learning_rate": 7.741007194244606e-06, - "loss": 0.2209, + "epoch": 0.07788555150903256, + "grad_norm": 2.5796504124225033, + "learning_rate": 1.5504322766570608e-05, + "loss": 0.9527, "step": 270 }, { - "epoch": 0.079219329516402, - "grad_norm": 0.5890271039585314, - "learning_rate": 7.884892086330936e-06, - "loss": 0.2295, + "epoch": 0.0793278765369776, + "grad_norm": 1.9788435183920994, + "learning_rate": 1.5792507204610953e-05, + "loss": 0.8426, "step": 275 }, { - "epoch": 0.08065968096215477, - "grad_norm": 0.5765383492022604, - "learning_rate": 8.028776978417266e-06, - "loss": 0.2347, + "epoch": 0.08077020156492265, + "grad_norm": 2.003074548053739, + "learning_rate": 1.60806916426513e-05, + "loss": 0.8527, "step": 280 }, { - "epoch": 0.08210003240790753, - "grad_norm": 0.6882797441460431, - "learning_rate": 8.172661870503597e-06, - "loss": 0.2295, + "epoch": 0.08221252659286771, + "grad_norm": 2.1994335722383602, + "learning_rate": 1.6368876080691644e-05, + "loss": 0.8072, "step": 285 }, { - "epoch": 0.08354038385366029, - "grad_norm": 0.557535624999968, - "learning_rate": 8.316546762589929e-06, - "loss": 0.2275, + "epoch": 0.08365485162081275, + "grad_norm": 1.726445070641134, + "learning_rate": 1.665706051873199e-05, + "loss": 0.8163, "step": 290 }, { - "epoch": 0.08498073529941305, - "grad_norm": 0.6508761757468999, - "learning_rate": 8.46043165467626e-06, - "loss": 0.2315, + "epoch": 0.0850971766487578, + "grad_norm": 2.350691118327581, + "learning_rate": 1.6945244956772336e-05, + "loss": 0.7651, "step": 295 }, { - "epoch": 0.08642108674516583, - "grad_norm": 0.6758956838113382, - "learning_rate": 8.604316546762592e-06, - "loss": 0.231, + "epoch": 0.08653950167670285, + "grad_norm": 2.6639655167915115, + "learning_rate": 1.723342939481268e-05, + "loss": 0.7535, "step": 300 }, { - "epoch": 0.08786143819091859, - "grad_norm": 0.578932856654019, - "learning_rate": 8.748201438848922e-06, - "loss": 0.2389, + "epoch": 0.0879818267046479, + "grad_norm": 1.3919563172463725, + "learning_rate": 1.7521613832853027e-05, + "loss": 0.785, "step": 305 }, { - "epoch": 0.08930178963667135, - "grad_norm": 0.6066986114882191, - "learning_rate": 8.892086330935252e-06, - "loss": 0.241, + "epoch": 0.08942415173259294, + "grad_norm": 1.2944766289360783, + "learning_rate": 1.7809798270893372e-05, + "loss": 0.7111, "step": 310 }, { - "epoch": 0.09074214108242411, - "grad_norm": 0.615624208891137, - "learning_rate": 9.035971223021583e-06, - "loss": 0.2167, + "epoch": 0.09086647676053798, + "grad_norm": 1.4798988266070112, + "learning_rate": 1.8097982708933718e-05, + "loss": 0.7293, "step": 315 }, { - "epoch": 0.09218249252817688, - "grad_norm": 0.5484255060272044, - "learning_rate": 9.179856115107915e-06, - "loss": 0.2337, + "epoch": 0.09230880178848304, + "grad_norm": 1.1830162313483426, + "learning_rate": 1.8386167146974067e-05, + "loss": 0.7231, "step": 320 }, { - "epoch": 0.09362284397392964, - "grad_norm": 0.657391795292415, - "learning_rate": 9.323741007194246e-06, - "loss": 0.2274, + "epoch": 0.09375112681642808, + "grad_norm": 1.5568610974134778, + "learning_rate": 1.867435158501441e-05, + "loss": 0.7445, "step": 325 }, { - "epoch": 0.0950631954196824, - "grad_norm": 0.5704279837914004, - "learning_rate": 9.467625899280576e-06, - "loss": 0.2203, + "epoch": 0.09519345184437313, + "grad_norm": 1.1492164494899182, + "learning_rate": 1.8962536023054755e-05, + "loss": 0.6959, "step": 330 }, { - "epoch": 0.09650354686543516, - "grad_norm": 0.540539465103558, - "learning_rate": 9.611510791366908e-06, - "loss": 0.2202, + "epoch": 0.09663577687231818, + "grad_norm": 1.0978857201097723, + "learning_rate": 1.9250720461095104e-05, + "loss": 0.7057, "step": 335 }, { - "epoch": 0.09794389831118792, - "grad_norm": 0.5694667381563077, - "learning_rate": 9.755395683453238e-06, - "loss": 0.2185, + "epoch": 0.09807810190026323, + "grad_norm": 1.0096489653703298, + "learning_rate": 1.953890489913545e-05, + "loss": 0.6772, "step": 340 }, { - "epoch": 0.0993842497569407, - "grad_norm": 0.6987105858587492, - "learning_rate": 9.899280575539569e-06, - "loss": 0.223, + "epoch": 0.09952042692820827, + "grad_norm": 1.1232844613521993, + "learning_rate": 1.9827089337175795e-05, + "loss": 0.7246, "step": 345 }, { - "epoch": 0.10082460120269346, - "grad_norm": 0.6273823448614299, - "learning_rate": 1.0043165467625899e-05, - "loss": 0.2303, + "epoch": 0.10096275195615331, + "grad_norm": 1.02243795388932, + "learning_rate": 1.9999979709215212e-05, + "loss": 0.7024, "step": 350 }, { - "epoch": 0.10226495264844622, - "grad_norm": 0.6219303498898151, - "learning_rate": 1.0187050359712232e-05, - "loss": 0.206, + "epoch": 0.10240507698409837, + "grad_norm": 1.1367801539352143, + "learning_rate": 1.9999751438831965e-05, + "loss": 0.6489, "step": 355 }, { - "epoch": 0.10370530409419898, - "grad_norm": 0.5674007048209457, - "learning_rate": 1.0330935251798562e-05, - "loss": 0.2209, + "epoch": 0.10384740201204341, + "grad_norm": 1.1572043181625398, + "learning_rate": 1.9999269540393507e-05, + "loss": 0.6489, "step": 360 }, { - "epoch": 0.10514565553995175, - "grad_norm": 0.5817546836599868, - "learning_rate": 1.0474820143884894e-05, - "loss": 0.2267, + "epoch": 0.10528972703998846, + "grad_norm": 1.0269240416486167, + "learning_rate": 1.9998534026122433e-05, + "loss": 0.6782, "step": 365 }, { - "epoch": 0.10658600698570452, - "grad_norm": 0.5560607592408621, - "learning_rate": 1.0618705035971223e-05, - "loss": 0.2279, + "epoch": 0.10673205206793351, + "grad_norm": 0.9511160065038861, + "learning_rate": 1.9997544914673915e-05, + "loss": 0.6312, "step": 370 }, { - "epoch": 0.10802635843145728, - "grad_norm": 0.6513825672726103, - "learning_rate": 1.0762589928057553e-05, - "loss": 0.2141, + "epoch": 0.10817437709587856, + "grad_norm": 1.1374311508874984, + "learning_rate": 1.999630223113522e-05, + "loss": 0.6628, "step": 375 }, { - "epoch": 0.10946670987721004, - "grad_norm": 0.5936098114546975, - "learning_rate": 1.0906474820143887e-05, - "loss": 0.2178, + "epoch": 0.1096167021238236, + "grad_norm": 1.450941328478541, + "learning_rate": 1.9994806007025068e-05, + "loss": 0.6389, "step": 380 }, { - "epoch": 0.1109070613229628, - "grad_norm": 0.5490902284633686, - "learning_rate": 1.1050359712230216e-05, - "loss": 0.2195, + "epoch": 0.11105902715176866, + "grad_norm": 0.8046806001901237, + "learning_rate": 1.9993056280292845e-05, + "loss": 0.6482, "step": 385 }, { - "epoch": 0.11234741276871557, - "grad_norm": 0.5995043966161002, - "learning_rate": 1.1194244604316548e-05, - "loss": 0.2216, + "epoch": 0.1125013521797137, + "grad_norm": 0.8216403494158578, + "learning_rate": 1.999105309531763e-05, + "loss": 0.6078, "step": 390 }, { - "epoch": 0.11378776421446833, - "grad_norm": 0.5725529495218363, - "learning_rate": 1.133812949640288e-05, - "loss": 0.2234, + "epoch": 0.11394367720765874, + "grad_norm": 0.8600864577290717, + "learning_rate": 1.9988796502907083e-05, + "loss": 0.63, "step": 395 }, { - "epoch": 0.11522811566022109, - "grad_norm": 0.5561807013548232, - "learning_rate": 1.148201438848921e-05, - "loss": 0.2138, + "epoch": 0.11538600223560379, + "grad_norm": 0.798579467879802, + "learning_rate": 1.9986286560296134e-05, + "loss": 0.6109, "step": 400 }, { - "epoch": 0.11666846710597385, - "grad_norm": 0.575827707581297, - "learning_rate": 1.1625899280575541e-05, - "loss": 0.2189, + "epoch": 0.11682832726354885, + "grad_norm": 0.7668970837973854, + "learning_rate": 1.998352333114556e-05, + "loss": 0.5857, "step": 405 }, { - "epoch": 0.11810881855172663, - "grad_norm": 0.8535370905580235, - "learning_rate": 1.176978417266187e-05, - "loss": 0.205, + "epoch": 0.11827065229149389, + "grad_norm": 1.0143366745206854, + "learning_rate": 1.998050688554034e-05, + "loss": 0.6176, "step": 410 }, { - "epoch": 0.11954916999747939, - "grad_norm": 0.5195864367586588, - "learning_rate": 1.1913669064748204e-05, - "loss": 0.2317, + "epoch": 0.11971297731943893, + "grad_norm": 0.7114180483975799, + "learning_rate": 1.9977237299987903e-05, + "loss": 0.62, "step": 415 }, { - "epoch": 0.12098952144323215, - "grad_norm": 0.5585128655255315, - "learning_rate": 1.2057553956834534e-05, - "loss": 0.2242, + "epoch": 0.12115530234738399, + "grad_norm": 0.8179413343809848, + "learning_rate": 1.997371465741617e-05, + "loss": 0.6205, "step": 420 }, { - "epoch": 0.12242987288898491, - "grad_norm": 0.5627890703738564, - "learning_rate": 1.2201438848920864e-05, - "loss": 0.2275, + "epoch": 0.12259762737532903, + "grad_norm": 0.6435940720725398, + "learning_rate": 1.996993904717146e-05, + "loss": 0.5878, "step": 425 }, { - "epoch": 0.12387022433473767, - "grad_norm": 0.5525711309273611, - "learning_rate": 1.2345323741007195e-05, - "loss": 0.2212, + "epoch": 0.12403995240327408, + "grad_norm": 0.9102246188273324, + "learning_rate": 1.9965910565016223e-05, + "loss": 0.6021, "step": 430 }, { - "epoch": 0.12531057578049043, - "grad_norm": 0.5512498249317371, - "learning_rate": 1.2489208633093525e-05, - "loss": 0.2075, + "epoch": 0.12548227743121912, + "grad_norm": 0.6153476600060466, + "learning_rate": 1.9961629313126608e-05, + "loss": 0.5674, "step": 435 }, { - "epoch": 0.1267509272262432, - "grad_norm": 0.527963361870513, - "learning_rate": 1.2633093525179858e-05, - "loss": 0.2207, + "epoch": 0.12692460245916418, + "grad_norm": 0.5823753109992822, + "learning_rate": 1.9957095400089875e-05, + "loss": 0.5819, "step": 440 }, { - "epoch": 0.12819127867199598, - "grad_norm": 0.66444135628081, - "learning_rate": 1.2776978417266188e-05, - "loss": 0.2176, + "epoch": 0.12836692748710923, + "grad_norm": 0.6280650049871973, + "learning_rate": 1.9952308940901634e-05, + "loss": 0.6357, "step": 445 }, { - "epoch": 0.12963163011774873, - "grad_norm": 0.6375370649966602, - "learning_rate": 1.2920863309352518e-05, - "loss": 0.218, + "epoch": 0.12980925251505426, + "grad_norm": 1.12163730124818, + "learning_rate": 1.9947270056962934e-05, + "loss": 0.5659, "step": 450 }, { - "epoch": 0.1310719815635015, - "grad_norm": 0.5294066493342842, - "learning_rate": 1.306474820143885e-05, - "loss": 0.2075, + "epoch": 0.13125157754299932, + "grad_norm": 0.8453741002711367, + "learning_rate": 1.994197887607719e-05, + "loss": 0.5423, "step": 455 }, { - "epoch": 0.13251233300925425, - "grad_norm": 0.5717532515073108, - "learning_rate": 1.3208633093525181e-05, - "loss": 0.2151, + "epoch": 0.13269390257094435, + "grad_norm": 0.6945577095672939, + "learning_rate": 1.993643553244693e-05, + "loss": 0.6118, "step": 460 }, { - "epoch": 0.13395268445500702, - "grad_norm": 0.5571385430026796, - "learning_rate": 1.3352517985611513e-05, - "loss": 0.2062, + "epoch": 0.1341362275988894, + "grad_norm": 0.6080087347638511, + "learning_rate": 1.993064016667039e-05, + "loss": 0.5912, "step": 465 }, { - "epoch": 0.1353930359007598, - "grad_norm": 0.6911923317118861, - "learning_rate": 1.3496402877697843e-05, - "loss": 0.2001, + "epoch": 0.13557855262683446, + "grad_norm": 0.5072027520003524, + "learning_rate": 1.992459292573796e-05, + "loss": 0.6086, "step": 470 }, { - "epoch": 0.13683338734651254, - "grad_norm": 0.4888695446827276, - "learning_rate": 1.3640287769784173e-05, - "loss": 0.2054, + "epoch": 0.1370208776547795, + "grad_norm": 0.5194397753829619, + "learning_rate": 1.991829396302845e-05, + "loss": 0.5554, "step": 475 }, { - "epoch": 0.13827373879226532, - "grad_norm": 0.5791087898343821, - "learning_rate": 1.3784172661870506e-05, - "loss": 0.2096, + "epoch": 0.13846320268272455, + "grad_norm": 0.6531400636419847, + "learning_rate": 1.9911743438305203e-05, + "loss": 0.5738, "step": 480 }, { - "epoch": 0.13971409023801806, - "grad_norm": 0.5461601934980354, - "learning_rate": 1.3928057553956836e-05, - "loss": 0.2186, + "epoch": 0.1399055277106696, + "grad_norm": 0.8007993447245763, + "learning_rate": 1.990494151771202e-05, + "loss": 0.5698, "step": 485 }, { - "epoch": 0.14115444168377084, - "grad_norm": 0.5278105788434795, - "learning_rate": 1.4071942446043167e-05, - "loss": 0.2064, + "epoch": 0.14134785273861464, + "grad_norm": 0.7192330669398362, + "learning_rate": 1.989788837376899e-05, + "loss": 0.5629, "step": 490 }, { - "epoch": 0.1425947931295236, - "grad_norm": 0.572815440882881, - "learning_rate": 1.4215827338129497e-05, - "loss": 0.23, + "epoch": 0.1427901777665597, + "grad_norm": 0.688440868686088, + "learning_rate": 1.989058418536807e-05, + "loss": 0.5734, "step": 495 }, { - "epoch": 0.14403514457527636, - "grad_norm": 0.5825481697550493, - "learning_rate": 1.4359712230215827e-05, - "loss": 0.2113, + "epoch": 0.14423250279450475, + "grad_norm": 1.001172764554856, + "learning_rate": 1.988302913776858e-05, + "loss": 0.5745, "step": 500 }, { - "epoch": 0.14403514457527636, - "eval_loss": 0.21517202258110046, - "eval_runtime": 192.1896, - "eval_samples_per_second": 9.387, - "eval_steps_per_second": 2.347, + "epoch": 0.14423250279450475, + "eval_loss": 0.568706750869751, + "eval_runtime": 161.3667, + "eval_samples_per_second": 11.161, + "eval_steps_per_second": 2.795, "step": 500 }, { - "epoch": 0.14547549602102913, - "grad_norm": 0.5450285278535744, - "learning_rate": 1.450359712230216e-05, - "loss": 0.2176, + "epoch": 0.14567482782244978, + "grad_norm": 1.0515733209433527, + "learning_rate": 1.9875223422592485e-05, + "loss": 0.5704, "step": 505 }, { - "epoch": 0.1469158474667819, - "grad_norm": 0.5617983236242802, - "learning_rate": 1.464748201438849e-05, - "loss": 0.2263, + "epoch": 0.14711715285039484, + "grad_norm": 1.0276945765068186, + "learning_rate": 1.986716723781954e-05, + "loss": 0.6123, "step": 510 }, { - "epoch": 0.14835619891253465, - "grad_norm": 0.5107925009512722, - "learning_rate": 1.4791366906474822e-05, - "loss": 0.2093, + "epoch": 0.1485594778783399, + "grad_norm": 0.8043743845845657, + "learning_rate": 1.985886078778227e-05, + "loss": 0.5437, "step": 515 }, { - "epoch": 0.14979655035828743, - "grad_norm": 0.5095844104524674, - "learning_rate": 1.4935251798561152e-05, - "loss": 0.2076, + "epoch": 0.15000180290628493, + "grad_norm": 0.6535595881064415, + "learning_rate": 1.9850304283160793e-05, + "loss": 0.5527, "step": 520 }, { - "epoch": 0.15123690180404017, - "grad_norm": 0.4681970484619367, - "learning_rate": 1.5079136690647483e-05, - "loss": 0.205, + "epoch": 0.15144412793422998, + "grad_norm": 0.7357564272936004, + "learning_rate": 1.9841497940977464e-05, + "loss": 0.5432, "step": 525 }, { - "epoch": 0.15267725324979295, - "grad_norm": 0.483511636229593, - "learning_rate": 1.5223021582733815e-05, - "loss": 0.2039, + "epoch": 0.152886452962175, + "grad_norm": 0.7287222676647807, + "learning_rate": 1.983244198459138e-05, + "loss": 0.5811, "step": 530 }, { - "epoch": 0.15411760469554572, - "grad_norm": 0.5952140053203314, - "learning_rate": 1.5366906474820144e-05, - "loss": 0.2048, + "epoch": 0.15432877799012007, + "grad_norm": 0.5697752505815841, + "learning_rate": 1.982313664369271e-05, + "loss": 0.5627, "step": 535 }, { - "epoch": 0.15555795614129847, - "grad_norm": 0.5355425751788454, - "learning_rate": 1.5510791366906476e-05, - "loss": 0.1994, + "epoch": 0.15577110301806513, + "grad_norm": 0.5170616797914624, + "learning_rate": 1.981358215429687e-05, + "loss": 0.5592, "step": 540 }, { - "epoch": 0.15699830758705124, - "grad_norm": 0.5186977018893049, - "learning_rate": 1.5654676258992808e-05, - "loss": 0.2191, + "epoch": 0.15721342804601016, + "grad_norm": 0.619913426569597, + "learning_rate": 1.9803778758738543e-05, + "loss": 0.5435, "step": 545 }, { - "epoch": 0.158438659032804, - "grad_norm": 0.5096765096533113, - "learning_rate": 1.5798561151079136e-05, - "loss": 0.2037, + "epoch": 0.1586557530739552, + "grad_norm": 0.9727823301261521, + "learning_rate": 1.9793726705665524e-05, + "loss": 0.5889, "step": 550 }, { - "epoch": 0.15987901047855677, - "grad_norm": 0.49643477985240453, - "learning_rate": 1.594244604316547e-05, - "loss": 0.2111, + "epoch": 0.16009807810190027, + "grad_norm": 0.6044688838902901, + "learning_rate": 1.9783426250032412e-05, + "loss": 0.5678, "step": 555 }, { - "epoch": 0.16131936192430954, - "grad_norm": 0.527798431687936, - "learning_rate": 1.60863309352518e-05, - "loss": 0.194, + "epoch": 0.1615404031298453, + "grad_norm": 0.46024598144245266, + "learning_rate": 1.9772877653094165e-05, + "loss": 0.5639, "step": 560 }, { - "epoch": 0.1627597133700623, - "grad_norm": 0.4992686472592364, - "learning_rate": 1.623021582733813e-05, - "loss": 0.1967, + "epoch": 0.16298272815779036, + "grad_norm": 0.45100341602786603, + "learning_rate": 1.9762081182399434e-05, + "loss": 0.5717, "step": 565 }, { - "epoch": 0.16420006481581506, - "grad_norm": 0.4982521591402399, - "learning_rate": 1.6374100719424462e-05, - "loss": 0.217, + "epoch": 0.16442505318573541, + "grad_norm": 0.5540308655652189, + "learning_rate": 1.9751037111783818e-05, + "loss": 0.5623, "step": 570 }, { - "epoch": 0.16564041626156784, - "grad_norm": 0.464665175723593, - "learning_rate": 1.651798561151079e-05, - "loss": 0.1957, + "epoch": 0.16586737821368044, + "grad_norm": 0.43976603899998645, + "learning_rate": 1.9739745721362897e-05, + "loss": 0.5319, "step": 575 }, { - "epoch": 0.16708076770732058, - "grad_norm": 0.45188523938453024, - "learning_rate": 1.6661870503597125e-05, - "loss": 0.2087, + "epoch": 0.1673097032416255, + "grad_norm": 0.4612500025708451, + "learning_rate": 1.9728207297525125e-05, + "loss": 0.5653, "step": 580 }, { - "epoch": 0.16852111915307336, - "grad_norm": 0.5395237904716252, - "learning_rate": 1.6805755395683453e-05, - "loss": 0.2159, + "epoch": 0.16875202826957056, + "grad_norm": 0.5752333041985558, + "learning_rate": 1.9716422132924572e-05, + "loss": 0.567, "step": 585 }, { - "epoch": 0.1699614705988261, - "grad_norm": 0.49617157293448155, - "learning_rate": 1.6949640287769785e-05, - "loss": 0.2048, + "epoch": 0.1701943532975156, + "grad_norm": 0.5369943570453672, + "learning_rate": 1.9704390526473515e-05, + "loss": 0.5609, "step": 590 }, { - "epoch": 0.17140182204457888, - "grad_norm": 0.45357867780884986, - "learning_rate": 1.7093525179856116e-05, - "loss": 0.2085, + "epoch": 0.17163667832546065, + "grad_norm": 0.5164720235053389, + "learning_rate": 1.9692112783334826e-05, + "loss": 0.5415, "step": 595 }, { - "epoch": 0.17284217349033165, - "grad_norm": 0.4561385049934112, - "learning_rate": 1.7237410071942448e-05, - "loss": 0.2071, + "epoch": 0.1730790033534057, + "grad_norm": 0.7665382521888024, + "learning_rate": 1.967958921491426e-05, + "loss": 0.5671, "step": 600 }, { - "epoch": 0.1742825249360844, - "grad_norm": 0.4743223833758313, - "learning_rate": 1.738129496402878e-05, - "loss": 0.2004, + "epoch": 0.17452132838135073, + "grad_norm": 0.6256340257615823, + "learning_rate": 1.966682013885255e-05, + "loss": 0.5533, "step": 605 }, { - "epoch": 0.17572287638183717, - "grad_norm": 0.5210203708061392, - "learning_rate": 1.7525179856115108e-05, - "loss": 0.2119, + "epoch": 0.1759636534092958, + "grad_norm": 0.4893424331522886, + "learning_rate": 1.9653805879017323e-05, + "loss": 0.5589, "step": 610 }, { - "epoch": 0.17716322782758992, - "grad_norm": 0.5230318528080634, - "learning_rate": 1.7669064748201443e-05, - "loss": 0.2063, + "epoch": 0.17740597843724082, + "grad_norm": 0.4930248858437027, + "learning_rate": 1.964054676549494e-05, + "loss": 0.5418, "step": 615 }, { - "epoch": 0.1786035792733427, - "grad_norm": 0.5438502195561348, - "learning_rate": 1.781294964028777e-05, - "loss": 0.2111, + "epoch": 0.17884830346518588, + "grad_norm": 0.45814407628412845, + "learning_rate": 1.9627043134582068e-05, + "loss": 0.5195, "step": 620 }, { - "epoch": 0.18004393071909547, - "grad_norm": 0.5574590918705871, - "learning_rate": 1.7956834532374102e-05, - "loss": 0.2063, + "epoch": 0.18029062849313093, + "grad_norm": 0.5315704703868885, + "learning_rate": 1.9613295328777187e-05, + "loss": 0.5095, "step": 625 }, { - "epoch": 0.18148428216484822, - "grad_norm": 0.5531167510606824, - "learning_rate": 1.8100719424460434e-05, - "loss": 0.2047, + "epoch": 0.18173295352107596, + "grad_norm": 0.43146076740416167, + "learning_rate": 1.959930369677189e-05, + "loss": 0.4929, "step": 630 }, { - "epoch": 0.182924633610601, - "grad_norm": 0.4941091683759115, - "learning_rate": 1.8244604316546762e-05, - "loss": 0.209, + "epoch": 0.18317527854902102, + "grad_norm": 0.4627882494650573, + "learning_rate": 1.958506859344204e-05, + "loss": 0.5141, "step": 635 }, { - "epoch": 0.18436498505635376, - "grad_norm": 0.49923381402425615, - "learning_rate": 1.8388489208633097e-05, - "loss": 0.2008, + "epoch": 0.18461760357696608, + "grad_norm": 0.621672972720691, + "learning_rate": 1.9570590379838767e-05, + "loss": 0.5486, "step": 640 }, { - "epoch": 0.1858053365021065, - "grad_norm": 0.5294742227793021, - "learning_rate": 1.8532374100719425e-05, - "loss": 0.2093, + "epoch": 0.1860599286049111, + "grad_norm": 0.5063460018719447, + "learning_rate": 1.9555869423179316e-05, + "loss": 0.5497, "step": 645 }, { - "epoch": 0.18724568794785929, - "grad_norm": 0.730229027488376, - "learning_rate": 1.8676258992805757e-05, - "loss": 0.1919, + "epoch": 0.18750225363285616, + "grad_norm": 0.48895947210824475, + "learning_rate": 1.9540906096837727e-05, + "loss": 0.5465, "step": 650 }, { - "epoch": 0.18868603939361203, - "grad_norm": 0.5108094713116142, - "learning_rate": 1.8820143884892088e-05, - "loss": 0.2125, + "epoch": 0.18894457866080122, + "grad_norm": 0.47357663586358684, + "learning_rate": 1.9525700780335372e-05, + "loss": 0.529, "step": 655 }, { - "epoch": 0.1901263908393648, - "grad_norm": 0.4842966623253086, - "learning_rate": 1.8964028776978416e-05, - "loss": 0.2146, + "epoch": 0.19038690368874625, + "grad_norm": 0.43786638884850015, + "learning_rate": 1.951025385933132e-05, + "loss": 0.522, "step": 660 }, { - "epoch": 0.19156674228511758, - "grad_norm": 0.49057011382124954, - "learning_rate": 1.910791366906475e-05, - "loss": 0.2205, + "epoch": 0.1918292287166913, + "grad_norm": 0.5828551791972233, + "learning_rate": 1.9494565725612565e-05, + "loss": 0.5334, "step": 665 }, { - "epoch": 0.19300709373087033, - "grad_norm": 0.48392895116401746, - "learning_rate": 1.925179856115108e-05, - "loss": 0.2173, + "epoch": 0.19327155374463637, + "grad_norm": 0.4669699168406431, + "learning_rate": 1.9478636777084077e-05, + "loss": 0.4846, "step": 670 }, { - "epoch": 0.1944474451766231, - "grad_norm": 0.4857633076018165, - "learning_rate": 1.939568345323741e-05, - "loss": 0.1981, + "epoch": 0.1947138787725814, + "grad_norm": 0.5626195687859905, + "learning_rate": 1.946246741775873e-05, + "loss": 0.556, "step": 675 }, { - "epoch": 0.19588779662237585, - "grad_norm": 0.530480064500977, - "learning_rate": 1.9539568345323743e-05, - "loss": 0.21, + "epoch": 0.19615620380052645, + "grad_norm": 0.5482755680769119, + "learning_rate": 1.9446058057747025e-05, + "loss": 0.4561, "step": 680 }, { - "epoch": 0.19732814806812862, - "grad_norm": 0.42587073729316316, - "learning_rate": 1.9683453237410074e-05, - "loss": 0.1915, + "epoch": 0.1975985288284715, + "grad_norm": 0.4878018831010534, + "learning_rate": 1.9429409113246715e-05, + "loss": 0.526, "step": 685 }, { - "epoch": 0.1987684995138814, - "grad_norm": 0.4666628821103525, - "learning_rate": 1.9827338129496406e-05, - "loss": 0.2021, + "epoch": 0.19904085385641654, + "grad_norm": 0.7436357434374212, + "learning_rate": 1.9412521006532245e-05, + "loss": 0.5088, "step": 690 }, { - "epoch": 0.20020885095963414, - "grad_norm": 0.5396149739812561, - "learning_rate": 1.9971223021582734e-05, - "loss": 0.2054, + "epoch": 0.2004831788843616, + "grad_norm": 0.45530676409796045, + "learning_rate": 1.939539416594402e-05, + "loss": 0.5214, "step": 695 }, { - "epoch": 0.20164920240538692, - "grad_norm": 0.49716967053304606, - "learning_rate": 1.99999797676386e-05, - "loss": 0.2115, + "epoch": 0.20192550391230663, + "grad_norm": 0.6302948823981896, + "learning_rate": 1.937802902587757e-05, + "loss": 0.5591, "step": 700 }, { - "epoch": 0.20308955385113966, - "grad_norm": 0.4390495467259597, - "learning_rate": 1.9999897573810713e-05, - "loss": 0.2082, + "epoch": 0.20336782894025168, + "grad_norm": 0.4921513503843826, + "learning_rate": 1.936042602677251e-05, + "loss": 0.5288, "step": 705 }, { - "epoch": 0.20452990529689244, - "grad_norm": 0.500587930536983, - "learning_rate": 1.9999752154513036e-05, - "loss": 0.2039, + "epoch": 0.20481015396819674, + "grad_norm": 0.5421091687931597, + "learning_rate": 1.934258561510138e-05, + "loss": 0.5151, "step": 710 }, { - "epoch": 0.2059702567426452, - "grad_norm": 0.4561308107687448, - "learning_rate": 1.9999543510665e-05, - "loss": 0.212, + "epoch": 0.20625247899614177, + "grad_norm": 0.7576428493111558, + "learning_rate": 1.932450824335832e-05, + "loss": 0.477, "step": 715 }, { - "epoch": 0.20741060818839796, - "grad_norm": 0.4425298382894157, - "learning_rate": 1.9999271643585775e-05, - "loss": 0.2032, + "epoch": 0.20769480402408683, + "grad_norm": 0.424961853700426, + "learning_rate": 1.9306194370047592e-05, + "loss": 0.5342, "step": 720 }, { - "epoch": 0.20885095963415073, - "grad_norm": 0.48158901469019905, - "learning_rate": 1.9998936554994277e-05, - "loss": 0.2033, + "epoch": 0.20913712905203188, + "grad_norm": 0.49906945581307455, + "learning_rate": 1.9287644459671948e-05, + "loss": 0.5334, "step": 725 }, { - "epoch": 0.2102913110799035, - "grad_norm": 0.45355370323735367, - "learning_rate": 1.9998538247009135e-05, - "loss": 0.2015, + "epoch": 0.2105794540799769, + "grad_norm": 0.46177937508565325, + "learning_rate": 1.926885898272085e-05, + "loss": 0.4989, "step": 730 }, { - "epoch": 0.21173166252565626, - "grad_norm": 0.45663265150871324, - "learning_rate": 1.99980767221487e-05, - "loss": 0.2067, + "epoch": 0.21202177910792197, + "grad_norm": 0.4920606306275181, + "learning_rate": 1.9249838415658543e-05, + "loss": 0.5448, "step": 735 }, { - "epoch": 0.21317201397140903, - "grad_norm": 0.5282768185639742, - "learning_rate": 1.999755198333101e-05, - "loss": 0.1984, + "epoch": 0.21346410413586703, + "grad_norm": 0.4191101613829332, + "learning_rate": 1.9230583240911954e-05, + "loss": 0.4694, "step": 740 }, { - "epoch": 0.21461236541716178, - "grad_norm": 0.5312593562636294, - "learning_rate": 1.999696403387379e-05, - "loss": 0.1834, + "epoch": 0.21490642916381206, + "grad_norm": 0.48817506876963557, + "learning_rate": 1.9211093946858484e-05, + "loss": 0.5173, "step": 745 }, { - "epoch": 0.21605271686291455, - "grad_norm": 0.4522583632651766, - "learning_rate": 1.9996312877494413e-05, - "loss": 0.2074, + "epoch": 0.21634875419175711, + "grad_norm": 0.5126984233381934, + "learning_rate": 1.919137102781359e-05, + "loss": 0.5074, "step": 750 }, { - "epoch": 0.21749306830866733, - "grad_norm": 0.45521876727857974, - "learning_rate": 1.9995598518309886e-05, - "loss": 0.2118, + "epoch": 0.21779107921970217, + "grad_norm": 0.5334260917924061, + "learning_rate": 1.9171414984018266e-05, + "loss": 0.4917, "step": 755 }, { - "epoch": 0.21893341975442007, - "grad_norm": 0.4661350669834831, - "learning_rate": 1.999482096083683e-05, - "loss": 0.2026, + "epoch": 0.2192334042476472, + "grad_norm": 0.5501541841297073, + "learning_rate": 1.915122632162635e-05, + "loss": 0.5152, "step": 760 }, { - "epoch": 0.22037377120017285, - "grad_norm": 0.42027669464187223, - "learning_rate": 1.9993980209991435e-05, - "loss": 0.2053, + "epoch": 0.22067572927559226, + "grad_norm": 0.4359723210170646, + "learning_rate": 1.913080555269169e-05, + "loss": 0.5215, "step": 765 }, { - "epoch": 0.2218141226459256, - "grad_norm": 0.4889241347368223, - "learning_rate": 1.9993076271089443e-05, - "loss": 0.2007, + "epoch": 0.22211805430353732, + "grad_norm": 0.5662077360043514, + "learning_rate": 1.911015319515515e-05, + "loss": 0.5253, "step": 770 }, { - "epoch": 0.22325447409167837, - "grad_norm": 0.44479392282113467, - "learning_rate": 1.999210914984611e-05, - "loss": 0.1886, + "epoch": 0.22356037933148235, + "grad_norm": 0.4764077159702808, + "learning_rate": 1.908926977283148e-05, + "loss": 0.5066, "step": 775 }, { - "epoch": 0.22469482553743114, - "grad_norm": 0.5309438649358437, - "learning_rate": 1.999107885237617e-05, - "loss": 0.1957, + "epoch": 0.2250027043594274, + "grad_norm": 0.5639009005172965, + "learning_rate": 1.9068155815396018e-05, + "loss": 0.474, "step": 780 }, { - "epoch": 0.2261351769831839, - "grad_norm": 0.50630514648079, - "learning_rate": 1.9989985385193788e-05, - "loss": 0.2054, + "epoch": 0.22644502938737243, + "grad_norm": 0.6776509031874417, + "learning_rate": 1.904681185837128e-05, + "loss": 0.5025, "step": 785 }, { - "epoch": 0.22757552842893666, - "grad_norm": 0.5119561463816542, - "learning_rate": 1.9988828755212533e-05, - "loss": 0.2153, + "epoch": 0.2278873544153175, + "grad_norm": 0.3940863617407268, + "learning_rate": 1.9025238443113346e-05, + "loss": 0.4781, "step": 790 }, { - "epoch": 0.22901587987468944, - "grad_norm": 0.5058068636508265, - "learning_rate": 1.9987608969745338e-05, - "loss": 0.2021, + "epoch": 0.22932967944326255, + "grad_norm": 0.5731371374463607, + "learning_rate": 1.9003436116798156e-05, + "loss": 0.5325, "step": 795 }, { - "epoch": 0.23045623132044218, - "grad_norm": 0.5387219159158904, - "learning_rate": 1.998632603650442e-05, - "loss": 0.2013, + "epoch": 0.23077200447120758, + "grad_norm": 0.44630504407580995, + "learning_rate": 1.898140543240762e-05, + "loss": 0.5094, "step": 800 }, { - "epoch": 0.23189658276619496, - "grad_norm": 0.534399378765923, - "learning_rate": 1.998497996360127e-05, - "loss": 0.2029, + "epoch": 0.23221432949915263, + "grad_norm": 0.5013841323056458, + "learning_rate": 1.8959146948715582e-05, + "loss": 0.5123, "step": 805 }, { - "epoch": 0.2333369342119477, - "grad_norm": 0.48567637464880914, - "learning_rate": 1.998357075954659e-05, - "loss": 0.2176, + "epoch": 0.2336566545270977, + "grad_norm": 0.6517172353158069, + "learning_rate": 1.8936661230273677e-05, + "loss": 0.4944, "step": 810 }, { - "epoch": 0.23477728565770048, - "grad_norm": 0.4150282760082826, - "learning_rate": 1.998209843325023e-05, - "loss": 0.2032, + "epoch": 0.23509897955504272, + "grad_norm": 0.5321704297258375, + "learning_rate": 1.8913948847396978e-05, + "loss": 0.5111, "step": 815 }, { - "epoch": 0.23621763710345325, - "grad_norm": 0.41070942542681976, - "learning_rate": 1.9980562994021132e-05, - "loss": 0.1923, + "epoch": 0.23654130458298778, + "grad_norm": 0.5733385459091142, + "learning_rate": 1.8891010376149554e-05, + "loss": 0.5255, "step": 820 }, { - "epoch": 0.237657988549206, - "grad_norm": 0.4109088959795977, - "learning_rate": 1.9978964451567285e-05, - "loss": 0.2108, + "epoch": 0.23798362961093283, + "grad_norm": 0.6439828549708082, + "learning_rate": 1.8867846398329856e-05, + "loss": 0.5224, "step": 825 }, { - "epoch": 0.23909833999495878, - "grad_norm": 0.4527005874298046, - "learning_rate": 1.997730281599565e-05, - "loss": 0.2139, + "epoch": 0.23942595463887786, + "grad_norm": 0.526933741666615, + "learning_rate": 1.884445750145595e-05, + "loss": 0.4987, "step": 830 }, { - "epoch": 0.24053869144071152, - "grad_norm": 0.4639405357645518, - "learning_rate": 1.9975578097812108e-05, - "loss": 0.1936, + "epoch": 0.24086827966682292, + "grad_norm": 0.4358091890203275, + "learning_rate": 1.882084427875062e-05, + "loss": 0.5151, "step": 835 }, { - "epoch": 0.2419790428864643, - "grad_norm": 0.4277370943857234, - "learning_rate": 1.997379030792138e-05, - "loss": 0.1904, + "epoch": 0.24231060469476798, + "grad_norm": 0.42052312366605993, + "learning_rate": 1.8797007329126336e-05, + "loss": 0.5292, "step": 840 }, { - "epoch": 0.24341939433221707, - "grad_norm": 0.4168082342597945, - "learning_rate": 1.9971939457626966e-05, - "loss": 0.2007, + "epoch": 0.243752929722713, + "grad_norm": 0.5162254671712243, + "learning_rate": 1.8772947257170034e-05, + "loss": 0.4701, "step": 845 }, { - "epoch": 0.24485974577796982, - "grad_norm": 0.39209563109237133, - "learning_rate": 1.9970025558631075e-05, - "loss": 0.2038, + "epoch": 0.24519525475065806, + "grad_norm": 0.41421320556868774, + "learning_rate": 1.8748664673127814e-05, + "loss": 0.4869, "step": 850 }, { - "epoch": 0.2463000972237226, - "grad_norm": 0.40504280128109277, - "learning_rate": 1.9968048623034546e-05, - "loss": 0.1927, + "epoch": 0.2466375797786031, + "grad_norm": 0.44489422959937447, + "learning_rate": 1.872416019288944e-05, + "loss": 0.5107, "step": 855 }, { - "epoch": 0.24774044866947534, - "grad_norm": 0.49852597790981823, - "learning_rate": 1.996600866333678e-05, - "loss": 0.1951, + "epoch": 0.24807990480654815, + "grad_norm": 0.5131502882549939, + "learning_rate": 1.8699434437972726e-05, + "loss": 0.5002, "step": 860 }, { - "epoch": 0.2491808001152281, - "grad_norm": 0.49051458590903607, - "learning_rate": 1.9963905692435642e-05, - "loss": 0.1984, + "epoch": 0.2495222298344932, + "grad_norm": 0.4410628046298298, + "learning_rate": 1.8674488035507776e-05, + "loss": 0.5033, "step": 865 }, { - "epoch": 0.25062115156098086, - "grad_norm": 0.4132497703343479, - "learning_rate": 1.9961739723627412e-05, - "loss": 0.195, + "epoch": 0.25096455486243824, + "grad_norm": 0.424822720640458, + "learning_rate": 1.864932161822107e-05, + "loss": 0.459, "step": 870 }, { - "epoch": 0.25206150300673363, - "grad_norm": 0.4435532753877853, - "learning_rate": 1.9959510770606657e-05, - "loss": 0.1908, + "epoch": 0.2524068798903833, + "grad_norm": 0.546763650924181, + "learning_rate": 1.8623935824419416e-05, + "loss": 0.4782, "step": 875 }, { - "epoch": 0.2535018544524864, - "grad_norm": 0.48838619445736153, - "learning_rate": 1.9957218847466193e-05, - "loss": 0.2173, + "epoch": 0.25384920491832835, + "grad_norm": 0.571446149303962, + "learning_rate": 1.859833129797378e-05, + "loss": 0.4971, "step": 880 }, { - "epoch": 0.2549422058982392, - "grad_norm": 0.41570171710878423, - "learning_rate": 1.995486396869695e-05, - "loss": 0.2109, + "epoch": 0.2552915299462734, + "grad_norm": 0.3881051890411508, + "learning_rate": 1.857250868830292e-05, + "loss": 0.4645, "step": 885 }, { - "epoch": 0.25638255734399196, - "grad_norm": 0.4888694180605261, - "learning_rate": 1.995244614918792e-05, - "loss": 0.1953, + "epoch": 0.25673385497421847, + "grad_norm": 0.4365270093969844, + "learning_rate": 1.8546468650356947e-05, + "loss": 0.4999, "step": 890 }, { - "epoch": 0.2578229087897447, - "grad_norm": 0.40020516792336286, - "learning_rate": 1.994996540422603e-05, - "loss": 0.1909, + "epoch": 0.25817618000216347, + "grad_norm": 0.39922925876114046, + "learning_rate": 1.852021184460069e-05, + "loss": 0.4607, "step": 895 }, { - "epoch": 0.25926326023549745, - "grad_norm": 0.3812000551763114, - "learning_rate": 1.9947421749496076e-05, - "loss": 0.1919, + "epoch": 0.2596185050301085, + "grad_norm": 0.4385372209974039, + "learning_rate": 1.849373893699697e-05, + "loss": 0.5032, "step": 900 }, { - "epoch": 0.2607036116812502, - "grad_norm": 0.43266733746493274, - "learning_rate": 1.9944815201080594e-05, - "loss": 0.193, + "epoch": 0.2610608300580536, + "grad_norm": 0.4289486219739114, + "learning_rate": 1.8467050598989677e-05, + "loss": 0.5003, "step": 905 }, { - "epoch": 0.262143963127003, - "grad_norm": 0.47929032918009606, - "learning_rate": 1.9942145775459774e-05, - "loss": 0.1971, + "epoch": 0.26250315508599864, + "grad_norm": 0.4045886984758963, + "learning_rate": 1.8440147507486765e-05, + "loss": 0.4644, "step": 910 }, { - "epoch": 0.2635843145727558, - "grad_norm": 0.4817650251465911, - "learning_rate": 1.9939413489511365e-05, - "loss": 0.2002, + "epoch": 0.2639454801139437, + "grad_norm": 0.43637212820672877, + "learning_rate": 1.8413030344843064e-05, + "loss": 0.5057, "step": 915 }, { - "epoch": 0.2650246660185085, - "grad_norm": 0.46730468361587124, - "learning_rate": 1.9936618360510545e-05, - "loss": 0.1884, + "epoch": 0.2653878051418887, + "grad_norm": 0.468355616591299, + "learning_rate": 1.838569979884301e-05, + "loss": 0.4967, "step": 920 }, { - "epoch": 0.26646501746426127, - "grad_norm": 0.3967806288180982, - "learning_rate": 1.9933760406129834e-05, - "loss": 0.1897, + "epoch": 0.26683013016983376, + "grad_norm": 0.4257178939942325, + "learning_rate": 1.835815656268314e-05, + "loss": 0.4848, "step": 925 }, { - "epoch": 0.26790536891001404, - "grad_norm": 0.4034947494394968, - "learning_rate": 1.9930839644438966e-05, - "loss": 0.2021, + "epoch": 0.2682724551977788, + "grad_norm": 0.6504232751090008, + "learning_rate": 1.8330401334954567e-05, + "loss": 0.4958, "step": 930 }, { - "epoch": 0.2693457203557668, - "grad_norm": 0.43359445557784915, - "learning_rate": 1.992785609390478e-05, - "loss": 0.1891, + "epoch": 0.26971478022572387, + "grad_norm": 0.4492644770064815, + "learning_rate": 1.8302434819625234e-05, + "loss": 0.4868, "step": 935 }, { - "epoch": 0.2707860718015196, - "grad_norm": 0.4619161295977017, - "learning_rate": 1.992480977339111e-05, - "loss": 0.1937, + "epoch": 0.27115710525366893, + "grad_norm": 0.37095796426726924, + "learning_rate": 1.8274257726022054e-05, + "loss": 0.4472, "step": 940 }, { - "epoch": 0.2722264232472723, - "grad_norm": 0.4703530373966522, - "learning_rate": 1.9921700702158657e-05, - "loss": 0.1839, + "epoch": 0.272599430281614, + "grad_norm": 0.4070852473871566, + "learning_rate": 1.824587076881294e-05, + "loss": 0.4686, "step": 945 }, { - "epoch": 0.2736667746930251, - "grad_norm": 0.4173755704909504, - "learning_rate": 1.9918528899864875e-05, - "loss": 0.1811, + "epoch": 0.274041755309559, + "grad_norm": 0.44023807834971757, + "learning_rate": 1.821727466798867e-05, + "loss": 0.471, "step": 950 }, { - "epoch": 0.27510712613877786, - "grad_norm": 0.39226526189502814, - "learning_rate": 1.9915294386563834e-05, - "loss": 0.1929, + "epoch": 0.27548408033750404, + "grad_norm": 0.5209872184391927, + "learning_rate": 1.8188470148844602e-05, + "loss": 0.4962, "step": 955 }, { - "epoch": 0.27654747758453063, - "grad_norm": 0.3798320414179301, - "learning_rate": 1.9911997182706108e-05, - "loss": 0.2024, + "epoch": 0.2769264053654491, + "grad_norm": 0.41685090109899176, + "learning_rate": 1.8159457941962325e-05, + "loss": 0.475, "step": 960 }, { - "epoch": 0.2779878290302834, - "grad_norm": 0.4351779082897012, - "learning_rate": 1.9908637309138636e-05, - "loss": 0.1884, + "epoch": 0.27836873039339416, + "grad_norm": 0.5171250899115861, + "learning_rate": 1.8130238783191087e-05, + "loss": 0.5163, "step": 965 }, { - "epoch": 0.2794281804760361, - "grad_norm": 0.46467414117322164, - "learning_rate": 1.9905214787104592e-05, - "loss": 0.1989, + "epoch": 0.2798110554213392, + "grad_norm": 0.47139497814149867, + "learning_rate": 1.810081341362915e-05, + "loss": 0.4641, "step": 970 }, { - "epoch": 0.2808685319217889, - "grad_norm": 0.47966903762788865, - "learning_rate": 1.990172963824326e-05, - "loss": 0.1984, + "epoch": 0.2812533804492843, + "grad_norm": 0.3879518437836758, + "learning_rate": 1.8071182579604986e-05, + "loss": 0.4777, "step": 975 }, { - "epoch": 0.2823088833675417, - "grad_norm": 0.40419487334235976, - "learning_rate": 1.9898181884589877e-05, - "loss": 0.1995, + "epoch": 0.2826957054772293, + "grad_norm": 0.455341690737865, + "learning_rate": 1.804134703265836e-05, + "loss": 0.5271, "step": 980 }, { - "epoch": 0.28374923481329445, - "grad_norm": 0.4140452081019953, - "learning_rate": 1.9894571548575516e-05, - "loss": 0.2038, + "epoch": 0.28413803050517433, + "grad_norm": 0.39108612071221016, + "learning_rate": 1.8011307529521255e-05, + "loss": 0.4645, "step": 985 }, { - "epoch": 0.2851895862590472, - "grad_norm": 0.4735712012361699, - "learning_rate": 1.9890898653026926e-05, - "loss": 0.1991, + "epoch": 0.2855803555331194, + "grad_norm": 0.3865948965496386, + "learning_rate": 1.7981064832098687e-05, + "loss": 0.4578, "step": 990 }, { - "epoch": 0.2866299377048, - "grad_norm": 0.4817269779622208, - "learning_rate": 1.9887163221166405e-05, - "loss": 0.2039, + "epoch": 0.28702268056106445, + "grad_norm": 0.40375523747783393, + "learning_rate": 1.7950619707449374e-05, + "loss": 0.4923, "step": 995 }, { - "epoch": 0.2880702891505527, - "grad_norm": 0.4052308677688318, - "learning_rate": 1.9883365276611634e-05, - "loss": 0.2023, + "epoch": 0.2884650055890095, + "grad_norm": 0.3376017909117174, + "learning_rate": 1.7919972927766288e-05, + "loss": 0.4658, "step": 1000 }, { - "epoch": 0.2880702891505527, - "eval_loss": 0.19591009616851807, - "eval_runtime": 181.0536, - "eval_samples_per_second": 9.964, - "eval_steps_per_second": 2.491, + "epoch": 0.2884650055890095, + "eval_loss": 0.4833250343799591, + "eval_runtime": 142.0125, + "eval_samples_per_second": 12.682, + "eval_steps_per_second": 3.176, "step": 1000 }, { - "epoch": 0.2895106405963055, - "grad_norm": 0.41953136423806925, - "learning_rate": 1.987950484337554e-05, - "loss": 0.1839, + "epoch": 0.2899073306169545, + "grad_norm": 0.47138251586932034, + "learning_rate": 1.7889125270357053e-05, + "loss": 0.4851, "step": 1005 }, { - "epoch": 0.29095099204205827, - "grad_norm": 0.46884000577230694, - "learning_rate": 1.987558194586615e-05, - "loss": 0.1827, + "epoch": 0.29134965564489956, + "grad_norm": 0.522686359505293, + "learning_rate": 1.7858077517624265e-05, + "loss": 0.4788, "step": 1010 }, { - "epoch": 0.29239134348781104, - "grad_norm": 0.5492195075262948, - "learning_rate": 1.9871596608886416e-05, - "loss": 0.2055, + "epoch": 0.2927919806728446, + "grad_norm": 0.6355398882354177, + "learning_rate": 1.7826830457045608e-05, + "loss": 0.4525, "step": 1015 }, { - "epoch": 0.2938316949335638, - "grad_norm": 0.4125424201488764, - "learning_rate": 1.9867548857634077e-05, - "loss": 0.1991, + "epoch": 0.2942343057007897, + "grad_norm": 0.44577505392395406, + "learning_rate": 1.7795384881153896e-05, + "loss": 0.4614, "step": 1020 }, { - "epoch": 0.29527204637931653, - "grad_norm": 0.3811786481459084, - "learning_rate": 1.9863438717701497e-05, - "loss": 0.194, + "epoch": 0.29567663072873474, + "grad_norm": 0.454859759409631, + "learning_rate": 1.7763741587516983e-05, + "loss": 0.5021, "step": 1025 }, { - "epoch": 0.2967123978250693, - "grad_norm": 0.4304143670583801, - "learning_rate": 1.985926621507549e-05, - "loss": 0.2051, + "epoch": 0.2971189557566798, + "grad_norm": 0.6161570485074761, + "learning_rate": 1.7731901378717523e-05, + "loss": 0.4903, "step": 1030 }, { - "epoch": 0.2981527492708221, - "grad_norm": 0.4060031359968526, - "learning_rate": 1.9855031376137174e-05, - "loss": 0.1923, + "epoch": 0.2985612807846248, + "grad_norm": 0.43940664169854093, + "learning_rate": 1.769986506233261e-05, + "loss": 0.4819, "step": 1035 }, { - "epoch": 0.29959310071657486, - "grad_norm": 0.43396401122257877, - "learning_rate": 1.985073422766179e-05, - "loss": 0.2047, + "epoch": 0.30000360581256985, + "grad_norm": 0.4426640967510136, + "learning_rate": 1.7667633450913307e-05, + "loss": 0.4579, "step": 1040 }, { - "epoch": 0.30103345216232763, - "grad_norm": 0.43955835043716046, - "learning_rate": 1.9846374796818536e-05, - "loss": 0.1966, + "epoch": 0.3014459308405149, + "grad_norm": 0.5064920131450599, + "learning_rate": 1.763520736196402e-05, + "loss": 0.5066, "step": 1045 }, { - "epoch": 0.30247380360808035, - "grad_norm": 0.38189862082042997, - "learning_rate": 1.9841953111170407e-05, - "loss": 0.1753, + "epoch": 0.30288825586845997, + "grad_norm": 0.3628170152752897, + "learning_rate": 1.7602587617921785e-05, + "loss": 0.423, "step": 1050 }, { - "epoch": 0.3039141550538331, - "grad_norm": 0.42163518938375355, - "learning_rate": 1.9837469198673996e-05, - "loss": 0.2136, + "epoch": 0.304330580896405, + "grad_norm": 0.4756441564342862, + "learning_rate": 1.7569775046135388e-05, + "loss": 0.5278, "step": 1055 }, { - "epoch": 0.3053545064995859, - "grad_norm": 0.41241261767807075, - "learning_rate": 1.9832923087679352e-05, - "loss": 0.1977, + "epoch": 0.30577290592435, + "grad_norm": 0.40932967287449395, + "learning_rate": 1.753677047884439e-05, + "loss": 0.4565, "step": 1060 }, { - "epoch": 0.3067948579453387, - "grad_norm": 0.41901389346526713, - "learning_rate": 1.9828314806929762e-05, - "loss": 0.1887, + "epoch": 0.3072152309522951, + "grad_norm": 0.4148447936276441, + "learning_rate": 1.7503574753158022e-05, + "loss": 0.4819, "step": 1065 }, { - "epoch": 0.30823520939109145, - "grad_norm": 0.4500388667621992, - "learning_rate": 1.9823644385561596e-05, - "loss": 0.2098, + "epoch": 0.30865755598024014, + "grad_norm": 0.3868133979093347, + "learning_rate": 1.747018871103395e-05, + "loss": 0.4707, "step": 1070 }, { - "epoch": 0.30967556083684417, - "grad_norm": 0.4039289925589606, - "learning_rate": 1.9818911853104118e-05, - "loss": 0.1939, + "epoch": 0.3100998810081852, + "grad_norm": 0.39630255989567886, + "learning_rate": 1.743661319925691e-05, + "loss": 0.4387, "step": 1075 }, { - "epoch": 0.31111591228259694, - "grad_norm": 0.46025963744196113, - "learning_rate": 1.981411723947929e-05, - "loss": 0.1923, + "epoch": 0.31154220603613025, + "grad_norm": 0.4233553435649959, + "learning_rate": 1.7402849069417246e-05, + "loss": 0.465, "step": 1080 }, { - "epoch": 0.3125562637283497, - "grad_norm": 0.4080631379080584, - "learning_rate": 1.9809260575001595e-05, - "loss": 0.1859, + "epoch": 0.3129845310640753, + "grad_norm": 0.37304393376464795, + "learning_rate": 1.7368897177889307e-05, + "loss": 0.4854, "step": 1085 }, { - "epoch": 0.3139966151741025, - "grad_norm": 0.5000635476212351, - "learning_rate": 1.980434189037784e-05, - "loss": 0.176, + "epoch": 0.3144268560920203, + "grad_norm": 0.41669096423193014, + "learning_rate": 1.7334758385809715e-05, + "loss": 0.4369, "step": 1090 }, { - "epoch": 0.31543696661985526, - "grad_norm": 0.4846532765583438, - "learning_rate": 1.9799361216706948e-05, - "loss": 0.1889, + "epoch": 0.31586918111996537, + "grad_norm": 0.3950040493214593, + "learning_rate": 1.7300433559055533e-05, + "loss": 0.4488, "step": 1095 }, { - "epoch": 0.316877318065608, - "grad_norm": 0.42730669186483955, - "learning_rate": 1.9794318585479795e-05, - "loss": 0.1962, + "epoch": 0.3173115061479104, + "grad_norm": 0.4206456914262744, + "learning_rate": 1.7265923568222315e-05, + "loss": 0.4608, "step": 1100 }, { - "epoch": 0.31831766951136076, - "grad_norm": 0.4400767346219366, - "learning_rate": 1.9789214028578978e-05, - "loss": 0.1979, + "epoch": 0.3187538311758555, + "grad_norm": 0.5459001712618055, + "learning_rate": 1.7231229288602e-05, + "loss": 0.4419, "step": 1105 }, { - "epoch": 0.31975802095711353, - "grad_norm": 0.3670443287370546, - "learning_rate": 1.9784047578278623e-05, - "loss": 0.1995, + "epoch": 0.32019615620380054, + "grad_norm": 0.4002983479690819, + "learning_rate": 1.7196351600160725e-05, + "loss": 0.4575, "step": 1110 }, { - "epoch": 0.3211983724028663, - "grad_norm": 0.4279589568148661, - "learning_rate": 1.9778819267244197e-05, - "loss": 0.1829, + "epoch": 0.3216384812317456, + "grad_norm": 0.5400371185813517, + "learning_rate": 1.716129138751651e-05, + "loss": 0.4402, "step": 1115 }, { - "epoch": 0.3226387238486191, - "grad_norm": 0.3918422995120454, - "learning_rate": 1.9773529128532275e-05, - "loss": 0.1892, + "epoch": 0.3230808062596906, + "grad_norm": 0.4526337203461876, + "learning_rate": 1.712604953991681e-05, + "loss": 0.4923, "step": 1120 }, { - "epoch": 0.32407907529437185, - "grad_norm": 0.449542089139243, - "learning_rate": 1.9768177195590352e-05, - "loss": 0.184, + "epoch": 0.32452313128763566, + "grad_norm": 0.3924148895626424, + "learning_rate": 1.709062695121597e-05, + "loss": 0.4734, "step": 1125 }, { - "epoch": 0.3255194267401246, - "grad_norm": 0.4363100481377313, - "learning_rate": 1.9762763502256625e-05, - "loss": 0.1946, + "epoch": 0.3259654563155807, + "grad_norm": 0.45730078891879783, + "learning_rate": 1.7055024519852554e-05, + "loss": 0.4935, "step": 1130 }, { - "epoch": 0.32695977818587735, - "grad_norm": 0.38895014838177233, - "learning_rate": 1.9757288082759766e-05, - "loss": 0.189, + "epoch": 0.32740778134352577, + "grad_norm": 0.41765126413107173, + "learning_rate": 1.7019243148826547e-05, + "loss": 0.4778, "step": 1135 }, { - "epoch": 0.3284001296316301, - "grad_norm": 0.39937918835152797, - "learning_rate": 1.9751750971718734e-05, - "loss": 0.1965, + "epoch": 0.32885010637147083, + "grad_norm": 0.48822731606676767, + "learning_rate": 1.6983283745676464e-05, + "loss": 0.4786, "step": 1140 }, { - "epoch": 0.3298404810773829, - "grad_norm": 0.37444043134636334, - "learning_rate": 1.9746152204142536e-05, - "loss": 0.1842, + "epoch": 0.33029243139941583, + "grad_norm": 0.47444702764857977, + "learning_rate": 1.6947147222456318e-05, + "loss": 0.4732, "step": 1145 }, { - "epoch": 0.33128083252313567, - "grad_norm": 0.407018722514467, - "learning_rate": 1.9740491815429996e-05, - "loss": 0.1771, + "epoch": 0.3317347564273609, + "grad_norm": 0.36819652961308474, + "learning_rate": 1.6910834495712504e-05, + "loss": 0.49, "step": 1150 }, { - "epoch": 0.3327211839688884, - "grad_norm": 0.38945712491554313, - "learning_rate": 1.973476984136956e-05, - "loss": 0.1808, + "epoch": 0.33317708145530595, + "grad_norm": 0.3963647053897705, + "learning_rate": 1.6874346486460543e-05, + "loss": 0.4599, "step": 1155 }, { - "epoch": 0.33416153541464116, - "grad_norm": 0.38168932221231533, - "learning_rate": 1.9728986318139048e-05, - "loss": 0.1927, + "epoch": 0.334619406483251, + "grad_norm": 0.3557684139157355, + "learning_rate": 1.6837684120161723e-05, + "loss": 0.4603, "step": 1160 }, { - "epoch": 0.33560188686039394, - "grad_norm": 0.40270685047202076, - "learning_rate": 1.9723141282305432e-05, - "loss": 0.1914, + "epoch": 0.33606173151119606, + "grad_norm": 0.42399774345522806, + "learning_rate": 1.680084832669962e-05, + "loss": 0.4322, "step": 1165 }, { - "epoch": 0.3370422383061467, - "grad_norm": 0.38062787430312417, - "learning_rate": 1.9717234770824598e-05, - "loss": 0.2004, + "epoch": 0.3375040565391411, + "grad_norm": 0.4013586249486658, + "learning_rate": 1.6763840040356522e-05, + "loss": 0.4398, "step": 1170 }, { - "epoch": 0.3384825897518995, - "grad_norm": 0.37505399884013846, - "learning_rate": 1.9711266821041134e-05, - "loss": 0.1869, + "epoch": 0.3389463815670861, + "grad_norm": 0.44604773948712173, + "learning_rate": 1.6726660199789733e-05, + "loss": 0.4265, "step": 1175 }, { - "epoch": 0.3399229411976522, - "grad_norm": 0.38856749140181485, - "learning_rate": 1.9705237470688064e-05, - "loss": 0.1723, + "epoch": 0.3403887065950312, + "grad_norm": 0.39551679284847074, + "learning_rate": 1.6689309748007753e-05, + "loss": 0.4418, "step": 1180 }, { - "epoch": 0.341363292643405, - "grad_norm": 0.48828707239384744, - "learning_rate": 1.969914675788663e-05, - "loss": 0.1878, + "epoch": 0.34183103162297623, + "grad_norm": 0.451264115692116, + "learning_rate": 1.6651789632346377e-05, + "loss": 0.4483, "step": 1185 }, { - "epoch": 0.34280364408915776, - "grad_norm": 0.3994017025051837, - "learning_rate": 1.969299472114605e-05, - "loss": 0.1972, + "epoch": 0.3432733566509213, + "grad_norm": 0.4689614820007113, + "learning_rate": 1.6614100804444657e-05, + "loss": 0.467, "step": 1190 }, { - "epoch": 0.34424399553491053, - "grad_norm": 0.39594128020669767, - "learning_rate": 1.9686781399363252e-05, - "loss": 0.1771, + "epoch": 0.34471568167886635, + "grad_norm": 0.3841720473679624, + "learning_rate": 1.6576244220220763e-05, + "loss": 0.4313, "step": 1195 }, { - "epoch": 0.3456843469806633, - "grad_norm": 0.37998283976311237, - "learning_rate": 1.9680506831822667e-05, - "loss": 0.1957, + "epoch": 0.3461580067068114, + "grad_norm": 0.4091561009628973, + "learning_rate": 1.6538220839847745e-05, + "loss": 0.434, "step": 1200 }, { - "epoch": 0.347124698426416, - "grad_norm": 0.43308160608261626, - "learning_rate": 1.9674171058195947e-05, - "loss": 0.1842, + "epoch": 0.3476003317347564, + "grad_norm": 0.4473483816905544, + "learning_rate": 1.6500031627729178e-05, + "loss": 0.4446, "step": 1205 }, { - "epoch": 0.3485650498721688, - "grad_norm": 0.5263932390604856, - "learning_rate": 1.9667774118541726e-05, - "loss": 0.1806, + "epoch": 0.34904265676270146, + "grad_norm": 0.4800983187244669, + "learning_rate": 1.6461677552474698e-05, + "loss": 0.4691, "step": 1210 }, { - "epoch": 0.35000540131792157, - "grad_norm": 0.46521786049734537, - "learning_rate": 1.9661316053305374e-05, - "loss": 0.1937, + "epoch": 0.3504849817906465, + "grad_norm": 0.388554374886088, + "learning_rate": 1.642315958687543e-05, + "loss": 0.4517, "step": 1215 }, { - "epoch": 0.35144575276367435, - "grad_norm": 0.3676290666077626, - "learning_rate": 1.9654796903318726e-05, - "loss": 0.1861, + "epoch": 0.3519273068185916, + "grad_norm": 0.4804591032499286, + "learning_rate": 1.6384478707879337e-05, + "loss": 0.4736, "step": 1220 }, { - "epoch": 0.3528861042094271, - "grad_norm": 0.397145673105087, - "learning_rate": 1.9648216709799837e-05, - "loss": 0.1914, + "epoch": 0.35336963184653664, + "grad_norm": 0.4242345257393015, + "learning_rate": 1.6345635896566415e-05, + "loss": 0.4453, "step": 1225 }, { - "epoch": 0.35432645565517984, - "grad_norm": 0.4689947572382222, - "learning_rate": 1.9641575514352717e-05, - "loss": 0.1974, + "epoch": 0.35481195687448164, + "grad_norm": 0.5125929278365619, + "learning_rate": 1.6306632138123814e-05, + "loss": 0.4894, "step": 1230 }, { - "epoch": 0.3557668071009326, - "grad_norm": 0.3911963183342496, - "learning_rate": 1.9634873358967068e-05, - "loss": 0.1799, + "epoch": 0.3562542819024267, + "grad_norm": 0.4135575305051168, + "learning_rate": 1.626746842182087e-05, + "loss": 0.4516, "step": 1235 }, { - "epoch": 0.3572071585466854, - "grad_norm": 0.42683909988407304, - "learning_rate": 1.9628110286018015e-05, - "loss": 0.1914, + "epoch": 0.35769660693037175, + "grad_norm": 0.49733207897305337, + "learning_rate": 1.6228145740983986e-05, + "loss": 0.4676, "step": 1240 }, { - "epoch": 0.35864750999243816, - "grad_norm": 0.4135664643510884, - "learning_rate": 1.9621286338265836e-05, - "loss": 0.1881, + "epoch": 0.3591389319583168, + "grad_norm": 0.405324125927312, + "learning_rate": 1.618866509297147e-05, + "loss": 0.4539, "step": 1245 }, { - "epoch": 0.36008786143819094, - "grad_norm": 0.39390745705939456, - "learning_rate": 1.9614401558855712e-05, - "loss": 0.1804, + "epoch": 0.36058125698626187, + "grad_norm": 0.43290260214899146, + "learning_rate": 1.61490274791482e-05, + "loss": 0.43, "step": 1250 }, { - "epoch": 0.36152821288394366, - "grad_norm": 0.3450938944444097, - "learning_rate": 1.9607455991317432e-05, - "loss": 0.1659, + "epoch": 0.3620235820142069, + "grad_norm": 0.3648124960837181, + "learning_rate": 1.6109233904860258e-05, + "loss": 0.4516, "step": 1255 }, { - "epoch": 0.36296856432969643, - "grad_norm": 0.3860649499749823, - "learning_rate": 1.9600449679565115e-05, - "loss": 0.2026, + "epoch": 0.3634659070421519, + "grad_norm": 0.43358315460862995, + "learning_rate": 1.606928537940942e-05, + "loss": 0.4565, "step": 1260 }, { - "epoch": 0.3644089157754492, - "grad_norm": 0.3705839431178819, - "learning_rate": 1.9593382667896953e-05, - "loss": 0.1853, + "epoch": 0.364908232070097, + "grad_norm": 0.5070316730676355, + "learning_rate": 1.602918291602755e-05, + "loss": 0.4547, "step": 1265 }, { - "epoch": 0.365849267221202, - "grad_norm": 0.3863708731489008, - "learning_rate": 1.9586255000994914e-05, - "loss": 0.1841, + "epoch": 0.36635055709804204, + "grad_norm": 0.4556281361017855, + "learning_rate": 1.5988927531850913e-05, + "loss": 0.4631, "step": 1270 }, { - "epoch": 0.36728961866695475, - "grad_norm": 0.36464576680841765, - "learning_rate": 1.957906672392447e-05, - "loss": 0.1924, + "epoch": 0.3677928821259871, + "grad_norm": 0.4210598158384229, + "learning_rate": 1.5948520247894363e-05, + "loss": 0.4595, "step": 1275 }, { - "epoch": 0.36872997011270753, - "grad_norm": 0.37171858096860977, - "learning_rate": 1.9571817882134316e-05, - "loss": 0.1917, + "epoch": 0.36923520715393215, + "grad_norm": 0.4325982920205171, + "learning_rate": 1.590796208902546e-05, + "loss": 0.4698, "step": 1280 }, { - "epoch": 0.37017032155846025, - "grad_norm": 0.3686315996537359, - "learning_rate": 1.9564508521456048e-05, - "loss": 0.1908, + "epoch": 0.3706775321818772, + "grad_norm": 0.4263624320016057, + "learning_rate": 1.5867254083938472e-05, + "loss": 0.4371, "step": 1285 }, { - "epoch": 0.371610673004213, - "grad_norm": 0.4189046777798967, - "learning_rate": 1.9557138688103925e-05, - "loss": 0.1942, + "epoch": 0.3721198572098222, + "grad_norm": 0.4792938379196713, + "learning_rate": 1.582639726512828e-05, + "loss": 0.4464, "step": 1290 }, { - "epoch": 0.3730510244499658, - "grad_norm": 0.37142444736388497, - "learning_rate": 1.9549708428674537e-05, - "loss": 0.1899, + "epoch": 0.37356218223776727, + "grad_norm": 0.43544663382731996, + "learning_rate": 1.5785392668864186e-05, + "loss": 0.4658, "step": 1295 }, { - "epoch": 0.37449137589571857, - "grad_norm": 0.3986545412554622, - "learning_rate": 1.9542217790146537e-05, - "loss": 0.2, + "epoch": 0.37500450726571233, + "grad_norm": 0.38089232775082726, + "learning_rate": 1.5744241335163642e-05, + "loss": 0.4492, "step": 1300 }, { - "epoch": 0.37593172734147134, - "grad_norm": 0.42032853779043916, - "learning_rate": 1.953466681988032e-05, - "loss": 0.1964, + "epoch": 0.3764468322936574, + "grad_norm": 0.3692067776356917, + "learning_rate": 1.570294430776587e-05, + "loss": 0.4402, "step": 1305 }, { - "epoch": 0.37737207878722406, - "grad_norm": 0.36898012006756564, - "learning_rate": 1.9527055565617735e-05, - "loss": 0.1849, + "epoch": 0.37788915732160244, + "grad_norm": 0.43939772643420716, + "learning_rate": 1.5661502634105376e-05, + "loss": 0.4413, "step": 1310 }, { - "epoch": 0.37881243023297684, - "grad_norm": 0.37522686829642604, - "learning_rate": 1.9519384075481794e-05, - "loss": 0.1804, + "epoch": 0.37933148234954744, + "grad_norm": 0.39362265905546057, + "learning_rate": 1.5619917365285394e-05, + "loss": 0.4314, "step": 1315 }, { - "epoch": 0.3802527816787296, - "grad_norm": 0.39969390457539616, - "learning_rate": 1.9511652397976347e-05, - "loss": 0.1951, + "epoch": 0.3807738073774925, + "grad_norm": 0.41565735116305985, + "learning_rate": 1.557818955605123e-05, + "loss": 0.4564, "step": 1320 }, { - "epoch": 0.3816931331244824, - "grad_norm": 0.38791503198824945, - "learning_rate": 1.950386058198579e-05, - "loss": 0.1829, + "epoch": 0.38221613240543756, + "grad_norm": 0.3633587329212366, + "learning_rate": 1.55363202647635e-05, + "loss": 0.4568, "step": 1325 }, { - "epoch": 0.38313348457023516, - "grad_norm": 0.44412581422573094, - "learning_rate": 1.949600867677475e-05, - "loss": 0.2033, + "epoch": 0.3836584574333826, + "grad_norm": 0.43886686943718484, + "learning_rate": 1.5494310553371292e-05, + "loss": 0.4408, "step": 1330 }, { - "epoch": 0.3845738360159879, - "grad_norm": 0.4122212943592675, - "learning_rate": 1.9488096731987773e-05, - "loss": 0.1786, + "epoch": 0.3851007824613277, + "grad_norm": 0.44313421551297705, + "learning_rate": 1.545216148738523e-05, + "loss": 0.4728, "step": 1335 }, { - "epoch": 0.38601418746174065, - "grad_norm": 0.425017653459728, - "learning_rate": 1.948012479764902e-05, - "loss": 0.1916, + "epoch": 0.38654310748927273, + "grad_norm": 0.43446763871019, + "learning_rate": 1.5409874135850453e-05, + "loss": 0.4413, "step": 1340 }, { - "epoch": 0.38745453890749343, - "grad_norm": 0.4337062414117548, - "learning_rate": 1.9472092924161932e-05, - "loss": 0.1823, + "epoch": 0.38798543251721773, + "grad_norm": 0.5046802087731463, + "learning_rate": 1.5367449571319486e-05, + "loss": 0.451, "step": 1345 }, { - "epoch": 0.3888948903532462, - "grad_norm": 0.3805737492676703, - "learning_rate": 1.9464001162308926e-05, - "loss": 0.1789, + "epoch": 0.3894277575451628, + "grad_norm": 0.4176799699807321, + "learning_rate": 1.5324888869825062e-05, + "loss": 0.4575, "step": 1350 }, { - "epoch": 0.390335241798999, - "grad_norm": 0.36167363167725614, - "learning_rate": 1.945584956325107e-05, - "loss": 0.1898, + "epoch": 0.39087008257310785, + "grad_norm": 0.4357723650429465, + "learning_rate": 1.5282193110852806e-05, + "loss": 0.4628, "step": 1355 }, { - "epoch": 0.3917755932447517, - "grad_norm": 0.40446782411211757, - "learning_rate": 1.9447638178527766e-05, - "loss": 0.1874, + "epoch": 0.3923124076010529, + "grad_norm": 0.47847755269517595, + "learning_rate": 1.5239363377313864e-05, + "loss": 0.4426, "step": 1360 }, { - "epoch": 0.39321594469050447, - "grad_norm": 0.43931132257252586, - "learning_rate": 1.9439367060056403e-05, - "loss": 0.1844, + "epoch": 0.39375473262899796, + "grad_norm": 0.42951183292967315, + "learning_rate": 1.5196400755517445e-05, + "loss": 0.4173, "step": 1365 }, { - "epoch": 0.39465629613625725, - "grad_norm": 0.360584577289545, - "learning_rate": 1.943103626013206e-05, - "loss": 0.1784, + "epoch": 0.395197057656943, + "grad_norm": 0.3712834304196652, + "learning_rate": 1.5153306335143247e-05, + "loss": 0.4185, "step": 1370 }, { - "epoch": 0.39609664758201, - "grad_norm": 0.3838200412167434, - "learning_rate": 1.9422645831427144e-05, - "loss": 0.1977, + "epoch": 0.396639382684888, + "grad_norm": 0.40028893775485, + "learning_rate": 1.5110081209213849e-05, + "loss": 0.4404, "step": 1375 }, { - "epoch": 0.3975369990277628, - "grad_norm": 0.41400589883857647, - "learning_rate": 1.941419582699108e-05, - "loss": 0.1786, + "epoch": 0.3980817077128331, + "grad_norm": 0.3524439650077371, + "learning_rate": 1.5066726474066962e-05, + "loss": 0.436, "step": 1380 }, { - "epoch": 0.3989773504735155, - "grad_norm": 0.44021507039755364, - "learning_rate": 1.940568630024997e-05, - "loss": 0.1864, + "epoch": 0.39952403274077813, + "grad_norm": 0.41796871469443936, + "learning_rate": 1.5023243229327631e-05, + "loss": 0.4465, "step": 1385 }, { - "epoch": 0.4004177019192683, - "grad_norm": 0.4076609951453182, - "learning_rate": 1.9397117305006238e-05, - "loss": 0.1924, + "epoch": 0.4009663577687232, + "grad_norm": 0.39648024648913516, + "learning_rate": 1.4979632577880355e-05, + "loss": 0.4599, "step": 1390 }, { - "epoch": 0.40185805336502106, - "grad_norm": 0.3448331981439342, - "learning_rate": 1.9388488895438322e-05, - "loss": 0.1727, + "epoch": 0.40240868279666825, + "grad_norm": 0.4177593581987727, + "learning_rate": 1.4935895625841095e-05, + "loss": 0.4341, "step": 1395 }, { - "epoch": 0.40329840481077384, - "grad_norm": 0.37971856194171055, - "learning_rate": 1.9379801126100305e-05, - "loss": 0.1908, + "epoch": 0.40385100782461325, + "grad_norm": 0.39474357091689116, + "learning_rate": 1.4892033482529233e-05, + "loss": 0.4251, "step": 1400 }, { - "epoch": 0.4047387562565266, - "grad_norm": 0.4063644753493077, - "learning_rate": 1.937105405192157e-05, - "loss": 0.1959, + "epoch": 0.4052933328525583, + "grad_norm": 0.3925865645135851, + "learning_rate": 1.484804726043943e-05, + "loss": 0.4188, "step": 1405 }, { - "epoch": 0.40617910770227933, - "grad_norm": 0.4240507739263381, - "learning_rate": 1.9362247728206484e-05, - "loss": 0.1847, + "epoch": 0.40673565788050337, + "grad_norm": 0.43881341912306815, + "learning_rate": 1.480393807521342e-05, + "loss": 0.4626, "step": 1410 }, { - "epoch": 0.4076194591480321, - "grad_norm": 0.35602981769186853, - "learning_rate": 1.9353382210634005e-05, - "loss": 0.1765, + "epoch": 0.4081779829084484, + "grad_norm": 0.38784235208087897, + "learning_rate": 1.4759707045611694e-05, + "loss": 0.4356, "step": 1415 }, { - "epoch": 0.4090598105937849, - "grad_norm": 0.3890399540852024, - "learning_rate": 1.934445755525736e-05, - "loss": 0.1833, + "epoch": 0.4096203079363935, + "grad_norm": 0.4652349082201273, + "learning_rate": 1.4715355293485134e-05, + "loss": 0.4429, "step": 1420 }, { - "epoch": 0.41050016203953765, - "grad_norm": 0.3680499036431358, - "learning_rate": 1.9335473818503683e-05, - "loss": 0.1845, + "epoch": 0.41106263296433854, + "grad_norm": 0.5020179396910893, + "learning_rate": 1.4670883943746575e-05, + "loss": 0.4424, "step": 1425 }, { - "epoch": 0.4119405134852904, - "grad_norm": 0.41759109180555776, - "learning_rate": 1.932643105717365e-05, - "loss": 0.1974, + "epoch": 0.41250495799228354, + "grad_norm": 0.46646941577755224, + "learning_rate": 1.4626294124342237e-05, + "loss": 0.4473, "step": 1430 }, { - "epoch": 0.4133808649310432, - "grad_norm": 0.37662092517599377, - "learning_rate": 1.9317329328441126e-05, - "loss": 0.1915, + "epoch": 0.4139472830202286, + "grad_norm": 0.3715580720003536, + "learning_rate": 1.4581586966223156e-05, + "loss": 0.457, "step": 1435 }, { - "epoch": 0.4148212163767959, - "grad_norm": 0.3544406104660179, - "learning_rate": 1.9308168689852816e-05, - "loss": 0.1824, + "epoch": 0.41538960804817365, + "grad_norm": 0.3913149158851186, + "learning_rate": 1.453676360331647e-05, + "loss": 0.4232, "step": 1440 }, { - "epoch": 0.4162615678225487, - "grad_norm": 0.40679163344205593, - "learning_rate": 1.929894919932788e-05, - "loss": 0.1936, + "epoch": 0.4168319330761187, + "grad_norm": 0.3755928140913827, + "learning_rate": 1.4491825172496675e-05, + "loss": 0.4376, "step": 1445 }, { - "epoch": 0.41770191926830147, - "grad_norm": 0.37090593229503027, - "learning_rate": 1.928967091515757e-05, - "loss": 0.1863, + "epoch": 0.41827425810406377, + "grad_norm": 0.4632236851893659, + "learning_rate": 1.4446772813556784e-05, + "loss": 0.4547, "step": 1450 }, { - "epoch": 0.41914227071405424, - "grad_norm": 0.3358948102639496, - "learning_rate": 1.928033389600488e-05, - "loss": 0.1791, + "epoch": 0.4197165831320088, + "grad_norm": 0.3622221987812085, + "learning_rate": 1.4401607669179415e-05, + "loss": 0.4189, "step": 1455 }, { - "epoch": 0.420582622159807, - "grad_norm": 0.39320319005496884, - "learning_rate": 1.927093820090416e-05, - "loss": 0.1967, + "epoch": 0.4211589081599538, + "grad_norm": 0.4427510263617938, + "learning_rate": 1.4356330884907823e-05, + "loss": 0.4307, "step": 1460 }, { - "epoch": 0.42202297360555974, - "grad_norm": 0.39872720769164977, - "learning_rate": 1.9261483889260733e-05, - "loss": 0.1804, + "epoch": 0.4226012331878989, + "grad_norm": 0.40821656664051026, + "learning_rate": 1.4310943609116815e-05, + "loss": 0.4416, "step": 1465 }, { - "epoch": 0.4234633250513125, - "grad_norm": 0.37282558197697574, - "learning_rate": 1.9251971020850545e-05, - "loss": 0.1799, + "epoch": 0.42404355821584394, + "grad_norm": 0.45484460030870416, + "learning_rate": 1.4265446992983661e-05, + "loss": 0.449, "step": 1470 }, { - "epoch": 0.4249036764970653, - "grad_norm": 0.37924447925156907, - "learning_rate": 1.9242399655819777e-05, - "loss": 0.178, + "epoch": 0.425485883243789, + "grad_norm": 0.38430976618751717, + "learning_rate": 1.4219842190458865e-05, + "loss": 0.4445, "step": 1475 }, { - "epoch": 0.42634402794281806, - "grad_norm": 0.41752825519108605, - "learning_rate": 1.923276985468444e-05, - "loss": 0.1863, + "epoch": 0.42692820827173406, + "grad_norm": 0.40624625230940725, + "learning_rate": 1.4174130358236924e-05, + "loss": 0.4734, "step": 1480 }, { - "epoch": 0.42778437938857083, - "grad_norm": 0.5167921346970157, - "learning_rate": 1.922308167833004e-05, - "loss": 0.187, + "epoch": 0.42837053329967906, + "grad_norm": 0.38501281348072397, + "learning_rate": 1.4128312655726957e-05, + "loss": 0.4407, "step": 1485 }, { - "epoch": 0.42922473083432355, - "grad_norm": 0.40873239691837054, - "learning_rate": 1.921333518801115e-05, - "loss": 0.1936, + "epoch": 0.4298128583276241, + "grad_norm": 0.5552503619067779, + "learning_rate": 1.4082390245023337e-05, + "loss": 0.4559, "step": 1490 }, { - "epoch": 0.43066508228007633, - "grad_norm": 0.40907971709968743, - "learning_rate": 1.9203530445351037e-05, - "loss": 0.1782, + "epoch": 0.43125518335556917, + "grad_norm": 0.41269951819834144, + "learning_rate": 1.4036364290876176e-05, + "loss": 0.4407, "step": 1495 }, { - "epoch": 0.4321054337258291, - "grad_norm": 0.38770424167315554, - "learning_rate": 1.9193667512341294e-05, - "loss": 0.1868, + "epoch": 0.43269750838351423, + "grad_norm": 0.4132538908060478, + "learning_rate": 1.3990235960661824e-05, + "loss": 0.4439, "step": 1500 }, { - "epoch": 0.4321054337258291, - "eval_loss": 0.18722842633724213, - "eval_runtime": 179.8721, - "eval_samples_per_second": 10.029, - "eval_steps_per_second": 2.507, + "epoch": 0.43269750838351423, + "eval_loss": 0.43445292115211487, + "eval_runtime": 142.5412, + "eval_samples_per_second": 12.635, + "eval_steps_per_second": 3.164, "step": 1500 }, { - "epoch": 0.4335457851715819, - "grad_norm": 0.4127505427391907, - "learning_rate": 1.918374645134141e-05, - "loss": 0.1891, + "epoch": 0.4341398334114593, + "grad_norm": 0.42757706099004156, + "learning_rate": 1.3944006424353229e-05, + "loss": 0.4247, "step": 1505 }, { - "epoch": 0.43498613661733465, - "grad_norm": 0.4075820439736581, - "learning_rate": 1.9173767325078403e-05, - "loss": 0.1836, + "epoch": 0.43558215843940434, + "grad_norm": 0.36759037583277737, + "learning_rate": 1.389767685449027e-05, + "loss": 0.4306, "step": 1510 }, { - "epoch": 0.43642648806308737, - "grad_norm": 0.41104823750430075, - "learning_rate": 1.9163730196646416e-05, - "loss": 0.1873, + "epoch": 0.43702448346734935, + "grad_norm": 0.42042330760151675, + "learning_rate": 1.3851248426150026e-05, + "loss": 0.4244, "step": 1515 }, { - "epoch": 0.43786683950884014, - "grad_norm": 0.3793283659275875, - "learning_rate": 1.915363512950631e-05, - "loss": 0.1766, + "epoch": 0.4384668084952944, + "grad_norm": 0.38414415773611094, + "learning_rate": 1.380472231691697e-05, + "loss": 0.4377, "step": 1520 }, { - "epoch": 0.4393071909545929, - "grad_norm": 0.39712080280256257, - "learning_rate": 1.9143482187485283e-05, - "loss": 0.1903, + "epoch": 0.43990913352323946, + "grad_norm": 0.4303765304251248, + "learning_rate": 1.375809970685309e-05, + "loss": 0.4574, "step": 1525 }, { - "epoch": 0.4407475424003457, - "grad_norm": 0.38704548175577747, - "learning_rate": 1.9133271434776438e-05, - "loss": 0.1836, + "epoch": 0.4413514585511845, + "grad_norm": 0.39045631524439356, + "learning_rate": 1.3711381778467972e-05, + "loss": 0.4487, "step": 1530 }, { - "epoch": 0.44218789384609847, - "grad_norm": 0.32778859492202395, - "learning_rate": 1.9123002935938405e-05, - "loss": 0.1796, + "epoch": 0.4427937835791296, + "grad_norm": 0.409923537347395, + "learning_rate": 1.36645697166888e-05, + "loss": 0.4155, "step": 1535 }, { - "epoch": 0.4436282452918512, - "grad_norm": 0.41828874008537575, - "learning_rate": 1.911267675589491e-05, - "loss": 0.1875, + "epoch": 0.44423610860707463, + "grad_norm": 0.4590281734742793, + "learning_rate": 1.3617664708830304e-05, + "loss": 0.4211, "step": 1540 }, { - "epoch": 0.44506859673760396, - "grad_norm": 0.3559379941636415, - "learning_rate": 1.9102292959934385e-05, - "loss": 0.1794, + "epoch": 0.44567843363501963, + "grad_norm": 0.4340206380764746, + "learning_rate": 1.3570667944564651e-05, + "loss": 0.43, "step": 1545 }, { - "epoch": 0.44650894818335674, - "grad_norm": 0.4087026290282012, - "learning_rate": 1.9091851613709538e-05, - "loss": 0.1847, + "epoch": 0.4471207586629647, + "grad_norm": 0.3867702108735739, + "learning_rate": 1.3523580615891258e-05, + "loss": 0.4367, "step": 1550 }, { - "epoch": 0.4479492996291095, - "grad_norm": 0.40953363707420476, - "learning_rate": 1.9081352783236945e-05, - "loss": 0.1933, + "epoch": 0.44856308369090975, + "grad_norm": 0.45493644595260835, + "learning_rate": 1.347640391710657e-05, + "loss": 0.4336, "step": 1555 }, { - "epoch": 0.4493896510748623, - "grad_norm": 0.36898746212883354, - "learning_rate": 1.9070796534896644e-05, - "loss": 0.1768, + "epoch": 0.4500054087188548, + "grad_norm": 0.41557484865468924, + "learning_rate": 1.3429139044773768e-05, + "loss": 0.4128, "step": 1560 }, { - "epoch": 0.450830002520615, - "grad_norm": 0.41334025634238375, - "learning_rate": 1.9060182935431682e-05, - "loss": 0.1829, + "epoch": 0.45144773374679986, + "grad_norm": 0.41564130897863455, + "learning_rate": 1.3381787197692413e-05, + "loss": 0.3957, "step": 1565 }, { - "epoch": 0.4522703539663678, - "grad_norm": 0.44332370723191467, - "learning_rate": 1.9049512051947735e-05, - "loss": 0.1901, + "epoch": 0.45289005877474486, + "grad_norm": 0.4011264197640641, + "learning_rate": 1.3334349576868046e-05, + "loss": 0.442, "step": 1570 }, { - "epoch": 0.45371070541212055, - "grad_norm": 0.4004111280630263, - "learning_rate": 1.9038783951912653e-05, - "loss": 0.1889, + "epoch": 0.4543323838026899, + "grad_norm": 0.4825855614290229, + "learning_rate": 1.3286827385481726e-05, + "loss": 0.4058, "step": 1575 }, { - "epoch": 0.4551510568578733, - "grad_norm": 0.371651865125121, - "learning_rate": 1.9027998703156055e-05, - "loss": 0.1812, + "epoch": 0.455774708830635, + "grad_norm": 0.3921023793032671, + "learning_rate": 1.3239221828859509e-05, + "loss": 0.3884, "step": 1580 }, { - "epoch": 0.4565914083036261, - "grad_norm": 0.40485070841566645, - "learning_rate": 1.901715637386887e-05, - "loss": 0.1776, + "epoch": 0.45721703385858004, + "grad_norm": 0.40627991293028837, + "learning_rate": 1.3191534114441883e-05, + "loss": 0.4333, "step": 1585 }, { - "epoch": 0.4580317597493789, - "grad_norm": 0.48633370085572336, - "learning_rate": 1.9006257032602942e-05, - "loss": 0.1789, + "epoch": 0.4586593588865251, + "grad_norm": 0.43891554498901797, + "learning_rate": 1.3143765451753137e-05, + "loss": 0.4166, "step": 1590 }, { - "epoch": 0.4594721111951316, - "grad_norm": 0.3966239241276605, - "learning_rate": 1.8995300748270577e-05, - "loss": 0.1729, + "epoch": 0.46010168391447015, + "grad_norm": 0.39830311047980305, + "learning_rate": 1.3095917052370686e-05, + "loss": 0.4235, "step": 1595 }, { - "epoch": 0.46091246264088437, - "grad_norm": 0.3327027752839659, - "learning_rate": 1.8984287590144102e-05, - "loss": 0.1734, + "epoch": 0.46154400894241515, + "grad_norm": 0.3980453207285396, + "learning_rate": 1.3047990129894348e-05, + "loss": 0.4001, "step": 1600 }, { - "epoch": 0.46235281408663714, - "grad_norm": 0.4317078665592322, - "learning_rate": 1.897321762785544e-05, - "loss": 0.1783, + "epoch": 0.4629863339703602, + "grad_norm": 0.4136578166461488, + "learning_rate": 1.299998589991555e-05, + "loss": 0.4076, "step": 1605 }, { - "epoch": 0.4637931655323899, - "grad_norm": 0.4235966917899685, - "learning_rate": 1.896209093139567e-05, - "loss": 0.1767, + "epoch": 0.46442865899830527, + "grad_norm": 0.4343208402620231, + "learning_rate": 1.2951905579986506e-05, + "loss": 0.4384, "step": 1610 }, { - "epoch": 0.4652335169781427, - "grad_norm": 0.36113982142860973, - "learning_rate": 1.8950907571114568e-05, - "loss": 0.1794, + "epoch": 0.4658709840262503, + "grad_norm": 0.45578762184210947, + "learning_rate": 1.290375038958933e-05, + "loss": 0.4048, "step": 1615 }, { - "epoch": 0.4666738684238954, - "grad_norm": 0.3810588912729469, - "learning_rate": 1.893966761772018e-05, - "loss": 0.1836, + "epoch": 0.4673133090541954, + "grad_norm": 0.46943412662551365, + "learning_rate": 1.285552155010511e-05, + "loss": 0.401, "step": 1620 }, { - "epoch": 0.4681142198696482, - "grad_norm": 0.4196558725198718, - "learning_rate": 1.8928371142278368e-05, - "loss": 0.1866, + "epoch": 0.46875563408214044, + "grad_norm": 0.40848878753251544, + "learning_rate": 1.2807220284782926e-05, + "loss": 0.4461, "step": 1625 }, { - "epoch": 0.46955457131540096, - "grad_norm": 0.43666303493670905, - "learning_rate": 1.891701821621236e-05, - "loss": 0.1901, + "epoch": 0.47019795911008544, + "grad_norm": 0.3921726292273481, + "learning_rate": 1.2758847818708832e-05, + "loss": 0.4205, "step": 1630 }, { - "epoch": 0.47099492276115373, - "grad_norm": 0.3803714847945443, - "learning_rate": 1.8905608911302303e-05, - "loss": 0.1862, + "epoch": 0.4716402841380305, + "grad_norm": 0.45781513572784016, + "learning_rate": 1.2710405378774768e-05, + "loss": 0.4423, "step": 1635 }, { - "epoch": 0.4724352742069065, - "grad_norm": 0.3757549530860626, - "learning_rate": 1.8894143299684797e-05, - "loss": 0.1961, + "epoch": 0.47308260916597555, + "grad_norm": 0.45862261759553535, + "learning_rate": 1.2661894193647458e-05, + "loss": 0.4, "step": 1640 }, { - "epoch": 0.4738756256526592, - "grad_norm": 0.4066659146296864, - "learning_rate": 1.8882621453852456e-05, - "loss": 0.1699, + "epoch": 0.4745249341939206, + "grad_norm": 0.3527899534786595, + "learning_rate": 1.261331549373724e-05, + "loss": 0.3998, "step": 1645 }, { - "epoch": 0.475315977098412, - "grad_norm": 0.3712534952216303, - "learning_rate": 1.8871043446653436e-05, - "loss": 0.1931, + "epoch": 0.47596725922186567, + "grad_norm": 0.36297450328540837, + "learning_rate": 1.2564670511166865e-05, + "loss": 0.4206, "step": 1650 }, { - "epoch": 0.4767563285441648, - "grad_norm": 0.4190807324107653, - "learning_rate": 1.885940935129098e-05, - "loss": 0.1858, + "epoch": 0.47740958424981067, + "grad_norm": 0.4030716124087903, + "learning_rate": 1.2515960479740224e-05, + "loss": 0.4047, "step": 1655 }, { - "epoch": 0.47819667998991755, - "grad_norm": 0.36972687977137836, - "learning_rate": 1.884771924132296e-05, - "loss": 0.1857, + "epoch": 0.4788519092777557, + "grad_norm": 0.41175543047417906, + "learning_rate": 1.246718663491108e-05, + "loss": 0.4345, "step": 1660 }, { - "epoch": 0.4796370314356703, - "grad_norm": 0.3607936830171753, - "learning_rate": 1.8835973190661397e-05, - "loss": 0.1825, + "epoch": 0.4802942343057008, + "grad_norm": 0.3574092930784039, + "learning_rate": 1.2418350213751728e-05, + "loss": 0.4081, "step": 1665 }, { - "epoch": 0.48107738288142304, - "grad_norm": 0.39734635206783725, - "learning_rate": 1.8824171273572017e-05, - "loss": 0.1846, + "epoch": 0.48173655933364584, + "grad_norm": 0.3954039812545518, + "learning_rate": 1.2369452454921604e-05, + "loss": 0.4159, "step": 1670 }, { - "epoch": 0.4825177343271758, - "grad_norm": 0.38332143199966057, - "learning_rate": 1.881231356467375e-05, - "loss": 0.1916, + "epoch": 0.4831788843615909, + "grad_norm": 0.4497181497561506, + "learning_rate": 1.2320494598635886e-05, + "loss": 0.4052, "step": 1675 }, { - "epoch": 0.4839580857729286, - "grad_norm": 0.35427455618937453, - "learning_rate": 1.8800400138938293e-05, - "loss": 0.1763, + "epoch": 0.48462120938953596, + "grad_norm": 0.44655082111096045, + "learning_rate": 1.2271477886634023e-05, + "loss": 0.4123, "step": 1680 }, { - "epoch": 0.48539843721868137, - "grad_norm": 0.3865305729090607, - "learning_rate": 1.8788431071689605e-05, - "loss": 0.1771, + "epoch": 0.48606353441748096, + "grad_norm": 0.40423139543908587, + "learning_rate": 1.2222403562148252e-05, + "loss": 0.4152, "step": 1685 }, { - "epoch": 0.48683878866443414, - "grad_norm": 0.34295884844513114, - "learning_rate": 1.8776406438603457e-05, - "loss": 0.1691, + "epoch": 0.487505859445426, + "grad_norm": 0.36806086858378434, + "learning_rate": 1.2173272869872062e-05, + "loss": 0.4252, "step": 1690 }, { - "epoch": 0.48827914011018686, - "grad_norm": 0.3776721425978484, - "learning_rate": 1.876432631570693e-05, - "loss": 0.1788, + "epoch": 0.4889481844733711, + "grad_norm": 0.41722654899253564, + "learning_rate": 1.2124087055928617e-05, + "loss": 0.3879, "step": 1695 }, { - "epoch": 0.48971949155593963, - "grad_norm": 0.3861886847035672, - "learning_rate": 1.8752190779377958e-05, - "loss": 0.1882, + "epoch": 0.49039050950131613, + "grad_norm": 0.4329150355333478, + "learning_rate": 1.207484736783916e-05, + "loss": 0.3849, "step": 1700 }, { - "epoch": 0.4911598430016924, - "grad_norm": 0.3634727635182055, - "learning_rate": 1.8739999906344817e-05, - "loss": 0.1836, + "epoch": 0.4918328345292612, + "grad_norm": 0.4710085788902766, + "learning_rate": 1.2025555054491367e-05, + "loss": 0.4303, "step": 1705 }, { - "epoch": 0.4926001944474452, - "grad_norm": 0.43152804810108353, - "learning_rate": 1.872775377368567e-05, - "loss": 0.1935, + "epoch": 0.4932751595572062, + "grad_norm": 0.443066548358196, + "learning_rate": 1.1976211366107668e-05, + "loss": 0.4198, "step": 1710 }, { - "epoch": 0.49404054589319796, - "grad_norm": 0.3655986706714032, - "learning_rate": 1.8715452458828057e-05, - "loss": 0.1813, + "epoch": 0.49471748458515125, + "grad_norm": 0.3338656609348242, + "learning_rate": 1.1926817554213548e-05, + "loss": 0.3911, "step": 1715 }, { - "epoch": 0.4954808973389507, - "grad_norm": 0.35970766776797, - "learning_rate": 1.8703096039548415e-05, - "loss": 0.1931, + "epoch": 0.4961598096130963, + "grad_norm": 0.38270258610415053, + "learning_rate": 1.1877374871605786e-05, + "loss": 0.4068, "step": 1720 }, { - "epoch": 0.49692124878470345, - "grad_norm": 0.3935269406537452, - "learning_rate": 1.869068459397159e-05, - "loss": 0.175, + "epoch": 0.49760213464104136, + "grad_norm": 0.40504870451767916, + "learning_rate": 1.18278845723207e-05, + "loss": 0.4117, "step": 1725 }, { - "epoch": 0.4983616002304562, - "grad_norm": 0.3695776720546673, - "learning_rate": 1.8678218200570327e-05, - "loss": 0.1821, + "epoch": 0.4990444596689864, + "grad_norm": 0.4346348228563321, + "learning_rate": 1.1778347911602329e-05, + "loss": 0.4104, "step": 1730 }, { - "epoch": 0.499801951676209, - "grad_norm": 0.3691159723569882, - "learning_rate": 1.866569693816479e-05, - "loss": 0.1884, + "epoch": 0.5004867846969314, + "grad_norm": 0.4075021793881479, + "learning_rate": 1.1728766145870587e-05, + "loss": 0.4229, "step": 1735 }, { - "epoch": 0.5012423031219617, - "grad_norm": 0.38398657899121097, - "learning_rate": 1.865312088592207e-05, - "loss": 0.1818, + "epoch": 0.5019291097248765, + "grad_norm": 0.418017099187981, + "learning_rate": 1.167914053268942e-05, + "loss": 0.407, "step": 1740 }, { - "epoch": 0.5026826545677145, - "grad_norm": 0.4032929747939036, - "learning_rate": 1.8640490123355656e-05, - "loss": 0.1793, + "epoch": 0.5033714347528215, + "grad_norm": 0.39895813955242926, + "learning_rate": 1.1629472330734888e-05, + "loss": 0.3978, "step": 1745 }, { - "epoch": 0.5041230060134673, - "grad_norm": 0.38471076664611525, - "learning_rate": 1.8627804730324955e-05, - "loss": 0.1848, + "epoch": 0.5048137597807666, + "grad_norm": 0.40383289208967305, + "learning_rate": 1.1579762799763249e-05, + "loss": 0.4175, "step": 1750 }, { - "epoch": 0.5055633574592201, - "grad_norm": 0.3512566796696949, - "learning_rate": 1.8615064787034784e-05, - "loss": 0.1933, + "epoch": 0.5062560848087116, + "grad_norm": 0.5225560587862472, + "learning_rate": 1.1530013200579008e-05, + "loss": 0.4131, "step": 1755 }, { - "epoch": 0.5070037089049728, - "grad_norm": 0.3736665521850479, - "learning_rate": 1.8602270374034853e-05, - "loss": 0.183, + "epoch": 0.5076984098366567, + "grad_norm": 0.4004897787727647, + "learning_rate": 1.1480224795002943e-05, + "loss": 0.3888, "step": 1760 }, { - "epoch": 0.5084440603507255, - "grad_norm": 0.3796593083345244, - "learning_rate": 1.8589421572219277e-05, - "loss": 0.1743, + "epoch": 0.5091407348646018, + "grad_norm": 0.4248175503521806, + "learning_rate": 1.1430398845840085e-05, + "loss": 0.4324, "step": 1765 }, { - "epoch": 0.5098844117964784, - "grad_norm": 0.3688833908332777, - "learning_rate": 1.8576518462826033e-05, - "loss": 0.1784, + "epoch": 0.5105830598925468, + "grad_norm": 0.43829908182981264, + "learning_rate": 1.1380536616847706e-05, + "loss": 0.4079, "step": 1770 }, { - "epoch": 0.5113247632422311, - "grad_norm": 0.3549233461196651, - "learning_rate": 1.8563561127436472e-05, - "loss": 0.1826, + "epoch": 0.5120253849204919, + "grad_norm": 0.43570794658905476, + "learning_rate": 1.1330639372703258e-05, + "loss": 0.4045, "step": 1775 }, { - "epoch": 0.5127651146879839, - "grad_norm": 0.36567662921445526, - "learning_rate": 1.8550549647974803e-05, - "loss": 0.1782, + "epoch": 0.5134677099484369, + "grad_norm": 0.43500914045447153, + "learning_rate": 1.12807083789723e-05, + "loss": 0.419, "step": 1780 }, { - "epoch": 0.5142054661337366, - "grad_norm": 0.3939614446337141, - "learning_rate": 1.8537484106707553e-05, - "loss": 0.177, + "epoch": 0.5149100349763819, + "grad_norm": 0.41351142363579385, + "learning_rate": 1.123074490207639e-05, + "loss": 0.3986, "step": 1785 }, { - "epoch": 0.5156458175794894, - "grad_norm": 0.36187155938329685, - "learning_rate": 1.8524364586243063e-05, - "loss": 0.1721, + "epoch": 0.5163523600043269, + "grad_norm": 0.37789765808010595, + "learning_rate": 1.1180750209260972e-05, + "loss": 0.4016, "step": 1790 }, { - "epoch": 0.5170861690252422, - "grad_norm": 0.346859682925655, - "learning_rate": 1.8511191169530977e-05, - "loss": 0.1756, + "epoch": 0.517794685032272, + "grad_norm": 0.4013962679722207, + "learning_rate": 1.1130725568563241e-05, + "loss": 0.4081, "step": 1795 }, { - "epoch": 0.5185265204709949, - "grad_norm": 0.314365239889427, - "learning_rate": 1.8497963939861684e-05, - "loss": 0.1662, + "epoch": 0.519237010060217, + "grad_norm": 0.38374761554210224, + "learning_rate": 1.1080672248779964e-05, + "loss": 0.4061, "step": 1800 }, { - "epoch": 0.5199668719167477, - "grad_norm": 0.35335427903783484, - "learning_rate": 1.8484682980865827e-05, - "loss": 0.1736, + "epoch": 0.5206793350881621, + "grad_norm": 0.44182386119487255, + "learning_rate": 1.1030591519435316e-05, + "loss": 0.3916, "step": 1805 }, { - "epoch": 0.5214072233625004, - "grad_norm": 0.3219984035719524, - "learning_rate": 1.8471348376513753e-05, - "loss": 0.1696, + "epoch": 0.5221216601161072, + "grad_norm": 0.44971294735945117, + "learning_rate": 1.0980484650748666e-05, + "loss": 0.3996, "step": 1810 }, { - "epoch": 0.5228475748082532, - "grad_norm": 0.33373306734235164, - "learning_rate": 1.845796021111499e-05, - "loss": 0.1884, + "epoch": 0.5235639851440522, + "grad_norm": 0.35276497806950113, + "learning_rate": 1.0930352913602371e-05, + "loss": 0.3732, "step": 1815 }, { - "epoch": 0.524287926254006, - "grad_norm": 0.35546415706338036, - "learning_rate": 1.8444518569317704e-05, - "loss": 0.1695, + "epoch": 0.5250063101719973, + "grad_norm": 0.42340138266599786, + "learning_rate": 1.0880197579509532e-05, + "loss": 0.4222, "step": 1820 }, { - "epoch": 0.5257282776997587, - "grad_norm": 0.37012004054869435, - "learning_rate": 1.8431023536108175e-05, - "loss": 0.1697, + "epoch": 0.5264486351999423, + "grad_norm": 0.39078797688993877, + "learning_rate": 1.0830019920581753e-05, + "loss": 0.4136, "step": 1825 }, { - "epoch": 0.5271686291455115, - "grad_norm": 0.37288255802693826, - "learning_rate": 1.841747519681027e-05, - "loss": 0.1737, + "epoch": 0.5278909602278874, + "grad_norm": 0.4130289272161752, + "learning_rate": 1.0779821209496876e-05, + "loss": 0.4192, "step": 1830 }, { - "epoch": 0.5286089805912643, - "grad_norm": 0.37080812218300224, - "learning_rate": 1.8403873637084872e-05, - "loss": 0.1796, + "epoch": 0.5293332852558325, + "grad_norm": 0.41541974485384586, + "learning_rate": 1.0729602719466692e-05, + "loss": 0.4031, "step": 1835 }, { - "epoch": 0.530049332037017, - "grad_norm": 0.3478775229906149, - "learning_rate": 1.839021894292936e-05, - "loss": 0.176, + "epoch": 0.5307756102837774, + "grad_norm": 0.44049659174573497, + "learning_rate": 1.067936572420466e-05, + "loss": 0.4069, "step": 1840 }, { - "epoch": 0.5314896834827698, - "grad_norm": 0.372540490899064, - "learning_rate": 1.8376511200677067e-05, - "loss": 0.2007, + "epoch": 0.5322179353117225, + "grad_norm": 0.44056632399340595, + "learning_rate": 1.0629111497893591e-05, + "loss": 0.3964, "step": 1845 }, { - "epoch": 0.5329300349285225, - "grad_norm": 0.36586791336161834, - "learning_rate": 1.836275049699672e-05, - "loss": 0.1751, + "epoch": 0.5336602603396675, + "grad_norm": 0.40575645379756525, + "learning_rate": 1.0578841315153333e-05, + "loss": 0.3953, "step": 1850 }, { - "epoch": 0.5343703863742754, - "grad_norm": 0.3448429738318597, - "learning_rate": 1.834893691889191e-05, - "loss": 0.1805, + "epoch": 0.5351025853676126, + "grad_norm": 0.37056517023195357, + "learning_rate": 1.0528556451008447e-05, + "loss": 0.4058, "step": 1855 }, { - "epoch": 0.5358107378200281, - "grad_norm": 0.3311960167279875, - "learning_rate": 1.8335070553700533e-05, - "loss": 0.1681, + "epoch": 0.5365449103955576, + "grad_norm": 0.38961078802000476, + "learning_rate": 1.0478258180855869e-05, + "loss": 0.3783, "step": 1860 }, { - "epoch": 0.5372510892657808, - "grad_norm": 0.35282815078601876, - "learning_rate": 1.832115148909422e-05, - "loss": 0.173, + "epoch": 0.5379872354235027, + "grad_norm": 0.4278326171242378, + "learning_rate": 1.0427947780432547e-05, + "loss": 0.4025, "step": 1865 }, { - "epoch": 0.5386914407115336, - "grad_norm": 0.3063521260958916, - "learning_rate": 1.830717981307782e-05, - "loss": 0.1663, + "epoch": 0.5394295604514477, + "grad_norm": 0.4487192036382051, + "learning_rate": 1.0377626525783101e-05, + "loss": 0.3933, "step": 1870 }, { - "epoch": 0.5401317921572864, - "grad_norm": 0.3463171525987883, - "learning_rate": 1.8293155613988816e-05, - "loss": 0.1893, + "epoch": 0.5408718854793928, + "grad_norm": 0.5348996401888022, + "learning_rate": 1.0327295693227454e-05, + "loss": 0.447, "step": 1875 }, { - "epoch": 0.5415721436030392, - "grad_norm": 0.3279453628244872, - "learning_rate": 1.827907898049677e-05, - "loss": 0.1733, + "epoch": 0.5423142105073379, + "grad_norm": 0.527197311781129, + "learning_rate": 1.0276956559328455e-05, + "loss": 0.3949, "step": 1880 }, { - "epoch": 0.5430124950487919, - "grad_norm": 0.3520187313077995, - "learning_rate": 1.8264950001602778e-05, - "loss": 0.1825, + "epoch": 0.5437565355352829, + "grad_norm": 0.41151058505508553, + "learning_rate": 1.0226610400859498e-05, + "loss": 0.4051, "step": 1885 }, { - "epoch": 0.5444528464945446, - "grad_norm": 0.3565061630561884, - "learning_rate": 1.825076876663888e-05, - "loss": 0.1848, + "epoch": 0.545198860563228, + "grad_norm": 0.37166405264306773, + "learning_rate": 1.0176258494772153e-05, + "loss": 0.3991, "step": 1890 }, { - "epoch": 0.5458931979402974, - "grad_norm": 0.378084862994715, - "learning_rate": 1.823653536526752e-05, - "loss": 0.1808, + "epoch": 0.5466411855911729, + "grad_norm": 0.4167614980577364, + "learning_rate": 1.0125902118163762e-05, + "loss": 0.4086, "step": 1895 }, { - "epoch": 0.5473335493860502, - "grad_norm": 0.4212362648494665, - "learning_rate": 1.8222249887480966e-05, - "loss": 0.1874, + "epoch": 0.548083510619118, + "grad_norm": 0.4002106455641225, + "learning_rate": 1.007554254824506e-05, + "loss": 0.4006, "step": 1900 }, { - "epoch": 0.548773900831803, - "grad_norm": 0.37361394437125983, - "learning_rate": 1.8207912423600755e-05, - "loss": 0.1807, + "epoch": 0.549525835647063, + "grad_norm": 0.38648887792017217, + "learning_rate": 1.0025181062307774e-05, + "loss": 0.4009, "step": 1905 }, { - "epoch": 0.5502142522775557, - "grad_norm": 0.4422231855974053, - "learning_rate": 1.8193523064277103e-05, - "loss": 0.1774, + "epoch": 0.5509681606750081, + "grad_norm": 0.4402653770907521, + "learning_rate": 9.974818937692228e-06, + "loss": 0.3909, "step": 1910 }, { - "epoch": 0.5516546037233084, - "grad_norm": 0.37677879394634756, - "learning_rate": 1.8179081900488337e-05, - "loss": 0.1868, + "epoch": 0.5524104857029531, + "grad_norm": 0.39402192655503426, + "learning_rate": 9.92445745175494e-06, + "loss": 0.3793, "step": 1915 }, { - "epoch": 0.5530949551690613, - "grad_norm": 0.32468876912478695, - "learning_rate": 1.8164589023540332e-05, - "loss": 0.1739, + "epoch": 0.5538528107308982, + "grad_norm": 0.36447042674734037, + "learning_rate": 9.874097881836241e-06, + "loss": 0.3856, "step": 1920 }, { - "epoch": 0.554535306614814, - "grad_norm": 0.3939834452367248, - "learning_rate": 1.815004452506592e-05, - "loss": 0.1807, + "epoch": 0.5552951357588433, + "grad_norm": 0.38084863196798785, + "learning_rate": 9.823741505227852e-06, + "loss": 0.3821, "step": 1925 }, { - "epoch": 0.5559756580605668, - "grad_norm": 0.36557739763760627, - "learning_rate": 1.813544849702432e-05, - "loss": 0.1637, + "epoch": 0.5567374607867883, + "grad_norm": 0.3689396281200298, + "learning_rate": 9.773389599140504e-06, + "loss": 0.3888, "step": 1930 }, { - "epoch": 0.5574160095063195, - "grad_norm": 0.3818470475052624, - "learning_rate": 1.812080103170055e-05, - "loss": 0.1783, + "epoch": 0.5581797858147334, + "grad_norm": 0.42447241183482853, + "learning_rate": 9.72304344067155e-06, + "loss": 0.4018, "step": 1935 }, { - "epoch": 0.5588563609520723, - "grad_norm": 0.3951616694877707, - "learning_rate": 1.8106102221704848e-05, - "loss": 0.1843, + "epoch": 0.5596221108426784, + "grad_norm": 0.34840166562757835, + "learning_rate": 9.672704306772547e-06, + "loss": 0.381, "step": 1940 }, { - "epoch": 0.5602967123978251, - "grad_norm": 0.36826181964798843, - "learning_rate": 1.809135215997208e-05, - "loss": 0.1829, + "epoch": 0.5610644358706235, + "grad_norm": 0.3824007554962182, + "learning_rate": 9.6223734742169e-06, + "loss": 0.405, "step": 1945 }, { - "epoch": 0.5617370638435778, - "grad_norm": 0.39404420593554734, - "learning_rate": 1.8076550939761156e-05, - "loss": 0.1763, + "epoch": 0.5625067608985685, + "grad_norm": 0.40567921647837246, + "learning_rate": 9.572052219567455e-06, + "loss": 0.3886, "step": 1950 }, { - "epoch": 0.5631774152893306, - "grad_norm": 0.4062269110223546, - "learning_rate": 1.806169865465445e-05, - "loss": 0.1855, + "epoch": 0.5639490859265135, + "grad_norm": 0.4496361442646002, + "learning_rate": 9.521741819144135e-06, + "loss": 0.3926, "step": 1955 }, { - "epoch": 0.5646177667350833, - "grad_norm": 0.4304330078698439, - "learning_rate": 1.8046795398557192e-05, - "loss": 0.1897, + "epoch": 0.5653914109544586, + "grad_norm": 0.3771274201963948, + "learning_rate": 9.471443548991557e-06, + "loss": 0.4009, "step": 1960 }, { - "epoch": 0.5660581181808362, - "grad_norm": 0.37064207532734933, - "learning_rate": 1.8031841265696886e-05, - "loss": 0.1912, + "epoch": 0.5668337359824036, + "grad_norm": 0.3832741322922619, + "learning_rate": 9.421158684846669e-06, + "loss": 0.3926, "step": 1965 }, { - "epoch": 0.5674984696265889, - "grad_norm": 0.40700194898232867, - "learning_rate": 1.8016836350622707e-05, - "loss": 0.1891, + "epoch": 0.5682760610103487, + "grad_norm": 0.41676932794244004, + "learning_rate": 9.370888502106414e-06, + "loss": 0.4194, "step": 1970 }, { - "epoch": 0.5689388210723416, - "grad_norm": 0.3565593238088361, - "learning_rate": 1.8001780748204907e-05, - "loss": 0.1794, + "epoch": 0.5697183860382937, + "grad_norm": 0.4465176481054024, + "learning_rate": 9.320634275795342e-06, + "loss": 0.3885, "step": 1975 }, { - "epoch": 0.5703791725180944, - "grad_norm": 0.3436335567049151, - "learning_rate": 1.7986674553634213e-05, - "loss": 0.1726, + "epoch": 0.5711607110662388, + "grad_norm": 0.41454265589275485, + "learning_rate": 9.270397280533311e-06, + "loss": 0.4041, "step": 1980 }, { - "epoch": 0.5718195239638472, - "grad_norm": 0.3546880736906482, - "learning_rate": 1.7971517862421227e-05, - "loss": 0.1723, + "epoch": 0.5726030360941838, + "grad_norm": 0.37529076026198815, + "learning_rate": 9.220178790503125e-06, + "loss": 0.3784, "step": 1985 }, { - "epoch": 0.5732598754096, - "grad_norm": 0.37327906152208173, - "learning_rate": 1.795631077039583e-05, - "loss": 0.1902, + "epoch": 0.5740453611221289, + "grad_norm": 0.4006407856625201, + "learning_rate": 9.169980079418248e-06, + "loss": 0.3742, "step": 1990 }, { - "epoch": 0.5747002268553527, - "grad_norm": 0.4331333954155759, - "learning_rate": 1.794105337370655e-05, - "loss": 0.1779, + "epoch": 0.575487686150074, + "grad_norm": 0.4075785016746068, + "learning_rate": 9.119802420490473e-06, + "loss": 0.4184, "step": 1995 }, { - "epoch": 0.5761405783011054, - "grad_norm": 0.3401961795693695, - "learning_rate": 1.7925745768819995e-05, - "loss": 0.1766, + "epoch": 0.576930011178019, + "grad_norm": 0.3892341916180056, + "learning_rate": 9.06964708639763e-06, + "loss": 0.3865, "step": 2000 }, { - "epoch": 0.5761405783011054, - "eval_loss": 0.1823691725730896, - "eval_runtime": 179.75, - "eval_samples_per_second": 10.036, - "eval_steps_per_second": 2.509, + "epoch": 0.576930011178019, + "eval_loss": 0.3948507606983185, + "eval_runtime": 142.1685, + "eval_samples_per_second": 12.668, + "eval_steps_per_second": 3.172, "step": 2000 }, { - "epoch": 0.5775809297468583, - "grad_norm": 0.34051110754364844, - "learning_rate": 1.7910388052520198e-05, - "loss": 0.1699, + "epoch": 0.5783723362059641, + "grad_norm": 0.4476758638692534, + "learning_rate": 9.019515349251337e-06, + "loss": 0.4076, "step": 2005 }, { - "epoch": 0.579021281192611, - "grad_norm": 0.3904395823768888, - "learning_rate": 1.7894980321908037e-05, - "loss": 0.1887, + "epoch": 0.579814661233909, + "grad_norm": 0.38084358148704506, + "learning_rate": 8.969408480564684e-06, + "loss": 0.3951, "step": 2010 }, { - "epoch": 0.5804616326383638, - "grad_norm": 0.35479628226475546, - "learning_rate": 1.7879522674400616e-05, - "loss": 0.1729, + "epoch": 0.5812569862618541, + "grad_norm": 0.3946160859854508, + "learning_rate": 8.919327751220038e-06, + "loss": 0.3737, "step": 2015 }, { - "epoch": 0.5819019840841165, - "grad_norm": 0.36745109167309065, - "learning_rate": 1.786401520773063e-05, - "loss": 0.1823, + "epoch": 0.5826993112897991, + "grad_norm": 0.4376591903476801, + "learning_rate": 8.86927443143676e-06, + "loss": 0.3993, "step": 2020 }, { - "epoch": 0.5833423355298692, - "grad_norm": 0.34440430468545136, - "learning_rate": 1.7848458019945778e-05, - "loss": 0.1806, + "epoch": 0.5841416363177442, + "grad_norm": 0.4220093736158996, + "learning_rate": 8.819249790739033e-06, + "loss": 0.3896, "step": 2025 }, { - "epoch": 0.5847826869756221, - "grad_norm": 0.3436234879041042, - "learning_rate": 1.7832851209408116e-05, - "loss": 0.1711, + "epoch": 0.5855839613456892, + "grad_norm": 0.37781362600911217, + "learning_rate": 8.769255097923617e-06, + "loss": 0.358, "step": 2030 }, { - "epoch": 0.5862230384213748, - "grad_norm": 0.44158762702669296, - "learning_rate": 1.7817194874793446e-05, - "loss": 0.1983, + "epoch": 0.5870262863736343, + "grad_norm": 0.37752543573320735, + "learning_rate": 8.719291621027703e-06, + "loss": 0.4016, "step": 2035 }, { - "epoch": 0.5876633898671276, - "grad_norm": 0.3547851651042487, - "learning_rate": 1.780148911509069e-05, - "loss": 0.1761, + "epoch": 0.5884686114015794, + "grad_norm": 0.4195162100656966, + "learning_rate": 8.669360627296745e-06, + "loss": 0.3755, "step": 2040 }, { - "epoch": 0.5891037413128803, - "grad_norm": 0.3802065495740333, - "learning_rate": 1.7785734029601275e-05, - "loss": 0.1781, + "epoch": 0.5899109364295244, + "grad_norm": 0.40866907101120203, + "learning_rate": 8.619463383152296e-06, + "loss": 0.3964, "step": 2045 }, { - "epoch": 0.5905440927586331, - "grad_norm": 0.3402917536980965, - "learning_rate": 1.7769929717938485e-05, - "loss": 0.1732, + "epoch": 0.5913532614574695, + "grad_norm": 0.4194072279329464, + "learning_rate": 8.56960115415992e-06, + "loss": 0.3853, "step": 2050 }, { - "epoch": 0.5919844442043859, - "grad_norm": 0.33348260099352367, - "learning_rate": 1.775407628002685e-05, - "loss": 0.1845, + "epoch": 0.5927955864854145, + "grad_norm": 0.503872591140977, + "learning_rate": 8.519775204997063e-06, + "loss": 0.4161, "step": 2055 }, { - "epoch": 0.5934247956501386, - "grad_norm": 0.3559404326820407, - "learning_rate": 1.77381738161015e-05, - "loss": 0.1809, + "epoch": 0.5942379115133596, + "grad_norm": 0.4656959686074043, + "learning_rate": 8.469986799420993e-06, + "loss": 0.4207, "step": 2060 }, { - "epoch": 0.5948651470958914, - "grad_norm": 0.3493274389579037, - "learning_rate": 1.772222242670754e-05, - "loss": 0.1975, + "epoch": 0.5956802365413045, + "grad_norm": 0.4068362162842934, + "learning_rate": 8.420237200236753e-06, + "loss": 0.3717, "step": 2065 }, { - "epoch": 0.5963054985416442, - "grad_norm": 0.35347058284663757, - "learning_rate": 1.7706222212699413e-05, - "loss": 0.1845, + "epoch": 0.5971225615692496, + "grad_norm": 0.4469993385978865, + "learning_rate": 8.370527669265114e-06, + "loss": 0.4039, "step": 2070 }, { - "epoch": 0.5977458499873969, - "grad_norm": 0.419314654327899, - "learning_rate": 1.7690173275240258e-05, - "loss": 0.1835, + "epoch": 0.5985648865971946, + "grad_norm": 0.43643202324029334, + "learning_rate": 8.320859467310582e-06, + "loss": 0.3749, "step": 2075 }, { - "epoch": 0.5991862014331497, - "grad_norm": 0.3304240143474617, - "learning_rate": 1.767407571580128e-05, - "loss": 0.1779, + "epoch": 0.6000072116251397, + "grad_norm": 0.5297689595825736, + "learning_rate": 8.271233854129413e-06, + "loss": 0.376, "step": 2080 }, { - "epoch": 0.6006265528789024, - "grad_norm": 0.3697054200801788, - "learning_rate": 1.765792963616109e-05, - "loss": 0.1903, + "epoch": 0.6014495366530848, + "grad_norm": 0.489056954944045, + "learning_rate": 8.221652088397675e-06, + "loss": 0.3933, "step": 2085 }, { - "epoch": 0.6020669043246553, - "grad_norm": 0.3736233059239093, - "learning_rate": 1.764173513840509e-05, - "loss": 0.1927, + "epoch": 0.6028918616810298, + "grad_norm": 0.37378771704976776, + "learning_rate": 8.172115427679304e-06, + "loss": 0.3945, "step": 2090 }, { - "epoch": 0.603507255770408, - "grad_norm": 0.33836118548178384, - "learning_rate": 1.7625492324924794e-05, - "loss": 0.1934, + "epoch": 0.6043341867089749, + "grad_norm": 0.4235226777306445, + "learning_rate": 8.122625128394216e-06, + "loss": 0.3826, "step": 2095 }, { - "epoch": 0.6049476072161607, - "grad_norm": 0.3385963747126986, - "learning_rate": 1.7609201298417205e-05, - "loss": 0.1819, + "epoch": 0.6057765117369199, + "grad_norm": 0.4021066843708137, + "learning_rate": 8.073182445786455e-06, + "loss": 0.3642, "step": 2100 }, { - "epoch": 0.6063879586619135, - "grad_norm": 0.339416908126632, - "learning_rate": 1.7592862161884166e-05, - "loss": 0.1798, + "epoch": 0.607218836764865, + "grad_norm": 0.3735730097404964, + "learning_rate": 8.023788633892334e-06, + "loss": 0.3725, "step": 2105 }, { - "epoch": 0.6078283101076662, - "grad_norm": 0.36861625743367654, - "learning_rate": 1.7576475018631684e-05, - "loss": 0.1779, + "epoch": 0.60866116179281, + "grad_norm": 0.42115686535849983, + "learning_rate": 7.974444945508637e-06, + "loss": 0.3876, "step": 2110 }, { - "epoch": 0.6092686615534191, - "grad_norm": 0.3704231845054649, - "learning_rate": 1.756003997226931e-05, - "loss": 0.1875, + "epoch": 0.6101034868207551, + "grad_norm": 0.42268328106794184, + "learning_rate": 7.925152632160841e-06, + "loss": 0.4042, "step": 2115 }, { - "epoch": 0.6107090129991718, - "grad_norm": 0.38079919821116043, - "learning_rate": 1.754355712670946e-05, - "loss": 0.1803, + "epoch": 0.6115458118487, + "grad_norm": 0.4303350707681742, + "learning_rate": 7.875912944071386e-06, + "loss": 0.3718, "step": 2120 }, { - "epoch": 0.6121493644449245, - "grad_norm": 0.3859155909423547, - "learning_rate": 1.7527026586166767e-05, - "loss": 0.1885, + "epoch": 0.6129881368766451, + "grad_norm": 0.41179372110756424, + "learning_rate": 7.826727130127942e-06, + "loss": 0.3844, "step": 2125 }, { - "epoch": 0.6135897158906773, - "grad_norm": 0.4008091703421957, - "learning_rate": 1.7510448455157415e-05, - "loss": 0.1849, + "epoch": 0.6144304619045902, + "grad_norm": 0.3763060638976918, + "learning_rate": 7.77759643785175e-06, + "loss": 0.378, "step": 2130 }, { - "epoch": 0.6150300673364301, - "grad_norm": 0.39230107396255565, - "learning_rate": 1.7493822838498496e-05, - "loss": 0.1888, + "epoch": 0.6158727869325352, + "grad_norm": 0.40647467863126857, + "learning_rate": 7.72852211336598e-06, + "loss": 0.3633, "step": 2135 }, { - "epoch": 0.6164704187821829, - "grad_norm": 0.3603073643543367, - "learning_rate": 1.747714984130733e-05, - "loss": 0.1715, + "epoch": 0.6173151119604803, + "grad_norm": 0.4427513530880047, + "learning_rate": 7.679505401364116e-06, + "loss": 0.3728, "step": 2140 }, { - "epoch": 0.6179107702279356, - "grad_norm": 0.39800892388698866, - "learning_rate": 1.74604295690008e-05, - "loss": 0.1685, + "epoch": 0.6187574369884253, + "grad_norm": 0.40218277177425543, + "learning_rate": 7.630547545078398e-06, + "loss": 0.3936, "step": 2145 }, { - "epoch": 0.6193511216736883, - "grad_norm": 0.32131963215807496, - "learning_rate": 1.7443662127294696e-05, - "loss": 0.1745, + "epoch": 0.6201997620163704, + "grad_norm": 0.40266373448906506, + "learning_rate": 7.581649786248276e-06, + "loss": 0.3956, "step": 2150 }, { - "epoch": 0.6207914731194412, - "grad_norm": 0.36459623655858153, - "learning_rate": 1.7426847622203043e-05, - "loss": 0.1706, + "epoch": 0.6216420870443155, + "grad_norm": 0.4101360200980578, + "learning_rate": 7.532813365088921e-06, + "loss": 0.3935, "step": 2155 }, { - "epoch": 0.6222318245651939, - "grad_norm": 0.38227127635284325, - "learning_rate": 1.7409986160037432e-05, - "loss": 0.1736, + "epoch": 0.6230844120722605, + "grad_norm": 0.4360450388421823, + "learning_rate": 7.484039520259781e-06, + "loss": 0.393, "step": 2160 }, { - "epoch": 0.6236721760109467, - "grad_norm": 0.33802946607680817, - "learning_rate": 1.7393077847406338e-05, - "loss": 0.1796, + "epoch": 0.6245267371002056, + "grad_norm": 0.3984091507351705, + "learning_rate": 7.435329488833137e-06, + "loss": 0.3857, "step": 2165 }, { - "epoch": 0.6251125274566994, - "grad_norm": 0.34668747024243696, - "learning_rate": 1.7376122791214457e-05, - "loss": 0.1727, + "epoch": 0.6259690621281506, + "grad_norm": 0.4057039326760462, + "learning_rate": 7.38668450626276e-06, + "loss": 0.4013, "step": 2170 }, { - "epoch": 0.6265528789024521, - "grad_norm": 0.35080021260147937, - "learning_rate": 1.7359121098662027e-05, - "loss": 0.1719, + "epoch": 0.6274113871560957, + "grad_norm": 0.39301356289008293, + "learning_rate": 7.338105806352542e-06, + "loss": 0.3613, "step": 2175 }, { - "epoch": 0.627993230348205, - "grad_norm": 0.3375939993470441, - "learning_rate": 1.734207287724415e-05, - "loss": 0.1686, + "epoch": 0.6288537121840406, + "grad_norm": 0.4031222004525292, + "learning_rate": 7.289594621225236e-06, + "loss": 0.3775, "step": 2180 }, { - "epoch": 0.6294335817939577, - "grad_norm": 0.3721850489546093, - "learning_rate": 1.732497823475011e-05, - "loss": 0.175, + "epoch": 0.6302960372119857, + "grad_norm": 0.42389618462152223, + "learning_rate": 7.241152181291173e-06, + "loss": 0.3842, "step": 2185 }, { - "epoch": 0.6308739332397105, - "grad_norm": 0.34307632301934204, - "learning_rate": 1.7307837279262692e-05, - "loss": 0.1665, + "epoch": 0.6317383622399307, + "grad_norm": 0.4222447939654566, + "learning_rate": 7.192779715217075e-06, + "loss": 0.3747, "step": 2190 }, { - "epoch": 0.6323142846854632, - "grad_norm": 0.37051543804913684, - "learning_rate": 1.7290650119157505e-05, - "loss": 0.1716, + "epoch": 0.6331806872678758, + "grad_norm": 0.3616433078805121, + "learning_rate": 7.144478449894894e-06, + "loss": 0.3619, "step": 2195 }, { - "epoch": 0.633754636131216, - "grad_norm": 0.34975756767684907, - "learning_rate": 1.7273416863102287e-05, - "loss": 0.1817, + "epoch": 0.6346230122958209, + "grad_norm": 0.40315108612725287, + "learning_rate": 7.096249610410671e-06, + "loss": 0.383, "step": 2200 }, { - "epoch": 0.6351949875769688, - "grad_norm": 0.3749976162171925, - "learning_rate": 1.725613762005623e-05, - "loss": 0.1843, + "epoch": 0.6360653373237659, + "grad_norm": 0.39550949033278987, + "learning_rate": 7.0480944200134975e-06, + "loss": 0.3993, "step": 2205 }, { - "epoch": 0.6366353390227215, - "grad_norm": 0.36815036243031424, - "learning_rate": 1.7238812499269274e-05, - "loss": 0.1694, + "epoch": 0.637507662351711, + "grad_norm": 0.4061605042450912, + "learning_rate": 7.00001410008445e-06, + "loss": 0.3667, "step": 2210 }, { - "epoch": 0.6380756904684743, - "grad_norm": 0.3117804955490591, - "learning_rate": 1.7221441610281434e-05, - "loss": 0.1708, + "epoch": 0.638949987379656, + "grad_norm": 0.399669288075527, + "learning_rate": 6.952009870105654e-06, + "loss": 0.387, "step": 2215 }, { - "epoch": 0.6395160419142271, - "grad_norm": 0.3604043748062189, - "learning_rate": 1.720402506292209e-05, - "loss": 0.1761, + "epoch": 0.6403923124076011, + "grad_norm": 0.4188823149502449, + "learning_rate": 6.904082947629317e-06, + "loss": 0.3814, "step": 2220 }, { - "epoch": 0.6409563933599798, - "grad_norm": 0.3298130218644663, - "learning_rate": 1.718656296730932e-05, - "loss": 0.179, + "epoch": 0.6418346374355461, + "grad_norm": 0.3729926900968089, + "learning_rate": 6.856234548246866e-06, + "loss": 0.3647, "step": 2225 }, { - "epoch": 0.6423967448057326, - "grad_norm": 0.3826862512207884, - "learning_rate": 1.7169055433849166e-05, - "loss": 0.1712, + "epoch": 0.6432769624634912, + "grad_norm": 0.3995200969127714, + "learning_rate": 6.808465885558122e-06, + "loss": 0.3778, "step": 2230 }, { - "epoch": 0.6438370962514853, - "grad_norm": 0.3542349813736183, - "learning_rate": 1.7151502573234967e-05, - "loss": 0.1761, + "epoch": 0.6447192874914361, + "grad_norm": 0.4182365028017815, + "learning_rate": 6.760778171140492e-06, + "loss": 0.4071, "step": 2235 }, { - "epoch": 0.6452774476972382, - "grad_norm": 0.34843337457451795, - "learning_rate": 1.7133904496446647e-05, - "loss": 0.1704, + "epoch": 0.6461616125193812, + "grad_norm": 0.419641094415173, + "learning_rate": 6.713172614518278e-06, + "loss": 0.3838, "step": 2240 }, { - "epoch": 0.6467177991429909, - "grad_norm": 0.3820580792053532, - "learning_rate": 1.711626131475001e-05, - "loss": 0.1781, + "epoch": 0.6476039375473263, + "grad_norm": 0.455639932664125, + "learning_rate": 6.665650423131953e-06, + "loss": 0.3864, "step": 2245 }, { - "epoch": 0.6481581505887437, - "grad_norm": 0.3626263301136655, - "learning_rate": 1.709857313969605e-05, - "loss": 0.1818, + "epoch": 0.6490462625752713, + "grad_norm": 0.42278667120966895, + "learning_rate": 6.618212802307589e-06, + "loss": 0.396, "step": 2250 }, { - "epoch": 0.6495985020344964, - "grad_norm": 0.3602584360955885, - "learning_rate": 1.708084008312022e-05, - "loss": 0.1887, + "epoch": 0.6504885876032164, + "grad_norm": 0.44585454789944867, + "learning_rate": 6.570860955226234e-06, + "loss": 0.3811, "step": 2255 }, { - "epoch": 0.6510388534802491, - "grad_norm": 0.34881350230579233, - "learning_rate": 1.7063062257141766e-05, - "loss": 0.1827, + "epoch": 0.6519309126311614, + "grad_norm": 0.3966025625438823, + "learning_rate": 6.5235960828934305e-06, + "loss": 0.3732, "step": 2260 }, { - "epoch": 0.652479204926002, - "grad_norm": 0.31830926244321406, - "learning_rate": 1.704523977416296e-05, - "loss": 0.1843, + "epoch": 0.6533732376591065, + "grad_norm": 0.40489868259557904, + "learning_rate": 6.476419384108745e-06, + "loss": 0.3567, "step": 2265 }, { - "epoch": 0.6539195563717547, - "grad_norm": 0.37964043159017924, - "learning_rate": 1.702737274686846e-05, - "loss": 0.1683, + "epoch": 0.6548155626870515, + "grad_norm": 0.39366736678335024, + "learning_rate": 6.429332055435349e-06, + "loss": 0.3623, "step": 2270 }, { - "epoch": 0.6553599078175075, - "grad_norm": 0.32018925541820387, - "learning_rate": 1.7009461288224533e-05, - "loss": 0.1757, + "epoch": 0.6562578877149966, + "grad_norm": 0.42529750592620424, + "learning_rate": 6.382335291169698e-06, + "loss": 0.3676, "step": 2275 }, { - "epoch": 0.6568002592632602, - "grad_norm": 0.3177150747359183, - "learning_rate": 1.699150551147838e-05, - "loss": 0.1788, + "epoch": 0.6577002127429417, + "grad_norm": 0.44036040562921713, + "learning_rate": 6.335430283311206e-06, + "loss": 0.3889, "step": 2280 }, { - "epoch": 0.658240610709013, - "grad_norm": 0.34466027988896136, - "learning_rate": 1.697350553015741e-05, - "loss": 0.1748, + "epoch": 0.6591425377708867, + "grad_norm": 0.3787593063841428, + "learning_rate": 6.288618221532031e-06, + "loss": 0.386, "step": 2285 }, { - "epoch": 0.6596809621547658, - "grad_norm": 0.34944690536487255, - "learning_rate": 1.6955461458068507e-05, - "loss": 0.1772, + "epoch": 0.6605848627988317, + "grad_norm": 0.4169592811397764, + "learning_rate": 6.241900293146915e-06, + "loss": 0.3752, "step": 2290 }, { - "epoch": 0.6611213136005185, - "grad_norm": 0.3634337752114522, - "learning_rate": 1.6937373409297336e-05, - "loss": 0.1697, + "epoch": 0.6620271878267767, + "grad_norm": 0.4047539500558757, + "learning_rate": 6.195277683083033e-06, + "loss": 0.3658, "step": 2295 }, { - "epoch": 0.6625616650462713, - "grad_norm": 0.3479574597357066, - "learning_rate": 1.6919241498207613e-05, - "loss": 0.1758, + "epoch": 0.6634695128547218, + "grad_norm": 0.3845249122797127, + "learning_rate": 6.148751573849976e-06, + "loss": 0.3563, "step": 2300 }, { - "epoch": 0.6640020164920241, - "grad_norm": 0.3337542016890648, - "learning_rate": 1.6901065839440365e-05, - "loss": 0.1678, + "epoch": 0.6649118378826668, + "grad_norm": 0.4633041975142693, + "learning_rate": 6.102323145509732e-06, + "loss": 0.3852, "step": 2305 }, { - "epoch": 0.6654423679377768, - "grad_norm": 0.342411033930506, - "learning_rate": 1.688284654791323e-05, - "loss": 0.1661, + "epoch": 0.6663541629106119, + "grad_norm": 0.3985148240515743, + "learning_rate": 6.055993575646775e-06, + "loss": 0.3915, "step": 2310 }, { - "epoch": 0.6668827193835296, - "grad_norm": 0.3536792365079188, - "learning_rate": 1.6864583738819712e-05, - "loss": 0.1845, + "epoch": 0.667796487938557, + "grad_norm": 0.40716397694215495, + "learning_rate": 6.00976403933818e-06, + "loss": 0.3605, "step": 2315 }, { - "epoch": 0.6683230708292823, - "grad_norm": 0.35537832797085916, - "learning_rate": 1.6846277527628463e-05, - "loss": 0.167, + "epoch": 0.669238812966502, + "grad_norm": 0.38795576025941675, + "learning_rate": 5.963635709123825e-06, + "loss": 0.37, "step": 2320 }, { - "epoch": 0.6697634222750352, - "grad_norm": 0.3828398429805593, - "learning_rate": 1.6827928030082546e-05, - "loss": 0.1815, + "epoch": 0.6706811379944471, + "grad_norm": 0.4110632294347015, + "learning_rate": 5.91760975497667e-06, + "loss": 0.3853, "step": 2325 }, { - "epoch": 0.6712037737207879, - "grad_norm": 0.33156685434051136, - "learning_rate": 1.6809535362198713e-05, - "loss": 0.1875, + "epoch": 0.6721234630223921, + "grad_norm": 0.3969166036791085, + "learning_rate": 5.871687344273045e-06, + "loss": 0.3672, "step": 2330 }, { - "epoch": 0.6726441251665406, - "grad_norm": 0.30886048275734546, - "learning_rate": 1.679109964026666e-05, - "loss": 0.1741, + "epoch": 0.6735657880503372, + "grad_norm": 0.41207993758304634, + "learning_rate": 5.8258696417630825e-06, + "loss": 0.3547, "step": 2335 }, { - "epoch": 0.6740844766122934, - "grad_norm": 0.38213092526719655, - "learning_rate": 1.67726209808483e-05, - "loss": 0.1657, + "epoch": 0.6750081130782822, + "grad_norm": 0.3680867654775724, + "learning_rate": 5.780157809541134e-06, + "loss": 0.3625, "step": 2340 }, { - "epoch": 0.6755248280580461, - "grad_norm": 0.34931015409841865, - "learning_rate": 1.6754099500777025e-05, - "loss": 0.1703, + "epoch": 0.6764504381062273, + "grad_norm": 0.4267438085961488, + "learning_rate": 5.734553007016345e-06, + "loss": 0.3999, "step": 2345 }, { - "epoch": 0.676965179503799, - "grad_norm": 0.4311198591898118, - "learning_rate": 1.6735535317156957e-05, - "loss": 0.1696, + "epoch": 0.6778927631341722, + "grad_norm": 0.3986326036374569, + "learning_rate": 5.68905639088319e-06, + "loss": 0.3303, "step": 2350 }, { - "epoch": 0.6784055309495517, - "grad_norm": 0.32753418004216717, - "learning_rate": 1.671692854736222e-05, - "loss": 0.1727, + "epoch": 0.6793350881621173, + "grad_norm": 0.42614206231420926, + "learning_rate": 5.643669115092183e-06, + "loss": 0.3589, "step": 2355 }, { - "epoch": 0.6798458823953044, - "grad_norm": 0.3205363210915123, - "learning_rate": 1.66982793090362e-05, - "loss": 0.1704, + "epoch": 0.6807774131900624, + "grad_norm": 0.3776847045804154, + "learning_rate": 5.598392330820586e-06, + "loss": 0.3609, "step": 2360 }, { - "epoch": 0.6812862338410572, - "grad_norm": 0.3228782244582717, - "learning_rate": 1.6679587720090792e-05, - "loss": 0.1791, + "epoch": 0.6822197382180074, + "grad_norm": 0.41271036973705766, + "learning_rate": 5.553227186443215e-06, + "loss": 0.3615, "step": 2365 }, { - "epoch": 0.68272658528681, - "grad_norm": 0.38827253301199693, - "learning_rate": 1.666085389870565e-05, - "loss": 0.1756, + "epoch": 0.6836620632459525, + "grad_norm": 0.38781546784387094, + "learning_rate": 5.508174827503328e-06, + "loss": 0.3433, "step": 2370 }, { - "epoch": 0.6841669367325628, - "grad_norm": 0.361153122747486, - "learning_rate": 1.664207796332746e-05, - "loss": 0.1694, + "epoch": 0.6851043882738975, + "grad_norm": 0.39550012764434234, + "learning_rate": 5.46323639668353e-06, + "loss": 0.3691, "step": 2375 }, { - "epoch": 0.6856072881783155, - "grad_norm": 0.38186177109604, - "learning_rate": 1.662326003266916e-05, - "loss": 0.1865, + "epoch": 0.6865467133018426, + "grad_norm": 0.4203725670836375, + "learning_rate": 5.4184130337768485e-06, + "loss": 0.3882, "step": 2380 }, { - "epoch": 0.6870476396240682, - "grad_norm": 0.3230350966256622, - "learning_rate": 1.660440022570923e-05, - "loss": 0.1697, + "epoch": 0.6879890383297876, + "grad_norm": 0.41719368579398214, + "learning_rate": 5.373705875657766e-06, + "loss": 0.3678, "step": 2385 }, { - "epoch": 0.6884879910698211, - "grad_norm": 0.3281847066113813, - "learning_rate": 1.6585498661690897e-05, - "loss": 0.1816, + "epoch": 0.6894313633577327, + "grad_norm": 0.408418654280754, + "learning_rate": 5.329116056253429e-06, + "loss": 0.3788, "step": 2390 }, { - "epoch": 0.6899283425155738, - "grad_norm": 0.37849511355850496, - "learning_rate": 1.6566555460121424e-05, - "loss": 0.172, + "epoch": 0.6908736883856778, + "grad_norm": 0.4432414502444195, + "learning_rate": 5.284644706514868e-06, + "loss": 0.3733, "step": 2395 }, { - "epoch": 0.6913686939613266, - "grad_norm": 0.4496749879940468, - "learning_rate": 1.654757074077131e-05, - "loss": 0.1776, + "epoch": 0.6923160134136228, + "grad_norm": 0.43523682450545426, + "learning_rate": 5.240292954388306e-06, + "loss": 0.3716, "step": 2400 }, { - "epoch": 0.6928090454070793, - "grad_norm": 0.3276397468021885, - "learning_rate": 1.6528544623673567e-05, - "loss": 0.1641, + "epoch": 0.6937583384415678, + "grad_norm": 0.4389694994462393, + "learning_rate": 5.1960619247865815e-06, + "loss": 0.3655, "step": 2405 }, { - "epoch": 0.694249396852832, - "grad_norm": 0.3540543579094072, - "learning_rate": 1.650947722912295e-05, - "loss": 0.172, + "epoch": 0.6952006634695128, + "grad_norm": 0.3932614135155125, + "learning_rate": 5.15195273956057e-06, + "loss": 0.3971, "step": 2410 }, { - "epoch": 0.6956897482985849, - "grad_norm": 0.3282349985410633, - "learning_rate": 1.6490368677675187e-05, - "loss": 0.1727, + "epoch": 0.6966429884974579, + "grad_norm": 0.38979362609767165, + "learning_rate": 5.107966517470771e-06, + "loss": 0.3724, "step": 2415 }, { - "epoch": 0.6971300997443376, - "grad_norm": 0.33846062010566547, - "learning_rate": 1.647121909014623e-05, - "loss": 0.1827, + "epoch": 0.6980853135254029, + "grad_norm": 0.4209080852390916, + "learning_rate": 5.064104374158909e-06, + "loss": 0.3911, "step": 2420 }, { - "epoch": 0.6985704511900904, - "grad_norm": 0.36878805858295277, - "learning_rate": 1.645202858761149e-05, - "loss": 0.1678, + "epoch": 0.699527638553348, + "grad_norm": 0.45055904805315533, + "learning_rate": 5.0203674221196485e-06, + "loss": 0.3633, "step": 2425 }, { - "epoch": 0.7000108026358431, - "grad_norm": 0.3241659934094576, - "learning_rate": 1.6432797291405055e-05, - "loss": 0.1829, + "epoch": 0.700969963581293, + "grad_norm": 0.3868393099197903, + "learning_rate": 4.9767567706723706e-06, + "loss": 0.3515, "step": 2430 }, { - "epoch": 0.7014511540815959, - "grad_norm": 0.3323977392588174, - "learning_rate": 1.6413525323118956e-05, - "loss": 0.1866, + "epoch": 0.7024122886092381, + "grad_norm": 0.41826804531316264, + "learning_rate": 4.933273525933041e-06, + "loss": 0.3519, "step": 2435 }, { - "epoch": 0.7028915055273487, - "grad_norm": 0.38740911056410215, - "learning_rate": 1.6394212804602356e-05, - "loss": 0.1859, + "epoch": 0.7038546136371832, + "grad_norm": 0.45957339946847975, + "learning_rate": 4.889918790786153e-06, + "loss": 0.3807, "step": 2440 }, { - "epoch": 0.7043318569731014, - "grad_norm": 0.35020569775025157, - "learning_rate": 1.6374859857960813e-05, - "loss": 0.1713, + "epoch": 0.7052969386651282, + "grad_norm": 0.4540538141436769, + "learning_rate": 4.846693664856754e-06, + "loss": 0.3465, "step": 2445 }, { - "epoch": 0.7057722084188542, - "grad_norm": 0.32892434146707544, - "learning_rate": 1.6355466605555502e-05, - "loss": 0.1785, + "epoch": 0.7067392636930733, + "grad_norm": 0.47813500195150954, + "learning_rate": 4.803599244482558e-06, + "loss": 0.376, "step": 2450 }, { - "epoch": 0.707212559864607, - "grad_norm": 0.35551264606759814, - "learning_rate": 1.633603317000242e-05, - "loss": 0.1749, + "epoch": 0.7081815887210183, + "grad_norm": 0.3925519413949624, + "learning_rate": 4.760636622686136e-06, + "loss": 0.3404, "step": 2455 }, { - "epoch": 0.7086529113103597, - "grad_norm": 0.4393019056382748, - "learning_rate": 1.6316559674171636e-05, - "loss": 0.1758, + "epoch": 0.7096239137489633, + "grad_norm": 0.4289528139780234, + "learning_rate": 4.717806889147196e-06, + "loss": 0.3627, "step": 2460 }, { - "epoch": 0.7100932627561125, - "grad_norm": 0.4685944922688227, - "learning_rate": 1.629704624118651e-05, - "loss": 0.1744, + "epoch": 0.7110662387769083, + "grad_norm": 0.41215198190870284, + "learning_rate": 4.675111130174939e-06, + "loss": 0.3716, "step": 2465 }, { - "epoch": 0.7115336142018652, - "grad_norm": 0.35581074458621115, - "learning_rate": 1.6277492994422893e-05, - "loss": 0.1756, + "epoch": 0.7125085638048534, + "grad_norm": 0.4403007485651443, + "learning_rate": 4.632550428680515e-06, + "loss": 0.3765, "step": 2470 }, { - "epoch": 0.7129739656476181, - "grad_norm": 0.33856885661174746, - "learning_rate": 1.625790005750838e-05, - "loss": 0.1705, + "epoch": 0.7139508888327984, + "grad_norm": 0.4311724864201015, + "learning_rate": 4.590125864149551e-06, + "loss": 0.3743, "step": 2475 }, { - "epoch": 0.7144143170933708, - "grad_norm": 0.3856019975908658, - "learning_rate": 1.62382675543215e-05, - "loss": 0.176, + "epoch": 0.7153932138607435, + "grad_norm": 0.46098384046435353, + "learning_rate": 4.547838512614773e-06, + "loss": 0.3505, "step": 2480 }, { - "epoch": 0.7158546685391235, - "grad_norm": 0.3832177698765323, - "learning_rate": 1.621859560899095e-05, - "loss": 0.1824, + "epoch": 0.7168355388886886, + "grad_norm": 0.40338840945222365, + "learning_rate": 4.505689446628712e-06, + "loss": 0.3691, "step": 2485 }, { - "epoch": 0.7172950199848763, - "grad_norm": 0.3405022015623911, - "learning_rate": 1.6198884345894803e-05, - "loss": 0.1879, + "epoch": 0.7182778639166336, + "grad_norm": 0.40824551867501546, + "learning_rate": 4.4636797352365035e-06, + "loss": 0.3585, "step": 2490 }, { - "epoch": 0.718735371430629, - "grad_norm": 0.34846873026652925, - "learning_rate": 1.6179133889659714e-05, - "loss": 0.1731, + "epoch": 0.7197201889445787, + "grad_norm": 0.4297027171998161, + "learning_rate": 4.421810443948774e-06, + "loss": 0.3705, "step": 2495 }, { - "epoch": 0.7201757228763819, - "grad_norm": 0.391933717466078, - "learning_rate": 1.6159344365160162e-05, - "loss": 0.1773, + "epoch": 0.7211625139725237, + "grad_norm": 0.40341531049143703, + "learning_rate": 4.38008263471461e-06, + "loss": 0.3815, "step": 2500 }, { - "epoch": 0.7201757228763819, - "eval_loss": 0.17813709378242493, - "eval_runtime": 178.3665, - "eval_samples_per_second": 10.114, - "eval_steps_per_second": 2.529, + "epoch": 0.7211625139725237, + "eval_loss": 0.37222930788993835, + "eval_runtime": 142.2441, + "eval_samples_per_second": 12.661, + "eval_steps_per_second": 3.171, "step": 2500 }, { - "epoch": 0.7216160743221346, - "grad_norm": 0.30920679982637206, - "learning_rate": 1.613951589751762e-05, - "loss": 0.1653, + "epoch": 0.7226048390004688, + "grad_norm": 0.4407059294927956, + "learning_rate": 4.338497365894628e-06, + "loss": 0.3661, "step": 2505 }, { - "epoch": 0.7230564257678873, - "grad_norm": 0.37229773310756054, - "learning_rate": 1.6119648612099793e-05, - "loss": 0.1789, + "epoch": 0.7240471640284138, + "grad_norm": 0.43213340820969415, + "learning_rate": 4.297055692234133e-06, + "loss": 0.3548, "step": 2510 }, { - "epoch": 0.7244967772136401, - "grad_norm": 0.369107759253664, - "learning_rate": 1.609974263451981e-05, - "loss": 0.1782, + "epoch": 0.7254894890563589, + "grad_norm": 0.40790860794488015, + "learning_rate": 4.25575866483636e-06, + "loss": 0.3693, "step": 2515 }, { - "epoch": 0.7259371286593929, - "grad_norm": 0.3094009207730617, - "learning_rate": 1.6079798090635442e-05, - "loss": 0.1788, + "epoch": 0.7269318140843039, + "grad_norm": 0.39452605394978774, + "learning_rate": 4.214607331135817e-06, + "loss": 0.3629, "step": 2520 }, { - "epoch": 0.7273774801051457, - "grad_norm": 0.34727782391218126, - "learning_rate": 1.6059815106548294e-05, - "loss": 0.1927, + "epoch": 0.7283741391122489, + "grad_norm": 0.4535519104968178, + "learning_rate": 4.173602734871723e-06, + "loss": 0.3631, "step": 2525 }, { - "epoch": 0.7288178315508984, - "grad_norm": 0.3392015861778497, - "learning_rate": 1.6039793808603014e-05, - "loss": 0.1573, + "epoch": 0.729816464140194, + "grad_norm": 0.4215165521407461, + "learning_rate": 4.132745916061528e-06, + "loss": 0.3623, "step": 2530 }, { - "epoch": 0.7302581829966511, - "grad_norm": 0.3414009389947609, - "learning_rate": 1.60197343233865e-05, - "loss": 0.1668, + "epoch": 0.731258789168139, + "grad_norm": 0.4369337778893739, + "learning_rate": 4.09203791097454e-06, + "loss": 0.3799, "step": 2535 }, { - "epoch": 0.731698534442404, - "grad_norm": 0.35361147490247585, - "learning_rate": 1.5999636777727085e-05, - "loss": 0.1743, + "epoch": 0.7327011141960841, + "grad_norm": 0.4218365082776104, + "learning_rate": 4.051479752105642e-06, + "loss": 0.3281, "step": 2540 }, { - "epoch": 0.7331388858881567, - "grad_norm": 0.3274226857520657, - "learning_rate": 1.5979501298693752e-05, - "loss": 0.1758, + "epoch": 0.7341434392240291, + "grad_norm": 0.39141469492573994, + "learning_rate": 4.01107246814909e-06, + "loss": 0.3779, "step": 2545 }, { - "epoch": 0.7345792373339095, - "grad_norm": 0.3621973295683448, - "learning_rate": 1.595932801359531e-05, - "loss": 0.1837, + "epoch": 0.7355857642519742, + "grad_norm": 0.4361183098017262, + "learning_rate": 3.970817083972451e-06, + "loss": 0.3677, "step": 2550 }, { - "epoch": 0.7360195887796622, - "grad_norm": 0.3606995367356055, - "learning_rate": 1.5939117049979614e-05, - "loss": 0.1791, + "epoch": 0.7370280892799193, + "grad_norm": 0.4212489079522315, + "learning_rate": 3.930714620590582e-06, + "loss": 0.3697, "step": 2555 }, { - "epoch": 0.7374599402254151, - "grad_norm": 0.33373830456128545, - "learning_rate": 1.5918868535632736e-05, - "loss": 0.1802, + "epoch": 0.7384704143078643, + "grad_norm": 0.42629366346781794, + "learning_rate": 3.890766095139744e-06, + "loss": 0.336, "step": 2560 }, { - "epoch": 0.7389002916711678, - "grad_norm": 0.3390372078416868, - "learning_rate": 1.589858259857817e-05, - "loss": 0.187, + "epoch": 0.7399127393358094, + "grad_norm": 0.39167597840940843, + "learning_rate": 3.850972520851804e-06, + "loss": 0.3297, "step": 2565 }, { - "epoch": 0.7403406431169205, - "grad_norm": 0.34412888867250124, - "learning_rate": 1.5878259367076027e-05, - "loss": 0.1911, + "epoch": 0.7413550643637544, + "grad_norm": 0.4233310284348778, + "learning_rate": 3.8113349070285344e-06, + "loss": 0.3613, "step": 2570 }, { - "epoch": 0.7417809945626733, - "grad_norm": 0.33009136167107, - "learning_rate": 1.5857898969622204e-05, - "loss": 0.183, + "epoch": 0.7427973893916994, + "grad_norm": 0.4263022461531563, + "learning_rate": 3.771854259016019e-06, + "loss": 0.3529, "step": 2575 }, { - "epoch": 0.743221346008426, - "grad_norm": 0.36557215152582107, - "learning_rate": 1.5837501534947586e-05, - "loss": 0.1758, + "epoch": 0.7442397144196444, + "grad_norm": 0.3973240159937157, + "learning_rate": 3.7325315781791337e-06, + "loss": 0.3661, "step": 2580 }, { - "epoch": 0.7446616974541789, - "grad_norm": 0.34247088555723115, - "learning_rate": 1.5817067192017234e-05, - "loss": 0.1828, + "epoch": 0.7456820394475895, + "grad_norm": 0.39734045764738396, + "learning_rate": 3.693367861876188e-06, + "loss": 0.3815, "step": 2585 }, { - "epoch": 0.7461020488999316, - "grad_norm": 0.40218176116646415, - "learning_rate": 1.579659607002957e-05, - "loss": 0.187, + "epoch": 0.7471243644755345, + "grad_norm": 0.4473118684590064, + "learning_rate": 3.6543641034335873e-06, + "loss": 0.3488, "step": 2590 }, { - "epoch": 0.7475424003456843, - "grad_norm": 0.34125168730229577, - "learning_rate": 1.5776088298415545e-05, - "loss": 0.183, + "epoch": 0.7485666895034796, + "grad_norm": 0.4071557714101167, + "learning_rate": 3.615521292120663e-06, + "loss": 0.36, "step": 2595 }, { - "epoch": 0.7489827517914371, - "grad_norm": 0.33299022807385764, - "learning_rate": 1.575554400683784e-05, - "loss": 0.1684, + "epoch": 0.7500090145314247, + "grad_norm": 0.4149969887621353, + "learning_rate": 3.5768404131245695e-06, + "loss": 0.3619, "step": 2600 }, { - "epoch": 0.7504231032371899, - "grad_norm": 0.3367346205260822, - "learning_rate": 1.5734963325190026e-05, - "loss": 0.1742, + "epoch": 0.7514513395593697, + "grad_norm": 0.41064754239264667, + "learning_rate": 3.5383224475253043e-06, + "loss": 0.3623, "step": 2605 }, { - "epoch": 0.7518634546829427, - "grad_norm": 0.3757675002933998, - "learning_rate": 1.5714346383595776e-05, - "loss": 0.1792, + "epoch": 0.7528936645873148, + "grad_norm": 0.48731666991216727, + "learning_rate": 3.4999683722708265e-06, + "loss": 0.3824, "step": 2610 }, { - "epoch": 0.7533038061286954, - "grad_norm": 0.36883835440832763, - "learning_rate": 1.5693693312407997e-05, - "loss": 0.1741, + "epoch": 0.7543359896152598, + "grad_norm": 0.42149841198530297, + "learning_rate": 3.4617791601522565e-06, + "loss": 0.3658, "step": 2615 }, { - "epoch": 0.7547441575744481, - "grad_norm": 0.3085265660673038, - "learning_rate": 1.567300424220804e-05, - "loss": 0.1777, + "epoch": 0.7557783146432049, + "grad_norm": 0.3936949177789515, + "learning_rate": 3.423755779779243e-06, + "loss": 0.3308, "step": 2620 }, { - "epoch": 0.756184509020201, - "grad_norm": 0.34234678068991636, - "learning_rate": 1.565227930380487e-05, - "loss": 0.178, + "epoch": 0.7572206396711499, + "grad_norm": 0.43489944362821054, + "learning_rate": 3.3858991955553455e-06, + "loss": 0.3815, "step": 2625 }, { - "epoch": 0.7576248604659537, - "grad_norm": 0.37484403928305354, - "learning_rate": 1.5631518628234217e-05, - "loss": 0.1752, + "epoch": 0.7586629646990949, + "grad_norm": 0.3921717289554429, + "learning_rate": 3.348210367653625e-06, + "loss": 0.3531, "step": 2630 }, { - "epoch": 0.7590652119117065, - "grad_norm": 0.309840359340079, - "learning_rate": 1.5610722346757775e-05, - "loss": 0.1697, + "epoch": 0.76010528972704, + "grad_norm": 0.44238912615157533, + "learning_rate": 3.3106902519922523e-06, + "loss": 0.3696, "step": 2635 }, { - "epoch": 0.7605055633574592, - "grad_norm": 0.3440804282261142, - "learning_rate": 1.558989059086236e-05, - "loss": 0.175, + "epoch": 0.761547614754985, + "grad_norm": 0.4536027992384981, + "learning_rate": 3.27333980021027e-06, + "loss": 0.37, "step": 2640 }, { - "epoch": 0.7619459148032119, - "grad_norm": 0.34552977330495505, - "learning_rate": 1.556902349225907e-05, - "loss": 0.1687, + "epoch": 0.7629899397829301, + "grad_norm": 0.4564191707678332, + "learning_rate": 3.236159959643482e-06, + "loss": 0.3819, "step": 2645 }, { - "epoch": 0.7633862662489648, - "grad_norm": 0.3054063669302173, - "learning_rate": 1.554812118288248e-05, - "loss": 0.1669, + "epoch": 0.7644322648108751, + "grad_norm": 0.5326593840798252, + "learning_rate": 3.1991516733003813e-06, + "loss": 0.3758, "step": 2650 }, { - "epoch": 0.7648266176947175, - "grad_norm": 0.35428566557970353, - "learning_rate": 1.5527183794889765e-05, - "loss": 0.1683, + "epoch": 0.7658745898388202, + "grad_norm": 0.43321441818668444, + "learning_rate": 3.1623158798382813e-06, + "loss": 0.3783, "step": 2655 }, { - "epoch": 0.7662669691404703, - "grad_norm": 0.3569130747650239, - "learning_rate": 1.5506211460659906e-05, - "loss": 0.1729, + "epoch": 0.7673169148667652, + "grad_norm": 0.4454237213343821, + "learning_rate": 3.125653513539456e-06, + "loss": 0.3607, "step": 2660 }, { - "epoch": 0.767707320586223, - "grad_norm": 0.31980284497508643, - "learning_rate": 1.5485204312792824e-05, - "loss": 0.1788, + "epoch": 0.7687592398947103, + "grad_norm": 0.4107211963202732, + "learning_rate": 3.089165504287499e-06, + "loss": 0.3482, "step": 2665 }, { - "epoch": 0.7691476720319758, - "grad_norm": 0.3182129504042558, - "learning_rate": 1.546416248410857e-05, - "loss": 0.1675, + "epoch": 0.7702015649226553, + "grad_norm": 0.3789782102911423, + "learning_rate": 3.052852777543687e-06, + "loss": 0.3543, "step": 2670 }, { - "epoch": 0.7705880234777286, - "grad_norm": 0.3420940121534491, - "learning_rate": 1.544308610764644e-05, - "loss": 0.1679, + "epoch": 0.7716438899506004, + "grad_norm": 0.4079189291227377, + "learning_rate": 3.0167162543235384e-06, + "loss": 0.3276, "step": 2675 }, { - "epoch": 0.7720283749234813, - "grad_norm": 0.35404502346417216, - "learning_rate": 1.542197531666419e-05, - "loss": 0.1723, + "epoch": 0.7730862149785455, + "grad_norm": 0.4472943997084153, + "learning_rate": 2.9807568511734564e-06, + "loss": 0.3825, "step": 2680 }, { - "epoch": 0.7734687263692341, - "grad_norm": 0.33613773212219045, - "learning_rate": 1.5400830244637158e-05, - "loss": 0.1682, + "epoch": 0.7745285400064905, + "grad_norm": 0.430008379042804, + "learning_rate": 2.944975480147445e-06, + "loss": 0.3595, "step": 2685 }, { - "epoch": 0.7749090778149869, - "grad_norm": 0.3163583277670731, - "learning_rate": 1.5379651025257415e-05, - "loss": 0.1651, + "epoch": 0.7759708650344355, + "grad_norm": 0.4401700574196651, + "learning_rate": 2.909373048784032e-06, + "loss": 0.3779, "step": 2690 }, { - "epoch": 0.7763494292607396, - "grad_norm": 0.35141493314032013, - "learning_rate": 1.5358437792432952e-05, - "loss": 0.1797, + "epoch": 0.7774131900623805, + "grad_norm": 0.4208383654033427, + "learning_rate": 2.873950460083191e-06, + "loss": 0.3749, "step": 2695 }, { - "epoch": 0.7777897807064924, - "grad_norm": 0.3462527106786627, - "learning_rate": 1.5337190680286796e-05, - "loss": 0.1692, + "epoch": 0.7788555150903256, + "grad_norm": 0.4174074736046765, + "learning_rate": 2.8387086124834952e-06, + "loss": 0.374, "step": 2700 }, { - "epoch": 0.7792301321522451, - "grad_norm": 0.36237551860243844, - "learning_rate": 1.531590982315619e-05, - "loss": 0.1682, + "epoch": 0.7802978401182706, + "grad_norm": 0.42868575004589055, + "learning_rate": 2.8036483998392784e-06, + "loss": 0.3564, "step": 2705 }, { - "epoch": 0.780670483597998, - "grad_norm": 0.35779509635155105, - "learning_rate": 1.5294595355591737e-05, - "loss": 0.1818, + "epoch": 0.7817401651462157, + "grad_norm": 0.3985935455753018, + "learning_rate": 2.768770711398001e-06, + "loss": 0.3667, "step": 2710 }, { - "epoch": 0.7821108350437507, - "grad_norm": 0.3383432292049812, - "learning_rate": 1.527324741235653e-05, - "loss": 0.1847, + "epoch": 0.7831824901741608, + "grad_norm": 0.40569605016983845, + "learning_rate": 2.734076431777688e-06, + "loss": 0.3506, "step": 2715 }, { - "epoch": 0.7835511864895034, - "grad_norm": 0.4033785323831944, - "learning_rate": 1.525186612842533e-05, - "loss": 0.1724, + "epoch": 0.7846248152021058, + "grad_norm": 0.39328145893392497, + "learning_rate": 2.6995664409444665e-06, + "loss": 0.3464, "step": 2720 }, { - "epoch": 0.7849915379352562, - "grad_norm": 0.37448292353996054, - "learning_rate": 1.5230451638983699e-05, - "loss": 0.1832, + "epoch": 0.7860671402300509, + "grad_norm": 0.4528233880552543, + "learning_rate": 2.6652416141902913e-06, + "loss": 0.3605, "step": 2725 }, { - "epoch": 0.7864318893810089, - "grad_norm": 0.3294476080971381, - "learning_rate": 1.5209004079427132e-05, - "loss": 0.1671, + "epoch": 0.7875094652579959, + "grad_norm": 0.4480705994704807, + "learning_rate": 2.631102822110695e-06, + "loss": 0.3726, "step": 2730 }, { - "epoch": 0.7878722408267618, - "grad_norm": 0.33174892241685555, - "learning_rate": 1.518752358536022e-05, - "loss": 0.1672, + "epoch": 0.788951790285941, + "grad_norm": 0.4574022134374259, + "learning_rate": 2.597150930582757e-06, + "loss": 0.359, "step": 2735 }, { - "epoch": 0.7893125922725145, - "grad_norm": 0.3625241110832361, - "learning_rate": 1.5166010292595794e-05, - "loss": 0.1729, + "epoch": 0.790394115313886, + "grad_norm": 0.4078128321456425, + "learning_rate": 2.563386800743094e-06, + "loss": 0.3413, "step": 2740 }, { - "epoch": 0.7907529437182672, - "grad_norm": 0.3172553909416287, - "learning_rate": 1.5144464337154045e-05, - "loss": 0.1712, + "epoch": 0.791836440341831, + "grad_norm": 0.44464864656256, + "learning_rate": 2.5298112889660544e-06, + "loss": 0.3587, "step": 2745 }, { - "epoch": 0.79219329516402, - "grad_norm": 0.3752235550380414, - "learning_rate": 1.5122885855261687e-05, - "loss": 0.175, + "epoch": 0.793278765369776, + "grad_norm": 0.3890963843751233, + "learning_rate": 2.4964252468419802e-06, + "loss": 0.344, "step": 2750 }, { - "epoch": 0.7936336466097728, - "grad_norm": 0.39890985335500706, - "learning_rate": 1.5101274983351082e-05, - "loss": 0.1707, + "epoch": 0.7947210903977211, + "grad_norm": 0.42348428672207705, + "learning_rate": 2.463229521155611e-06, + "loss": 0.3835, "step": 2755 }, { - "epoch": 0.7950739980555256, - "grad_norm": 0.35249780009984283, - "learning_rate": 1.5079631858059385e-05, - "loss": 0.1619, + "epoch": 0.7961634154256662, + "grad_norm": 0.4244981524719468, + "learning_rate": 2.430224953864617e-06, + "loss": 0.3908, "step": 2760 }, { - "epoch": 0.7965143495012783, - "grad_norm": 0.3677203486485534, - "learning_rate": 1.5057956616227669e-05, - "loss": 0.1727, + "epoch": 0.7976057404536112, + "grad_norm": 0.4461589097043871, + "learning_rate": 2.397412382078219e-06, + "loss": 0.3493, "step": 2765 }, { - "epoch": 0.797954700947031, - "grad_norm": 0.33164847451796536, - "learning_rate": 1.5036249394900073e-05, - "loss": 0.1602, + "epoch": 0.7990480654815563, + "grad_norm": 0.4226119316706504, + "learning_rate": 2.364792638035982e-06, + "loss": 0.3549, "step": 2770 }, { - "epoch": 0.7993950523927839, - "grad_norm": 0.33110969305592347, - "learning_rate": 1.5014510331322935e-05, - "loss": 0.1767, + "epoch": 0.8004903905095013, + "grad_norm": 0.43426124883547124, + "learning_rate": 2.3323665490866964e-06, + "loss": 0.3578, "step": 2775 }, { - "epoch": 0.8008354038385366, - "grad_norm": 0.3357704192473928, - "learning_rate": 1.499273956294391e-05, - "loss": 0.1758, + "epoch": 0.8019327155374464, + "grad_norm": 0.42274869171496543, + "learning_rate": 2.300134937667391e-06, + "loss": 0.3805, "step": 2780 }, { - "epoch": 0.8022757552842894, - "grad_norm": 0.3203731450233889, - "learning_rate": 1.4970937227411113e-05, - "loss": 0.1707, + "epoch": 0.8033750405653914, + "grad_norm": 0.4841781161829471, + "learning_rate": 2.2680986212824786e-06, + "loss": 0.3499, "step": 2785 }, { - "epoch": 0.8037161067300421, - "grad_norm": 0.35078849531073075, - "learning_rate": 1.4949103462572247e-05, - "loss": 0.1716, + "epoch": 0.8048173655933365, + "grad_norm": 0.428134320224768, + "learning_rate": 2.2362584124830167e-06, + "loss": 0.3684, "step": 2790 }, { - "epoch": 0.8051564581757948, - "grad_norm": 0.3438642759457224, - "learning_rate": 1.4927238406473734e-05, - "loss": 0.1818, + "epoch": 0.8062596906212816, + "grad_norm": 0.4117804314200649, + "learning_rate": 2.204615118846107e-06, + "loss": 0.3869, "step": 2795 }, { - "epoch": 0.8065968096215477, - "grad_norm": 0.3045848895679209, - "learning_rate": 1.4905342197359826e-05, - "loss": 0.1632, + "epoch": 0.8077020156492265, + "grad_norm": 0.41413616917927765, + "learning_rate": 2.1731695429543974e-06, + "loss": 0.338, "step": 2800 }, { - "epoch": 0.8080371610673004, - "grad_norm": 0.3359619222321305, - "learning_rate": 1.4883414973671758e-05, - "loss": 0.1793, + "epoch": 0.8091443406771716, + "grad_norm": 0.4360068588380961, + "learning_rate": 2.141922482375737e-06, + "loss": 0.3665, "step": 2805 }, { - "epoch": 0.8094775125130532, - "grad_norm": 0.34845042237143453, - "learning_rate": 1.4861456874046849e-05, - "loss": 0.1804, + "epoch": 0.8105866657051166, + "grad_norm": 0.4334830193418244, + "learning_rate": 2.1108747296429477e-06, + "loss": 0.3721, "step": 2810 }, { - "epoch": 0.8109178639588059, - "grad_norm": 0.3638246314263187, - "learning_rate": 1.483946803731764e-05, - "loss": 0.1786, + "epoch": 0.8120289907330617, + "grad_norm": 0.507519342034383, + "learning_rate": 2.080027072233718e-06, + "loss": 0.3646, "step": 2815 }, { - "epoch": 0.8123582154045587, - "grad_norm": 0.338860960844331, - "learning_rate": 1.4817448602511008e-05, - "loss": 0.172, + "epoch": 0.8134713157610067, + "grad_norm": 0.42834185576130923, + "learning_rate": 2.049380292550629e-06, + "loss": 0.3633, "step": 2820 }, { - "epoch": 0.8137985668503115, - "grad_norm": 0.3273375870728084, - "learning_rate": 1.4795398708847288e-05, - "loss": 0.172, + "epoch": 0.8149136407889518, + "grad_norm": 0.453195030964312, + "learning_rate": 2.018935167901316e-06, + "loss": 0.3539, "step": 2825 }, { - "epoch": 0.8152389182960642, - "grad_norm": 0.3675294921196136, - "learning_rate": 1.4773318495739399e-05, - "loss": 0.1686, + "epoch": 0.8163559658168968, + "grad_norm": 0.4103347116873249, + "learning_rate": 1.9886924704787482e-06, + "loss": 0.3457, "step": 2830 }, { - "epoch": 0.816679269741817, - "grad_norm": 0.3629448732145989, - "learning_rate": 1.4751208102791953e-05, - "loss": 0.1649, + "epoch": 0.8177982908448419, + "grad_norm": 0.4081898260751316, + "learning_rate": 1.9586529673416433e-06, + "loss": 0.347, "step": 2835 }, { - "epoch": 0.8181196211875698, - "grad_norm": 0.3532170948727624, - "learning_rate": 1.4729067669800379e-05, - "loss": 0.1731, + "epoch": 0.819240615872787, + "grad_norm": 0.40268175350554464, + "learning_rate": 1.928817420395018e-06, + "loss": 0.3772, "step": 2840 }, { - "epoch": 0.8195599726333225, - "grad_norm": 0.37425170739303987, - "learning_rate": 1.4706897336750045e-05, - "loss": 0.1801, + "epoch": 0.820682940900732, + "grad_norm": 0.43775696767862726, + "learning_rate": 1.8991865863708547e-06, + "loss": 0.3718, "step": 2845 }, { - "epoch": 0.8210003240790753, - "grad_norm": 0.33497558634653835, - "learning_rate": 1.4684697243815353e-05, - "loss": 0.1796, + "epoch": 0.8221252659286771, + "grad_norm": 0.43895036356232614, + "learning_rate": 1.8697612168089152e-06, + "loss": 0.3648, "step": 2850 }, { - "epoch": 0.822440675524828, - "grad_norm": 0.32112128012699964, - "learning_rate": 1.466246753135887e-05, - "loss": 0.1695, + "epoch": 0.823567590956622, + "grad_norm": 0.40821144604675824, + "learning_rate": 1.8405420580376755e-06, + "loss": 0.3422, "step": 2855 }, { - "epoch": 0.8238810269705809, - "grad_norm": 0.3329436646941721, - "learning_rate": 1.4640208339930442e-05, - "loss": 0.1792, + "epoch": 0.8250099159845671, + "grad_norm": 0.4577535204704979, + "learning_rate": 1.811529851155398e-06, + "loss": 0.3511, "step": 2860 }, { - "epoch": 0.8253213784163336, - "grad_norm": 0.3463503675900774, - "learning_rate": 1.4617919810266293e-05, - "loss": 0.1661, + "epoch": 0.8264522410125121, + "grad_norm": 0.40698416625428246, + "learning_rate": 1.7827253320113347e-06, + "loss": 0.3521, "step": 2865 }, { - "epoch": 0.8267617298620864, - "grad_norm": 0.3586194073001657, - "learning_rate": 1.459560208328814e-05, - "loss": 0.1712, + "epoch": 0.8278945660404572, + "grad_norm": 0.48745985212369625, + "learning_rate": 1.7541292311870616e-06, + "loss": 0.3727, "step": 2870 }, { - "epoch": 0.8282020813078391, - "grad_norm": 0.3616414921071555, - "learning_rate": 1.4573255300102306e-05, - "loss": 0.1647, + "epoch": 0.8293368910684022, + "grad_norm": 0.4152788200688241, + "learning_rate": 1.7257422739779495e-06, + "loss": 0.3406, "step": 2875 }, { - "epoch": 0.8296424327535918, - "grad_norm": 0.3382993481908728, - "learning_rate": 1.4550879601998829e-05, - "loss": 0.1817, + "epoch": 0.8307792160963473, + "grad_norm": 0.42357457834820555, + "learning_rate": 1.6975651803747716e-06, + "loss": 0.3614, "step": 2880 }, { - "epoch": 0.8310827841993447, - "grad_norm": 0.3291217731024657, - "learning_rate": 1.4528475130450555e-05, - "loss": 0.1583, + "epoch": 0.8322215411242924, + "grad_norm": 0.4290601435620992, + "learning_rate": 1.6695986650454355e-06, + "loss": 0.349, "step": 2885 }, { - "epoch": 0.8325231356450974, - "grad_norm": 0.3802083802739792, - "learning_rate": 1.4506042027112259e-05, - "loss": 0.1667, + "epoch": 0.8336638661522374, + "grad_norm": 0.40830671063358515, + "learning_rate": 1.6418434373168623e-06, + "loss": 0.3592, "step": 2890 }, { - "epoch": 0.8339634870908502, - "grad_norm": 0.3027853759942923, - "learning_rate": 1.4483580433819747e-05, - "loss": 0.1639, + "epoch": 0.8351061911801825, + "grad_norm": 0.4097799963554095, + "learning_rate": 1.614300201156994e-06, + "loss": 0.3359, "step": 2895 }, { - "epoch": 0.8354038385366029, - "grad_norm": 0.3186470887578327, - "learning_rate": 1.446109049258895e-05, - "loss": 0.1662, + "epoch": 0.8365485162081275, + "grad_norm": 0.43204146744095845, + "learning_rate": 1.5869696551569346e-06, + "loss": 0.3596, "step": 2900 }, { - "epoch": 0.8368441899823557, - "grad_norm": 0.32594383112579883, - "learning_rate": 1.4438572345615036e-05, - "loss": 0.1718, + "epoch": 0.8379908412360726, + "grad_norm": 0.46076233886580875, + "learning_rate": 1.5598524925132396e-06, + "loss": 0.3609, "step": 2905 }, { - "epoch": 0.8382845414281085, - "grad_norm": 0.33772511433449404, - "learning_rate": 1.4416026135271502e-05, - "loss": 0.1748, + "epoch": 0.8394331662640176, + "grad_norm": 0.4286297255981423, + "learning_rate": 1.5329494010103263e-06, + "loss": 0.3607, "step": 2910 }, { - "epoch": 0.8397248928738612, - "grad_norm": 0.3630128403423318, - "learning_rate": 1.4393452004109288e-05, - "loss": 0.1753, + "epoch": 0.8408754912919626, + "grad_norm": 0.3956440167259478, + "learning_rate": 1.5062610630030317e-06, + "loss": 0.316, "step": 2915 }, { - "epoch": 0.841165244319614, - "grad_norm": 0.37176315359169365, - "learning_rate": 1.4370850094855855e-05, - "loss": 0.1688, + "epoch": 0.8423178163199077, + "grad_norm": 0.41432843943606673, + "learning_rate": 1.4797881553993099e-06, + "loss": 0.3589, "step": 2920 }, { - "epoch": 0.8426055957653668, - "grad_norm": 0.325548664105803, - "learning_rate": 1.4348220550414305e-05, - "loss": 0.1669, + "epoch": 0.8437601413478527, + "grad_norm": 0.397270661772685, + "learning_rate": 1.4535313496430558e-06, + "loss": 0.3519, "step": 2925 }, { - "epoch": 0.8440459472111195, - "grad_norm": 0.3034699571801956, - "learning_rate": 1.4325563513862456e-05, - "loss": 0.169, + "epoch": 0.8452024663757978, + "grad_norm": 0.41857285751070505, + "learning_rate": 1.4274913116970846e-06, + "loss": 0.3401, "step": 2930 }, { - "epoch": 0.8454862986568723, - "grad_norm": 0.3624727001828696, - "learning_rate": 1.4302879128451956e-05, - "loss": 0.1799, + "epoch": 0.8466447914037428, + "grad_norm": 0.3941031419777465, + "learning_rate": 1.4016687020262231e-06, + "loss": 0.3504, "step": 2935 }, { - "epoch": 0.846926650102625, - "grad_norm": 0.3456854159772283, - "learning_rate": 1.428016753760737e-05, - "loss": 0.1747, + "epoch": 0.8480871164316879, + "grad_norm": 0.428688446592497, + "learning_rate": 1.3760641755805848e-06, + "loss": 0.3614, "step": 2940 }, { - "epoch": 0.8483670015483779, - "grad_norm": 0.3935855355179201, - "learning_rate": 1.425742888492526e-05, - "loss": 0.1831, + "epoch": 0.8495294414596329, + "grad_norm": 0.4097211469034453, + "learning_rate": 1.3506783817789337e-06, + "loss": 0.3384, "step": 2945 }, { - "epoch": 0.8498073529941306, - "grad_norm": 0.39615158777629134, - "learning_rate": 1.4234663314173307e-05, - "loss": 0.1795, + "epoch": 0.850971766487578, + "grad_norm": 0.44047116848231305, + "learning_rate": 1.3255119644922266e-06, + "loss": 0.3638, "step": 2950 }, { - "epoch": 0.8512477044398833, - "grad_norm": 0.3427325406907131, - "learning_rate": 1.421187096928937e-05, - "loss": 0.1686, + "epoch": 0.852414091515523, + "grad_norm": 0.3994464624403052, + "learning_rate": 1.300565562027276e-06, + "loss": 0.3447, "step": 2955 }, { - "epoch": 0.8526880558856361, - "grad_norm": 0.3650047955860237, - "learning_rate": 1.41890519943806e-05, - "loss": 0.1745, + "epoch": 0.8538564165434681, + "grad_norm": 0.44495457947302897, + "learning_rate": 1.2758398071105626e-06, + "loss": 0.3546, "step": 2960 }, { - "epoch": 0.8541284073313888, - "grad_norm": 0.3821350839884672, - "learning_rate": 1.4166206533722517e-05, - "loss": 0.1637, + "epoch": 0.8552987415714132, + "grad_norm": 0.4147516297268767, + "learning_rate": 1.2513353268721907e-06, + "loss": 0.3421, "step": 2965 }, { - "epoch": 0.8555687587771417, - "grad_norm": 0.30931762511103533, - "learning_rate": 1.4143334731758094e-05, - "loss": 0.172, + "epoch": 0.8567410665993581, + "grad_norm": 0.422646250463158, + "learning_rate": 1.2270527428299684e-06, + "loss": 0.3579, "step": 2970 }, { - "epoch": 0.8570091102228944, - "grad_norm": 0.3421291900106858, - "learning_rate": 1.4120436733096855e-05, - "loss": 0.1757, + "epoch": 0.8581833916273032, + "grad_norm": 0.4189403344854125, + "learning_rate": 1.2029926708736673e-06, + "loss": 0.3425, "step": 2975 }, { - "epoch": 0.8584494616686471, - "grad_norm": 0.551760437183804, - "learning_rate": 1.4097512682513958e-05, - "loss": 0.1843, + "epoch": 0.8596257166552482, + "grad_norm": 0.41547910036939945, + "learning_rate": 1.179155721249381e-06, + "loss": 0.3376, "step": 2980 }, { - "epoch": 0.8598898131143999, - "grad_norm": 1.5660769145066697, - "learning_rate": 1.4074562724949274e-05, - "loss": 0.1986, + "epoch": 0.8610680416831933, + "grad_norm": 0.42428858195226893, + "learning_rate": 1.1555424985440522e-06, + "loss": 0.3554, "step": 2985 }, { - "epoch": 0.8613301645601527, - "grad_norm": 3.6907279321244477, - "learning_rate": 1.4051587005506474e-05, - "loss": 0.2213, + "epoch": 0.8625103667111383, + "grad_norm": 0.4425537282272965, + "learning_rate": 1.1321536016701473e-06, + "loss": 0.351, "step": 2990 }, { - "epoch": 0.8627705160059055, - "grad_norm": 0.47087515557365756, - "learning_rate": 1.4028585669452111e-05, - "loss": 0.1803, + "epoch": 0.8639526917390834, + "grad_norm": 0.4161228925911087, + "learning_rate": 1.1089896238504461e-06, + "loss": 0.336, "step": 2995 }, { - "epoch": 0.8642108674516582, - "grad_norm": 0.3672313808361774, - "learning_rate": 1.40055588622147e-05, - "loss": 0.1852, + "epoch": 0.8653950167670285, + "grad_norm": 0.37656047979276985, + "learning_rate": 1.086051152603026e-06, + "loss": 0.3509, "step": 3000 }, { - "epoch": 0.8642108674516582, - "eval_loss": 0.17705903947353363, - "eval_runtime": 179.9737, - "eval_samples_per_second": 10.024, - "eval_steps_per_second": 2.506, + "epoch": 0.8653950167670285, + "eval_loss": 0.3611552119255066, + "eval_runtime": 142.3229, + "eval_samples_per_second": 12.654, + "eval_steps_per_second": 3.169, "step": 3000 }, { - "epoch": 0.8656512188974109, - "grad_norm": 0.326486381519465, - "learning_rate": 1.3982506729383805e-05, - "loss": 0.1731, + "epoch": 0.8668373417949735, + "grad_norm": 0.4463172354545017, + "learning_rate": 1.0633387697263254e-06, + "loss": 0.35, "step": 3005 }, { - "epoch": 0.8670915703431638, - "grad_norm": 0.32065558593854615, - "learning_rate": 1.3959429416709112e-05, - "loss": 0.1697, + "epoch": 0.8682796668229186, + "grad_norm": 0.43074983850708387, + "learning_rate": 1.0408530512844196e-06, + "loss": 0.3613, "step": 3010 }, { - "epoch": 0.8685319217889165, - "grad_norm": 0.404356115517127, - "learning_rate": 1.393632707009951e-05, - "loss": 0.1832, + "epoch": 0.8697219918508636, + "grad_norm": 0.39354733454334206, + "learning_rate": 1.0185945675923813e-06, + "loss": 0.3727, "step": 3015 }, { - "epoch": 0.8699722732346693, - "grad_norm": 0.31904774410454045, - "learning_rate": 1.3913199835622165e-05, - "loss": 0.1695, + "epoch": 0.8711643168788087, + "grad_norm": 0.44960602091132634, + "learning_rate": 9.965638832018432e-07, + "loss": 0.372, "step": 3020 }, { - "epoch": 0.871412624680422, - "grad_norm": 0.45454954577102474, - "learning_rate": 1.38900478595016e-05, - "loss": 0.1824, + "epoch": 0.8726066419067536, + "grad_norm": 0.42518881330063735, + "learning_rate": 9.747615568866553e-07, + "loss": 0.3516, "step": 3025 }, { - "epoch": 0.8728529761261747, - "grad_norm": 0.36657761636760966, - "learning_rate": 1.3866871288118772e-05, - "loss": 0.1729, + "epoch": 0.8740489669346987, + "grad_norm": 0.44741688383815076, + "learning_rate": 9.531881416287203e-07, + "loss": 0.3562, "step": 3030 }, { - "epoch": 0.8742933275719276, - "grad_norm": 0.29971196823956686, - "learning_rate": 1.384367026801015e-05, - "loss": 0.1688, + "epoch": 0.8754912919626437, + "grad_norm": 0.4331522299966881, + "learning_rate": 9.318441846039828e-07, + "loss": 0.3548, "step": 3035 }, { - "epoch": 0.8757336790176803, - "grad_norm": 0.33635749644358365, - "learning_rate": 1.3820444945866765e-05, - "loss": 0.1705, + "epoch": 0.8769336169905888, + "grad_norm": 0.506237893255727, + "learning_rate": 9.107302271685226e-07, + "loss": 0.3412, "step": 3040 }, { - "epoch": 0.8771740304634331, - "grad_norm": 0.36596078522080977, - "learning_rate": 1.3797195468533316e-05, - "loss": 0.1691, + "epoch": 0.8783759420185339, + "grad_norm": 0.4658754493753741, + "learning_rate": 8.898468048448528e-07, + "loss": 0.3336, "step": 3045 }, { - "epoch": 0.8786143819091858, - "grad_norm": 0.3474031868244395, - "learning_rate": 1.3773921983007224e-05, - "loss": 0.1666, + "epoch": 0.8798182670464789, + "grad_norm": 0.438225563597408, + "learning_rate": 8.691944473083114e-07, + "loss": 0.3422, "step": 3050 }, { - "epoch": 0.8800547333549386, - "grad_norm": 0.3465786978708465, - "learning_rate": 1.37506246364377e-05, - "loss": 0.1695, + "epoch": 0.881260592074424, + "grad_norm": 0.4170714809613398, + "learning_rate": 8.487736783736533e-07, + "loss": 0.3621, "step": 3055 }, { - "epoch": 0.8814950848006914, - "grad_norm": 0.3178525376840447, - "learning_rate": 1.3727303576124817e-05, - "loss": 0.1658, + "epoch": 0.882702917102369, + "grad_norm": 0.4590349478238853, + "learning_rate": 8.285850159817388e-07, + "loss": 0.3791, "step": 3060 }, { - "epoch": 0.8829354362464441, - "grad_norm": 0.3774091077871316, - "learning_rate": 1.370395894951859e-05, - "loss": 0.1656, + "epoch": 0.8841452421303141, + "grad_norm": 0.4332258091307991, + "learning_rate": 8.086289721864127e-07, + "loss": 0.3404, "step": 3065 }, { - "epoch": 0.8843757876921969, - "grad_norm": 0.3665050342290737, - "learning_rate": 1.3680590904218032e-05, - "loss": 0.1777, + "epoch": 0.8855875671582591, + "grad_norm": 0.4452410333427778, + "learning_rate": 7.889060531415193e-07, + "loss": 0.3541, "step": 3070 }, { - "epoch": 0.8858161391379497, - "grad_norm": 0.3248751131099162, - "learning_rate": 1.3657199587970212e-05, - "loss": 0.1733, + "epoch": 0.8870298921862042, + "grad_norm": 0.42507300447077245, + "learning_rate": 7.694167590880475e-07, + "loss": 0.3549, "step": 3075 }, { - "epoch": 0.8872564905837024, - "grad_norm": 0.3560181873382442, - "learning_rate": 1.3633785148669343e-05, - "loss": 0.1662, + "epoch": 0.8884722172141493, + "grad_norm": 0.4227403053651907, + "learning_rate": 7.501615843414623e-07, + "loss": 0.3264, "step": 3080 }, { - "epoch": 0.8886968420294552, - "grad_norm": 0.36371643455151664, - "learning_rate": 1.3610347734355838e-05, - "loss": 0.1885, + "epoch": 0.8899145422420942, + "grad_norm": 0.4131961662824003, + "learning_rate": 7.311410172791522e-07, + "loss": 0.3369, "step": 3085 }, { - "epoch": 0.8901371934752079, - "grad_norm": 0.31964376113184734, - "learning_rate": 1.3586887493215364e-05, - "loss": 0.1798, + "epoch": 0.8913568672700393, + "grad_norm": 0.39579591570866374, + "learning_rate": 7.123555403280558e-07, + "loss": 0.3483, "step": 3090 }, { - "epoch": 0.8915775449209608, - "grad_norm": 0.34388240218522587, - "learning_rate": 1.3563404573577919e-05, - "loss": 0.1645, + "epoch": 0.8927991922979843, + "grad_norm": 0.42292696994848605, + "learning_rate": 6.938056299524099e-07, + "loss": 0.3398, "step": 3095 }, { - "epoch": 0.8930178963667135, - "grad_norm": 0.32094313709171657, - "learning_rate": 1.3539899123916884e-05, - "loss": 0.1709, + "epoch": 0.8942415173259294, + "grad_norm": 0.38022938922831223, + "learning_rate": 6.754917566416796e-07, + "loss": 0.3469, "step": 3100 }, { - "epoch": 0.8944582478124662, - "grad_norm": 0.35489801575784186, - "learning_rate": 1.3516371292848098e-05, - "loss": 0.1759, + "epoch": 0.8956838423538744, + "grad_norm": 0.4849805496701068, + "learning_rate": 6.574143848986226e-07, + "loss": 0.3618, "step": 3105 }, { - "epoch": 0.895898599258219, - "grad_norm": 0.3435136193030186, - "learning_rate": 1.3492821229128892e-05, - "loss": 0.1801, + "epoch": 0.8971261673818195, + "grad_norm": 0.44465461522642474, + "learning_rate": 6.395739732274919e-07, + "loss": 0.3642, "step": 3110 }, { - "epoch": 0.8973389507039717, - "grad_norm": 0.3405482581189107, - "learning_rate": 1.3469249081657178e-05, - "loss": 0.1621, + "epoch": 0.8985684924097646, + "grad_norm": 0.44656695164750837, + "learning_rate": 6.219709741224322e-07, + "loss": 0.3563, "step": 3115 }, { - "epoch": 0.8987793021497246, - "grad_norm": 0.36475355151619254, - "learning_rate": 1.34456549994705e-05, - "loss": 0.1767, + "epoch": 0.9000108174377096, + "grad_norm": 0.4269116876807273, + "learning_rate": 6.046058340559824e-07, + "loss": 0.3431, "step": 3120 }, { - "epoch": 0.9002196535954773, - "grad_norm": 0.3185028661890843, - "learning_rate": 1.3422039131745073e-05, - "loss": 0.1718, + "epoch": 0.9014531424656547, + "grad_norm": 0.4086865891433274, + "learning_rate": 5.874789934677583e-07, + "loss": 0.3505, "step": 3125 }, { - "epoch": 0.90166000504123, - "grad_norm": 0.3501109132647978, - "learning_rate": 1.3398401627794855e-05, - "loss": 0.164, + "epoch": 0.9028954674935997, + "grad_norm": 0.4404444466800333, + "learning_rate": 5.705908867532862e-07, + "loss": 0.3407, "step": 3130 }, { - "epoch": 0.9031003564869828, - "grad_norm": 0.34377330815664026, - "learning_rate": 1.3374742637070612e-05, - "loss": 0.1705, + "epoch": 0.9043377925215448, + "grad_norm": 0.45999537115175176, + "learning_rate": 5.53941942252979e-07, + "loss": 0.37, "step": 3135 }, { - "epoch": 0.9045407079327356, - "grad_norm": 0.33654711284735916, - "learning_rate": 1.335106230915896e-05, - "loss": 0.1609, + "epoch": 0.9057801175494897, + "grad_norm": 0.4242568290280731, + "learning_rate": 5.375325822412747e-07, + "loss": 0.3316, "step": 3140 }, { - "epoch": 0.9059810593784884, - "grad_norm": 0.3245646527085092, - "learning_rate": 1.3327360793781408e-05, - "loss": 0.158, + "epoch": 0.9072224425774348, + "grad_norm": 0.4753028820261241, + "learning_rate": 5.213632229159227e-07, + "loss": 0.3785, "step": 3145 }, { - "epoch": 0.9074214108242411, - "grad_norm": 0.3351308270482288, - "learning_rate": 1.3303638240793442e-05, - "loss": 0.1678, + "epoch": 0.9086647676053798, + "grad_norm": 0.4699691806857396, + "learning_rate": 5.054342743874386e-07, + "loss": 0.3617, "step": 3150 }, { - "epoch": 0.9088617622699939, - "grad_norm": 0.32107454775466887, - "learning_rate": 1.3279894800183555e-05, - "loss": 0.1563, + "epoch": 0.9101070926333249, + "grad_norm": 0.4352496762130561, + "learning_rate": 4.897461406686821e-07, + "loss": 0.3359, "step": 3155 }, { - "epoch": 0.9103021137157467, - "grad_norm": 0.34245383003412766, - "learning_rate": 1.3256130622072301e-05, - "loss": 0.1766, + "epoch": 0.91154941766127, + "grad_norm": 0.4316421343515809, + "learning_rate": 4.742992196646301e-07, + "loss": 0.3376, "step": 3160 }, { - "epoch": 0.9117424651614994, - "grad_norm": 0.3274886424161308, - "learning_rate": 1.323234585671135e-05, - "loss": 0.1715, + "epoch": 0.912991742689215, + "grad_norm": 0.4001287994073788, + "learning_rate": 4.590939031622743e-07, + "loss": 0.3351, "step": 3165 }, { - "epoch": 0.9131828166072522, - "grad_norm": 0.32664224598446223, - "learning_rate": 1.3208540654482543e-05, - "loss": 0.1797, + "epoch": 0.9144340677171601, + "grad_norm": 0.4363788326973079, + "learning_rate": 4.4413057682068606e-07, + "loss": 0.3473, "step": 3170 }, { - "epoch": 0.9146231680530049, - "grad_norm": 0.3570503032198834, - "learning_rate": 1.3184715165896924e-05, - "loss": 0.1684, + "epoch": 0.9158763927451051, + "grad_norm": 0.44176842953481193, + "learning_rate": 4.2940962016123524e-07, + "loss": 0.3332, "step": 3175 }, { - "epoch": 0.9160635194987578, - "grad_norm": 0.3041353628175197, - "learning_rate": 1.3160869541593815e-05, - "loss": 0.1644, + "epoch": 0.9173187177730502, + "grad_norm": 0.43914474716543256, + "learning_rate": 4.149314065579624e-07, + "loss": 0.3383, "step": 3180 }, { - "epoch": 0.9175038709445105, - "grad_norm": 0.3384280077903303, - "learning_rate": 1.3137003932339834e-05, - "loss": 0.1655, + "epoch": 0.9187610428009952, + "grad_norm": 0.4540079519566383, + "learning_rate": 4.0069630322811303e-07, + "loss": 0.3786, "step": 3185 }, { - "epoch": 0.9189442223902632, - "grad_norm": 0.32882081126553514, - "learning_rate": 1.3113118489027968e-05, - "loss": 0.1639, + "epoch": 0.9202033678289403, + "grad_norm": 0.4612868459187327, + "learning_rate": 3.867046712228162e-07, + "loss": 0.3625, "step": 3190 }, { - "epoch": 0.920384573836016, - "grad_norm": 0.33419046612443065, - "learning_rate": 1.3089213362676595e-05, - "loss": 0.169, + "epoch": 0.9216456928568852, + "grad_norm": 0.40372545279617805, + "learning_rate": 3.729568654179361e-07, + "loss": 0.3308, "step": 3195 }, { - "epoch": 0.9218249252817687, - "grad_norm": 0.32575685656168196, - "learning_rate": 1.306528870442855e-05, - "loss": 0.1673, + "epoch": 0.9230880178848303, + "grad_norm": 0.4204476032972304, + "learning_rate": 3.5945323450506387e-07, + "loss": 0.3346, "step": 3200 }, { - "epoch": 0.9232652767275216, - "grad_norm": 0.3802101667394028, - "learning_rate": 1.304134466555016e-05, - "loss": 0.1672, + "epoch": 0.9245303429127754, + "grad_norm": 0.45260198781122246, + "learning_rate": 3.4619412098267693e-07, + "loss": 0.3795, "step": 3205 }, { - "epoch": 0.9247056281732743, - "grad_norm": 0.3637602408564527, - "learning_rate": 1.3017381397430285e-05, - "loss": 0.1721, + "epoch": 0.9259726679407204, + "grad_norm": 0.42527213346553855, + "learning_rate": 3.331798611474535e-07, + "loss": 0.3421, "step": 3210 }, { - "epoch": 0.926145979619027, - "grad_norm": 0.32095803714990945, - "learning_rate": 1.2993399051579365e-05, - "loss": 0.1759, + "epoch": 0.9274149929686655, + "grad_norm": 0.414984415520749, + "learning_rate": 3.204107850857374e-07, + "loss": 0.3291, "step": 3215 }, { - "epoch": 0.9275863310647798, - "grad_norm": 0.33727876451711586, - "learning_rate": 1.2969397779628459e-05, - "loss": 0.1691, + "epoch": 0.9288573179966105, + "grad_norm": 0.4549260227056393, + "learning_rate": 3.0788721666517365e-07, + "loss": 0.3486, "step": 3220 }, { - "epoch": 0.9290266825105326, - "grad_norm": 0.40867319701914406, - "learning_rate": 1.2945377733328297e-05, - "loss": 0.1775, + "epoch": 0.9302996430245556, + "grad_norm": 0.4443023622951338, + "learning_rate": 2.9560947352648697e-07, + "loss": 0.3756, "step": 3225 }, { - "epoch": 0.9304670339562854, - "grad_norm": 0.32340952324251065, - "learning_rate": 1.29213390645483e-05, - "loss": 0.1668, + "epoch": 0.9317419680525006, + "grad_norm": 0.4250192102717841, + "learning_rate": 2.8357786707542854e-07, + "loss": 0.3525, "step": 3230 }, { - "epoch": 0.9319073854020381, - "grad_norm": 0.3659132059806781, - "learning_rate": 1.289728192527564e-05, - "loss": 0.165, + "epoch": 0.9331842930804457, + "grad_norm": 0.41194820669384097, + "learning_rate": 2.71792702474879e-07, + "loss": 0.3562, "step": 3235 }, { - "epoch": 0.9333477368477908, - "grad_norm": 0.3285554990119345, - "learning_rate": 1.2873206467614268e-05, - "loss": 0.1651, + "epoch": 0.9346266181083908, + "grad_norm": 0.42277936484045997, + "learning_rate": 2.602542786371065e-07, + "loss": 0.3609, "step": 3240 }, { - "epoch": 0.9347880882935437, - "grad_norm": 0.33834559931171954, - "learning_rate": 1.2849112843783952e-05, - "loss": 0.1683, + "epoch": 0.9360689431363358, + "grad_norm": 0.402522590339594, + "learning_rate": 2.489628882161832e-07, + "loss": 0.3323, "step": 3245 }, { - "epoch": 0.9362284397392964, - "grad_norm": 0.3134919137585381, - "learning_rate": 1.2825001206119328e-05, - "loss": 0.1743, + "epoch": 0.9375112681642809, + "grad_norm": 0.42468823176649917, + "learning_rate": 2.3791881760056756e-07, + "loss": 0.3705, "step": 3250 }, { - "epoch": 0.9376687911850492, - "grad_norm": 0.3616806939948535, - "learning_rate": 1.2800871707068913e-05, - "loss": 0.1782, + "epoch": 0.9389535931922258, + "grad_norm": 0.42563197511583134, + "learning_rate": 2.2712234690583813e-07, + "loss": 0.3635, "step": 3255 }, { - "epoch": 0.9391091426308019, - "grad_norm": 0.38816909872964706, - "learning_rate": 1.2776724499194165e-05, - "loss": 0.1589, + "epoch": 0.9403959182201709, + "grad_norm": 0.4452148892270775, + "learning_rate": 2.1657374996758795e-07, + "loss": 0.3478, "step": 3260 }, { - "epoch": 0.9405494940765546, - "grad_norm": 0.38618952505233706, - "learning_rate": 1.27525597351685e-05, - "loss": 0.1762, + "epoch": 0.9418382432481159, + "grad_norm": 0.4539015567282992, + "learning_rate": 2.0627329433447917e-07, + "loss": 0.3736, "step": 3265 }, { - "epoch": 0.9419898455223075, - "grad_norm": 0.3602161530621059, - "learning_rate": 1.272837756777634e-05, - "loss": 0.1866, + "epoch": 0.943280568276061, + "grad_norm": 0.40270803503237657, + "learning_rate": 1.9622124126145837e-07, + "loss": 0.3378, "step": 3270 }, { - "epoch": 0.9434301969680602, - "grad_norm": 0.3441209052655488, - "learning_rate": 1.2704178149912142e-05, - "loss": 0.1707, + "epoch": 0.944722893304006, + "grad_norm": 0.4075396549757293, + "learning_rate": 1.864178457031318e-07, + "loss": 0.3562, "step": 3275 }, { - "epoch": 0.944870548413813, - "grad_norm": 0.3575347569438798, - "learning_rate": 1.2679961634579429e-05, - "loss": 0.1816, + "epoch": 0.9461652183319511, + "grad_norm": 0.43266062909072267, + "learning_rate": 1.768633563072919e-07, + "loss": 0.3451, "step": 3280 }, { - "epoch": 0.9463108998595657, - "grad_norm": 0.32619834091130173, - "learning_rate": 1.2655728174889823e-05, - "loss": 0.1596, + "epoch": 0.9476075433598962, + "grad_norm": 0.418621662939926, + "learning_rate": 1.6755801540862092e-07, + "loss": 0.334, "step": 3285 }, { - "epoch": 0.9477512513053185, - "grad_norm": 0.30450616004896364, - "learning_rate": 1.2631477924062086e-05, - "loss": 0.1687, + "epoch": 0.9490498683878412, + "grad_norm": 0.4221481289163581, + "learning_rate": 1.5850205902253613e-07, + "loss": 0.3536, "step": 3290 }, { - "epoch": 0.9491916027510713, - "grad_norm": 0.39194493113328666, - "learning_rate": 1.2607211035421134e-05, - "loss": 0.1798, + "epoch": 0.9504921934157863, + "grad_norm": 0.40400229300396406, + "learning_rate": 1.4969571683920768e-07, + "loss": 0.3636, "step": 3295 }, { - "epoch": 0.950631954196824, - "grad_norm": 0.33488364978285323, - "learning_rate": 1.258292766239708e-05, - "loss": 0.164, + "epoch": 0.9519345184437313, + "grad_norm": 0.4142859171614361, + "learning_rate": 1.411392122177302e-07, + "loss": 0.3302, "step": 3300 }, { - "epoch": 0.9520723056425768, - "grad_norm": 0.33449050395875485, - "learning_rate": 1.255862795852427e-05, - "loss": 0.1838, + "epoch": 0.9533768434716764, + "grad_norm": 0.4259634616965583, + "learning_rate": 1.3283276218046259e-07, + "loss": 0.3674, "step": 3305 }, { - "epoch": 0.9535126570883296, - "grad_norm": 0.3224448291423571, - "learning_rate": 1.2534312077440291e-05, - "loss": 0.159, + "epoch": 0.9548191684996213, + "grad_norm": 0.41429097541392035, + "learning_rate": 1.2477657740751714e-07, + "loss": 0.3483, "step": 3310 }, { - "epoch": 0.9549530085340823, - "grad_norm": 0.34250880685460666, - "learning_rate": 1.250998017288502e-05, - "loss": 0.1732, + "epoch": 0.9562614935275664, + "grad_norm": 0.42353387168902784, + "learning_rate": 1.169708622314214e-07, + "loss": 0.3608, "step": 3315 }, { - "epoch": 0.9563933599798351, - "grad_norm": 0.325026715867963, - "learning_rate": 1.2485632398699644e-05, - "loss": 0.1655, + "epoch": 0.9577038185555115, + "grad_norm": 0.42693212185785107, + "learning_rate": 1.0941581463193129e-07, + "loss": 0.3452, "step": 3320 }, { - "epoch": 0.9578337114255878, - "grad_norm": 0.34476248704682083, - "learning_rate": 1.2461268908825686e-05, - "loss": 0.1752, + "epoch": 0.9591461435834565, + "grad_norm": 0.4328702433520352, + "learning_rate": 1.021116262310129e-07, + "loss": 0.3413, "step": 3325 }, { - "epoch": 0.9592740628713406, - "grad_norm": 0.3442273279650722, - "learning_rate": 1.2436889857304031e-05, - "loss": 0.157, + "epoch": 0.9605884686114016, + "grad_norm": 0.41956255025855793, + "learning_rate": 9.505848228798076e-08, + "loss": 0.3604, "step": 3330 }, { - "epoch": 0.9607144143170934, - "grad_norm": 0.4303337353194812, - "learning_rate": 1.2412495398273956e-05, - "loss": 0.1728, + "epoch": 0.9620307936393466, + "grad_norm": 0.4209071869524921, + "learning_rate": 8.825656169480056e-08, + "loss": 0.3384, "step": 3335 }, { - "epoch": 0.9621547657628461, - "grad_norm": 0.365900582909615, - "learning_rate": 1.2388085685972155e-05, - "loss": 0.1712, + "epoch": 0.9634731186672917, + "grad_norm": 0.4118105753397592, + "learning_rate": 8.170603697154944e-08, + "loss": 0.3338, "step": 3340 }, { - "epoch": 0.9635951172085989, - "grad_norm": 0.3728395806507143, - "learning_rate": 1.2363660874731767e-05, - "loss": 0.1682, + "epoch": 0.9649154436952367, + "grad_norm": 0.43817584876124205, + "learning_rate": 7.540707426204163e-08, + "loss": 0.3281, "step": 3345 }, { - "epoch": 0.9650354686543516, - "grad_norm": 0.35386731676038286, - "learning_rate": 1.233922111898138e-05, - "loss": 0.1724, + "epoch": 0.9663577687231818, + "grad_norm": 0.3903217050033041, + "learning_rate": 6.935983332961305e-08, + "loss": 0.3308, "step": 3350 }, { - "epoch": 0.9664758201001045, - "grad_norm": 0.30172215095317684, - "learning_rate": 1.2314766573244085e-05, - "loss": 0.1581, + "epoch": 0.9678000937511269, + "grad_norm": 0.41905865354117233, + "learning_rate": 6.356446755307444e-08, + "loss": 0.3509, "step": 3355 }, { - "epoch": 0.9679161715458572, - "grad_norm": 0.3559005601073562, - "learning_rate": 1.2290297392136483e-05, - "loss": 0.175, + "epoch": 0.9692424187790719, + "grad_norm": 0.41394321455611666, + "learning_rate": 5.802112392281123e-08, + "loss": 0.3377, "step": 3360 }, { - "epoch": 0.9693565229916099, - "grad_norm": 0.3349516882886285, - "learning_rate": 1.2265813730367704e-05, - "loss": 0.1726, + "epoch": 0.9706847438070169, + "grad_norm": 0.4316304666724342, + "learning_rate": 5.272994303706758e-08, + "loss": 0.3592, "step": 3365 }, { - "epoch": 0.9707968744373627, - "grad_norm": 0.3234492836179018, - "learning_rate": 1.2241315742738431e-05, - "loss": 0.1797, + "epoch": 0.9721270688349619, + "grad_norm": 0.45454272140307556, + "learning_rate": 4.769105909836924e-08, + "loss": 0.3485, "step": 3370 }, { - "epoch": 0.9722372258831155, - "grad_norm": 0.35736598159988336, - "learning_rate": 1.2216803584139936e-05, - "loss": 0.1741, + "epoch": 0.973569393862907, + "grad_norm": 0.43202485000084534, + "learning_rate": 4.2904599910127406e-08, + "loss": 0.3538, "step": 3375 }, { - "epoch": 0.9736775773288683, - "grad_norm": 0.3173229290519434, - "learning_rate": 1.2192277409553075e-05, - "loss": 0.1728, + "epoch": 0.975011718890852, + "grad_norm": 0.44712558770756466, + "learning_rate": 3.837068687339351e-08, + "loss": 0.367, "step": 3380 }, { - "epoch": 0.975117928774621, - "grad_norm": 0.35907434868148436, - "learning_rate": 1.2167737374047329e-05, - "loss": 0.1619, + "epoch": 0.9764540439187971, + "grad_norm": 0.423193248701901, + "learning_rate": 3.408943498377726e-08, + "loss": 0.3351, "step": 3385 }, { - "epoch": 0.9765582802203737, - "grad_norm": 0.31746492403546855, - "learning_rate": 1.2143183632779812e-05, - "loss": 0.1606, + "epoch": 0.9778963689467421, + "grad_norm": 0.47037763666404425, + "learning_rate": 3.006095282854116e-08, + "loss": 0.3966, "step": 3390 }, { - "epoch": 0.9779986316661265, - "grad_norm": 0.29969257567145113, - "learning_rate": 1.2118616340994302e-05, - "loss": 0.176, + "epoch": 0.9793386939746872, + "grad_norm": 0.4314080592872779, + "learning_rate": 2.628534258383164e-08, + "loss": 0.357, "step": 3395 }, { - "epoch": 0.9794389831118793, - "grad_norm": 0.34191493614948304, - "learning_rate": 1.2094035654020245e-05, - "loss": 0.1771, + "epoch": 0.9807810190026323, + "grad_norm": 0.45121239415975073, + "learning_rate": 2.2762700012097795e-08, + "loss": 0.3564, "step": 3400 }, { - "epoch": 0.9808793345576321, - "grad_norm": 0.336306850022089, - "learning_rate": 1.2069441727271776e-05, - "loss": 0.1725, + "epoch": 0.9822233440305773, + "grad_norm": 0.4226505971917229, + "learning_rate": 1.9493114459659956e-08, + "loss": 0.3625, "step": 3405 }, { - "epoch": 0.9823196860033848, - "grad_norm": 0.3488759742550077, - "learning_rate": 1.2044834716246752e-05, - "loss": 0.1664, + "epoch": 0.9836656690585224, + "grad_norm": 0.4197713049001792, + "learning_rate": 1.6476668854440435e-08, + "loss": 0.3526, "step": 3410 }, { - "epoch": 0.9837600374491375, - "grad_norm": 0.29434687938442566, - "learning_rate": 1.2020214776525746e-05, - "loss": 0.1665, + "epoch": 0.9851079940864674, + "grad_norm": 0.4575738762031232, + "learning_rate": 1.3713439703865183e-08, + "loss": 0.3762, "step": 3415 }, { - "epoch": 0.9852003888948904, - "grad_norm": 0.32154231494880775, - "learning_rate": 1.1995582063771076e-05, - "loss": 0.1605, + "epoch": 0.9865503191144124, + "grad_norm": 0.4574906098764045, + "learning_rate": 1.120349709291868e-08, + "loss": 0.3634, "step": 3420 }, { - "epoch": 0.9866407403406431, - "grad_norm": 0.3216164557661555, - "learning_rate": 1.1970936733725822e-05, - "loss": 0.1649, + "epoch": 0.9879926441423574, + "grad_norm": 0.43088006927461175, + "learning_rate": 8.946904682370917e-09, + "loss": 0.3675, "step": 3425 }, { - "epoch": 0.9880810917863959, - "grad_norm": 0.3984614491878183, - "learning_rate": 1.1946278942212841e-05, - "loss": 0.1627, + "epoch": 0.9894349691703025, + "grad_norm": 0.4103449101623024, + "learning_rate": 6.943719707158681e-09, + "loss": 0.3496, "step": 3430 }, { - "epoch": 0.9895214432321486, - "grad_norm": 0.3084357411827215, - "learning_rate": 1.1921608845133774e-05, - "loss": 0.1791, + "epoch": 0.9908772941982475, + "grad_norm": 0.40469613082222705, + "learning_rate": 5.193992974935613e-09, + "loss": 0.369, "step": 3435 }, { - "epoch": 0.9909617946779014, - "grad_norm": 0.3461006191407041, - "learning_rate": 1.1896926598468062e-05, - "loss": 0.1608, + "epoch": 0.9923196192261926, + "grad_norm": 0.46076258755412675, + "learning_rate": 3.697768864782125e-09, + "loss": 0.3588, "step": 3440 }, { - "epoch": 0.9924021461236542, - "grad_norm": 0.3242641159641522, - "learning_rate": 1.187223235827197e-05, - "loss": 0.1675, + "epoch": 0.9937619442541377, + "grad_norm": 0.4334341619233562, + "learning_rate": 2.4550853260851826e-09, + "loss": 0.3345, "step": 3445 }, { - "epoch": 0.9938424975694069, - "grad_norm": 0.31853075587857466, - "learning_rate": 1.1847526280677592e-05, - "loss": 0.159, + "epoch": 0.9952042692820827, + "grad_norm": 0.44568439209243566, + "learning_rate": 1.4659738775679721e-09, + "loss": 0.3459, "step": 3450 }, { - "epoch": 0.9952828490151597, - "grad_norm": 0.36031480976740865, - "learning_rate": 1.1822808521891864e-05, - "loss": 0.1709, + "epoch": 0.9966465943100278, + "grad_norm": 0.45951543969711284, + "learning_rate": 7.30459606494982e-10, + "loss": 0.3791, "step": 3455 }, { - "epoch": 0.9967232004609125, - "grad_norm": 0.34851124123259397, - "learning_rate": 1.1798079238195574e-05, - "loss": 0.1693, + "epoch": 0.9980889193379728, + "grad_norm": 0.4459520568434071, + "learning_rate": 2.4856116803695375e-10, + "loss": 0.3525, "step": 3460 }, { - "epoch": 0.9981635519066653, - "grad_norm": 0.38061386776708506, - "learning_rate": 1.1773338585942389e-05, - "loss": 0.1662, + "epoch": 0.9995312443659179, + "grad_norm": 0.4581327568157757, + "learning_rate": 2.0290784791265893e-11, + "loss": 0.3492, "step": 3465 }, { - "epoch": 0.999603903352418, - "grad_norm": 0.3643047405112528, - "learning_rate": 1.1748586721557842e-05, - "loss": 0.1691, - "step": 3470 - }, - { - "epoch": 1.0011522811566023, - "grad_norm": 2.3721237897739833, - "learning_rate": 1.1723823801538361e-05, - "loss": 0.5506, - "step": 3475 - }, - { - "epoch": 1.0025926326023549, - "grad_norm": 2.341945780620137, - "learning_rate": 1.169904998245028e-05, - "loss": 0.1436, - "step": 3480 - }, - { - "epoch": 1.0040329840481077, - "grad_norm": 2.7593761138904846, - "learning_rate": 1.1674265420928827e-05, - "loss": 0.138, - "step": 3485 - }, - { - "epoch": 1.0054733354938605, - "grad_norm": 2.3938411855023687, - "learning_rate": 1.1649470273677178e-05, - "loss": 0.1604, - "step": 3490 - }, - { - "epoch": 1.0069136869396134, - "grad_norm": 0.30894809227179215, - "learning_rate": 1.1624664697465406e-05, - "loss": 0.1361, - "step": 3495 - }, - { - "epoch": 1.008354038385366, - "grad_norm": 0.8469245650916747, - "learning_rate": 1.1599848849129549e-05, - "loss": 0.1264, - "step": 3500 - }, - { - "epoch": 1.008354038385366, - "eval_loss": 0.14734847843647003, - "eval_runtime": 187.1808, - "eval_samples_per_second": 9.638, - "eval_steps_per_second": 2.409, - "step": 3500 - }, - { - "epoch": 1.0097943898311188, - "grad_norm": 0.2863610834780281, - "learning_rate": 1.157502288557058e-05, - "loss": 0.1397, - "step": 3505 - }, - { - "epoch": 1.0112347412768716, - "grad_norm": 0.27170412433783897, - "learning_rate": 1.155018696375342e-05, - "loss": 0.1398, - "step": 3510 - }, - { - "epoch": 1.0126750927226242, - "grad_norm": 0.26430273342487837, - "learning_rate": 1.1525341240705967e-05, - "loss": 0.1308, - "step": 3515 - }, - { - "epoch": 1.014115444168377, - "grad_norm": 0.2856254131463934, - "learning_rate": 1.1500485873518079e-05, - "loss": 0.1466, - "step": 3520 - }, - { - "epoch": 1.01555579561413, - "grad_norm": 0.28388915331696574, - "learning_rate": 1.1475621019340594e-05, - "loss": 0.1363, - "step": 3525 - }, - { - "epoch": 1.0169961470598825, - "grad_norm": 0.27929551536804725, - "learning_rate": 1.145074683538433e-05, - "loss": 0.1281, - "step": 3530 - }, - { - "epoch": 1.0184364985056353, - "grad_norm": 0.2877130736841538, - "learning_rate": 1.1425863478919092e-05, - "loss": 0.1261, - "step": 3535 - }, - { - "epoch": 1.0198768499513882, - "grad_norm": 0.2845024477187945, - "learning_rate": 1.1400971107272685e-05, - "loss": 0.1394, - "step": 3540 - }, - { - "epoch": 1.021317201397141, - "grad_norm": 0.28240198605290473, - "learning_rate": 1.137606987782991e-05, - "loss": 0.1295, - "step": 3545 - }, - { - "epoch": 1.0227575528428936, - "grad_norm": 0.29550483906146596, - "learning_rate": 1.1351159948031572e-05, - "loss": 0.136, - "step": 3550 - }, - { - "epoch": 1.0241979042886464, - "grad_norm": 0.26147300326314377, - "learning_rate": 1.1326241475373483e-05, - "loss": 0.1347, - "step": 3555 - }, - { - "epoch": 1.0256382557343993, - "grad_norm": 0.2847825316184351, - "learning_rate": 1.1301314617405473e-05, - "loss": 0.1339, - "step": 3560 - }, - { - "epoch": 1.0270786071801519, - "grad_norm": 0.26880663609447486, - "learning_rate": 1.1276379531730386e-05, - "loss": 0.1293, - "step": 3565 - }, - { - "epoch": 1.0285189586259047, - "grad_norm": 0.30390938449587174, - "learning_rate": 1.1251436376003091e-05, - "loss": 0.1356, - "step": 3570 - }, - { - "epoch": 1.0299593100716575, - "grad_norm": 0.309678056558676, - "learning_rate": 1.122648530792947e-05, - "loss": 0.1306, - "step": 3575 - }, - { - "epoch": 1.0313996615174101, - "grad_norm": 0.2733158298216727, - "learning_rate": 1.1201526485265449e-05, - "loss": 0.1359, - "step": 3580 - }, - { - "epoch": 1.032840012963163, - "grad_norm": 0.2897804991588295, - "learning_rate": 1.1176560065815962e-05, - "loss": 0.1276, - "step": 3585 - }, - { - "epoch": 1.0342803644089158, - "grad_norm": 0.2871445202236724, - "learning_rate": 1.1151586207433993e-05, - "loss": 0.1336, - "step": 3590 - }, - { - "epoch": 1.0357207158546686, - "grad_norm": 0.304554126712303, - "learning_rate": 1.112660506801955e-05, - "loss": 0.124, - "step": 3595 - }, - { - "epoch": 1.0371610673004212, - "grad_norm": 0.28174097838281875, - "learning_rate": 1.1101616805518678e-05, - "loss": 0.1319, - "step": 3600 - }, - { - "epoch": 1.038601418746174, - "grad_norm": 0.28561395901703596, - "learning_rate": 1.1076621577922461e-05, - "loss": 0.1276, - "step": 3605 - }, - { - "epoch": 1.040041770191927, - "grad_norm": 0.2592443629276645, - "learning_rate": 1.1051619543266017e-05, - "loss": 0.131, - "step": 3610 - }, - { - "epoch": 1.0414821216376795, - "grad_norm": 0.29120003001124567, - "learning_rate": 1.1026610859627502e-05, - "loss": 0.1432, - "step": 3615 - }, - { - "epoch": 1.0429224730834323, - "grad_norm": 0.299864421583837, - "learning_rate": 1.1001595685127117e-05, - "loss": 0.1332, - "step": 3620 - }, - { - "epoch": 1.0443628245291852, - "grad_norm": 0.29923938520475035, - "learning_rate": 1.097657417792609e-05, - "loss": 0.1293, - "step": 3625 - }, - { - "epoch": 1.045803175974938, - "grad_norm": 0.2716661652311505, - "learning_rate": 1.0951546496225705e-05, - "loss": 0.1297, - "step": 3630 - }, - { - "epoch": 1.0472435274206906, - "grad_norm": 0.3202993091637731, - "learning_rate": 1.0926512798266273e-05, - "loss": 0.1316, - "step": 3635 - }, - { - "epoch": 1.0486838788664434, - "grad_norm": 0.2933354486531652, - "learning_rate": 1.0901473242326148e-05, - "loss": 0.1379, - "step": 3640 - }, - { - "epoch": 1.0501242303121963, - "grad_norm": 0.2827070705191196, - "learning_rate": 1.0876427986720715e-05, - "loss": 0.1373, - "step": 3645 - }, - { - "epoch": 1.0515645817579489, - "grad_norm": 0.3053591398371649, - "learning_rate": 1.0851377189801406e-05, - "loss": 0.1366, - "step": 3650 - }, - { - "epoch": 1.0530049332037017, - "grad_norm": 0.3254925917237289, - "learning_rate": 1.0826321009954683e-05, - "loss": 0.1421, - "step": 3655 - }, - { - "epoch": 1.0544452846494545, - "grad_norm": 0.2964349375907128, - "learning_rate": 1.0801259605601043e-05, - "loss": 0.143, - "step": 3660 - }, - { - "epoch": 1.0558856360952071, - "grad_norm": 0.31104349771963047, - "learning_rate": 1.077619313519401e-05, - "loss": 0.1372, - "step": 3665 - }, - { - "epoch": 1.05732598754096, - "grad_norm": 0.27745539952677795, - "learning_rate": 1.0751121757219154e-05, - "loss": 0.1405, - "step": 3670 - }, - { - "epoch": 1.0587663389867128, - "grad_norm": 0.2846782984236109, - "learning_rate": 1.0726045630193057e-05, - "loss": 0.1301, - "step": 3675 - }, - { - "epoch": 1.0602066904324656, - "grad_norm": 0.3088319093039246, - "learning_rate": 1.070096491266233e-05, - "loss": 0.1356, - "step": 3680 - }, - { - "epoch": 1.0616470418782182, - "grad_norm": 0.2682708073426634, - "learning_rate": 1.0675879763202623e-05, - "loss": 0.1317, - "step": 3685 - }, - { - "epoch": 1.063087393323971, - "grad_norm": 0.2671845851053607, - "learning_rate": 1.0650790340417592e-05, - "loss": 0.1337, - "step": 3690 - }, - { - "epoch": 1.064527744769724, - "grad_norm": 0.2778331787589554, - "learning_rate": 1.0625696802937911e-05, - "loss": 0.1377, - "step": 3695 - }, - { - "epoch": 1.0659680962154765, - "grad_norm": 0.30753300441114967, - "learning_rate": 1.0600599309420279e-05, - "loss": 0.1374, - "step": 3700 - }, - { - "epoch": 1.0674084476612293, - "grad_norm": 0.2775830751816375, - "learning_rate": 1.0575498018546407e-05, - "loss": 0.1307, - "step": 3705 - }, - { - "epoch": 1.0688487991069822, - "grad_norm": 0.32475842682289247, - "learning_rate": 1.0550393089022001e-05, - "loss": 0.1284, - "step": 3710 - }, - { - "epoch": 1.0702891505527348, - "grad_norm": 0.2927503470442227, - "learning_rate": 1.052528467957579e-05, - "loss": 0.1255, - "step": 3715 - }, - { - "epoch": 1.0717295019984876, - "grad_norm": 0.2918443033872336, - "learning_rate": 1.0500172948958502e-05, - "loss": 0.1296, - "step": 3720 - }, - { - "epoch": 1.0731698534442404, - "grad_norm": 0.2733071927339924, - "learning_rate": 1.0475058055941856e-05, - "loss": 0.1354, - "step": 3725 - }, - { - "epoch": 1.0746102048899933, - "grad_norm": 0.32091404766902804, - "learning_rate": 1.0449940159317564e-05, - "loss": 0.1403, - "step": 3730 - }, - { - "epoch": 1.0760505563357459, - "grad_norm": 0.27309247291871086, - "learning_rate": 1.042481941789634e-05, - "loss": 0.1299, - "step": 3735 - }, - { - "epoch": 1.0774909077814987, - "grad_norm": 0.291429030269511, - "learning_rate": 1.0399695990506877e-05, - "loss": 0.1349, - "step": 3740 - }, - { - "epoch": 1.0789312592272515, - "grad_norm": 0.2794660700760565, - "learning_rate": 1.0374570035994855e-05, - "loss": 0.1267, - "step": 3745 - }, - { - "epoch": 1.0803716106730041, - "grad_norm": 0.2915027456228963, - "learning_rate": 1.0349441713221923e-05, - "loss": 0.1332, - "step": 3750 - }, - { - "epoch": 1.081811962118757, - "grad_norm": 0.30229082944109603, - "learning_rate": 1.0324311181064714e-05, - "loss": 0.1284, - "step": 3755 - }, - { - "epoch": 1.0832523135645098, - "grad_norm": 0.28544390636563377, - "learning_rate": 1.0299178598413828e-05, - "loss": 0.1395, - "step": 3760 - }, - { - "epoch": 1.0846926650102624, - "grad_norm": 0.3197660163913866, - "learning_rate": 1.0274044124172817e-05, - "loss": 0.1296, - "step": 3765 - }, - { - "epoch": 1.0861330164560152, - "grad_norm": 0.28278629655789606, - "learning_rate": 1.0248907917257213e-05, - "loss": 0.1238, - "step": 3770 - }, - { - "epoch": 1.087573367901768, - "grad_norm": 0.2980589878724013, - "learning_rate": 1.022377013659349e-05, - "loss": 0.1383, - "step": 3775 - }, - { - "epoch": 1.089013719347521, - "grad_norm": 0.29273436019807486, - "learning_rate": 1.0198630941118075e-05, - "loss": 0.1299, - "step": 3780 - }, - { - "epoch": 1.0904540707932735, - "grad_norm": 0.2611335142282758, - "learning_rate": 1.0173490489776337e-05, - "loss": 0.1194, - "step": 3785 - }, - { - "epoch": 1.0918944222390263, - "grad_norm": 0.27401911211382207, - "learning_rate": 1.0148348941521596e-05, - "loss": 0.1283, - "step": 3790 - }, - { - "epoch": 1.0933347736847792, - "grad_norm": 0.27569212910222046, - "learning_rate": 1.012320645531409e-05, - "loss": 0.1357, - "step": 3795 - }, - { - "epoch": 1.0947751251305318, - "grad_norm": 0.31773489006970385, - "learning_rate": 1.0098063190120009e-05, - "loss": 0.1388, - "step": 3800 - }, - { - "epoch": 1.0962154765762846, - "grad_norm": 0.27529632570187174, - "learning_rate": 1.0072919304910446e-05, - "loss": 0.1348, - "step": 3805 - }, - { - "epoch": 1.0976558280220374, - "grad_norm": 0.32498147518056825, - "learning_rate": 1.0047774958660432e-05, - "loss": 0.1272, - "step": 3810 - }, - { - "epoch": 1.0990961794677903, - "grad_norm": 0.2774029992998093, - "learning_rate": 1.0022630310347905e-05, - "loss": 0.1235, - "step": 3815 - }, - { - "epoch": 1.1005365309135429, - "grad_norm": 0.2614075435164931, - "learning_rate": 9.99748551895271e-06, - "loss": 0.1296, - "step": 3820 - }, - { - "epoch": 1.1019768823592957, - "grad_norm": 0.3087699168742396, - "learning_rate": 9.972340743455606e-06, - "loss": 0.1298, - "step": 3825 - }, - { - "epoch": 1.1034172338050485, - "grad_norm": 0.26898843049721016, - "learning_rate": 9.947196142837237e-06, - "loss": 0.1261, - "step": 3830 - }, - { - "epoch": 1.1048575852508011, - "grad_norm": 0.2857369504598532, - "learning_rate": 9.922051876077157e-06, - "loss": 0.1373, - "step": 3835 - }, - { - "epoch": 1.106297936696554, - "grad_norm": 0.27007478399517304, - "learning_rate": 9.8969081021528e-06, - "loss": 0.1244, - "step": 3840 - }, - { - "epoch": 1.1077382881423068, - "grad_norm": 0.28803665926227207, - "learning_rate": 9.871764980038491e-06, - "loss": 0.133, - "step": 3845 - }, - { - "epoch": 1.1091786395880594, - "grad_norm": 0.29489016085930986, - "learning_rate": 9.846622668704421e-06, - "loss": 0.1342, - "step": 3850 - }, - { - "epoch": 1.1106189910338122, - "grad_norm": 0.30476944622519747, - "learning_rate": 9.821481327115665e-06, - "loss": 0.1288, - "step": 3855 - }, - { - "epoch": 1.112059342479565, - "grad_norm": 0.3092718795766665, - "learning_rate": 9.796341114231168e-06, - "loss": 0.1246, - "step": 3860 - }, - { - "epoch": 1.1134996939253177, - "grad_norm": 0.28805538998371455, - "learning_rate": 9.771202189002732e-06, - "loss": 0.1303, - "step": 3865 - }, - { - "epoch": 1.1149400453710705, - "grad_norm": 0.26965101242879297, - "learning_rate": 9.74606471037402e-06, - "loss": 0.1251, - "step": 3870 - }, - { - "epoch": 1.1163803968168233, - "grad_norm": 0.2524930013344495, - "learning_rate": 9.720928837279555e-06, - "loss": 0.1331, - "step": 3875 - }, - { - "epoch": 1.1178207482625762, - "grad_norm": 0.27639450425848394, - "learning_rate": 9.6957947286437e-06, - "loss": 0.1326, - "step": 3880 - }, - { - "epoch": 1.1192610997083288, - "grad_norm": 0.2950953833829538, - "learning_rate": 9.67066254337966e-06, - "loss": 0.1396, - "step": 3885 - }, - { - "epoch": 1.1207014511540816, - "grad_norm": 0.3332689077004191, - "learning_rate": 9.645532440388491e-06, - "loss": 0.1395, - "step": 3890 - }, - { - "epoch": 1.1221418025998344, - "grad_norm": 0.3045207707948191, - "learning_rate": 9.620404578558078e-06, - "loss": 0.1332, - "step": 3895 - }, - { - "epoch": 1.123582154045587, - "grad_norm": 0.27543129852179515, - "learning_rate": 9.59527911676213e-06, - "loss": 0.1312, - "step": 3900 - }, - { - "epoch": 1.1250225054913399, - "grad_norm": 0.322786857182541, - "learning_rate": 9.570156213859188e-06, - "loss": 0.1363, - "step": 3905 - }, - { - "epoch": 1.1264628569370927, - "grad_norm": 0.3332907289414307, - "learning_rate": 9.545036028691618e-06, - "loss": 0.1452, - "step": 3910 - }, - { - "epoch": 1.1279032083828455, - "grad_norm": 0.2964745281324652, - "learning_rate": 9.519918720084595e-06, - "loss": 0.1255, - "step": 3915 - }, - { - "epoch": 1.1293435598285981, - "grad_norm": 0.29380526786178957, - "learning_rate": 9.494804446845105e-06, - "loss": 0.1353, - "step": 3920 - }, - { - "epoch": 1.130783911274351, - "grad_norm": 0.3124984711938421, - "learning_rate": 9.46969336776095e-06, - "loss": 0.1194, - "step": 3925 - }, - { - "epoch": 1.1322242627201038, - "grad_norm": 0.2784062232835093, - "learning_rate": 9.444585641599736e-06, - "loss": 0.1267, - "step": 3930 - }, - { - "epoch": 1.1336646141658564, - "grad_norm": 0.2960281672476851, - "learning_rate": 9.41948142710786e-06, - "loss": 0.1288, - "step": 3935 - }, - { - "epoch": 1.1351049656116092, - "grad_norm": 0.3319737170484304, - "learning_rate": 9.394380883009528e-06, - "loss": 0.134, - "step": 3940 - }, - { - "epoch": 1.136545317057362, - "grad_norm": 0.2676639926944894, - "learning_rate": 9.369284168005739e-06, - "loss": 0.1302, - "step": 3945 - }, - { - "epoch": 1.1379856685031147, - "grad_norm": 0.29304917330014446, - "learning_rate": 9.344191440773269e-06, - "loss": 0.1398, - "step": 3950 - }, - { - "epoch": 1.1394260199488675, - "grad_norm": 0.2834441331194175, - "learning_rate": 9.3191028599637e-06, - "loss": 0.1366, - "step": 3955 - }, - { - "epoch": 1.1408663713946203, - "grad_norm": 0.2875227981738716, - "learning_rate": 9.294018584202378e-06, - "loss": 0.1275, - "step": 3960 - }, - { - "epoch": 1.142306722840373, - "grad_norm": 0.295392566822095, - "learning_rate": 9.268938772087444e-06, - "loss": 0.134, - "step": 3965 - }, - { - "epoch": 1.1437470742861258, - "grad_norm": 0.3031248184368795, - "learning_rate": 9.24386358218881e-06, - "loss": 0.1328, - "step": 3970 - }, - { - "epoch": 1.1451874257318786, - "grad_norm": 0.281546642383125, - "learning_rate": 9.218793173047167e-06, - "loss": 0.126, - "step": 3975 - }, - { - "epoch": 1.1466277771776314, - "grad_norm": 0.2820813141995356, - "learning_rate": 9.19372770317298e-06, - "loss": 0.1232, - "step": 3980 - }, - { - "epoch": 1.148068128623384, - "grad_norm": 0.2733287557965743, - "learning_rate": 9.168667331045482e-06, - "loss": 0.1356, - "step": 3985 - }, - { - "epoch": 1.1495084800691369, - "grad_norm": 0.3190121141129414, - "learning_rate": 9.143612215111679e-06, - "loss": 0.1453, - "step": 3990 - }, - { - "epoch": 1.1509488315148897, - "grad_norm": 0.3111192035405633, - "learning_rate": 9.118562513785334e-06, - "loss": 0.1425, - "step": 3995 - }, - { - "epoch": 1.1523891829606423, - "grad_norm": 0.3137082396200987, - "learning_rate": 9.093518385445988e-06, - "loss": 0.1377, - "step": 4000 - }, - { - "epoch": 1.1523891829606423, - "eval_loss": 0.14474257826805115, - "eval_runtime": 185.5439, - "eval_samples_per_second": 9.723, - "eval_steps_per_second": 2.431, - "step": 4000 - }, - { - "epoch": 1.1538295344063951, - "grad_norm": 0.28258671327645435, - "learning_rate": 9.06847998843794e-06, - "loss": 0.1334, - "step": 4005 - }, - { - "epoch": 1.155269885852148, - "grad_norm": 0.29926251581651486, - "learning_rate": 9.04344748106925e-06, - "loss": 0.1341, - "step": 4010 - }, - { - "epoch": 1.1567102372979008, - "grad_norm": 0.2704908892813702, - "learning_rate": 9.018421021610747e-06, - "loss": 0.1328, - "step": 4015 - }, - { - "epoch": 1.1581505887436534, - "grad_norm": 0.32713139197229957, - "learning_rate": 8.993400768295014e-06, - "loss": 0.1308, - "step": 4020 - }, - { - "epoch": 1.1595909401894062, - "grad_norm": 0.2766299542206814, - "learning_rate": 8.968386879315404e-06, - "loss": 0.1248, - "step": 4025 - }, - { - "epoch": 1.161031291635159, - "grad_norm": 0.28383009419843636, - "learning_rate": 8.94337951282502e-06, - "loss": 0.1305, - "step": 4030 - }, - { - "epoch": 1.1624716430809117, - "grad_norm": 0.29371071990613795, - "learning_rate": 8.918378826935731e-06, - "loss": 0.1382, - "step": 4035 - }, - { - "epoch": 1.1639119945266645, - "grad_norm": 0.3150456872535722, - "learning_rate": 8.893384979717165e-06, - "loss": 0.1359, - "step": 4040 - }, - { - "epoch": 1.1653523459724173, - "grad_norm": 0.27842595651311847, - "learning_rate": 8.86839812919572e-06, - "loss": 0.1325, - "step": 4045 - }, - { - "epoch": 1.16679269741817, - "grad_norm": 0.28767571922945206, - "learning_rate": 8.843418433353548e-06, - "loss": 0.129, - "step": 4050 - }, - { - "epoch": 1.1682330488639228, - "grad_norm": 0.29442793959753094, - "learning_rate": 8.818446050127565e-06, - "loss": 0.132, - "step": 4055 - }, - { - "epoch": 1.1696734003096756, - "grad_norm": 0.28879468366985844, - "learning_rate": 8.793481137408457e-06, - "loss": 0.1303, - "step": 4060 - }, - { - "epoch": 1.1711137517554282, - "grad_norm": 0.27722520036794807, - "learning_rate": 8.768523853039675e-06, - "loss": 0.1242, - "step": 4065 - }, - { - "epoch": 1.172554103201181, - "grad_norm": 0.2870500423966342, - "learning_rate": 8.743574354816433e-06, - "loss": 0.1273, - "step": 4070 - }, - { - "epoch": 1.1739944546469339, - "grad_norm": 0.29838754312619825, - "learning_rate": 8.718632800484725e-06, - "loss": 0.134, - "step": 4075 - }, - { - "epoch": 1.1754348060926867, - "grad_norm": 0.2792449091382121, - "learning_rate": 8.693699347740315e-06, - "loss": 0.1232, - "step": 4080 - }, - { - "epoch": 1.1768751575384393, - "grad_norm": 0.27360997864384, - "learning_rate": 8.668774154227745e-06, - "loss": 0.1262, - "step": 4085 - }, - { - "epoch": 1.1783155089841921, - "grad_norm": 0.311899339054919, - "learning_rate": 8.643857377539333e-06, - "loss": 0.1405, - "step": 4090 - }, - { - "epoch": 1.179755860429945, - "grad_norm": 0.2949013259318118, - "learning_rate": 8.618949175214187e-06, - "loss": 0.1318, - "step": 4095 - }, - { - "epoch": 1.1811962118756978, - "grad_norm": 0.27956199010868105, - "learning_rate": 8.594049704737199e-06, - "loss": 0.134, - "step": 4100 - }, - { - "epoch": 1.1826365633214504, - "grad_norm": 0.29596036435161904, - "learning_rate": 8.569159123538053e-06, - "loss": 0.1273, - "step": 4105 - }, - { - "epoch": 1.1840769147672032, - "grad_norm": 0.3191390201358693, - "learning_rate": 8.544277588990226e-06, - "loss": 0.1335, - "step": 4110 - }, - { - "epoch": 1.185517266212956, - "grad_norm": 0.28318994572027134, - "learning_rate": 8.519405258410007e-06, - "loss": 0.1308, - "step": 4115 - }, - { - "epoch": 1.1869576176587087, - "grad_norm": 0.27647236193756786, - "learning_rate": 8.49454228905548e-06, - "loss": 0.1338, - "step": 4120 - }, - { - "epoch": 1.1883979691044615, - "grad_norm": 0.2885923108928197, - "learning_rate": 8.469688838125549e-06, - "loss": 0.1274, - "step": 4125 - }, - { - "epoch": 1.1898383205502143, - "grad_norm": 0.27744666931948775, - "learning_rate": 8.444845062758937e-06, - "loss": 0.1313, - "step": 4130 - }, - { - "epoch": 1.191278671995967, - "grad_norm": 0.30589668436211354, - "learning_rate": 8.420011120033185e-06, - "loss": 0.1246, - "step": 4135 - }, - { - "epoch": 1.1927190234417198, - "grad_norm": 0.3233342541589803, - "learning_rate": 8.395187166963677e-06, - "loss": 0.1382, - "step": 4140 - }, - { - "epoch": 1.1941593748874726, - "grad_norm": 0.3011225199281378, - "learning_rate": 8.370373360502621e-06, - "loss": 0.1364, - "step": 4145 - }, - { - "epoch": 1.1955997263332252, - "grad_norm": 0.2699583457721809, - "learning_rate": 8.345569857538089e-06, - "loss": 0.1301, - "step": 4150 - }, - { - "epoch": 1.197040077778978, - "grad_norm": 0.29161496467052717, - "learning_rate": 8.320776814892996e-06, - "loss": 0.1337, - "step": 4155 - }, - { - "epoch": 1.1984804292247309, - "grad_norm": 0.2989838376182308, - "learning_rate": 8.295994389324125e-06, - "loss": 0.1329, - "step": 4160 - }, - { - "epoch": 1.1999207806704837, - "grad_norm": 0.2987527432419416, - "learning_rate": 8.271222737521135e-06, - "loss": 0.1343, - "step": 4165 - }, - { - "epoch": 1.2013611321162363, - "grad_norm": 0.3073154082480416, - "learning_rate": 8.246462016105561e-06, - "loss": 0.1294, - "step": 4170 - }, - { - "epoch": 1.2028014835619891, - "grad_norm": 0.2918080099388789, - "learning_rate": 8.221712381629824e-06, - "loss": 0.1253, - "step": 4175 - }, - { - "epoch": 1.204241835007742, - "grad_norm": 0.2941283674055012, - "learning_rate": 8.196973990576259e-06, - "loss": 0.1375, - "step": 4180 - }, - { - "epoch": 1.2056821864534946, - "grad_norm": 0.3010648090781424, - "learning_rate": 8.172246999356109e-06, - "loss": 0.1291, - "step": 4185 - }, - { - "epoch": 1.2071225378992474, - "grad_norm": 0.3112057276433019, - "learning_rate": 8.147531564308534e-06, - "loss": 0.1316, - "step": 4190 - }, - { - "epoch": 1.2085628893450002, - "grad_norm": 0.3229481450208124, - "learning_rate": 8.122827841699638e-06, - "loss": 0.1429, - "step": 4195 - }, - { - "epoch": 1.210003240790753, - "grad_norm": 0.2858141473020997, - "learning_rate": 8.09813598772147e-06, - "loss": 0.1281, - "step": 4200 - }, - { - "epoch": 1.2114435922365057, - "grad_norm": 0.2829245856140391, - "learning_rate": 8.07345615849103e-06, - "loss": 0.1267, - "step": 4205 - }, - { - "epoch": 1.2128839436822585, - "grad_norm": 0.3112793144040732, - "learning_rate": 8.0487885100493e-06, - "loss": 0.1262, - "step": 4210 - }, - { - "epoch": 1.2143242951280113, - "grad_norm": 0.28202317022242396, - "learning_rate": 8.02413319836024e-06, - "loss": 0.1329, - "step": 4215 - }, - { - "epoch": 1.215764646573764, - "grad_norm": 0.25601822143551783, - "learning_rate": 7.999490379309815e-06, - "loss": 0.1272, - "step": 4220 - }, - { - "epoch": 1.2172049980195168, - "grad_norm": 0.28229846356369415, - "learning_rate": 7.974860208705003e-06, - "loss": 0.1334, - "step": 4225 - }, - { - "epoch": 1.2186453494652696, - "grad_norm": 0.29081914492163424, - "learning_rate": 7.950242842272805e-06, - "loss": 0.1311, - "step": 4230 - }, - { - "epoch": 1.2200857009110222, - "grad_norm": 0.29767304550969986, - "learning_rate": 7.92563843565928e-06, - "loss": 0.1303, - "step": 4235 - }, - { - "epoch": 1.221526052356775, - "grad_norm": 0.279147638085936, - "learning_rate": 7.90104714442853e-06, - "loss": 0.132, - "step": 4240 - }, - { - "epoch": 1.2229664038025279, - "grad_norm": 0.2949168996509113, - "learning_rate": 7.876469124061748e-06, - "loss": 0.1268, - "step": 4245 - }, - { - "epoch": 1.2244067552482805, - "grad_norm": 0.2620835993688827, - "learning_rate": 7.851904529956207e-06, - "loss": 0.1316, - "step": 4250 - }, - { - "epoch": 1.2258471066940333, - "grad_norm": 0.28815330072715767, - "learning_rate": 7.827353517424303e-06, - "loss": 0.1319, - "step": 4255 - }, - { - "epoch": 1.2272874581397861, - "grad_norm": 0.27770301658683055, - "learning_rate": 7.802816241692554e-06, - "loss": 0.1279, - "step": 4260 - }, - { - "epoch": 1.228727809585539, - "grad_norm": 0.2754011061921349, - "learning_rate": 7.778292857900627e-06, - "loss": 0.1336, - "step": 4265 - }, - { - "epoch": 1.2301681610312916, - "grad_norm": 0.306991072991392, - "learning_rate": 7.753783521100362e-06, - "loss": 0.1389, - "step": 4270 - }, - { - "epoch": 1.2316085124770444, - "grad_norm": 0.28053697792633275, - "learning_rate": 7.72928838625477e-06, - "loss": 0.1275, - "step": 4275 - }, - { - "epoch": 1.2330488639227972, - "grad_norm": 0.30305612446281116, - "learning_rate": 7.704807608237089e-06, - "loss": 0.1295, - "step": 4280 - }, - { - "epoch": 1.2344892153685498, - "grad_norm": 0.2864190950078543, - "learning_rate": 7.680341341829765e-06, - "loss": 0.1331, - "step": 4285 - }, - { - "epoch": 1.2359295668143027, - "grad_norm": 0.3084452021982498, - "learning_rate": 7.655889741723503e-06, - "loss": 0.1291, - "step": 4290 - }, - { - "epoch": 1.2373699182600555, - "grad_norm": 0.3005817324491606, - "learning_rate": 7.631452962516278e-06, - "loss": 0.1356, - "step": 4295 - }, - { - "epoch": 1.2388102697058083, - "grad_norm": 0.2917648957325195, - "learning_rate": 7.6070311587123555e-06, - "loss": 0.1297, - "step": 4300 - }, - { - "epoch": 1.240250621151561, - "grad_norm": 0.30176130506840737, - "learning_rate": 7.5826244847213234e-06, - "loss": 0.1265, - "step": 4305 - }, - { - "epoch": 1.2416909725973138, - "grad_norm": 0.2526507461560052, - "learning_rate": 7.558233094857101e-06, - "loss": 0.1278, - "step": 4310 - }, - { - "epoch": 1.2431313240430666, - "grad_norm": 0.28625993806129957, - "learning_rate": 7.533857143336976e-06, - "loss": 0.1238, - "step": 4315 - }, - { - "epoch": 1.2445716754888192, - "grad_norm": 0.3146356292248194, - "learning_rate": 7.50949678428063e-06, - "loss": 0.137, - "step": 4320 - }, - { - "epoch": 1.246012026934572, - "grad_norm": 0.31206200245191695, - "learning_rate": 7.485152171709151e-06, - "loss": 0.1319, - "step": 4325 - }, - { - "epoch": 1.2474523783803249, - "grad_norm": 0.2860758480463773, - "learning_rate": 7.460823459544072e-06, - "loss": 0.1386, - "step": 4330 - }, - { - "epoch": 1.2488927298260775, - "grad_norm": 0.2771762439164134, - "learning_rate": 7.4365108016063955e-06, - "loss": 0.1264, - "step": 4335 - }, - { - "epoch": 1.2503330812718303, - "grad_norm": 0.30029450340343883, - "learning_rate": 7.4122143516156185e-06, - "loss": 0.1538, - "step": 4340 - }, - { - "epoch": 1.2517734327175831, - "grad_norm": 0.30376043235244393, - "learning_rate": 7.38793426318876e-06, - "loss": 0.1309, - "step": 4345 - }, - { - "epoch": 1.2532137841633357, - "grad_norm": 0.2848974573813015, - "learning_rate": 7.363670689839392e-06, - "loss": 0.1271, - "step": 4350 - }, - { - "epoch": 1.2546541356090886, - "grad_norm": 0.3274964514141692, - "learning_rate": 7.339423784976672e-06, - "loss": 0.1347, - "step": 4355 - }, - { - "epoch": 1.2560944870548414, - "grad_norm": 0.2808970287278979, - "learning_rate": 7.315193701904361e-06, - "loss": 0.1338, - "step": 4360 - }, - { - "epoch": 1.2575348385005942, - "grad_norm": 0.26647645243293355, - "learning_rate": 7.290980593819866e-06, - "loss": 0.1206, - "step": 4365 - }, - { - "epoch": 1.2589751899463468, - "grad_norm": 0.299200085436933, - "learning_rate": 7.266784613813268e-06, - "loss": 0.1282, - "step": 4370 - }, - { - "epoch": 1.2604155413920997, - "grad_norm": 0.2769069945827787, - "learning_rate": 7.24260591486636e-06, - "loss": 0.1356, - "step": 4375 - }, - { - "epoch": 1.2618558928378525, - "grad_norm": 0.27637014074328664, - "learning_rate": 7.218444649851661e-06, - "loss": 0.1359, - "step": 4380 - }, - { - "epoch": 1.2632962442836053, - "grad_norm": 0.31996200647691136, - "learning_rate": 7.194300971531473e-06, - "loss": 0.14, - "step": 4385 - }, - { - "epoch": 1.264736595729358, - "grad_norm": 0.31623231666179996, - "learning_rate": 7.170175032556902e-06, - "loss": 0.1283, - "step": 4390 - }, - { - "epoch": 1.2661769471751108, - "grad_norm": 0.30271632929723064, - "learning_rate": 7.146066985466889e-06, - "loss": 0.1285, - "step": 4395 - }, - { - "epoch": 1.2676172986208636, - "grad_norm": 0.2737967446190047, - "learning_rate": 7.121976982687253e-06, - "loss": 0.1271, - "step": 4400 - }, - { - "epoch": 1.2690576500666162, - "grad_norm": 0.27977893252007474, - "learning_rate": 7.097905176529734e-06, - "loss": 0.1258, - "step": 4405 - }, - { - "epoch": 1.270498001512369, - "grad_norm": 0.30126818356616486, - "learning_rate": 7.073851719191014e-06, - "loss": 0.131, - "step": 4410 - }, - { - "epoch": 1.2719383529581219, - "grad_norm": 0.299507259922042, - "learning_rate": 7.049816762751762e-06, - "loss": 0.1308, - "step": 4415 - }, - { - "epoch": 1.2733787044038745, - "grad_norm": 0.2649336740662209, - "learning_rate": 7.02580045917568e-06, - "loss": 0.13, - "step": 4420 - }, - { - "epoch": 1.2748190558496273, - "grad_norm": 0.2564044244734405, - "learning_rate": 7.001802960308534e-06, - "loss": 0.1211, - "step": 4425 - }, - { - "epoch": 1.2762594072953801, - "grad_norm": 0.2977518133910194, - "learning_rate": 6.977824417877183e-06, - "loss": 0.1347, - "step": 4430 - }, - { - "epoch": 1.2776997587411327, - "grad_norm": 0.37240677712373393, - "learning_rate": 6.953864983488646e-06, - "loss": 0.1356, - "step": 4435 - }, - { - "epoch": 1.2791401101868856, - "grad_norm": 0.2965189745890561, - "learning_rate": 6.929924808629122e-06, - "loss": 0.1265, - "step": 4440 - }, - { - "epoch": 1.2805804616326384, - "grad_norm": 0.2752308024638383, - "learning_rate": 6.906004044663046e-06, - "loss": 0.1251, - "step": 4445 - }, - { - "epoch": 1.282020813078391, - "grad_norm": 0.30571802447619534, - "learning_rate": 6.882102842832115e-06, - "loss": 0.1332, - "step": 4450 - }, - { - "epoch": 1.2834611645241438, - "grad_norm": 0.2842226334713274, - "learning_rate": 6.858221354254352e-06, - "loss": 0.1363, - "step": 4455 - }, - { - "epoch": 1.2849015159698967, - "grad_norm": 0.2979873943753214, - "learning_rate": 6.834359729923138e-06, - "loss": 0.1298, - "step": 4460 - }, - { - "epoch": 1.2863418674156495, - "grad_norm": 0.30630908938424195, - "learning_rate": 6.81051812070626e-06, - "loss": 0.1351, - "step": 4465 - }, - { - "epoch": 1.2877822188614023, - "grad_norm": 0.28492055381385706, - "learning_rate": 6.786696677344949e-06, - "loss": 0.1208, - "step": 4470 - }, - { - "epoch": 1.289222570307155, - "grad_norm": 0.28384613081056187, - "learning_rate": 6.762895550452948e-06, - "loss": 0.1395, - "step": 4475 - }, - { - "epoch": 1.2906629217529078, - "grad_norm": 0.324228616513587, - "learning_rate": 6.739114890515542e-06, - "loss": 0.1378, - "step": 4480 - }, - { - "epoch": 1.2921032731986606, - "grad_norm": 0.2853156077237628, - "learning_rate": 6.715354847888607e-06, - "loss": 0.1226, - "step": 4485 - }, - { - "epoch": 1.2935436246444132, - "grad_norm": 0.31375254697639315, - "learning_rate": 6.691615572797672e-06, - "loss": 0.1392, - "step": 4490 - }, - { - "epoch": 1.294983976090166, - "grad_norm": 0.2883946445076774, - "learning_rate": 6.667897215336954e-06, - "loss": 0.1217, - "step": 4495 - }, - { - "epoch": 1.2964243275359189, - "grad_norm": 0.2895452595912213, - "learning_rate": 6.64419992546842e-06, - "loss": 0.1331, - "step": 4500 - }, - { - "epoch": 1.2964243275359189, - "eval_loss": 0.14379242062568665, - "eval_runtime": 184.0705, - "eval_samples_per_second": 9.801, - "eval_steps_per_second": 2.45, - "step": 4500 - }, - { - "epoch": 1.2978646789816715, - "grad_norm": 0.28665306269625207, - "learning_rate": 6.620523853020828e-06, - "loss": 0.1293, - "step": 4505 - }, - { - "epoch": 1.2993050304274243, - "grad_norm": 0.3087468378172715, - "learning_rate": 6.596869147688796e-06, - "loss": 0.1359, - "step": 4510 - }, - { - "epoch": 1.3007453818731771, - "grad_norm": 0.31040503221184323, - "learning_rate": 6.5732359590318405e-06, - "loss": 0.1289, - "step": 4515 - }, - { - "epoch": 1.3021857333189297, - "grad_norm": 0.2614034619917938, - "learning_rate": 6.549624436473437e-06, - "loss": 0.1317, - "step": 4520 - }, - { - "epoch": 1.3036260847646826, - "grad_norm": 0.29279408085324476, - "learning_rate": 6.526034729300077e-06, - "loss": 0.1302, - "step": 4525 - }, - { - "epoch": 1.3050664362104354, - "grad_norm": 0.27391338017046146, - "learning_rate": 6.502466986660318e-06, - "loss": 0.1237, - "step": 4530 - }, - { - "epoch": 1.306506787656188, - "grad_norm": 0.2577932055235253, - "learning_rate": 6.478921357563852e-06, - "loss": 0.133, - "step": 4535 - }, - { - "epoch": 1.3079471391019408, - "grad_norm": 0.29115643412884196, - "learning_rate": 6.4553979908805405e-06, - "loss": 0.1264, - "step": 4540 - }, - { - "epoch": 1.3093874905476937, - "grad_norm": 0.2805429545326651, - "learning_rate": 6.4318970353395015e-06, - "loss": 0.126, - "step": 4545 - }, - { - "epoch": 1.3108278419934463, - "grad_norm": 0.28048701005423443, - "learning_rate": 6.408418639528155e-06, - "loss": 0.1304, - "step": 4550 - }, - { - "epoch": 1.312268193439199, - "grad_norm": 0.31036490760140995, - "learning_rate": 6.38496295189128e-06, - "loss": 0.1299, - "step": 4555 - }, - { - "epoch": 1.313708544884952, - "grad_norm": 0.27781555493965154, - "learning_rate": 6.361530120730084e-06, - "loss": 0.1283, - "step": 4560 - }, - { - "epoch": 1.3151488963307048, - "grad_norm": 0.28470922038550206, - "learning_rate": 6.338120294201257e-06, - "loss": 0.1273, - "step": 4565 - }, - { - "epoch": 1.3165892477764576, - "grad_norm": 0.2812452734554949, - "learning_rate": 6.314733620316047e-06, - "loss": 0.1225, - "step": 4570 - }, - { - "epoch": 1.3180295992222102, - "grad_norm": 0.29152015278801036, - "learning_rate": 6.291370246939312e-06, - "loss": 0.132, - "step": 4575 - }, - { - "epoch": 1.319469950667963, - "grad_norm": 0.28443383104443637, - "learning_rate": 6.268030321788589e-06, - "loss": 0.1293, - "step": 4580 - }, - { - "epoch": 1.3209103021137159, - "grad_norm": 0.2959906683847514, - "learning_rate": 6.244713992433164e-06, - "loss": 0.1335, - "step": 4585 - }, - { - "epoch": 1.3223506535594685, - "grad_norm": 0.3040944482039468, - "learning_rate": 6.221421406293131e-06, - "loss": 0.1273, - "step": 4590 - }, - { - "epoch": 1.3237910050052213, - "grad_norm": 0.30882724239677245, - "learning_rate": 6.1981527106384765e-06, - "loss": 0.1191, - "step": 4595 - }, - { - "epoch": 1.3252313564509741, - "grad_norm": 0.302671709123605, - "learning_rate": 6.17490805258812e-06, - "loss": 0.1365, - "step": 4600 - }, - { - "epoch": 1.3266717078967267, - "grad_norm": 0.29010412797273544, - "learning_rate": 6.151687579109015e-06, - "loss": 0.1402, - "step": 4605 - }, - { - "epoch": 1.3281120593424796, - "grad_norm": 0.2638211025663234, - "learning_rate": 6.128491437015202e-06, - "loss": 0.122, - "step": 4610 - }, - { - "epoch": 1.3295524107882324, - "grad_norm": 0.3317741888903263, - "learning_rate": 6.1053197729668745e-06, - "loss": 0.1234, - "step": 4615 - }, - { - "epoch": 1.330992762233985, - "grad_norm": 0.305170078612422, - "learning_rate": 6.082172733469469e-06, - "loss": 0.1316, - "step": 4620 - }, - { - "epoch": 1.3324331136797378, - "grad_norm": 0.28817033083195814, - "learning_rate": 6.059050464872731e-06, - "loss": 0.1366, - "step": 4625 - }, - { - "epoch": 1.3338734651254907, - "grad_norm": 0.2953828272280745, - "learning_rate": 6.03595311336979e-06, - "loss": 0.1272, - "step": 4630 - }, - { - "epoch": 1.3353138165712433, - "grad_norm": 0.2830751051303079, - "learning_rate": 6.0128808249962255e-06, - "loss": 0.1404, - "step": 4635 - }, - { - "epoch": 1.336754168016996, - "grad_norm": 0.3252605933332986, - "learning_rate": 5.989833745629163e-06, - "loss": 0.129, - "step": 4640 - }, - { - "epoch": 1.338194519462749, - "grad_norm": 0.30213564618018374, - "learning_rate": 5.966812020986341e-06, - "loss": 0.13, - "step": 4645 - }, - { - "epoch": 1.3396348709085018, - "grad_norm": 0.2883204719629323, - "learning_rate": 5.943815796625179e-06, - "loss": 0.1253, - "step": 4650 - }, - { - "epoch": 1.3410752223542544, - "grad_norm": 0.2959117376319711, - "learning_rate": 5.920845217941874e-06, - "loss": 0.1327, - "step": 4655 - }, - { - "epoch": 1.3425155738000072, - "grad_norm": 0.31232438887774006, - "learning_rate": 5.8979004301704814e-06, - "loss": 0.1383, - "step": 4660 - }, - { - "epoch": 1.34395592524576, - "grad_norm": 0.29382396748754, - "learning_rate": 5.874981578381985e-06, - "loss": 0.133, - "step": 4665 - }, - { - "epoch": 1.3453962766915128, - "grad_norm": 0.30443621369119456, - "learning_rate": 5.852088807483385e-06, - "loss": 0.1303, - "step": 4670 - }, - { - "epoch": 1.3468366281372655, - "grad_norm": 0.2807950559344455, - "learning_rate": 5.829222262216783e-06, - "loss": 0.1327, - "step": 4675 - }, - { - "epoch": 1.3482769795830183, - "grad_norm": 0.28883011845316237, - "learning_rate": 5.80638208715847e-06, - "loss": 0.1321, - "step": 4680 - }, - { - "epoch": 1.3497173310287711, - "grad_norm": 0.2822159218958781, - "learning_rate": 5.783568426718001e-06, - "loss": 0.1243, - "step": 4685 - }, - { - "epoch": 1.3511576824745237, - "grad_norm": 0.2856294468316457, - "learning_rate": 5.76078142513729e-06, - "loss": 0.1264, - "step": 4690 - }, - { - "epoch": 1.3525980339202766, - "grad_norm": 0.2833850663671519, - "learning_rate": 5.738021226489711e-06, - "loss": 0.1249, - "step": 4695 - }, - { - "epoch": 1.3540383853660294, - "grad_norm": 0.3098314509041874, - "learning_rate": 5.715287974679156e-06, - "loss": 0.1317, - "step": 4700 - }, - { - "epoch": 1.355478736811782, - "grad_norm": 0.2509107454497361, - "learning_rate": 5.692581813439147e-06, - "loss": 0.1207, - "step": 4705 - }, - { - "epoch": 1.3569190882575348, - "grad_norm": 0.288399827316461, - "learning_rate": 5.669902886331935e-06, - "loss": 0.1313, - "step": 4710 - }, - { - "epoch": 1.3583594397032877, - "grad_norm": 0.2899923984835422, - "learning_rate": 5.647251336747565e-06, - "loss": 0.1353, - "step": 4715 - }, - { - "epoch": 1.3597997911490403, - "grad_norm": 0.30442118246180094, - "learning_rate": 5.62462730790299e-06, - "loss": 0.1242, - "step": 4720 - }, - { - "epoch": 1.361240142594793, - "grad_norm": 0.3136171534592957, - "learning_rate": 5.602030942841161e-06, - "loss": 0.1249, - "step": 4725 - }, - { - "epoch": 1.362680494040546, - "grad_norm": 0.29860304044620467, - "learning_rate": 5.579462384430123e-06, - "loss": 0.1283, - "step": 4730 - }, - { - "epoch": 1.3641208454862985, - "grad_norm": 0.31046611461157403, - "learning_rate": 5.556921775362101e-06, - "loss": 0.135, - "step": 4735 - }, - { - "epoch": 1.3655611969320514, - "grad_norm": 0.28704908174570976, - "learning_rate": 5.5344092581526246e-06, - "loss": 0.1387, - "step": 4740 - }, - { - "epoch": 1.3670015483778042, - "grad_norm": 0.3341532812497149, - "learning_rate": 5.5119249751395955e-06, - "loss": 0.143, - "step": 4745 - }, - { - "epoch": 1.368441899823557, - "grad_norm": 0.29001862112776317, - "learning_rate": 5.489469068482399e-06, - "loss": 0.1461, - "step": 4750 - }, - { - "epoch": 1.3698822512693098, - "grad_norm": 0.2834403752479179, - "learning_rate": 5.467041680161029e-06, - "loss": 0.1233, - "step": 4755 - }, - { - "epoch": 1.3713226027150625, - "grad_norm": 0.33143013156488327, - "learning_rate": 5.444642951975137e-06, - "loss": 0.1272, - "step": 4760 - }, - { - "epoch": 1.3727629541608153, - "grad_norm": 0.28777376307497, - "learning_rate": 5.422273025543197e-06, - "loss": 0.1289, - "step": 4765 - }, - { - "epoch": 1.3742033056065681, - "grad_norm": 0.28509358403097473, - "learning_rate": 5.399932042301565e-06, - "loss": 0.1359, - "step": 4770 - }, - { - "epoch": 1.3756436570523207, - "grad_norm": 0.31777606236187494, - "learning_rate": 5.377620143503598e-06, - "loss": 0.1364, - "step": 4775 - }, - { - "epoch": 1.3770840084980736, - "grad_norm": 0.2892592873947313, - "learning_rate": 5.355337470218778e-06, - "loss": 0.1325, - "step": 4780 - }, - { - "epoch": 1.3785243599438264, - "grad_norm": 0.27422137870653046, - "learning_rate": 5.333084163331794e-06, - "loss": 0.1298, - "step": 4785 - }, - { - "epoch": 1.379964711389579, - "grad_norm": 0.3146053844222587, - "learning_rate": 5.3108603635416654e-06, - "loss": 0.1434, - "step": 4790 - }, - { - "epoch": 1.3814050628353318, - "grad_norm": 0.27997320753339555, - "learning_rate": 5.288666211360848e-06, - "loss": 0.1342, - "step": 4795 - }, - { - "epoch": 1.3828454142810847, - "grad_norm": 0.2984800479383985, - "learning_rate": 5.266501847114349e-06, - "loss": 0.1346, - "step": 4800 - }, - { - "epoch": 1.3842857657268373, - "grad_norm": 0.29201935536901064, - "learning_rate": 5.2443674109388355e-06, - "loss": 0.1264, - "step": 4805 - }, - { - "epoch": 1.38572611717259, - "grad_norm": 0.27719720949263027, - "learning_rate": 5.222263042781761e-06, - "loss": 0.1301, - "step": 4810 - }, - { - "epoch": 1.387166468618343, - "grad_norm": 0.3042597202236721, - "learning_rate": 5.200188882400458e-06, - "loss": 0.1291, - "step": 4815 - }, - { - "epoch": 1.3886068200640955, - "grad_norm": 0.30302490372496094, - "learning_rate": 5.178145069361269e-06, - "loss": 0.1387, - "step": 4820 - }, - { - "epoch": 1.3900471715098484, - "grad_norm": 0.28176863882814285, - "learning_rate": 5.156131743038672e-06, - "loss": 0.1292, - "step": 4825 - }, - { - "epoch": 1.3914875229556012, - "grad_norm": 0.3058547991836209, - "learning_rate": 5.134149042614381e-06, - "loss": 0.1376, - "step": 4830 - }, - { - "epoch": 1.3929278744013538, - "grad_norm": 0.30393509196007185, - "learning_rate": 5.112197107076473e-06, - "loss": 0.1355, - "step": 4835 - }, - { - "epoch": 1.3943682258471066, - "grad_norm": 0.26425351590901147, - "learning_rate": 5.090276075218516e-06, - "loss": 0.1253, - "step": 4840 - }, - { - "epoch": 1.3958085772928595, - "grad_norm": 0.2789170911602346, - "learning_rate": 5.0683860856386805e-06, - "loss": 0.1226, - "step": 4845 - }, - { - "epoch": 1.3972489287386123, - "grad_norm": 0.3084946604950202, - "learning_rate": 5.046527276738869e-06, - "loss": 0.1403, - "step": 4850 - }, - { - "epoch": 1.3986892801843651, - "grad_norm": 0.2817833925106797, - "learning_rate": 5.02469978672385e-06, - "loss": 0.1286, - "step": 4855 - }, - { - "epoch": 1.4001296316301177, - "grad_norm": 0.31976795477023545, - "learning_rate": 5.002903753600368e-06, - "loss": 0.1388, - "step": 4860 - }, - { - "epoch": 1.4015699830758706, - "grad_norm": 0.31706703406909087, - "learning_rate": 4.981139315176272e-06, - "loss": 0.1256, - "step": 4865 - }, - { - "epoch": 1.4030103345216234, - "grad_norm": 0.29670136774417283, - "learning_rate": 4.959406609059661e-06, - "loss": 0.1305, - "step": 4870 - }, - { - "epoch": 1.404450685967376, - "grad_norm": 0.3093837514114273, - "learning_rate": 4.937705772657992e-06, - "loss": 0.1279, - "step": 4875 - }, - { - "epoch": 1.4058910374131288, - "grad_norm": 0.26356143472517385, - "learning_rate": 4.916036943177235e-06, - "loss": 0.1349, - "step": 4880 - }, - { - "epoch": 1.4073313888588816, - "grad_norm": 0.3339315239470743, - "learning_rate": 4.894400257620982e-06, - "loss": 0.1361, - "step": 4885 - }, - { - "epoch": 1.4087717403046343, - "grad_norm": 0.32122936240066957, - "learning_rate": 4.872795852789592e-06, - "loss": 0.1306, - "step": 4890 - }, - { - "epoch": 1.410212091750387, - "grad_norm": 0.28913451492903725, - "learning_rate": 4.851223865279336e-06, - "loss": 0.1356, - "step": 4895 - }, - { - "epoch": 1.41165244319614, - "grad_norm": 0.30914198884665406, - "learning_rate": 4.829684431481516e-06, - "loss": 0.1299, - "step": 4900 - }, - { - "epoch": 1.4130927946418925, - "grad_norm": 0.2733782589344944, - "learning_rate": 4.8081776875815966e-06, - "loss": 0.1301, - "step": 4905 - }, - { - "epoch": 1.4145331460876454, - "grad_norm": 0.3485745526900565, - "learning_rate": 4.786703769558382e-06, - "loss": 0.1253, - "step": 4910 - }, - { - "epoch": 1.4159734975333982, - "grad_norm": 0.311015844030605, - "learning_rate": 4.765262813183112e-06, - "loss": 0.1243, - "step": 4915 - }, - { - "epoch": 1.4174138489791508, - "grad_norm": 0.2642965069547113, - "learning_rate": 4.743854954018628e-06, - "loss": 0.1195, - "step": 4920 - }, - { - "epoch": 1.4188542004249036, - "grad_norm": 0.272979546582442, - "learning_rate": 4.7224803274185185e-06, - "loss": 0.1212, - "step": 4925 - }, - { - "epoch": 1.4202945518706565, - "grad_norm": 0.3038792772484129, - "learning_rate": 4.701139068526243e-06, - "loss": 0.1338, - "step": 4930 - }, - { - "epoch": 1.4217349033164093, - "grad_norm": 0.2767357670064067, - "learning_rate": 4.679831312274298e-06, - "loss": 0.1255, - "step": 4935 - }, - { - "epoch": 1.423175254762162, - "grad_norm": 0.27710201889151737, - "learning_rate": 4.658557193383352e-06, - "loss": 0.1357, - "step": 4940 - }, - { - "epoch": 1.4246156062079147, - "grad_norm": 0.3096121537912325, - "learning_rate": 4.637316846361395e-06, - "loss": 0.1293, - "step": 4945 - }, - { - "epoch": 1.4260559576536675, - "grad_norm": 0.29924550507004927, - "learning_rate": 4.616110405502903e-06, - "loss": 0.1367, - "step": 4950 - }, - { - "epoch": 1.4274963090994204, - "grad_norm": 0.3084219881809167, - "learning_rate": 4.594938004887963e-06, - "loss": 0.1268, - "step": 4955 - }, - { - "epoch": 1.428936660545173, - "grad_norm": 0.30194233813871496, - "learning_rate": 4.57379977838144e-06, - "loss": 0.1386, - "step": 4960 - }, - { - "epoch": 1.4303770119909258, - "grad_norm": 0.2924906618475438, - "learning_rate": 4.5526958596321415e-06, - "loss": 0.1326, - "step": 4965 - }, - { - "epoch": 1.4318173634366786, - "grad_norm": 0.2928117826425519, - "learning_rate": 4.531626382071947e-06, - "loss": 0.1337, - "step": 4970 - }, - { - "epoch": 1.4332577148824313, - "grad_norm": 0.2890472449363505, - "learning_rate": 4.510591478914984e-06, - "loss": 0.1326, - "step": 4975 - }, - { - "epoch": 1.434698066328184, - "grad_norm": 0.2816438145391565, - "learning_rate": 4.489591283156778e-06, - "loss": 0.1298, - "step": 4980 - }, - { - "epoch": 1.436138417773937, - "grad_norm": 0.2632230619903958, - "learning_rate": 4.468625927573411e-06, - "loss": 0.1263, - "step": 4985 - }, - { - "epoch": 1.4375787692196895, - "grad_norm": 0.29380703917060536, - "learning_rate": 4.447695544720685e-06, - "loss": 0.1385, - "step": 4990 - }, - { - "epoch": 1.4390191206654424, - "grad_norm": 0.28185514155481595, - "learning_rate": 4.426800266933291e-06, - "loss": 0.1306, - "step": 4995 - }, - { - "epoch": 1.4404594721111952, - "grad_norm": 0.2874696989018966, - "learning_rate": 4.405940226323953e-06, - "loss": 0.1311, - "step": 5000 - }, - { - "epoch": 1.4404594721111952, - "eval_loss": 0.14263266324996948, - "eval_runtime": 182.4281, - "eval_samples_per_second": 9.889, - "eval_steps_per_second": 2.472, - "step": 5000 - }, - { - "epoch": 1.4418998235569478, - "grad_norm": 0.29035247275746917, - "learning_rate": 4.385115554782608e-06, - "loss": 0.1284, - "step": 5005 - }, - { - "epoch": 1.4433401750027006, - "grad_norm": 0.2979091755169656, - "learning_rate": 4.364326383975576e-06, - "loss": 0.1393, - "step": 5010 - }, - { - "epoch": 1.4447805264484535, - "grad_norm": 0.2851008473192101, - "learning_rate": 4.343572845344699e-06, - "loss": 0.1302, - "step": 5015 - }, - { - "epoch": 1.446220877894206, - "grad_norm": 0.29510410689584077, - "learning_rate": 4.3228550701065555e-06, - "loss": 0.1295, - "step": 5020 - }, - { - "epoch": 1.4476612293399589, - "grad_norm": 0.27703300197438524, - "learning_rate": 4.302173189251592e-06, - "loss": 0.1277, - "step": 5025 - }, - { - "epoch": 1.4491015807857117, - "grad_norm": 0.2762512635985658, - "learning_rate": 4.281527333543304e-06, - "loss": 0.135, - "step": 5030 - }, - { - "epoch": 1.4505419322314645, - "grad_norm": 0.3041165339805363, - "learning_rate": 4.260917633517432e-06, - "loss": 0.1243, - "step": 5035 - }, - { - "epoch": 1.4519822836772174, - "grad_norm": 0.3060868972240773, - "learning_rate": 4.2403442194811015e-06, - "loss": 0.1298, - "step": 5040 - }, - { - "epoch": 1.45342263512297, - "grad_norm": 0.2893090691141548, - "learning_rate": 4.2198072215120234e-06, - "loss": 0.1364, - "step": 5045 - }, - { - "epoch": 1.4548629865687228, - "grad_norm": 0.31038762857518265, - "learning_rate": 4.1993067694576604e-06, - "loss": 0.1319, - "step": 5050 - }, - { - "epoch": 1.4563033380144756, - "grad_norm": 0.2812918503365721, - "learning_rate": 4.178842992934412e-06, - "loss": 0.1262, - "step": 5055 - }, - { - "epoch": 1.4577436894602283, - "grad_norm": 0.2850284783778579, - "learning_rate": 4.158416021326787e-06, - "loss": 0.1296, - "step": 5060 - }, - { - "epoch": 1.459184040905981, - "grad_norm": 0.32065754223579734, - "learning_rate": 4.138025983786606e-06, - "loss": 0.1328, - "step": 5065 - }, - { - "epoch": 1.460624392351734, - "grad_norm": 0.34520060614106257, - "learning_rate": 4.117673009232155e-06, - "loss": 0.1323, - "step": 5070 - }, - { - "epoch": 1.4620647437974865, - "grad_norm": 0.28913376259082474, - "learning_rate": 4.097357226347385e-06, - "loss": 0.122, - "step": 5075 - }, - { - "epoch": 1.4635050952432394, - "grad_norm": 0.3380735998941071, - "learning_rate": 4.077078763581112e-06, - "loss": 0.1331, - "step": 5080 - }, - { - "epoch": 1.4649454466889922, - "grad_norm": 0.32045923135465526, - "learning_rate": 4.056837749146176e-06, - "loss": 0.13, - "step": 5085 - }, - { - "epoch": 1.4663857981347448, - "grad_norm": 0.3204047195205385, - "learning_rate": 4.036634311018657e-06, - "loss": 0.1271, - "step": 5090 - }, - { - "epoch": 1.4678261495804976, - "grad_norm": 0.3177103619589229, - "learning_rate": 4.016468576937048e-06, - "loss": 0.1313, - "step": 5095 - }, - { - "epoch": 1.4692665010262504, - "grad_norm": 0.3170687328843362, - "learning_rate": 3.996340674401452e-06, - "loss": 0.143, - "step": 5100 - }, - { - "epoch": 1.470706852472003, - "grad_norm": 0.31482701115246225, - "learning_rate": 3.976250730672789e-06, - "loss": 0.1267, - "step": 5105 - }, - { - "epoch": 1.4721472039177559, - "grad_norm": 0.29749419127247917, - "learning_rate": 3.95619887277197e-06, - "loss": 0.1355, - "step": 5110 - }, - { - "epoch": 1.4735875553635087, - "grad_norm": 0.2721513816837507, - "learning_rate": 3.936185227479104e-06, - "loss": 0.1262, - "step": 5115 - }, - { - "epoch": 1.4750279068092613, - "grad_norm": 0.28848818134175824, - "learning_rate": 3.91620992133271e-06, - "loss": 0.14, - "step": 5120 - }, - { - "epoch": 1.4764682582550142, - "grad_norm": 0.2878594948643242, - "learning_rate": 3.896273080628881e-06, - "loss": 0.1256, - "step": 5125 - }, - { - "epoch": 1.477908609700767, - "grad_norm": 0.295688557911149, - "learning_rate": 3.876374831420523e-06, - "loss": 0.1326, - "step": 5130 - }, - { - "epoch": 1.4793489611465198, - "grad_norm": 0.29623849761201715, - "learning_rate": 3.856515299516545e-06, - "loss": 0.132, - "step": 5135 - }, - { - "epoch": 1.4807893125922726, - "grad_norm": 0.30600381790452613, - "learning_rate": 3.8366946104810535e-06, - "loss": 0.1319, - "step": 5140 - }, - { - "epoch": 1.4822296640380253, - "grad_norm": 0.2829908397887632, - "learning_rate": 3.816912889632567e-06, - "loss": 0.1304, - "step": 5145 - }, - { - "epoch": 1.483670015483778, - "grad_norm": 0.2859310181485027, - "learning_rate": 3.7971702620432306e-06, - "loss": 0.1291, - "step": 5150 - }, - { - "epoch": 1.485110366929531, - "grad_norm": 0.2903900767146654, - "learning_rate": 3.777466852538012e-06, - "loss": 0.1269, - "step": 5155 - }, - { - "epoch": 1.4865507183752835, - "grad_norm": 0.30693062898146356, - "learning_rate": 3.757802785693919e-06, - "loss": 0.1227, - "step": 5160 - }, - { - "epoch": 1.4879910698210363, - "grad_norm": 0.2920336924699934, - "learning_rate": 3.738178185839212e-06, - "loss": 0.1298, - "step": 5165 - }, - { - "epoch": 1.4894314212667892, - "grad_norm": 0.2945995741156779, - "learning_rate": 3.718593177052611e-06, - "loss": 0.1296, - "step": 5170 - }, - { - "epoch": 1.4908717727125418, - "grad_norm": 0.281519581967037, - "learning_rate": 3.699047883162531e-06, - "loss": 0.1348, - "step": 5175 - }, - { - "epoch": 1.4923121241582946, - "grad_norm": 0.2942529713423248, - "learning_rate": 3.679542427746272e-06, - "loss": 0.1285, - "step": 5180 - }, - { - "epoch": 1.4937524756040474, - "grad_norm": 0.2590754218015783, - "learning_rate": 3.660076934129253e-06, - "loss": 0.1198, - "step": 5185 - }, - { - "epoch": 1.4951928270498, - "grad_norm": 0.3110962189352441, - "learning_rate": 3.6406515253842433e-06, - "loss": 0.1329, - "step": 5190 - }, - { - "epoch": 1.4966331784955529, - "grad_norm": 0.3012107480072597, - "learning_rate": 3.621266324330548e-06, - "loss": 0.1269, - "step": 5195 - }, - { - "epoch": 1.4980735299413057, - "grad_norm": 0.27989842275949034, - "learning_rate": 3.601921453533269e-06, - "loss": 0.1264, - "step": 5200 - }, - { - "epoch": 1.4995138813870583, - "grad_norm": 0.27816202646485955, - "learning_rate": 3.582617035302519e-06, - "loss": 0.1353, - "step": 5205 - }, - { - "epoch": 1.5009542328328112, - "grad_norm": 0.28449873805759873, - "learning_rate": 3.5633531916926355e-06, - "loss": 0.1401, - "step": 5210 - }, - { - "epoch": 1.502394584278564, - "grad_norm": 0.2939328466462327, - "learning_rate": 3.5441300445014204e-06, - "loss": 0.1309, - "step": 5215 - }, - { - "epoch": 1.5038349357243166, - "grad_norm": 0.27676106704556663, - "learning_rate": 3.5249477152693746e-06, - "loss": 0.1255, - "step": 5220 - }, - { - "epoch": 1.5052752871700696, - "grad_norm": 0.3091437198894899, - "learning_rate": 3.5058063252789164e-06, - "loss": 0.1337, - "step": 5225 - }, - { - "epoch": 1.5067156386158223, - "grad_norm": 0.29089963493957577, - "learning_rate": 3.486705995553623e-06, - "loss": 0.1179, - "step": 5230 - }, - { - "epoch": 1.5081559900615749, - "grad_norm": 0.28563278658009045, - "learning_rate": 3.467646846857462e-06, - "loss": 0.1324, - "step": 5235 - }, - { - "epoch": 1.509596341507328, - "grad_norm": 0.27308010285162093, - "learning_rate": 3.448628999694028e-06, - "loss": 0.131, - "step": 5240 - }, - { - "epoch": 1.5110366929530805, - "grad_norm": 0.2692168630506388, - "learning_rate": 3.4296525743057917e-06, - "loss": 0.1245, - "step": 5245 - }, - { - "epoch": 1.5124770443988333, - "grad_norm": 0.28589432120186975, - "learning_rate": 3.4107176906733186e-06, - "loss": 0.1395, - "step": 5250 - }, - { - "epoch": 1.5139173958445862, - "grad_norm": 0.3230293467940769, - "learning_rate": 3.3918244685145273e-06, - "loss": 0.1239, - "step": 5255 - }, - { - "epoch": 1.5153577472903388, - "grad_norm": 0.2724298060757966, - "learning_rate": 3.3729730272839236e-06, - "loss": 0.1243, - "step": 5260 - }, - { - "epoch": 1.5167980987360916, - "grad_norm": 0.2875130873464737, - "learning_rate": 3.3541634861718586e-06, - "loss": 0.1267, - "step": 5265 - }, - { - "epoch": 1.5182384501818444, - "grad_norm": 0.2545525280708859, - "learning_rate": 3.335395964103746e-06, - "loss": 0.1257, - "step": 5270 - }, - { - "epoch": 1.519678801627597, - "grad_norm": 0.28758741388989995, - "learning_rate": 3.3166705797393505e-06, - "loss": 0.1218, - "step": 5275 - }, - { - "epoch": 1.5211191530733499, - "grad_norm": 0.28405695915470375, - "learning_rate": 3.2979874514720044e-06, - "loss": 0.1304, - "step": 5280 - }, - { - "epoch": 1.5225595045191027, - "grad_norm": 0.30359930663025747, - "learning_rate": 3.2793466974278698e-06, - "loss": 0.1312, - "step": 5285 - }, - { - "epoch": 1.5239998559648553, - "grad_norm": 0.2739393344439081, - "learning_rate": 3.2607484354652053e-06, - "loss": 0.1266, - "step": 5290 - }, - { - "epoch": 1.5254402074106082, - "grad_norm": 0.29944033968813727, - "learning_rate": 3.2421927831735946e-06, - "loss": 0.1272, - "step": 5295 - }, - { - "epoch": 1.526880558856361, - "grad_norm": 0.2954139006807455, - "learning_rate": 3.2236798578732243e-06, - "loss": 0.124, - "step": 5300 - }, - { - "epoch": 1.5283209103021136, - "grad_norm": 0.29210672998811726, - "learning_rate": 3.2052097766141333e-06, - "loss": 0.1318, - "step": 5305 - }, - { - "epoch": 1.5297612617478666, - "grad_norm": 0.2600604904032675, - "learning_rate": 3.1867826561754734e-06, - "loss": 0.1249, - "step": 5310 - }, - { - "epoch": 1.5312016131936192, - "grad_norm": 0.30502890550059597, - "learning_rate": 3.168398613064769e-06, - "loss": 0.1331, - "step": 5315 - }, - { - "epoch": 1.5326419646393719, - "grad_norm": 0.2927272597507059, - "learning_rate": 3.150057763517195e-06, - "loss": 0.1398, - "step": 5320 - }, - { - "epoch": 1.534082316085125, - "grad_norm": 0.27938831535091735, - "learning_rate": 3.1317602234948176e-06, - "loss": 0.1263, - "step": 5325 - }, - { - "epoch": 1.5355226675308775, - "grad_norm": 0.3173797555345694, - "learning_rate": 3.1135061086858744e-06, - "loss": 0.1331, - "step": 5330 - }, - { - "epoch": 1.5369630189766303, - "grad_norm": 0.28629187116885524, - "learning_rate": 3.0952955345040536e-06, - "loss": 0.1232, - "step": 5335 - }, - { - "epoch": 1.5384033704223832, - "grad_norm": 0.315475203948924, - "learning_rate": 3.0771286160877422e-06, - "loss": 0.1361, - "step": 5340 - }, - { - "epoch": 1.5398437218681358, - "grad_norm": 0.3090927948177035, - "learning_rate": 3.0590054682993107e-06, - "loss": 0.1329, - "step": 5345 - }, - { - "epoch": 1.5412840733138886, - "grad_norm": 0.2952878570632112, - "learning_rate": 3.0409262057243873e-06, - "loss": 0.1307, - "step": 5350 - }, - { - "epoch": 1.5427244247596414, - "grad_norm": 0.292962868707295, - "learning_rate": 3.022890942671126e-06, - "loss": 0.1223, - "step": 5355 - }, - { - "epoch": 1.544164776205394, - "grad_norm": 0.268435660253563, - "learning_rate": 3.004899793169499e-06, - "loss": 0.1219, - "step": 5360 - }, - { - "epoch": 1.5456051276511469, - "grad_norm": 0.2756367576477082, - "learning_rate": 2.986952870970555e-06, - "loss": 0.1326, - "step": 5365 - }, - { - "epoch": 1.5470454790968997, - "grad_norm": 0.28750683938474214, - "learning_rate": 2.969050289545714e-06, - "loss": 0.1314, - "step": 5370 - }, - { - "epoch": 1.5484858305426523, - "grad_norm": 0.27186607640486593, - "learning_rate": 2.9511921620860564e-06, - "loss": 0.1265, - "step": 5375 - }, - { - "epoch": 1.5499261819884051, - "grad_norm": 0.28628288268866514, - "learning_rate": 2.9333786015015785e-06, - "loss": 0.1355, - "step": 5380 - }, - { - "epoch": 1.551366533434158, - "grad_norm": 0.31905004510449597, - "learning_rate": 2.9156097204205067e-06, - "loss": 0.1322, - "step": 5385 - }, - { - "epoch": 1.5528068848799106, - "grad_norm": 0.26846823715130697, - "learning_rate": 2.897885631188585e-06, - "loss": 0.1318, - "step": 5390 - }, - { - "epoch": 1.5542472363256634, - "grad_norm": 0.2910887451967936, - "learning_rate": 2.8802064458683455e-06, - "loss": 0.1295, - "step": 5395 - }, - { - "epoch": 1.5556875877714162, - "grad_norm": 0.29876503104887947, - "learning_rate": 2.862572276238407e-06, - "loss": 0.1326, - "step": 5400 - }, - { - "epoch": 1.5571279392171689, - "grad_norm": 0.2747178203316531, - "learning_rate": 2.844983233792785e-06, - "loss": 0.1233, - "step": 5405 - }, - { - "epoch": 1.558568290662922, - "grad_norm": 0.2860909643835414, - "learning_rate": 2.827439429740164e-06, - "loss": 0.1256, - "step": 5410 - }, - { - "epoch": 1.5600086421086745, - "grad_norm": 0.2731315827900148, - "learning_rate": 2.8099409750032035e-06, - "loss": 0.131, - "step": 5415 - }, - { - "epoch": 1.5614489935544271, - "grad_norm": 0.28706730422047516, - "learning_rate": 2.7924879802178395e-06, - "loss": 0.1277, - "step": 5420 - }, - { - "epoch": 1.5628893450001802, - "grad_norm": 0.3052258135371807, - "learning_rate": 2.77508055573258e-06, - "loss": 0.1207, - "step": 5425 - }, - { - "epoch": 1.5643296964459328, - "grad_norm": 0.2851069995923877, - "learning_rate": 2.7577188116078148e-06, - "loss": 0.1299, - "step": 5430 - }, - { - "epoch": 1.5657700478916856, - "grad_norm": 0.2951839634863107, - "learning_rate": 2.74040285761511e-06, - "loss": 0.1304, - "step": 5435 - }, - { - "epoch": 1.5672103993374384, - "grad_norm": 0.26286219302398806, - "learning_rate": 2.723132803236517e-06, - "loss": 0.1235, - "step": 5440 - }, - { - "epoch": 1.568650750783191, - "grad_norm": 0.2979441766077297, - "learning_rate": 2.7059087576638876e-06, - "loss": 0.1256, - "step": 5445 - }, - { - "epoch": 1.5700911022289439, - "grad_norm": 0.3205557150437196, - "learning_rate": 2.6887308297981775e-06, - "loss": 0.1371, - "step": 5450 - }, - { - "epoch": 1.5715314536746967, - "grad_norm": 0.2721271632458455, - "learning_rate": 2.6715991282487454e-06, - "loss": 0.1332, - "step": 5455 - }, - { - "epoch": 1.5729718051204493, - "grad_norm": 0.27278176079279876, - "learning_rate": 2.6545137613326968e-06, - "loss": 0.1276, - "step": 5460 - }, - { - "epoch": 1.5744121565662021, - "grad_norm": 0.29700660323704126, - "learning_rate": 2.63747483707417e-06, - "loss": 0.1233, - "step": 5465 - }, - { - "epoch": 1.575852508011955, - "grad_norm": 0.2825088591042487, - "learning_rate": 2.620482463203665e-06, - "loss": 0.1384, - "step": 5470 - }, - { - "epoch": 1.5772928594577076, - "grad_norm": 0.29095414150571464, - "learning_rate": 2.6035367471573712e-06, - "loss": 0.1296, - "step": 5475 - }, - { - "epoch": 1.5787332109034604, - "grad_norm": 0.2663708625782063, - "learning_rate": 2.586637796076468e-06, - "loss": 0.1266, - "step": 5480 - }, - { - "epoch": 1.5801735623492132, - "grad_norm": 0.3045977656337426, - "learning_rate": 2.569785716806462e-06, - "loss": 0.1296, - "step": 5485 - }, - { - "epoch": 1.5816139137949659, - "grad_norm": 0.31566523598693275, - "learning_rate": 2.5529806158965065e-06, - "loss": 0.1342, - "step": 5490 - }, - { - "epoch": 1.5830542652407187, - "grad_norm": 0.30588183705349636, - "learning_rate": 2.5362225995987277e-06, - "loss": 0.1319, - "step": 5495 - }, - { - "epoch": 1.5844946166864715, - "grad_norm": 0.29519798602614045, - "learning_rate": 2.5195117738675625e-06, - "loss": 0.1321, - "step": 5500 - }, - { - "epoch": 1.5844946166864715, - "eval_loss": 0.14166609942913055, - "eval_runtime": 183.2221, - "eval_samples_per_second": 9.846, - "eval_steps_per_second": 2.461, - "step": 5500 - }, - { - "epoch": 1.5859349681322241, - "grad_norm": 0.2909461656220088, - "learning_rate": 2.502848244359071e-06, - "loss": 0.1286, - "step": 5505 - }, - { - "epoch": 1.5873753195779772, - "grad_norm": 0.2898578050354809, - "learning_rate": 2.486232116430275e-06, - "loss": 0.1342, - "step": 5510 - }, - { - "epoch": 1.5888156710237298, - "grad_norm": 0.3086150696182557, - "learning_rate": 2.469663495138509e-06, - "loss": 0.1295, - "step": 5515 - }, - { - "epoch": 1.5902560224694824, - "grad_norm": 0.30421519295681765, - "learning_rate": 2.4531424852407316e-06, - "loss": 0.1335, - "step": 5520 - }, - { - "epoch": 1.5916963739152354, - "grad_norm": 0.288178893935303, - "learning_rate": 2.436669191192864e-06, - "loss": 0.1272, - "step": 5525 - }, - { - "epoch": 1.593136725360988, - "grad_norm": 0.2860066401966028, - "learning_rate": 2.420243717149159e-06, - "loss": 0.1333, - "step": 5530 - }, - { - "epoch": 1.5945770768067409, - "grad_norm": 0.29160475999502244, - "learning_rate": 2.403866166961507e-06, - "loss": 0.1267, - "step": 5535 - }, - { - "epoch": 1.5960174282524937, - "grad_norm": 0.2755214718192039, - "learning_rate": 2.3875366441787984e-06, - "loss": 0.121, - "step": 5540 - }, - { - "epoch": 1.5974577796982463, - "grad_norm": 0.2779639622212633, - "learning_rate": 2.3712552520462683e-06, - "loss": 0.1269, - "step": 5545 - }, - { - "epoch": 1.5988981311439991, - "grad_norm": 0.26597538696786716, - "learning_rate": 2.3550220935048375e-06, - "loss": 0.1223, - "step": 5550 - }, - { - "epoch": 1.600338482589752, - "grad_norm": 0.2865145501913996, - "learning_rate": 2.338837271190464e-06, - "loss": 0.1266, - "step": 5555 - }, - { - "epoch": 1.6017788340355046, - "grad_norm": 0.27749201843913596, - "learning_rate": 2.3227008874334943e-06, - "loss": 0.1255, - "step": 5560 - }, - { - "epoch": 1.6032191854812574, - "grad_norm": 0.2772893573934755, - "learning_rate": 2.306613044258017e-06, - "loss": 0.1277, - "step": 5565 - }, - { - "epoch": 1.6046595369270102, - "grad_norm": 0.28121537003540437, - "learning_rate": 2.290573843381222e-06, - "loss": 0.1307, - "step": 5570 - }, - { - "epoch": 1.6060998883727629, - "grad_norm": 0.28006234124775475, - "learning_rate": 2.2745833862127466e-06, - "loss": 0.1265, - "step": 5575 - }, - { - "epoch": 1.6075402398185157, - "grad_norm": 0.28516117847488853, - "learning_rate": 2.258641773854041e-06, - "loss": 0.1279, - "step": 5580 - }, - { - "epoch": 1.6089805912642685, - "grad_norm": 0.29225443516823396, - "learning_rate": 2.242749107097736e-06, - "loss": 0.1198, - "step": 5585 - }, - { - "epoch": 1.6104209427100211, - "grad_norm": 0.29078063045804087, - "learning_rate": 2.226905486426989e-06, - "loss": 0.1254, - "step": 5590 - }, - { - "epoch": 1.6118612941557742, - "grad_norm": 0.3159257252559826, - "learning_rate": 2.2111110120148638e-06, - "loss": 0.1338, - "step": 5595 - }, - { - "epoch": 1.6133016456015268, - "grad_norm": 0.27166305277438857, - "learning_rate": 2.1953657837236887e-06, - "loss": 0.14, - "step": 5600 - }, - { - "epoch": 1.6147419970472794, - "grad_norm": 0.3188347122514508, - "learning_rate": 2.17966990110443e-06, - "loss": 0.1397, - "step": 5605 - }, - { - "epoch": 1.6161823484930324, - "grad_norm": 0.3145543667358082, - "learning_rate": 2.1640234633960544e-06, - "loss": 0.1295, - "step": 5610 - }, - { - "epoch": 1.617622699938785, - "grad_norm": 0.2893812485783845, - "learning_rate": 2.1484265695249205e-06, - "loss": 0.1224, - "step": 5615 - }, - { - "epoch": 1.6190630513845379, - "grad_norm": 0.3286526294166698, - "learning_rate": 2.1328793181041284e-06, - "loss": 0.129, - "step": 5620 - }, - { - "epoch": 1.6205034028302907, - "grad_norm": 0.30207325112357286, - "learning_rate": 2.11738180743291e-06, - "loss": 0.1291, - "step": 5625 - }, - { - "epoch": 1.6219437542760433, - "grad_norm": 0.3098575022621966, - "learning_rate": 2.101934135496018e-06, - "loss": 0.1395, - "step": 5630 - }, - { - "epoch": 1.6233841057217961, - "grad_norm": 0.30020432664082614, - "learning_rate": 2.0865363999630704e-06, - "loss": 0.1259, - "step": 5635 - }, - { - "epoch": 1.624824457167549, - "grad_norm": 0.2862022859486779, - "learning_rate": 2.0711886981879812e-06, - "loss": 0.1293, - "step": 5640 - }, - { - "epoch": 1.6262648086133016, - "grad_norm": 0.2869456622784833, - "learning_rate": 2.055891127208306e-06, - "loss": 0.1234, - "step": 5645 - }, - { - "epoch": 1.6277051600590544, - "grad_norm": 0.3025687824497439, - "learning_rate": 2.0406437837446446e-06, - "loss": 0.1317, - "step": 5650 - }, - { - "epoch": 1.6291455115048072, - "grad_norm": 0.31367247268097065, - "learning_rate": 2.025446764200034e-06, - "loss": 0.1197, - "step": 5655 - }, - { - "epoch": 1.6305858629505599, - "grad_norm": 0.3081066945029777, - "learning_rate": 2.0103001646593277e-06, - "loss": 0.1355, - "step": 5660 - }, - { - "epoch": 1.6320262143963127, - "grad_norm": 0.30372574948208725, - "learning_rate": 1.995204080888592e-06, - "loss": 0.1273, - "step": 5665 - }, - { - "epoch": 1.6334665658420655, - "grad_norm": 0.2773828728322217, - "learning_rate": 1.980158608334504e-06, - "loss": 0.1253, - "step": 5670 - }, - { - "epoch": 1.6349069172878181, - "grad_norm": 0.2685031281995709, - "learning_rate": 1.965163842123745e-06, - "loss": 0.1213, - "step": 5675 - }, - { - "epoch": 1.636347268733571, - "grad_norm": 0.2772769131129277, - "learning_rate": 1.950219877062397e-06, - "loss": 0.1311, - "step": 5680 - }, - { - "epoch": 1.6377876201793238, - "grad_norm": 0.27691142659726703, - "learning_rate": 1.935326807635355e-06, - "loss": 0.1234, - "step": 5685 - }, - { - "epoch": 1.6392279716250764, - "grad_norm": 0.27592948769412723, - "learning_rate": 1.9204847280057117e-06, - "loss": 0.1309, - "step": 5690 - }, - { - "epoch": 1.6406683230708294, - "grad_norm": 0.30816287352415167, - "learning_rate": 1.90569373201417e-06, - "loss": 0.1247, - "step": 5695 - }, - { - "epoch": 1.642108674516582, - "grad_norm": 0.2945379079952402, - "learning_rate": 1.8909539131784616e-06, - "loss": 0.1304, - "step": 5700 - }, - { - "epoch": 1.6435490259623347, - "grad_norm": 0.29639169392730347, - "learning_rate": 1.8762653646927354e-06, - "loss": 0.1305, - "step": 5705 - }, - { - "epoch": 1.6449893774080877, - "grad_norm": 0.3005355791166683, - "learning_rate": 1.8616281794269797e-06, - "loss": 0.1311, - "step": 5710 - }, - { - "epoch": 1.6464297288538403, - "grad_norm": 0.3066605889100896, - "learning_rate": 1.847042449926435e-06, - "loss": 0.1292, - "step": 5715 - }, - { - "epoch": 1.6478700802995931, - "grad_norm": 0.29468421114638166, - "learning_rate": 1.8325082684110017e-06, - "loss": 0.1339, - "step": 5720 - }, - { - "epoch": 1.649310431745346, - "grad_norm": 0.278781498205819, - "learning_rate": 1.8180257267746726e-06, - "loss": 0.1321, - "step": 5725 - }, - { - "epoch": 1.6507507831910986, - "grad_norm": 0.2948317736555441, - "learning_rate": 1.8035949165849332e-06, - "loss": 0.1288, - "step": 5730 - }, - { - "epoch": 1.6521911346368514, - "grad_norm": 0.2726210628368952, - "learning_rate": 1.7892159290821931e-06, - "loss": 0.1205, - "step": 5735 - }, - { - "epoch": 1.6536314860826042, - "grad_norm": 0.29140658348322845, - "learning_rate": 1.7748888551792077e-06, - "loss": 0.1367, - "step": 5740 - }, - { - "epoch": 1.6550718375283568, - "grad_norm": 0.28048412965835157, - "learning_rate": 1.760613785460501e-06, - "loss": 0.1263, - "step": 5745 - }, - { - "epoch": 1.6565121889741097, - "grad_norm": 0.28351884101089136, - "learning_rate": 1.7463908101817962e-06, - "loss": 0.1254, - "step": 5750 - }, - { - "epoch": 1.6579525404198625, - "grad_norm": 0.27922439198383014, - "learning_rate": 1.7322200192694471e-06, - "loss": 0.1182, - "step": 5755 - }, - { - "epoch": 1.6593928918656151, - "grad_norm": 0.2819390130377161, - "learning_rate": 1.718101502319861e-06, - "loss": 0.1268, - "step": 5760 - }, - { - "epoch": 1.660833243311368, - "grad_norm": 0.3055312304432241, - "learning_rate": 1.704035348598937e-06, - "loss": 0.1271, - "step": 5765 - }, - { - "epoch": 1.6622735947571208, - "grad_norm": 0.2634592274743398, - "learning_rate": 1.6900216470415076e-06, - "loss": 0.1215, - "step": 5770 - }, - { - "epoch": 1.6637139462028734, - "grad_norm": 0.30635125652537465, - "learning_rate": 1.6760604862507645e-06, - "loss": 0.1304, - "step": 5775 - }, - { - "epoch": 1.6651542976486262, - "grad_norm": 0.283548657450537, - "learning_rate": 1.6621519544977072e-06, - "loss": 0.1279, - "step": 5780 - }, - { - "epoch": 1.666594649094379, - "grad_norm": 0.29058908783134074, - "learning_rate": 1.648296139720581e-06, - "loss": 0.1298, - "step": 5785 - }, - { - "epoch": 1.6680350005401317, - "grad_norm": 0.28167987115880083, - "learning_rate": 1.634493129524325e-06, - "loss": 0.1239, - "step": 5790 - }, - { - "epoch": 1.6694753519858847, - "grad_norm": 0.2676116536078819, - "learning_rate": 1.6207430111800081e-06, - "loss": 0.1324, - "step": 5795 - }, - { - "epoch": 1.6709157034316373, - "grad_norm": 0.2954262440265811, - "learning_rate": 1.6070458716242977e-06, - "loss": 0.1274, - "step": 5800 - }, - { - "epoch": 1.67235605487739, - "grad_norm": 0.30524434113838955, - "learning_rate": 1.5934017974588845e-06, - "loss": 0.1332, - "step": 5805 - }, - { - "epoch": 1.673796406323143, - "grad_norm": 0.2723087735878449, - "learning_rate": 1.5798108749499542e-06, - "loss": 0.1309, - "step": 5810 - }, - { - "epoch": 1.6752367577688956, - "grad_norm": 0.276644502192923, - "learning_rate": 1.5662731900276307e-06, - "loss": 0.1288, - "step": 5815 - }, - { - "epoch": 1.6766771092146484, - "grad_norm": 0.2843596250126526, - "learning_rate": 1.5527888282854386e-06, - "loss": 0.1271, - "step": 5820 - }, - { - "epoch": 1.6781174606604012, - "grad_norm": 0.3050371869749407, - "learning_rate": 1.5393578749797667e-06, - "loss": 0.1277, - "step": 5825 - }, - { - "epoch": 1.6795578121061538, - "grad_norm": 0.30443412637747763, - "learning_rate": 1.5259804150293144e-06, - "loss": 0.1264, - "step": 5830 - }, - { - "epoch": 1.6809981635519067, - "grad_norm": 0.28129272818555606, - "learning_rate": 1.512656533014566e-06, - "loss": 0.131, - "step": 5835 - }, - { - "epoch": 1.6824385149976595, - "grad_norm": 0.2744057680465857, - "learning_rate": 1.499386313177258e-06, - "loss": 0.1326, - "step": 5840 - }, - { - "epoch": 1.6838788664434121, - "grad_norm": 0.31846251340497744, - "learning_rate": 1.4861698394198366e-06, - "loss": 0.1331, - "step": 5845 - }, - { - "epoch": 1.685319217889165, - "grad_norm": 0.30051590436367076, - "learning_rate": 1.473007195304934e-06, - "loss": 0.1263, - "step": 5850 - }, - { - "epoch": 1.6867595693349178, - "grad_norm": 0.2793350788441144, - "learning_rate": 1.4598984640548375e-06, - "loss": 0.1345, - "step": 5855 - }, - { - "epoch": 1.6881999207806704, - "grad_norm": 0.2707364156107711, - "learning_rate": 1.4468437285509652e-06, - "loss": 0.1284, - "step": 5860 - }, - { - "epoch": 1.6896402722264232, - "grad_norm": 0.314258875396424, - "learning_rate": 1.4338430713333397e-06, - "loss": 0.1282, - "step": 5865 - }, - { - "epoch": 1.691080623672176, - "grad_norm": 0.3003554107965295, - "learning_rate": 1.4208965746000725e-06, - "loss": 0.1321, - "step": 5870 - }, - { - "epoch": 1.6925209751179286, - "grad_norm": 0.2953755650564155, - "learning_rate": 1.408004320206835e-06, - "loss": 0.1287, - "step": 5875 - }, - { - "epoch": 1.6939613265636817, - "grad_norm": 0.26321096545532147, - "learning_rate": 1.3951663896663426e-06, - "loss": 0.1215, - "step": 5880 - }, - { - "epoch": 1.6954016780094343, - "grad_norm": 0.29649122509259834, - "learning_rate": 1.3823828641478532e-06, - "loss": 0.1288, - "step": 5885 - }, - { - "epoch": 1.696842029455187, - "grad_norm": 0.2990214517225168, - "learning_rate": 1.3696538244766256e-06, - "loss": 0.1279, - "step": 5890 - }, - { - "epoch": 1.69828238090094, - "grad_norm": 0.31449726804552053, - "learning_rate": 1.3569793511334416e-06, - "loss": 0.1412, - "step": 5895 - }, - { - "epoch": 1.6997227323466926, - "grad_norm": 0.31375582105120253, - "learning_rate": 1.3443595242540753e-06, - "loss": 0.1355, - "step": 5900 - }, - { - "epoch": 1.7011630837924454, - "grad_norm": 0.28341031790692495, - "learning_rate": 1.3317944236287882e-06, - "loss": 0.1214, - "step": 5905 - }, - { - "epoch": 1.7026034352381982, - "grad_norm": 0.2899396718777641, - "learning_rate": 1.3192841287018376e-06, - "loss": 0.1301, - "step": 5910 - }, - { - "epoch": 1.7040437866839508, - "grad_norm": 0.2735047037867141, - "learning_rate": 1.3068287185709584e-06, - "loss": 0.1287, - "step": 5915 - }, - { - "epoch": 1.7054841381297037, - "grad_norm": 0.2763878558281379, - "learning_rate": 1.2944282719868739e-06, - "loss": 0.1299, - "step": 5920 - }, - { - "epoch": 1.7069244895754565, - "grad_norm": 0.29905907907114176, - "learning_rate": 1.282082867352794e-06, - "loss": 0.1285, - "step": 5925 - }, - { - "epoch": 1.7083648410212091, - "grad_norm": 0.2768076492722104, - "learning_rate": 1.2697925827239166e-06, - "loss": 0.1301, - "step": 5930 - }, - { - "epoch": 1.709805192466962, - "grad_norm": 0.27653267521768804, - "learning_rate": 1.2575574958069392e-06, - "loss": 0.1212, - "step": 5935 - }, - { - "epoch": 1.7112455439127148, - "grad_norm": 0.2764018528709046, - "learning_rate": 1.24537768395957e-06, - "loss": 0.1233, - "step": 5940 - }, - { - "epoch": 1.7126858953584674, - "grad_norm": 0.28421347311440187, - "learning_rate": 1.2332532241900275e-06, - "loss": 0.133, - "step": 5945 - }, - { - "epoch": 1.7141262468042202, - "grad_norm": 0.2889271677627283, - "learning_rate": 1.2211841931565615e-06, - "loss": 0.1321, - "step": 5950 - }, - { - "epoch": 1.715566598249973, - "grad_norm": 0.3030014581222401, - "learning_rate": 1.2091706671669746e-06, - "loss": 0.1381, - "step": 5955 - }, - { - "epoch": 1.7170069496957256, - "grad_norm": 0.27428273725744723, - "learning_rate": 1.1972127221781238e-06, - "loss": 0.1285, - "step": 5960 - }, - { - "epoch": 1.7184473011414785, - "grad_norm": 0.3008427383898093, - "learning_rate": 1.1853104337954535e-06, - "loss": 0.1306, - "step": 5965 - }, - { - "epoch": 1.7198876525872313, - "grad_norm": 0.3229248441404968, - "learning_rate": 1.1734638772725104e-06, - "loss": 0.1334, - "step": 5970 - }, - { - "epoch": 1.721328004032984, - "grad_norm": 0.30334834034021635, - "learning_rate": 1.161673127510472e-06, - "loss": 0.1266, - "step": 5975 - }, - { - "epoch": 1.722768355478737, - "grad_norm": 0.2852528997722053, - "learning_rate": 1.1499382590576736e-06, - "loss": 0.1276, - "step": 5980 - }, - { - "epoch": 1.7242087069244896, - "grad_norm": 0.2895562296977625, - "learning_rate": 1.1382593461091308e-06, - "loss": 0.1319, - "step": 5985 - }, - { - "epoch": 1.7256490583702422, - "grad_norm": 0.2941343826737447, - "learning_rate": 1.1266364625060722e-06, - "loss": 0.1324, - "step": 5990 - }, - { - "epoch": 1.7270894098159952, - "grad_norm": 0.2735135243510112, - "learning_rate": 1.1150696817354867e-06, - "loss": 0.1318, - "step": 5995 - }, - { - "epoch": 1.7285297612617478, - "grad_norm": 0.2762792761084767, - "learning_rate": 1.1035590769296313e-06, - "loss": 0.1354, - "step": 6000 - }, - { - "epoch": 1.7285297612617478, - "eval_loss": 0.14099709689617157, - "eval_runtime": 186.76, - "eval_samples_per_second": 9.659, - "eval_steps_per_second": 2.415, - "step": 6000 - }, - { - "epoch": 1.7299701127075007, - "grad_norm": 0.2822759449386405, - "learning_rate": 1.09210472086559e-06, - "loss": 0.1262, - "step": 6005 - }, - { - "epoch": 1.7314104641532535, - "grad_norm": 0.2852954331322426, - "learning_rate": 1.080706685964814e-06, - "loss": 0.1291, - "step": 6010 - }, - { - "epoch": 1.7328508155990061, - "grad_norm": 0.283633303505763, - "learning_rate": 1.0693650442926496e-06, - "loss": 0.1299, - "step": 6015 - }, - { - "epoch": 1.734291167044759, - "grad_norm": 0.2893275038552144, - "learning_rate": 1.058079867557893e-06, - "loss": 0.1224, - "step": 6020 - }, - { - "epoch": 1.7357315184905118, - "grad_norm": 0.2929228007525251, - "learning_rate": 1.0468512271123376e-06, - "loss": 0.1212, - "step": 6025 - }, - { - "epoch": 1.7371718699362644, - "grad_norm": 0.276901291045506, - "learning_rate": 1.0356791939503164e-06, - "loss": 0.1277, - "step": 6030 - }, - { - "epoch": 1.7386122213820172, - "grad_norm": 0.3125904701701172, - "learning_rate": 1.0245638387082578e-06, - "loss": 0.1264, - "step": 6035 - }, - { - "epoch": 1.74005257282777, - "grad_norm": 0.2741364320107291, - "learning_rate": 1.0135052316642358e-06, - "loss": 0.1251, - "step": 6040 - }, - { - "epoch": 1.7414929242735226, - "grad_norm": 0.3041429091229379, - "learning_rate": 1.002503442737527e-06, - "loss": 0.1314, - "step": 6045 - }, - { - "epoch": 1.7429332757192755, - "grad_norm": 0.3070386341868555, - "learning_rate": 9.915585414881767e-07, - "loss": 0.1336, - "step": 6050 - }, - { - "epoch": 1.7443736271650283, - "grad_norm": 0.2861168629205116, - "learning_rate": 9.806705971165443e-07, - "loss": 0.1294, - "step": 6055 - }, - { - "epoch": 1.745813978610781, - "grad_norm": 0.28061574721986277, - "learning_rate": 9.698396784628704e-07, - "loss": 0.1249, - "step": 6060 - }, - { - "epoch": 1.7472543300565337, - "grad_norm": 0.32122825311739844, - "learning_rate": 9.590658540068564e-07, - "loss": 0.1275, - "step": 6065 - }, - { - "epoch": 1.7486946815022866, - "grad_norm": 0.3039376380929849, - "learning_rate": 9.48349191867205e-07, - "loss": 0.1318, - "step": 6070 - }, - { - "epoch": 1.7501350329480392, - "grad_norm": 0.27031359733052707, - "learning_rate": 9.376897598012102e-07, - "loss": 0.1296, - "step": 6075 - }, - { - "epoch": 1.7515753843937922, - "grad_norm": 0.27603252423746855, - "learning_rate": 9.270876252043249e-07, - "loss": 0.1255, - "step": 6080 - }, - { - "epoch": 1.7530157358395448, - "grad_norm": 0.28168370555571337, - "learning_rate": 9.165428551097288e-07, - "loss": 0.1227, - "step": 6085 - }, - { - "epoch": 1.7544560872852974, - "grad_norm": 0.29637741640626303, - "learning_rate": 9.060555161879069e-07, - "loss": 0.1345, - "step": 6090 - }, - { - "epoch": 1.7558964387310505, - "grad_norm": 0.2912099853152085, - "learning_rate": 8.956256747462367e-07, - "loss": 0.1253, - "step": 6095 - }, - { - "epoch": 1.757336790176803, - "grad_norm": 0.27901921713470756, - "learning_rate": 8.852533967285515e-07, - "loss": 0.1154, - "step": 6100 - }, - { - "epoch": 1.758777141622556, - "grad_norm": 0.27957055801860914, - "learning_rate": 8.749387477147408e-07, - "loss": 0.1224, - "step": 6105 - }, - { - "epoch": 1.7602174930683088, - "grad_norm": 0.27918555052505345, - "learning_rate": 8.646817929203233e-07, - "loss": 0.1251, - "step": 6110 - }, - { - "epoch": 1.7616578445140614, - "grad_norm": 0.30402797081062277, - "learning_rate": 8.544825971960402e-07, - "loss": 0.1351, - "step": 6115 - }, - { - "epoch": 1.7630981959598142, - "grad_norm": 0.27127335623081955, - "learning_rate": 8.443412250274519e-07, - "loss": 0.1314, - "step": 6120 - }, - { - "epoch": 1.764538547405567, - "grad_norm": 0.28386937418963853, - "learning_rate": 8.342577405345132e-07, - "loss": 0.1185, - "step": 6125 - }, - { - "epoch": 1.7659788988513196, - "grad_norm": 0.30462689876606047, - "learning_rate": 8.242322074711806e-07, - "loss": 0.128, - "step": 6130 - }, - { - "epoch": 1.7674192502970725, - "grad_norm": 0.2679545912074383, - "learning_rate": 8.142646892250106e-07, - "loss": 0.1208, - "step": 6135 - }, - { - "epoch": 1.7688596017428253, - "grad_norm": 0.30324932458216164, - "learning_rate": 8.043552488167505e-07, - "loss": 0.1253, - "step": 6140 - }, - { - "epoch": 1.770299953188578, - "grad_norm": 0.2809028272767657, - "learning_rate": 7.945039488999396e-07, - "loss": 0.1365, - "step": 6145 - }, - { - "epoch": 1.7717403046343307, - "grad_norm": 0.2921736188573899, - "learning_rate": 7.847108517605284e-07, - "loss": 0.1198, - "step": 6150 - }, - { - "epoch": 1.7731806560800836, - "grad_norm": 0.3076568585771632, - "learning_rate": 7.749760193164657e-07, - "loss": 0.1252, - "step": 6155 - }, - { - "epoch": 1.7746210075258362, - "grad_norm": 0.28028733764115277, - "learning_rate": 7.652995131173146e-07, - "loss": 0.1202, - "step": 6160 - }, - { - "epoch": 1.7760613589715892, - "grad_norm": 0.27140081979265973, - "learning_rate": 7.556813943438712e-07, - "loss": 0.1278, - "step": 6165 - }, - { - "epoch": 1.7775017104173418, - "grad_norm": 0.28397278127939163, - "learning_rate": 7.461217238077656e-07, - "loss": 0.1279, - "step": 6170 - }, - { - "epoch": 1.7789420618630944, - "grad_norm": 0.29162913313509997, - "learning_rate": 7.366205619510803e-07, - "loss": 0.1222, - "step": 6175 - }, - { - "epoch": 1.7803824133088475, - "grad_norm": 0.2831971876270899, - "learning_rate": 7.271779688459746e-07, - "loss": 0.1306, - "step": 6180 - }, - { - "epoch": 1.7818227647546, - "grad_norm": 0.31140814074989936, - "learning_rate": 7.177940041942965e-07, - "loss": 0.1232, - "step": 6185 - }, - { - "epoch": 1.783263116200353, - "grad_norm": 0.2961845757723407, - "learning_rate": 7.084687273272139e-07, - "loss": 0.1384, - "step": 6190 - }, - { - "epoch": 1.7847034676461058, - "grad_norm": 0.2909880963891404, - "learning_rate": 6.992021972048312e-07, - "loss": 0.1358, - "step": 6195 - }, - { - "epoch": 1.7861438190918584, - "grad_norm": 0.2847134635831081, - "learning_rate": 6.899944724158192e-07, - "loss": 0.1311, - "step": 6200 - }, - { - "epoch": 1.7875841705376112, - "grad_norm": 0.30563787075961574, - "learning_rate": 6.808456111770467e-07, - "loss": 0.1181, - "step": 6205 - }, - { - "epoch": 1.789024521983364, - "grad_norm": 0.28983189625700895, - "learning_rate": 6.717556713332129e-07, - "loss": 0.1291, - "step": 6210 - }, - { - "epoch": 1.7904648734291166, - "grad_norm": 0.29330483143372066, - "learning_rate": 6.627247103564771e-07, - "loss": 0.1356, - "step": 6215 - }, - { - "epoch": 1.7919052248748695, - "grad_norm": 0.3032075996039303, - "learning_rate": 6.53752785346099e-07, - "loss": 0.1252, - "step": 6220 - }, - { - "epoch": 1.7933455763206223, - "grad_norm": 0.2872904446310524, - "learning_rate": 6.44839953028078e-07, - "loss": 0.1353, - "step": 6225 - }, - { - "epoch": 1.794785927766375, - "grad_norm": 0.2749000307290741, - "learning_rate": 6.359862697547891e-07, - "loss": 0.1325, - "step": 6230 - }, - { - "epoch": 1.7962262792121277, - "grad_norm": 0.29018671520624423, - "learning_rate": 6.271917915046388e-07, - "loss": 0.1334, - "step": 6235 - }, - { - "epoch": 1.7976666306578806, - "grad_norm": 0.29181272959005167, - "learning_rate": 6.184565738816961e-07, - "loss": 0.1417, - "step": 6240 - }, - { - "epoch": 1.7991069821036332, - "grad_norm": 0.2860722043841982, - "learning_rate": 6.097806721153498e-07, - "loss": 0.1251, - "step": 6245 - }, - { - "epoch": 1.800547333549386, - "grad_norm": 0.2928724802398806, - "learning_rate": 6.011641410599611e-07, - "loss": 0.1281, - "step": 6250 - }, - { - "epoch": 1.8019876849951388, - "grad_norm": 0.29674996515037555, - "learning_rate": 5.926070351945079e-07, - "loss": 0.1306, - "step": 6255 - }, - { - "epoch": 1.8034280364408914, - "grad_norm": 0.29284969211694145, - "learning_rate": 5.841094086222465e-07, - "loss": 0.1228, - "step": 6260 - }, - { - "epoch": 1.8048683878866445, - "grad_norm": 0.28914358868452567, - "learning_rate": 5.756713150703752e-07, - "loss": 0.1281, - "step": 6265 - }, - { - "epoch": 1.806308739332397, - "grad_norm": 0.2784100141416387, - "learning_rate": 5.6729280788968e-07, - "loss": 0.1169, - "step": 6270 - }, - { - "epoch": 1.8077490907781497, - "grad_norm": 0.26766994019437396, - "learning_rate": 5.589739400542071e-07, - "loss": 0.1226, - "step": 6275 - }, - { - "epoch": 1.8091894422239028, - "grad_norm": 0.30757543672267906, - "learning_rate": 5.507147641609334e-07, - "loss": 0.1287, - "step": 6280 - }, - { - "epoch": 1.8106297936696554, - "grad_norm": 0.2913516026723883, - "learning_rate": 5.425153324294175e-07, - "loss": 0.1259, - "step": 6285 - }, - { - "epoch": 1.8120701451154082, - "grad_norm": 0.2833838801982532, - "learning_rate": 5.343756967014846e-07, - "loss": 0.1316, - "step": 6290 - }, - { - "epoch": 1.813510496561161, - "grad_norm": 0.28126356941560415, - "learning_rate": 5.262959084408891e-07, - "loss": 0.1262, - "step": 6295 - }, - { - "epoch": 1.8149508480069136, - "grad_norm": 0.2791890015076149, - "learning_rate": 5.182760187329949e-07, - "loss": 0.1253, - "step": 6300 - }, - { - "epoch": 1.8163911994526665, - "grad_norm": 0.28325951768891716, - "learning_rate": 5.103160782844541e-07, - "loss": 0.1184, - "step": 6305 - }, - { - "epoch": 1.8178315508984193, - "grad_norm": 0.28791275189471405, - "learning_rate": 5.024161374228765e-07, - "loss": 0.1378, - "step": 6310 - }, - { - "epoch": 1.819271902344172, - "grad_norm": 0.29622226881158076, - "learning_rate": 4.945762460965209e-07, - "loss": 0.126, - "step": 6315 - }, - { - "epoch": 1.8207122537899247, - "grad_norm": 0.30683589530685085, - "learning_rate": 4.8679645387398e-07, - "loss": 0.1371, - "step": 6320 - }, - { - "epoch": 1.8221526052356776, - "grad_norm": 0.3082445698857907, - "learning_rate": 4.790768099438558e-07, - "loss": 0.1268, - "step": 6325 - }, - { - "epoch": 1.8235929566814302, - "grad_norm": 0.31230886419869525, - "learning_rate": 4.714173631144592e-07, - "loss": 0.1333, - "step": 6330 - }, - { - "epoch": 1.825033308127183, - "grad_norm": 0.2902616527080115, - "learning_rate": 4.638181618135007e-07, - "loss": 0.128, - "step": 6335 - }, - { - "epoch": 1.8264736595729358, - "grad_norm": 0.29991008830228766, - "learning_rate": 4.562792540877792e-07, - "loss": 0.125, - "step": 6340 - }, - { - "epoch": 1.8279140110186884, - "grad_norm": 0.3053693304563772, - "learning_rate": 4.488006876028805e-07, - "loss": 0.1251, - "step": 6345 - }, - { - "epoch": 1.8293543624644413, - "grad_norm": 0.2841149117151109, - "learning_rate": 4.413825096428781e-07, - "loss": 0.1338, - "step": 6350 - }, - { - "epoch": 1.830794713910194, - "grad_norm": 0.27769192328457887, - "learning_rate": 4.3402476711002947e-07, - "loss": 0.1236, - "step": 6355 - }, - { - "epoch": 1.8322350653559467, - "grad_norm": 0.30137959777639806, - "learning_rate": 4.2672750652448467e-07, - "loss": 0.1348, - "step": 6360 - }, - { - "epoch": 1.8336754168016998, - "grad_norm": 0.30779020386772094, - "learning_rate": 4.1949077402399063e-07, - "loss": 0.1289, - "step": 6365 - }, - { - "epoch": 1.8351157682474524, - "grad_norm": 0.2932190435782668, - "learning_rate": 4.1231461536359374e-07, - "loss": 0.1244, - "step": 6370 - }, - { - "epoch": 1.836556119693205, - "grad_norm": 0.2997160829285179, - "learning_rate": 4.051990759153612e-07, - "loss": 0.1246, - "step": 6375 - }, - { - "epoch": 1.837996471138958, - "grad_norm": 0.2837660942585995, - "learning_rate": 3.981442006680869e-07, - "loss": 0.1305, - "step": 6380 - }, - { - "epoch": 1.8394368225847106, - "grad_norm": 0.3161468834055083, - "learning_rate": 3.911500342270058e-07, - "loss": 0.128, - "step": 6385 - }, - { - "epoch": 1.8408771740304635, - "grad_norm": 0.30244059578998417, - "learning_rate": 3.842166208135201e-07, - "loss": 0.1247, - "step": 6390 - }, - { - "epoch": 1.8423175254762163, - "grad_norm": 0.27806142338453754, - "learning_rate": 3.77344004264909e-07, - "loss": 0.1143, - "step": 6395 - }, - { - "epoch": 1.843757876921969, - "grad_norm": 0.30174417783454494, - "learning_rate": 3.705322280340562e-07, - "loss": 0.1236, - "step": 6400 - }, - { - "epoch": 1.8451982283677217, - "grad_norm": 0.30013465943928136, - "learning_rate": 3.637813351891806e-07, - "loss": 0.1335, - "step": 6405 - }, - { - "epoch": 1.8466385798134746, - "grad_norm": 0.2819689346445666, - "learning_rate": 3.5709136841355686e-07, - "loss": 0.1228, - "step": 6410 - }, - { - "epoch": 1.8480789312592272, - "grad_norm": 0.2946026656198203, - "learning_rate": 3.504623700052456e-07, - "loss": 0.138, - "step": 6415 - }, - { - "epoch": 1.84951928270498, - "grad_norm": 0.290038702050758, - "learning_rate": 3.4389438187683146e-07, - "loss": 0.1261, - "step": 6420 - }, - { - "epoch": 1.8509596341507328, - "grad_norm": 0.2869715781228218, - "learning_rate": 3.37387445555154e-07, - "loss": 0.1223, - "step": 6425 - }, - { - "epoch": 1.8523999855964854, - "grad_norm": 0.2833834600270098, - "learning_rate": 3.30941602181043e-07, - "loss": 0.121, - "step": 6430 - }, - { - "epoch": 1.8538403370422383, - "grad_norm": 0.2850728665063182, - "learning_rate": 3.2455689250906584e-07, - "loss": 0.1258, - "step": 6435 - }, - { - "epoch": 1.855280688487991, - "grad_norm": 0.29666657702209753, - "learning_rate": 3.1823335690725933e-07, - "loss": 0.1294, - "step": 6440 - }, - { - "epoch": 1.8567210399337437, - "grad_norm": 0.26968192494466475, - "learning_rate": 3.119710353568872e-07, - "loss": 0.1165, - "step": 6445 - }, - { - "epoch": 1.8581613913794968, - "grad_norm": 0.2996210613153865, - "learning_rate": 3.0576996745217637e-07, - "loss": 0.134, - "step": 6450 - }, - { - "epoch": 1.8596017428252494, - "grad_norm": 0.293024903599958, - "learning_rate": 2.996301924000711e-07, - "loss": 0.1261, - "step": 6455 - }, - { - "epoch": 1.861042094271002, - "grad_norm": 0.2766190559075753, - "learning_rate": 2.935517490199857e-07, - "loss": 0.123, - "step": 6460 - }, - { - "epoch": 1.862482445716755, - "grad_norm": 0.2853304131204647, - "learning_rate": 2.8753467574355707e-07, - "loss": 0.1195, - "step": 6465 - }, - { - "epoch": 1.8639227971625076, - "grad_norm": 0.3036274997665907, - "learning_rate": 2.815790106144045e-07, - "loss": 0.1327, - "step": 6470 - }, - { - "epoch": 1.8653631486082605, - "grad_norm": 0.2899082654759326, - "learning_rate": 2.756847912878846e-07, - "loss": 0.1258, - "step": 6475 - }, - { - "epoch": 1.8668035000540133, - "grad_norm": 0.269201950199654, - "learning_rate": 2.698520550308581e-07, - "loss": 0.1216, - "step": 6480 - }, - { - "epoch": 1.868243851499766, - "grad_norm": 0.3061531908691431, - "learning_rate": 2.640808387214522e-07, - "loss": 0.1271, - "step": 6485 - }, - { - "epoch": 1.8696842029455187, - "grad_norm": 0.3114060303595798, - "learning_rate": 2.5837117884882743e-07, - "loss": 0.1254, - "step": 6490 - }, - { - "epoch": 1.8711245543912716, - "grad_norm": 0.2992475437515773, - "learning_rate": 2.527231115129458e-07, - "loss": 0.1258, - "step": 6495 - }, - { - "epoch": 1.8725649058370242, - "grad_norm": 0.2830997835854172, - "learning_rate": 2.4713667242434294e-07, - "loss": 0.1316, - "step": 6500 - }, - { - "epoch": 1.8725649058370242, - "eval_loss": 0.14066626131534576, - "eval_runtime": 180.7398, - "eval_samples_per_second": 9.981, - "eval_steps_per_second": 2.495, - "step": 6500 - }, - { - "epoch": 1.874005257282777, - "grad_norm": 0.30147030501531397, - "learning_rate": 2.416118969039061e-07, - "loss": 0.1276, - "step": 6505 - }, - { - "epoch": 1.8754456087285298, - "grad_norm": 0.28892524030979483, - "learning_rate": 2.361488198826445e-07, - "loss": 0.127, - "step": 6510 - }, - { - "epoch": 1.8768859601742824, - "grad_norm": 0.2953532817059129, - "learning_rate": 2.3074747590147384e-07, - "loss": 0.137, - "step": 6515 - }, - { - "epoch": 1.8783263116200353, - "grad_norm": 0.25078807633095623, - "learning_rate": 2.2540789911099536e-07, - "loss": 0.1225, - "step": 6520 - }, - { - "epoch": 1.879766663065788, - "grad_norm": 0.28249677086004654, - "learning_rate": 2.2013012327127826e-07, - "loss": 0.1253, - "step": 6525 - }, - { - "epoch": 1.8812070145115407, - "grad_norm": 0.2850391716271125, - "learning_rate": 2.1491418175165202e-07, - "loss": 0.133, - "step": 6530 - }, - { - "epoch": 1.8826473659572935, - "grad_norm": 0.28209719167913044, - "learning_rate": 2.097601075304878e-07, - "loss": 0.1243, - "step": 6535 - }, - { - "epoch": 1.8840877174030464, - "grad_norm": 0.2877245048633624, - "learning_rate": 2.0466793319499856e-07, - "loss": 0.1344, - "step": 6540 - }, - { - "epoch": 1.885528068848799, - "grad_norm": 0.283418662494498, - "learning_rate": 1.9963769094102247e-07, - "loss": 0.1317, - "step": 6545 - }, - { - "epoch": 1.886968420294552, - "grad_norm": 0.2932428710876708, - "learning_rate": 1.946694125728299e-07, - "loss": 0.1319, - "step": 6550 - }, - { - "epoch": 1.8884087717403046, - "grad_norm": 0.30112745695158777, - "learning_rate": 1.8976312950291453e-07, - "loss": 0.1256, - "step": 6555 - }, - { - "epoch": 1.8898491231860572, - "grad_norm": 0.27640553764468206, - "learning_rate": 1.8491887275180143e-07, - "loss": 0.125, - "step": 6560 - }, - { - "epoch": 1.8912894746318103, - "grad_norm": 0.2940276593318057, - "learning_rate": 1.8013667294784376e-07, - "loss": 0.1289, - "step": 6565 - }, - { - "epoch": 1.892729826077563, - "grad_norm": 0.27144519568474473, - "learning_rate": 1.7541656032703413e-07, - "loss": 0.1354, - "step": 6570 - }, - { - "epoch": 1.8941701775233157, - "grad_norm": 0.275764857359935, - "learning_rate": 1.707585647328136e-07, - "loss": 0.1292, - "step": 6575 - }, - { - "epoch": 1.8956105289690686, - "grad_norm": 0.27755806619480555, - "learning_rate": 1.6616271561587737e-07, - "loss": 0.1188, - "step": 6580 - }, - { - "epoch": 1.8970508804148212, - "grad_norm": 0.2997586600616719, - "learning_rate": 1.6162904203399722e-07, - "loss": 0.1381, - "step": 6585 - }, - { - "epoch": 1.898491231860574, - "grad_norm": 0.2941056183108044, - "learning_rate": 1.571575726518293e-07, - "loss": 0.1308, - "step": 6590 - }, - { - "epoch": 1.8999315833063268, - "grad_norm": 0.31046657242771797, - "learning_rate": 1.5274833574073887e-07, - "loss": 0.1371, - "step": 6595 - }, - { - "epoch": 1.9013719347520794, - "grad_norm": 0.31288177847066073, - "learning_rate": 1.4840135917862041e-07, - "loss": 0.1297, - "step": 6600 - }, - { - "epoch": 1.9028122861978323, - "grad_norm": 0.29822816494380705, - "learning_rate": 1.4411667044971657e-07, - "loss": 0.1347, - "step": 6605 - }, - { - "epoch": 1.904252637643585, - "grad_norm": 0.2850936706817707, - "learning_rate": 1.3989429664445275e-07, - "loss": 0.1332, - "step": 6610 - }, - { - "epoch": 1.9056929890893377, - "grad_norm": 0.26285478488340075, - "learning_rate": 1.3573426445925853e-07, - "loss": 0.1226, - "step": 6615 - }, - { - "epoch": 1.9071333405350905, - "grad_norm": 0.32854759121769145, - "learning_rate": 1.316366001964009e-07, - "loss": 0.1281, - "step": 6620 - }, - { - "epoch": 1.9085736919808434, - "grad_norm": 0.30927391667980875, - "learning_rate": 1.2760132976382123e-07, - "loss": 0.1282, - "step": 6625 - }, - { - "epoch": 1.910014043426596, - "grad_norm": 0.2910109063295465, - "learning_rate": 1.2362847867496754e-07, - "loss": 0.1238, - "step": 6630 - }, - { - "epoch": 1.9114543948723488, - "grad_norm": 0.2962860334445714, - "learning_rate": 1.197180720486346e-07, - "loss": 0.1378, - "step": 6635 - }, - { - "epoch": 1.9128947463181016, - "grad_norm": 0.2887305818228257, - "learning_rate": 1.1587013460880537e-07, - "loss": 0.1239, - "step": 6640 - }, - { - "epoch": 1.9143350977638542, - "grad_norm": 0.31793728948458577, - "learning_rate": 1.1208469068449413e-07, - "loss": 0.124, - "step": 6645 - }, - { - "epoch": 1.9157754492096073, - "grad_norm": 0.29221711966306396, - "learning_rate": 1.0836176420959354e-07, - "loss": 0.1311, - "step": 6650 - }, - { - "epoch": 1.91721580065536, - "grad_norm": 0.31385454606410323, - "learning_rate": 1.0470137872272246e-07, - "loss": 0.1357, - "step": 6655 - }, - { - "epoch": 1.9186561521011125, - "grad_norm": 0.31438512993189466, - "learning_rate": 1.01103557367076e-07, - "loss": 0.1416, - "step": 6660 - }, - { - "epoch": 1.9200965035468656, - "grad_norm": 0.29829830683355085, - "learning_rate": 9.756832289028239e-08, - "loss": 0.1306, - "step": 6665 - }, - { - "epoch": 1.9215368549926182, - "grad_norm": 0.260802342295974, - "learning_rate": 9.40956976442564e-08, - "loss": 0.1277, - "step": 6670 - }, - { - "epoch": 1.922977206438371, - "grad_norm": 0.3197171414870384, - "learning_rate": 9.068570358506058e-08, - "loss": 0.1314, - "step": 6675 - }, - { - "epoch": 1.9244175578841238, - "grad_norm": 0.2894154530923691, - "learning_rate": 8.733836227276082e-08, - "loss": 0.1284, - "step": 6680 - }, - { - "epoch": 1.9258579093298764, - "grad_norm": 0.29695374382019035, - "learning_rate": 8.405369487129889e-08, - "loss": 0.1329, - "step": 6685 - }, - { - "epoch": 1.9272982607756293, - "grad_norm": 0.30960960633140455, - "learning_rate": 8.083172214835011e-08, - "loss": 0.1365, - "step": 6690 - }, - { - "epoch": 1.928738612221382, - "grad_norm": 0.2971288891300368, - "learning_rate": 7.767246447519694e-08, - "loss": 0.126, - "step": 6695 - }, - { - "epoch": 1.9301789636671347, - "grad_norm": 0.2789858562034354, - "learning_rate": 7.457594182660011e-08, - "loss": 0.125, - "step": 6700 - }, - { - "epoch": 1.9316193151128875, - "grad_norm": 0.29826867052107653, - "learning_rate": 7.154217378066875e-08, - "loss": 0.135, - "step": 6705 - }, - { - "epoch": 1.9330596665586404, - "grad_norm": 0.32154445451796954, - "learning_rate": 6.85711795187416e-08, - "loss": 0.1306, - "step": 6710 - }, - { - "epoch": 1.934500018004393, - "grad_norm": 0.2816182285382255, - "learning_rate": 6.566297782526155e-08, - "loss": 0.1338, - "step": 6715 - }, - { - "epoch": 1.9359403694501458, - "grad_norm": 0.2805436494834369, - "learning_rate": 6.281758708765796e-08, - "loss": 0.1399, - "step": 6720 - }, - { - "epoch": 1.9373807208958986, - "grad_norm": 0.2871447896608813, - "learning_rate": 6.00350252962334e-08, - "loss": 0.1326, - "step": 6725 - }, - { - "epoch": 1.9388210723416512, - "grad_norm": 0.2865634190488767, - "learning_rate": 5.731531004404378e-08, - "loss": 0.1348, - "step": 6730 - }, - { - "epoch": 1.940261423787404, - "grad_norm": 0.3073170789090603, - "learning_rate": 5.465845852679397e-08, - "loss": 0.1265, - "step": 6735 - }, - { - "epoch": 1.941701775233157, - "grad_norm": 0.27714379568487674, - "learning_rate": 5.206448754272342e-08, - "loss": 0.1226, - "step": 6740 - }, - { - "epoch": 1.9431421266789095, - "grad_norm": 0.2986995263002976, - "learning_rate": 4.9533413492504065e-08, - "loss": 0.1284, - "step": 6745 - }, - { - "epoch": 1.9445824781246626, - "grad_norm": 0.31721814876106047, - "learning_rate": 4.706525237913595e-08, - "loss": 0.1192, - "step": 6750 - }, - { - "epoch": 1.9460228295704152, - "grad_norm": 0.2909997426660172, - "learning_rate": 4.466001980784063e-08, - "loss": 0.1317, - "step": 6755 - }, - { - "epoch": 1.947463181016168, - "grad_norm": 0.28400270417549767, - "learning_rate": 4.231773098597236e-08, - "loss": 0.1338, - "step": 6760 - }, - { - "epoch": 1.9489035324619208, - "grad_norm": 0.2743912511811287, - "learning_rate": 4.0038400722911544e-08, - "loss": 0.1303, - "step": 6765 - }, - { - "epoch": 1.9503438839076734, - "grad_norm": 0.2815930208370952, - "learning_rate": 3.7822043429980304e-08, - "loss": 0.1333, - "step": 6770 - }, - { - "epoch": 1.9517842353534263, - "grad_norm": 0.29321178019359334, - "learning_rate": 3.566867312034483e-08, - "loss": 0.1248, - "step": 6775 - }, - { - "epoch": 1.953224586799179, - "grad_norm": 0.28463818504050836, - "learning_rate": 3.357830340892987e-08, - "loss": 0.134, - "step": 6780 - }, - { - "epoch": 1.9546649382449317, - "grad_norm": 0.3052187447654327, - "learning_rate": 3.155094751233101e-08, - "loss": 0.1256, - "step": 6785 - }, - { - "epoch": 1.9561052896906845, - "grad_norm": 0.28659122590048935, - "learning_rate": 2.9586618248731436e-08, - "loss": 0.1226, - "step": 6790 - }, - { - "epoch": 1.9575456411364374, - "grad_norm": 0.282038441472891, - "learning_rate": 2.768532803782531e-08, - "loss": 0.1347, - "step": 6795 - }, - { - "epoch": 1.95898599258219, - "grad_norm": 0.2947473056279094, - "learning_rate": 2.5847088900728955e-08, - "loss": 0.1307, - "step": 6800 - }, - { - "epoch": 1.9604263440279428, - "grad_norm": 0.28871021497286375, - "learning_rate": 2.407191245991758e-08, - "loss": 0.1257, - "step": 6805 - }, - { - "epoch": 1.9618666954736956, - "grad_norm": 0.2793833356124683, - "learning_rate": 2.2359809939139775e-08, - "loss": 0.1199, - "step": 6810 - }, - { - "epoch": 1.9633070469194482, - "grad_norm": 0.28471879043256865, - "learning_rate": 2.0710792163357586e-08, - "loss": 0.1301, - "step": 6815 - }, - { - "epoch": 1.964747398365201, - "grad_norm": 0.326540718273098, - "learning_rate": 1.912486955866988e-08, - "loss": 0.1281, - "step": 6820 - }, - { - "epoch": 1.966187749810954, - "grad_norm": 0.2945065457407077, - "learning_rate": 1.7602052152247973e-08, - "loss": 0.1299, - "step": 6825 - }, - { - "epoch": 1.9676281012567065, - "grad_norm": 0.2860442540390112, - "learning_rate": 1.6142349572275674e-08, - "loss": 0.1331, - "step": 6830 - }, - { - "epoch": 1.9690684527024596, - "grad_norm": 0.28188944868108917, - "learning_rate": 1.4745771047887104e-08, - "loss": 0.1338, - "step": 6835 - }, - { - "epoch": 1.9705088041482122, - "grad_norm": 0.28844347874593, - "learning_rate": 1.3412325409103421e-08, - "loss": 0.1247, - "step": 6840 - }, - { - "epoch": 1.9719491555939648, - "grad_norm": 0.27691996340853114, - "learning_rate": 1.2142021086786194e-08, - "loss": 0.1243, - "step": 6845 - }, - { - "epoch": 1.9733895070397178, - "grad_norm": 0.30470759791737545, - "learning_rate": 1.0934866112575215e-08, - "loss": 0.1343, - "step": 6850 - }, - { - "epoch": 1.9748298584854704, - "grad_norm": 0.2704989184353482, - "learning_rate": 9.790868118843e-09, - "loss": 0.1281, - "step": 6855 - }, - { - "epoch": 1.9762702099312233, - "grad_norm": 0.2936626817128071, - "learning_rate": 8.710034338643702e-09, - "loss": 0.1293, - "step": 6860 - }, - { - "epoch": 1.977710561376976, - "grad_norm": 0.28036256864259923, - "learning_rate": 7.692371605670935e-09, - "loss": 0.1299, - "step": 6865 - }, - { - "epoch": 1.9791509128227287, - "grad_norm": 0.29270530929271654, - "learning_rate": 6.737886354211132e-09, - "loss": 0.1327, - "step": 6870 - }, - { - "epoch": 1.9805912642684815, - "grad_norm": 0.31142795102161236, - "learning_rate": 5.84658461910359e-09, - "loss": 0.1301, - "step": 6875 - }, - { - "epoch": 1.9820316157142344, - "grad_norm": 0.28078438046285364, - "learning_rate": 5.018472035701605e-09, - "loss": 0.1375, - "step": 6880 - }, - { - "epoch": 1.983471967159987, - "grad_norm": 0.3015462902715081, - "learning_rate": 4.253553839842495e-09, - "loss": 0.1306, - "step": 6885 - }, - { - "epoch": 1.9849123186057398, - "grad_norm": 0.29201527254897053, - "learning_rate": 3.5518348678043046e-09, - "loss": 0.1264, - "step": 6890 - }, - { - "epoch": 1.9863526700514926, - "grad_norm": 0.297518857741422, - "learning_rate": 2.9133195562847106e-09, - "loss": 0.1278, - "step": 6895 - }, - { - "epoch": 1.9877930214972452, - "grad_norm": 0.2899898155130717, - "learning_rate": 2.338011942368823e-09, - "loss": 0.1287, - "step": 6900 - }, - { - "epoch": 1.989233372942998, - "grad_norm": 0.29940028846058375, - "learning_rate": 1.8259156635025422e-09, - "loss": 0.1323, - "step": 6905 - }, - { - "epoch": 1.990673724388751, - "grad_norm": 0.307580959013968, - "learning_rate": 1.3770339574714631e-09, - "loss": 0.1359, - "step": 6910 - }, - { - "epoch": 1.9921140758345035, - "grad_norm": 0.2926247859880895, - "learning_rate": 9.913696623808922e-10, - "loss": 0.1277, - "step": 6915 - }, - { - "epoch": 1.9935544272802563, - "grad_norm": 0.24746513443996024, - "learning_rate": 6.68925216636973e-10, - "loss": 0.1232, - "step": 6920 - }, - { - "epoch": 1.9949947787260092, - "grad_norm": 0.31522655128809673, - "learning_rate": 4.0970265892892327e-10, - "loss": 0.127, - "step": 6925 - }, - { - "epoch": 1.9964351301717618, - "grad_norm": 0.2858966249788635, - "learning_rate": 2.1370362822237256e-10, - "loss": 0.1273, - "step": 6930 - }, - { - "epoch": 1.9978754816175148, - "grad_norm": 0.30208855377084065, - "learning_rate": 8.092936374159977e-11, - "loss": 0.1299, - "step": 6935 - }, - { - "epoch": 1.9993158330632674, - "grad_norm": 0.27290887764863414, - "learning_rate": 1.1380704968422251e-11, - "loss": 0.1311, - "step": 6940 - }, - { - "epoch": 1.9998919736415686, - "step": 6942, - "total_flos": 2.1613118366416896e+16, - "train_loss": 0.0653228780393709, - "train_runtime": 98246.8444, - "train_samples_per_second": 4.523, - "train_steps_per_second": 0.071 + "epoch": 0.9998197093715069, + "step": 3466, + "total_flos": 4977616761913344.0, + "train_loss": 0.6325558101381102, + "train_runtime": 63848.9812, + "train_samples_per_second": 3.475, + "train_steps_per_second": 0.054 } ], "logging_steps": 5, - "max_steps": 6942, + "max_steps": 3466, "num_input_tokens_seen": 0, - "num_train_epochs": 2, + "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { @@ -9856,7 +4935,7 @@ "attributes": {} } }, - "total_flos": 2.1613118366416896e+16, + "total_flos": 4977616761913344.0, "train_batch_size": 1, "trial_name": null, "trial_params": null