diff --git "a/checkpoint-4800/trainer_state.json" "b/checkpoint-4800/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4800/trainer_state.json" @@ -0,0 +1,33813 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.6359143327841847, + "eval_steps": 200, + "global_step": 4800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005491488193300384, + "grad_norm": 1.4225262906610563, + "learning_rate": 0.0, + "loss": 0.4574, + "step": 1 + }, + { + "epoch": 0.001098297638660077, + "grad_norm": 1.2088518087838116, + "learning_rate": 1.729696904450771e-07, + "loss": 0.4484, + "step": 2 + }, + { + "epoch": 0.0016474464579901153, + "grad_norm": 1.75482712948527, + "learning_rate": 2.741504731167937e-07, + "loss": 0.553, + "step": 3 + }, + { + "epoch": 0.002196595277320154, + "grad_norm": 1.6476495401371742, + "learning_rate": 3.459393808901542e-07, + "loss": 0.4866, + "step": 4 + }, + { + "epoch": 0.0027457440966501922, + "grad_norm": 1.168396277727929, + "learning_rate": 4.016231838083946e-07, + "loss": 0.4492, + "step": 5 + }, + { + "epoch": 0.0032948929159802307, + "grad_norm": 1.8756805874064717, + "learning_rate": 4.4712016356187073e-07, + "loss": 0.5393, + "step": 6 + }, + { + "epoch": 0.003844041735310269, + "grad_norm": 1.4295957251308191, + "learning_rate": 4.855873118377673e-07, + "loss": 0.4612, + "step": 7 + }, + { + "epoch": 0.004393190554640308, + "grad_norm": 1.3019184497919265, + "learning_rate": 5.189090713352312e-07, + "loss": 0.4273, + "step": 8 + }, + { + "epoch": 0.004942339373970346, + "grad_norm": 1.4301794602572218, + "learning_rate": 5.483009462335874e-07, + "loss": 0.4187, + "step": 9 + }, + { + "epoch": 0.0054914881933003845, + "grad_norm": 1.1171001206048818, + "learning_rate": 5.745928742534718e-07, + "loss": 0.4334, + "step": 10 + }, + { + "epoch": 0.0060406370126304225, + "grad_norm": 1.2023161985960118, + "learning_rate": 5.983768161916053e-07, + "loss": 0.4776, + "step": 11 + }, + { + "epoch": 0.006589785831960461, + "grad_norm": 1.561592987562026, + "learning_rate": 6.200898540069478e-07, + "loss": 0.4274, + "step": 12 + }, + { + "epoch": 0.007138934651290499, + "grad_norm": 1.0610561139148416, + "learning_rate": 6.40063912557533e-07, + "loss": 0.4131, + "step": 13 + }, + { + "epoch": 0.007688083470620538, + "grad_norm": 1.0984338347008844, + "learning_rate": 6.585570022828442e-07, + "loss": 0.4548, + "step": 14 + }, + { + "epoch": 0.008237232289950576, + "grad_norm": 1.267906923097231, + "learning_rate": 6.757736569251883e-07, + "loss": 0.4912, + "step": 15 + }, + { + "epoch": 0.008786381109280615, + "grad_norm": 2.054936399665518, + "learning_rate": 6.918787617803084e-07, + "loss": 0.5067, + "step": 16 + }, + { + "epoch": 0.009335529928610654, + "grad_norm": 1.0888894685956663, + "learning_rate": 7.070071823568266e-07, + "loss": 0.3836, + "step": 17 + }, + { + "epoch": 0.009884678747940691, + "grad_norm": 0.933933401628967, + "learning_rate": 7.212706366786644e-07, + "loss": 0.4321, + "step": 18 + }, + { + "epoch": 0.01043382756727073, + "grad_norm": 1.2062385692313455, + "learning_rate": 7.34762707033463e-07, + "loss": 0.4584, + "step": 19 + }, + { + "epoch": 0.010982976386600769, + "grad_norm": 1.5470382581754798, + "learning_rate": 7.475625646985488e-07, + "loss": 0.5263, + "step": 20 + }, + { + "epoch": 0.011532125205930808, + "grad_norm": 1.052236373995569, + "learning_rate": 7.59737784954561e-07, + "loss": 0.4506, + "step": 21 + }, + { + "epoch": 0.012081274025260845, + "grad_norm": 1.0539356257460601, + "learning_rate": 7.713465066366824e-07, + "loss": 0.4079, + "step": 22 + }, + { + "epoch": 0.012630422844590884, + "grad_norm": 0.7116874613543462, + "learning_rate": 7.824391112483089e-07, + "loss": 0.4279, + "step": 23 + }, + { + "epoch": 0.013179571663920923, + "grad_norm": 0.9701267618020503, + "learning_rate": 7.930595444520249e-07, + "loss": 0.4392, + "step": 24 + }, + { + "epoch": 0.013728720483250962, + "grad_norm": 0.7554895854135255, + "learning_rate": 8.032463676167892e-07, + "loss": 0.4201, + "step": 25 + }, + { + "epoch": 0.014277869302580999, + "grad_norm": 0.9220843488420176, + "learning_rate": 8.130336030026101e-07, + "loss": 0.4435, + "step": 26 + }, + { + "epoch": 0.014827018121911038, + "grad_norm": 1.1944285424449907, + "learning_rate": 8.22451419350381e-07, + "loss": 0.4906, + "step": 27 + }, + { + "epoch": 0.015376166941241077, + "grad_norm": 0.7683622840626223, + "learning_rate": 8.315266927279214e-07, + "loss": 0.4315, + "step": 28 + }, + { + "epoch": 0.015925315760571115, + "grad_norm": 0.8412199225414874, + "learning_rate": 8.402834689152837e-07, + "loss": 0.412, + "step": 29 + }, + { + "epoch": 0.016474464579901153, + "grad_norm": 0.9195353340666571, + "learning_rate": 8.487433473702655e-07, + "loss": 0.425, + "step": 30 + }, + { + "epoch": 0.017023613399231193, + "grad_norm": 1.014454974143959, + "learning_rate": 8.569258022117608e-07, + "loss": 0.4129, + "step": 31 + }, + { + "epoch": 0.01757276221856123, + "grad_norm": 0.9286572043623594, + "learning_rate": 8.648484522253854e-07, + "loss": 0.4265, + "step": 32 + }, + { + "epoch": 0.018121911037891267, + "grad_norm": 0.8888140783501768, + "learning_rate": 8.725272893083989e-07, + "loss": 0.4111, + "step": 33 + }, + { + "epoch": 0.018671059857221308, + "grad_norm": 0.8673559176229574, + "learning_rate": 8.799768728019036e-07, + "loss": 0.4202, + "step": 34 + }, + { + "epoch": 0.019220208676551345, + "grad_norm": 0.7808202407404673, + "learning_rate": 8.87210495646162e-07, + "loss": 0.4342, + "step": 35 + }, + { + "epoch": 0.019769357495881382, + "grad_norm": 0.6796871324723183, + "learning_rate": 8.942403271237415e-07, + "loss": 0.3855, + "step": 36 + }, + { + "epoch": 0.020318506315211423, + "grad_norm": 0.6081508652500198, + "learning_rate": 9.010775360409045e-07, + "loss": 0.4036, + "step": 37 + }, + { + "epoch": 0.02086765513454146, + "grad_norm": 0.7420082429740513, + "learning_rate": 9.077323974785401e-07, + "loss": 0.4641, + "step": 38 + }, + { + "epoch": 0.0214168039538715, + "grad_norm": 0.7524300594082499, + "learning_rate": 9.142143856743266e-07, + "loss": 0.4307, + "step": 39 + }, + { + "epoch": 0.021965952773201538, + "grad_norm": 0.7009814899366995, + "learning_rate": 9.205322551436259e-07, + "loss": 0.4238, + "step": 40 + }, + { + "epoch": 0.022515101592531575, + "grad_norm": 0.5935645569829775, + "learning_rate": 9.266941117821921e-07, + "loss": 0.4077, + "step": 41 + }, + { + "epoch": 0.023064250411861616, + "grad_norm": 0.5282148216196204, + "learning_rate": 9.32707475399638e-07, + "loss": 0.4575, + "step": 42 + }, + { + "epoch": 0.023613399231191653, + "grad_norm": 0.9126595655856949, + "learning_rate": 9.38579334893854e-07, + "loss": 0.4678, + "step": 43 + }, + { + "epoch": 0.02416254805052169, + "grad_norm": 0.5936890079068945, + "learning_rate": 9.443161970817594e-07, + "loss": 0.3801, + "step": 44 + }, + { + "epoch": 0.02471169686985173, + "grad_norm": 0.5387510022171181, + "learning_rate": 9.499241300419819e-07, + "loss": 0.4201, + "step": 45 + }, + { + "epoch": 0.025260845689181768, + "grad_norm": 0.49338476111037977, + "learning_rate": 9.55408801693386e-07, + "loss": 0.4022, + "step": 46 + }, + { + "epoch": 0.02580999450851181, + "grad_norm": 0.6924157033463041, + "learning_rate": 9.60775514224357e-07, + "loss": 0.4585, + "step": 47 + }, + { + "epoch": 0.026359143327841845, + "grad_norm": 1.0357916266494314, + "learning_rate": 9.66029234897102e-07, + "loss": 0.4567, + "step": 48 + }, + { + "epoch": 0.026908292147171883, + "grad_norm": 0.6202871224353081, + "learning_rate": 9.711746236755347e-07, + "loss": 0.3966, + "step": 49 + }, + { + "epoch": 0.027457440966501923, + "grad_norm": 0.5651799439240893, + "learning_rate": 9.762160580618663e-07, + "loss": 0.4312, + "step": 50 + }, + { + "epoch": 0.02800658978583196, + "grad_norm": 0.5999985257992868, + "learning_rate": 9.811576554736202e-07, + "loss": 0.3799, + "step": 51 + }, + { + "epoch": 0.028555738605161998, + "grad_norm": 0.6818309292774629, + "learning_rate": 9.860032934476873e-07, + "loss": 0.3931, + "step": 52 + }, + { + "epoch": 0.029104887424492038, + "grad_norm": 0.6041213206398276, + "learning_rate": 9.907566279198219e-07, + "loss": 0.4585, + "step": 53 + }, + { + "epoch": 0.029654036243822075, + "grad_norm": 0.5836230976134994, + "learning_rate": 9.954211097954582e-07, + "loss": 0.3951, + "step": 54 + }, + { + "epoch": 0.030203185063152116, + "grad_norm": 0.6090123753109918, + "learning_rate": 1e-06, + "loss": 0.4539, + "step": 55 + }, + { + "epoch": 0.030752333882482153, + "grad_norm": 0.5963904836943319, + "learning_rate": 9.999999156426468e-07, + "loss": 0.4438, + "step": 56 + }, + { + "epoch": 0.03130148270181219, + "grad_norm": 0.5258024706804068, + "learning_rate": 9.99999662570616e-07, + "loss": 0.4109, + "step": 57 + }, + { + "epoch": 0.03185063152114223, + "grad_norm": 0.591275210526606, + "learning_rate": 9.999992407839927e-07, + "loss": 0.5208, + "step": 58 + }, + { + "epoch": 0.032399780340472265, + "grad_norm": 0.5624958905139562, + "learning_rate": 9.999986502829197e-07, + "loss": 0.4294, + "step": 59 + }, + { + "epoch": 0.032948929159802305, + "grad_norm": 0.535445043933981, + "learning_rate": 9.999978910675956e-07, + "loss": 0.4044, + "step": 60 + }, + { + "epoch": 0.033498077979132346, + "grad_norm": 0.5665677363927388, + "learning_rate": 9.999969631382771e-07, + "loss": 0.3916, + "step": 61 + }, + { + "epoch": 0.034047226798462386, + "grad_norm": 0.913452655593011, + "learning_rate": 9.999958664952773e-07, + "loss": 0.4627, + "step": 62 + }, + { + "epoch": 0.03459637561779242, + "grad_norm": 0.6483781045962704, + "learning_rate": 9.99994601138966e-07, + "loss": 0.4445, + "step": 63 + }, + { + "epoch": 0.03514552443712246, + "grad_norm": 0.5532138331818498, + "learning_rate": 9.999931670697708e-07, + "loss": 0.3963, + "step": 64 + }, + { + "epoch": 0.0356946732564525, + "grad_norm": 0.5119120510194277, + "learning_rate": 9.999915642881749e-07, + "loss": 0.3909, + "step": 65 + }, + { + "epoch": 0.036243822075782535, + "grad_norm": 0.4881257479615943, + "learning_rate": 9.999897927947198e-07, + "loss": 0.4045, + "step": 66 + }, + { + "epoch": 0.036792970895112576, + "grad_norm": 0.680486722717849, + "learning_rate": 9.999878525900026e-07, + "loss": 0.4202, + "step": 67 + }, + { + "epoch": 0.037342119714442616, + "grad_norm": 0.904382014225488, + "learning_rate": 9.99985743674679e-07, + "loss": 0.4007, + "step": 68 + }, + { + "epoch": 0.03789126853377265, + "grad_norm": 0.5022031033791474, + "learning_rate": 9.999834660494596e-07, + "loss": 0.3872, + "step": 69 + }, + { + "epoch": 0.03844041735310269, + "grad_norm": 0.5949380451119111, + "learning_rate": 9.999810197151142e-07, + "loss": 0.3993, + "step": 70 + }, + { + "epoch": 0.03898956617243273, + "grad_norm": 0.42789905818799473, + "learning_rate": 9.999784046724673e-07, + "loss": 0.4247, + "step": 71 + }, + { + "epoch": 0.039538714991762765, + "grad_norm": 0.4358760331006809, + "learning_rate": 9.99975620922402e-07, + "loss": 0.4127, + "step": 72 + }, + { + "epoch": 0.040087863811092805, + "grad_norm": 0.39731808288616693, + "learning_rate": 9.999726684658574e-07, + "loss": 0.4059, + "step": 73 + }, + { + "epoch": 0.040637012630422846, + "grad_norm": 0.5998558014410175, + "learning_rate": 9.9996954730383e-07, + "loss": 0.4605, + "step": 74 + }, + { + "epoch": 0.04118616144975288, + "grad_norm": 0.7401577298826412, + "learning_rate": 9.999662574373731e-07, + "loss": 0.4501, + "step": 75 + }, + { + "epoch": 0.04173531026908292, + "grad_norm": 0.4158209983397508, + "learning_rate": 9.99962798867597e-07, + "loss": 0.4015, + "step": 76 + }, + { + "epoch": 0.04228445908841296, + "grad_norm": 0.41274797354801207, + "learning_rate": 9.999591715956685e-07, + "loss": 0.3902, + "step": 77 + }, + { + "epoch": 0.042833607907743, + "grad_norm": 0.5850701714590281, + "learning_rate": 9.999553756228119e-07, + "loss": 0.4142, + "step": 78 + }, + { + "epoch": 0.043382756727073035, + "grad_norm": 0.5032840143749217, + "learning_rate": 9.999514109503082e-07, + "loss": 0.403, + "step": 79 + }, + { + "epoch": 0.043931905546403076, + "grad_norm": 0.43979292990216845, + "learning_rate": 9.999472775794953e-07, + "loss": 0.3966, + "step": 80 + }, + { + "epoch": 0.044481054365733116, + "grad_norm": 0.40361288414169005, + "learning_rate": 9.99942975511768e-07, + "loss": 0.4216, + "step": 81 + }, + { + "epoch": 0.04503020318506315, + "grad_norm": 0.39618981132075415, + "learning_rate": 9.999385047485781e-07, + "loss": 0.4044, + "step": 82 + }, + { + "epoch": 0.04557935200439319, + "grad_norm": 0.7884738418257248, + "learning_rate": 9.999338652914345e-07, + "loss": 0.4311, + "step": 83 + }, + { + "epoch": 0.04612850082372323, + "grad_norm": 0.583681056654287, + "learning_rate": 9.999290571419028e-07, + "loss": 0.4447, + "step": 84 + }, + { + "epoch": 0.046677649643053265, + "grad_norm": 0.454936239495054, + "learning_rate": 9.999240803016054e-07, + "loss": 0.4046, + "step": 85 + }, + { + "epoch": 0.047226798462383306, + "grad_norm": 0.6575839772121563, + "learning_rate": 9.999189347722217e-07, + "loss": 0.3932, + "step": 86 + }, + { + "epoch": 0.047775947281713346, + "grad_norm": 0.5180410212467814, + "learning_rate": 9.999136205554885e-07, + "loss": 0.431, + "step": 87 + }, + { + "epoch": 0.04832509610104338, + "grad_norm": 0.4137122043878319, + "learning_rate": 9.99908137653199e-07, + "loss": 0.389, + "step": 88 + }, + { + "epoch": 0.04887424492037342, + "grad_norm": 0.3701143520536436, + "learning_rate": 9.999024860672035e-07, + "loss": 0.4106, + "step": 89 + }, + { + "epoch": 0.04942339373970346, + "grad_norm": 0.5263597284937761, + "learning_rate": 9.99896665799409e-07, + "loss": 0.4214, + "step": 90 + }, + { + "epoch": 0.049972542559033495, + "grad_norm": 0.47557602438131685, + "learning_rate": 9.998906768517797e-07, + "loss": 0.3858, + "step": 91 + }, + { + "epoch": 0.050521691378363535, + "grad_norm": 0.47864372777099196, + "learning_rate": 9.998845192263367e-07, + "loss": 0.4032, + "step": 92 + }, + { + "epoch": 0.051070840197693576, + "grad_norm": 0.4086842726878955, + "learning_rate": 9.998781929251582e-07, + "loss": 0.3923, + "step": 93 + }, + { + "epoch": 0.05161998901702362, + "grad_norm": 0.41763438148578, + "learning_rate": 9.998716979503789e-07, + "loss": 0.359, + "step": 94 + }, + { + "epoch": 0.05216913783635365, + "grad_norm": 0.41013402413321715, + "learning_rate": 9.998650343041904e-07, + "loss": 0.3892, + "step": 95 + }, + { + "epoch": 0.05271828665568369, + "grad_norm": 0.3894742717828985, + "learning_rate": 9.998582019888418e-07, + "loss": 0.4144, + "step": 96 + }, + { + "epoch": 0.05326743547501373, + "grad_norm": 0.496851285380317, + "learning_rate": 9.998512010066385e-07, + "loss": 0.3975, + "step": 97 + }, + { + "epoch": 0.053816584294343765, + "grad_norm": 0.40890426232079896, + "learning_rate": 9.998440313599432e-07, + "loss": 0.4108, + "step": 98 + }, + { + "epoch": 0.054365733113673806, + "grad_norm": 0.7032273015391635, + "learning_rate": 9.998366930511754e-07, + "loss": 0.4381, + "step": 99 + }, + { + "epoch": 0.054914881933003847, + "grad_norm": 0.481860551076321, + "learning_rate": 9.998291860828114e-07, + "loss": 0.4481, + "step": 100 + }, + { + "epoch": 0.05546403075233388, + "grad_norm": 0.3960595845175762, + "learning_rate": 9.998215104573845e-07, + "loss": 0.4003, + "step": 101 + }, + { + "epoch": 0.05601317957166392, + "grad_norm": 0.5138990272499223, + "learning_rate": 9.998136661774851e-07, + "loss": 0.3848, + "step": 102 + }, + { + "epoch": 0.05656232839099396, + "grad_norm": 0.46259719838647606, + "learning_rate": 9.998056532457605e-07, + "loss": 0.4253, + "step": 103 + }, + { + "epoch": 0.057111477210323995, + "grad_norm": 0.4865639506859819, + "learning_rate": 9.997974716649143e-07, + "loss": 0.4173, + "step": 104 + }, + { + "epoch": 0.057660626029654036, + "grad_norm": 0.5542792137654188, + "learning_rate": 9.997891214377078e-07, + "loss": 0.3924, + "step": 105 + }, + { + "epoch": 0.058209774848984076, + "grad_norm": 0.3970870620860831, + "learning_rate": 9.997806025669586e-07, + "loss": 0.386, + "step": 106 + }, + { + "epoch": 0.05875892366831411, + "grad_norm": 0.8684158596164165, + "learning_rate": 9.99771915055542e-07, + "loss": 0.3942, + "step": 107 + }, + { + "epoch": 0.05930807248764415, + "grad_norm": 0.39883827363387453, + "learning_rate": 9.997630589063896e-07, + "loss": 0.3644, + "step": 108 + }, + { + "epoch": 0.05985722130697419, + "grad_norm": 0.3977619556884487, + "learning_rate": 9.997540341224897e-07, + "loss": 0.3692, + "step": 109 + }, + { + "epoch": 0.06040637012630423, + "grad_norm": 0.5584968084463278, + "learning_rate": 9.997448407068881e-07, + "loss": 0.3847, + "step": 110 + }, + { + "epoch": 0.060955518945634266, + "grad_norm": 0.4901307252729202, + "learning_rate": 9.997354786626871e-07, + "loss": 0.3894, + "step": 111 + }, + { + "epoch": 0.061504667764964306, + "grad_norm": 0.5357429553372771, + "learning_rate": 9.99725947993046e-07, + "loss": 0.3555, + "step": 112 + }, + { + "epoch": 0.06205381658429435, + "grad_norm": 0.8108112801429885, + "learning_rate": 9.99716248701181e-07, + "loss": 0.3882, + "step": 113 + }, + { + "epoch": 0.06260296540362438, + "grad_norm": 0.391691207820883, + "learning_rate": 9.997063807903656e-07, + "loss": 0.3741, + "step": 114 + }, + { + "epoch": 0.06315211422295441, + "grad_norm": 0.3796279632840399, + "learning_rate": 9.996963442639298e-07, + "loss": 0.3228, + "step": 115 + }, + { + "epoch": 0.06370126304228446, + "grad_norm": 0.42435147960702796, + "learning_rate": 9.996861391252602e-07, + "loss": 0.4165, + "step": 116 + }, + { + "epoch": 0.0642504118616145, + "grad_norm": 0.5821283761287043, + "learning_rate": 9.99675765377801e-07, + "loss": 0.4116, + "step": 117 + }, + { + "epoch": 0.06479956068094453, + "grad_norm": 0.5389945137755192, + "learning_rate": 9.996652230250526e-07, + "loss": 0.3862, + "step": 118 + }, + { + "epoch": 0.06534870950027458, + "grad_norm": 0.4445522665144674, + "learning_rate": 9.99654512070573e-07, + "loss": 0.4287, + "step": 119 + }, + { + "epoch": 0.06589785831960461, + "grad_norm": 0.5060430282410254, + "learning_rate": 9.996436325179768e-07, + "loss": 0.3616, + "step": 120 + }, + { + "epoch": 0.06644700713893466, + "grad_norm": 0.30509818128951455, + "learning_rate": 9.996325843709352e-07, + "loss": 0.3365, + "step": 121 + }, + { + "epoch": 0.06699615595826469, + "grad_norm": 0.515201460268608, + "learning_rate": 9.996213676331764e-07, + "loss": 0.4321, + "step": 122 + }, + { + "epoch": 0.06754530477759473, + "grad_norm": 0.5446747484512473, + "learning_rate": 9.99609982308486e-07, + "loss": 0.4181, + "step": 123 + }, + { + "epoch": 0.06809445359692477, + "grad_norm": 0.5024648771569943, + "learning_rate": 9.99598428400706e-07, + "loss": 0.3746, + "step": 124 + }, + { + "epoch": 0.0686436024162548, + "grad_norm": 0.4652208294214632, + "learning_rate": 9.995867059137356e-07, + "loss": 0.3866, + "step": 125 + }, + { + "epoch": 0.06919275123558484, + "grad_norm": 0.3408332502864401, + "learning_rate": 9.9957481485153e-07, + "loss": 0.359, + "step": 126 + }, + { + "epoch": 0.06974190005491489, + "grad_norm": 0.4070667377883137, + "learning_rate": 9.995627552181032e-07, + "loss": 0.3273, + "step": 127 + }, + { + "epoch": 0.07029104887424492, + "grad_norm": 0.36927260336226775, + "learning_rate": 9.995505270175238e-07, + "loss": 0.3594, + "step": 128 + }, + { + "epoch": 0.07084019769357495, + "grad_norm": 0.4375876523488486, + "learning_rate": 9.995381302539186e-07, + "loss": 0.3958, + "step": 129 + }, + { + "epoch": 0.071389346512905, + "grad_norm": 0.3792126627963935, + "learning_rate": 9.995255649314714e-07, + "loss": 0.3407, + "step": 130 + }, + { + "epoch": 0.07193849533223504, + "grad_norm": 0.35222602094119543, + "learning_rate": 9.995128310544225e-07, + "loss": 0.3514, + "step": 131 + }, + { + "epoch": 0.07248764415156507, + "grad_norm": 0.45819894844073455, + "learning_rate": 9.994999286270687e-07, + "loss": 0.3758, + "step": 132 + }, + { + "epoch": 0.07303679297089512, + "grad_norm": 0.4262973424398369, + "learning_rate": 9.994868576537646e-07, + "loss": 0.3538, + "step": 133 + }, + { + "epoch": 0.07358594179022515, + "grad_norm": 0.5336016620311667, + "learning_rate": 9.994736181389207e-07, + "loss": 0.4559, + "step": 134 + }, + { + "epoch": 0.07413509060955518, + "grad_norm": 0.4136505217969873, + "learning_rate": 9.99460210087005e-07, + "loss": 0.4176, + "step": 135 + }, + { + "epoch": 0.07468423942888523, + "grad_norm": 0.28382829003883114, + "learning_rate": 9.994466335025425e-07, + "loss": 0.3668, + "step": 136 + }, + { + "epoch": 0.07523338824821527, + "grad_norm": 0.4975464706635873, + "learning_rate": 9.994328883901146e-07, + "loss": 0.414, + "step": 137 + }, + { + "epoch": 0.0757825370675453, + "grad_norm": 0.45122723000653814, + "learning_rate": 9.994189747543596e-07, + "loss": 0.3959, + "step": 138 + }, + { + "epoch": 0.07633168588687535, + "grad_norm": 0.5299261818855341, + "learning_rate": 9.994048925999732e-07, + "loss": 0.4173, + "step": 139 + }, + { + "epoch": 0.07688083470620538, + "grad_norm": 0.46002340806867353, + "learning_rate": 9.993906419317072e-07, + "loss": 0.3942, + "step": 140 + }, + { + "epoch": 0.07742998352553541, + "grad_norm": 0.3605976614381673, + "learning_rate": 9.99376222754371e-07, + "loss": 0.3802, + "step": 141 + }, + { + "epoch": 0.07797913234486546, + "grad_norm": 0.5362236526756698, + "learning_rate": 9.993616350728304e-07, + "loss": 0.3779, + "step": 142 + }, + { + "epoch": 0.0785282811641955, + "grad_norm": 0.34943871592531056, + "learning_rate": 9.993468788920084e-07, + "loss": 0.367, + "step": 143 + }, + { + "epoch": 0.07907742998352553, + "grad_norm": 0.39683303312446844, + "learning_rate": 9.993319542168843e-07, + "loss": 0.421, + "step": 144 + }, + { + "epoch": 0.07962657880285558, + "grad_norm": 0.3875959328068498, + "learning_rate": 9.993168610524948e-07, + "loss": 0.3809, + "step": 145 + }, + { + "epoch": 0.08017572762218561, + "grad_norm": 0.4730306526908986, + "learning_rate": 9.993015994039334e-07, + "loss": 0.3524, + "step": 146 + }, + { + "epoch": 0.08072487644151564, + "grad_norm": 0.4760955148163818, + "learning_rate": 9.992861692763502e-07, + "loss": 0.378, + "step": 147 + }, + { + "epoch": 0.08127402526084569, + "grad_norm": 0.47121478486586815, + "learning_rate": 9.992705706749524e-07, + "loss": 0.3679, + "step": 148 + }, + { + "epoch": 0.08182317408017573, + "grad_norm": 0.5283249542882976, + "learning_rate": 9.992548036050038e-07, + "loss": 0.3864, + "step": 149 + }, + { + "epoch": 0.08237232289950576, + "grad_norm": 0.5002857967852061, + "learning_rate": 9.992388680718255e-07, + "loss": 0.387, + "step": 150 + }, + { + "epoch": 0.0829214717188358, + "grad_norm": 0.679985230809286, + "learning_rate": 9.99222764080795e-07, + "loss": 0.3957, + "step": 151 + }, + { + "epoch": 0.08347062053816584, + "grad_norm": 0.41311368019230177, + "learning_rate": 9.992064916373466e-07, + "loss": 0.3839, + "step": 152 + }, + { + "epoch": 0.08401976935749589, + "grad_norm": 0.4091730337302347, + "learning_rate": 9.99190050746972e-07, + "loss": 0.412, + "step": 153 + }, + { + "epoch": 0.08456891817682592, + "grad_norm": 0.4014223008819766, + "learning_rate": 9.99173441415219e-07, + "loss": 0.3561, + "step": 154 + }, + { + "epoch": 0.08511806699615596, + "grad_norm": 0.40109303686643105, + "learning_rate": 9.99156663647693e-07, + "loss": 0.3372, + "step": 155 + }, + { + "epoch": 0.085667215815486, + "grad_norm": 0.39806264515079226, + "learning_rate": 9.991397174500556e-07, + "loss": 0.3906, + "step": 156 + }, + { + "epoch": 0.08621636463481604, + "grad_norm": 0.5062314911028993, + "learning_rate": 9.991226028280257e-07, + "loss": 0.3939, + "step": 157 + }, + { + "epoch": 0.08676551345414607, + "grad_norm": 0.38662768415414206, + "learning_rate": 9.99105319787379e-07, + "loss": 0.3987, + "step": 158 + }, + { + "epoch": 0.08731466227347612, + "grad_norm": 0.4590133256755766, + "learning_rate": 9.990878683339475e-07, + "loss": 0.3299, + "step": 159 + }, + { + "epoch": 0.08786381109280615, + "grad_norm": 0.41380348003567224, + "learning_rate": 9.990702484736205e-07, + "loss": 0.4035, + "step": 160 + }, + { + "epoch": 0.08841295991213619, + "grad_norm": 0.5438042147725886, + "learning_rate": 9.990524602123444e-07, + "loss": 0.348, + "step": 161 + }, + { + "epoch": 0.08896210873146623, + "grad_norm": 0.5713386655453655, + "learning_rate": 9.99034503556122e-07, + "loss": 0.408, + "step": 162 + }, + { + "epoch": 0.08951125755079627, + "grad_norm": 0.3783590546794582, + "learning_rate": 9.990163785110126e-07, + "loss": 0.3864, + "step": 163 + }, + { + "epoch": 0.0900604063701263, + "grad_norm": 0.33683455763843745, + "learning_rate": 9.989980850831331e-07, + "loss": 0.347, + "step": 164 + }, + { + "epoch": 0.09060955518945635, + "grad_norm": 0.4240240644529421, + "learning_rate": 9.989796232786567e-07, + "loss": 0.3352, + "step": 165 + }, + { + "epoch": 0.09115870400878638, + "grad_norm": 0.5058278687597212, + "learning_rate": 9.989609931038138e-07, + "loss": 0.3935, + "step": 166 + }, + { + "epoch": 0.09170785282811642, + "grad_norm": 0.3983611191440675, + "learning_rate": 9.98942194564891e-07, + "loss": 0.4081, + "step": 167 + }, + { + "epoch": 0.09225700164744646, + "grad_norm": 0.4427158210708611, + "learning_rate": 9.989232276682323e-07, + "loss": 0.348, + "step": 168 + }, + { + "epoch": 0.0928061504667765, + "grad_norm": 0.4520051829129437, + "learning_rate": 9.989040924202386e-07, + "loss": 0.4108, + "step": 169 + }, + { + "epoch": 0.09335529928610653, + "grad_norm": 0.364594432913114, + "learning_rate": 9.988847888273671e-07, + "loss": 0.3583, + "step": 170 + }, + { + "epoch": 0.09390444810543658, + "grad_norm": 0.3678552199329568, + "learning_rate": 9.988653168961321e-07, + "loss": 0.3307, + "step": 171 + }, + { + "epoch": 0.09445359692476661, + "grad_norm": 0.5709532339625728, + "learning_rate": 9.988456766331046e-07, + "loss": 0.4148, + "step": 172 + }, + { + "epoch": 0.09500274574409664, + "grad_norm": 0.3980035452670733, + "learning_rate": 9.988258680449123e-07, + "loss": 0.3975, + "step": 173 + }, + { + "epoch": 0.09555189456342669, + "grad_norm": 0.42917281879579317, + "learning_rate": 9.988058911382402e-07, + "loss": 0.3384, + "step": 174 + }, + { + "epoch": 0.09610104338275673, + "grad_norm": 0.3512767098796575, + "learning_rate": 9.987857459198296e-07, + "loss": 0.3722, + "step": 175 + }, + { + "epoch": 0.09665019220208676, + "grad_norm": 0.6212737994624551, + "learning_rate": 9.98765432396479e-07, + "loss": 0.3965, + "step": 176 + }, + { + "epoch": 0.09719934102141681, + "grad_norm": 0.5174479120329459, + "learning_rate": 9.987449505750431e-07, + "loss": 0.4035, + "step": 177 + }, + { + "epoch": 0.09774848984074684, + "grad_norm": 0.4093324105938102, + "learning_rate": 9.987243004624337e-07, + "loss": 0.3578, + "step": 178 + }, + { + "epoch": 0.09829763866007687, + "grad_norm": 0.34138629472472504, + "learning_rate": 9.9870348206562e-07, + "loss": 0.364, + "step": 179 + }, + { + "epoch": 0.09884678747940692, + "grad_norm": 0.4285231430228904, + "learning_rate": 9.986824953916272e-07, + "loss": 0.3527, + "step": 180 + }, + { + "epoch": 0.09939593629873696, + "grad_norm": 0.3971700866108027, + "learning_rate": 9.986613404475373e-07, + "loss": 0.3962, + "step": 181 + }, + { + "epoch": 0.09994508511806699, + "grad_norm": 0.4082782904476488, + "learning_rate": 9.986400172404894e-07, + "loss": 0.3312, + "step": 182 + }, + { + "epoch": 0.10049423393739704, + "grad_norm": 0.5190253973628005, + "learning_rate": 9.986185257776794e-07, + "loss": 0.3486, + "step": 183 + }, + { + "epoch": 0.10104338275672707, + "grad_norm": 0.4795273521744964, + "learning_rate": 9.985968660663599e-07, + "loss": 0.3392, + "step": 184 + }, + { + "epoch": 0.1015925315760571, + "grad_norm": 0.3460609246158477, + "learning_rate": 9.985750381138403e-07, + "loss": 0.3705, + "step": 185 + }, + { + "epoch": 0.10214168039538715, + "grad_norm": 0.4452233441846428, + "learning_rate": 9.985530419274865e-07, + "loss": 0.2976, + "step": 186 + }, + { + "epoch": 0.10269082921471719, + "grad_norm": 0.4553211629734614, + "learning_rate": 9.985308775147213e-07, + "loss": 0.3649, + "step": 187 + }, + { + "epoch": 0.10323997803404723, + "grad_norm": 0.4796653268030492, + "learning_rate": 9.98508544883025e-07, + "loss": 0.3674, + "step": 188 + }, + { + "epoch": 0.10378912685337727, + "grad_norm": 0.41997355091130645, + "learning_rate": 9.984860440399334e-07, + "loss": 0.3828, + "step": 189 + }, + { + "epoch": 0.1043382756727073, + "grad_norm": 0.44852018455981013, + "learning_rate": 9.9846337499304e-07, + "loss": 0.3305, + "step": 190 + }, + { + "epoch": 0.10488742449203735, + "grad_norm": 0.43346022282374863, + "learning_rate": 9.984405377499948e-07, + "loss": 0.3694, + "step": 191 + }, + { + "epoch": 0.10543657331136738, + "grad_norm": 0.4345686434268374, + "learning_rate": 9.984175323185042e-07, + "loss": 0.3774, + "step": 192 + }, + { + "epoch": 0.10598572213069742, + "grad_norm": 0.42529178344892793, + "learning_rate": 9.983943587063322e-07, + "loss": 0.3753, + "step": 193 + }, + { + "epoch": 0.10653487095002746, + "grad_norm": 0.3495647195575569, + "learning_rate": 9.983710169212986e-07, + "loss": 0.3856, + "step": 194 + }, + { + "epoch": 0.1070840197693575, + "grad_norm": 0.3526832472899213, + "learning_rate": 9.983475069712806e-07, + "loss": 0.3437, + "step": 195 + }, + { + "epoch": 0.10763316858868753, + "grad_norm": 0.46510110816199074, + "learning_rate": 9.98323828864212e-07, + "loss": 0.3485, + "step": 196 + }, + { + "epoch": 0.10818231740801758, + "grad_norm": 0.4348701784089849, + "learning_rate": 9.98299982608083e-07, + "loss": 0.3674, + "step": 197 + }, + { + "epoch": 0.10873146622734761, + "grad_norm": 0.5501938433457089, + "learning_rate": 9.982759682109414e-07, + "loss": 0.4056, + "step": 198 + }, + { + "epoch": 0.10928061504667765, + "grad_norm": 0.6244660890078381, + "learning_rate": 9.982517856808903e-07, + "loss": 0.4472, + "step": 199 + }, + { + "epoch": 0.10982976386600769, + "grad_norm": 0.41988021802529674, + "learning_rate": 9.982274350260913e-07, + "loss": 0.3926, + "step": 200 + }, + { + "epoch": 0.10982976386600769, + "eval_loss": 0.4592891335487366, + "eval_runtime": 19.1098, + "eval_samples_per_second": 23.182, + "eval_steps_per_second": 0.994, + "step": 200 + }, + { + "epoch": 0.11037891268533773, + "grad_norm": 0.4415435064751627, + "learning_rate": 9.98202916254761e-07, + "loss": 0.3667, + "step": 201 + }, + { + "epoch": 0.11092806150466776, + "grad_norm": 0.35924372460927034, + "learning_rate": 9.981782293751745e-07, + "loss": 0.3733, + "step": 202 + }, + { + "epoch": 0.11147721032399781, + "grad_norm": 0.38821674798414335, + "learning_rate": 9.981533743956623e-07, + "loss": 0.3668, + "step": 203 + }, + { + "epoch": 0.11202635914332784, + "grad_norm": 0.40750498767827315, + "learning_rate": 9.981283513246117e-07, + "loss": 0.3779, + "step": 204 + }, + { + "epoch": 0.11257550796265788, + "grad_norm": 0.44839782402832984, + "learning_rate": 9.981031601704675e-07, + "loss": 0.3933, + "step": 205 + }, + { + "epoch": 0.11312465678198792, + "grad_norm": 0.5550047522714568, + "learning_rate": 9.980778009417306e-07, + "loss": 0.3648, + "step": 206 + }, + { + "epoch": 0.11367380560131796, + "grad_norm": 0.3992747457977071, + "learning_rate": 9.98052273646959e-07, + "loss": 0.3474, + "step": 207 + }, + { + "epoch": 0.11422295442064799, + "grad_norm": 0.395776988189523, + "learning_rate": 9.98026578294767e-07, + "loss": 0.3617, + "step": 208 + }, + { + "epoch": 0.11477210323997804, + "grad_norm": 0.47873976999967466, + "learning_rate": 9.980007148938257e-07, + "loss": 0.3915, + "step": 209 + }, + { + "epoch": 0.11532125205930807, + "grad_norm": 0.48073825558813443, + "learning_rate": 9.979746834528634e-07, + "loss": 0.3457, + "step": 210 + }, + { + "epoch": 0.1158704008786381, + "grad_norm": 0.40926753855165304, + "learning_rate": 9.979484839806645e-07, + "loss": 0.3231, + "step": 211 + }, + { + "epoch": 0.11641954969796815, + "grad_norm": 0.4951877426042713, + "learning_rate": 9.979221164860707e-07, + "loss": 0.3165, + "step": 212 + }, + { + "epoch": 0.11696869851729819, + "grad_norm": 0.5632310747337004, + "learning_rate": 9.978955809779797e-07, + "loss": 0.3691, + "step": 213 + }, + { + "epoch": 0.11751784733662822, + "grad_norm": 0.3411906362752352, + "learning_rate": 9.978688774653462e-07, + "loss": 0.3236, + "step": 214 + }, + { + "epoch": 0.11806699615595827, + "grad_norm": 0.4589008696737782, + "learning_rate": 9.978420059571822e-07, + "loss": 0.3745, + "step": 215 + }, + { + "epoch": 0.1186161449752883, + "grad_norm": 0.40534732619235747, + "learning_rate": 9.978149664625552e-07, + "loss": 0.3649, + "step": 216 + }, + { + "epoch": 0.11916529379461833, + "grad_norm": 0.45724827266369716, + "learning_rate": 9.977877589905903e-07, + "loss": 0.3338, + "step": 217 + }, + { + "epoch": 0.11971444261394838, + "grad_norm": 0.395872308929039, + "learning_rate": 9.97760383550469e-07, + "loss": 0.3511, + "step": 218 + }, + { + "epoch": 0.12026359143327842, + "grad_norm": 0.37096602353182145, + "learning_rate": 9.977328401514297e-07, + "loss": 0.3407, + "step": 219 + }, + { + "epoch": 0.12081274025260846, + "grad_norm": 0.39029994608220525, + "learning_rate": 9.977051288027672e-07, + "loss": 0.3656, + "step": 220 + }, + { + "epoch": 0.1213618890719385, + "grad_norm": 0.4321717157067283, + "learning_rate": 9.976772495138326e-07, + "loss": 0.3745, + "step": 221 + }, + { + "epoch": 0.12191103789126853, + "grad_norm": 0.4669317733485491, + "learning_rate": 9.976492022940347e-07, + "loss": 0.3879, + "step": 222 + }, + { + "epoch": 0.12246018671059858, + "grad_norm": 0.41070948336327107, + "learning_rate": 9.976209871528382e-07, + "loss": 0.3824, + "step": 223 + }, + { + "epoch": 0.12300933552992861, + "grad_norm": 0.31792415033613153, + "learning_rate": 9.975926040997649e-07, + "loss": 0.351, + "step": 224 + }, + { + "epoch": 0.12355848434925865, + "grad_norm": 0.48492647343265355, + "learning_rate": 9.975640531443926e-07, + "loss": 0.3901, + "step": 225 + }, + { + "epoch": 0.1241076331685887, + "grad_norm": 0.749265387589108, + "learning_rate": 9.975353342963561e-07, + "loss": 0.4107, + "step": 226 + }, + { + "epoch": 0.12465678198791873, + "grad_norm": 0.48471643998709324, + "learning_rate": 9.975064475653477e-07, + "loss": 0.4175, + "step": 227 + }, + { + "epoch": 0.12520593080724876, + "grad_norm": 0.4710683247831898, + "learning_rate": 9.97477392961115e-07, + "loss": 0.3562, + "step": 228 + }, + { + "epoch": 0.1257550796265788, + "grad_norm": 0.3644446376775287, + "learning_rate": 9.97448170493463e-07, + "loss": 0.3559, + "step": 229 + }, + { + "epoch": 0.12630422844590883, + "grad_norm": 0.3348740546399851, + "learning_rate": 9.974187801722534e-07, + "loss": 0.3428, + "step": 230 + }, + { + "epoch": 0.12685337726523888, + "grad_norm": 0.3760807451612699, + "learning_rate": 9.97389222007404e-07, + "loss": 0.3615, + "step": 231 + }, + { + "epoch": 0.12740252608456892, + "grad_norm": 0.4353511228168891, + "learning_rate": 9.973594960088898e-07, + "loss": 0.3922, + "step": 232 + }, + { + "epoch": 0.12795167490389894, + "grad_norm": 0.405228004951534, + "learning_rate": 9.973296021867424e-07, + "loss": 0.289, + "step": 233 + }, + { + "epoch": 0.128500823723229, + "grad_norm": 0.4599862585068726, + "learning_rate": 9.972995405510496e-07, + "loss": 0.4217, + "step": 234 + }, + { + "epoch": 0.12904997254255904, + "grad_norm": 0.40813300867484475, + "learning_rate": 9.97269311111956e-07, + "loss": 0.3372, + "step": 235 + }, + { + "epoch": 0.12959912136188906, + "grad_norm": 0.428181122274626, + "learning_rate": 9.972389138796632e-07, + "loss": 0.363, + "step": 236 + }, + { + "epoch": 0.1301482701812191, + "grad_norm": 0.39017516006132724, + "learning_rate": 9.97208348864429e-07, + "loss": 0.3544, + "step": 237 + }, + { + "epoch": 0.13069741900054915, + "grad_norm": 0.46677760996170964, + "learning_rate": 9.971776160765678e-07, + "loss": 0.341, + "step": 238 + }, + { + "epoch": 0.1312465678198792, + "grad_norm": 0.3797991014913136, + "learning_rate": 9.971467155264512e-07, + "loss": 0.3389, + "step": 239 + }, + { + "epoch": 0.13179571663920922, + "grad_norm": 0.4573302999433252, + "learning_rate": 9.971156472245068e-07, + "loss": 0.3472, + "step": 240 + }, + { + "epoch": 0.13234486545853927, + "grad_norm": 0.39387593731789705, + "learning_rate": 9.97084411181219e-07, + "loss": 0.321, + "step": 241 + }, + { + "epoch": 0.13289401427786932, + "grad_norm": 0.4709552965683147, + "learning_rate": 9.970530074071288e-07, + "loss": 0.3608, + "step": 242 + }, + { + "epoch": 0.13344316309719934, + "grad_norm": 0.42579713480844766, + "learning_rate": 9.970214359128335e-07, + "loss": 0.3316, + "step": 243 + }, + { + "epoch": 0.13399231191652938, + "grad_norm": 0.47601317716932834, + "learning_rate": 9.96989696708988e-07, + "loss": 0.3172, + "step": 244 + }, + { + "epoch": 0.13454146073585943, + "grad_norm": 0.44384536282866577, + "learning_rate": 9.969577898063025e-07, + "loss": 0.3586, + "step": 245 + }, + { + "epoch": 0.13509060955518945, + "grad_norm": 0.39505986923158526, + "learning_rate": 9.969257152155448e-07, + "loss": 0.3656, + "step": 246 + }, + { + "epoch": 0.1356397583745195, + "grad_norm": 0.42418758008300533, + "learning_rate": 9.968934729475387e-07, + "loss": 0.3922, + "step": 247 + }, + { + "epoch": 0.13618890719384955, + "grad_norm": 0.5093440412591362, + "learning_rate": 9.968610630131648e-07, + "loss": 0.3479, + "step": 248 + }, + { + "epoch": 0.13673805601317957, + "grad_norm": 0.7139282967080007, + "learning_rate": 9.968284854233602e-07, + "loss": 0.3909, + "step": 249 + }, + { + "epoch": 0.1372872048325096, + "grad_norm": 0.6529816407471987, + "learning_rate": 9.967957401891189e-07, + "loss": 0.4136, + "step": 250 + }, + { + "epoch": 0.13783635365183966, + "grad_norm": 0.6216389978109844, + "learning_rate": 9.967628273214908e-07, + "loss": 0.3782, + "step": 251 + }, + { + "epoch": 0.13838550247116968, + "grad_norm": 0.44445537712753524, + "learning_rate": 9.967297468315833e-07, + "loss": 0.3257, + "step": 252 + }, + { + "epoch": 0.13893465129049973, + "grad_norm": 0.4020209287971924, + "learning_rate": 9.966964987305595e-07, + "loss": 0.3169, + "step": 253 + }, + { + "epoch": 0.13948380010982978, + "grad_norm": 0.43357581260818745, + "learning_rate": 9.96663083029639e-07, + "loss": 0.3733, + "step": 254 + }, + { + "epoch": 0.1400329489291598, + "grad_norm": 0.4795315250000879, + "learning_rate": 9.966294997400994e-07, + "loss": 0.4023, + "step": 255 + }, + { + "epoch": 0.14058209774848984, + "grad_norm": 0.3859745136767488, + "learning_rate": 9.965957488732731e-07, + "loss": 0.352, + "step": 256 + }, + { + "epoch": 0.1411312465678199, + "grad_norm": 0.4612583423759146, + "learning_rate": 9.965618304405498e-07, + "loss": 0.3563, + "step": 257 + }, + { + "epoch": 0.1416803953871499, + "grad_norm": 0.4598385721150566, + "learning_rate": 9.965277444533758e-07, + "loss": 0.3574, + "step": 258 + }, + { + "epoch": 0.14222954420647996, + "grad_norm": 0.570962890716166, + "learning_rate": 9.96493490923254e-07, + "loss": 0.3889, + "step": 259 + }, + { + "epoch": 0.14277869302581, + "grad_norm": 0.41513443701763686, + "learning_rate": 9.964590698617438e-07, + "loss": 0.3026, + "step": 260 + }, + { + "epoch": 0.14332784184514002, + "grad_norm": 0.49569746196966874, + "learning_rate": 9.964244812804605e-07, + "loss": 0.401, + "step": 261 + }, + { + "epoch": 0.14387699066447007, + "grad_norm": 0.3925347823971518, + "learning_rate": 9.96389725191077e-07, + "loss": 0.3901, + "step": 262 + }, + { + "epoch": 0.14442613948380012, + "grad_norm": 0.566322238816806, + "learning_rate": 9.96354801605322e-07, + "loss": 0.3576, + "step": 263 + }, + { + "epoch": 0.14497528830313014, + "grad_norm": 0.39695676135462576, + "learning_rate": 9.963197105349812e-07, + "loss": 0.3789, + "step": 264 + }, + { + "epoch": 0.1455244371224602, + "grad_norm": 0.5925334242414747, + "learning_rate": 9.962844519918958e-07, + "loss": 0.3467, + "step": 265 + }, + { + "epoch": 0.14607358594179023, + "grad_norm": 0.60271074859749, + "learning_rate": 9.962490259879652e-07, + "loss": 0.3714, + "step": 266 + }, + { + "epoch": 0.14662273476112025, + "grad_norm": 0.4822538904423981, + "learning_rate": 9.962134325351439e-07, + "loss": 0.3448, + "step": 267 + }, + { + "epoch": 0.1471718835804503, + "grad_norm": 0.4667548317418967, + "learning_rate": 9.96177671645443e-07, + "loss": 0.3309, + "step": 268 + }, + { + "epoch": 0.14772103239978035, + "grad_norm": 0.6527901023288586, + "learning_rate": 9.961417433309311e-07, + "loss": 0.3745, + "step": 269 + }, + { + "epoch": 0.14827018121911037, + "grad_norm": 0.5089121492656247, + "learning_rate": 9.961056476037324e-07, + "loss": 0.334, + "step": 270 + }, + { + "epoch": 0.14881933003844042, + "grad_norm": 0.44832682542367025, + "learning_rate": 9.960693844760283e-07, + "loss": 0.3245, + "step": 271 + }, + { + "epoch": 0.14936847885777046, + "grad_norm": 0.44360234778223506, + "learning_rate": 9.960329539600556e-07, + "loss": 0.3703, + "step": 272 + }, + { + "epoch": 0.14991762767710048, + "grad_norm": 0.6056027201575581, + "learning_rate": 9.959963560681086e-07, + "loss": 0.3529, + "step": 273 + }, + { + "epoch": 0.15046677649643053, + "grad_norm": 0.5990019290407864, + "learning_rate": 9.959595908125378e-07, + "loss": 0.3643, + "step": 274 + }, + { + "epoch": 0.15101592531576058, + "grad_norm": 0.3635109496239804, + "learning_rate": 9.9592265820575e-07, + "loss": 0.3782, + "step": 275 + }, + { + "epoch": 0.1515650741350906, + "grad_norm": 0.46936898990133824, + "learning_rate": 9.958855582602085e-07, + "loss": 0.3327, + "step": 276 + }, + { + "epoch": 0.15211422295442065, + "grad_norm": 0.3776537571444719, + "learning_rate": 9.958482909884336e-07, + "loss": 0.3685, + "step": 277 + }, + { + "epoch": 0.1526633717737507, + "grad_norm": 0.41210296949155756, + "learning_rate": 9.958108564030012e-07, + "loss": 0.3322, + "step": 278 + }, + { + "epoch": 0.15321252059308071, + "grad_norm": 0.48290939123842697, + "learning_rate": 9.95773254516544e-07, + "loss": 0.3345, + "step": 279 + }, + { + "epoch": 0.15376166941241076, + "grad_norm": 0.382141670936933, + "learning_rate": 9.957354853417515e-07, + "loss": 0.3589, + "step": 280 + }, + { + "epoch": 0.1543108182317408, + "grad_norm": 0.4175712720125532, + "learning_rate": 9.956975488913697e-07, + "loss": 0.3732, + "step": 281 + }, + { + "epoch": 0.15485996705107083, + "grad_norm": 0.3988533984078127, + "learning_rate": 9.956594451782e-07, + "loss": 0.3254, + "step": 282 + }, + { + "epoch": 0.15540911587040088, + "grad_norm": 0.5399841474010806, + "learning_rate": 9.956211742151017e-07, + "loss": 0.3533, + "step": 283 + }, + { + "epoch": 0.15595826468973092, + "grad_norm": 0.3775942522914299, + "learning_rate": 9.955827360149894e-07, + "loss": 0.3164, + "step": 284 + }, + { + "epoch": 0.15650741350906094, + "grad_norm": 0.5950048479549518, + "learning_rate": 9.95544130590835e-07, + "loss": 0.3749, + "step": 285 + }, + { + "epoch": 0.157056562328391, + "grad_norm": 0.48837745189753445, + "learning_rate": 9.955053579556659e-07, + "loss": 0.3729, + "step": 286 + }, + { + "epoch": 0.15760571114772104, + "grad_norm": 0.38176832343027967, + "learning_rate": 9.95466418122567e-07, + "loss": 0.3547, + "step": 287 + }, + { + "epoch": 0.15815485996705106, + "grad_norm": 0.3954692202082432, + "learning_rate": 9.954273111046783e-07, + "loss": 0.3351, + "step": 288 + }, + { + "epoch": 0.1587040087863811, + "grad_norm": 0.4419211445608487, + "learning_rate": 9.953880369151978e-07, + "loss": 0.3474, + "step": 289 + }, + { + "epoch": 0.15925315760571115, + "grad_norm": 0.4666008595472022, + "learning_rate": 9.953485955673785e-07, + "loss": 0.3361, + "step": 290 + }, + { + "epoch": 0.15980230642504117, + "grad_norm": 0.4559285279712608, + "learning_rate": 9.953089870745308e-07, + "loss": 0.3246, + "step": 291 + }, + { + "epoch": 0.16035145524437122, + "grad_norm": 0.4129733759139695, + "learning_rate": 9.952692114500208e-07, + "loss": 0.3539, + "step": 292 + }, + { + "epoch": 0.16090060406370127, + "grad_norm": 0.45498605264209285, + "learning_rate": 9.952292687072713e-07, + "loss": 0.3307, + "step": 293 + }, + { + "epoch": 0.1614497528830313, + "grad_norm": 0.7105153458114862, + "learning_rate": 9.95189158859762e-07, + "loss": 0.3784, + "step": 294 + }, + { + "epoch": 0.16199890170236134, + "grad_norm": 0.3617617956159836, + "learning_rate": 9.951488819210278e-07, + "loss": 0.3255, + "step": 295 + }, + { + "epoch": 0.16254805052169138, + "grad_norm": 0.4289627087987424, + "learning_rate": 9.95108437904661e-07, + "loss": 0.3363, + "step": 296 + }, + { + "epoch": 0.1630971993410214, + "grad_norm": 0.4483127161193595, + "learning_rate": 9.950678268243102e-07, + "loss": 0.337, + "step": 297 + }, + { + "epoch": 0.16364634816035145, + "grad_norm": 0.47752519580381575, + "learning_rate": 9.950270486936798e-07, + "loss": 0.3238, + "step": 298 + }, + { + "epoch": 0.1641954969796815, + "grad_norm": 0.4359700802585661, + "learning_rate": 9.949861035265312e-07, + "loss": 0.3274, + "step": 299 + }, + { + "epoch": 0.16474464579901152, + "grad_norm": 0.45171067082593136, + "learning_rate": 9.949449913366817e-07, + "loss": 0.3437, + "step": 300 + }, + { + "epoch": 0.16529379461834157, + "grad_norm": 0.41749672022873435, + "learning_rate": 9.94903712138005e-07, + "loss": 0.4048, + "step": 301 + }, + { + "epoch": 0.1658429434376716, + "grad_norm": 0.38017600202400764, + "learning_rate": 9.948622659444316e-07, + "loss": 0.3336, + "step": 302 + }, + { + "epoch": 0.16639209225700163, + "grad_norm": 0.43372715551315405, + "learning_rate": 9.94820652769948e-07, + "loss": 0.3673, + "step": 303 + }, + { + "epoch": 0.16694124107633168, + "grad_norm": 0.42527734939799494, + "learning_rate": 9.94778872628597e-07, + "loss": 0.34, + "step": 304 + }, + { + "epoch": 0.16749038989566173, + "grad_norm": 0.5650518043204806, + "learning_rate": 9.947369255344778e-07, + "loss": 0.3276, + "step": 305 + }, + { + "epoch": 0.16803953871499178, + "grad_norm": 0.4868831880087055, + "learning_rate": 9.946948115017462e-07, + "loss": 0.3368, + "step": 306 + }, + { + "epoch": 0.1685886875343218, + "grad_norm": 0.4492744346099952, + "learning_rate": 9.946525305446142e-07, + "loss": 0.3305, + "step": 307 + }, + { + "epoch": 0.16913783635365184, + "grad_norm": 0.4516893606448487, + "learning_rate": 9.946100826773497e-07, + "loss": 0.3545, + "step": 308 + }, + { + "epoch": 0.1696869851729819, + "grad_norm": 0.4485378202702623, + "learning_rate": 9.945674679142776e-07, + "loss": 0.3481, + "step": 309 + }, + { + "epoch": 0.1702361339923119, + "grad_norm": 0.4932332975680576, + "learning_rate": 9.945246862697789e-07, + "loss": 0.3608, + "step": 310 + }, + { + "epoch": 0.17078528281164196, + "grad_norm": 0.3717207417408704, + "learning_rate": 9.944817377582905e-07, + "loss": 0.3088, + "step": 311 + }, + { + "epoch": 0.171334431630972, + "grad_norm": 0.37840003050223336, + "learning_rate": 9.94438622394306e-07, + "loss": 0.3609, + "step": 312 + }, + { + "epoch": 0.17188358045030203, + "grad_norm": 0.4817829974733523, + "learning_rate": 9.943953401923756e-07, + "loss": 0.3328, + "step": 313 + }, + { + "epoch": 0.17243272926963207, + "grad_norm": 0.3648487195651704, + "learning_rate": 9.943518911671048e-07, + "loss": 0.3285, + "step": 314 + }, + { + "epoch": 0.17298187808896212, + "grad_norm": 0.4981300603086174, + "learning_rate": 9.943082753331567e-07, + "loss": 0.3949, + "step": 315 + }, + { + "epoch": 0.17353102690829214, + "grad_norm": 0.48539853349156564, + "learning_rate": 9.942644927052497e-07, + "loss": 0.3359, + "step": 316 + }, + { + "epoch": 0.1740801757276222, + "grad_norm": 0.47397270845794354, + "learning_rate": 9.942205432981588e-07, + "loss": 0.3564, + "step": 317 + }, + { + "epoch": 0.17462932454695224, + "grad_norm": 0.3387854426456379, + "learning_rate": 9.941764271267156e-07, + "loss": 0.2923, + "step": 318 + }, + { + "epoch": 0.17517847336628226, + "grad_norm": 0.5233782672135069, + "learning_rate": 9.941321442058075e-07, + "loss": 0.3302, + "step": 319 + }, + { + "epoch": 0.1757276221856123, + "grad_norm": 0.5398165786661566, + "learning_rate": 9.94087694550378e-07, + "loss": 0.355, + "step": 320 + }, + { + "epoch": 0.17627677100494235, + "grad_norm": 0.4110340753561315, + "learning_rate": 9.94043078175428e-07, + "loss": 0.3339, + "step": 321 + }, + { + "epoch": 0.17682591982427237, + "grad_norm": 0.361861039508175, + "learning_rate": 9.93998295096013e-07, + "loss": 0.3528, + "step": 322 + }, + { + "epoch": 0.17737506864360242, + "grad_norm": 0.4453360793221531, + "learning_rate": 9.939533453272465e-07, + "loss": 0.315, + "step": 323 + }, + { + "epoch": 0.17792421746293247, + "grad_norm": 0.5017845392756807, + "learning_rate": 9.939082288842968e-07, + "loss": 0.4, + "step": 324 + }, + { + "epoch": 0.17847336628226249, + "grad_norm": 0.39937084323677985, + "learning_rate": 9.938629457823894e-07, + "loss": 0.3554, + "step": 325 + }, + { + "epoch": 0.17902251510159253, + "grad_norm": 0.508911692715381, + "learning_rate": 9.93817496036805e-07, + "loss": 0.3355, + "step": 326 + }, + { + "epoch": 0.17957166392092258, + "grad_norm": 0.38494293982296035, + "learning_rate": 9.93771879662882e-07, + "loss": 0.3527, + "step": 327 + }, + { + "epoch": 0.1801208127402526, + "grad_norm": 0.4027297806365311, + "learning_rate": 9.93726096676014e-07, + "loss": 0.3791, + "step": 328 + }, + { + "epoch": 0.18066996155958265, + "grad_norm": 0.4758350347574514, + "learning_rate": 9.936801470916509e-07, + "loss": 0.3331, + "step": 329 + }, + { + "epoch": 0.1812191103789127, + "grad_norm": 0.5093063799021175, + "learning_rate": 9.93634030925299e-07, + "loss": 0.3302, + "step": 330 + }, + { + "epoch": 0.18176825919824272, + "grad_norm": 0.4062567803320839, + "learning_rate": 9.935877481925212e-07, + "loss": 0.3244, + "step": 331 + }, + { + "epoch": 0.18231740801757276, + "grad_norm": 0.38813652430611273, + "learning_rate": 9.935412989089358e-07, + "loss": 0.3763, + "step": 332 + }, + { + "epoch": 0.1828665568369028, + "grad_norm": 0.5484997009750862, + "learning_rate": 9.93494683090218e-07, + "loss": 0.3666, + "step": 333 + }, + { + "epoch": 0.18341570565623283, + "grad_norm": 0.5308548289699279, + "learning_rate": 9.934479007520986e-07, + "loss": 0.4148, + "step": 334 + }, + { + "epoch": 0.18396485447556288, + "grad_norm": 0.4362455388628846, + "learning_rate": 9.93400951910365e-07, + "loss": 0.3601, + "step": 335 + }, + { + "epoch": 0.18451400329489293, + "grad_norm": 0.5214112933772652, + "learning_rate": 9.933538365808612e-07, + "loss": 0.3516, + "step": 336 + }, + { + "epoch": 0.18506315211422295, + "grad_norm": 0.5459935043716072, + "learning_rate": 9.93306554779486e-07, + "loss": 0.3204, + "step": 337 + }, + { + "epoch": 0.185612300933553, + "grad_norm": 0.4354546568259888, + "learning_rate": 9.932591065221962e-07, + "loss": 0.339, + "step": 338 + }, + { + "epoch": 0.18616144975288304, + "grad_norm": 0.46089721608356077, + "learning_rate": 9.93211491825003e-07, + "loss": 0.3696, + "step": 339 + }, + { + "epoch": 0.18671059857221306, + "grad_norm": 0.44629559276548253, + "learning_rate": 9.931637107039754e-07, + "loss": 0.3082, + "step": 340 + }, + { + "epoch": 0.1872597473915431, + "grad_norm": 0.4618432560242684, + "learning_rate": 9.931157631752371e-07, + "loss": 0.3619, + "step": 341 + }, + { + "epoch": 0.18780889621087316, + "grad_norm": 0.467116332557529, + "learning_rate": 9.93067649254969e-07, + "loss": 0.3588, + "step": 342 + }, + { + "epoch": 0.18835804503020318, + "grad_norm": 0.4056585203016652, + "learning_rate": 9.930193689594073e-07, + "loss": 0.3059, + "step": 343 + }, + { + "epoch": 0.18890719384953322, + "grad_norm": 0.364051333857049, + "learning_rate": 9.929709223048455e-07, + "loss": 0.3344, + "step": 344 + }, + { + "epoch": 0.18945634266886327, + "grad_norm": 0.36147265732149836, + "learning_rate": 9.929223093076322e-07, + "loss": 0.3282, + "step": 345 + }, + { + "epoch": 0.1900054914881933, + "grad_norm": 0.40663001496083007, + "learning_rate": 9.928735299841727e-07, + "loss": 0.3017, + "step": 346 + }, + { + "epoch": 0.19055464030752334, + "grad_norm": 0.46893798236659495, + "learning_rate": 9.92824584350928e-07, + "loss": 0.3825, + "step": 347 + }, + { + "epoch": 0.19110378912685339, + "grad_norm": 0.4554788869837212, + "learning_rate": 9.927754724244154e-07, + "loss": 0.3033, + "step": 348 + }, + { + "epoch": 0.1916529379461834, + "grad_norm": 0.4986405237170547, + "learning_rate": 9.927261942212086e-07, + "loss": 0.3315, + "step": 349 + }, + { + "epoch": 0.19220208676551345, + "grad_norm": 0.7660448652449976, + "learning_rate": 9.926767497579368e-07, + "loss": 0.3772, + "step": 350 + }, + { + "epoch": 0.1927512355848435, + "grad_norm": 0.4034514573135684, + "learning_rate": 9.926271390512863e-07, + "loss": 0.3464, + "step": 351 + }, + { + "epoch": 0.19330038440417352, + "grad_norm": 0.4837190340742054, + "learning_rate": 9.925773621179983e-07, + "loss": 0.3592, + "step": 352 + }, + { + "epoch": 0.19384953322350357, + "grad_norm": 0.4174811670579678, + "learning_rate": 9.925274189748711e-07, + "loss": 0.3273, + "step": 353 + }, + { + "epoch": 0.19439868204283361, + "grad_norm": 0.4045739352355624, + "learning_rate": 9.924773096387583e-07, + "loss": 0.3241, + "step": 354 + }, + { + "epoch": 0.19494783086216363, + "grad_norm": 0.44473166066728204, + "learning_rate": 9.924270341265703e-07, + "loss": 0.3338, + "step": 355 + }, + { + "epoch": 0.19549697968149368, + "grad_norm": 0.36575751472761997, + "learning_rate": 9.92376592455273e-07, + "loss": 0.3506, + "step": 356 + }, + { + "epoch": 0.19604612850082373, + "grad_norm": 0.4798722835482685, + "learning_rate": 9.923259846418886e-07, + "loss": 0.3337, + "step": 357 + }, + { + "epoch": 0.19659527732015375, + "grad_norm": 0.4544698745226302, + "learning_rate": 9.922752107034955e-07, + "loss": 0.3623, + "step": 358 + }, + { + "epoch": 0.1971444261394838, + "grad_norm": 0.4011868522899694, + "learning_rate": 9.922242706572279e-07, + "loss": 0.3747, + "step": 359 + }, + { + "epoch": 0.19769357495881384, + "grad_norm": 0.41895377789327076, + "learning_rate": 9.921731645202763e-07, + "loss": 0.3482, + "step": 360 + }, + { + "epoch": 0.19824272377814386, + "grad_norm": 0.4960808386642329, + "learning_rate": 9.921218923098872e-07, + "loss": 0.3744, + "step": 361 + }, + { + "epoch": 0.1987918725974739, + "grad_norm": 0.4346701572285728, + "learning_rate": 9.920704540433632e-07, + "loss": 0.332, + "step": 362 + }, + { + "epoch": 0.19934102141680396, + "grad_norm": 0.4391749087620093, + "learning_rate": 9.920188497380622e-07, + "loss": 0.4105, + "step": 363 + }, + { + "epoch": 0.19989017023613398, + "grad_norm": 0.46502915667274697, + "learning_rate": 9.919670794113993e-07, + "loss": 0.3201, + "step": 364 + }, + { + "epoch": 0.20043931905546403, + "grad_norm": 0.3745778116593925, + "learning_rate": 9.91915143080845e-07, + "loss": 0.3353, + "step": 365 + }, + { + "epoch": 0.20098846787479407, + "grad_norm": 0.480474338823275, + "learning_rate": 9.918630407639258e-07, + "loss": 0.3681, + "step": 366 + }, + { + "epoch": 0.2015376166941241, + "grad_norm": 0.4572293832732304, + "learning_rate": 9.918107724782245e-07, + "loss": 0.3467, + "step": 367 + }, + { + "epoch": 0.20208676551345414, + "grad_norm": 0.4301394422682637, + "learning_rate": 9.917583382413792e-07, + "loss": 0.3464, + "step": 368 + }, + { + "epoch": 0.2026359143327842, + "grad_norm": 0.4455891905085749, + "learning_rate": 9.917057380710854e-07, + "loss": 0.3272, + "step": 369 + }, + { + "epoch": 0.2031850631521142, + "grad_norm": 0.5152756392544137, + "learning_rate": 9.916529719850927e-07, + "loss": 0.337, + "step": 370 + }, + { + "epoch": 0.20373421197144426, + "grad_norm": 0.33620823144229933, + "learning_rate": 9.916000400012086e-07, + "loss": 0.3225, + "step": 371 + }, + { + "epoch": 0.2042833607907743, + "grad_norm": 0.48920858152010266, + "learning_rate": 9.915469421372951e-07, + "loss": 0.3467, + "step": 372 + }, + { + "epoch": 0.20483250961010435, + "grad_norm": 0.42676704637335544, + "learning_rate": 9.914936784112712e-07, + "loss": 0.305, + "step": 373 + }, + { + "epoch": 0.20538165842943437, + "grad_norm": 0.360908826691987, + "learning_rate": 9.914402488411112e-07, + "loss": 0.3359, + "step": 374 + }, + { + "epoch": 0.20593080724876442, + "grad_norm": 0.45603145474492923, + "learning_rate": 9.913866534448455e-07, + "loss": 0.3011, + "step": 375 + }, + { + "epoch": 0.20647995606809447, + "grad_norm": 0.3475347003368911, + "learning_rate": 9.913328922405608e-07, + "loss": 0.3579, + "step": 376 + }, + { + "epoch": 0.2070291048874245, + "grad_norm": 0.38080924002398087, + "learning_rate": 9.912789652463995e-07, + "loss": 0.3089, + "step": 377 + }, + { + "epoch": 0.20757825370675453, + "grad_norm": 0.5130024962034675, + "learning_rate": 9.912248724805599e-07, + "loss": 0.3427, + "step": 378 + }, + { + "epoch": 0.20812740252608458, + "grad_norm": 0.4195870555367316, + "learning_rate": 9.911706139612962e-07, + "loss": 0.3309, + "step": 379 + }, + { + "epoch": 0.2086765513454146, + "grad_norm": 0.3962602511086976, + "learning_rate": 9.91116189706919e-07, + "loss": 0.3273, + "step": 380 + }, + { + "epoch": 0.20922570016474465, + "grad_norm": 0.42187454269421343, + "learning_rate": 9.91061599735794e-07, + "loss": 0.3309, + "step": 381 + }, + { + "epoch": 0.2097748489840747, + "grad_norm": 0.7156265150667257, + "learning_rate": 9.910068440663438e-07, + "loss": 0.3885, + "step": 382 + }, + { + "epoch": 0.21032399780340472, + "grad_norm": 0.48844455845770823, + "learning_rate": 9.909519227170462e-07, + "loss": 0.327, + "step": 383 + }, + { + "epoch": 0.21087314662273476, + "grad_norm": 0.40278423009662806, + "learning_rate": 9.90896835706435e-07, + "loss": 0.333, + "step": 384 + }, + { + "epoch": 0.2114222954420648, + "grad_norm": 0.4136773266220641, + "learning_rate": 9.908415830531001e-07, + "loss": 0.3112, + "step": 385 + }, + { + "epoch": 0.21197144426139483, + "grad_norm": 0.4735432276249727, + "learning_rate": 9.907861647756875e-07, + "loss": 0.3505, + "step": 386 + }, + { + "epoch": 0.21252059308072488, + "grad_norm": 0.5748703615776792, + "learning_rate": 9.907305808928986e-07, + "loss": 0.3604, + "step": 387 + }, + { + "epoch": 0.21306974190005493, + "grad_norm": 0.5574444372579445, + "learning_rate": 9.90674831423491e-07, + "loss": 0.3052, + "step": 388 + }, + { + "epoch": 0.21361889071938495, + "grad_norm": 0.47561555556499935, + "learning_rate": 9.906189163862778e-07, + "loss": 0.3821, + "step": 389 + }, + { + "epoch": 0.214168039538715, + "grad_norm": 0.5230808399701486, + "learning_rate": 9.905628358001286e-07, + "loss": 0.3963, + "step": 390 + }, + { + "epoch": 0.21471718835804504, + "grad_norm": 0.4833674871215576, + "learning_rate": 9.905065896839685e-07, + "loss": 0.2967, + "step": 391 + }, + { + "epoch": 0.21526633717737506, + "grad_norm": 0.48558954395823517, + "learning_rate": 9.904501780567783e-07, + "loss": 0.2967, + "step": 392 + }, + { + "epoch": 0.2158154859967051, + "grad_norm": 0.5202443193481654, + "learning_rate": 9.903936009375951e-07, + "loss": 0.3814, + "step": 393 + }, + { + "epoch": 0.21636463481603516, + "grad_norm": 0.44496363289627866, + "learning_rate": 9.903368583455112e-07, + "loss": 0.3377, + "step": 394 + }, + { + "epoch": 0.21691378363536518, + "grad_norm": 0.604333190934294, + "learning_rate": 9.902799502996756e-07, + "loss": 0.3327, + "step": 395 + }, + { + "epoch": 0.21746293245469522, + "grad_norm": 0.40242662221613446, + "learning_rate": 9.902228768192924e-07, + "loss": 0.3188, + "step": 396 + }, + { + "epoch": 0.21801208127402527, + "grad_norm": 0.4371578522370144, + "learning_rate": 9.901656379236221e-07, + "loss": 0.3034, + "step": 397 + }, + { + "epoch": 0.2185612300933553, + "grad_norm": 0.3887168932521033, + "learning_rate": 9.901082336319801e-07, + "loss": 0.3485, + "step": 398 + }, + { + "epoch": 0.21911037891268534, + "grad_norm": 0.46838390171823346, + "learning_rate": 9.900506639637388e-07, + "loss": 0.3148, + "step": 399 + }, + { + "epoch": 0.21965952773201539, + "grad_norm": 0.5676664228092575, + "learning_rate": 9.899929289383255e-07, + "loss": 0.32, + "step": 400 + }, + { + "epoch": 0.21965952773201539, + "eval_loss": 0.4228764772415161, + "eval_runtime": 18.526, + "eval_samples_per_second": 23.912, + "eval_steps_per_second": 1.026, + "step": 400 + }, + { + "epoch": 0.2202086765513454, + "grad_norm": 0.42390860934965974, + "learning_rate": 9.89935028575224e-07, + "loss": 0.3505, + "step": 401 + }, + { + "epoch": 0.22075782537067545, + "grad_norm": 0.502187296561082, + "learning_rate": 9.898769628939733e-07, + "loss": 0.307, + "step": 402 + }, + { + "epoch": 0.2213069741900055, + "grad_norm": 0.46942801548132196, + "learning_rate": 9.898187319141685e-07, + "loss": 0.3154, + "step": 403 + }, + { + "epoch": 0.22185612300933552, + "grad_norm": 0.4747127982547361, + "learning_rate": 9.897603356554602e-07, + "loss": 0.3348, + "step": 404 + }, + { + "epoch": 0.22240527182866557, + "grad_norm": 0.3863223529291394, + "learning_rate": 9.897017741375553e-07, + "loss": 0.3559, + "step": 405 + }, + { + "epoch": 0.22295442064799562, + "grad_norm": 0.602147291296106, + "learning_rate": 9.89643047380216e-07, + "loss": 0.4005, + "step": 406 + }, + { + "epoch": 0.22350356946732564, + "grad_norm": 0.5088089757894816, + "learning_rate": 9.895841554032604e-07, + "loss": 0.3228, + "step": 407 + }, + { + "epoch": 0.22405271828665568, + "grad_norm": 0.41907597042910427, + "learning_rate": 9.895250982265623e-07, + "loss": 0.3393, + "step": 408 + }, + { + "epoch": 0.22460186710598573, + "grad_norm": 0.44399642256849997, + "learning_rate": 9.894658758700515e-07, + "loss": 0.3604, + "step": 409 + }, + { + "epoch": 0.22515101592531575, + "grad_norm": 1.0077425036179892, + "learning_rate": 9.894064883537134e-07, + "loss": 0.4804, + "step": 410 + }, + { + "epoch": 0.2257001647446458, + "grad_norm": 0.45563398620135925, + "learning_rate": 9.89346935697589e-07, + "loss": 0.3046, + "step": 411 + }, + { + "epoch": 0.22624931356397585, + "grad_norm": 1.004201405548782, + "learning_rate": 9.89287217921775e-07, + "loss": 0.3217, + "step": 412 + }, + { + "epoch": 0.22679846238330587, + "grad_norm": 0.5717102983313682, + "learning_rate": 9.892273350464241e-07, + "loss": 0.3941, + "step": 413 + }, + { + "epoch": 0.2273476112026359, + "grad_norm": 0.40676708183793037, + "learning_rate": 9.891672870917443e-07, + "loss": 0.3115, + "step": 414 + }, + { + "epoch": 0.22789676002196596, + "grad_norm": 0.5754673510381386, + "learning_rate": 9.891070740780001e-07, + "loss": 0.3674, + "step": 415 + }, + { + "epoch": 0.22844590884129598, + "grad_norm": 0.5466212066527785, + "learning_rate": 9.89046696025511e-07, + "loss": 0.3258, + "step": 416 + }, + { + "epoch": 0.22899505766062603, + "grad_norm": 0.5214207015834531, + "learning_rate": 9.889861529546518e-07, + "loss": 0.3273, + "step": 417 + }, + { + "epoch": 0.22954420647995608, + "grad_norm": 0.5770370955549603, + "learning_rate": 9.889254448858543e-07, + "loss": 0.3155, + "step": 418 + }, + { + "epoch": 0.2300933552992861, + "grad_norm": 0.5659239065294162, + "learning_rate": 9.888645718396048e-07, + "loss": 0.3291, + "step": 419 + }, + { + "epoch": 0.23064250411861614, + "grad_norm": 0.5331675770333455, + "learning_rate": 9.888035338364458e-07, + "loss": 0.3454, + "step": 420 + }, + { + "epoch": 0.2311916529379462, + "grad_norm": 0.4993020562403641, + "learning_rate": 9.887423308969757e-07, + "loss": 0.3793, + "step": 421 + }, + { + "epoch": 0.2317408017572762, + "grad_norm": 0.4403038899592374, + "learning_rate": 9.88680963041848e-07, + "loss": 0.3469, + "step": 422 + }, + { + "epoch": 0.23228995057660626, + "grad_norm": 0.46570396264636643, + "learning_rate": 9.886194302917718e-07, + "loss": 0.2952, + "step": 423 + }, + { + "epoch": 0.2328390993959363, + "grad_norm": 0.5520583196776558, + "learning_rate": 9.885577326675123e-07, + "loss": 0.3451, + "step": 424 + }, + { + "epoch": 0.23338824821526633, + "grad_norm": 0.592935506523908, + "learning_rate": 9.884958701898906e-07, + "loss": 0.3436, + "step": 425 + }, + { + "epoch": 0.23393739703459637, + "grad_norm": 0.381863023561074, + "learning_rate": 9.884338428797823e-07, + "loss": 0.3496, + "step": 426 + }, + { + "epoch": 0.23448654585392642, + "grad_norm": 0.4081311020819649, + "learning_rate": 9.8837165075812e-07, + "loss": 0.3371, + "step": 427 + }, + { + "epoch": 0.23503569467325644, + "grad_norm": 0.4145298686576029, + "learning_rate": 9.883092938458906e-07, + "loss": 0.3483, + "step": 428 + }, + { + "epoch": 0.2355848434925865, + "grad_norm": 0.46947800565120756, + "learning_rate": 9.88246772164138e-07, + "loss": 0.3247, + "step": 429 + }, + { + "epoch": 0.23613399231191654, + "grad_norm": 0.45028601564688364, + "learning_rate": 9.881840857339603e-07, + "loss": 0.3155, + "step": 430 + }, + { + "epoch": 0.23668314113124655, + "grad_norm": 0.38861384435571195, + "learning_rate": 9.881212345765125e-07, + "loss": 0.3458, + "step": 431 + }, + { + "epoch": 0.2372322899505766, + "grad_norm": 0.4087983048885045, + "learning_rate": 9.880582187130037e-07, + "loss": 0.3602, + "step": 432 + }, + { + "epoch": 0.23778143876990665, + "grad_norm": 0.4451626928583509, + "learning_rate": 9.879950381647004e-07, + "loss": 0.2972, + "step": 433 + }, + { + "epoch": 0.23833058758923667, + "grad_norm": 0.40197382302181084, + "learning_rate": 9.879316929529227e-07, + "loss": 0.3442, + "step": 434 + }, + { + "epoch": 0.23887973640856672, + "grad_norm": 0.46154090432215, + "learning_rate": 9.878681830990482e-07, + "loss": 0.3248, + "step": 435 + }, + { + "epoch": 0.23942888522789676, + "grad_norm": 0.5456605674792115, + "learning_rate": 9.878045086245086e-07, + "loss": 0.3336, + "step": 436 + }, + { + "epoch": 0.23997803404722678, + "grad_norm": 0.5440045392154607, + "learning_rate": 9.87740669550792e-07, + "loss": 0.3114, + "step": 437 + }, + { + "epoch": 0.24052718286655683, + "grad_norm": 0.43071394758946097, + "learning_rate": 9.876766658994415e-07, + "loss": 0.3272, + "step": 438 + }, + { + "epoch": 0.24107633168588688, + "grad_norm": 0.383344813733855, + "learning_rate": 9.87612497692056e-07, + "loss": 0.3056, + "step": 439 + }, + { + "epoch": 0.24162548050521693, + "grad_norm": 0.5017906990034449, + "learning_rate": 9.875481649502897e-07, + "loss": 0.3581, + "step": 440 + }, + { + "epoch": 0.24217462932454695, + "grad_norm": 0.3908550031182706, + "learning_rate": 9.87483667695853e-07, + "loss": 0.3286, + "step": 441 + }, + { + "epoch": 0.242723778143877, + "grad_norm": 0.568898227059734, + "learning_rate": 9.87419005950511e-07, + "loss": 0.3718, + "step": 442 + }, + { + "epoch": 0.24327292696320704, + "grad_norm": 0.5045646022299286, + "learning_rate": 9.87354179736085e-07, + "loss": 0.3311, + "step": 443 + }, + { + "epoch": 0.24382207578253706, + "grad_norm": 0.4389838124329114, + "learning_rate": 9.872891890744511e-07, + "loss": 0.3282, + "step": 444 + }, + { + "epoch": 0.2443712246018671, + "grad_norm": 0.3868336363651343, + "learning_rate": 9.872240339875414e-07, + "loss": 0.301, + "step": 445 + }, + { + "epoch": 0.24492037342119716, + "grad_norm": 0.5277805286968514, + "learning_rate": 9.871587144973434e-07, + "loss": 0.285, + "step": 446 + }, + { + "epoch": 0.24546952224052718, + "grad_norm": 0.5246407034439886, + "learning_rate": 9.870932306258998e-07, + "loss": 0.3451, + "step": 447 + }, + { + "epoch": 0.24601867105985722, + "grad_norm": 0.5173616515843543, + "learning_rate": 9.870275823953094e-07, + "loss": 0.2902, + "step": 448 + }, + { + "epoch": 0.24656781987918727, + "grad_norm": 0.5486693791715967, + "learning_rate": 9.869617698277256e-07, + "loss": 0.297, + "step": 449 + }, + { + "epoch": 0.2471169686985173, + "grad_norm": 0.4021717743743248, + "learning_rate": 9.86895792945358e-07, + "loss": 0.3276, + "step": 450 + }, + { + "epoch": 0.24766611751784734, + "grad_norm": 0.45452302398819655, + "learning_rate": 9.868296517704712e-07, + "loss": 0.3392, + "step": 451 + }, + { + "epoch": 0.2482152663371774, + "grad_norm": 0.39928704714891894, + "learning_rate": 9.867633463253854e-07, + "loss": 0.2873, + "step": 452 + }, + { + "epoch": 0.2487644151565074, + "grad_norm": 0.6310486620676942, + "learning_rate": 9.866968766324767e-07, + "loss": 0.4076, + "step": 453 + }, + { + "epoch": 0.24931356397583745, + "grad_norm": 0.49015974476959123, + "learning_rate": 9.866302427141756e-07, + "loss": 0.3256, + "step": 454 + }, + { + "epoch": 0.2498627127951675, + "grad_norm": 0.4665908418347988, + "learning_rate": 9.865634445929688e-07, + "loss": 0.2981, + "step": 455 + }, + { + "epoch": 0.2504118616144975, + "grad_norm": 0.44224726582538065, + "learning_rate": 9.864964822913985e-07, + "loss": 0.3123, + "step": 456 + }, + { + "epoch": 0.25096101043382757, + "grad_norm": 0.4524761141687417, + "learning_rate": 9.864293558320615e-07, + "loss": 0.3094, + "step": 457 + }, + { + "epoch": 0.2515101592531576, + "grad_norm": 0.4987568098773706, + "learning_rate": 9.863620652376107e-07, + "loss": 0.33, + "step": 458 + }, + { + "epoch": 0.25205930807248766, + "grad_norm": 0.487182594814918, + "learning_rate": 9.862946105307541e-07, + "loss": 0.3318, + "step": 459 + }, + { + "epoch": 0.25260845689181766, + "grad_norm": 0.45648480236265654, + "learning_rate": 9.862269917342555e-07, + "loss": 0.3134, + "step": 460 + }, + { + "epoch": 0.2531576057111477, + "grad_norm": 0.49300967815141156, + "learning_rate": 9.861592088709335e-07, + "loss": 0.3525, + "step": 461 + }, + { + "epoch": 0.25370675453047775, + "grad_norm": 0.3701771570295517, + "learning_rate": 9.860912619636625e-07, + "loss": 0.336, + "step": 462 + }, + { + "epoch": 0.2542559033498078, + "grad_norm": 0.3706429817164138, + "learning_rate": 9.860231510353717e-07, + "loss": 0.306, + "step": 463 + }, + { + "epoch": 0.25480505216913785, + "grad_norm": 0.5397276494382715, + "learning_rate": 9.859548761090466e-07, + "loss": 0.3404, + "step": 464 + }, + { + "epoch": 0.2553542009884679, + "grad_norm": 0.5635005586188511, + "learning_rate": 9.85886437207727e-07, + "loss": 0.3423, + "step": 465 + }, + { + "epoch": 0.2559033498077979, + "grad_norm": 0.4904923005097277, + "learning_rate": 9.858178343545085e-07, + "loss": 0.3386, + "step": 466 + }, + { + "epoch": 0.25645249862712793, + "grad_norm": 0.5398129446008373, + "learning_rate": 9.857490675725423e-07, + "loss": 0.3322, + "step": 467 + }, + { + "epoch": 0.257001647446458, + "grad_norm": 0.5476004333768724, + "learning_rate": 9.856801368850347e-07, + "loss": 0.3237, + "step": 468 + }, + { + "epoch": 0.25755079626578803, + "grad_norm": 0.4837385163663176, + "learning_rate": 9.856110423152472e-07, + "loss": 0.3467, + "step": 469 + }, + { + "epoch": 0.2580999450851181, + "grad_norm": 0.4292631708341571, + "learning_rate": 9.855417838864964e-07, + "loss": 0.3378, + "step": 470 + }, + { + "epoch": 0.2586490939044481, + "grad_norm": 0.49297624374899757, + "learning_rate": 9.854723616221547e-07, + "loss": 0.2706, + "step": 471 + }, + { + "epoch": 0.2591982427237781, + "grad_norm": 0.6009134590862357, + "learning_rate": 9.854027755456494e-07, + "loss": 0.3744, + "step": 472 + }, + { + "epoch": 0.25974739154310816, + "grad_norm": 0.499450242151516, + "learning_rate": 9.853330256804637e-07, + "loss": 0.3075, + "step": 473 + }, + { + "epoch": 0.2602965403624382, + "grad_norm": 0.48280821398488216, + "learning_rate": 9.85263112050135e-07, + "loss": 0.3468, + "step": 474 + }, + { + "epoch": 0.26084568918176826, + "grad_norm": 0.41285380008388917, + "learning_rate": 9.851930346782568e-07, + "loss": 0.3569, + "step": 475 + }, + { + "epoch": 0.2613948380010983, + "grad_norm": 0.5351488316687973, + "learning_rate": 9.85122793588478e-07, + "loss": 0.3331, + "step": 476 + }, + { + "epoch": 0.26194398682042835, + "grad_norm": 0.5028414897022058, + "learning_rate": 9.850523888045017e-07, + "loss": 0.372, + "step": 477 + }, + { + "epoch": 0.2624931356397584, + "grad_norm": 0.4735933901803555, + "learning_rate": 9.849818203500874e-07, + "loss": 0.2975, + "step": 478 + }, + { + "epoch": 0.2630422844590884, + "grad_norm": 0.36759435988077777, + "learning_rate": 9.849110882490492e-07, + "loss": 0.3433, + "step": 479 + }, + { + "epoch": 0.26359143327841844, + "grad_norm": 0.46104607271477915, + "learning_rate": 9.848401925252565e-07, + "loss": 0.3375, + "step": 480 + }, + { + "epoch": 0.2641405820977485, + "grad_norm": 0.49767284968597836, + "learning_rate": 9.847691332026344e-07, + "loss": 0.2742, + "step": 481 + }, + { + "epoch": 0.26468973091707854, + "grad_norm": 0.4351918538089829, + "learning_rate": 9.846979103051624e-07, + "loss": 0.3218, + "step": 482 + }, + { + "epoch": 0.2652388797364086, + "grad_norm": 0.44627295269938216, + "learning_rate": 9.846265238568757e-07, + "loss": 0.2978, + "step": 483 + }, + { + "epoch": 0.26578802855573863, + "grad_norm": 0.5349739099174223, + "learning_rate": 9.845549738818645e-07, + "loss": 0.3389, + "step": 484 + }, + { + "epoch": 0.2663371773750686, + "grad_norm": 0.5124644709235744, + "learning_rate": 9.844832604042745e-07, + "loss": 0.3172, + "step": 485 + }, + { + "epoch": 0.26688632619439867, + "grad_norm": 0.6418111086810989, + "learning_rate": 9.844113834483061e-07, + "loss": 0.3165, + "step": 486 + }, + { + "epoch": 0.2674354750137287, + "grad_norm": 0.5545339025295388, + "learning_rate": 9.843393430382155e-07, + "loss": 0.3014, + "step": 487 + }, + { + "epoch": 0.26798462383305877, + "grad_norm": 0.6592300615544398, + "learning_rate": 9.842671391983135e-07, + "loss": 0.3403, + "step": 488 + }, + { + "epoch": 0.2685337726523888, + "grad_norm": 0.5104216097006875, + "learning_rate": 9.841947719529659e-07, + "loss": 0.3526, + "step": 489 + }, + { + "epoch": 0.26908292147171886, + "grad_norm": 0.41034725870092115, + "learning_rate": 9.841222413265942e-07, + "loss": 0.3084, + "step": 490 + }, + { + "epoch": 0.26963207029104885, + "grad_norm": 0.5407484438988068, + "learning_rate": 9.840495473436752e-07, + "loss": 0.3698, + "step": 491 + }, + { + "epoch": 0.2701812191103789, + "grad_norm": 0.5530023755029136, + "learning_rate": 9.8397669002874e-07, + "loss": 0.3221, + "step": 492 + }, + { + "epoch": 0.27073036792970895, + "grad_norm": 0.4166421182937445, + "learning_rate": 9.839036694063754e-07, + "loss": 0.3077, + "step": 493 + }, + { + "epoch": 0.271279516749039, + "grad_norm": 0.6849734541567781, + "learning_rate": 9.83830485501223e-07, + "loss": 0.3319, + "step": 494 + }, + { + "epoch": 0.27182866556836904, + "grad_norm": 0.4533952074845478, + "learning_rate": 9.8375713833798e-07, + "loss": 0.3188, + "step": 495 + }, + { + "epoch": 0.2723778143876991, + "grad_norm": 0.4733529516975668, + "learning_rate": 9.836836279413981e-07, + "loss": 0.313, + "step": 496 + }, + { + "epoch": 0.2729269632070291, + "grad_norm": 0.3758010717560732, + "learning_rate": 9.836099543362845e-07, + "loss": 0.29, + "step": 497 + }, + { + "epoch": 0.27347611202635913, + "grad_norm": 0.6357215798182766, + "learning_rate": 9.835361175475014e-07, + "loss": 0.3559, + "step": 498 + }, + { + "epoch": 0.2740252608456892, + "grad_norm": 0.44075419639782126, + "learning_rate": 9.834621175999656e-07, + "loss": 0.2992, + "step": 499 + }, + { + "epoch": 0.2745744096650192, + "grad_norm": 0.3832583022306382, + "learning_rate": 9.833879545186496e-07, + "loss": 0.2806, + "step": 500 + }, + { + "epoch": 0.2751235584843493, + "grad_norm": 0.4994007202886253, + "learning_rate": 9.833136283285809e-07, + "loss": 0.3425, + "step": 501 + }, + { + "epoch": 0.2756727073036793, + "grad_norm": 0.4639517335419424, + "learning_rate": 9.832391390548417e-07, + "loss": 0.3235, + "step": 502 + }, + { + "epoch": 0.2762218561230093, + "grad_norm": 0.508594139359251, + "learning_rate": 9.831644867225692e-07, + "loss": 0.3303, + "step": 503 + }, + { + "epoch": 0.27677100494233936, + "grad_norm": 0.44897951727340707, + "learning_rate": 9.83089671356956e-07, + "loss": 0.2931, + "step": 504 + }, + { + "epoch": 0.2773201537616694, + "grad_norm": 0.5360986959496855, + "learning_rate": 9.830146929832497e-07, + "loss": 0.3026, + "step": 505 + }, + { + "epoch": 0.27786930258099946, + "grad_norm": 0.46558855157707546, + "learning_rate": 9.829395516267524e-07, + "loss": 0.3499, + "step": 506 + }, + { + "epoch": 0.2784184514003295, + "grad_norm": 0.4142034337717624, + "learning_rate": 9.828642473128217e-07, + "loss": 0.3138, + "step": 507 + }, + { + "epoch": 0.27896760021965955, + "grad_norm": 0.4189635437858386, + "learning_rate": 9.8278878006687e-07, + "loss": 0.3207, + "step": 508 + }, + { + "epoch": 0.27951674903898954, + "grad_norm": 0.39559383258550235, + "learning_rate": 9.827131499143647e-07, + "loss": 0.3123, + "step": 509 + }, + { + "epoch": 0.2800658978583196, + "grad_norm": 0.49778186676134906, + "learning_rate": 9.826373568808282e-07, + "loss": 0.337, + "step": 510 + }, + { + "epoch": 0.28061504667764964, + "grad_norm": 0.5150403201162589, + "learning_rate": 9.82561400991838e-07, + "loss": 0.3322, + "step": 511 + }, + { + "epoch": 0.2811641954969797, + "grad_norm": 0.46460221311660477, + "learning_rate": 9.824852822730263e-07, + "loss": 0.3322, + "step": 512 + }, + { + "epoch": 0.28171334431630973, + "grad_norm": 0.4666271193920616, + "learning_rate": 9.824090007500802e-07, + "loss": 0.3054, + "step": 513 + }, + { + "epoch": 0.2822624931356398, + "grad_norm": 0.46552853262562716, + "learning_rate": 9.823325564487422e-07, + "loss": 0.3143, + "step": 514 + }, + { + "epoch": 0.2828116419549698, + "grad_norm": 0.7429484365482533, + "learning_rate": 9.822559493948093e-07, + "loss": 0.3501, + "step": 515 + }, + { + "epoch": 0.2833607907742998, + "grad_norm": 0.485070342608985, + "learning_rate": 9.821791796141335e-07, + "loss": 0.3162, + "step": 516 + }, + { + "epoch": 0.28390993959362987, + "grad_norm": 0.6414310908698505, + "learning_rate": 9.821022471326217e-07, + "loss": 0.3464, + "step": 517 + }, + { + "epoch": 0.2844590884129599, + "grad_norm": 0.5547569678188016, + "learning_rate": 9.820251519762361e-07, + "loss": 0.2916, + "step": 518 + }, + { + "epoch": 0.28500823723228996, + "grad_norm": 0.5534751713608991, + "learning_rate": 9.819478941709933e-07, + "loss": 0.3097, + "step": 519 + }, + { + "epoch": 0.28555738605162, + "grad_norm": 0.3729588387904154, + "learning_rate": 9.818704737429648e-07, + "loss": 0.3549, + "step": 520 + }, + { + "epoch": 0.28610653487095, + "grad_norm": 0.4515705048081947, + "learning_rate": 9.817928907182773e-07, + "loss": 0.3223, + "step": 521 + }, + { + "epoch": 0.28665568369028005, + "grad_norm": 0.5633114309906939, + "learning_rate": 9.81715145123112e-07, + "loss": 0.3637, + "step": 522 + }, + { + "epoch": 0.2872048325096101, + "grad_norm": 0.5583309472873978, + "learning_rate": 9.816372369837058e-07, + "loss": 0.3061, + "step": 523 + }, + { + "epoch": 0.28775398132894014, + "grad_norm": 0.5645992858236648, + "learning_rate": 9.81559166326349e-07, + "loss": 0.3362, + "step": 524 + }, + { + "epoch": 0.2883031301482702, + "grad_norm": 0.4261714536575483, + "learning_rate": 9.814809331773882e-07, + "loss": 0.339, + "step": 525 + }, + { + "epoch": 0.28885227896760024, + "grad_norm": 0.468084747846434, + "learning_rate": 9.81402537563224e-07, + "loss": 0.2902, + "step": 526 + }, + { + "epoch": 0.28940142778693023, + "grad_norm": 0.5508973041842398, + "learning_rate": 9.813239795103118e-07, + "loss": 0.3396, + "step": 527 + }, + { + "epoch": 0.2899505766062603, + "grad_norm": 0.47663021394730964, + "learning_rate": 9.812452590451625e-07, + "loss": 0.2864, + "step": 528 + }, + { + "epoch": 0.2904997254255903, + "grad_norm": 0.5248569052958911, + "learning_rate": 9.81166376194341e-07, + "loss": 0.3292, + "step": 529 + }, + { + "epoch": 0.2910488742449204, + "grad_norm": 0.45095303752764915, + "learning_rate": 9.810873309844674e-07, + "loss": 0.3055, + "step": 530 + }, + { + "epoch": 0.2915980230642504, + "grad_norm": 0.4316577045972638, + "learning_rate": 9.810081234422168e-07, + "loss": 0.3126, + "step": 531 + }, + { + "epoch": 0.29214717188358047, + "grad_norm": 0.5426342316958122, + "learning_rate": 9.809287535943186e-07, + "loss": 0.3433, + "step": 532 + }, + { + "epoch": 0.29269632070291046, + "grad_norm": 0.45129548620138626, + "learning_rate": 9.80849221467557e-07, + "loss": 0.295, + "step": 533 + }, + { + "epoch": 0.2932454695222405, + "grad_norm": 0.4349520560713567, + "learning_rate": 9.807695270887717e-07, + "loss": 0.3176, + "step": 534 + }, + { + "epoch": 0.29379461834157056, + "grad_norm": 0.4750205582822795, + "learning_rate": 9.80689670484856e-07, + "loss": 0.318, + "step": 535 + }, + { + "epoch": 0.2943437671609006, + "grad_norm": 0.35635393599511017, + "learning_rate": 9.80609651682759e-07, + "loss": 0.3028, + "step": 536 + }, + { + "epoch": 0.29489291598023065, + "grad_norm": 0.6318683320306482, + "learning_rate": 9.80529470709484e-07, + "loss": 0.3273, + "step": 537 + }, + { + "epoch": 0.2954420647995607, + "grad_norm": 0.41845496479417515, + "learning_rate": 9.804491275920891e-07, + "loss": 0.3262, + "step": 538 + }, + { + "epoch": 0.2959912136188907, + "grad_norm": 0.4784822041779314, + "learning_rate": 9.803686223576873e-07, + "loss": 0.3521, + "step": 539 + }, + { + "epoch": 0.29654036243822074, + "grad_norm": 0.4881172617542623, + "learning_rate": 9.80287955033446e-07, + "loss": 0.3166, + "step": 540 + }, + { + "epoch": 0.2970895112575508, + "grad_norm": 0.5887935373117399, + "learning_rate": 9.802071256465871e-07, + "loss": 0.3493, + "step": 541 + }, + { + "epoch": 0.29763866007688083, + "grad_norm": 0.4421694064847873, + "learning_rate": 9.801261342243882e-07, + "loss": 0.2972, + "step": 542 + }, + { + "epoch": 0.2981878088962109, + "grad_norm": 0.4926102490919537, + "learning_rate": 9.800449807941805e-07, + "loss": 0.2786, + "step": 543 + }, + { + "epoch": 0.29873695771554093, + "grad_norm": 0.3943114407203231, + "learning_rate": 9.799636653833503e-07, + "loss": 0.2983, + "step": 544 + }, + { + "epoch": 0.299286106534871, + "grad_norm": 0.5343510083262023, + "learning_rate": 9.79882188019339e-07, + "loss": 0.3005, + "step": 545 + }, + { + "epoch": 0.29983525535420097, + "grad_norm": 0.40683853158379923, + "learning_rate": 9.798005487296414e-07, + "loss": 0.3554, + "step": 546 + }, + { + "epoch": 0.300384404173531, + "grad_norm": 0.4227452823290433, + "learning_rate": 9.797187475418085e-07, + "loss": 0.3598, + "step": 547 + }, + { + "epoch": 0.30093355299286106, + "grad_norm": 0.5216737746477398, + "learning_rate": 9.796367844834448e-07, + "loss": 0.2903, + "step": 548 + }, + { + "epoch": 0.3014827018121911, + "grad_norm": 0.47930504282709674, + "learning_rate": 9.795546595822099e-07, + "loss": 0.2803, + "step": 549 + }, + { + "epoch": 0.30203185063152116, + "grad_norm": 0.4129539214802851, + "learning_rate": 9.794723728658183e-07, + "loss": 0.3148, + "step": 550 + }, + { + "epoch": 0.3025809994508512, + "grad_norm": 0.5261404252421711, + "learning_rate": 9.79389924362038e-07, + "loss": 0.3738, + "step": 551 + }, + { + "epoch": 0.3031301482701812, + "grad_norm": 0.4518393459552854, + "learning_rate": 9.793073140986928e-07, + "loss": 0.2865, + "step": 552 + }, + { + "epoch": 0.30367929708951125, + "grad_norm": 0.42913338660332895, + "learning_rate": 9.792245421036605e-07, + "loss": 0.3402, + "step": 553 + }, + { + "epoch": 0.3042284459088413, + "grad_norm": 0.37538280309301175, + "learning_rate": 9.791416084048735e-07, + "loss": 0.347, + "step": 554 + }, + { + "epoch": 0.30477759472817134, + "grad_norm": 0.47867070657374167, + "learning_rate": 9.790585130303194e-07, + "loss": 0.3229, + "step": 555 + }, + { + "epoch": 0.3053267435475014, + "grad_norm": 0.5287224541697721, + "learning_rate": 9.78975256008039e-07, + "loss": 0.2912, + "step": 556 + }, + { + "epoch": 0.30587589236683144, + "grad_norm": 0.6067703821218071, + "learning_rate": 9.788918373661291e-07, + "loss": 0.3871, + "step": 557 + }, + { + "epoch": 0.30642504118616143, + "grad_norm": 0.5708585514938724, + "learning_rate": 9.788082571327403e-07, + "loss": 0.3505, + "step": 558 + }, + { + "epoch": 0.3069741900054915, + "grad_norm": 0.5264752428886793, + "learning_rate": 9.787245153360776e-07, + "loss": 0.31, + "step": 559 + }, + { + "epoch": 0.3075233388248215, + "grad_norm": 0.3697823456895624, + "learning_rate": 9.786406120044012e-07, + "loss": 0.3203, + "step": 560 + }, + { + "epoch": 0.30807248764415157, + "grad_norm": 0.6486026958975594, + "learning_rate": 9.785565471660249e-07, + "loss": 0.3225, + "step": 561 + }, + { + "epoch": 0.3086216364634816, + "grad_norm": 0.4376569906728734, + "learning_rate": 9.784723208493178e-07, + "loss": 0.3138, + "step": 562 + }, + { + "epoch": 0.30917078528281167, + "grad_norm": 0.5791392718704185, + "learning_rate": 9.78387933082703e-07, + "loss": 0.3242, + "step": 563 + }, + { + "epoch": 0.30971993410214166, + "grad_norm": 0.5969372123363916, + "learning_rate": 9.783033838946583e-07, + "loss": 0.3092, + "step": 564 + }, + { + "epoch": 0.3102690829214717, + "grad_norm": 0.49006087125156034, + "learning_rate": 9.782186733137163e-07, + "loss": 0.2805, + "step": 565 + }, + { + "epoch": 0.31081823174080175, + "grad_norm": 0.3876002518816803, + "learning_rate": 9.781338013684633e-07, + "loss": 0.3589, + "step": 566 + }, + { + "epoch": 0.3113673805601318, + "grad_norm": 0.4634736741485902, + "learning_rate": 9.780487680875403e-07, + "loss": 0.3139, + "step": 567 + }, + { + "epoch": 0.31191652937946185, + "grad_norm": 0.4101658048398333, + "learning_rate": 9.779635734996432e-07, + "loss": 0.2801, + "step": 568 + }, + { + "epoch": 0.3124656781987919, + "grad_norm": 0.4085151582885634, + "learning_rate": 9.778782176335223e-07, + "loss": 0.3019, + "step": 569 + }, + { + "epoch": 0.3130148270181219, + "grad_norm": 0.43294500024428756, + "learning_rate": 9.777927005179814e-07, + "loss": 0.3088, + "step": 570 + }, + { + "epoch": 0.31356397583745194, + "grad_norm": 0.39046159391356045, + "learning_rate": 9.7770702218188e-07, + "loss": 0.2653, + "step": 571 + }, + { + "epoch": 0.314113124656782, + "grad_norm": 0.5399396643524935, + "learning_rate": 9.776211826541307e-07, + "loss": 0.3196, + "step": 572 + }, + { + "epoch": 0.31466227347611203, + "grad_norm": 0.3875407680219636, + "learning_rate": 9.77535181963702e-07, + "loss": 0.3062, + "step": 573 + }, + { + "epoch": 0.3152114222954421, + "grad_norm": 0.7710894183016851, + "learning_rate": 9.774490201396153e-07, + "loss": 0.3637, + "step": 574 + }, + { + "epoch": 0.3157605711147721, + "grad_norm": 0.6260393246599966, + "learning_rate": 9.773626972109473e-07, + "loss": 0.3272, + "step": 575 + }, + { + "epoch": 0.3163097199341021, + "grad_norm": 0.47925775372440726, + "learning_rate": 9.772762132068289e-07, + "loss": 0.3634, + "step": 576 + }, + { + "epoch": 0.31685886875343217, + "grad_norm": 0.5270539925513561, + "learning_rate": 9.77189568156445e-07, + "loss": 0.2908, + "step": 577 + }, + { + "epoch": 0.3174080175727622, + "grad_norm": 0.6228008650500568, + "learning_rate": 9.771027620890354e-07, + "loss": 0.411, + "step": 578 + }, + { + "epoch": 0.31795716639209226, + "grad_norm": 0.4044321698401574, + "learning_rate": 9.770157950338937e-07, + "loss": 0.3164, + "step": 579 + }, + { + "epoch": 0.3185063152114223, + "grad_norm": 0.5729343020853266, + "learning_rate": 9.769286670203684e-07, + "loss": 0.3132, + "step": 580 + }, + { + "epoch": 0.31905546403075236, + "grad_norm": 0.5291556395514854, + "learning_rate": 9.768413780778617e-07, + "loss": 0.3126, + "step": 581 + }, + { + "epoch": 0.31960461285008235, + "grad_norm": 0.38093655467159043, + "learning_rate": 9.767539282358303e-07, + "loss": 0.2945, + "step": 582 + }, + { + "epoch": 0.3201537616694124, + "grad_norm": 0.5146897586541881, + "learning_rate": 9.766663175237855e-07, + "loss": 0.289, + "step": 583 + }, + { + "epoch": 0.32070291048874244, + "grad_norm": 0.6481568907692759, + "learning_rate": 9.76578545971293e-07, + "loss": 0.2729, + "step": 584 + }, + { + "epoch": 0.3212520593080725, + "grad_norm": 0.5173196623975647, + "learning_rate": 9.76490613607972e-07, + "loss": 0.2613, + "step": 585 + }, + { + "epoch": 0.32180120812740254, + "grad_norm": 0.7492756748904551, + "learning_rate": 9.764025204634966e-07, + "loss": 0.3042, + "step": 586 + }, + { + "epoch": 0.3223503569467326, + "grad_norm": 0.4470615640514475, + "learning_rate": 9.763142665675948e-07, + "loss": 0.2581, + "step": 587 + }, + { + "epoch": 0.3228995057660626, + "grad_norm": 0.44794315830164905, + "learning_rate": 9.762258519500494e-07, + "loss": 0.2843, + "step": 588 + }, + { + "epoch": 0.3234486545853926, + "grad_norm": 0.49029693062976754, + "learning_rate": 9.761372766406968e-07, + "loss": 0.3074, + "step": 589 + }, + { + "epoch": 0.3239978034047227, + "grad_norm": 0.605411935679811, + "learning_rate": 9.76048540669428e-07, + "loss": 0.3015, + "step": 590 + }, + { + "epoch": 0.3245469522240527, + "grad_norm": 0.46431875631226277, + "learning_rate": 9.75959644066188e-07, + "loss": 0.2868, + "step": 591 + }, + { + "epoch": 0.32509610104338277, + "grad_norm": 0.46579396920898236, + "learning_rate": 9.758705868609762e-07, + "loss": 0.3274, + "step": 592 + }, + { + "epoch": 0.3256452498627128, + "grad_norm": 0.46574517082980155, + "learning_rate": 9.757813690838464e-07, + "loss": 0.3189, + "step": 593 + }, + { + "epoch": 0.3261943986820428, + "grad_norm": 0.5517949321627059, + "learning_rate": 9.756919907649059e-07, + "loss": 0.3258, + "step": 594 + }, + { + "epoch": 0.32674354750137286, + "grad_norm": 0.5641509784115365, + "learning_rate": 9.756024519343169e-07, + "loss": 0.2857, + "step": 595 + }, + { + "epoch": 0.3272926963207029, + "grad_norm": 0.4618260776880444, + "learning_rate": 9.755127526222953e-07, + "loss": 0.2739, + "step": 596 + }, + { + "epoch": 0.32784184514003295, + "grad_norm": 0.5145814405256917, + "learning_rate": 9.754228928591113e-07, + "loss": 0.3695, + "step": 597 + }, + { + "epoch": 0.328390993959363, + "grad_norm": 0.43421024893078836, + "learning_rate": 9.753328726750893e-07, + "loss": 0.3141, + "step": 598 + }, + { + "epoch": 0.32894014277869305, + "grad_norm": 0.4474944382114859, + "learning_rate": 9.752426921006077e-07, + "loss": 0.2972, + "step": 599 + }, + { + "epoch": 0.32948929159802304, + "grad_norm": 0.5577674749637513, + "learning_rate": 9.751523511660992e-07, + "loss": 0.3282, + "step": 600 + }, + { + "epoch": 0.32948929159802304, + "eval_loss": 0.3990951478481293, + "eval_runtime": 21.3598, + "eval_samples_per_second": 20.74, + "eval_steps_per_second": 0.89, + "step": 600 + }, + { + "epoch": 0.3300384404173531, + "grad_norm": 0.7535138789624793, + "learning_rate": 9.750618499020507e-07, + "loss": 0.2793, + "step": 601 + }, + { + "epoch": 0.33058758923668313, + "grad_norm": 0.5596192387559347, + "learning_rate": 9.749711883390028e-07, + "loss": 0.2913, + "step": 602 + }, + { + "epoch": 0.3311367380560132, + "grad_norm": 0.4916395046398006, + "learning_rate": 9.748803665075505e-07, + "loss": 0.2846, + "step": 603 + }, + { + "epoch": 0.3316858868753432, + "grad_norm": 0.5387551035946881, + "learning_rate": 9.74789384438343e-07, + "loss": 0.3312, + "step": 604 + }, + { + "epoch": 0.3322350356946733, + "grad_norm": 0.38088976392666973, + "learning_rate": 9.74698242162083e-07, + "loss": 0.3003, + "step": 605 + }, + { + "epoch": 0.33278418451400327, + "grad_norm": 0.355638876086518, + "learning_rate": 9.746069397095282e-07, + "loss": 0.3218, + "step": 606 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.5147789610443237, + "learning_rate": 9.745154771114893e-07, + "loss": 0.3119, + "step": 607 + }, + { + "epoch": 0.33388248215266336, + "grad_norm": 0.4174078167671674, + "learning_rate": 9.74423854398832e-07, + "loss": 0.3443, + "step": 608 + }, + { + "epoch": 0.3344316309719934, + "grad_norm": 0.43132485845172025, + "learning_rate": 9.743320716024752e-07, + "loss": 0.2791, + "step": 609 + }, + { + "epoch": 0.33498077979132346, + "grad_norm": 0.5576109280264451, + "learning_rate": 9.742401287533924e-07, + "loss": 0.2937, + "step": 610 + }, + { + "epoch": 0.3355299286106535, + "grad_norm": 0.3849380469438766, + "learning_rate": 9.741480258826108e-07, + "loss": 0.305, + "step": 611 + }, + { + "epoch": 0.33607907742998355, + "grad_norm": 0.4118908381245245, + "learning_rate": 9.74055763021212e-07, + "loss": 0.3115, + "step": 612 + }, + { + "epoch": 0.33662822624931354, + "grad_norm": 0.5583608347142233, + "learning_rate": 9.739633402003312e-07, + "loss": 0.3299, + "step": 613 + }, + { + "epoch": 0.3371773750686436, + "grad_norm": 0.970066195981141, + "learning_rate": 9.738707574511575e-07, + "loss": 0.492, + "step": 614 + }, + { + "epoch": 0.33772652388797364, + "grad_norm": 0.47213379281722617, + "learning_rate": 9.737780148049343e-07, + "loss": 0.3081, + "step": 615 + }, + { + "epoch": 0.3382756727073037, + "grad_norm": 0.4913912076262507, + "learning_rate": 9.73685112292959e-07, + "loss": 0.2741, + "step": 616 + }, + { + "epoch": 0.33882482152663373, + "grad_norm": 0.4597155671956856, + "learning_rate": 9.735920499465826e-07, + "loss": 0.3032, + "step": 617 + }, + { + "epoch": 0.3393739703459638, + "grad_norm": 0.3935996589821795, + "learning_rate": 9.7349882779721e-07, + "loss": 0.3091, + "step": 618 + }, + { + "epoch": 0.3399231191652938, + "grad_norm": 0.4209399399615395, + "learning_rate": 9.734054458763005e-07, + "loss": 0.3454, + "step": 619 + }, + { + "epoch": 0.3404722679846238, + "grad_norm": 0.6537994713114161, + "learning_rate": 9.73311904215367e-07, + "loss": 0.3546, + "step": 620 + }, + { + "epoch": 0.34102141680395387, + "grad_norm": 0.48905936776390374, + "learning_rate": 9.732182028459767e-07, + "loss": 0.2848, + "step": 621 + }, + { + "epoch": 0.3415705656232839, + "grad_norm": 0.4938866124667207, + "learning_rate": 9.731243417997498e-07, + "loss": 0.3066, + "step": 622 + }, + { + "epoch": 0.34211971444261396, + "grad_norm": 0.5613140069418393, + "learning_rate": 9.730303211083612e-07, + "loss": 0.313, + "step": 623 + }, + { + "epoch": 0.342668863261944, + "grad_norm": 0.6190131733066165, + "learning_rate": 9.729361408035396e-07, + "loss": 0.3107, + "step": 624 + }, + { + "epoch": 0.343218012081274, + "grad_norm": 0.5156384523404802, + "learning_rate": 9.728418009170672e-07, + "loss": 0.3055, + "step": 625 + }, + { + "epoch": 0.34376716090060405, + "grad_norm": 0.45191362956934983, + "learning_rate": 9.7274730148078e-07, + "loss": 0.2908, + "step": 626 + }, + { + "epoch": 0.3443163097199341, + "grad_norm": 0.47085252608535416, + "learning_rate": 9.726526425265684e-07, + "loss": 0.3348, + "step": 627 + }, + { + "epoch": 0.34486545853926415, + "grad_norm": 0.5525369365824818, + "learning_rate": 9.72557824086376e-07, + "loss": 0.3105, + "step": 628 + }, + { + "epoch": 0.3454146073585942, + "grad_norm": 0.4510172443452184, + "learning_rate": 9.724628461922012e-07, + "loss": 0.3053, + "step": 629 + }, + { + "epoch": 0.34596375617792424, + "grad_norm": 0.6535047047887291, + "learning_rate": 9.723677088760948e-07, + "loss": 0.2922, + "step": 630 + }, + { + "epoch": 0.34651290499725423, + "grad_norm": 0.74351729819573, + "learning_rate": 9.722724121701626e-07, + "loss": 0.3051, + "step": 631 + }, + { + "epoch": 0.3470620538165843, + "grad_norm": 0.5805013009226808, + "learning_rate": 9.721769561065636e-07, + "loss": 0.3594, + "step": 632 + }, + { + "epoch": 0.34761120263591433, + "grad_norm": 0.5814129131140799, + "learning_rate": 9.720813407175104e-07, + "loss": 0.3167, + "step": 633 + }, + { + "epoch": 0.3481603514552444, + "grad_norm": 0.6309209196880202, + "learning_rate": 9.7198556603527e-07, + "loss": 0.3734, + "step": 634 + }, + { + "epoch": 0.3487095002745744, + "grad_norm": 0.46224009558769286, + "learning_rate": 9.71889632092163e-07, + "loss": 0.287, + "step": 635 + }, + { + "epoch": 0.34925864909390447, + "grad_norm": 0.5272531876962709, + "learning_rate": 9.71793538920563e-07, + "loss": 0.279, + "step": 636 + }, + { + "epoch": 0.34980779791323446, + "grad_norm": 0.44163137495470217, + "learning_rate": 9.716972865528985e-07, + "loss": 0.2678, + "step": 637 + }, + { + "epoch": 0.3503569467325645, + "grad_norm": 0.6380350145592627, + "learning_rate": 9.716008750216508e-07, + "loss": 0.3261, + "step": 638 + }, + { + "epoch": 0.35090609555189456, + "grad_norm": 0.6248761009397122, + "learning_rate": 9.715043043593553e-07, + "loss": 0.3116, + "step": 639 + }, + { + "epoch": 0.3514552443712246, + "grad_norm": 0.510139799304016, + "learning_rate": 9.71407574598601e-07, + "loss": 0.3322, + "step": 640 + }, + { + "epoch": 0.35200439319055465, + "grad_norm": 0.5148774048289841, + "learning_rate": 9.713106857720308e-07, + "loss": 0.2989, + "step": 641 + }, + { + "epoch": 0.3525535420098847, + "grad_norm": 0.5052236723776224, + "learning_rate": 9.712136379123408e-07, + "loss": 0.2853, + "step": 642 + }, + { + "epoch": 0.3531026908292147, + "grad_norm": 0.4195131561701459, + "learning_rate": 9.711164310522813e-07, + "loss": 0.3065, + "step": 643 + }, + { + "epoch": 0.35365183964854474, + "grad_norm": 0.5377543477776671, + "learning_rate": 9.710190652246561e-07, + "loss": 0.3426, + "step": 644 + }, + { + "epoch": 0.3542009884678748, + "grad_norm": 0.4133377610162905, + "learning_rate": 9.709215404623225e-07, + "loss": 0.281, + "step": 645 + }, + { + "epoch": 0.35475013728720484, + "grad_norm": 0.41211088337271534, + "learning_rate": 9.708238567981914e-07, + "loss": 0.2874, + "step": 646 + }, + { + "epoch": 0.3552992861065349, + "grad_norm": 0.5723847095442636, + "learning_rate": 9.707260142652274e-07, + "loss": 0.2726, + "step": 647 + }, + { + "epoch": 0.35584843492586493, + "grad_norm": 0.5196741588284901, + "learning_rate": 9.706280128964493e-07, + "loss": 0.3138, + "step": 648 + }, + { + "epoch": 0.3563975837451949, + "grad_norm": 0.4787700955419951, + "learning_rate": 9.705298527249282e-07, + "loss": 0.3165, + "step": 649 + }, + { + "epoch": 0.35694673256452497, + "grad_norm": 0.6034763612698993, + "learning_rate": 9.7043153378379e-07, + "loss": 0.3168, + "step": 650 + }, + { + "epoch": 0.357495881383855, + "grad_norm": 0.581155600436397, + "learning_rate": 9.703330561062134e-07, + "loss": 0.3249, + "step": 651 + }, + { + "epoch": 0.35804503020318507, + "grad_norm": 0.4296014615750807, + "learning_rate": 9.702344197254315e-07, + "loss": 0.2664, + "step": 652 + }, + { + "epoch": 0.3585941790225151, + "grad_norm": 0.4297859552951615, + "learning_rate": 9.701356246747298e-07, + "loss": 0.2974, + "step": 653 + }, + { + "epoch": 0.35914332784184516, + "grad_norm": 0.3995106691189651, + "learning_rate": 9.700366709874486e-07, + "loss": 0.3457, + "step": 654 + }, + { + "epoch": 0.35969247666117515, + "grad_norm": 0.500550982356711, + "learning_rate": 9.699375586969807e-07, + "loss": 0.2624, + "step": 655 + }, + { + "epoch": 0.3602416254805052, + "grad_norm": 0.42314748190658086, + "learning_rate": 9.69838287836773e-07, + "loss": 0.2745, + "step": 656 + }, + { + "epoch": 0.36079077429983525, + "grad_norm": 0.4876435943431558, + "learning_rate": 9.697388584403256e-07, + "loss": 0.3278, + "step": 657 + }, + { + "epoch": 0.3613399231191653, + "grad_norm": 0.5284152926781756, + "learning_rate": 9.696392705411926e-07, + "loss": 0.3094, + "step": 658 + }, + { + "epoch": 0.36188907193849534, + "grad_norm": 0.4483408446657781, + "learning_rate": 9.69539524172981e-07, + "loss": 0.3084, + "step": 659 + }, + { + "epoch": 0.3624382207578254, + "grad_norm": 0.44898867507859125, + "learning_rate": 9.694396193693517e-07, + "loss": 0.2784, + "step": 660 + }, + { + "epoch": 0.3629873695771554, + "grad_norm": 0.5226977215584937, + "learning_rate": 9.693395561640185e-07, + "loss": 0.3361, + "step": 661 + }, + { + "epoch": 0.36353651839648543, + "grad_norm": 0.5608672121095176, + "learning_rate": 9.692393345907495e-07, + "loss": 0.3167, + "step": 662 + }, + { + "epoch": 0.3640856672158155, + "grad_norm": 0.5626838918802252, + "learning_rate": 9.691389546833655e-07, + "loss": 0.3298, + "step": 663 + }, + { + "epoch": 0.3646348160351455, + "grad_norm": 0.5047475049389516, + "learning_rate": 9.690384164757413e-07, + "loss": 0.3005, + "step": 664 + }, + { + "epoch": 0.3651839648544756, + "grad_norm": 0.3981343843323678, + "learning_rate": 9.689377200018044e-07, + "loss": 0.3219, + "step": 665 + }, + { + "epoch": 0.3657331136738056, + "grad_norm": 0.44119825489979947, + "learning_rate": 9.688368652955367e-07, + "loss": 0.2966, + "step": 666 + }, + { + "epoch": 0.3662822624931356, + "grad_norm": 0.5118289344134571, + "learning_rate": 9.687358523909724e-07, + "loss": 0.3063, + "step": 667 + }, + { + "epoch": 0.36683141131246566, + "grad_norm": 0.39175252246613007, + "learning_rate": 9.686346813222e-07, + "loss": 0.3028, + "step": 668 + }, + { + "epoch": 0.3673805601317957, + "grad_norm": 0.40019068400355734, + "learning_rate": 9.685333521233608e-07, + "loss": 0.2826, + "step": 669 + }, + { + "epoch": 0.36792970895112576, + "grad_norm": 0.37582466249933383, + "learning_rate": 9.6843186482865e-07, + "loss": 0.3016, + "step": 670 + }, + { + "epoch": 0.3684788577704558, + "grad_norm": 0.5268600748409547, + "learning_rate": 9.683302194723155e-07, + "loss": 0.3324, + "step": 671 + }, + { + "epoch": 0.36902800658978585, + "grad_norm": 0.5673301016874637, + "learning_rate": 9.68228416088659e-07, + "loss": 0.2707, + "step": 672 + }, + { + "epoch": 0.36957715540911584, + "grad_norm": 0.45456325584425716, + "learning_rate": 9.68126454712035e-07, + "loss": 0.2942, + "step": 673 + }, + { + "epoch": 0.3701263042284459, + "grad_norm": 0.5007265572559167, + "learning_rate": 9.680243353768525e-07, + "loss": 0.292, + "step": 674 + }, + { + "epoch": 0.37067545304777594, + "grad_norm": 0.46899432742924696, + "learning_rate": 9.679220581175725e-07, + "loss": 0.3011, + "step": 675 + }, + { + "epoch": 0.371224601867106, + "grad_norm": 0.49659269295043473, + "learning_rate": 9.678196229687098e-07, + "loss": 0.3087, + "step": 676 + }, + { + "epoch": 0.37177375068643603, + "grad_norm": 0.37607093840017963, + "learning_rate": 9.677170299648325e-07, + "loss": 0.3142, + "step": 677 + }, + { + "epoch": 0.3723228995057661, + "grad_norm": 0.4646101284343405, + "learning_rate": 9.67614279140562e-07, + "loss": 0.3097, + "step": 678 + }, + { + "epoch": 0.37287204832509613, + "grad_norm": 0.4163066954353121, + "learning_rate": 9.675113705305732e-07, + "loss": 0.3203, + "step": 679 + }, + { + "epoch": 0.3734211971444261, + "grad_norm": 0.5328637350114481, + "learning_rate": 9.674083041695935e-07, + "loss": 0.2683, + "step": 680 + }, + { + "epoch": 0.37397034596375617, + "grad_norm": 0.5380563053012897, + "learning_rate": 9.673050800924044e-07, + "loss": 0.2924, + "step": 681 + }, + { + "epoch": 0.3745194947830862, + "grad_norm": 0.5125953775604254, + "learning_rate": 9.672016983338397e-07, + "loss": 0.2721, + "step": 682 + }, + { + "epoch": 0.37506864360241626, + "grad_norm": 0.5062247634748299, + "learning_rate": 9.670981589287874e-07, + "loss": 0.2848, + "step": 683 + }, + { + "epoch": 0.3756177924217463, + "grad_norm": 0.4022672964926895, + "learning_rate": 9.669944619121884e-07, + "loss": 0.2706, + "step": 684 + }, + { + "epoch": 0.37616694124107636, + "grad_norm": 0.4973441997825326, + "learning_rate": 9.668906073190357e-07, + "loss": 0.2799, + "step": 685 + }, + { + "epoch": 0.37671609006040635, + "grad_norm": 0.5102836144006504, + "learning_rate": 9.667865951843774e-07, + "loss": 0.277, + "step": 686 + }, + { + "epoch": 0.3772652388797364, + "grad_norm": 0.48942983713029503, + "learning_rate": 9.666824255433135e-07, + "loss": 0.3191, + "step": 687 + }, + { + "epoch": 0.37781438769906645, + "grad_norm": 0.9668850905299259, + "learning_rate": 9.665780984309974e-07, + "loss": 0.3065, + "step": 688 + }, + { + "epoch": 0.3783635365183965, + "grad_norm": 0.5647393299084974, + "learning_rate": 9.66473613882635e-07, + "loss": 0.3321, + "step": 689 + }, + { + "epoch": 0.37891268533772654, + "grad_norm": 0.5229824452045684, + "learning_rate": 9.66368971933487e-07, + "loss": 0.3316, + "step": 690 + }, + { + "epoch": 0.3794618341570566, + "grad_norm": 0.49594910353265875, + "learning_rate": 9.662641726188658e-07, + "loss": 0.2816, + "step": 691 + }, + { + "epoch": 0.3800109829763866, + "grad_norm": 0.4255955812380013, + "learning_rate": 9.661592159741372e-07, + "loss": 0.3278, + "step": 692 + }, + { + "epoch": 0.3805601317957166, + "grad_norm": 0.5090976935406557, + "learning_rate": 9.6605410203472e-07, + "loss": 0.319, + "step": 693 + }, + { + "epoch": 0.3811092806150467, + "grad_norm": 0.5588716904428634, + "learning_rate": 9.659488308360868e-07, + "loss": 0.2722, + "step": 694 + }, + { + "epoch": 0.3816584294343767, + "grad_norm": 0.42069896911120797, + "learning_rate": 9.658434024137623e-07, + "loss": 0.318, + "step": 695 + }, + { + "epoch": 0.38220757825370677, + "grad_norm": 0.6003031262008706, + "learning_rate": 9.65737816803325e-07, + "loss": 0.3144, + "step": 696 + }, + { + "epoch": 0.3827567270730368, + "grad_norm": 0.42317790502935254, + "learning_rate": 9.65632074040406e-07, + "loss": 0.3242, + "step": 697 + }, + { + "epoch": 0.3833058758923668, + "grad_norm": 0.484015090721489, + "learning_rate": 9.655261741606898e-07, + "loss": 0.3168, + "step": 698 + }, + { + "epoch": 0.38385502471169686, + "grad_norm": 0.4891502973693406, + "learning_rate": 9.654201171999135e-07, + "loss": 0.2901, + "step": 699 + }, + { + "epoch": 0.3844041735310269, + "grad_norm": 0.6789623364281554, + "learning_rate": 9.653139031938674e-07, + "loss": 0.3461, + "step": 700 + }, + { + "epoch": 0.38495332235035695, + "grad_norm": 0.510233112331138, + "learning_rate": 9.652075321783948e-07, + "loss": 0.2606, + "step": 701 + }, + { + "epoch": 0.385502471169687, + "grad_norm": 0.4339892378573219, + "learning_rate": 9.65101004189392e-07, + "loss": 0.2764, + "step": 702 + }, + { + "epoch": 0.38605161998901705, + "grad_norm": 0.44546824152165365, + "learning_rate": 9.649943192628088e-07, + "loss": 0.303, + "step": 703 + }, + { + "epoch": 0.38660076880834704, + "grad_norm": 0.5077409772487453, + "learning_rate": 9.648874774346466e-07, + "loss": 0.2745, + "step": 704 + }, + { + "epoch": 0.3871499176276771, + "grad_norm": 0.5351817859128496, + "learning_rate": 9.647804787409609e-07, + "loss": 0.3215, + "step": 705 + }, + { + "epoch": 0.38769906644700713, + "grad_norm": 0.6654375581180654, + "learning_rate": 9.646733232178602e-07, + "loss": 0.3005, + "step": 706 + }, + { + "epoch": 0.3882482152663372, + "grad_norm": 0.47479590504591296, + "learning_rate": 9.64566010901505e-07, + "loss": 0.3026, + "step": 707 + }, + { + "epoch": 0.38879736408566723, + "grad_norm": 0.45767142869618566, + "learning_rate": 9.644585418281095e-07, + "loss": 0.3234, + "step": 708 + }, + { + "epoch": 0.3893465129049973, + "grad_norm": 0.5351000334146468, + "learning_rate": 9.643509160339405e-07, + "loss": 0.3178, + "step": 709 + }, + { + "epoch": 0.38989566172432727, + "grad_norm": 0.5491196154093239, + "learning_rate": 9.642431335553179e-07, + "loss": 0.3036, + "step": 710 + }, + { + "epoch": 0.3904448105436573, + "grad_norm": 0.4953373720579792, + "learning_rate": 9.641351944286141e-07, + "loss": 0.309, + "step": 711 + }, + { + "epoch": 0.39099395936298736, + "grad_norm": 0.529089999932197, + "learning_rate": 9.640270986902546e-07, + "loss": 0.2937, + "step": 712 + }, + { + "epoch": 0.3915431081823174, + "grad_norm": 0.5021246446188063, + "learning_rate": 9.639188463767179e-07, + "loss": 0.2891, + "step": 713 + }, + { + "epoch": 0.39209225700164746, + "grad_norm": 0.509633641940779, + "learning_rate": 9.638104375245352e-07, + "loss": 0.2892, + "step": 714 + }, + { + "epoch": 0.3926414058209775, + "grad_norm": 0.4862639914462522, + "learning_rate": 9.6370187217029e-07, + "loss": 0.3117, + "step": 715 + }, + { + "epoch": 0.3931905546403075, + "grad_norm": 0.446018700976694, + "learning_rate": 9.635931503506197e-07, + "loss": 0.2978, + "step": 716 + }, + { + "epoch": 0.39373970345963755, + "grad_norm": 0.539543136781025, + "learning_rate": 9.634842721022135e-07, + "loss": 0.3292, + "step": 717 + }, + { + "epoch": 0.3942888522789676, + "grad_norm": 0.49828782521512316, + "learning_rate": 9.63375237461814e-07, + "loss": 0.2967, + "step": 718 + }, + { + "epoch": 0.39483800109829764, + "grad_norm": 0.480059502587545, + "learning_rate": 9.632660464662165e-07, + "loss": 0.3162, + "step": 719 + }, + { + "epoch": 0.3953871499176277, + "grad_norm": 0.43445689864749637, + "learning_rate": 9.631566991522687e-07, + "loss": 0.2881, + "step": 720 + }, + { + "epoch": 0.39593629873695774, + "grad_norm": 0.4342381150865927, + "learning_rate": 9.630471955568714e-07, + "loss": 0.2808, + "step": 721 + }, + { + "epoch": 0.39648544755628773, + "grad_norm": 0.4536683984099135, + "learning_rate": 9.629375357169778e-07, + "loss": 0.2769, + "step": 722 + }, + { + "epoch": 0.3970345963756178, + "grad_norm": 0.4446486023148765, + "learning_rate": 9.628277196695944e-07, + "loss": 0.2648, + "step": 723 + }, + { + "epoch": 0.3975837451949478, + "grad_norm": 0.5346333487867275, + "learning_rate": 9.627177474517799e-07, + "loss": 0.3142, + "step": 724 + }, + { + "epoch": 0.39813289401427787, + "grad_norm": 0.5358051956974806, + "learning_rate": 9.62607619100646e-07, + "loss": 0.3208, + "step": 725 + }, + { + "epoch": 0.3986820428336079, + "grad_norm": 0.5732398102241596, + "learning_rate": 9.624973346533567e-07, + "loss": 0.323, + "step": 726 + }, + { + "epoch": 0.39923119165293797, + "grad_norm": 0.48547846323551097, + "learning_rate": 9.623868941471292e-07, + "loss": 0.3343, + "step": 727 + }, + { + "epoch": 0.39978034047226796, + "grad_norm": 1.2687221970572045, + "learning_rate": 9.62276297619233e-07, + "loss": 0.4866, + "step": 728 + }, + { + "epoch": 0.400329489291598, + "grad_norm": 0.48601698432633017, + "learning_rate": 9.621655451069901e-07, + "loss": 0.281, + "step": 729 + }, + { + "epoch": 0.40087863811092805, + "grad_norm": 0.6849417587452975, + "learning_rate": 9.620546366477761e-07, + "loss": 0.2849, + "step": 730 + }, + { + "epoch": 0.4014277869302581, + "grad_norm": 0.4816077181452332, + "learning_rate": 9.619435722790179e-07, + "loss": 0.2682, + "step": 731 + }, + { + "epoch": 0.40197693574958815, + "grad_norm": 0.4787268819031274, + "learning_rate": 9.618323520381958e-07, + "loss": 0.2727, + "step": 732 + }, + { + "epoch": 0.4025260845689182, + "grad_norm": 0.5491644244679555, + "learning_rate": 9.617209759628423e-07, + "loss": 0.3453, + "step": 733 + }, + { + "epoch": 0.4030752333882482, + "grad_norm": 0.512411114839357, + "learning_rate": 9.61609444090543e-07, + "loss": 0.3192, + "step": 734 + }, + { + "epoch": 0.40362438220757824, + "grad_norm": 0.6185218965870894, + "learning_rate": 9.61497756458936e-07, + "loss": 0.3049, + "step": 735 + }, + { + "epoch": 0.4041735310269083, + "grad_norm": 0.568231175264156, + "learning_rate": 9.613859131057113e-07, + "loss": 0.3052, + "step": 736 + }, + { + "epoch": 0.40472267984623833, + "grad_norm": 0.6400632494660251, + "learning_rate": 9.612739140686123e-07, + "loss": 0.3194, + "step": 737 + }, + { + "epoch": 0.4052718286655684, + "grad_norm": 1.0806023635161741, + "learning_rate": 9.611617593854342e-07, + "loss": 0.3057, + "step": 738 + }, + { + "epoch": 0.4058209774848984, + "grad_norm": 0.6063220373539205, + "learning_rate": 9.610494490940252e-07, + "loss": 0.3173, + "step": 739 + }, + { + "epoch": 0.4063701263042284, + "grad_norm": 0.4843971420219079, + "learning_rate": 9.609369832322859e-07, + "loss": 0.2956, + "step": 740 + }, + { + "epoch": 0.40691927512355847, + "grad_norm": 0.4905855464007312, + "learning_rate": 9.608243618381695e-07, + "loss": 0.2789, + "step": 741 + }, + { + "epoch": 0.4074684239428885, + "grad_norm": 0.7025421677093987, + "learning_rate": 9.607115849496815e-07, + "loss": 0.2977, + "step": 742 + }, + { + "epoch": 0.40801757276221856, + "grad_norm": 0.5256562409723281, + "learning_rate": 9.605986526048801e-07, + "loss": 0.3047, + "step": 743 + }, + { + "epoch": 0.4085667215815486, + "grad_norm": 0.4493202041470104, + "learning_rate": 9.604855648418757e-07, + "loss": 0.3122, + "step": 744 + }, + { + "epoch": 0.40911587040087866, + "grad_norm": 0.4457684561463568, + "learning_rate": 9.603723216988308e-07, + "loss": 0.2819, + "step": 745 + }, + { + "epoch": 0.4096650192202087, + "grad_norm": 0.3818819244872842, + "learning_rate": 9.602589232139615e-07, + "loss": 0.3033, + "step": 746 + }, + { + "epoch": 0.4102141680395387, + "grad_norm": 0.5439363449127944, + "learning_rate": 9.60145369425535e-07, + "loss": 0.2589, + "step": 747 + }, + { + "epoch": 0.41076331685886874, + "grad_norm": 0.6063742306547529, + "learning_rate": 9.60031660371872e-07, + "loss": 0.3205, + "step": 748 + }, + { + "epoch": 0.4113124656781988, + "grad_norm": 0.4810223155119117, + "learning_rate": 9.599177960913448e-07, + "loss": 0.3043, + "step": 749 + }, + { + "epoch": 0.41186161449752884, + "grad_norm": 0.6567690969443539, + "learning_rate": 9.598037766223787e-07, + "loss": 0.3811, + "step": 750 + }, + { + "epoch": 0.4124107633168589, + "grad_norm": 0.396378898797153, + "learning_rate": 9.596896020034507e-07, + "loss": 0.2467, + "step": 751 + }, + { + "epoch": 0.41295991213618893, + "grad_norm": 0.4171907276326462, + "learning_rate": 9.595752722730908e-07, + "loss": 0.2909, + "step": 752 + }, + { + "epoch": 0.4135090609555189, + "grad_norm": 0.4366953949450168, + "learning_rate": 9.594607874698812e-07, + "loss": 0.2808, + "step": 753 + }, + { + "epoch": 0.414058209774849, + "grad_norm": 0.5098541784738473, + "learning_rate": 9.593461476324559e-07, + "loss": 0.2941, + "step": 754 + }, + { + "epoch": 0.414607358594179, + "grad_norm": 0.5771167137620298, + "learning_rate": 9.592313527995018e-07, + "loss": 0.3404, + "step": 755 + }, + { + "epoch": 0.41515650741350907, + "grad_norm": 0.4122083489176511, + "learning_rate": 9.59116403009758e-07, + "loss": 0.3289, + "step": 756 + }, + { + "epoch": 0.4157056562328391, + "grad_norm": 0.576141910264704, + "learning_rate": 9.590012983020156e-07, + "loss": 0.3254, + "step": 757 + }, + { + "epoch": 0.41625480505216916, + "grad_norm": 0.4156118943499862, + "learning_rate": 9.588860387151186e-07, + "loss": 0.303, + "step": 758 + }, + { + "epoch": 0.41680395387149916, + "grad_norm": 0.4149677926920236, + "learning_rate": 9.587706242879626e-07, + "loss": 0.2428, + "step": 759 + }, + { + "epoch": 0.4173531026908292, + "grad_norm": 0.46849318941518386, + "learning_rate": 9.586550550594957e-07, + "loss": 0.3186, + "step": 760 + }, + { + "epoch": 0.41790225151015925, + "grad_norm": 0.577918250208169, + "learning_rate": 9.585393310687184e-07, + "loss": 0.3931, + "step": 761 + }, + { + "epoch": 0.4184514003294893, + "grad_norm": 0.4835356500718274, + "learning_rate": 9.58423452354683e-07, + "loss": 0.2647, + "step": 762 + }, + { + "epoch": 0.41900054914881935, + "grad_norm": 0.7029114169052276, + "learning_rate": 9.583074189564946e-07, + "loss": 0.2908, + "step": 763 + }, + { + "epoch": 0.4195496979681494, + "grad_norm": 0.4504448045494503, + "learning_rate": 9.5819123091331e-07, + "loss": 0.2758, + "step": 764 + }, + { + "epoch": 0.4200988467874794, + "grad_norm": 0.4329513902572511, + "learning_rate": 9.580748882643386e-07, + "loss": 0.3147, + "step": 765 + }, + { + "epoch": 0.42064799560680943, + "grad_norm": 0.44771741448741187, + "learning_rate": 9.579583910488415e-07, + "loss": 0.2814, + "step": 766 + }, + { + "epoch": 0.4211971444261395, + "grad_norm": 0.46685480671831064, + "learning_rate": 9.578417393061326e-07, + "loss": 0.3293, + "step": 767 + }, + { + "epoch": 0.42174629324546953, + "grad_norm": 0.48456073285859785, + "learning_rate": 9.57724933075577e-07, + "loss": 0.3201, + "step": 768 + }, + { + "epoch": 0.4222954420647996, + "grad_norm": 0.4133689638287976, + "learning_rate": 9.57607972396593e-07, + "loss": 0.3207, + "step": 769 + }, + { + "epoch": 0.4228445908841296, + "grad_norm": 0.5014614687785577, + "learning_rate": 9.5749085730865e-07, + "loss": 0.3088, + "step": 770 + }, + { + "epoch": 0.4233937397034596, + "grad_norm": 0.4207848646541183, + "learning_rate": 9.573735878512708e-07, + "loss": 0.2839, + "step": 771 + }, + { + "epoch": 0.42394288852278966, + "grad_norm": 0.36226359304686095, + "learning_rate": 9.572561640640286e-07, + "loss": 0.285, + "step": 772 + }, + { + "epoch": 0.4244920373421197, + "grad_norm": 0.5327078794794589, + "learning_rate": 9.571385859865505e-07, + "loss": 0.2801, + "step": 773 + }, + { + "epoch": 0.42504118616144976, + "grad_norm": 0.5065927036765948, + "learning_rate": 9.57020853658514e-07, + "loss": 0.3097, + "step": 774 + }, + { + "epoch": 0.4255903349807798, + "grad_norm": 0.5327545308558694, + "learning_rate": 9.5690296711965e-07, + "loss": 0.3258, + "step": 775 + }, + { + "epoch": 0.42613948380010985, + "grad_norm": 0.41681155903373474, + "learning_rate": 9.567849264097408e-07, + "loss": 0.3346, + "step": 776 + }, + { + "epoch": 0.42668863261943984, + "grad_norm": 0.41704633743883057, + "learning_rate": 9.566667315686204e-07, + "loss": 0.2989, + "step": 777 + }, + { + "epoch": 0.4272377814387699, + "grad_norm": 0.5319763283008776, + "learning_rate": 9.565483826361754e-07, + "loss": 0.2987, + "step": 778 + }, + { + "epoch": 0.42778693025809994, + "grad_norm": 0.4842008761674137, + "learning_rate": 9.564298796523443e-07, + "loss": 0.3076, + "step": 779 + }, + { + "epoch": 0.42833607907743, + "grad_norm": 0.45192872305885196, + "learning_rate": 9.563112226571173e-07, + "loss": 0.2882, + "step": 780 + }, + { + "epoch": 0.42888522789676004, + "grad_norm": 0.41908768948092573, + "learning_rate": 9.56192411690537e-07, + "loss": 0.2961, + "step": 781 + }, + { + "epoch": 0.4294343767160901, + "grad_norm": 0.5179482366434025, + "learning_rate": 9.560734467926975e-07, + "loss": 0.3037, + "step": 782 + }, + { + "epoch": 0.4299835255354201, + "grad_norm": 0.4916582966779928, + "learning_rate": 9.559543280037453e-07, + "loss": 0.29, + "step": 783 + }, + { + "epoch": 0.4305326743547501, + "grad_norm": 0.5700432679096007, + "learning_rate": 9.558350553638786e-07, + "loss": 0.294, + "step": 784 + }, + { + "epoch": 0.43108182317408017, + "grad_norm": 0.5839314684230201, + "learning_rate": 9.557156289133473e-07, + "loss": 0.2831, + "step": 785 + }, + { + "epoch": 0.4316309719934102, + "grad_norm": 0.48557340618273365, + "learning_rate": 9.555960486924535e-07, + "loss": 0.3188, + "step": 786 + }, + { + "epoch": 0.43218012081274026, + "grad_norm": 0.4929418875926838, + "learning_rate": 9.554763147415511e-07, + "loss": 0.3325, + "step": 787 + }, + { + "epoch": 0.4327292696320703, + "grad_norm": 0.3809242223425811, + "learning_rate": 9.553564271010462e-07, + "loss": 0.282, + "step": 788 + }, + { + "epoch": 0.4332784184514003, + "grad_norm": 0.45802986076725805, + "learning_rate": 9.55236385811396e-07, + "loss": 0.2655, + "step": 789 + }, + { + "epoch": 0.43382756727073035, + "grad_norm": 0.4024472187473021, + "learning_rate": 9.551161909131102e-07, + "loss": 0.302, + "step": 790 + }, + { + "epoch": 0.4343767160900604, + "grad_norm": 0.4529326437615863, + "learning_rate": 9.549958424467503e-07, + "loss": 0.3127, + "step": 791 + }, + { + "epoch": 0.43492586490939045, + "grad_norm": 0.5094681766059965, + "learning_rate": 9.548753404529291e-07, + "loss": 0.2948, + "step": 792 + }, + { + "epoch": 0.4354750137287205, + "grad_norm": 0.4079607057950661, + "learning_rate": 9.547546849723122e-07, + "loss": 0.2973, + "step": 793 + }, + { + "epoch": 0.43602416254805054, + "grad_norm": 0.4843818222476831, + "learning_rate": 9.546338760456157e-07, + "loss": 0.29, + "step": 794 + }, + { + "epoch": 0.43657331136738053, + "grad_norm": 0.6200251252974922, + "learning_rate": 9.545129137136088e-07, + "loss": 0.326, + "step": 795 + }, + { + "epoch": 0.4371224601867106, + "grad_norm": 0.5148061588557831, + "learning_rate": 9.543917980171111e-07, + "loss": 0.2919, + "step": 796 + }, + { + "epoch": 0.43767160900604063, + "grad_norm": 0.4647719726184225, + "learning_rate": 9.542705289969954e-07, + "loss": 0.2861, + "step": 797 + }, + { + "epoch": 0.4382207578253707, + "grad_norm": 0.5207409513182187, + "learning_rate": 9.541491066941852e-07, + "loss": 0.3313, + "step": 798 + }, + { + "epoch": 0.4387699066447007, + "grad_norm": 0.3997787911557919, + "learning_rate": 9.54027531149656e-07, + "loss": 0.2881, + "step": 799 + }, + { + "epoch": 0.43931905546403077, + "grad_norm": 0.6069803871029724, + "learning_rate": 9.539058024044351e-07, + "loss": 0.2835, + "step": 800 + }, + { + "epoch": 0.43931905546403077, + "eval_loss": 0.3838886320590973, + "eval_runtime": 20.625, + "eval_samples_per_second": 21.479, + "eval_steps_per_second": 0.921, + "step": 800 + }, + { + "epoch": 0.43986820428336076, + "grad_norm": 0.5217888186382518, + "learning_rate": 9.537839204996016e-07, + "loss": 0.3011, + "step": 801 + }, + { + "epoch": 0.4404173531026908, + "grad_norm": 0.43913261925358565, + "learning_rate": 9.53661885476286e-07, + "loss": 0.2978, + "step": 802 + }, + { + "epoch": 0.44096650192202086, + "grad_norm": 0.5022079288969393, + "learning_rate": 9.535396973756706e-07, + "loss": 0.278, + "step": 803 + }, + { + "epoch": 0.4415156507413509, + "grad_norm": 0.35775508016281304, + "learning_rate": 9.534173562389896e-07, + "loss": 0.3043, + "step": 804 + }, + { + "epoch": 0.44206479956068095, + "grad_norm": 0.4096278279566925, + "learning_rate": 9.532948621075284e-07, + "loss": 0.3128, + "step": 805 + }, + { + "epoch": 0.442613948380011, + "grad_norm": 0.4079260738482481, + "learning_rate": 9.531722150226246e-07, + "loss": 0.2912, + "step": 806 + }, + { + "epoch": 0.443163097199341, + "grad_norm": 0.4919102979709681, + "learning_rate": 9.530494150256666e-07, + "loss": 0.3205, + "step": 807 + }, + { + "epoch": 0.44371224601867104, + "grad_norm": 0.4988189752462686, + "learning_rate": 9.529264621580951e-07, + "loss": 0.2765, + "step": 808 + }, + { + "epoch": 0.4442613948380011, + "grad_norm": 0.6162969675302271, + "learning_rate": 9.528033564614021e-07, + "loss": 0.3293, + "step": 809 + }, + { + "epoch": 0.44481054365733114, + "grad_norm": 0.4048799307883794, + "learning_rate": 9.526800979771314e-07, + "loss": 0.3248, + "step": 810 + }, + { + "epoch": 0.4453596924766612, + "grad_norm": 0.5073837004555553, + "learning_rate": 9.525566867468781e-07, + "loss": 0.2901, + "step": 811 + }, + { + "epoch": 0.44590884129599123, + "grad_norm": 0.5405585042735882, + "learning_rate": 9.524331228122888e-07, + "loss": 0.3353, + "step": 812 + }, + { + "epoch": 0.4464579901153213, + "grad_norm": 0.620403327298711, + "learning_rate": 9.523094062150621e-07, + "loss": 0.284, + "step": 813 + }, + { + "epoch": 0.44700713893465127, + "grad_norm": 0.5736180586292634, + "learning_rate": 9.521855369969475e-07, + "loss": 0.333, + "step": 814 + }, + { + "epoch": 0.4475562877539813, + "grad_norm": 0.510605890936949, + "learning_rate": 9.520615151997465e-07, + "loss": 0.2819, + "step": 815 + }, + { + "epoch": 0.44810543657331137, + "grad_norm": 0.49143475086367644, + "learning_rate": 9.519373408653117e-07, + "loss": 0.2391, + "step": 816 + }, + { + "epoch": 0.4486545853926414, + "grad_norm": 0.45201489835598707, + "learning_rate": 9.518130140355475e-07, + "loss": 0.2885, + "step": 817 + }, + { + "epoch": 0.44920373421197146, + "grad_norm": 0.573725064161955, + "learning_rate": 9.516885347524095e-07, + "loss": 0.2839, + "step": 818 + }, + { + "epoch": 0.4497528830313015, + "grad_norm": 0.47386243797489513, + "learning_rate": 9.51563903057905e-07, + "loss": 0.2696, + "step": 819 + }, + { + "epoch": 0.4503020318506315, + "grad_norm": 0.47205463916067886, + "learning_rate": 9.514391189940926e-07, + "loss": 0.2523, + "step": 820 + }, + { + "epoch": 0.45085118066996155, + "grad_norm": 0.45044665068911605, + "learning_rate": 9.513141826030823e-07, + "loss": 0.3123, + "step": 821 + }, + { + "epoch": 0.4514003294892916, + "grad_norm": 0.4158611609735077, + "learning_rate": 9.511890939270353e-07, + "loss": 0.2925, + "step": 822 + }, + { + "epoch": 0.45194947830862164, + "grad_norm": 0.5223967234947888, + "learning_rate": 9.510638530081648e-07, + "loss": 0.2975, + "step": 823 + }, + { + "epoch": 0.4524986271279517, + "grad_norm": 0.5843315058304418, + "learning_rate": 9.509384598887347e-07, + "loss": 0.2739, + "step": 824 + }, + { + "epoch": 0.45304777594728174, + "grad_norm": 0.6273015607567209, + "learning_rate": 9.50812914611061e-07, + "loss": 0.2956, + "step": 825 + }, + { + "epoch": 0.45359692476661173, + "grad_norm": 0.5026457859686514, + "learning_rate": 9.506872172175101e-07, + "loss": 0.3232, + "step": 826 + }, + { + "epoch": 0.4541460735859418, + "grad_norm": 0.5180912442648508, + "learning_rate": 9.505613677505003e-07, + "loss": 0.297, + "step": 827 + }, + { + "epoch": 0.4546952224052718, + "grad_norm": 0.45552996393675466, + "learning_rate": 9.504353662525014e-07, + "loss": 0.2965, + "step": 828 + }, + { + "epoch": 0.4552443712246019, + "grad_norm": 0.42650174704417904, + "learning_rate": 9.503092127660342e-07, + "loss": 0.2819, + "step": 829 + }, + { + "epoch": 0.4557935200439319, + "grad_norm": 0.3817615029385988, + "learning_rate": 9.501829073336708e-07, + "loss": 0.3229, + "step": 830 + }, + { + "epoch": 0.45634266886326197, + "grad_norm": 0.493972109103768, + "learning_rate": 9.500564499980347e-07, + "loss": 0.2791, + "step": 831 + }, + { + "epoch": 0.45689181768259196, + "grad_norm": 0.4867861449067947, + "learning_rate": 9.499298408018004e-07, + "loss": 0.3061, + "step": 832 + }, + { + "epoch": 0.457440966501922, + "grad_norm": 0.41619730088568113, + "learning_rate": 9.498030797876939e-07, + "loss": 0.254, + "step": 833 + }, + { + "epoch": 0.45799011532125206, + "grad_norm": 0.536376759166917, + "learning_rate": 9.496761669984926e-07, + "loss": 0.3137, + "step": 834 + }, + { + "epoch": 0.4585392641405821, + "grad_norm": 0.46274798693893776, + "learning_rate": 9.495491024770246e-07, + "loss": 0.2837, + "step": 835 + }, + { + "epoch": 0.45908841295991215, + "grad_norm": 0.39714810413306906, + "learning_rate": 9.494218862661698e-07, + "loss": 0.2952, + "step": 836 + }, + { + "epoch": 0.4596375617792422, + "grad_norm": 0.5090674507672917, + "learning_rate": 9.492945184088585e-07, + "loss": 0.2747, + "step": 837 + }, + { + "epoch": 0.4601867105985722, + "grad_norm": 0.452786430103756, + "learning_rate": 9.491669989480734e-07, + "loss": 0.2996, + "step": 838 + }, + { + "epoch": 0.46073585941790224, + "grad_norm": 0.6145939049047441, + "learning_rate": 9.490393279268469e-07, + "loss": 0.3071, + "step": 839 + }, + { + "epoch": 0.4612850082372323, + "grad_norm": 0.4456318375354344, + "learning_rate": 9.489115053882636e-07, + "loss": 0.3, + "step": 840 + }, + { + "epoch": 0.46183415705656233, + "grad_norm": 0.4978485181987157, + "learning_rate": 9.487835313754589e-07, + "loss": 0.2721, + "step": 841 + }, + { + "epoch": 0.4623833058758924, + "grad_norm": 0.6289211533363681, + "learning_rate": 9.486554059316193e-07, + "loss": 0.2631, + "step": 842 + }, + { + "epoch": 0.46293245469522243, + "grad_norm": 0.4062196482018521, + "learning_rate": 9.485271290999822e-07, + "loss": 0.2851, + "step": 843 + }, + { + "epoch": 0.4634816035145524, + "grad_norm": 0.45900569659721774, + "learning_rate": 9.483987009238366e-07, + "loss": 0.284, + "step": 844 + }, + { + "epoch": 0.46403075233388247, + "grad_norm": 0.4263527025139981, + "learning_rate": 9.482701214465223e-07, + "loss": 0.2828, + "step": 845 + }, + { + "epoch": 0.4645799011532125, + "grad_norm": 0.45434760533612745, + "learning_rate": 9.481413907114298e-07, + "loss": 0.262, + "step": 846 + }, + { + "epoch": 0.46512904997254256, + "grad_norm": 0.4728893975674042, + "learning_rate": 9.480125087620013e-07, + "loss": 0.2823, + "step": 847 + }, + { + "epoch": 0.4656781987918726, + "grad_norm": 0.5007876177507758, + "learning_rate": 9.478834756417297e-07, + "loss": 0.3191, + "step": 848 + }, + { + "epoch": 0.46622734761120266, + "grad_norm": 0.5662662290264328, + "learning_rate": 9.477542913941587e-07, + "loss": 0.3078, + "step": 849 + }, + { + "epoch": 0.46677649643053265, + "grad_norm": 0.5068252855750915, + "learning_rate": 9.476249560628831e-07, + "loss": 0.3392, + "step": 850 + }, + { + "epoch": 0.4673256452498627, + "grad_norm": 0.4912610553396206, + "learning_rate": 9.474954696915494e-07, + "loss": 0.3353, + "step": 851 + }, + { + "epoch": 0.46787479406919275, + "grad_norm": 0.4954243974910444, + "learning_rate": 9.473658323238539e-07, + "loss": 0.257, + "step": 852 + }, + { + "epoch": 0.4684239428885228, + "grad_norm": 0.5739821658616234, + "learning_rate": 9.472360440035448e-07, + "loss": 0.3647, + "step": 853 + }, + { + "epoch": 0.46897309170785284, + "grad_norm": 0.4583902609163127, + "learning_rate": 9.471061047744207e-07, + "loss": 0.2645, + "step": 854 + }, + { + "epoch": 0.4695222405271829, + "grad_norm": 0.45603431744449624, + "learning_rate": 9.469760146803315e-07, + "loss": 0.2948, + "step": 855 + }, + { + "epoch": 0.4700713893465129, + "grad_norm": 0.43730155755645916, + "learning_rate": 9.468457737651775e-07, + "loss": 0.3249, + "step": 856 + }, + { + "epoch": 0.4706205381658429, + "grad_norm": 0.4450165051325497, + "learning_rate": 9.467153820729103e-07, + "loss": 0.2847, + "step": 857 + }, + { + "epoch": 0.471169686985173, + "grad_norm": 0.3873009851367693, + "learning_rate": 9.465848396475326e-07, + "loss": 0.3554, + "step": 858 + }, + { + "epoch": 0.471718835804503, + "grad_norm": 0.44400554875399245, + "learning_rate": 9.464541465330972e-07, + "loss": 0.292, + "step": 859 + }, + { + "epoch": 0.47226798462383307, + "grad_norm": 0.39940863324033565, + "learning_rate": 9.463233027737086e-07, + "loss": 0.3036, + "step": 860 + }, + { + "epoch": 0.4728171334431631, + "grad_norm": 0.4563234904534277, + "learning_rate": 9.461923084135215e-07, + "loss": 0.2964, + "step": 861 + }, + { + "epoch": 0.4733662822624931, + "grad_norm": 0.6470415644215131, + "learning_rate": 9.460611634967417e-07, + "loss": 0.2917, + "step": 862 + }, + { + "epoch": 0.47391543108182316, + "grad_norm": 0.5910395927439237, + "learning_rate": 9.45929868067626e-07, + "loss": 0.2868, + "step": 863 + }, + { + "epoch": 0.4744645799011532, + "grad_norm": 0.7232206855561645, + "learning_rate": 9.457984221704815e-07, + "loss": 0.3685, + "step": 864 + }, + { + "epoch": 0.47501372872048325, + "grad_norm": 0.4846117391078469, + "learning_rate": 9.456668258496663e-07, + "loss": 0.3112, + "step": 865 + }, + { + "epoch": 0.4755628775398133, + "grad_norm": 0.5525715056574515, + "learning_rate": 9.455350791495896e-07, + "loss": 0.3139, + "step": 866 + }, + { + "epoch": 0.47611202635914335, + "grad_norm": 0.7880040011238945, + "learning_rate": 9.454031821147109e-07, + "loss": 0.2924, + "step": 867 + }, + { + "epoch": 0.47666117517847334, + "grad_norm": 0.5151840132964554, + "learning_rate": 9.452711347895407e-07, + "loss": 0.3901, + "step": 868 + }, + { + "epoch": 0.4772103239978034, + "grad_norm": 0.4040778589954771, + "learning_rate": 9.451389372186399e-07, + "loss": 0.2988, + "step": 869 + }, + { + "epoch": 0.47775947281713343, + "grad_norm": 0.4587348256162564, + "learning_rate": 9.450065894466205e-07, + "loss": 0.3157, + "step": 870 + }, + { + "epoch": 0.4783086216364635, + "grad_norm": 0.43821097109002627, + "learning_rate": 9.448740915181448e-07, + "loss": 0.2581, + "step": 871 + }, + { + "epoch": 0.47885777045579353, + "grad_norm": 0.4609381652044799, + "learning_rate": 9.447414434779262e-07, + "loss": 0.2566, + "step": 872 + }, + { + "epoch": 0.4794069192751236, + "grad_norm": 0.561922576075349, + "learning_rate": 9.446086453707285e-07, + "loss": 0.3319, + "step": 873 + }, + { + "epoch": 0.47995606809445357, + "grad_norm": 0.46019066930117003, + "learning_rate": 9.44475697241366e-07, + "loss": 0.238, + "step": 874 + }, + { + "epoch": 0.4805052169137836, + "grad_norm": 0.40837440613786424, + "learning_rate": 9.443425991347038e-07, + "loss": 0.3039, + "step": 875 + }, + { + "epoch": 0.48105436573311366, + "grad_norm": 0.4350413843552751, + "learning_rate": 9.442093510956578e-07, + "loss": 0.296, + "step": 876 + }, + { + "epoch": 0.4816035145524437, + "grad_norm": 0.6111074790447228, + "learning_rate": 9.440759531691941e-07, + "loss": 0.2763, + "step": 877 + }, + { + "epoch": 0.48215266337177376, + "grad_norm": 0.43519180562596144, + "learning_rate": 9.439424054003296e-07, + "loss": 0.2751, + "step": 878 + }, + { + "epoch": 0.4827018121911038, + "grad_norm": 0.4598670249779719, + "learning_rate": 9.438087078341321e-07, + "loss": 0.2993, + "step": 879 + }, + { + "epoch": 0.48325096101043385, + "grad_norm": 0.47378295023804284, + "learning_rate": 9.436748605157192e-07, + "loss": 0.3243, + "step": 880 + }, + { + "epoch": 0.48380010982976385, + "grad_norm": 0.4461236631512308, + "learning_rate": 9.435408634902595e-07, + "loss": 0.2682, + "step": 881 + }, + { + "epoch": 0.4843492586490939, + "grad_norm": 0.7920326725183295, + "learning_rate": 9.434067168029721e-07, + "loss": 0.2738, + "step": 882 + }, + { + "epoch": 0.48489840746842394, + "grad_norm": 0.44157805064535044, + "learning_rate": 9.432724204991268e-07, + "loss": 0.2692, + "step": 883 + }, + { + "epoch": 0.485447556287754, + "grad_norm": 0.5442212924937905, + "learning_rate": 9.431379746240433e-07, + "loss": 0.3247, + "step": 884 + }, + { + "epoch": 0.48599670510708404, + "grad_norm": 0.4581860021645979, + "learning_rate": 9.430033792230924e-07, + "loss": 0.266, + "step": 885 + }, + { + "epoch": 0.4865458539264141, + "grad_norm": 0.5049337337515573, + "learning_rate": 9.428686343416948e-07, + "loss": 0.2949, + "step": 886 + }, + { + "epoch": 0.4870950027457441, + "grad_norm": 0.45948676235725366, + "learning_rate": 9.427337400253222e-07, + "loss": 0.246, + "step": 887 + }, + { + "epoch": 0.4876441515650741, + "grad_norm": 0.4014316553566119, + "learning_rate": 9.425986963194964e-07, + "loss": 0.3193, + "step": 888 + }, + { + "epoch": 0.48819330038440417, + "grad_norm": 0.466582939409045, + "learning_rate": 9.424635032697897e-07, + "loss": 0.299, + "step": 889 + }, + { + "epoch": 0.4887424492037342, + "grad_norm": 0.45486647734910446, + "learning_rate": 9.423281609218244e-07, + "loss": 0.3037, + "step": 890 + }, + { + "epoch": 0.48929159802306427, + "grad_norm": 0.45736700020323157, + "learning_rate": 9.421926693212741e-07, + "loss": 0.2934, + "step": 891 + }, + { + "epoch": 0.4898407468423943, + "grad_norm": 0.46611243538903646, + "learning_rate": 9.420570285138622e-07, + "loss": 0.2685, + "step": 892 + }, + { + "epoch": 0.4903898956617243, + "grad_norm": 0.6729260072812459, + "learning_rate": 9.41921238545362e-07, + "loss": 0.3409, + "step": 893 + }, + { + "epoch": 0.49093904448105435, + "grad_norm": 0.5186531107833405, + "learning_rate": 9.417852994615979e-07, + "loss": 0.3485, + "step": 894 + }, + { + "epoch": 0.4914881933003844, + "grad_norm": 0.4523439174163831, + "learning_rate": 9.416492113084443e-07, + "loss": 0.2894, + "step": 895 + }, + { + "epoch": 0.49203734211971445, + "grad_norm": 0.5607308198942792, + "learning_rate": 9.41512974131826e-07, + "loss": 0.2976, + "step": 896 + }, + { + "epoch": 0.4925864909390445, + "grad_norm": 0.4997124535210658, + "learning_rate": 9.413765879777182e-07, + "loss": 0.3161, + "step": 897 + }, + { + "epoch": 0.49313563975837454, + "grad_norm": 0.5930640137470536, + "learning_rate": 9.412400528921457e-07, + "loss": 0.2949, + "step": 898 + }, + { + "epoch": 0.49368478857770454, + "grad_norm": 0.40320785859497194, + "learning_rate": 9.411033689211843e-07, + "loss": 0.2995, + "step": 899 + }, + { + "epoch": 0.4942339373970346, + "grad_norm": 0.42578648193220026, + "learning_rate": 9.4096653611096e-07, + "loss": 0.2967, + "step": 900 + }, + { + "epoch": 0.49478308621636463, + "grad_norm": 0.5942916136732092, + "learning_rate": 9.408295545076487e-07, + "loss": 0.3156, + "step": 901 + }, + { + "epoch": 0.4953322350356947, + "grad_norm": 0.5386438289462784, + "learning_rate": 9.406924241574767e-07, + "loss": 0.3137, + "step": 902 + }, + { + "epoch": 0.4958813838550247, + "grad_norm": 0.5022407514695376, + "learning_rate": 9.405551451067201e-07, + "loss": 0.2902, + "step": 903 + }, + { + "epoch": 0.4964305326743548, + "grad_norm": 0.4962441291250635, + "learning_rate": 9.404177174017059e-07, + "loss": 0.3033, + "step": 904 + }, + { + "epoch": 0.49697968149368477, + "grad_norm": 0.49621231233036023, + "learning_rate": 9.402801410888109e-07, + "loss": 0.2938, + "step": 905 + }, + { + "epoch": 0.4975288303130148, + "grad_norm": 0.49731826368397647, + "learning_rate": 9.401424162144617e-07, + "loss": 0.3045, + "step": 906 + }, + { + "epoch": 0.49807797913234486, + "grad_norm": 0.4166538106214972, + "learning_rate": 9.400045428251357e-07, + "loss": 0.2894, + "step": 907 + }, + { + "epoch": 0.4986271279516749, + "grad_norm": 0.4691592328681605, + "learning_rate": 9.398665209673601e-07, + "loss": 0.2701, + "step": 908 + }, + { + "epoch": 0.49917627677100496, + "grad_norm": 0.4461402006742555, + "learning_rate": 9.39728350687712e-07, + "loss": 0.328, + "step": 909 + }, + { + "epoch": 0.499725425590335, + "grad_norm": 0.5621232251239702, + "learning_rate": 9.395900320328187e-07, + "loss": 0.2918, + "step": 910 + }, + { + "epoch": 0.500274574409665, + "grad_norm": 0.5227577695217629, + "learning_rate": 9.39451565049358e-07, + "loss": 0.2888, + "step": 911 + }, + { + "epoch": 0.500823723228995, + "grad_norm": 0.39227052219957875, + "learning_rate": 9.39312949784057e-07, + "loss": 0.3098, + "step": 912 + }, + { + "epoch": 0.5013728720483251, + "grad_norm": 0.3768420830212145, + "learning_rate": 9.391741862836936e-07, + "loss": 0.3085, + "step": 913 + }, + { + "epoch": 0.5019220208676551, + "grad_norm": 0.4619027272530282, + "learning_rate": 9.390352745950952e-07, + "loss": 0.2615, + "step": 914 + }, + { + "epoch": 0.5024711696869851, + "grad_norm": 0.5721247327288269, + "learning_rate": 9.388962147651392e-07, + "loss": 0.3285, + "step": 915 + }, + { + "epoch": 0.5030203185063152, + "grad_norm": 0.42653088411587964, + "learning_rate": 9.387570068407536e-07, + "loss": 0.2764, + "step": 916 + }, + { + "epoch": 0.5035694673256452, + "grad_norm": 0.664657702924952, + "learning_rate": 9.386176508689155e-07, + "loss": 0.3561, + "step": 917 + }, + { + "epoch": 0.5041186161449753, + "grad_norm": 0.5253474047699631, + "learning_rate": 9.384781468966527e-07, + "loss": 0.287, + "step": 918 + }, + { + "epoch": 0.5046677649643053, + "grad_norm": 0.5336025783006878, + "learning_rate": 9.383384949710427e-07, + "loss": 0.2593, + "step": 919 + }, + { + "epoch": 0.5052169137836353, + "grad_norm": 0.3921022334808134, + "learning_rate": 9.381986951392127e-07, + "loss": 0.2852, + "step": 920 + }, + { + "epoch": 0.5057660626029654, + "grad_norm": 0.4819111234045801, + "learning_rate": 9.380587474483399e-07, + "loss": 0.28, + "step": 921 + }, + { + "epoch": 0.5063152114222954, + "grad_norm": 0.5474448343933163, + "learning_rate": 9.379186519456518e-07, + "loss": 0.2978, + "step": 922 + }, + { + "epoch": 0.5068643602416255, + "grad_norm": 0.8522125809139546, + "learning_rate": 9.377784086784252e-07, + "loss": 0.3143, + "step": 923 + }, + { + "epoch": 0.5074135090609555, + "grad_norm": 0.4849817257556121, + "learning_rate": 9.376380176939871e-07, + "loss": 0.2821, + "step": 924 + }, + { + "epoch": 0.5079626578802856, + "grad_norm": 0.41393907286812087, + "learning_rate": 9.374974790397144e-07, + "loss": 0.32, + "step": 925 + }, + { + "epoch": 0.5085118066996156, + "grad_norm": 0.434081630367767, + "learning_rate": 9.373567927630336e-07, + "loss": 0.2928, + "step": 926 + }, + { + "epoch": 0.5090609555189456, + "grad_norm": 0.4540361450635598, + "learning_rate": 9.372159589114213e-07, + "loss": 0.2816, + "step": 927 + }, + { + "epoch": 0.5096101043382757, + "grad_norm": 0.4516371688883139, + "learning_rate": 9.370749775324033e-07, + "loss": 0.2847, + "step": 928 + }, + { + "epoch": 0.5101592531576057, + "grad_norm": 0.6032813006104688, + "learning_rate": 9.369338486735562e-07, + "loss": 0.2767, + "step": 929 + }, + { + "epoch": 0.5107084019769358, + "grad_norm": 0.49894381999492476, + "learning_rate": 9.367925723825053e-07, + "loss": 0.2897, + "step": 930 + }, + { + "epoch": 0.5112575507962658, + "grad_norm": 0.4572027432433077, + "learning_rate": 9.366511487069265e-07, + "loss": 0.2908, + "step": 931 + }, + { + "epoch": 0.5118066996155958, + "grad_norm": 0.46266433746076124, + "learning_rate": 9.365095776945451e-07, + "loss": 0.2546, + "step": 932 + }, + { + "epoch": 0.5123558484349259, + "grad_norm": 0.5730402499622492, + "learning_rate": 9.363678593931358e-07, + "loss": 0.2847, + "step": 933 + }, + { + "epoch": 0.5129049972542559, + "grad_norm": 0.4967244319281186, + "learning_rate": 9.362259938505233e-07, + "loss": 0.2771, + "step": 934 + }, + { + "epoch": 0.513454146073586, + "grad_norm": 0.49356871672781477, + "learning_rate": 9.360839811145824e-07, + "loss": 0.2581, + "step": 935 + }, + { + "epoch": 0.514003294892916, + "grad_norm": 0.39402326807548255, + "learning_rate": 9.359418212332369e-07, + "loss": 0.2736, + "step": 936 + }, + { + "epoch": 0.5145524437122461, + "grad_norm": 0.5069437138906921, + "learning_rate": 9.357995142544604e-07, + "loss": 0.2816, + "step": 937 + }, + { + "epoch": 0.5151015925315761, + "grad_norm": 0.6017536805081614, + "learning_rate": 9.356570602262765e-07, + "loss": 0.2937, + "step": 938 + }, + { + "epoch": 0.515650741350906, + "grad_norm": 0.49563430068716785, + "learning_rate": 9.355144591967578e-07, + "loss": 0.2873, + "step": 939 + }, + { + "epoch": 0.5161998901702362, + "grad_norm": 0.47672573468420487, + "learning_rate": 9.353717112140276e-07, + "loss": 0.2742, + "step": 940 + }, + { + "epoch": 0.5167490389895661, + "grad_norm": 0.5376604255900301, + "learning_rate": 9.352288163262575e-07, + "loss": 0.2763, + "step": 941 + }, + { + "epoch": 0.5172981878088962, + "grad_norm": 0.4313417427628322, + "learning_rate": 9.350857745816693e-07, + "loss": 0.3092, + "step": 942 + }, + { + "epoch": 0.5178473366282262, + "grad_norm": 0.5700375023421917, + "learning_rate": 9.349425860285346e-07, + "loss": 0.305, + "step": 943 + }, + { + "epoch": 0.5183964854475562, + "grad_norm": 0.5325081202379571, + "learning_rate": 9.347992507151739e-07, + "loss": 0.297, + "step": 944 + }, + { + "epoch": 0.5189456342668863, + "grad_norm": 0.4698987291310082, + "learning_rate": 9.34655768689958e-07, + "loss": 0.2883, + "step": 945 + }, + { + "epoch": 0.5194947830862163, + "grad_norm": 0.6051610218193799, + "learning_rate": 9.345121400013067e-07, + "loss": 0.2988, + "step": 946 + }, + { + "epoch": 0.5200439319055464, + "grad_norm": 0.5064449536476456, + "learning_rate": 9.343683646976891e-07, + "loss": 0.3229, + "step": 947 + }, + { + "epoch": 0.5205930807248764, + "grad_norm": 0.48144431665426274, + "learning_rate": 9.342244428276242e-07, + "loss": 0.2403, + "step": 948 + }, + { + "epoch": 0.5211422295442065, + "grad_norm": 0.4466776162042638, + "learning_rate": 9.340803744396804e-07, + "loss": 0.3248, + "step": 949 + }, + { + "epoch": 0.5216913783635365, + "grad_norm": 0.39529463847515395, + "learning_rate": 9.339361595824755e-07, + "loss": 0.2985, + "step": 950 + }, + { + "epoch": 0.5222405271828665, + "grad_norm": 0.4980142634653844, + "learning_rate": 9.337917983046766e-07, + "loss": 0.2635, + "step": 951 + }, + { + "epoch": 0.5227896760021966, + "grad_norm": 0.5515774761970018, + "learning_rate": 9.336472906550005e-07, + "loss": 0.2817, + "step": 952 + }, + { + "epoch": 0.5233388248215266, + "grad_norm": 0.4278604719170521, + "learning_rate": 9.335026366822129e-07, + "loss": 0.2938, + "step": 953 + }, + { + "epoch": 0.5238879736408567, + "grad_norm": 0.4656892942206506, + "learning_rate": 9.333578364351294e-07, + "loss": 0.2975, + "step": 954 + }, + { + "epoch": 0.5244371224601867, + "grad_norm": 0.4170003565448647, + "learning_rate": 9.332128899626148e-07, + "loss": 0.3096, + "step": 955 + }, + { + "epoch": 0.5249862712795168, + "grad_norm": 0.41154133521168373, + "learning_rate": 9.330677973135831e-07, + "loss": 0.297, + "step": 956 + }, + { + "epoch": 0.5255354200988468, + "grad_norm": 0.5552848734845941, + "learning_rate": 9.329225585369976e-07, + "loss": 0.2872, + "step": 957 + }, + { + "epoch": 0.5260845689181768, + "grad_norm": 0.5017420360248451, + "learning_rate": 9.327771736818712e-07, + "loss": 0.2718, + "step": 958 + }, + { + "epoch": 0.5266337177375069, + "grad_norm": 0.48497670923181874, + "learning_rate": 9.32631642797266e-07, + "loss": 0.3055, + "step": 959 + }, + { + "epoch": 0.5271828665568369, + "grad_norm": 0.4335856737583551, + "learning_rate": 9.324859659322933e-07, + "loss": 0.2555, + "step": 960 + }, + { + "epoch": 0.527732015376167, + "grad_norm": 0.44596133464332316, + "learning_rate": 9.323401431361133e-07, + "loss": 0.3581, + "step": 961 + }, + { + "epoch": 0.528281164195497, + "grad_norm": 0.6106061044251618, + "learning_rate": 9.321941744579363e-07, + "loss": 0.2935, + "step": 962 + }, + { + "epoch": 0.528830313014827, + "grad_norm": 0.4147544086466501, + "learning_rate": 9.320480599470209e-07, + "loss": 0.2998, + "step": 963 + }, + { + "epoch": 0.5293794618341571, + "grad_norm": 0.4794800558341023, + "learning_rate": 9.319017996526759e-07, + "loss": 0.2754, + "step": 964 + }, + { + "epoch": 0.5299286106534871, + "grad_norm": 0.44534003351037493, + "learning_rate": 9.317553936242583e-07, + "loss": 0.2709, + "step": 965 + }, + { + "epoch": 0.5304777594728172, + "grad_norm": 0.4867035947101275, + "learning_rate": 9.31608841911175e-07, + "loss": 0.2648, + "step": 966 + }, + { + "epoch": 0.5310269082921472, + "grad_norm": 0.4209812485651194, + "learning_rate": 9.314621445628818e-07, + "loss": 0.2547, + "step": 967 + }, + { + "epoch": 0.5315760571114773, + "grad_norm": 0.4681960942365018, + "learning_rate": 9.313153016288834e-07, + "loss": 0.2919, + "step": 968 + }, + { + "epoch": 0.5321252059308073, + "grad_norm": 0.4454053676970525, + "learning_rate": 9.311683131587341e-07, + "loss": 0.282, + "step": 969 + }, + { + "epoch": 0.5326743547501372, + "grad_norm": 0.5493578070784273, + "learning_rate": 9.310211792020373e-07, + "loss": 0.3053, + "step": 970 + }, + { + "epoch": 0.5332235035694673, + "grad_norm": 0.5390349826204069, + "learning_rate": 9.308738998084448e-07, + "loss": 0.2845, + "step": 971 + }, + { + "epoch": 0.5337726523887973, + "grad_norm": 0.3860137685475121, + "learning_rate": 9.307264750276581e-07, + "loss": 0.3687, + "step": 972 + }, + { + "epoch": 0.5343218012081274, + "grad_norm": 0.4849612252791877, + "learning_rate": 9.305789049094279e-07, + "loss": 0.2742, + "step": 973 + }, + { + "epoch": 0.5348709500274574, + "grad_norm": 0.3957981429710754, + "learning_rate": 9.304311895035535e-07, + "loss": 0.3143, + "step": 974 + }, + { + "epoch": 0.5354200988467874, + "grad_norm": 0.523471832019245, + "learning_rate": 9.302833288598835e-07, + "loss": 0.2877, + "step": 975 + }, + { + "epoch": 0.5359692476661175, + "grad_norm": 0.4404212152421518, + "learning_rate": 9.301353230283152e-07, + "loss": 0.2758, + "step": 976 + }, + { + "epoch": 0.5365183964854475, + "grad_norm": 0.5394204270927017, + "learning_rate": 9.299871720587954e-07, + "loss": 0.3072, + "step": 977 + }, + { + "epoch": 0.5370675453047776, + "grad_norm": 0.5823178998925181, + "learning_rate": 9.298388760013194e-07, + "loss": 0.2785, + "step": 978 + }, + { + "epoch": 0.5376166941241076, + "grad_norm": 0.4260921009488124, + "learning_rate": 9.296904349059318e-07, + "loss": 0.2717, + "step": 979 + }, + { + "epoch": 0.5381658429434377, + "grad_norm": 0.4019948136463715, + "learning_rate": 9.295418488227257e-07, + "loss": 0.2969, + "step": 980 + }, + { + "epoch": 0.5387149917627677, + "grad_norm": 0.42484620847494486, + "learning_rate": 9.293931178018437e-07, + "loss": 0.3319, + "step": 981 + }, + { + "epoch": 0.5392641405820977, + "grad_norm": 0.49589138464291516, + "learning_rate": 9.292442418934771e-07, + "loss": 0.3092, + "step": 982 + }, + { + "epoch": 0.5398132894014278, + "grad_norm": 0.4523297379936676, + "learning_rate": 9.290952211478659e-07, + "loss": 0.2805, + "step": 983 + }, + { + "epoch": 0.5403624382207578, + "grad_norm": 0.49903798190139037, + "learning_rate": 9.28946055615299e-07, + "loss": 0.3103, + "step": 984 + }, + { + "epoch": 0.5409115870400879, + "grad_norm": 0.44253415856030676, + "learning_rate": 9.287967453461146e-07, + "loss": 0.2831, + "step": 985 + }, + { + "epoch": 0.5414607358594179, + "grad_norm": 0.4916236801329548, + "learning_rate": 9.28647290390699e-07, + "loss": 0.2705, + "step": 986 + }, + { + "epoch": 0.5420098846787479, + "grad_norm": 0.4326158839163807, + "learning_rate": 9.284976907994881e-07, + "loss": 0.2763, + "step": 987 + }, + { + "epoch": 0.542559033498078, + "grad_norm": 0.39425862258101324, + "learning_rate": 9.28347946622966e-07, + "loss": 0.2584, + "step": 988 + }, + { + "epoch": 0.543108182317408, + "grad_norm": 0.5147046280713691, + "learning_rate": 9.28198057911666e-07, + "loss": 0.2574, + "step": 989 + }, + { + "epoch": 0.5436573311367381, + "grad_norm": 0.41079396351260283, + "learning_rate": 9.2804802471617e-07, + "loss": 0.2752, + "step": 990 + }, + { + "epoch": 0.5442064799560681, + "grad_norm": 0.4386781914515836, + "learning_rate": 9.278978470871086e-07, + "loss": 0.2849, + "step": 991 + }, + { + "epoch": 0.5447556287753982, + "grad_norm": 0.4376795852515167, + "learning_rate": 9.277475250751613e-07, + "loss": 0.2851, + "step": 992 + }, + { + "epoch": 0.5453047775947282, + "grad_norm": 0.5137192521633446, + "learning_rate": 9.275970587310562e-07, + "loss": 0.3714, + "step": 993 + }, + { + "epoch": 0.5458539264140582, + "grad_norm": 0.49699132352043934, + "learning_rate": 9.274464481055702e-07, + "loss": 0.251, + "step": 994 + }, + { + "epoch": 0.5464030752333883, + "grad_norm": 0.44790179295453575, + "learning_rate": 9.272956932495288e-07, + "loss": 0.2999, + "step": 995 + }, + { + "epoch": 0.5469522240527183, + "grad_norm": 0.4719079324155892, + "learning_rate": 9.27144794213806e-07, + "loss": 0.298, + "step": 996 + }, + { + "epoch": 0.5475013728720484, + "grad_norm": 0.5108807732449081, + "learning_rate": 9.269937510493249e-07, + "loss": 0.3093, + "step": 997 + }, + { + "epoch": 0.5480505216913784, + "grad_norm": 0.6127240020836249, + "learning_rate": 9.26842563807057e-07, + "loss": 0.2873, + "step": 998 + }, + { + "epoch": 0.5485996705107083, + "grad_norm": 0.48710113873037447, + "learning_rate": 9.266912325380225e-07, + "loss": 0.2569, + "step": 999 + }, + { + "epoch": 0.5491488193300385, + "grad_norm": 0.5054369705066603, + "learning_rate": 9.2653975729329e-07, + "loss": 0.333, + "step": 1000 + }, + { + "epoch": 0.5491488193300385, + "eval_loss": 0.372631311416626, + "eval_runtime": 18.5946, + "eval_samples_per_second": 23.824, + "eval_steps_per_second": 1.022, + "step": 1000 + }, + { + "epoch": 0.5496979681493684, + "grad_norm": 0.5563534486868508, + "learning_rate": 9.263881381239767e-07, + "loss": 0.3179, + "step": 1001 + }, + { + "epoch": 0.5502471169686985, + "grad_norm": 0.8165424003556195, + "learning_rate": 9.262363750812487e-07, + "loss": 0.3733, + "step": 1002 + }, + { + "epoch": 0.5507962657880285, + "grad_norm": 0.45455738506506776, + "learning_rate": 9.260844682163204e-07, + "loss": 0.2961, + "step": 1003 + }, + { + "epoch": 0.5513454146073586, + "grad_norm": 0.47059950329038075, + "learning_rate": 9.259324175804547e-07, + "loss": 0.2794, + "step": 1004 + }, + { + "epoch": 0.5518945634266886, + "grad_norm": 0.39990353842343673, + "learning_rate": 9.25780223224963e-07, + "loss": 0.2729, + "step": 1005 + }, + { + "epoch": 0.5524437122460186, + "grad_norm": 0.5200228849861063, + "learning_rate": 9.256278852012054e-07, + "loss": 0.2943, + "step": 1006 + }, + { + "epoch": 0.5529928610653487, + "grad_norm": 0.4439072868789634, + "learning_rate": 9.254754035605905e-07, + "loss": 0.2851, + "step": 1007 + }, + { + "epoch": 0.5535420098846787, + "grad_norm": 0.4409853570927373, + "learning_rate": 9.253227783545751e-07, + "loss": 0.2854, + "step": 1008 + }, + { + "epoch": 0.5540911587040088, + "grad_norm": 0.45284311556082324, + "learning_rate": 9.251700096346644e-07, + "loss": 0.315, + "step": 1009 + }, + { + "epoch": 0.5546403075233388, + "grad_norm": 0.39707003431281584, + "learning_rate": 9.250170974524126e-07, + "loss": 0.299, + "step": 1010 + }, + { + "epoch": 0.5551894563426688, + "grad_norm": 0.4362985555384377, + "learning_rate": 9.248640418594217e-07, + "loss": 0.2665, + "step": 1011 + }, + { + "epoch": 0.5557386051619989, + "grad_norm": 0.44207446986588456, + "learning_rate": 9.247108429073423e-07, + "loss": 0.2791, + "step": 1012 + }, + { + "epoch": 0.5562877539813289, + "grad_norm": 0.420943626873848, + "learning_rate": 9.245575006478735e-07, + "loss": 0.25, + "step": 1013 + }, + { + "epoch": 0.556836902800659, + "grad_norm": 0.48573056151952415, + "learning_rate": 9.244040151327625e-07, + "loss": 0.2608, + "step": 1014 + }, + { + "epoch": 0.557386051619989, + "grad_norm": 0.4342806958191244, + "learning_rate": 9.242503864138052e-07, + "loss": 0.2636, + "step": 1015 + }, + { + "epoch": 0.5579352004393191, + "grad_norm": 0.6293301005515782, + "learning_rate": 9.240966145428457e-07, + "loss": 0.2713, + "step": 1016 + }, + { + "epoch": 0.5584843492586491, + "grad_norm": 0.5747765499108737, + "learning_rate": 9.239426995717761e-07, + "loss": 0.349, + "step": 1017 + }, + { + "epoch": 0.5590334980779791, + "grad_norm": 0.5482605833191367, + "learning_rate": 9.237886415525372e-07, + "loss": 0.2673, + "step": 1018 + }, + { + "epoch": 0.5595826468973092, + "grad_norm": 0.4409608254364869, + "learning_rate": 9.236344405371177e-07, + "loss": 0.2967, + "step": 1019 + }, + { + "epoch": 0.5601317957166392, + "grad_norm": 0.40965160928332, + "learning_rate": 9.23480096577555e-07, + "loss": 0.2837, + "step": 1020 + }, + { + "epoch": 0.5606809445359693, + "grad_norm": 0.5517526784895708, + "learning_rate": 9.233256097259343e-07, + "loss": 0.3186, + "step": 1021 + }, + { + "epoch": 0.5612300933552993, + "grad_norm": 0.5093156398528925, + "learning_rate": 9.231709800343895e-07, + "loss": 0.3348, + "step": 1022 + }, + { + "epoch": 0.5617792421746294, + "grad_norm": 0.6554873163717477, + "learning_rate": 9.230162075551021e-07, + "loss": 0.3147, + "step": 1023 + }, + { + "epoch": 0.5623283909939594, + "grad_norm": 0.4039473778050129, + "learning_rate": 9.228612923403022e-07, + "loss": 0.2834, + "step": 1024 + }, + { + "epoch": 0.5628775398132894, + "grad_norm": 0.4666790654415315, + "learning_rate": 9.227062344422682e-07, + "loss": 0.2667, + "step": 1025 + }, + { + "epoch": 0.5634266886326195, + "grad_norm": 0.4435666165995333, + "learning_rate": 9.225510339133262e-07, + "loss": 0.3111, + "step": 1026 + }, + { + "epoch": 0.5639758374519495, + "grad_norm": 0.5154742963928248, + "learning_rate": 9.223956908058508e-07, + "loss": 0.3089, + "step": 1027 + }, + { + "epoch": 0.5645249862712796, + "grad_norm": 0.42878577706892507, + "learning_rate": 9.222402051722642e-07, + "loss": 0.2542, + "step": 1028 + }, + { + "epoch": 0.5650741350906096, + "grad_norm": 0.5269383382001943, + "learning_rate": 9.220845770650377e-07, + "loss": 0.2752, + "step": 1029 + }, + { + "epoch": 0.5656232839099395, + "grad_norm": 0.4495018623801755, + "learning_rate": 9.219288065366896e-07, + "loss": 0.2791, + "step": 1030 + }, + { + "epoch": 0.5661724327292696, + "grad_norm": 0.5065307592932909, + "learning_rate": 9.217728936397868e-07, + "loss": 0.3145, + "step": 1031 + }, + { + "epoch": 0.5667215815485996, + "grad_norm": 0.4391342148525691, + "learning_rate": 9.216168384269443e-07, + "loss": 0.2635, + "step": 1032 + }, + { + "epoch": 0.5672707303679297, + "grad_norm": 0.5479487923582129, + "learning_rate": 9.214606409508248e-07, + "loss": 0.2907, + "step": 1033 + }, + { + "epoch": 0.5678198791872597, + "grad_norm": 0.46579473113214453, + "learning_rate": 9.213043012641393e-07, + "loss": 0.2838, + "step": 1034 + }, + { + "epoch": 0.5683690280065898, + "grad_norm": 0.44503717521250913, + "learning_rate": 9.211478194196466e-07, + "loss": 0.2758, + "step": 1035 + }, + { + "epoch": 0.5689181768259198, + "grad_norm": 1.0394730117520932, + "learning_rate": 9.209911954701537e-07, + "loss": 0.4999, + "step": 1036 + }, + { + "epoch": 0.5694673256452498, + "grad_norm": 0.3933071990766414, + "learning_rate": 9.208344294685153e-07, + "loss": 0.2714, + "step": 1037 + }, + { + "epoch": 0.5700164744645799, + "grad_norm": 0.47083533744879774, + "learning_rate": 9.206775214676342e-07, + "loss": 0.2966, + "step": 1038 + }, + { + "epoch": 0.5705656232839099, + "grad_norm": 0.41288928208193654, + "learning_rate": 9.20520471520461e-07, + "loss": 0.2672, + "step": 1039 + }, + { + "epoch": 0.57111477210324, + "grad_norm": 0.4867781728778002, + "learning_rate": 9.203632796799943e-07, + "loss": 0.2913, + "step": 1040 + }, + { + "epoch": 0.57166392092257, + "grad_norm": 0.4596785861988866, + "learning_rate": 9.202059459992808e-07, + "loss": 0.2471, + "step": 1041 + }, + { + "epoch": 0.5722130697419, + "grad_norm": 0.5155292158510203, + "learning_rate": 9.200484705314144e-07, + "loss": 0.2834, + "step": 1042 + }, + { + "epoch": 0.5727622185612301, + "grad_norm": 0.42303242668989033, + "learning_rate": 9.198908533295377e-07, + "loss": 0.2817, + "step": 1043 + }, + { + "epoch": 0.5733113673805601, + "grad_norm": 0.5924247458314694, + "learning_rate": 9.197330944468401e-07, + "loss": 0.3294, + "step": 1044 + }, + { + "epoch": 0.5738605161998902, + "grad_norm": 0.7890449041973231, + "learning_rate": 9.195751939365601e-07, + "loss": 0.3521, + "step": 1045 + }, + { + "epoch": 0.5744096650192202, + "grad_norm": 0.6401070170444926, + "learning_rate": 9.194171518519827e-07, + "loss": 0.3124, + "step": 1046 + }, + { + "epoch": 0.5749588138385503, + "grad_norm": 0.47296090750512837, + "learning_rate": 9.192589682464416e-07, + "loss": 0.3118, + "step": 1047 + }, + { + "epoch": 0.5755079626578803, + "grad_norm": 0.5072279632489303, + "learning_rate": 9.19100643173318e-07, + "loss": 0.2501, + "step": 1048 + }, + { + "epoch": 0.5760571114772103, + "grad_norm": 0.46312294508566104, + "learning_rate": 9.189421766860408e-07, + "loss": 0.256, + "step": 1049 + }, + { + "epoch": 0.5766062602965404, + "grad_norm": 0.44863731340899177, + "learning_rate": 9.187835688380864e-07, + "loss": 0.2664, + "step": 1050 + }, + { + "epoch": 0.5771554091158704, + "grad_norm": 0.45904360310654274, + "learning_rate": 9.186248196829791e-07, + "loss": 0.2644, + "step": 1051 + }, + { + "epoch": 0.5777045579352005, + "grad_norm": 0.3688717426870435, + "learning_rate": 9.184659292742909e-07, + "loss": 0.2833, + "step": 1052 + }, + { + "epoch": 0.5782537067545305, + "grad_norm": 0.488584232027721, + "learning_rate": 9.183068976656418e-07, + "loss": 0.266, + "step": 1053 + }, + { + "epoch": 0.5788028555738605, + "grad_norm": 0.7392820483582149, + "learning_rate": 9.181477249106986e-07, + "loss": 0.2481, + "step": 1054 + }, + { + "epoch": 0.5793520043931906, + "grad_norm": 0.4856031534132429, + "learning_rate": 9.179884110631767e-07, + "loss": 0.2941, + "step": 1055 + }, + { + "epoch": 0.5799011532125206, + "grad_norm": 0.45023080124313786, + "learning_rate": 9.178289561768382e-07, + "loss": 0.2745, + "step": 1056 + }, + { + "epoch": 0.5804503020318507, + "grad_norm": 0.5244343852114275, + "learning_rate": 9.176693603054937e-07, + "loss": 0.2645, + "step": 1057 + }, + { + "epoch": 0.5809994508511807, + "grad_norm": 0.5355774488113496, + "learning_rate": 9.175096235030006e-07, + "loss": 0.2561, + "step": 1058 + }, + { + "epoch": 0.5815485996705108, + "grad_norm": 0.4205549619981094, + "learning_rate": 9.173497458232644e-07, + "loss": 0.2708, + "step": 1059 + }, + { + "epoch": 0.5820977484898407, + "grad_norm": 0.5046767029446393, + "learning_rate": 9.171897273202379e-07, + "loss": 0.2809, + "step": 1060 + }, + { + "epoch": 0.5826468973091707, + "grad_norm": 0.4208047571407197, + "learning_rate": 9.170295680479212e-07, + "loss": 0.2365, + "step": 1061 + }, + { + "epoch": 0.5831960461285008, + "grad_norm": 0.563195578366495, + "learning_rate": 9.168692680603625e-07, + "loss": 0.3115, + "step": 1062 + }, + { + "epoch": 0.5837451949478308, + "grad_norm": 0.4229304487438865, + "learning_rate": 9.167088274116569e-07, + "loss": 0.3147, + "step": 1063 + }, + { + "epoch": 0.5842943437671609, + "grad_norm": 0.49847873535309245, + "learning_rate": 9.165482461559472e-07, + "loss": 0.3496, + "step": 1064 + }, + { + "epoch": 0.5848434925864909, + "grad_norm": 0.38940763478597096, + "learning_rate": 9.163875243474237e-07, + "loss": 0.297, + "step": 1065 + }, + { + "epoch": 0.5853926414058209, + "grad_norm": 0.8981788759355992, + "learning_rate": 9.162266620403243e-07, + "loss": 0.2678, + "step": 1066 + }, + { + "epoch": 0.585941790225151, + "grad_norm": 0.41067609206423245, + "learning_rate": 9.160656592889339e-07, + "loss": 0.2692, + "step": 1067 + }, + { + "epoch": 0.586490939044481, + "grad_norm": 0.4943860716989215, + "learning_rate": 9.15904516147585e-07, + "loss": 0.2964, + "step": 1068 + }, + { + "epoch": 0.5870400878638111, + "grad_norm": 0.4997805459811917, + "learning_rate": 9.157432326706575e-07, + "loss": 0.289, + "step": 1069 + }, + { + "epoch": 0.5875892366831411, + "grad_norm": 0.3793351213782992, + "learning_rate": 9.155818089125786e-07, + "loss": 0.2623, + "step": 1070 + }, + { + "epoch": 0.5881383855024712, + "grad_norm": 0.562855713320707, + "learning_rate": 9.154202449278229e-07, + "loss": 0.2628, + "step": 1071 + }, + { + "epoch": 0.5886875343218012, + "grad_norm": 0.3754049716030077, + "learning_rate": 9.152585407709124e-07, + "loss": 0.269, + "step": 1072 + }, + { + "epoch": 0.5892366831411312, + "grad_norm": 0.5819722568773006, + "learning_rate": 9.150966964964161e-07, + "loss": 0.2871, + "step": 1073 + }, + { + "epoch": 0.5897858319604613, + "grad_norm": 0.43918964482111517, + "learning_rate": 9.149347121589505e-07, + "loss": 0.267, + "step": 1074 + }, + { + "epoch": 0.5903349807797913, + "grad_norm": 0.7524912779221687, + "learning_rate": 9.147725878131796e-07, + "loss": 0.2776, + "step": 1075 + }, + { + "epoch": 0.5908841295991214, + "grad_norm": 0.9002793616105111, + "learning_rate": 9.146103235138141e-07, + "loss": 0.3054, + "step": 1076 + }, + { + "epoch": 0.5914332784184514, + "grad_norm": 0.44101669571912955, + "learning_rate": 9.144479193156124e-07, + "loss": 0.2936, + "step": 1077 + }, + { + "epoch": 0.5919824272377814, + "grad_norm": 0.5867875151240506, + "learning_rate": 9.142853752733799e-07, + "loss": 0.3049, + "step": 1078 + }, + { + "epoch": 0.5925315760571115, + "grad_norm": 0.7208086384724053, + "learning_rate": 9.141226914419691e-07, + "loss": 0.4554, + "step": 1079 + }, + { + "epoch": 0.5930807248764415, + "grad_norm": 0.3874539214172494, + "learning_rate": 9.139598678762799e-07, + "loss": 0.2734, + "step": 1080 + }, + { + "epoch": 0.5936298736957716, + "grad_norm": 0.6043320796954844, + "learning_rate": 9.137969046312594e-07, + "loss": 0.3125, + "step": 1081 + }, + { + "epoch": 0.5941790225151016, + "grad_norm": 0.3912012788205948, + "learning_rate": 9.136338017619014e-07, + "loss": 0.2394, + "step": 1082 + }, + { + "epoch": 0.5947281713344317, + "grad_norm": 0.5414117089228002, + "learning_rate": 9.134705593232472e-07, + "loss": 0.2594, + "step": 1083 + }, + { + "epoch": 0.5952773201537617, + "grad_norm": 0.8054074074934338, + "learning_rate": 9.133071773703853e-07, + "loss": 0.3648, + "step": 1084 + }, + { + "epoch": 0.5958264689730917, + "grad_norm": 0.5807567169129422, + "learning_rate": 9.131436559584509e-07, + "loss": 0.3265, + "step": 1085 + }, + { + "epoch": 0.5963756177924218, + "grad_norm": 0.48697925601004677, + "learning_rate": 9.129799951426264e-07, + "loss": 0.2625, + "step": 1086 + }, + { + "epoch": 0.5969247666117518, + "grad_norm": 0.44450817749726423, + "learning_rate": 9.128161949781415e-07, + "loss": 0.2723, + "step": 1087 + }, + { + "epoch": 0.5974739154310819, + "grad_norm": 0.4146900771755306, + "learning_rate": 9.126522555202727e-07, + "loss": 0.2693, + "step": 1088 + }, + { + "epoch": 0.5980230642504119, + "grad_norm": 0.48869787874134757, + "learning_rate": 9.124881768243433e-07, + "loss": 0.2819, + "step": 1089 + }, + { + "epoch": 0.598572213069742, + "grad_norm": 0.5304503915276655, + "learning_rate": 9.123239589457242e-07, + "loss": 0.2943, + "step": 1090 + }, + { + "epoch": 0.599121361889072, + "grad_norm": 0.40953396026247785, + "learning_rate": 9.121596019398323e-07, + "loss": 0.2805, + "step": 1091 + }, + { + "epoch": 0.5996705107084019, + "grad_norm": 0.5587497644085375, + "learning_rate": 9.119951058621326e-07, + "loss": 0.289, + "step": 1092 + }, + { + "epoch": 0.600219659527732, + "grad_norm": 0.45194476417255347, + "learning_rate": 9.118304707681362e-07, + "loss": 0.271, + "step": 1093 + }, + { + "epoch": 0.600768808347062, + "grad_norm": 0.41864743967092277, + "learning_rate": 9.116656967134015e-07, + "loss": 0.2986, + "step": 1094 + }, + { + "epoch": 0.6013179571663921, + "grad_norm": 0.513933227032906, + "learning_rate": 9.115007837535336e-07, + "loss": 0.2785, + "step": 1095 + }, + { + "epoch": 0.6018671059857221, + "grad_norm": 0.49599002704508294, + "learning_rate": 9.113357319441842e-07, + "loss": 0.2996, + "step": 1096 + }, + { + "epoch": 0.6024162548050521, + "grad_norm": 0.423546937059523, + "learning_rate": 9.11170541341053e-07, + "loss": 0.2763, + "step": 1097 + }, + { + "epoch": 0.6029654036243822, + "grad_norm": 0.5185983592412352, + "learning_rate": 9.110052119998851e-07, + "loss": 0.2707, + "step": 1098 + }, + { + "epoch": 0.6035145524437122, + "grad_norm": 0.45148826524054897, + "learning_rate": 9.108397439764732e-07, + "loss": 0.2809, + "step": 1099 + }, + { + "epoch": 0.6040637012630423, + "grad_norm": 0.4660826413233851, + "learning_rate": 9.106741373266568e-07, + "loss": 0.2935, + "step": 1100 + }, + { + "epoch": 0.6046128500823723, + "grad_norm": 0.5419396810694019, + "learning_rate": 9.105083921063221e-07, + "loss": 0.2828, + "step": 1101 + }, + { + "epoch": 0.6051619989017024, + "grad_norm": 0.45922532712302805, + "learning_rate": 9.103425083714016e-07, + "loss": 0.2889, + "step": 1102 + }, + { + "epoch": 0.6057111477210324, + "grad_norm": 0.4649333694966789, + "learning_rate": 9.101764861778754e-07, + "loss": 0.2643, + "step": 1103 + }, + { + "epoch": 0.6062602965403624, + "grad_norm": 0.418968172335678, + "learning_rate": 9.100103255817696e-07, + "loss": 0.3125, + "step": 1104 + }, + { + "epoch": 0.6068094453596925, + "grad_norm": 0.45190514849951635, + "learning_rate": 9.098440266391574e-07, + "loss": 0.2483, + "step": 1105 + }, + { + "epoch": 0.6073585941790225, + "grad_norm": 0.4209969854467636, + "learning_rate": 9.096775894061586e-07, + "loss": 0.2743, + "step": 1106 + }, + { + "epoch": 0.6079077429983526, + "grad_norm": 0.4409671357922739, + "learning_rate": 9.095110139389395e-07, + "loss": 0.3051, + "step": 1107 + }, + { + "epoch": 0.6084568918176826, + "grad_norm": 0.5279492928832483, + "learning_rate": 9.093443002937131e-07, + "loss": 0.2817, + "step": 1108 + }, + { + "epoch": 0.6090060406370126, + "grad_norm": 0.39225930694864325, + "learning_rate": 9.091774485267395e-07, + "loss": 0.2871, + "step": 1109 + }, + { + "epoch": 0.6095551894563427, + "grad_norm": 0.5863271436048614, + "learning_rate": 9.090104586943247e-07, + "loss": 0.3082, + "step": 1110 + }, + { + "epoch": 0.6101043382756727, + "grad_norm": 0.4788268336104399, + "learning_rate": 9.088433308528217e-07, + "loss": 0.2536, + "step": 1111 + }, + { + "epoch": 0.6106534870950028, + "grad_norm": 0.5112645623805705, + "learning_rate": 9.0867606505863e-07, + "loss": 0.3205, + "step": 1112 + }, + { + "epoch": 0.6112026359143328, + "grad_norm": 0.5725254861344884, + "learning_rate": 9.085086613681957e-07, + "loss": 0.2549, + "step": 1113 + }, + { + "epoch": 0.6117517847336629, + "grad_norm": 0.46942712798410025, + "learning_rate": 9.083411198380112e-07, + "loss": 0.3318, + "step": 1114 + }, + { + "epoch": 0.6123009335529929, + "grad_norm": 0.49984667196143134, + "learning_rate": 9.081734405246158e-07, + "loss": 0.3263, + "step": 1115 + }, + { + "epoch": 0.6128500823723229, + "grad_norm": 0.3927550822295869, + "learning_rate": 9.08005623484595e-07, + "loss": 0.3072, + "step": 1116 + }, + { + "epoch": 0.613399231191653, + "grad_norm": 0.48374518376634384, + "learning_rate": 9.078376687745809e-07, + "loss": 0.2971, + "step": 1117 + }, + { + "epoch": 0.613948380010983, + "grad_norm": 0.5909556068133605, + "learning_rate": 9.07669576451252e-07, + "loss": 0.2788, + "step": 1118 + }, + { + "epoch": 0.6144975288303131, + "grad_norm": 0.4514702540817506, + "learning_rate": 9.075013465713333e-07, + "loss": 0.3036, + "step": 1119 + }, + { + "epoch": 0.615046677649643, + "grad_norm": 0.5730970328312417, + "learning_rate": 9.073329791915959e-07, + "loss": 0.2749, + "step": 1120 + }, + { + "epoch": 0.615595826468973, + "grad_norm": 0.46720357212585395, + "learning_rate": 9.071644743688581e-07, + "loss": 0.2834, + "step": 1121 + }, + { + "epoch": 0.6161449752883031, + "grad_norm": 0.4652048446584189, + "learning_rate": 9.069958321599836e-07, + "loss": 0.2516, + "step": 1122 + }, + { + "epoch": 0.6166941241076331, + "grad_norm": 0.49211847409586273, + "learning_rate": 9.068270526218835e-07, + "loss": 0.2895, + "step": 1123 + }, + { + "epoch": 0.6172432729269632, + "grad_norm": 0.5225658696718942, + "learning_rate": 9.06658135811514e-07, + "loss": 0.3083, + "step": 1124 + }, + { + "epoch": 0.6177924217462932, + "grad_norm": 0.4732494876596653, + "learning_rate": 9.064890817858786e-07, + "loss": 0.2883, + "step": 1125 + }, + { + "epoch": 0.6183415705656233, + "grad_norm": 0.7072333921465951, + "learning_rate": 9.063198906020269e-07, + "loss": 0.3051, + "step": 1126 + }, + { + "epoch": 0.6188907193849533, + "grad_norm": 0.568772736573002, + "learning_rate": 9.061505623170547e-07, + "loss": 0.3023, + "step": 1127 + }, + { + "epoch": 0.6194398682042833, + "grad_norm": 0.5089502546392174, + "learning_rate": 9.05981096988104e-07, + "loss": 0.3292, + "step": 1128 + }, + { + "epoch": 0.6199890170236134, + "grad_norm": 0.5380404881624282, + "learning_rate": 9.05811494672363e-07, + "loss": 0.2967, + "step": 1129 + }, + { + "epoch": 0.6205381658429434, + "grad_norm": 0.4092785459290651, + "learning_rate": 9.056417554270662e-07, + "loss": 0.3065, + "step": 1130 + }, + { + "epoch": 0.6210873146622735, + "grad_norm": 0.49815959251268416, + "learning_rate": 9.054718793094945e-07, + "loss": 0.2608, + "step": 1131 + }, + { + "epoch": 0.6216364634816035, + "grad_norm": 0.4908110343266882, + "learning_rate": 9.053018663769749e-07, + "loss": 0.2671, + "step": 1132 + }, + { + "epoch": 0.6221856123009335, + "grad_norm": 0.47435273371221404, + "learning_rate": 9.051317166868804e-07, + "loss": 0.2569, + "step": 1133 + }, + { + "epoch": 0.6227347611202636, + "grad_norm": 0.42499482377738906, + "learning_rate": 9.049614302966302e-07, + "loss": 0.2622, + "step": 1134 + }, + { + "epoch": 0.6232839099395936, + "grad_norm": 0.47501304937790956, + "learning_rate": 9.047910072636896e-07, + "loss": 0.3218, + "step": 1135 + }, + { + "epoch": 0.6238330587589237, + "grad_norm": 0.4507543358957904, + "learning_rate": 9.046204476455703e-07, + "loss": 0.2616, + "step": 1136 + }, + { + "epoch": 0.6243822075782537, + "grad_norm": 0.5867199213464204, + "learning_rate": 9.044497514998297e-07, + "loss": 0.2828, + "step": 1137 + }, + { + "epoch": 0.6249313563975838, + "grad_norm": 0.5004424648258832, + "learning_rate": 9.042789188840718e-07, + "loss": 0.2615, + "step": 1138 + }, + { + "epoch": 0.6254805052169138, + "grad_norm": 0.5353090704470145, + "learning_rate": 9.041079498559459e-07, + "loss": 0.2824, + "step": 1139 + }, + { + "epoch": 0.6260296540362438, + "grad_norm": 0.5116562529747475, + "learning_rate": 9.039368444731479e-07, + "loss": 0.2674, + "step": 1140 + }, + { + "epoch": 0.6265788028555739, + "grad_norm": 0.5227186253081871, + "learning_rate": 9.037656027934198e-07, + "loss": 0.2479, + "step": 1141 + }, + { + "epoch": 0.6271279516749039, + "grad_norm": 0.45780860128985457, + "learning_rate": 9.03594224874549e-07, + "loss": 0.2724, + "step": 1142 + }, + { + "epoch": 0.627677100494234, + "grad_norm": 0.43436053325347795, + "learning_rate": 9.034227107743694e-07, + "loss": 0.2733, + "step": 1143 + }, + { + "epoch": 0.628226249313564, + "grad_norm": 0.48117972611454424, + "learning_rate": 9.032510605507606e-07, + "loss": 0.2847, + "step": 1144 + }, + { + "epoch": 0.628775398132894, + "grad_norm": 0.741175894481161, + "learning_rate": 9.030792742616483e-07, + "loss": 0.3489, + "step": 1145 + }, + { + "epoch": 0.6293245469522241, + "grad_norm": 0.664681011052934, + "learning_rate": 9.029073519650042e-07, + "loss": 0.2996, + "step": 1146 + }, + { + "epoch": 0.629873695771554, + "grad_norm": 0.5157222479953482, + "learning_rate": 9.027352937188454e-07, + "loss": 0.2821, + "step": 1147 + }, + { + "epoch": 0.6304228445908842, + "grad_norm": 0.4836083017833757, + "learning_rate": 9.025630995812354e-07, + "loss": 0.2575, + "step": 1148 + }, + { + "epoch": 0.6309719934102141, + "grad_norm": 0.459569266796869, + "learning_rate": 9.023907696102835e-07, + "loss": 0.3097, + "step": 1149 + }, + { + "epoch": 0.6315211422295443, + "grad_norm": 0.4798189540098763, + "learning_rate": 9.022183038641445e-07, + "loss": 0.2927, + "step": 1150 + }, + { + "epoch": 0.6320702910488742, + "grad_norm": 0.4987997800145966, + "learning_rate": 9.020457024010195e-07, + "loss": 0.2971, + "step": 1151 + }, + { + "epoch": 0.6326194398682042, + "grad_norm": 1.9096532428061457, + "learning_rate": 9.018729652791548e-07, + "loss": 0.3226, + "step": 1152 + }, + { + "epoch": 0.6331685886875343, + "grad_norm": 0.44216483961472397, + "learning_rate": 9.01700092556843e-07, + "loss": 0.2638, + "step": 1153 + }, + { + "epoch": 0.6337177375068643, + "grad_norm": 0.7328629429496601, + "learning_rate": 9.015270842924223e-07, + "loss": 0.3098, + "step": 1154 + }, + { + "epoch": 0.6342668863261944, + "grad_norm": 0.5685306920819805, + "learning_rate": 9.013539405442766e-07, + "loss": 0.2933, + "step": 1155 + }, + { + "epoch": 0.6348160351455244, + "grad_norm": 0.5822974816059474, + "learning_rate": 9.011806613708355e-07, + "loss": 0.3205, + "step": 1156 + }, + { + "epoch": 0.6353651839648545, + "grad_norm": 0.45522725658968266, + "learning_rate": 9.010072468305742e-07, + "loss": 0.2643, + "step": 1157 + }, + { + "epoch": 0.6359143327841845, + "grad_norm": 0.4538908806885672, + "learning_rate": 9.008336969820141e-07, + "loss": 0.2785, + "step": 1158 + }, + { + "epoch": 0.6364634816035145, + "grad_norm": 0.4777941850696039, + "learning_rate": 9.006600118837214e-07, + "loss": 0.2352, + "step": 1159 + }, + { + "epoch": 0.6370126304228446, + "grad_norm": 0.4816371328153312, + "learning_rate": 9.004861915943088e-07, + "loss": 0.3168, + "step": 1160 + }, + { + "epoch": 0.6375617792421746, + "grad_norm": 0.35654766955404366, + "learning_rate": 9.003122361724341e-07, + "loss": 0.2711, + "step": 1161 + }, + { + "epoch": 0.6381109280615047, + "grad_norm": 0.4742952913712317, + "learning_rate": 9.001381456768008e-07, + "loss": 0.272, + "step": 1162 + }, + { + "epoch": 0.6386600768808347, + "grad_norm": 0.6322978339601845, + "learning_rate": 8.999639201661583e-07, + "loss": 0.2821, + "step": 1163 + }, + { + "epoch": 0.6392092257001647, + "grad_norm": 0.42119244942878137, + "learning_rate": 8.997895596993008e-07, + "loss": 0.2994, + "step": 1164 + }, + { + "epoch": 0.6397583745194948, + "grad_norm": 0.43903748409154886, + "learning_rate": 8.996150643350688e-07, + "loss": 0.2734, + "step": 1165 + }, + { + "epoch": 0.6403075233388248, + "grad_norm": 0.4761678888000459, + "learning_rate": 8.994404341323483e-07, + "loss": 0.2849, + "step": 1166 + }, + { + "epoch": 0.6408566721581549, + "grad_norm": 0.737157789799894, + "learning_rate": 8.992656691500703e-07, + "loss": 0.3252, + "step": 1167 + }, + { + "epoch": 0.6414058209774849, + "grad_norm": 0.5583112072557074, + "learning_rate": 8.990907694472114e-07, + "loss": 0.2866, + "step": 1168 + }, + { + "epoch": 0.641954969796815, + "grad_norm": 0.5338646041317848, + "learning_rate": 8.989157350827942e-07, + "loss": 0.2616, + "step": 1169 + }, + { + "epoch": 0.642504118616145, + "grad_norm": 0.45825004829388927, + "learning_rate": 8.987405661158859e-07, + "loss": 0.2327, + "step": 1170 + }, + { + "epoch": 0.643053267435475, + "grad_norm": 0.4595477401487814, + "learning_rate": 8.985652626055998e-07, + "loss": 0.2727, + "step": 1171 + }, + { + "epoch": 0.6436024162548051, + "grad_norm": 0.5748678681998864, + "learning_rate": 8.983898246110944e-07, + "loss": 0.2777, + "step": 1172 + }, + { + "epoch": 0.6441515650741351, + "grad_norm": 0.3900795295711715, + "learning_rate": 8.982142521915736e-07, + "loss": 0.2868, + "step": 1173 + }, + { + "epoch": 0.6447007138934652, + "grad_norm": 0.5190279935080138, + "learning_rate": 8.980385454062865e-07, + "loss": 0.2609, + "step": 1174 + }, + { + "epoch": 0.6452498627127952, + "grad_norm": 0.5151140929715181, + "learning_rate": 8.978627043145279e-07, + "loss": 0.2938, + "step": 1175 + }, + { + "epoch": 0.6457990115321252, + "grad_norm": 0.5037281238608102, + "learning_rate": 8.976867289756374e-07, + "loss": 0.2767, + "step": 1176 + }, + { + "epoch": 0.6463481603514553, + "grad_norm": 0.40939636643133553, + "learning_rate": 8.975106194490002e-07, + "loss": 0.2758, + "step": 1177 + }, + { + "epoch": 0.6468973091707853, + "grad_norm": 0.568518815783045, + "learning_rate": 8.973343757940471e-07, + "loss": 0.3135, + "step": 1178 + }, + { + "epoch": 0.6474464579901154, + "grad_norm": 0.41842298614450435, + "learning_rate": 8.971579980702533e-07, + "loss": 0.3016, + "step": 1179 + }, + { + "epoch": 0.6479956068094453, + "grad_norm": 0.5219504029737556, + "learning_rate": 8.969814863371403e-07, + "loss": 0.3162, + "step": 1180 + }, + { + "epoch": 0.6485447556287754, + "grad_norm": 0.5358210196394527, + "learning_rate": 8.968048406542741e-07, + "loss": 0.291, + "step": 1181 + }, + { + "epoch": 0.6490939044481054, + "grad_norm": 0.5116646850222143, + "learning_rate": 8.966280610812662e-07, + "loss": 0.2922, + "step": 1182 + }, + { + "epoch": 0.6496430532674354, + "grad_norm": 0.4383781220072977, + "learning_rate": 8.96451147677773e-07, + "loss": 0.3135, + "step": 1183 + }, + { + "epoch": 0.6501922020867655, + "grad_norm": 0.5597237164126757, + "learning_rate": 8.962741005034965e-07, + "loss": 0.303, + "step": 1184 + }, + { + "epoch": 0.6507413509060955, + "grad_norm": 0.417182537956975, + "learning_rate": 8.960969196181832e-07, + "loss": 0.2525, + "step": 1185 + }, + { + "epoch": 0.6512904997254256, + "grad_norm": 0.575169641090054, + "learning_rate": 8.959196050816257e-07, + "loss": 0.2823, + "step": 1186 + }, + { + "epoch": 0.6518396485447556, + "grad_norm": 0.5038074897677616, + "learning_rate": 8.957421569536607e-07, + "loss": 0.3152, + "step": 1187 + }, + { + "epoch": 0.6523887973640856, + "grad_norm": 0.5964828672055801, + "learning_rate": 8.955645752941706e-07, + "loss": 0.3339, + "step": 1188 + }, + { + "epoch": 0.6529379461834157, + "grad_norm": 0.4617619091425538, + "learning_rate": 8.953868601630826e-07, + "loss": 0.2835, + "step": 1189 + }, + { + "epoch": 0.6534870950027457, + "grad_norm": 0.5474386796376182, + "learning_rate": 8.952090116203688e-07, + "loss": 0.2871, + "step": 1190 + }, + { + "epoch": 0.6540362438220758, + "grad_norm": 0.3957478138397106, + "learning_rate": 8.950310297260468e-07, + "loss": 0.3139, + "step": 1191 + }, + { + "epoch": 0.6545853926414058, + "grad_norm": 0.4074071223396674, + "learning_rate": 8.94852914540179e-07, + "loss": 0.3019, + "step": 1192 + }, + { + "epoch": 0.6551345414607359, + "grad_norm": 0.5430721111070543, + "learning_rate": 8.946746661228726e-07, + "loss": 0.2674, + "step": 1193 + }, + { + "epoch": 0.6556836902800659, + "grad_norm": 0.4412845165835301, + "learning_rate": 8.944962845342798e-07, + "loss": 0.3119, + "step": 1194 + }, + { + "epoch": 0.6562328390993959, + "grad_norm": 0.49710103161223135, + "learning_rate": 8.943177698345978e-07, + "loss": 0.2757, + "step": 1195 + }, + { + "epoch": 0.656781987918726, + "grad_norm": 0.44532613058567916, + "learning_rate": 8.941391220840688e-07, + "loss": 0.3284, + "step": 1196 + }, + { + "epoch": 0.657331136738056, + "grad_norm": 0.6192176398577527, + "learning_rate": 8.939603413429798e-07, + "loss": 0.2528, + "step": 1197 + }, + { + "epoch": 0.6578802855573861, + "grad_norm": 0.5514717092770217, + "learning_rate": 8.937814276716629e-07, + "loss": 0.3178, + "step": 1198 + }, + { + "epoch": 0.6584294343767161, + "grad_norm": 0.44903509899343486, + "learning_rate": 8.936023811304946e-07, + "loss": 0.3086, + "step": 1199 + }, + { + "epoch": 0.6589785831960461, + "grad_norm": 0.4886647463373787, + "learning_rate": 8.934232017798967e-07, + "loss": 0.2399, + "step": 1200 + }, + { + "epoch": 0.6589785831960461, + "eval_loss": 0.36352044343948364, + "eval_runtime": 18.5912, + "eval_samples_per_second": 23.828, + "eval_steps_per_second": 1.022, + "step": 1200 + }, + { + "epoch": 0.6595277320153762, + "grad_norm": 0.5219718221773981, + "learning_rate": 8.932438896803355e-07, + "loss": 0.3048, + "step": 1201 + }, + { + "epoch": 0.6600768808347062, + "grad_norm": 0.4614849462021926, + "learning_rate": 8.930644448923223e-07, + "loss": 0.2797, + "step": 1202 + }, + { + "epoch": 0.6606260296540363, + "grad_norm": 0.3677266225613913, + "learning_rate": 8.928848674764132e-07, + "loss": 0.2682, + "step": 1203 + }, + { + "epoch": 0.6611751784733663, + "grad_norm": 0.5123719806838465, + "learning_rate": 8.927051574932087e-07, + "loss": 0.2978, + "step": 1204 + }, + { + "epoch": 0.6617243272926964, + "grad_norm": 0.3747966025929731, + "learning_rate": 8.925253150033546e-07, + "loss": 0.2791, + "step": 1205 + }, + { + "epoch": 0.6622734761120264, + "grad_norm": 0.5487569411632294, + "learning_rate": 8.923453400675408e-07, + "loss": 0.3291, + "step": 1206 + }, + { + "epoch": 0.6628226249313564, + "grad_norm": 0.5138417883651462, + "learning_rate": 8.921652327465026e-07, + "loss": 0.2763, + "step": 1207 + }, + { + "epoch": 0.6633717737506865, + "grad_norm": 0.5061801660213829, + "learning_rate": 8.91984993101019e-07, + "loss": 0.31, + "step": 1208 + }, + { + "epoch": 0.6639209225700164, + "grad_norm": 0.3942828388637908, + "learning_rate": 8.91804621191915e-07, + "loss": 0.3092, + "step": 1209 + }, + { + "epoch": 0.6644700713893466, + "grad_norm": 0.37763584539676537, + "learning_rate": 8.916241170800589e-07, + "loss": 0.2728, + "step": 1210 + }, + { + "epoch": 0.6650192202086765, + "grad_norm": 0.5009550373251159, + "learning_rate": 8.914434808263644e-07, + "loss": 0.264, + "step": 1211 + }, + { + "epoch": 0.6655683690280065, + "grad_norm": 0.7030277484483146, + "learning_rate": 8.912627124917895e-07, + "loss": 0.3135, + "step": 1212 + }, + { + "epoch": 0.6661175178473366, + "grad_norm": 0.42488607273609486, + "learning_rate": 8.910818121373369e-07, + "loss": 0.2818, + "step": 1213 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.493342392494313, + "learning_rate": 8.909007798240539e-07, + "loss": 0.2954, + "step": 1214 + }, + { + "epoch": 0.6672158154859967, + "grad_norm": 0.5039194951032646, + "learning_rate": 8.90719615613032e-07, + "loss": 0.253, + "step": 1215 + }, + { + "epoch": 0.6677649643053267, + "grad_norm": 0.42422715933104815, + "learning_rate": 8.905383195654078e-07, + "loss": 0.2852, + "step": 1216 + }, + { + "epoch": 0.6683141131246568, + "grad_norm": 0.494352707797291, + "learning_rate": 8.903568917423616e-07, + "loss": 0.2828, + "step": 1217 + }, + { + "epoch": 0.6688632619439868, + "grad_norm": 0.5665599561495389, + "learning_rate": 8.901753322051189e-07, + "loss": 0.2862, + "step": 1218 + }, + { + "epoch": 0.6694124107633168, + "grad_norm": 0.542595992828151, + "learning_rate": 8.899936410149496e-07, + "loss": 0.2638, + "step": 1219 + }, + { + "epoch": 0.6699615595826469, + "grad_norm": 0.5648813250199902, + "learning_rate": 8.89811818233167e-07, + "loss": 0.2909, + "step": 1220 + }, + { + "epoch": 0.6705107084019769, + "grad_norm": 0.4160562735138924, + "learning_rate": 8.896298639211304e-07, + "loss": 0.2616, + "step": 1221 + }, + { + "epoch": 0.671059857221307, + "grad_norm": 0.5413512791749923, + "learning_rate": 8.894477781402422e-07, + "loss": 0.2484, + "step": 1222 + }, + { + "epoch": 0.671609006040637, + "grad_norm": 0.4330049685128148, + "learning_rate": 8.892655609519497e-07, + "loss": 0.2436, + "step": 1223 + }, + { + "epoch": 0.6721581548599671, + "grad_norm": 0.5443521875616191, + "learning_rate": 8.890832124177447e-07, + "loss": 0.2954, + "step": 1224 + }, + { + "epoch": 0.6727073036792971, + "grad_norm": 0.4563967506734848, + "learning_rate": 8.88900732599163e-07, + "loss": 0.289, + "step": 1225 + }, + { + "epoch": 0.6732564524986271, + "grad_norm": 0.4754829406830648, + "learning_rate": 8.887181215577846e-07, + "loss": 0.2506, + "step": 1226 + }, + { + "epoch": 0.6738056013179572, + "grad_norm": 0.4934327565736833, + "learning_rate": 8.885353793552343e-07, + "loss": 0.2692, + "step": 1227 + }, + { + "epoch": 0.6743547501372872, + "grad_norm": 0.607866696262366, + "learning_rate": 8.883525060531808e-07, + "loss": 0.2768, + "step": 1228 + }, + { + "epoch": 0.6749038989566173, + "grad_norm": 0.4946664009061249, + "learning_rate": 8.881695017133371e-07, + "loss": 0.2801, + "step": 1229 + }, + { + "epoch": 0.6754530477759473, + "grad_norm": 0.392121091082154, + "learning_rate": 8.879863663974604e-07, + "loss": 0.271, + "step": 1230 + }, + { + "epoch": 0.6760021965952773, + "grad_norm": 0.44037563428625903, + "learning_rate": 8.878031001673519e-07, + "loss": 0.2906, + "step": 1231 + }, + { + "epoch": 0.6765513454146074, + "grad_norm": 0.48088457141067054, + "learning_rate": 8.876197030848575e-07, + "loss": 0.2677, + "step": 1232 + }, + { + "epoch": 0.6771004942339374, + "grad_norm": 0.4869690369556779, + "learning_rate": 8.874361752118669e-07, + "loss": 0.2861, + "step": 1233 + }, + { + "epoch": 0.6776496430532675, + "grad_norm": 0.5848282881506409, + "learning_rate": 8.87252516610314e-07, + "loss": 0.2982, + "step": 1234 + }, + { + "epoch": 0.6781987918725975, + "grad_norm": 0.4614839190247446, + "learning_rate": 8.870687273421766e-07, + "loss": 0.289, + "step": 1235 + }, + { + "epoch": 0.6787479406919276, + "grad_norm": 0.3705363391129515, + "learning_rate": 8.868848074694772e-07, + "loss": 0.2719, + "step": 1236 + }, + { + "epoch": 0.6792970895112576, + "grad_norm": 0.4650582139957983, + "learning_rate": 8.867007570542817e-07, + "loss": 0.297, + "step": 1237 + }, + { + "epoch": 0.6798462383305875, + "grad_norm": 0.535860936917416, + "learning_rate": 8.865165761587002e-07, + "loss": 0.3275, + "step": 1238 + }, + { + "epoch": 0.6803953871499177, + "grad_norm": 0.5426168703649902, + "learning_rate": 8.863322648448874e-07, + "loss": 0.2955, + "step": 1239 + }, + { + "epoch": 0.6809445359692476, + "grad_norm": 0.47271050132721426, + "learning_rate": 8.861478231750413e-07, + "loss": 0.2707, + "step": 1240 + }, + { + "epoch": 0.6814936847885777, + "grad_norm": 0.6385607302689855, + "learning_rate": 8.859632512114042e-07, + "loss": 0.2789, + "step": 1241 + }, + { + "epoch": 0.6820428336079077, + "grad_norm": 0.42936884881535814, + "learning_rate": 8.857785490162621e-07, + "loss": 0.3035, + "step": 1242 + }, + { + "epoch": 0.6825919824272377, + "grad_norm": 0.4332168020023281, + "learning_rate": 8.855937166519458e-07, + "loss": 0.2469, + "step": 1243 + }, + { + "epoch": 0.6831411312465678, + "grad_norm": 0.5340480171003407, + "learning_rate": 8.854087541808288e-07, + "loss": 0.3112, + "step": 1244 + }, + { + "epoch": 0.6836902800658978, + "grad_norm": 0.5095766870692692, + "learning_rate": 8.852236616653294e-07, + "loss": 0.257, + "step": 1245 + }, + { + "epoch": 0.6842394288852279, + "grad_norm": 0.44141398065536425, + "learning_rate": 8.850384391679096e-07, + "loss": 0.2878, + "step": 1246 + }, + { + "epoch": 0.6847885777045579, + "grad_norm": 0.6618308344776433, + "learning_rate": 8.84853086751075e-07, + "loss": 0.2951, + "step": 1247 + }, + { + "epoch": 0.685337726523888, + "grad_norm": 0.4747078229560669, + "learning_rate": 8.84667604477375e-07, + "loss": 0.2885, + "step": 1248 + }, + { + "epoch": 0.685886875343218, + "grad_norm": 0.5916294272889184, + "learning_rate": 8.844819924094037e-07, + "loss": 0.282, + "step": 1249 + }, + { + "epoch": 0.686436024162548, + "grad_norm": 0.4629434827118017, + "learning_rate": 8.842962506097977e-07, + "loss": 0.2939, + "step": 1250 + }, + { + "epoch": 0.6869851729818781, + "grad_norm": 0.46407511347786573, + "learning_rate": 8.841103791412382e-07, + "loss": 0.278, + "step": 1251 + }, + { + "epoch": 0.6875343218012081, + "grad_norm": 0.48788645650899126, + "learning_rate": 8.839243780664502e-07, + "loss": 0.2979, + "step": 1252 + }, + { + "epoch": 0.6880834706205382, + "grad_norm": 0.5520260733586249, + "learning_rate": 8.837382474482017e-07, + "loss": 0.2631, + "step": 1253 + }, + { + "epoch": 0.6886326194398682, + "grad_norm": 0.4871995962379758, + "learning_rate": 8.835519873493054e-07, + "loss": 0.2478, + "step": 1254 + }, + { + "epoch": 0.6891817682591982, + "grad_norm": 0.5275575643120288, + "learning_rate": 8.833655978326171e-07, + "loss": 0.3386, + "step": 1255 + }, + { + "epoch": 0.6897309170785283, + "grad_norm": 0.4106161686764015, + "learning_rate": 8.831790789610363e-07, + "loss": 0.2702, + "step": 1256 + }, + { + "epoch": 0.6902800658978583, + "grad_norm": 0.45944059978808116, + "learning_rate": 8.829924307975064e-07, + "loss": 0.2721, + "step": 1257 + }, + { + "epoch": 0.6908292147171884, + "grad_norm": 0.47343490936984833, + "learning_rate": 8.828056534050141e-07, + "loss": 0.272, + "step": 1258 + }, + { + "epoch": 0.6913783635365184, + "grad_norm": 0.6942036848333579, + "learning_rate": 8.8261874684659e-07, + "loss": 0.2572, + "step": 1259 + }, + { + "epoch": 0.6919275123558485, + "grad_norm": 0.5145323761806228, + "learning_rate": 8.824317111853081e-07, + "loss": 0.279, + "step": 1260 + }, + { + "epoch": 0.6924766611751785, + "grad_norm": 0.6755598203686058, + "learning_rate": 8.822445464842862e-07, + "loss": 0.2495, + "step": 1261 + }, + { + "epoch": 0.6930258099945085, + "grad_norm": 0.5214951093442063, + "learning_rate": 8.820572528066853e-07, + "loss": 0.3271, + "step": 1262 + }, + { + "epoch": 0.6935749588138386, + "grad_norm": 0.49535338438592735, + "learning_rate": 8.818698302157103e-07, + "loss": 0.3022, + "step": 1263 + }, + { + "epoch": 0.6941241076331686, + "grad_norm": 0.4880159994943199, + "learning_rate": 8.816822787746092e-07, + "loss": 0.3159, + "step": 1264 + }, + { + "epoch": 0.6946732564524987, + "grad_norm": 0.4488171348359517, + "learning_rate": 8.814945985466738e-07, + "loss": 0.3095, + "step": 1265 + }, + { + "epoch": 0.6952224052718287, + "grad_norm": 0.6174403081349206, + "learning_rate": 8.813067895952394e-07, + "loss": 0.3042, + "step": 1266 + }, + { + "epoch": 0.6957715540911587, + "grad_norm": 0.46375370522692666, + "learning_rate": 8.811188519836846e-07, + "loss": 0.2649, + "step": 1267 + }, + { + "epoch": 0.6963207029104888, + "grad_norm": 0.5588371042652637, + "learning_rate": 8.809307857754312e-07, + "loss": 0.2903, + "step": 1268 + }, + { + "epoch": 0.6968698517298187, + "grad_norm": 0.45054402444046043, + "learning_rate": 8.807425910339446e-07, + "loss": 0.3007, + "step": 1269 + }, + { + "epoch": 0.6974190005491488, + "grad_norm": 0.4799235119021597, + "learning_rate": 8.805542678227339e-07, + "loss": 0.2325, + "step": 1270 + }, + { + "epoch": 0.6979681493684788, + "grad_norm": 0.5810408907664303, + "learning_rate": 8.80365816205351e-07, + "loss": 0.2604, + "step": 1271 + }, + { + "epoch": 0.6985172981878089, + "grad_norm": 0.5558300105500974, + "learning_rate": 8.801772362453914e-07, + "loss": 0.2762, + "step": 1272 + }, + { + "epoch": 0.6990664470071389, + "grad_norm": 0.4957095954875263, + "learning_rate": 8.79988528006494e-07, + "loss": 0.2444, + "step": 1273 + }, + { + "epoch": 0.6996155958264689, + "grad_norm": 0.4776697394100596, + "learning_rate": 8.797996915523408e-07, + "loss": 0.2641, + "step": 1274 + }, + { + "epoch": 0.700164744645799, + "grad_norm": 0.48438577774761393, + "learning_rate": 8.796107269466572e-07, + "loss": 0.2812, + "step": 1275 + }, + { + "epoch": 0.700713893465129, + "grad_norm": 0.3866968789181543, + "learning_rate": 8.794216342532116e-07, + "loss": 0.2676, + "step": 1276 + }, + { + "epoch": 0.7012630422844591, + "grad_norm": 0.5328756379378069, + "learning_rate": 8.792324135358161e-07, + "loss": 0.2631, + "step": 1277 + }, + { + "epoch": 0.7018121911037891, + "grad_norm": 0.4761610142019809, + "learning_rate": 8.790430648583255e-07, + "loss": 0.2678, + "step": 1278 + }, + { + "epoch": 0.7023613399231191, + "grad_norm": 0.43582459714242316, + "learning_rate": 8.788535882846382e-07, + "loss": 0.2902, + "step": 1279 + }, + { + "epoch": 0.7029104887424492, + "grad_norm": 0.43844197771775184, + "learning_rate": 8.786639838786953e-07, + "loss": 0.2853, + "step": 1280 + }, + { + "epoch": 0.7034596375617792, + "grad_norm": 0.37377779055777965, + "learning_rate": 8.784742517044816e-07, + "loss": 0.2431, + "step": 1281 + }, + { + "epoch": 0.7040087863811093, + "grad_norm": 0.46856352563180786, + "learning_rate": 8.782843918260245e-07, + "loss": 0.3132, + "step": 1282 + }, + { + "epoch": 0.7045579352004393, + "grad_norm": 0.47171398293104033, + "learning_rate": 8.780944043073946e-07, + "loss": 0.2755, + "step": 1283 + }, + { + "epoch": 0.7051070840197694, + "grad_norm": 0.395798689755583, + "learning_rate": 8.779042892127063e-07, + "loss": 0.2834, + "step": 1284 + }, + { + "epoch": 0.7056562328390994, + "grad_norm": 0.41506834692102745, + "learning_rate": 8.777140466061158e-07, + "loss": 0.2331, + "step": 1285 + }, + { + "epoch": 0.7062053816584294, + "grad_norm": 0.5499373886807347, + "learning_rate": 8.77523676551823e-07, + "loss": 0.2488, + "step": 1286 + }, + { + "epoch": 0.7067545304777595, + "grad_norm": 0.502761676178298, + "learning_rate": 8.773331791140712e-07, + "loss": 0.2883, + "step": 1287 + }, + { + "epoch": 0.7073036792970895, + "grad_norm": 0.44450699010240075, + "learning_rate": 8.771425543571461e-07, + "loss": 0.2615, + "step": 1288 + }, + { + "epoch": 0.7078528281164196, + "grad_norm": 0.4731553496496406, + "learning_rate": 8.769518023453763e-07, + "loss": 0.3101, + "step": 1289 + }, + { + "epoch": 0.7084019769357496, + "grad_norm": 0.7167440579758683, + "learning_rate": 8.767609231431338e-07, + "loss": 0.3068, + "step": 1290 + }, + { + "epoch": 0.7089511257550797, + "grad_norm": 0.5155775414683639, + "learning_rate": 8.765699168148331e-07, + "loss": 0.2544, + "step": 1291 + }, + { + "epoch": 0.7095002745744097, + "grad_norm": 0.5667979731520204, + "learning_rate": 8.763787834249322e-07, + "loss": 0.274, + "step": 1292 + }, + { + "epoch": 0.7100494233937397, + "grad_norm": 0.534421880769502, + "learning_rate": 8.761875230379312e-07, + "loss": 0.2486, + "step": 1293 + }, + { + "epoch": 0.7105985722130698, + "grad_norm": 0.48974672517736045, + "learning_rate": 8.759961357183736e-07, + "loss": 0.2921, + "step": 1294 + }, + { + "epoch": 0.7111477210323998, + "grad_norm": 0.4641955710008565, + "learning_rate": 8.758046215308456e-07, + "loss": 0.2745, + "step": 1295 + }, + { + "epoch": 0.7116968698517299, + "grad_norm": 0.4526752045793606, + "learning_rate": 8.756129805399758e-07, + "loss": 0.2663, + "step": 1296 + }, + { + "epoch": 0.7122460186710599, + "grad_norm": 0.5068314778193749, + "learning_rate": 8.754212128104366e-07, + "loss": 0.328, + "step": 1297 + }, + { + "epoch": 0.7127951674903898, + "grad_norm": 0.4377201750861751, + "learning_rate": 8.75229318406942e-07, + "loss": 0.2478, + "step": 1298 + }, + { + "epoch": 0.71334431630972, + "grad_norm": 0.4233086920358855, + "learning_rate": 8.750372973942495e-07, + "loss": 0.2689, + "step": 1299 + }, + { + "epoch": 0.7138934651290499, + "grad_norm": 0.5060988338664691, + "learning_rate": 8.748451498371594e-07, + "loss": 0.2728, + "step": 1300 + }, + { + "epoch": 0.71444261394838, + "grad_norm": 0.6158788054210272, + "learning_rate": 8.746528758005139e-07, + "loss": 0.3505, + "step": 1301 + }, + { + "epoch": 0.71499176276771, + "grad_norm": 0.4833637139077168, + "learning_rate": 8.744604753491989e-07, + "loss": 0.2803, + "step": 1302 + }, + { + "epoch": 0.7155409115870401, + "grad_norm": 0.4589880573250104, + "learning_rate": 8.742679485481419e-07, + "loss": 0.2657, + "step": 1303 + }, + { + "epoch": 0.7160900604063701, + "grad_norm": 0.5251864252327677, + "learning_rate": 8.740752954623142e-07, + "loss": 0.2787, + "step": 1304 + }, + { + "epoch": 0.7166392092257001, + "grad_norm": 0.4022266755575418, + "learning_rate": 8.738825161567286e-07, + "loss": 0.2864, + "step": 1305 + }, + { + "epoch": 0.7171883580450302, + "grad_norm": 0.4096719701134503, + "learning_rate": 8.736896106964414e-07, + "loss": 0.297, + "step": 1306 + }, + { + "epoch": 0.7177375068643602, + "grad_norm": 0.4946200871997498, + "learning_rate": 8.73496579146551e-07, + "loss": 0.2872, + "step": 1307 + }, + { + "epoch": 0.7182866556836903, + "grad_norm": 0.4842141565048707, + "learning_rate": 8.733034215721984e-07, + "loss": 0.2837, + "step": 1308 + }, + { + "epoch": 0.7188358045030203, + "grad_norm": 0.6009862157947654, + "learning_rate": 8.731101380385669e-07, + "loss": 0.3482, + "step": 1309 + }, + { + "epoch": 0.7193849533223503, + "grad_norm": 0.47470169339397655, + "learning_rate": 8.729167286108831e-07, + "loss": 0.2833, + "step": 1310 + }, + { + "epoch": 0.7199341021416804, + "grad_norm": 0.47604670579454933, + "learning_rate": 8.72723193354415e-07, + "loss": 0.2633, + "step": 1311 + }, + { + "epoch": 0.7204832509610104, + "grad_norm": 0.5527021362912121, + "learning_rate": 8.725295323344741e-07, + "loss": 0.2653, + "step": 1312 + }, + { + "epoch": 0.7210323997803405, + "grad_norm": 0.5122435177100294, + "learning_rate": 8.723357456164137e-07, + "loss": 0.2938, + "step": 1313 + }, + { + "epoch": 0.7215815485996705, + "grad_norm": 0.5672461346022489, + "learning_rate": 8.721418332656295e-07, + "loss": 0.2923, + "step": 1314 + }, + { + "epoch": 0.7221306974190006, + "grad_norm": 0.7152713263523053, + "learning_rate": 8.719477953475598e-07, + "loss": 0.2764, + "step": 1315 + }, + { + "epoch": 0.7226798462383306, + "grad_norm": 0.5500128662785526, + "learning_rate": 8.717536319276856e-07, + "loss": 0.3, + "step": 1316 + }, + { + "epoch": 0.7232289950576606, + "grad_norm": 0.5586130501725085, + "learning_rate": 8.715593430715294e-07, + "loss": 0.2944, + "step": 1317 + }, + { + "epoch": 0.7237781438769907, + "grad_norm": 0.5554440952108585, + "learning_rate": 8.71364928844657e-07, + "loss": 0.2943, + "step": 1318 + }, + { + "epoch": 0.7243272926963207, + "grad_norm": 0.6189797783583086, + "learning_rate": 8.711703893126757e-07, + "loss": 0.2692, + "step": 1319 + }, + { + "epoch": 0.7248764415156508, + "grad_norm": 0.522359031404474, + "learning_rate": 8.709757245412356e-07, + "loss": 0.2767, + "step": 1320 + }, + { + "epoch": 0.7254255903349808, + "grad_norm": 0.5043463005715629, + "learning_rate": 8.707809345960288e-07, + "loss": 0.2284, + "step": 1321 + }, + { + "epoch": 0.7259747391543108, + "grad_norm": 0.4590657611646499, + "learning_rate": 8.705860195427899e-07, + "loss": 0.2389, + "step": 1322 + }, + { + "epoch": 0.7265238879736409, + "grad_norm": 0.6369775868563855, + "learning_rate": 8.703909794472951e-07, + "loss": 0.2832, + "step": 1323 + }, + { + "epoch": 0.7270730367929709, + "grad_norm": 0.7609135010847301, + "learning_rate": 8.701958143753639e-07, + "loss": 0.3253, + "step": 1324 + }, + { + "epoch": 0.727622185612301, + "grad_norm": 0.4213279947596712, + "learning_rate": 8.700005243928568e-07, + "loss": 0.2828, + "step": 1325 + }, + { + "epoch": 0.728171334431631, + "grad_norm": 0.6779155064964311, + "learning_rate": 8.698051095656772e-07, + "loss": 0.2817, + "step": 1326 + }, + { + "epoch": 0.7287204832509611, + "grad_norm": 0.4845191829636317, + "learning_rate": 8.696095699597704e-07, + "loss": 0.2572, + "step": 1327 + }, + { + "epoch": 0.729269632070291, + "grad_norm": 0.5153401960551395, + "learning_rate": 8.694139056411237e-07, + "loss": 0.2875, + "step": 1328 + }, + { + "epoch": 0.729818780889621, + "grad_norm": 0.5058468993620223, + "learning_rate": 8.692181166757668e-07, + "loss": 0.2871, + "step": 1329 + }, + { + "epoch": 0.7303679297089511, + "grad_norm": 0.49783065344280164, + "learning_rate": 8.69022203129771e-07, + "loss": 0.281, + "step": 1330 + }, + { + "epoch": 0.7309170785282811, + "grad_norm": 0.4085419585784888, + "learning_rate": 8.688261650692502e-07, + "loss": 0.2575, + "step": 1331 + }, + { + "epoch": 0.7314662273476112, + "grad_norm": 0.47672473572235635, + "learning_rate": 8.686300025603596e-07, + "loss": 0.3043, + "step": 1332 + }, + { + "epoch": 0.7320153761669412, + "grad_norm": 0.3979098555475834, + "learning_rate": 8.684337156692975e-07, + "loss": 0.235, + "step": 1333 + }, + { + "epoch": 0.7325645249862712, + "grad_norm": 0.44429932278469225, + "learning_rate": 8.682373044623027e-07, + "loss": 0.3324, + "step": 1334 + }, + { + "epoch": 0.7331136738056013, + "grad_norm": 0.4931167519804408, + "learning_rate": 8.680407690056573e-07, + "loss": 0.2728, + "step": 1335 + }, + { + "epoch": 0.7336628226249313, + "grad_norm": 0.47590947382458537, + "learning_rate": 8.678441093656846e-07, + "loss": 0.2768, + "step": 1336 + }, + { + "epoch": 0.7342119714442614, + "grad_norm": 0.5032795306263175, + "learning_rate": 8.676473256087499e-07, + "loss": 0.2558, + "step": 1337 + }, + { + "epoch": 0.7347611202635914, + "grad_norm": 0.4162609506004113, + "learning_rate": 8.674504178012607e-07, + "loss": 0.2556, + "step": 1338 + }, + { + "epoch": 0.7353102690829215, + "grad_norm": 0.4489167714209613, + "learning_rate": 8.672533860096659e-07, + "loss": 0.2452, + "step": 1339 + }, + { + "epoch": 0.7358594179022515, + "grad_norm": 0.4195370879737022, + "learning_rate": 8.670562303004565e-07, + "loss": 0.2885, + "step": 1340 + }, + { + "epoch": 0.7364085667215815, + "grad_norm": 0.4491685290850181, + "learning_rate": 8.668589507401653e-07, + "loss": 0.2648, + "step": 1341 + }, + { + "epoch": 0.7369577155409116, + "grad_norm": 0.5190688682422215, + "learning_rate": 8.666615473953671e-07, + "loss": 0.2787, + "step": 1342 + }, + { + "epoch": 0.7375068643602416, + "grad_norm": 0.7516698455360568, + "learning_rate": 8.66464020332678e-07, + "loss": 0.3141, + "step": 1343 + }, + { + "epoch": 0.7380560131795717, + "grad_norm": 0.487408097688339, + "learning_rate": 8.662663696187562e-07, + "loss": 0.2838, + "step": 1344 + }, + { + "epoch": 0.7386051619989017, + "grad_norm": 0.40509673185127437, + "learning_rate": 8.660685953203017e-07, + "loss": 0.2481, + "step": 1345 + }, + { + "epoch": 0.7391543108182317, + "grad_norm": 0.5643832142815581, + "learning_rate": 8.658706975040555e-07, + "loss": 0.256, + "step": 1346 + }, + { + "epoch": 0.7397034596375618, + "grad_norm": 0.446450893772803, + "learning_rate": 8.656726762368014e-07, + "loss": 0.2628, + "step": 1347 + }, + { + "epoch": 0.7402526084568918, + "grad_norm": 0.452043600960706, + "learning_rate": 8.654745315853641e-07, + "loss": 0.2833, + "step": 1348 + }, + { + "epoch": 0.7408017572762219, + "grad_norm": 0.5486074290974142, + "learning_rate": 8.6527626361661e-07, + "loss": 0.2869, + "step": 1349 + }, + { + "epoch": 0.7413509060955519, + "grad_norm": 0.42369563712940844, + "learning_rate": 8.650778723974473e-07, + "loss": 0.3202, + "step": 1350 + }, + { + "epoch": 0.741900054914882, + "grad_norm": 0.47309936413098236, + "learning_rate": 8.64879357994826e-07, + "loss": 0.2788, + "step": 1351 + }, + { + "epoch": 0.742449203734212, + "grad_norm": 0.6012847154507803, + "learning_rate": 8.646807204757367e-07, + "loss": 0.2553, + "step": 1352 + }, + { + "epoch": 0.742998352553542, + "grad_norm": 0.42871876512675017, + "learning_rate": 8.64481959907213e-07, + "loss": 0.256, + "step": 1353 + }, + { + "epoch": 0.7435475013728721, + "grad_norm": 0.4563399094153482, + "learning_rate": 8.642830763563289e-07, + "loss": 0.3013, + "step": 1354 + }, + { + "epoch": 0.7440966501922021, + "grad_norm": 0.3912004419547005, + "learning_rate": 8.640840698902003e-07, + "loss": 0.2409, + "step": 1355 + }, + { + "epoch": 0.7446457990115322, + "grad_norm": 0.5173591154484312, + "learning_rate": 8.638849405759847e-07, + "loss": 0.3006, + "step": 1356 + }, + { + "epoch": 0.7451949478308622, + "grad_norm": 0.4005829169066144, + "learning_rate": 8.636856884808808e-07, + "loss": 0.3037, + "step": 1357 + }, + { + "epoch": 0.7457440966501923, + "grad_norm": 0.38922793044449194, + "learning_rate": 8.634863136721288e-07, + "loss": 0.2579, + "step": 1358 + }, + { + "epoch": 0.7462932454695222, + "grad_norm": 0.4508264446534602, + "learning_rate": 8.632868162170103e-07, + "loss": 0.2635, + "step": 1359 + }, + { + "epoch": 0.7468423942888522, + "grad_norm": 0.4567553232369663, + "learning_rate": 8.630871961828484e-07, + "loss": 0.2628, + "step": 1360 + }, + { + "epoch": 0.7473915431081823, + "grad_norm": 0.514179983180096, + "learning_rate": 8.628874536370076e-07, + "loss": 0.262, + "step": 1361 + }, + { + "epoch": 0.7479406919275123, + "grad_norm": 0.40971659640124847, + "learning_rate": 8.626875886468937e-07, + "loss": 0.2833, + "step": 1362 + }, + { + "epoch": 0.7484898407468424, + "grad_norm": 0.40738908686850644, + "learning_rate": 8.624876012799533e-07, + "loss": 0.2747, + "step": 1363 + }, + { + "epoch": 0.7490389895661724, + "grad_norm": 0.5058590162965317, + "learning_rate": 8.622874916036755e-07, + "loss": 0.2823, + "step": 1364 + }, + { + "epoch": 0.7495881383855024, + "grad_norm": 0.5722901454466316, + "learning_rate": 8.620872596855894e-07, + "loss": 0.3056, + "step": 1365 + }, + { + "epoch": 0.7501372872048325, + "grad_norm": 0.511870789687394, + "learning_rate": 8.618869055932661e-07, + "loss": 0.2695, + "step": 1366 + }, + { + "epoch": 0.7506864360241625, + "grad_norm": 0.6014990334065109, + "learning_rate": 8.616864293943177e-07, + "loss": 0.2664, + "step": 1367 + }, + { + "epoch": 0.7512355848434926, + "grad_norm": 0.443095209160229, + "learning_rate": 8.614858311563975e-07, + "loss": 0.2617, + "step": 1368 + }, + { + "epoch": 0.7517847336628226, + "grad_norm": 0.48572809263891076, + "learning_rate": 8.612851109472e-07, + "loss": 0.2766, + "step": 1369 + }, + { + "epoch": 0.7523338824821527, + "grad_norm": 0.42169219052347473, + "learning_rate": 8.61084268834461e-07, + "loss": 0.2732, + "step": 1370 + }, + { + "epoch": 0.7528830313014827, + "grad_norm": 0.43953598898290763, + "learning_rate": 8.608833048859572e-07, + "loss": 0.314, + "step": 1371 + }, + { + "epoch": 0.7534321801208127, + "grad_norm": 0.6201686246822434, + "learning_rate": 8.606822191695065e-07, + "loss": 0.2944, + "step": 1372 + }, + { + "epoch": 0.7539813289401428, + "grad_norm": 0.5150555502827397, + "learning_rate": 8.604810117529679e-07, + "loss": 0.2951, + "step": 1373 + }, + { + "epoch": 0.7545304777594728, + "grad_norm": 0.47604597941304283, + "learning_rate": 8.602796827042418e-07, + "loss": 0.2532, + "step": 1374 + }, + { + "epoch": 0.7550796265788029, + "grad_norm": 0.4432236198856403, + "learning_rate": 8.600782320912689e-07, + "loss": 0.2408, + "step": 1375 + }, + { + "epoch": 0.7556287753981329, + "grad_norm": 0.5165007594893655, + "learning_rate": 8.598766599820316e-07, + "loss": 0.3305, + "step": 1376 + }, + { + "epoch": 0.7561779242174629, + "grad_norm": 0.6534173229326804, + "learning_rate": 8.596749664445531e-07, + "loss": 0.354, + "step": 1377 + }, + { + "epoch": 0.756727073036793, + "grad_norm": 0.4805234888708049, + "learning_rate": 8.594731515468975e-07, + "loss": 0.2419, + "step": 1378 + }, + { + "epoch": 0.757276221856123, + "grad_norm": 0.5356490285713452, + "learning_rate": 8.592712153571696e-07, + "loss": 0.2599, + "step": 1379 + }, + { + "epoch": 0.7578253706754531, + "grad_norm": 0.5440977222147195, + "learning_rate": 8.590691579435157e-07, + "loss": 0.2455, + "step": 1380 + }, + { + "epoch": 0.7583745194947831, + "grad_norm": 0.7149038990020312, + "learning_rate": 8.588669793741231e-07, + "loss": 0.3209, + "step": 1381 + }, + { + "epoch": 0.7589236683141132, + "grad_norm": 0.5262992947553964, + "learning_rate": 8.586646797172189e-07, + "loss": 0.2631, + "step": 1382 + }, + { + "epoch": 0.7594728171334432, + "grad_norm": 0.44514385035522225, + "learning_rate": 8.584622590410722e-07, + "loss": 0.2577, + "step": 1383 + }, + { + "epoch": 0.7600219659527732, + "grad_norm": 0.5079325668733327, + "learning_rate": 8.582597174139925e-07, + "loss": 0.3049, + "step": 1384 + }, + { + "epoch": 0.7605711147721033, + "grad_norm": 0.44969742907551946, + "learning_rate": 8.580570549043299e-07, + "loss": 0.2711, + "step": 1385 + }, + { + "epoch": 0.7611202635914333, + "grad_norm": 0.4943841266156845, + "learning_rate": 8.578542715804758e-07, + "loss": 0.2616, + "step": 1386 + }, + { + "epoch": 0.7616694124107634, + "grad_norm": 0.5330860403259663, + "learning_rate": 8.57651367510862e-07, + "loss": 0.2876, + "step": 1387 + }, + { + "epoch": 0.7622185612300933, + "grad_norm": 0.4064903031737812, + "learning_rate": 8.574483427639612e-07, + "loss": 0.262, + "step": 1388 + }, + { + "epoch": 0.7627677100494233, + "grad_norm": 0.41749032530568897, + "learning_rate": 8.572451974082867e-07, + "loss": 0.2657, + "step": 1389 + }, + { + "epoch": 0.7633168588687534, + "grad_norm": 0.48835804584658404, + "learning_rate": 8.570419315123924e-07, + "loss": 0.2429, + "step": 1390 + }, + { + "epoch": 0.7638660076880834, + "grad_norm": 0.4556370122488247, + "learning_rate": 8.568385451448735e-07, + "loss": 0.2682, + "step": 1391 + }, + { + "epoch": 0.7644151565074135, + "grad_norm": 0.43206927013875696, + "learning_rate": 8.56635038374365e-07, + "loss": 0.2706, + "step": 1392 + }, + { + "epoch": 0.7649643053267435, + "grad_norm": 0.4483782884803002, + "learning_rate": 8.564314112695432e-07, + "loss": 0.2874, + "step": 1393 + }, + { + "epoch": 0.7655134541460736, + "grad_norm": 0.49295950220252227, + "learning_rate": 8.562276638991246e-07, + "loss": 0.2491, + "step": 1394 + }, + { + "epoch": 0.7660626029654036, + "grad_norm": 0.4399500999953082, + "learning_rate": 8.560237963318664e-07, + "loss": 0.252, + "step": 1395 + }, + { + "epoch": 0.7666117517847336, + "grad_norm": 0.5450671017136176, + "learning_rate": 8.558198086365665e-07, + "loss": 0.3032, + "step": 1396 + }, + { + "epoch": 0.7671609006040637, + "grad_norm": 0.49177675167959445, + "learning_rate": 8.556157008820632e-07, + "loss": 0.2994, + "step": 1397 + }, + { + "epoch": 0.7677100494233937, + "grad_norm": 0.4008746012352519, + "learning_rate": 8.554114731372352e-07, + "loss": 0.2758, + "step": 1398 + }, + { + "epoch": 0.7682591982427238, + "grad_norm": 0.4946118346918174, + "learning_rate": 8.552071254710023e-07, + "loss": 0.254, + "step": 1399 + }, + { + "epoch": 0.7688083470620538, + "grad_norm": 0.4871183570035581, + "learning_rate": 8.550026579523239e-07, + "loss": 0.2438, + "step": 1400 + }, + { + "epoch": 0.7688083470620538, + "eval_loss": 0.35636478662490845, + "eval_runtime": 18.6032, + "eval_samples_per_second": 23.813, + "eval_steps_per_second": 1.021, + "step": 1400 + }, + { + "epoch": 0.7693574958813838, + "grad_norm": 0.4170527615018649, + "learning_rate": 8.547980706502001e-07, + "loss": 0.2633, + "step": 1401 + }, + { + "epoch": 0.7699066447007139, + "grad_norm": 0.5268912021997442, + "learning_rate": 8.545933636336719e-07, + "loss": 0.2606, + "step": 1402 + }, + { + "epoch": 0.7704557935200439, + "grad_norm": 0.49446906035660454, + "learning_rate": 8.543885369718203e-07, + "loss": 0.2869, + "step": 1403 + }, + { + "epoch": 0.771004942339374, + "grad_norm": 0.5065527120322841, + "learning_rate": 8.541835907337668e-07, + "loss": 0.2692, + "step": 1404 + }, + { + "epoch": 0.771554091158704, + "grad_norm": 0.5285636269155132, + "learning_rate": 8.539785249886733e-07, + "loss": 0.2466, + "step": 1405 + }, + { + "epoch": 0.7721032399780341, + "grad_norm": 0.522243530792344, + "learning_rate": 8.537733398057416e-07, + "loss": 0.2934, + "step": 1406 + }, + { + "epoch": 0.7726523887973641, + "grad_norm": 0.6485427994215435, + "learning_rate": 8.535680352542143e-07, + "loss": 0.35, + "step": 1407 + }, + { + "epoch": 0.7732015376166941, + "grad_norm": 0.7397283206685353, + "learning_rate": 8.533626114033744e-07, + "loss": 0.2852, + "step": 1408 + }, + { + "epoch": 0.7737506864360242, + "grad_norm": 0.5276436529049572, + "learning_rate": 8.531570683225443e-07, + "loss": 0.2725, + "step": 1409 + }, + { + "epoch": 0.7742998352553542, + "grad_norm": 0.6132530263761391, + "learning_rate": 8.529514060810878e-07, + "loss": 0.2633, + "step": 1410 + }, + { + "epoch": 0.7748489840746843, + "grad_norm": 0.4413060943506678, + "learning_rate": 8.527456247484079e-07, + "loss": 0.2646, + "step": 1411 + }, + { + "epoch": 0.7753981328940143, + "grad_norm": 0.5399502638660545, + "learning_rate": 8.525397243939487e-07, + "loss": 0.2624, + "step": 1412 + }, + { + "epoch": 0.7759472817133443, + "grad_norm": 0.3822802249378209, + "learning_rate": 8.523337050871933e-07, + "loss": 0.2896, + "step": 1413 + }, + { + "epoch": 0.7764964305326744, + "grad_norm": 0.46875358003904677, + "learning_rate": 8.521275668976661e-07, + "loss": 0.2694, + "step": 1414 + }, + { + "epoch": 0.7770455793520044, + "grad_norm": 0.47612295964115026, + "learning_rate": 8.519213098949311e-07, + "loss": 0.3008, + "step": 1415 + }, + { + "epoch": 0.7775947281713345, + "grad_norm": 0.4414768466019281, + "learning_rate": 8.517149341485926e-07, + "loss": 0.2913, + "step": 1416 + }, + { + "epoch": 0.7781438769906645, + "grad_norm": 0.5430840569533861, + "learning_rate": 8.515084397282943e-07, + "loss": 0.2504, + "step": 1417 + }, + { + "epoch": 0.7786930258099946, + "grad_norm": 0.5355086623243603, + "learning_rate": 8.51301826703721e-07, + "loss": 0.3026, + "step": 1418 + }, + { + "epoch": 0.7792421746293245, + "grad_norm": 0.4129496054739296, + "learning_rate": 8.510950951445967e-07, + "loss": 0.2529, + "step": 1419 + }, + { + "epoch": 0.7797913234486545, + "grad_norm": 0.48056228009166235, + "learning_rate": 8.508882451206856e-07, + "loss": 0.2817, + "step": 1420 + }, + { + "epoch": 0.7803404722679846, + "grad_norm": 0.45143835664255116, + "learning_rate": 8.50681276701792e-07, + "loss": 0.2591, + "step": 1421 + }, + { + "epoch": 0.7808896210873146, + "grad_norm": 0.5844080303955883, + "learning_rate": 8.504741899577604e-07, + "loss": 0.281, + "step": 1422 + }, + { + "epoch": 0.7814387699066447, + "grad_norm": 0.443980549403915, + "learning_rate": 8.502669849584749e-07, + "loss": 0.2988, + "step": 1423 + }, + { + "epoch": 0.7819879187259747, + "grad_norm": 0.5543252907821477, + "learning_rate": 8.500596617738592e-07, + "loss": 0.296, + "step": 1424 + }, + { + "epoch": 0.7825370675453048, + "grad_norm": 0.6565632002621868, + "learning_rate": 8.498522204738774e-07, + "loss": 0.2763, + "step": 1425 + }, + { + "epoch": 0.7830862163646348, + "grad_norm": 0.3881852587096809, + "learning_rate": 8.496446611285333e-07, + "loss": 0.2422, + "step": 1426 + }, + { + "epoch": 0.7836353651839648, + "grad_norm": 0.42959050993454223, + "learning_rate": 8.494369838078708e-07, + "loss": 0.2631, + "step": 1427 + }, + { + "epoch": 0.7841845140032949, + "grad_norm": 0.44569840872302136, + "learning_rate": 8.49229188581973e-07, + "loss": 0.2744, + "step": 1428 + }, + { + "epoch": 0.7847336628226249, + "grad_norm": 1.1174408877850759, + "learning_rate": 8.490212755209632e-07, + "loss": 0.4636, + "step": 1429 + }, + { + "epoch": 0.785282811641955, + "grad_norm": 0.5434410696983889, + "learning_rate": 8.488132446950046e-07, + "loss": 0.2587, + "step": 1430 + }, + { + "epoch": 0.785831960461285, + "grad_norm": 0.5560706595266278, + "learning_rate": 8.486050961742997e-07, + "loss": 0.2613, + "step": 1431 + }, + { + "epoch": 0.786381109280615, + "grad_norm": 0.54384047498177, + "learning_rate": 8.48396830029091e-07, + "loss": 0.2623, + "step": 1432 + }, + { + "epoch": 0.7869302580999451, + "grad_norm": 0.43216756513519927, + "learning_rate": 8.481884463296608e-07, + "loss": 0.2748, + "step": 1433 + }, + { + "epoch": 0.7874794069192751, + "grad_norm": 0.7750012230290634, + "learning_rate": 8.479799451463307e-07, + "loss": 0.3026, + "step": 1434 + }, + { + "epoch": 0.7880285557386052, + "grad_norm": 0.6095409861050018, + "learning_rate": 8.477713265494625e-07, + "loss": 0.2432, + "step": 1435 + }, + { + "epoch": 0.7885777045579352, + "grad_norm": 0.4239718598759618, + "learning_rate": 8.475625906094569e-07, + "loss": 0.2759, + "step": 1436 + }, + { + "epoch": 0.7891268533772653, + "grad_norm": 0.5610702879251471, + "learning_rate": 8.473537373967547e-07, + "loss": 0.2728, + "step": 1437 + }, + { + "epoch": 0.7896760021965953, + "grad_norm": 0.4245901185666458, + "learning_rate": 8.471447669818364e-07, + "loss": 0.2641, + "step": 1438 + }, + { + "epoch": 0.7902251510159253, + "grad_norm": 0.5104216315685541, + "learning_rate": 8.469356794352217e-07, + "loss": 0.2843, + "step": 1439 + }, + { + "epoch": 0.7907742998352554, + "grad_norm": 0.4919090033600878, + "learning_rate": 8.467264748274697e-07, + "loss": 0.2603, + "step": 1440 + }, + { + "epoch": 0.7913234486545854, + "grad_norm": 0.44442568399620447, + "learning_rate": 8.465171532291796e-07, + "loss": 0.2754, + "step": 1441 + }, + { + "epoch": 0.7918725974739155, + "grad_norm": 0.5876270472590753, + "learning_rate": 8.463077147109895e-07, + "loss": 0.2967, + "step": 1442 + }, + { + "epoch": 0.7924217462932455, + "grad_norm": 0.5110066028762595, + "learning_rate": 8.460981593435772e-07, + "loss": 0.2684, + "step": 1443 + }, + { + "epoch": 0.7929708951125755, + "grad_norm": 0.6078274569338576, + "learning_rate": 8.458884871976601e-07, + "loss": 0.2446, + "step": 1444 + }, + { + "epoch": 0.7935200439319056, + "grad_norm": 0.45988828601242304, + "learning_rate": 8.456786983439946e-07, + "loss": 0.2589, + "step": 1445 + }, + { + "epoch": 0.7940691927512356, + "grad_norm": 0.4930137310689949, + "learning_rate": 8.454687928533768e-07, + "loss": 0.2479, + "step": 1446 + }, + { + "epoch": 0.7946183415705657, + "grad_norm": 0.4758903150056251, + "learning_rate": 8.452587707966422e-07, + "loss": 0.2735, + "step": 1447 + }, + { + "epoch": 0.7951674903898956, + "grad_norm": 0.36398183444609655, + "learning_rate": 8.450486322446652e-07, + "loss": 0.2913, + "step": 1448 + }, + { + "epoch": 0.7957166392092258, + "grad_norm": 0.4332182368957013, + "learning_rate": 8.448383772683602e-07, + "loss": 0.2685, + "step": 1449 + }, + { + "epoch": 0.7962657880285557, + "grad_norm": 0.47589487027704447, + "learning_rate": 8.446280059386801e-07, + "loss": 0.2645, + "step": 1450 + }, + { + "epoch": 0.7968149368478857, + "grad_norm": 0.46798540538360656, + "learning_rate": 8.444175183266178e-07, + "loss": 0.2723, + "step": 1451 + }, + { + "epoch": 0.7973640856672158, + "grad_norm": 0.4431372380833636, + "learning_rate": 8.44206914503205e-07, + "loss": 0.2544, + "step": 1452 + }, + { + "epoch": 0.7979132344865458, + "grad_norm": 0.4933695669895439, + "learning_rate": 8.439961945395127e-07, + "loss": 0.2945, + "step": 1453 + }, + { + "epoch": 0.7984623833058759, + "grad_norm": 0.38662997940798843, + "learning_rate": 8.437853585066511e-07, + "loss": 0.2628, + "step": 1454 + }, + { + "epoch": 0.7990115321252059, + "grad_norm": 0.5037608841621234, + "learning_rate": 8.435744064757698e-07, + "loss": 0.2695, + "step": 1455 + }, + { + "epoch": 0.7995606809445359, + "grad_norm": 0.46011657437552106, + "learning_rate": 8.43363338518057e-07, + "loss": 0.2801, + "step": 1456 + }, + { + "epoch": 0.800109829763866, + "grad_norm": 0.5080452829175076, + "learning_rate": 8.431521547047406e-07, + "loss": 0.2622, + "step": 1457 + }, + { + "epoch": 0.800658978583196, + "grad_norm": 0.40704945372155243, + "learning_rate": 8.429408551070875e-07, + "loss": 0.2663, + "step": 1458 + }, + { + "epoch": 0.8012081274025261, + "grad_norm": 0.5037992387479383, + "learning_rate": 8.427294397964031e-07, + "loss": 0.2623, + "step": 1459 + }, + { + "epoch": 0.8017572762218561, + "grad_norm": 0.5787920228462682, + "learning_rate": 8.425179088440326e-07, + "loss": 0.2741, + "step": 1460 + }, + { + "epoch": 0.8023064250411862, + "grad_norm": 0.5576306216976004, + "learning_rate": 8.423062623213598e-07, + "loss": 0.29, + "step": 1461 + }, + { + "epoch": 0.8028555738605162, + "grad_norm": 0.5068193207052203, + "learning_rate": 8.420945002998075e-07, + "loss": 0.255, + "step": 1462 + }, + { + "epoch": 0.8034047226798462, + "grad_norm": 0.39963592645745505, + "learning_rate": 8.418826228508379e-07, + "loss": 0.2752, + "step": 1463 + }, + { + "epoch": 0.8039538714991763, + "grad_norm": 0.4626883793162904, + "learning_rate": 8.416706300459514e-07, + "loss": 0.2591, + "step": 1464 + }, + { + "epoch": 0.8045030203185063, + "grad_norm": 0.4141047875203491, + "learning_rate": 8.414585219566882e-07, + "loss": 0.2785, + "step": 1465 + }, + { + "epoch": 0.8050521691378364, + "grad_norm": 0.45701651764755485, + "learning_rate": 8.412462986546268e-07, + "loss": 0.264, + "step": 1466 + }, + { + "epoch": 0.8056013179571664, + "grad_norm": 0.4885042421916514, + "learning_rate": 8.410339602113845e-07, + "loss": 0.3199, + "step": 1467 + }, + { + "epoch": 0.8061504667764964, + "grad_norm": 0.4449814084236276, + "learning_rate": 8.408215066986179e-07, + "loss": 0.2687, + "step": 1468 + }, + { + "epoch": 0.8066996155958265, + "grad_norm": 0.4126776323920407, + "learning_rate": 8.406089381880224e-07, + "loss": 0.271, + "step": 1469 + }, + { + "epoch": 0.8072487644151565, + "grad_norm": 0.5622513520836145, + "learning_rate": 8.403962547513319e-07, + "loss": 0.3105, + "step": 1470 + }, + { + "epoch": 0.8077979132344866, + "grad_norm": 0.69485160530137, + "learning_rate": 8.40183456460319e-07, + "loss": 0.2756, + "step": 1471 + }, + { + "epoch": 0.8083470620538166, + "grad_norm": 0.5449552136015597, + "learning_rate": 8.399705433867958e-07, + "loss": 0.2963, + "step": 1472 + }, + { + "epoch": 0.8088962108731467, + "grad_norm": 0.5221242893369673, + "learning_rate": 8.39757515602612e-07, + "loss": 0.328, + "step": 1473 + }, + { + "epoch": 0.8094453596924767, + "grad_norm": 0.4725255648775804, + "learning_rate": 8.395443731796571e-07, + "loss": 0.2434, + "step": 1474 + }, + { + "epoch": 0.8099945085118067, + "grad_norm": 0.6265059211400055, + "learning_rate": 8.393311161898585e-07, + "loss": 0.3046, + "step": 1475 + }, + { + "epoch": 0.8105436573311368, + "grad_norm": 0.5923430603070216, + "learning_rate": 8.391177447051829e-07, + "loss": 0.32, + "step": 1476 + }, + { + "epoch": 0.8110928061504667, + "grad_norm": 0.5038809553361293, + "learning_rate": 8.389042587976352e-07, + "loss": 0.2822, + "step": 1477 + }, + { + "epoch": 0.8116419549697969, + "grad_norm": 0.47485516633956404, + "learning_rate": 8.386906585392588e-07, + "loss": 0.2382, + "step": 1478 + }, + { + "epoch": 0.8121911037891268, + "grad_norm": 0.5509063112758457, + "learning_rate": 8.38476944002136e-07, + "loss": 0.295, + "step": 1479 + }, + { + "epoch": 0.8127402526084568, + "grad_norm": 0.43549305118057363, + "learning_rate": 8.382631152583877e-07, + "loss": 0.2362, + "step": 1480 + }, + { + "epoch": 0.8132894014277869, + "grad_norm": 0.6980314603753898, + "learning_rate": 8.380491723801735e-07, + "loss": 0.2978, + "step": 1481 + }, + { + "epoch": 0.8138385502471169, + "grad_norm": 0.6456071749184628, + "learning_rate": 8.378351154396906e-07, + "loss": 0.2637, + "step": 1482 + }, + { + "epoch": 0.814387699066447, + "grad_norm": 0.39628775126095983, + "learning_rate": 8.37620944509176e-07, + "loss": 0.2484, + "step": 1483 + }, + { + "epoch": 0.814936847885777, + "grad_norm": 0.41786867787223875, + "learning_rate": 8.37406659660904e-07, + "loss": 0.26, + "step": 1484 + }, + { + "epoch": 0.8154859967051071, + "grad_norm": 0.4721240881385976, + "learning_rate": 8.371922609671877e-07, + "loss": 0.2651, + "step": 1485 + }, + { + "epoch": 0.8160351455244371, + "grad_norm": 0.6565824998450281, + "learning_rate": 8.369777485003795e-07, + "loss": 0.2691, + "step": 1486 + }, + { + "epoch": 0.8165842943437671, + "grad_norm": 0.37212098655661985, + "learning_rate": 8.367631223328688e-07, + "loss": 0.2632, + "step": 1487 + }, + { + "epoch": 0.8171334431630972, + "grad_norm": 0.5793290484177135, + "learning_rate": 8.365483825370843e-07, + "loss": 0.2453, + "step": 1488 + }, + { + "epoch": 0.8176825919824272, + "grad_norm": 0.46560796272313215, + "learning_rate": 8.363335291854928e-07, + "loss": 0.2899, + "step": 1489 + }, + { + "epoch": 0.8182317408017573, + "grad_norm": 0.5249878987308157, + "learning_rate": 8.361185623505993e-07, + "loss": 0.2969, + "step": 1490 + }, + { + "epoch": 0.8187808896210873, + "grad_norm": 0.5477312781998844, + "learning_rate": 8.359034821049471e-07, + "loss": 0.2703, + "step": 1491 + }, + { + "epoch": 0.8193300384404174, + "grad_norm": 0.4229478399076709, + "learning_rate": 8.356882885211179e-07, + "loss": 0.2568, + "step": 1492 + }, + { + "epoch": 0.8198791872597474, + "grad_norm": 0.3989114869226699, + "learning_rate": 8.354729816717319e-07, + "loss": 0.3068, + "step": 1493 + }, + { + "epoch": 0.8204283360790774, + "grad_norm": 0.3893644153432786, + "learning_rate": 8.352575616294467e-07, + "loss": 0.2478, + "step": 1494 + }, + { + "epoch": 0.8209774848984075, + "grad_norm": 0.625194027542938, + "learning_rate": 8.350420284669591e-07, + "loss": 0.3414, + "step": 1495 + }, + { + "epoch": 0.8215266337177375, + "grad_norm": 0.4856517126022426, + "learning_rate": 8.348263822570034e-07, + "loss": 0.2629, + "step": 1496 + }, + { + "epoch": 0.8220757825370676, + "grad_norm": 0.567719728166833, + "learning_rate": 8.346106230723523e-07, + "loss": 0.2626, + "step": 1497 + }, + { + "epoch": 0.8226249313563976, + "grad_norm": 0.46270203184098035, + "learning_rate": 8.343947509858166e-07, + "loss": 0.2897, + "step": 1498 + }, + { + "epoch": 0.8231740801757276, + "grad_norm": 0.6255026255765539, + "learning_rate": 8.341787660702448e-07, + "loss": 0.278, + "step": 1499 + }, + { + "epoch": 0.8237232289950577, + "grad_norm": 0.5472470169073899, + "learning_rate": 8.339626683985244e-07, + "loss": 0.261, + "step": 1500 + }, + { + "epoch": 0.8242723778143877, + "grad_norm": 0.43252569961690646, + "learning_rate": 8.337464580435802e-07, + "loss": 0.3564, + "step": 1501 + }, + { + "epoch": 0.8248215266337178, + "grad_norm": 0.4634700708502407, + "learning_rate": 8.335301350783752e-07, + "loss": 0.2852, + "step": 1502 + }, + { + "epoch": 0.8253706754530478, + "grad_norm": 0.4341353948067242, + "learning_rate": 8.333136995759105e-07, + "loss": 0.2372, + "step": 1503 + }, + { + "epoch": 0.8259198242723779, + "grad_norm": 0.45988440402985054, + "learning_rate": 8.330971516092249e-07, + "loss": 0.2538, + "step": 1504 + }, + { + "epoch": 0.8264689730917079, + "grad_norm": 0.41219672711224953, + "learning_rate": 8.328804912513956e-07, + "loss": 0.2821, + "step": 1505 + }, + { + "epoch": 0.8270181219110379, + "grad_norm": 0.4304928286185643, + "learning_rate": 8.326637185755373e-07, + "loss": 0.2555, + "step": 1506 + }, + { + "epoch": 0.827567270730368, + "grad_norm": 0.4690696985243173, + "learning_rate": 8.32446833654803e-07, + "loss": 0.2904, + "step": 1507 + }, + { + "epoch": 0.828116419549698, + "grad_norm": 0.5449421987324699, + "learning_rate": 8.322298365623833e-07, + "loss": 0.3081, + "step": 1508 + }, + { + "epoch": 0.828665568369028, + "grad_norm": 0.6906409481835247, + "learning_rate": 8.320127273715065e-07, + "loss": 0.3215, + "step": 1509 + }, + { + "epoch": 0.829214717188358, + "grad_norm": 0.4849914970257505, + "learning_rate": 8.317955061554393e-07, + "loss": 0.2745, + "step": 1510 + }, + { + "epoch": 0.829763866007688, + "grad_norm": 0.4765773804767519, + "learning_rate": 8.315781729874855e-07, + "loss": 0.3207, + "step": 1511 + }, + { + "epoch": 0.8303130148270181, + "grad_norm": 0.474877432271796, + "learning_rate": 8.313607279409874e-07, + "loss": 0.3053, + "step": 1512 + }, + { + "epoch": 0.8308621636463481, + "grad_norm": 0.6175212247304712, + "learning_rate": 8.311431710893244e-07, + "loss": 0.2984, + "step": 1513 + }, + { + "epoch": 0.8314113124656782, + "grad_norm": 0.5306669975291177, + "learning_rate": 8.309255025059141e-07, + "loss": 0.2681, + "step": 1514 + }, + { + "epoch": 0.8319604612850082, + "grad_norm": 0.547468286168826, + "learning_rate": 8.307077222642117e-07, + "loss": 0.28, + "step": 1515 + }, + { + "epoch": 0.8325096101043383, + "grad_norm": 0.5679207644320915, + "learning_rate": 8.304898304377098e-07, + "loss": 0.2703, + "step": 1516 + }, + { + "epoch": 0.8330587589236683, + "grad_norm": 0.48076553384995413, + "learning_rate": 8.302718270999388e-07, + "loss": 0.2802, + "step": 1517 + }, + { + "epoch": 0.8336079077429983, + "grad_norm": 0.4553089428815726, + "learning_rate": 8.300537123244671e-07, + "loss": 0.2929, + "step": 1518 + }, + { + "epoch": 0.8341570565623284, + "grad_norm": 0.46295785861732336, + "learning_rate": 8.298354861849003e-07, + "loss": 0.2609, + "step": 1519 + }, + { + "epoch": 0.8347062053816584, + "grad_norm": 0.5048380294612951, + "learning_rate": 8.296171487548814e-07, + "loss": 0.2666, + "step": 1520 + }, + { + "epoch": 0.8352553542009885, + "grad_norm": 0.4741662351274767, + "learning_rate": 8.293987001080917e-07, + "loss": 0.2533, + "step": 1521 + }, + { + "epoch": 0.8358045030203185, + "grad_norm": 0.7822656827612668, + "learning_rate": 8.291801403182492e-07, + "loss": 0.2639, + "step": 1522 + }, + { + "epoch": 0.8363536518396485, + "grad_norm": 0.47538701149401524, + "learning_rate": 8.2896146945911e-07, + "loss": 0.2416, + "step": 1523 + }, + { + "epoch": 0.8369028006589786, + "grad_norm": 0.431756000720763, + "learning_rate": 8.287426876044673e-07, + "loss": 0.2811, + "step": 1524 + }, + { + "epoch": 0.8374519494783086, + "grad_norm": 0.40932839690669437, + "learning_rate": 8.28523794828152e-07, + "loss": 0.2658, + "step": 1525 + }, + { + "epoch": 0.8380010982976387, + "grad_norm": 0.6878260459206037, + "learning_rate": 8.283047912040322e-07, + "loss": 0.2394, + "step": 1526 + }, + { + "epoch": 0.8385502471169687, + "grad_norm": 0.7192636381792953, + "learning_rate": 8.280856768060138e-07, + "loss": 0.3324, + "step": 1527 + }, + { + "epoch": 0.8390993959362988, + "grad_norm": 0.4480962450712991, + "learning_rate": 8.278664517080397e-07, + "loss": 0.2938, + "step": 1528 + }, + { + "epoch": 0.8396485447556288, + "grad_norm": 0.6394549253969104, + "learning_rate": 8.276471159840903e-07, + "loss": 0.3068, + "step": 1529 + }, + { + "epoch": 0.8401976935749588, + "grad_norm": 0.5232886697449434, + "learning_rate": 8.274276697081837e-07, + "loss": 0.306, + "step": 1530 + }, + { + "epoch": 0.8407468423942889, + "grad_norm": 0.5367383002514323, + "learning_rate": 8.27208112954374e-07, + "loss": 0.281, + "step": 1531 + }, + { + "epoch": 0.8412959912136189, + "grad_norm": 0.5399608090313538, + "learning_rate": 8.269884457967544e-07, + "loss": 0.2736, + "step": 1532 + }, + { + "epoch": 0.841845140032949, + "grad_norm": 0.5448170871645851, + "learning_rate": 8.267686683094542e-07, + "loss": 0.2645, + "step": 1533 + }, + { + "epoch": 0.842394288852279, + "grad_norm": 0.4770975157211897, + "learning_rate": 8.265487805666401e-07, + "loss": 0.2702, + "step": 1534 + }, + { + "epoch": 0.842943437671609, + "grad_norm": 0.41030774560154143, + "learning_rate": 8.263287826425163e-07, + "loss": 0.3042, + "step": 1535 + }, + { + "epoch": 0.8434925864909391, + "grad_norm": 0.4883096396011661, + "learning_rate": 8.261086746113236e-07, + "loss": 0.2332, + "step": 1536 + }, + { + "epoch": 0.844041735310269, + "grad_norm": 0.3991964499511617, + "learning_rate": 8.258884565473409e-07, + "loss": 0.2564, + "step": 1537 + }, + { + "epoch": 0.8445908841295992, + "grad_norm": 0.47163009935174277, + "learning_rate": 8.256681285248832e-07, + "loss": 0.2568, + "step": 1538 + }, + { + "epoch": 0.8451400329489291, + "grad_norm": 0.5429149138226584, + "learning_rate": 8.254476906183034e-07, + "loss": 0.2817, + "step": 1539 + }, + { + "epoch": 0.8456891817682592, + "grad_norm": 0.4356510363035574, + "learning_rate": 8.252271429019911e-07, + "loss": 0.2688, + "step": 1540 + }, + { + "epoch": 0.8462383305875892, + "grad_norm": 0.439999350761338, + "learning_rate": 8.250064854503731e-07, + "loss": 0.2795, + "step": 1541 + }, + { + "epoch": 0.8467874794069192, + "grad_norm": 0.44638633016895746, + "learning_rate": 8.247857183379129e-07, + "loss": 0.2866, + "step": 1542 + }, + { + "epoch": 0.8473366282262493, + "grad_norm": 0.45874440539186045, + "learning_rate": 8.245648416391115e-07, + "loss": 0.2513, + "step": 1543 + }, + { + "epoch": 0.8478857770455793, + "grad_norm": 0.4942048701143511, + "learning_rate": 8.243438554285066e-07, + "loss": 0.2914, + "step": 1544 + }, + { + "epoch": 0.8484349258649094, + "grad_norm": 0.6485254391068528, + "learning_rate": 8.241227597806729e-07, + "loss": 0.2864, + "step": 1545 + }, + { + "epoch": 0.8489840746842394, + "grad_norm": 0.44552925725187204, + "learning_rate": 8.239015547702221e-07, + "loss": 0.2774, + "step": 1546 + }, + { + "epoch": 0.8495332235035694, + "grad_norm": 0.4177584816686149, + "learning_rate": 8.236802404718024e-07, + "loss": 0.2588, + "step": 1547 + }, + { + "epoch": 0.8500823723228995, + "grad_norm": 0.5156423265438955, + "learning_rate": 8.234588169600996e-07, + "loss": 0.3068, + "step": 1548 + }, + { + "epoch": 0.8506315211422295, + "grad_norm": 0.5431465932616798, + "learning_rate": 8.232372843098359e-07, + "loss": 0.2764, + "step": 1549 + }, + { + "epoch": 0.8511806699615596, + "grad_norm": 0.6826306710079628, + "learning_rate": 8.230156425957702e-07, + "loss": 0.3305, + "step": 1550 + }, + { + "epoch": 0.8517298187808896, + "grad_norm": 0.4120972365696, + "learning_rate": 8.227938918926989e-07, + "loss": 0.2594, + "step": 1551 + }, + { + "epoch": 0.8522789676002197, + "grad_norm": 0.7054920171237341, + "learning_rate": 8.225720322754542e-07, + "loss": 0.2554, + "step": 1552 + }, + { + "epoch": 0.8528281164195497, + "grad_norm": 0.39428553771727576, + "learning_rate": 8.223500638189058e-07, + "loss": 0.2572, + "step": 1553 + }, + { + "epoch": 0.8533772652388797, + "grad_norm": 0.45096872189736265, + "learning_rate": 8.221279865979597e-07, + "loss": 0.282, + "step": 1554 + }, + { + "epoch": 0.8539264140582098, + "grad_norm": 0.5198560645426167, + "learning_rate": 8.21905800687559e-07, + "loss": 0.2628, + "step": 1555 + }, + { + "epoch": 0.8544755628775398, + "grad_norm": 0.5615601958452272, + "learning_rate": 8.21683506162683e-07, + "loss": 0.2633, + "step": 1556 + }, + { + "epoch": 0.8550247116968699, + "grad_norm": 0.4267297415548566, + "learning_rate": 8.214611030983483e-07, + "loss": 0.2517, + "step": 1557 + }, + { + "epoch": 0.8555738605161999, + "grad_norm": 0.5119102447203505, + "learning_rate": 8.212385915696072e-07, + "loss": 0.2737, + "step": 1558 + }, + { + "epoch": 0.85612300933553, + "grad_norm": 0.4772402963106916, + "learning_rate": 8.210159716515495e-07, + "loss": 0.2641, + "step": 1559 + }, + { + "epoch": 0.85667215815486, + "grad_norm": 0.4723282602362927, + "learning_rate": 8.207932434193012e-07, + "loss": 0.2744, + "step": 1560 + }, + { + "epoch": 0.85722130697419, + "grad_norm": 0.5247984990965343, + "learning_rate": 8.205704069480249e-07, + "loss": 0.2976, + "step": 1561 + }, + { + "epoch": 0.8577704557935201, + "grad_norm": 0.434901082740237, + "learning_rate": 8.203474623129195e-07, + "loss": 0.2678, + "step": 1562 + }, + { + "epoch": 0.8583196046128501, + "grad_norm": 0.4559973411833468, + "learning_rate": 8.201244095892209e-07, + "loss": 0.2688, + "step": 1563 + }, + { + "epoch": 0.8588687534321802, + "grad_norm": 0.4231064043390817, + "learning_rate": 8.199012488522009e-07, + "loss": 0.245, + "step": 1564 + }, + { + "epoch": 0.8594179022515102, + "grad_norm": 0.47218772093751676, + "learning_rate": 8.196779801771681e-07, + "loss": 0.2673, + "step": 1565 + }, + { + "epoch": 0.8599670510708401, + "grad_norm": 0.5520503565540403, + "learning_rate": 8.194546036394674e-07, + "loss": 0.2789, + "step": 1566 + }, + { + "epoch": 0.8605161998901703, + "grad_norm": 0.37498723691525676, + "learning_rate": 8.192311193144804e-07, + "loss": 0.2733, + "step": 1567 + }, + { + "epoch": 0.8610653487095002, + "grad_norm": 0.5149189703656774, + "learning_rate": 8.190075272776248e-07, + "loss": 0.277, + "step": 1568 + }, + { + "epoch": 0.8616144975288303, + "grad_norm": 0.45090773049821525, + "learning_rate": 8.187838276043543e-07, + "loss": 0.2713, + "step": 1569 + }, + { + "epoch": 0.8621636463481603, + "grad_norm": 0.4617050207480255, + "learning_rate": 8.185600203701596e-07, + "loss": 0.2529, + "step": 1570 + }, + { + "epoch": 0.8627127951674904, + "grad_norm": 0.43678266883374833, + "learning_rate": 8.183361056505673e-07, + "loss": 0.2147, + "step": 1571 + }, + { + "epoch": 0.8632619439868204, + "grad_norm": 0.6051509419557817, + "learning_rate": 8.181120835211405e-07, + "loss": 0.3373, + "step": 1572 + }, + { + "epoch": 0.8638110928061504, + "grad_norm": 0.5152957267725544, + "learning_rate": 8.178879540574782e-07, + "loss": 0.264, + "step": 1573 + }, + { + "epoch": 0.8643602416254805, + "grad_norm": 0.4642906848030533, + "learning_rate": 8.176637173352161e-07, + "loss": 0.2868, + "step": 1574 + }, + { + "epoch": 0.8649093904448105, + "grad_norm": 0.40258635278783617, + "learning_rate": 8.174393734300257e-07, + "loss": 0.3055, + "step": 1575 + }, + { + "epoch": 0.8654585392641406, + "grad_norm": 0.48691420533764, + "learning_rate": 8.172149224176146e-07, + "loss": 0.2804, + "step": 1576 + }, + { + "epoch": 0.8660076880834706, + "grad_norm": 0.4105792581508396, + "learning_rate": 8.169903643737269e-07, + "loss": 0.2602, + "step": 1577 + }, + { + "epoch": 0.8665568369028006, + "grad_norm": 0.3806674382835567, + "learning_rate": 8.167656993741429e-07, + "loss": 0.2679, + "step": 1578 + }, + { + "epoch": 0.8671059857221307, + "grad_norm": 0.5759802343499267, + "learning_rate": 8.165409274946785e-07, + "loss": 0.2872, + "step": 1579 + }, + { + "epoch": 0.8676551345414607, + "grad_norm": 0.41258408078017383, + "learning_rate": 8.16316048811186e-07, + "loss": 0.2541, + "step": 1580 + }, + { + "epoch": 0.8682042833607908, + "grad_norm": 0.5494199768508434, + "learning_rate": 8.160910633995537e-07, + "loss": 0.2628, + "step": 1581 + }, + { + "epoch": 0.8687534321801208, + "grad_norm": 0.4842256746977313, + "learning_rate": 8.158659713357057e-07, + "loss": 0.2945, + "step": 1582 + }, + { + "epoch": 0.8693025809994509, + "grad_norm": 0.5161051509926529, + "learning_rate": 8.156407726956027e-07, + "loss": 0.3009, + "step": 1583 + }, + { + "epoch": 0.8698517298187809, + "grad_norm": 0.6009319973252065, + "learning_rate": 8.154154675552405e-07, + "loss": 0.2802, + "step": 1584 + }, + { + "epoch": 0.8704008786381109, + "grad_norm": 0.5254518705648258, + "learning_rate": 8.151900559906515e-07, + "loss": 0.2651, + "step": 1585 + }, + { + "epoch": 0.870950027457441, + "grad_norm": 0.4269060890893298, + "learning_rate": 8.149645380779037e-07, + "loss": 0.2699, + "step": 1586 + }, + { + "epoch": 0.871499176276771, + "grad_norm": 0.5526083810148167, + "learning_rate": 8.147389138931011e-07, + "loss": 0.3272, + "step": 1587 + }, + { + "epoch": 0.8720483250961011, + "grad_norm": 0.4201904199832163, + "learning_rate": 8.145131835123837e-07, + "loss": 0.2719, + "step": 1588 + }, + { + "epoch": 0.8725974739154311, + "grad_norm": 0.4512924499577039, + "learning_rate": 8.14287347011927e-07, + "loss": 0.2894, + "step": 1589 + }, + { + "epoch": 0.8731466227347611, + "grad_norm": 0.5214547859034492, + "learning_rate": 8.140614044679426e-07, + "loss": 0.2892, + "step": 1590 + }, + { + "epoch": 0.8736957715540912, + "grad_norm": 0.6194074355365053, + "learning_rate": 8.138353559566779e-07, + "loss": 0.2898, + "step": 1591 + }, + { + "epoch": 0.8742449203734212, + "grad_norm": 0.5428445392822564, + "learning_rate": 8.136092015544158e-07, + "loss": 0.2935, + "step": 1592 + }, + { + "epoch": 0.8747940691927513, + "grad_norm": 0.4094756190013001, + "learning_rate": 8.133829413374749e-07, + "loss": 0.2351, + "step": 1593 + }, + { + "epoch": 0.8753432180120813, + "grad_norm": 0.669719460405006, + "learning_rate": 8.131565753822101e-07, + "loss": 0.3262, + "step": 1594 + }, + { + "epoch": 0.8758923668314114, + "grad_norm": 0.5394528055741196, + "learning_rate": 8.129301037650113e-07, + "loss": 0.2798, + "step": 1595 + }, + { + "epoch": 0.8764415156507414, + "grad_norm": 0.5481488572312055, + "learning_rate": 8.127035265623042e-07, + "loss": 0.2445, + "step": 1596 + }, + { + "epoch": 0.8769906644700713, + "grad_norm": 0.4536830212420526, + "learning_rate": 8.124768438505506e-07, + "loss": 0.254, + "step": 1597 + }, + { + "epoch": 0.8775398132894014, + "grad_norm": 0.461279718781452, + "learning_rate": 8.122500557062474e-07, + "loss": 0.2693, + "step": 1598 + }, + { + "epoch": 0.8780889621087314, + "grad_norm": 0.4768690733712926, + "learning_rate": 8.12023162205927e-07, + "loss": 0.2621, + "step": 1599 + }, + { + "epoch": 0.8786381109280615, + "grad_norm": 0.5067142684422133, + "learning_rate": 8.117961634261582e-07, + "loss": 0.2927, + "step": 1600 + }, + { + "epoch": 0.8786381109280615, + "eval_loss": 0.3504696190357208, + "eval_runtime": 18.5691, + "eval_samples_per_second": 23.857, + "eval_steps_per_second": 1.023, + "step": 1600 + }, + { + "epoch": 0.8791872597473915, + "grad_norm": 0.5304329469767416, + "learning_rate": 8.115690594435441e-07, + "loss": 0.2862, + "step": 1601 + }, + { + "epoch": 0.8797364085667215, + "grad_norm": 0.4200685970405138, + "learning_rate": 8.113418503347243e-07, + "loss": 0.219, + "step": 1602 + }, + { + "epoch": 0.8802855573860516, + "grad_norm": 0.5422488027845013, + "learning_rate": 8.111145361763734e-07, + "loss": 0.2781, + "step": 1603 + }, + { + "epoch": 0.8808347062053816, + "grad_norm": 0.5529610412765624, + "learning_rate": 8.108871170452015e-07, + "loss": 0.238, + "step": 1604 + }, + { + "epoch": 0.8813838550247117, + "grad_norm": 0.4069361697324099, + "learning_rate": 8.106595930179541e-07, + "loss": 0.2576, + "step": 1605 + }, + { + "epoch": 0.8819330038440417, + "grad_norm": 0.3989512929777735, + "learning_rate": 8.104319641714126e-07, + "loss": 0.2547, + "step": 1606 + }, + { + "epoch": 0.8824821526633718, + "grad_norm": 0.48605701233827703, + "learning_rate": 8.102042305823928e-07, + "loss": 0.2427, + "step": 1607 + }, + { + "epoch": 0.8830313014827018, + "grad_norm": 0.5582006189848949, + "learning_rate": 8.099763923277469e-07, + "loss": 0.3268, + "step": 1608 + }, + { + "epoch": 0.8835804503020318, + "grad_norm": 0.47369094368169923, + "learning_rate": 8.097484494843616e-07, + "loss": 0.2925, + "step": 1609 + }, + { + "epoch": 0.8841295991213619, + "grad_norm": 0.4348971617335604, + "learning_rate": 8.09520402129159e-07, + "loss": 0.3077, + "step": 1610 + }, + { + "epoch": 0.8846787479406919, + "grad_norm": 0.5196076776534113, + "learning_rate": 8.092922503390972e-07, + "loss": 0.2912, + "step": 1611 + }, + { + "epoch": 0.885227896760022, + "grad_norm": 0.6551376831089732, + "learning_rate": 8.090639941911689e-07, + "loss": 0.2991, + "step": 1612 + }, + { + "epoch": 0.885777045579352, + "grad_norm": 0.43678167140136764, + "learning_rate": 8.088356337624017e-07, + "loss": 0.2673, + "step": 1613 + }, + { + "epoch": 0.886326194398682, + "grad_norm": 0.6194670908851351, + "learning_rate": 8.086071691298594e-07, + "loss": 0.3134, + "step": 1614 + }, + { + "epoch": 0.8868753432180121, + "grad_norm": 0.4830895566863416, + "learning_rate": 8.083786003706402e-07, + "loss": 0.2639, + "step": 1615 + }, + { + "epoch": 0.8874244920373421, + "grad_norm": 0.38504093989091165, + "learning_rate": 8.081499275618774e-07, + "loss": 0.2847, + "step": 1616 + }, + { + "epoch": 0.8879736408566722, + "grad_norm": 0.47735204318074786, + "learning_rate": 8.079211507807399e-07, + "loss": 0.2666, + "step": 1617 + }, + { + "epoch": 0.8885227896760022, + "grad_norm": 0.4310478062500324, + "learning_rate": 8.076922701044314e-07, + "loss": 0.2595, + "step": 1618 + }, + { + "epoch": 0.8890719384953323, + "grad_norm": 0.426374728954958, + "learning_rate": 8.074632856101905e-07, + "loss": 0.2852, + "step": 1619 + }, + { + "epoch": 0.8896210873146623, + "grad_norm": 0.4297271659043643, + "learning_rate": 8.072341973752914e-07, + "loss": 0.2381, + "step": 1620 + }, + { + "epoch": 0.8901702361339923, + "grad_norm": 0.4319160129566455, + "learning_rate": 8.070050054770427e-07, + "loss": 0.2623, + "step": 1621 + }, + { + "epoch": 0.8907193849533224, + "grad_norm": 0.4582320243173229, + "learning_rate": 8.067757099927881e-07, + "loss": 0.3038, + "step": 1622 + }, + { + "epoch": 0.8912685337726524, + "grad_norm": 0.37868902040646024, + "learning_rate": 8.065463109999068e-07, + "loss": 0.2919, + "step": 1623 + }, + { + "epoch": 0.8918176825919825, + "grad_norm": 0.4519912345460703, + "learning_rate": 8.063168085758121e-07, + "loss": 0.256, + "step": 1624 + }, + { + "epoch": 0.8923668314113125, + "grad_norm": 0.49305590228844964, + "learning_rate": 8.060872027979527e-07, + "loss": 0.2513, + "step": 1625 + }, + { + "epoch": 0.8929159802306426, + "grad_norm": 0.4719066298778507, + "learning_rate": 8.058574937438123e-07, + "loss": 0.264, + "step": 1626 + }, + { + "epoch": 0.8934651290499726, + "grad_norm": 0.5336147730971229, + "learning_rate": 8.056276814909091e-07, + "loss": 0.2606, + "step": 1627 + }, + { + "epoch": 0.8940142778693025, + "grad_norm": 0.5464721736828236, + "learning_rate": 8.053977661167961e-07, + "loss": 0.2509, + "step": 1628 + }, + { + "epoch": 0.8945634266886326, + "grad_norm": 0.5118130759243972, + "learning_rate": 8.051677476990616e-07, + "loss": 0.2685, + "step": 1629 + }, + { + "epoch": 0.8951125755079626, + "grad_norm": 0.4319907093704611, + "learning_rate": 8.04937626315328e-07, + "loss": 0.2578, + "step": 1630 + }, + { + "epoch": 0.8956617243272927, + "grad_norm": 0.4372698982339803, + "learning_rate": 8.047074020432532e-07, + "loss": 0.2677, + "step": 1631 + }, + { + "epoch": 0.8962108731466227, + "grad_norm": 0.43375702875372335, + "learning_rate": 8.044770749605289e-07, + "loss": 0.2614, + "step": 1632 + }, + { + "epoch": 0.8967600219659527, + "grad_norm": 0.44766386955005877, + "learning_rate": 8.042466451448824e-07, + "loss": 0.2747, + "step": 1633 + }, + { + "epoch": 0.8973091707852828, + "grad_norm": 0.49185543197181786, + "learning_rate": 8.040161126740752e-07, + "loss": 0.2757, + "step": 1634 + }, + { + "epoch": 0.8978583196046128, + "grad_norm": 0.47647185034478784, + "learning_rate": 8.037854776259034e-07, + "loss": 0.3097, + "step": 1635 + }, + { + "epoch": 0.8984074684239429, + "grad_norm": 0.6229743998079772, + "learning_rate": 8.035547400781979e-07, + "loss": 0.2361, + "step": 1636 + }, + { + "epoch": 0.8989566172432729, + "grad_norm": 0.5905638636785615, + "learning_rate": 8.033239001088241e-07, + "loss": 0.3194, + "step": 1637 + }, + { + "epoch": 0.899505766062603, + "grad_norm": 0.456828043325492, + "learning_rate": 8.030929577956821e-07, + "loss": 0.2507, + "step": 1638 + }, + { + "epoch": 0.900054914881933, + "grad_norm": 0.5360359258999062, + "learning_rate": 8.028619132167063e-07, + "loss": 0.3045, + "step": 1639 + }, + { + "epoch": 0.900604063701263, + "grad_norm": 0.5143411214612951, + "learning_rate": 8.026307664498657e-07, + "loss": 0.2654, + "step": 1640 + }, + { + "epoch": 0.9011532125205931, + "grad_norm": 0.538470184145661, + "learning_rate": 8.023995175731638e-07, + "loss": 0.2786, + "step": 1641 + }, + { + "epoch": 0.9017023613399231, + "grad_norm": 0.4591072509050711, + "learning_rate": 8.02168166664639e-07, + "loss": 0.2398, + "step": 1642 + }, + { + "epoch": 0.9022515101592532, + "grad_norm": 0.5138710450347816, + "learning_rate": 8.01936713802363e-07, + "loss": 0.2438, + "step": 1643 + }, + { + "epoch": 0.9028006589785832, + "grad_norm": 0.4142712971863248, + "learning_rate": 8.017051590644431e-07, + "loss": 0.2776, + "step": 1644 + }, + { + "epoch": 0.9033498077979132, + "grad_norm": 0.5144798657168941, + "learning_rate": 8.014735025290202e-07, + "loss": 0.2512, + "step": 1645 + }, + { + "epoch": 0.9038989566172433, + "grad_norm": 0.4515667087998328, + "learning_rate": 8.012417442742703e-07, + "loss": 0.2444, + "step": 1646 + }, + { + "epoch": 0.9044481054365733, + "grad_norm": 0.4855148714739395, + "learning_rate": 8.010098843784028e-07, + "loss": 0.2548, + "step": 1647 + }, + { + "epoch": 0.9049972542559034, + "grad_norm": 0.47912310899290783, + "learning_rate": 8.007779229196622e-07, + "loss": 0.2629, + "step": 1648 + }, + { + "epoch": 0.9055464030752334, + "grad_norm": 0.5129694422052712, + "learning_rate": 8.005458599763267e-07, + "loss": 0.2736, + "step": 1649 + }, + { + "epoch": 0.9060955518945635, + "grad_norm": 0.43532731670312247, + "learning_rate": 8.003136956267091e-07, + "loss": 0.2608, + "step": 1650 + }, + { + "epoch": 0.9066447007138935, + "grad_norm": 0.5285884742744585, + "learning_rate": 8.000814299491565e-07, + "loss": 0.3026, + "step": 1651 + }, + { + "epoch": 0.9071938495332235, + "grad_norm": 0.516471533962854, + "learning_rate": 7.998490630220497e-07, + "loss": 0.2711, + "step": 1652 + }, + { + "epoch": 0.9077429983525536, + "grad_norm": 0.43593238158953923, + "learning_rate": 7.996165949238041e-07, + "loss": 0.2671, + "step": 1653 + }, + { + "epoch": 0.9082921471718836, + "grad_norm": 0.4467082812614068, + "learning_rate": 7.99384025732869e-07, + "loss": 0.3158, + "step": 1654 + }, + { + "epoch": 0.9088412959912137, + "grad_norm": 0.9106844683253277, + "learning_rate": 7.991513555277282e-07, + "loss": 0.4324, + "step": 1655 + }, + { + "epoch": 0.9093904448105437, + "grad_norm": 0.47262221420614964, + "learning_rate": 7.989185843868993e-07, + "loss": 0.2886, + "step": 1656 + }, + { + "epoch": 0.9099395936298736, + "grad_norm": 0.5029359625093429, + "learning_rate": 7.986857123889336e-07, + "loss": 0.2684, + "step": 1657 + }, + { + "epoch": 0.9104887424492037, + "grad_norm": 0.608385988374047, + "learning_rate": 7.984527396124174e-07, + "loss": 0.2503, + "step": 1658 + }, + { + "epoch": 0.9110378912685337, + "grad_norm": 0.4238556930706127, + "learning_rate": 7.982196661359698e-07, + "loss": 0.2746, + "step": 1659 + }, + { + "epoch": 0.9115870400878638, + "grad_norm": 0.42403113424925926, + "learning_rate": 7.979864920382449e-07, + "loss": 0.2727, + "step": 1660 + }, + { + "epoch": 0.9121361889071938, + "grad_norm": 0.439014572595048, + "learning_rate": 7.977532173979303e-07, + "loss": 0.2696, + "step": 1661 + }, + { + "epoch": 0.9126853377265239, + "grad_norm": 0.45385586171833314, + "learning_rate": 7.975198422937477e-07, + "loss": 0.2512, + "step": 1662 + }, + { + "epoch": 0.9132344865458539, + "grad_norm": 0.6168347452883322, + "learning_rate": 7.972863668044524e-07, + "loss": 0.2595, + "step": 1663 + }, + { + "epoch": 0.9137836353651839, + "grad_norm": 0.6654375343851714, + "learning_rate": 7.970527910088338e-07, + "loss": 0.2848, + "step": 1664 + }, + { + "epoch": 0.914332784184514, + "grad_norm": 0.5236618734783829, + "learning_rate": 7.968191149857152e-07, + "loss": 0.2408, + "step": 1665 + }, + { + "epoch": 0.914881933003844, + "grad_norm": 0.5161043973919437, + "learning_rate": 7.965853388139539e-07, + "loss": 0.2606, + "step": 1666 + }, + { + "epoch": 0.9154310818231741, + "grad_norm": 0.5369182393620026, + "learning_rate": 7.963514625724402e-07, + "loss": 0.3086, + "step": 1667 + }, + { + "epoch": 0.9159802306425041, + "grad_norm": 0.4409827118930272, + "learning_rate": 7.96117486340099e-07, + "loss": 0.2603, + "step": 1668 + }, + { + "epoch": 0.9165293794618341, + "grad_norm": 0.44597728661676106, + "learning_rate": 7.958834101958888e-07, + "loss": 0.2326, + "step": 1669 + }, + { + "epoch": 0.9170785282811642, + "grad_norm": 0.5629941370524331, + "learning_rate": 7.956492342188015e-07, + "loss": 0.2472, + "step": 1670 + }, + { + "epoch": 0.9176276771004942, + "grad_norm": 0.4062052252940199, + "learning_rate": 7.954149584878628e-07, + "loss": 0.2827, + "step": 1671 + }, + { + "epoch": 0.9181768259198243, + "grad_norm": 0.6324067401399381, + "learning_rate": 7.951805830821323e-07, + "loss": 0.2762, + "step": 1672 + }, + { + "epoch": 0.9187259747391543, + "grad_norm": 0.49222416226976573, + "learning_rate": 7.94946108080703e-07, + "loss": 0.2779, + "step": 1673 + }, + { + "epoch": 0.9192751235584844, + "grad_norm": 0.5447677537696793, + "learning_rate": 7.947115335627017e-07, + "loss": 0.259, + "step": 1674 + }, + { + "epoch": 0.9198242723778144, + "grad_norm": 0.7217111109642705, + "learning_rate": 7.944768596072884e-07, + "loss": 0.3151, + "step": 1675 + }, + { + "epoch": 0.9203734211971444, + "grad_norm": 0.45892564156643595, + "learning_rate": 7.942420862936569e-07, + "loss": 0.2401, + "step": 1676 + }, + { + "epoch": 0.9209225700164745, + "grad_norm": 0.43348993272683223, + "learning_rate": 7.940072137010348e-07, + "loss": 0.2601, + "step": 1677 + }, + { + "epoch": 0.9214717188358045, + "grad_norm": 0.4819214666969866, + "learning_rate": 7.937722419086829e-07, + "loss": 0.273, + "step": 1678 + }, + { + "epoch": 0.9220208676551346, + "grad_norm": 0.5343198814948842, + "learning_rate": 7.935371709958953e-07, + "loss": 0.2921, + "step": 1679 + }, + { + "epoch": 0.9225700164744646, + "grad_norm": 0.434012804711229, + "learning_rate": 7.933020010420001e-07, + "loss": 0.3039, + "step": 1680 + }, + { + "epoch": 0.9231191652937946, + "grad_norm": 0.4982008110946288, + "learning_rate": 7.930667321263583e-07, + "loss": 0.3365, + "step": 1681 + }, + { + "epoch": 0.9236683141131247, + "grad_norm": 0.4736340382316756, + "learning_rate": 7.928313643283644e-07, + "loss": 0.2361, + "step": 1682 + }, + { + "epoch": 0.9242174629324547, + "grad_norm": 0.5454444242360594, + "learning_rate": 7.925958977274464e-07, + "loss": 0.2323, + "step": 1683 + }, + { + "epoch": 0.9247666117517848, + "grad_norm": 0.47791707385936805, + "learning_rate": 7.923603324030658e-07, + "loss": 0.2589, + "step": 1684 + }, + { + "epoch": 0.9253157605711148, + "grad_norm": 0.5211350113278819, + "learning_rate": 7.92124668434717e-07, + "loss": 0.3068, + "step": 1685 + }, + { + "epoch": 0.9258649093904449, + "grad_norm": 0.4796579128979127, + "learning_rate": 7.918889059019283e-07, + "loss": 0.2769, + "step": 1686 + }, + { + "epoch": 0.9264140582097748, + "grad_norm": 0.49232718745012527, + "learning_rate": 7.916530448842604e-07, + "loss": 0.2439, + "step": 1687 + }, + { + "epoch": 0.9269632070291048, + "grad_norm": 0.4558609718678011, + "learning_rate": 7.914170854613076e-07, + "loss": 0.2695, + "step": 1688 + }, + { + "epoch": 0.9275123558484349, + "grad_norm": 0.4682236398006378, + "learning_rate": 7.911810277126981e-07, + "loss": 0.3055, + "step": 1689 + }, + { + "epoch": 0.9280615046677649, + "grad_norm": 0.5488883413713245, + "learning_rate": 7.909448717180924e-07, + "loss": 0.2791, + "step": 1690 + }, + { + "epoch": 0.928610653487095, + "grad_norm": 0.6152742935623089, + "learning_rate": 7.907086175571841e-07, + "loss": 0.3109, + "step": 1691 + }, + { + "epoch": 0.929159802306425, + "grad_norm": 0.47479565309022026, + "learning_rate": 7.90472265309701e-07, + "loss": 0.2983, + "step": 1692 + }, + { + "epoch": 0.9297089511257551, + "grad_norm": 0.49437316447878904, + "learning_rate": 7.902358150554027e-07, + "loss": 0.246, + "step": 1693 + }, + { + "epoch": 0.9302580999450851, + "grad_norm": 0.5376412305321454, + "learning_rate": 7.899992668740826e-07, + "loss": 0.3165, + "step": 1694 + }, + { + "epoch": 0.9308072487644151, + "grad_norm": 0.5358008268830613, + "learning_rate": 7.89762620845567e-07, + "loss": 0.2645, + "step": 1695 + }, + { + "epoch": 0.9313563975837452, + "grad_norm": 0.45118466846144795, + "learning_rate": 7.895258770497154e-07, + "loss": 0.2682, + "step": 1696 + }, + { + "epoch": 0.9319055464030752, + "grad_norm": 0.5208255452849432, + "learning_rate": 7.892890355664199e-07, + "loss": 0.2207, + "step": 1697 + }, + { + "epoch": 0.9324546952224053, + "grad_norm": 0.4731153441278961, + "learning_rate": 7.890520964756058e-07, + "loss": 0.2527, + "step": 1698 + }, + { + "epoch": 0.9330038440417353, + "grad_norm": 0.4575028765301887, + "learning_rate": 7.888150598572311e-07, + "loss": 0.2402, + "step": 1699 + }, + { + "epoch": 0.9335529928610653, + "grad_norm": 0.47803200976429167, + "learning_rate": 7.885779257912876e-07, + "loss": 0.2648, + "step": 1700 + }, + { + "epoch": 0.9341021416803954, + "grad_norm": 0.5246748879091071, + "learning_rate": 7.883406943577985e-07, + "loss": 0.2737, + "step": 1701 + }, + { + "epoch": 0.9346512904997254, + "grad_norm": 0.5588355495947951, + "learning_rate": 7.881033656368212e-07, + "loss": 0.252, + "step": 1702 + }, + { + "epoch": 0.9352004393190555, + "grad_norm": 0.5183516008396057, + "learning_rate": 7.878659397084453e-07, + "loss": 0.3132, + "step": 1703 + }, + { + "epoch": 0.9357495881383855, + "grad_norm": 0.4042180592091759, + "learning_rate": 7.876284166527931e-07, + "loss": 0.2299, + "step": 1704 + }, + { + "epoch": 0.9362987369577156, + "grad_norm": 0.6813981873075216, + "learning_rate": 7.873907965500201e-07, + "loss": 0.2534, + "step": 1705 + }, + { + "epoch": 0.9368478857770456, + "grad_norm": 0.4490906623651215, + "learning_rate": 7.871530794803144e-07, + "loss": 0.2608, + "step": 1706 + }, + { + "epoch": 0.9373970345963756, + "grad_norm": 0.41357000685914597, + "learning_rate": 7.869152655238965e-07, + "loss": 0.2657, + "step": 1707 + }, + { + "epoch": 0.9379461834157057, + "grad_norm": 0.5447434118355732, + "learning_rate": 7.8667735476102e-07, + "loss": 0.2722, + "step": 1708 + }, + { + "epoch": 0.9384953322350357, + "grad_norm": 0.4530362436525533, + "learning_rate": 7.86439347271971e-07, + "loss": 0.2605, + "step": 1709 + }, + { + "epoch": 0.9390444810543658, + "grad_norm": 0.491906698727782, + "learning_rate": 7.862012431370681e-07, + "loss": 0.2476, + "step": 1710 + }, + { + "epoch": 0.9395936298736958, + "grad_norm": 0.39280246518759476, + "learning_rate": 7.859630424366628e-07, + "loss": 0.2809, + "step": 1711 + }, + { + "epoch": 0.9401427786930258, + "grad_norm": 0.4930803786947981, + "learning_rate": 7.857247452511393e-07, + "loss": 0.2702, + "step": 1712 + }, + { + "epoch": 0.9406919275123559, + "grad_norm": 0.5755809098505037, + "learning_rate": 7.854863516609137e-07, + "loss": 0.2313, + "step": 1713 + }, + { + "epoch": 0.9412410763316859, + "grad_norm": 0.5049170443148283, + "learning_rate": 7.852478617464354e-07, + "loss": 0.2807, + "step": 1714 + }, + { + "epoch": 0.941790225151016, + "grad_norm": 0.4925496057542274, + "learning_rate": 7.850092755881855e-07, + "loss": 0.2496, + "step": 1715 + }, + { + "epoch": 0.942339373970346, + "grad_norm": 0.5374667297046075, + "learning_rate": 7.847705932666786e-07, + "loss": 0.2715, + "step": 1716 + }, + { + "epoch": 0.942888522789676, + "grad_norm": 0.4641633013192042, + "learning_rate": 7.84531814862461e-07, + "loss": 0.3116, + "step": 1717 + }, + { + "epoch": 0.943437671609006, + "grad_norm": 0.5050485118505725, + "learning_rate": 7.842929404561114e-07, + "loss": 0.2595, + "step": 1718 + }, + { + "epoch": 0.943986820428336, + "grad_norm": 0.5580960097482179, + "learning_rate": 7.840539701282412e-07, + "loss": 0.2631, + "step": 1719 + }, + { + "epoch": 0.9445359692476661, + "grad_norm": 0.5289379181835928, + "learning_rate": 7.838149039594943e-07, + "loss": 0.2737, + "step": 1720 + }, + { + "epoch": 0.9450851180669961, + "grad_norm": 0.5371059822933462, + "learning_rate": 7.835757420305465e-07, + "loss": 0.3384, + "step": 1721 + }, + { + "epoch": 0.9456342668863262, + "grad_norm": 0.40728473281221633, + "learning_rate": 7.833364844221065e-07, + "loss": 0.2973, + "step": 1722 + }, + { + "epoch": 0.9461834157056562, + "grad_norm": 0.4919831080277447, + "learning_rate": 7.830971312149143e-07, + "loss": 0.2784, + "step": 1723 + }, + { + "epoch": 0.9467325645249862, + "grad_norm": 0.4836977422329483, + "learning_rate": 7.828576824897431e-07, + "loss": 0.2543, + "step": 1724 + }, + { + "epoch": 0.9472817133443163, + "grad_norm": 0.437616976352026, + "learning_rate": 7.826181383273982e-07, + "loss": 0.2612, + "step": 1725 + }, + { + "epoch": 0.9478308621636463, + "grad_norm": 0.45875893844486315, + "learning_rate": 7.823784988087166e-07, + "loss": 0.2609, + "step": 1726 + }, + { + "epoch": 0.9483800109829764, + "grad_norm": 0.5033621962179399, + "learning_rate": 7.821387640145682e-07, + "loss": 0.2758, + "step": 1727 + }, + { + "epoch": 0.9489291598023064, + "grad_norm": 0.4257649240334981, + "learning_rate": 7.818989340258543e-07, + "loss": 0.2538, + "step": 1728 + }, + { + "epoch": 0.9494783086216365, + "grad_norm": 0.42844787389189803, + "learning_rate": 7.81659008923509e-07, + "loss": 0.2584, + "step": 1729 + }, + { + "epoch": 0.9500274574409665, + "grad_norm": 0.5846160464916133, + "learning_rate": 7.81418988788498e-07, + "loss": 0.27, + "step": 1730 + }, + { + "epoch": 0.9505766062602965, + "grad_norm": 0.43208504923518043, + "learning_rate": 7.811788737018192e-07, + "loss": 0.2354, + "step": 1731 + }, + { + "epoch": 0.9511257550796266, + "grad_norm": 0.44715498190261227, + "learning_rate": 7.809386637445025e-07, + "loss": 0.2667, + "step": 1732 + }, + { + "epoch": 0.9516749038989566, + "grad_norm": 0.47207132803701624, + "learning_rate": 7.806983589976103e-07, + "loss": 0.2686, + "step": 1733 + }, + { + "epoch": 0.9522240527182867, + "grad_norm": 0.4787218670132171, + "learning_rate": 7.804579595422362e-07, + "loss": 0.2435, + "step": 1734 + }, + { + "epoch": 0.9527732015376167, + "grad_norm": 0.45628303919484353, + "learning_rate": 7.802174654595065e-07, + "loss": 0.2728, + "step": 1735 + }, + { + "epoch": 0.9533223503569467, + "grad_norm": 0.38018083744914055, + "learning_rate": 7.799768768305789e-07, + "loss": 0.2429, + "step": 1736 + }, + { + "epoch": 0.9538714991762768, + "grad_norm": 0.5112013260262124, + "learning_rate": 7.797361937366432e-07, + "loss": 0.2694, + "step": 1737 + }, + { + "epoch": 0.9544206479956068, + "grad_norm": 0.45696805500952237, + "learning_rate": 7.79495416258921e-07, + "loss": 0.2677, + "step": 1738 + }, + { + "epoch": 0.9549697968149369, + "grad_norm": 0.4835116137567483, + "learning_rate": 7.792545444786661e-07, + "loss": 0.2727, + "step": 1739 + }, + { + "epoch": 0.9555189456342669, + "grad_norm": 0.4496758064298272, + "learning_rate": 7.790135784771637e-07, + "loss": 0.2414, + "step": 1740 + }, + { + "epoch": 0.956068094453597, + "grad_norm": 0.4659569699239641, + "learning_rate": 7.787725183357307e-07, + "loss": 0.2512, + "step": 1741 + }, + { + "epoch": 0.956617243272927, + "grad_norm": 0.3994140737240832, + "learning_rate": 7.785313641357166e-07, + "loss": 0.2714, + "step": 1742 + }, + { + "epoch": 0.957166392092257, + "grad_norm": 0.40864015820755256, + "learning_rate": 7.782901159585015e-07, + "loss": 0.266, + "step": 1743 + }, + { + "epoch": 0.9577155409115871, + "grad_norm": 0.5461071439054702, + "learning_rate": 7.780487738854981e-07, + "loss": 0.2685, + "step": 1744 + }, + { + "epoch": 0.958264689730917, + "grad_norm": 0.5286460933222177, + "learning_rate": 7.778073379981501e-07, + "loss": 0.2635, + "step": 1745 + }, + { + "epoch": 0.9588138385502472, + "grad_norm": 0.4810688479863782, + "learning_rate": 7.775658083779335e-07, + "loss": 0.2601, + "step": 1746 + }, + { + "epoch": 0.9593629873695771, + "grad_norm": 0.4415328360929715, + "learning_rate": 7.773241851063558e-07, + "loss": 0.2582, + "step": 1747 + }, + { + "epoch": 0.9599121361889071, + "grad_norm": 0.6042766028826041, + "learning_rate": 7.770824682649557e-07, + "loss": 0.2494, + "step": 1748 + }, + { + "epoch": 0.9604612850082372, + "grad_norm": 0.45869160117730795, + "learning_rate": 7.768406579353036e-07, + "loss": 0.2498, + "step": 1749 + }, + { + "epoch": 0.9610104338275672, + "grad_norm": 0.5918779105040332, + "learning_rate": 7.76598754199002e-07, + "loss": 0.2829, + "step": 1750 + }, + { + "epoch": 0.9615595826468973, + "grad_norm": 0.5221014159845146, + "learning_rate": 7.763567571376841e-07, + "loss": 0.2558, + "step": 1751 + }, + { + "epoch": 0.9621087314662273, + "grad_norm": 0.445451975309114, + "learning_rate": 7.761146668330152e-07, + "loss": 0.2386, + "step": 1752 + }, + { + "epoch": 0.9626578802855574, + "grad_norm": 0.42868800693247866, + "learning_rate": 7.758724833666919e-07, + "loss": 0.2292, + "step": 1753 + }, + { + "epoch": 0.9632070291048874, + "grad_norm": 0.4563793054125372, + "learning_rate": 7.75630206820442e-07, + "loss": 0.2541, + "step": 1754 + }, + { + "epoch": 0.9637561779242174, + "grad_norm": 0.44344498132545906, + "learning_rate": 7.753878372760251e-07, + "loss": 0.2683, + "step": 1755 + }, + { + "epoch": 0.9643053267435475, + "grad_norm": 0.3956813660687399, + "learning_rate": 7.751453748152318e-07, + "loss": 0.2523, + "step": 1756 + }, + { + "epoch": 0.9648544755628775, + "grad_norm": 0.4489393997313342, + "learning_rate": 7.749028195198843e-07, + "loss": 0.2468, + "step": 1757 + }, + { + "epoch": 0.9654036243822076, + "grad_norm": 0.5866565355532205, + "learning_rate": 7.746601714718362e-07, + "loss": 0.2968, + "step": 1758 + }, + { + "epoch": 0.9659527732015376, + "grad_norm": 0.4945840896004298, + "learning_rate": 7.744174307529725e-07, + "loss": 0.2575, + "step": 1759 + }, + { + "epoch": 0.9665019220208677, + "grad_norm": 0.5579849532829233, + "learning_rate": 7.741745974452088e-07, + "loss": 0.2362, + "step": 1760 + }, + { + "epoch": 0.9670510708401977, + "grad_norm": 0.4707748415892224, + "learning_rate": 7.739316716304924e-07, + "loss": 0.2783, + "step": 1761 + }, + { + "epoch": 0.9676002196595277, + "grad_norm": 0.8143075151003899, + "learning_rate": 7.736886533908019e-07, + "loss": 0.2757, + "step": 1762 + }, + { + "epoch": 0.9681493684788578, + "grad_norm": 0.43106514200214663, + "learning_rate": 7.734455428081473e-07, + "loss": 0.2901, + "step": 1763 + }, + { + "epoch": 0.9686985172981878, + "grad_norm": 0.43445368935880396, + "learning_rate": 7.732023399645692e-07, + "loss": 0.2912, + "step": 1764 + }, + { + "epoch": 0.9692476661175179, + "grad_norm": 0.40441583855768226, + "learning_rate": 7.729590449421396e-07, + "loss": 0.2796, + "step": 1765 + }, + { + "epoch": 0.9697968149368479, + "grad_norm": 0.5170178164470397, + "learning_rate": 7.727156578229616e-07, + "loss": 0.2677, + "step": 1766 + }, + { + "epoch": 0.9703459637561779, + "grad_norm": 0.5443808284667313, + "learning_rate": 7.724721786891695e-07, + "loss": 0.2853, + "step": 1767 + }, + { + "epoch": 0.970895112575508, + "grad_norm": 0.5927298437635197, + "learning_rate": 7.722286076229284e-07, + "loss": 0.283, + "step": 1768 + }, + { + "epoch": 0.971444261394838, + "grad_norm": 0.6127684542817139, + "learning_rate": 7.719849447064347e-07, + "loss": 0.2522, + "step": 1769 + }, + { + "epoch": 0.9719934102141681, + "grad_norm": 0.5315395611227436, + "learning_rate": 7.717411900219155e-07, + "loss": 0.2699, + "step": 1770 + }, + { + "epoch": 0.9725425590334981, + "grad_norm": 0.5351063404606797, + "learning_rate": 7.714973436516294e-07, + "loss": 0.2599, + "step": 1771 + }, + { + "epoch": 0.9730917078528282, + "grad_norm": 0.8519000592417668, + "learning_rate": 7.712534056778649e-07, + "loss": 0.2886, + "step": 1772 + }, + { + "epoch": 0.9736408566721582, + "grad_norm": 0.3980995553922157, + "learning_rate": 7.710093761829425e-07, + "loss": 0.2725, + "step": 1773 + }, + { + "epoch": 0.9741900054914882, + "grad_norm": 0.5201613757318241, + "learning_rate": 7.707652552492134e-07, + "loss": 0.2693, + "step": 1774 + }, + { + "epoch": 0.9747391543108183, + "grad_norm": 0.558676030612656, + "learning_rate": 7.705210429590589e-07, + "loss": 0.303, + "step": 1775 + }, + { + "epoch": 0.9752883031301482, + "grad_norm": 0.47812981544883876, + "learning_rate": 7.70276739394892e-07, + "loss": 0.2667, + "step": 1776 + }, + { + "epoch": 0.9758374519494784, + "grad_norm": 0.547770104025705, + "learning_rate": 7.70032344639156e-07, + "loss": 0.2521, + "step": 1777 + }, + { + "epoch": 0.9763866007688083, + "grad_norm": 0.48037326364239863, + "learning_rate": 7.697878587743251e-07, + "loss": 0.2281, + "step": 1778 + }, + { + "epoch": 0.9769357495881383, + "grad_norm": 0.5667793390416505, + "learning_rate": 7.695432818829046e-07, + "loss": 0.2451, + "step": 1779 + }, + { + "epoch": 0.9774848984074684, + "grad_norm": 0.42210473340749294, + "learning_rate": 7.692986140474297e-07, + "loss": 0.2484, + "step": 1780 + }, + { + "epoch": 0.9780340472267984, + "grad_norm": 0.4090286636235274, + "learning_rate": 7.69053855350467e-07, + "loss": 0.3056, + "step": 1781 + }, + { + "epoch": 0.9785831960461285, + "grad_norm": 0.4898439846013174, + "learning_rate": 7.688090058746136e-07, + "loss": 0.2671, + "step": 1782 + }, + { + "epoch": 0.9791323448654585, + "grad_norm": 0.5181603399804232, + "learning_rate": 7.685640657024973e-07, + "loss": 0.271, + "step": 1783 + }, + { + "epoch": 0.9796814936847886, + "grad_norm": 0.6101604825789101, + "learning_rate": 7.683190349167761e-07, + "loss": 0.2635, + "step": 1784 + }, + { + "epoch": 0.9802306425041186, + "grad_norm": 0.5039024071563895, + "learning_rate": 7.68073913600139e-07, + "loss": 0.2558, + "step": 1785 + }, + { + "epoch": 0.9807797913234486, + "grad_norm": 0.46516505002599134, + "learning_rate": 7.678287018353054e-07, + "loss": 0.2702, + "step": 1786 + }, + { + "epoch": 0.9813289401427787, + "grad_norm": 0.46409827859508324, + "learning_rate": 7.675833997050253e-07, + "loss": 0.2528, + "step": 1787 + }, + { + "epoch": 0.9818780889621087, + "grad_norm": 0.5400606318479371, + "learning_rate": 7.673380072920788e-07, + "loss": 0.2825, + "step": 1788 + }, + { + "epoch": 0.9824272377814388, + "grad_norm": 0.5564284746876682, + "learning_rate": 7.670925246792773e-07, + "loss": 0.2511, + "step": 1789 + }, + { + "epoch": 0.9829763866007688, + "grad_norm": 0.4923896529721891, + "learning_rate": 7.668469519494619e-07, + "loss": 0.2346, + "step": 1790 + }, + { + "epoch": 0.9835255354200988, + "grad_norm": 0.39437563269135684, + "learning_rate": 7.666012891855043e-07, + "loss": 0.2284, + "step": 1791 + }, + { + "epoch": 0.9840746842394289, + "grad_norm": 0.41906052169314323, + "learning_rate": 7.663555364703066e-07, + "loss": 0.2636, + "step": 1792 + }, + { + "epoch": 0.9846238330587589, + "grad_norm": 0.4023326714682953, + "learning_rate": 7.661096938868013e-07, + "loss": 0.2512, + "step": 1793 + }, + { + "epoch": 0.985172981878089, + "grad_norm": 0.4838286623553397, + "learning_rate": 7.658637615179516e-07, + "loss": 0.2323, + "step": 1794 + }, + { + "epoch": 0.985722130697419, + "grad_norm": 0.5575885605065329, + "learning_rate": 7.656177394467502e-07, + "loss": 0.275, + "step": 1795 + }, + { + "epoch": 0.9862712795167491, + "grad_norm": 0.4260296588110545, + "learning_rate": 7.653716277562204e-07, + "loss": 0.2806, + "step": 1796 + }, + { + "epoch": 0.9868204283360791, + "grad_norm": 0.46565438595880204, + "learning_rate": 7.651254265294163e-07, + "loss": 0.2402, + "step": 1797 + }, + { + "epoch": 0.9873695771554091, + "grad_norm": 0.4230243180213196, + "learning_rate": 7.648791358494213e-07, + "loss": 0.2655, + "step": 1798 + }, + { + "epoch": 0.9879187259747392, + "grad_norm": 0.41038079843468256, + "learning_rate": 7.646327557993495e-07, + "loss": 0.2921, + "step": 1799 + }, + { + "epoch": 0.9884678747940692, + "grad_norm": 0.4164321056674212, + "learning_rate": 7.643862864623453e-07, + "loss": 0.2521, + "step": 1800 + }, + { + "epoch": 0.9884678747940692, + "eval_loss": 0.34528762102127075, + "eval_runtime": 18.5439, + "eval_samples_per_second": 23.889, + "eval_steps_per_second": 1.025, + "step": 1800 + }, + { + "epoch": 0.9890170236133993, + "grad_norm": 0.4836515649227938, + "learning_rate": 7.641397279215829e-07, + "loss": 0.2154, + "step": 1801 + }, + { + "epoch": 0.9895661724327293, + "grad_norm": 0.46368817362105696, + "learning_rate": 7.638930802602665e-07, + "loss": 0.2586, + "step": 1802 + }, + { + "epoch": 0.9901153212520593, + "grad_norm": 0.4070155869632062, + "learning_rate": 7.636463435616312e-07, + "loss": 0.2453, + "step": 1803 + }, + { + "epoch": 0.9906644700713894, + "grad_norm": 0.45395483380394075, + "learning_rate": 7.633995179089409e-07, + "loss": 0.2608, + "step": 1804 + }, + { + "epoch": 0.9912136188907194, + "grad_norm": 0.3913694336666719, + "learning_rate": 7.631526033854905e-07, + "loss": 0.2803, + "step": 1805 + }, + { + "epoch": 0.9917627677100495, + "grad_norm": 0.6396163432276057, + "learning_rate": 7.629056000746046e-07, + "loss": 0.2946, + "step": 1806 + }, + { + "epoch": 0.9923119165293794, + "grad_norm": 0.6048936670137287, + "learning_rate": 7.626585080596375e-07, + "loss": 0.3021, + "step": 1807 + }, + { + "epoch": 0.9928610653487095, + "grad_norm": 0.5241290650324293, + "learning_rate": 7.624113274239739e-07, + "loss": 0.2267, + "step": 1808 + }, + { + "epoch": 0.9934102141680395, + "grad_norm": 0.5999421099843125, + "learning_rate": 7.621640582510277e-07, + "loss": 0.2728, + "step": 1809 + }, + { + "epoch": 0.9939593629873695, + "grad_norm": 0.5236069631135204, + "learning_rate": 7.619167006242437e-07, + "loss": 0.2887, + "step": 1810 + }, + { + "epoch": 0.9945085118066996, + "grad_norm": 0.48630894161056804, + "learning_rate": 7.616692546270956e-07, + "loss": 0.3145, + "step": 1811 + }, + { + "epoch": 0.9950576606260296, + "grad_norm": 0.4930936152558026, + "learning_rate": 7.614217203430874e-07, + "loss": 0.2494, + "step": 1812 + }, + { + "epoch": 0.9956068094453597, + "grad_norm": 0.44139600354087577, + "learning_rate": 7.611740978557531e-07, + "loss": 0.2858, + "step": 1813 + }, + { + "epoch": 0.9961559582646897, + "grad_norm": 0.4877427860404887, + "learning_rate": 7.609263872486557e-07, + "loss": 0.2584, + "step": 1814 + }, + { + "epoch": 0.9967051070840197, + "grad_norm": 0.484164544649045, + "learning_rate": 7.606785886053887e-07, + "loss": 0.2517, + "step": 1815 + }, + { + "epoch": 0.9972542559033498, + "grad_norm": 0.44687816830556487, + "learning_rate": 7.60430702009575e-07, + "loss": 0.2623, + "step": 1816 + }, + { + "epoch": 0.9978034047226798, + "grad_norm": 0.5620673692002126, + "learning_rate": 7.60182727544867e-07, + "loss": 0.2746, + "step": 1817 + }, + { + "epoch": 0.9983525535420099, + "grad_norm": 0.43170546576838636, + "learning_rate": 7.599346652949471e-07, + "loss": 0.2665, + "step": 1818 + }, + { + "epoch": 0.9989017023613399, + "grad_norm": 0.4612571656231133, + "learning_rate": 7.596865153435271e-07, + "loss": 0.2755, + "step": 1819 + }, + { + "epoch": 0.99945085118067, + "grad_norm": 0.47048451172823025, + "learning_rate": 7.594382777743487e-07, + "loss": 0.2623, + "step": 1820 + }, + { + "epoch": 1.0, + "grad_norm": 0.6197317419503009, + "learning_rate": 7.591899526711826e-07, + "loss": 0.3271, + "step": 1821 + }, + { + "epoch": 1.00054914881933, + "grad_norm": 0.41578437413931263, + "learning_rate": 7.589415401178294e-07, + "loss": 0.2675, + "step": 1822 + }, + { + "epoch": 1.00109829763866, + "grad_norm": 0.4842886147580413, + "learning_rate": 7.586930401981195e-07, + "loss": 0.2516, + "step": 1823 + }, + { + "epoch": 1.00164744645799, + "grad_norm": 0.5930650394982593, + "learning_rate": 7.584444529959122e-07, + "loss": 0.3383, + "step": 1824 + }, + { + "epoch": 1.0021965952773202, + "grad_norm": 0.43158603874350837, + "learning_rate": 7.581957785950966e-07, + "loss": 0.2677, + "step": 1825 + }, + { + "epoch": 1.0027457440966503, + "grad_norm": 0.4364266572708857, + "learning_rate": 7.579470170795911e-07, + "loss": 0.2454, + "step": 1826 + }, + { + "epoch": 1.0032948929159802, + "grad_norm": 0.43961576365943744, + "learning_rate": 7.576981685333436e-07, + "loss": 0.2319, + "step": 1827 + }, + { + "epoch": 1.0038440417353103, + "grad_norm": 0.5380930158053616, + "learning_rate": 7.574492330403313e-07, + "loss": 0.2559, + "step": 1828 + }, + { + "epoch": 1.0043931905546404, + "grad_norm": 0.52376181353432, + "learning_rate": 7.572002106845606e-07, + "loss": 0.2791, + "step": 1829 + }, + { + "epoch": 1.0049423393739703, + "grad_norm": 0.6060189145093499, + "learning_rate": 7.569511015500678e-07, + "loss": 0.2875, + "step": 1830 + }, + { + "epoch": 1.0054914881933004, + "grad_norm": 0.4892904473365606, + "learning_rate": 7.567019057209177e-07, + "loss": 0.2662, + "step": 1831 + }, + { + "epoch": 1.0060406370126305, + "grad_norm": 0.6393816945523433, + "learning_rate": 7.564526232812048e-07, + "loss": 0.32, + "step": 1832 + }, + { + "epoch": 1.0065897858319606, + "grad_norm": 0.4849396292906131, + "learning_rate": 7.562032543150527e-07, + "loss": 0.2491, + "step": 1833 + }, + { + "epoch": 1.0071389346512905, + "grad_norm": 0.5726582764023308, + "learning_rate": 7.559537989066141e-07, + "loss": 0.277, + "step": 1834 + }, + { + "epoch": 1.0076880834706206, + "grad_norm": 0.5805280895481991, + "learning_rate": 7.557042571400716e-07, + "loss": 0.2913, + "step": 1835 + }, + { + "epoch": 1.0082372322899507, + "grad_norm": 0.4523204870483219, + "learning_rate": 7.554546290996356e-07, + "loss": 0.2728, + "step": 1836 + }, + { + "epoch": 1.0087863811092805, + "grad_norm": 0.4392558372785879, + "learning_rate": 7.552049148695469e-07, + "loss": 0.2389, + "step": 1837 + }, + { + "epoch": 1.0093355299286106, + "grad_norm": 0.3978654258602959, + "learning_rate": 7.549551145340746e-07, + "loss": 0.2833, + "step": 1838 + }, + { + "epoch": 1.0098846787479407, + "grad_norm": 0.6288463716170226, + "learning_rate": 7.547052281775171e-07, + "loss": 0.2683, + "step": 1839 + }, + { + "epoch": 1.0104338275672706, + "grad_norm": 0.4363803769542127, + "learning_rate": 7.54455255884202e-07, + "loss": 0.2503, + "step": 1840 + }, + { + "epoch": 1.0109829763866007, + "grad_norm": 0.47027345127633846, + "learning_rate": 7.542051977384857e-07, + "loss": 0.2408, + "step": 1841 + }, + { + "epoch": 1.0115321252059308, + "grad_norm": 0.6418223290281864, + "learning_rate": 7.539550538247533e-07, + "loss": 0.3113, + "step": 1842 + }, + { + "epoch": 1.012081274025261, + "grad_norm": 0.5411808923595263, + "learning_rate": 7.537048242274196e-07, + "loss": 0.2474, + "step": 1843 + }, + { + "epoch": 1.0126304228445908, + "grad_norm": 0.47371677464110296, + "learning_rate": 7.534545090309276e-07, + "loss": 0.2742, + "step": 1844 + }, + { + "epoch": 1.013179571663921, + "grad_norm": 0.4140319891610881, + "learning_rate": 7.532041083197497e-07, + "loss": 0.2312, + "step": 1845 + }, + { + "epoch": 1.013728720483251, + "grad_norm": 0.47098614805721045, + "learning_rate": 7.529536221783867e-07, + "loss": 0.2358, + "step": 1846 + }, + { + "epoch": 1.014277869302581, + "grad_norm": 0.489341438823661, + "learning_rate": 7.527030506913686e-07, + "loss": 0.2526, + "step": 1847 + }, + { + "epoch": 1.014827018121911, + "grad_norm": 0.5073707910023101, + "learning_rate": 7.524523939432538e-07, + "loss": 0.2859, + "step": 1848 + }, + { + "epoch": 1.015376166941241, + "grad_norm": 0.471135300684498, + "learning_rate": 7.522016520186299e-07, + "loss": 0.2367, + "step": 1849 + }, + { + "epoch": 1.0159253157605712, + "grad_norm": 0.5408069547889525, + "learning_rate": 7.519508250021129e-07, + "loss": 0.2933, + "step": 1850 + }, + { + "epoch": 1.016474464579901, + "grad_norm": 0.4927024626190959, + "learning_rate": 7.516999129783479e-07, + "loss": 0.2839, + "step": 1851 + }, + { + "epoch": 1.0170236133992312, + "grad_norm": 0.455187961758037, + "learning_rate": 7.514489160320083e-07, + "loss": 0.2126, + "step": 1852 + }, + { + "epoch": 1.0175727622185613, + "grad_norm": 0.5709789836912843, + "learning_rate": 7.511978342477965e-07, + "loss": 0.2922, + "step": 1853 + }, + { + "epoch": 1.0181219110378912, + "grad_norm": 0.5311031354348835, + "learning_rate": 7.509466677104432e-07, + "loss": 0.2962, + "step": 1854 + }, + { + "epoch": 1.0186710598572213, + "grad_norm": 0.46834918729813446, + "learning_rate": 7.50695416504708e-07, + "loss": 0.2854, + "step": 1855 + }, + { + "epoch": 1.0192202086765514, + "grad_norm": 0.4601976267797454, + "learning_rate": 7.504440807153787e-07, + "loss": 0.2556, + "step": 1856 + }, + { + "epoch": 1.0197693574958815, + "grad_norm": 0.8922349338001272, + "learning_rate": 7.501926604272721e-07, + "loss": 0.4347, + "step": 1857 + }, + { + "epoch": 1.0203185063152114, + "grad_norm": 0.42685723914488466, + "learning_rate": 7.49941155725233e-07, + "loss": 0.2604, + "step": 1858 + }, + { + "epoch": 1.0208676551345415, + "grad_norm": 0.4826636252760197, + "learning_rate": 7.496895666941353e-07, + "loss": 0.2563, + "step": 1859 + }, + { + "epoch": 1.0214168039538716, + "grad_norm": 0.547492542573778, + "learning_rate": 7.494378934188808e-07, + "loss": 0.2679, + "step": 1860 + }, + { + "epoch": 1.0219659527732015, + "grad_norm": 0.43265999361011936, + "learning_rate": 7.491861359844e-07, + "loss": 0.2276, + "step": 1861 + }, + { + "epoch": 1.0225151015925316, + "grad_norm": 0.4875130613505053, + "learning_rate": 7.489342944756519e-07, + "loss": 0.2241, + "step": 1862 + }, + { + "epoch": 1.0230642504118617, + "grad_norm": 0.5670497970250489, + "learning_rate": 7.486823689776235e-07, + "loss": 0.2196, + "step": 1863 + }, + { + "epoch": 1.0236133992311915, + "grad_norm": 0.5044139270536077, + "learning_rate": 7.484303595753307e-07, + "loss": 0.2445, + "step": 1864 + }, + { + "epoch": 1.0241625480505216, + "grad_norm": 0.6485812641315216, + "learning_rate": 7.48178266353817e-07, + "loss": 0.2694, + "step": 1865 + }, + { + "epoch": 1.0247116968698518, + "grad_norm": 0.4818520361512355, + "learning_rate": 7.479260893981548e-07, + "loss": 0.2943, + "step": 1866 + }, + { + "epoch": 1.0252608456891819, + "grad_norm": 0.5622173081405858, + "learning_rate": 7.476738287934445e-07, + "loss": 0.2483, + "step": 1867 + }, + { + "epoch": 1.0258099945085117, + "grad_norm": 0.4978481006420446, + "learning_rate": 7.474214846248148e-07, + "loss": 0.2485, + "step": 1868 + }, + { + "epoch": 1.0263591433278418, + "grad_norm": 0.4195966967278341, + "learning_rate": 7.471690569774224e-07, + "loss": 0.2662, + "step": 1869 + }, + { + "epoch": 1.026908292147172, + "grad_norm": 0.5085901706757727, + "learning_rate": 7.469165459364526e-07, + "loss": 0.2416, + "step": 1870 + }, + { + "epoch": 1.0274574409665018, + "grad_norm": 0.4389130842819843, + "learning_rate": 7.466639515871183e-07, + "loss": 0.2407, + "step": 1871 + }, + { + "epoch": 1.028006589785832, + "grad_norm": 0.46093514222600845, + "learning_rate": 7.464112740146612e-07, + "loss": 0.2681, + "step": 1872 + }, + { + "epoch": 1.028555738605162, + "grad_norm": 0.47502751939237914, + "learning_rate": 7.4615851330435e-07, + "loss": 0.2754, + "step": 1873 + }, + { + "epoch": 1.0291048874244921, + "grad_norm": 0.6312240664022183, + "learning_rate": 7.459056695414827e-07, + "loss": 0.3064, + "step": 1874 + }, + { + "epoch": 1.029654036243822, + "grad_norm": 0.3965834518947881, + "learning_rate": 7.456527428113845e-07, + "loss": 0.2849, + "step": 1875 + }, + { + "epoch": 1.0302031850631521, + "grad_norm": 0.4507770290194585, + "learning_rate": 7.45399733199409e-07, + "loss": 0.2682, + "step": 1876 + }, + { + "epoch": 1.0307523338824822, + "grad_norm": 0.49520375633819635, + "learning_rate": 7.451466407909374e-07, + "loss": 0.2795, + "step": 1877 + }, + { + "epoch": 1.031301482701812, + "grad_norm": 0.41089911654635053, + "learning_rate": 7.448934656713792e-07, + "loss": 0.2687, + "step": 1878 + }, + { + "epoch": 1.0318506315211422, + "grad_norm": 0.5360605994884401, + "learning_rate": 7.446402079261718e-07, + "loss": 0.2961, + "step": 1879 + }, + { + "epoch": 1.0323997803404723, + "grad_norm": 0.48125365131606435, + "learning_rate": 7.443868676407801e-07, + "loss": 0.288, + "step": 1880 + }, + { + "epoch": 1.0329489291598024, + "grad_norm": 0.3705486460941909, + "learning_rate": 7.441334449006974e-07, + "loss": 0.2579, + "step": 1881 + }, + { + "epoch": 1.0334980779791323, + "grad_norm": 0.41412298669102643, + "learning_rate": 7.438799397914442e-07, + "loss": 0.2556, + "step": 1882 + }, + { + "epoch": 1.0340472267984624, + "grad_norm": 0.4340679652922135, + "learning_rate": 7.436263523985695e-07, + "loss": 0.2577, + "step": 1883 + }, + { + "epoch": 1.0345963756177925, + "grad_norm": 0.5985237083166384, + "learning_rate": 7.433726828076496e-07, + "loss": 0.2409, + "step": 1884 + }, + { + "epoch": 1.0351455244371224, + "grad_norm": 0.4307423706499519, + "learning_rate": 7.431189311042883e-07, + "loss": 0.2515, + "step": 1885 + }, + { + "epoch": 1.0356946732564525, + "grad_norm": 0.613209077905052, + "learning_rate": 7.428650973741179e-07, + "loss": 0.2938, + "step": 1886 + }, + { + "epoch": 1.0362438220757826, + "grad_norm": 0.4686732770862708, + "learning_rate": 7.426111817027976e-07, + "loss": 0.2466, + "step": 1887 + }, + { + "epoch": 1.0367929708951125, + "grad_norm": 0.48015529289259584, + "learning_rate": 7.423571841760149e-07, + "loss": 0.2461, + "step": 1888 + }, + { + "epoch": 1.0373421197144426, + "grad_norm": 0.46120839265425806, + "learning_rate": 7.421031048794843e-07, + "loss": 0.273, + "step": 1889 + }, + { + "epoch": 1.0378912685337727, + "grad_norm": 0.3666383063710738, + "learning_rate": 7.418489438989485e-07, + "loss": 0.2929, + "step": 1890 + }, + { + "epoch": 1.0384404173531028, + "grad_norm": 0.5855318134964513, + "learning_rate": 7.415947013201773e-07, + "loss": 0.2643, + "step": 1891 + }, + { + "epoch": 1.0389895661724327, + "grad_norm": 0.5622034772547164, + "learning_rate": 7.413403772289678e-07, + "loss": 0.2965, + "step": 1892 + }, + { + "epoch": 1.0395387149917628, + "grad_norm": 0.4851139456124104, + "learning_rate": 7.410859717111459e-07, + "loss": 0.2934, + "step": 1893 + }, + { + "epoch": 1.0400878638110929, + "grad_norm": 0.4526450023997006, + "learning_rate": 7.408314848525634e-07, + "loss": 0.2472, + "step": 1894 + }, + { + "epoch": 1.0406370126304227, + "grad_norm": 0.510980318120092, + "learning_rate": 7.405769167391005e-07, + "loss": 0.2737, + "step": 1895 + }, + { + "epoch": 1.0411861614497528, + "grad_norm": 0.5989982281270091, + "learning_rate": 7.403222674566647e-07, + "loss": 0.3381, + "step": 1896 + }, + { + "epoch": 1.041735310269083, + "grad_norm": 0.5201095290469655, + "learning_rate": 7.400675370911903e-07, + "loss": 0.2846, + "step": 1897 + }, + { + "epoch": 1.042284459088413, + "grad_norm": 0.5081609618306947, + "learning_rate": 7.398127257286399e-07, + "loss": 0.2573, + "step": 1898 + }, + { + "epoch": 1.042833607907743, + "grad_norm": 1.328065481774891, + "learning_rate": 7.395578334550026e-07, + "loss": 0.2806, + "step": 1899 + }, + { + "epoch": 1.043382756727073, + "grad_norm": 0.4588942803499214, + "learning_rate": 7.393028603562952e-07, + "loss": 0.2728, + "step": 1900 + }, + { + "epoch": 1.0439319055464031, + "grad_norm": 0.5519204757660947, + "learning_rate": 7.39047806518562e-07, + "loss": 0.2763, + "step": 1901 + }, + { + "epoch": 1.044481054365733, + "grad_norm": 0.4273072437038306, + "learning_rate": 7.387926720278739e-07, + "loss": 0.2467, + "step": 1902 + }, + { + "epoch": 1.0450302031850631, + "grad_norm": 0.5603419599277042, + "learning_rate": 7.385374569703296e-07, + "loss": 0.3278, + "step": 1903 + }, + { + "epoch": 1.0455793520043932, + "grad_norm": 0.46282749221860814, + "learning_rate": 7.38282161432055e-07, + "loss": 0.2517, + "step": 1904 + }, + { + "epoch": 1.0461285008237233, + "grad_norm": 0.5234512677935415, + "learning_rate": 7.380267854992024e-07, + "loss": 0.2579, + "step": 1905 + }, + { + "epoch": 1.0466776496430532, + "grad_norm": 0.4939694171578199, + "learning_rate": 7.37771329257952e-07, + "loss": 0.2263, + "step": 1906 + }, + { + "epoch": 1.0472267984623833, + "grad_norm": 0.4612044728123528, + "learning_rate": 7.375157927945111e-07, + "loss": 0.3081, + "step": 1907 + }, + { + "epoch": 1.0477759472817134, + "grad_norm": 0.42088853837759654, + "learning_rate": 7.372601761951137e-07, + "loss": 0.2223, + "step": 1908 + }, + { + "epoch": 1.0483250961010433, + "grad_norm": 0.5144396275656794, + "learning_rate": 7.37004479546021e-07, + "loss": 0.2526, + "step": 1909 + }, + { + "epoch": 1.0488742449203734, + "grad_norm": 0.5106578937429289, + "learning_rate": 7.36748702933521e-07, + "loss": 0.2532, + "step": 1910 + }, + { + "epoch": 1.0494233937397035, + "grad_norm": 0.4233076360404985, + "learning_rate": 7.36492846443929e-07, + "loss": 0.2318, + "step": 1911 + }, + { + "epoch": 1.0499725425590336, + "grad_norm": 0.5059375016312809, + "learning_rate": 7.362369101635874e-07, + "loss": 0.2462, + "step": 1912 + }, + { + "epoch": 1.0505216913783635, + "grad_norm": 0.4903167549395599, + "learning_rate": 7.359808941788647e-07, + "loss": 0.265, + "step": 1913 + }, + { + "epoch": 1.0510708401976936, + "grad_norm": 0.5830956306818907, + "learning_rate": 7.357247985761574e-07, + "loss": 0.3169, + "step": 1914 + }, + { + "epoch": 1.0516199890170237, + "grad_norm": 0.4202106558059594, + "learning_rate": 7.354686234418883e-07, + "loss": 0.2556, + "step": 1915 + }, + { + "epoch": 1.0521691378363536, + "grad_norm": 0.4937607561452683, + "learning_rate": 7.352123688625066e-07, + "loss": 0.2307, + "step": 1916 + }, + { + "epoch": 1.0527182866556837, + "grad_norm": 0.5376362404685917, + "learning_rate": 7.349560349244894e-07, + "loss": 0.2798, + "step": 1917 + }, + { + "epoch": 1.0532674354750138, + "grad_norm": 0.6365819727585691, + "learning_rate": 7.346996217143394e-07, + "loss": 0.2861, + "step": 1918 + }, + { + "epoch": 1.0538165842943437, + "grad_norm": 0.5826156240105614, + "learning_rate": 7.34443129318587e-07, + "loss": 0.2863, + "step": 1919 + }, + { + "epoch": 1.0543657331136738, + "grad_norm": 0.4598109134569838, + "learning_rate": 7.341865578237888e-07, + "loss": 0.267, + "step": 1920 + }, + { + "epoch": 1.0549148819330039, + "grad_norm": 0.5167649148140777, + "learning_rate": 7.33929907316528e-07, + "loss": 0.2504, + "step": 1921 + }, + { + "epoch": 1.055464030752334, + "grad_norm": 0.5735152148626464, + "learning_rate": 7.336731778834151e-07, + "loss": 0.2801, + "step": 1922 + }, + { + "epoch": 1.0560131795716639, + "grad_norm": 0.6863260320534952, + "learning_rate": 7.334163696110866e-07, + "loss": 0.2954, + "step": 1923 + }, + { + "epoch": 1.056562328390994, + "grad_norm": 0.5188716198417976, + "learning_rate": 7.331594825862059e-07, + "loss": 0.3047, + "step": 1924 + }, + { + "epoch": 1.057111477210324, + "grad_norm": 0.524826132487868, + "learning_rate": 7.329025168954629e-07, + "loss": 0.2566, + "step": 1925 + }, + { + "epoch": 1.057660626029654, + "grad_norm": 0.6298929407110873, + "learning_rate": 7.326454726255738e-07, + "loss": 0.2811, + "step": 1926 + }, + { + "epoch": 1.058209774848984, + "grad_norm": 0.40592329840524316, + "learning_rate": 7.323883498632821e-07, + "loss": 0.2698, + "step": 1927 + }, + { + "epoch": 1.0587589236683141, + "grad_norm": 0.5492557069011063, + "learning_rate": 7.321311486953567e-07, + "loss": 0.2753, + "step": 1928 + }, + { + "epoch": 1.0593080724876442, + "grad_norm": 0.4372373730770258, + "learning_rate": 7.318738692085939e-07, + "loss": 0.2531, + "step": 1929 + }, + { + "epoch": 1.0598572213069741, + "grad_norm": 0.4890468250542675, + "learning_rate": 7.31616511489816e-07, + "loss": 0.2619, + "step": 1930 + }, + { + "epoch": 1.0604063701263042, + "grad_norm": 0.48686492305275036, + "learning_rate": 7.313590756258717e-07, + "loss": 0.2671, + "step": 1931 + }, + { + "epoch": 1.0609555189456343, + "grad_norm": 0.5904849452947342, + "learning_rate": 7.311015617036359e-07, + "loss": 0.3122, + "step": 1932 + }, + { + "epoch": 1.0615046677649642, + "grad_norm": 0.5514617975174491, + "learning_rate": 7.308439698100103e-07, + "loss": 0.2769, + "step": 1933 + }, + { + "epoch": 1.0620538165842943, + "grad_norm": 0.4893794046153872, + "learning_rate": 7.305863000319228e-07, + "loss": 0.2741, + "step": 1934 + }, + { + "epoch": 1.0626029654036244, + "grad_norm": 0.49855065944451776, + "learning_rate": 7.303285524563271e-07, + "loss": 0.2617, + "step": 1935 + }, + { + "epoch": 1.0631521142229543, + "grad_norm": 0.5080905203056124, + "learning_rate": 7.300707271702038e-07, + "loss": 0.2751, + "step": 1936 + }, + { + "epoch": 1.0637012630422844, + "grad_norm": 0.509009548533992, + "learning_rate": 7.298128242605592e-07, + "loss": 0.3173, + "step": 1937 + }, + { + "epoch": 1.0642504118616145, + "grad_norm": 0.4902855440171685, + "learning_rate": 7.295548438144264e-07, + "loss": 0.2549, + "step": 1938 + }, + { + "epoch": 1.0647995606809446, + "grad_norm": 0.48230088700943624, + "learning_rate": 7.292967859188638e-07, + "loss": 0.2957, + "step": 1939 + }, + { + "epoch": 1.0653487095002745, + "grad_norm": 0.5915589072443312, + "learning_rate": 7.290386506609567e-07, + "loss": 0.2393, + "step": 1940 + }, + { + "epoch": 1.0658978583196046, + "grad_norm": 0.5812434986966898, + "learning_rate": 7.287804381278164e-07, + "loss": 0.3074, + "step": 1941 + }, + { + "epoch": 1.0664470071389347, + "grad_norm": 0.5742467416143907, + "learning_rate": 7.2852214840658e-07, + "loss": 0.259, + "step": 1942 + }, + { + "epoch": 1.0669961559582646, + "grad_norm": 0.44473181705933523, + "learning_rate": 7.282637815844105e-07, + "loss": 0.2624, + "step": 1943 + }, + { + "epoch": 1.0675453047775947, + "grad_norm": 0.43849484593757254, + "learning_rate": 7.280053377484974e-07, + "loss": 0.2433, + "step": 1944 + }, + { + "epoch": 1.0680944535969248, + "grad_norm": 0.44154688572759393, + "learning_rate": 7.277468169860562e-07, + "loss": 0.266, + "step": 1945 + }, + { + "epoch": 1.068643602416255, + "grad_norm": 0.47092833108215504, + "learning_rate": 7.274882193843278e-07, + "loss": 0.2513, + "step": 1946 + }, + { + "epoch": 1.0691927512355848, + "grad_norm": 0.3867134794448388, + "learning_rate": 7.272295450305793e-07, + "loss": 0.288, + "step": 1947 + }, + { + "epoch": 1.0697419000549149, + "grad_norm": 0.466449706152349, + "learning_rate": 7.269707940121041e-07, + "loss": 0.2798, + "step": 1948 + }, + { + "epoch": 1.070291048874245, + "grad_norm": 0.5372199937100748, + "learning_rate": 7.26711966416221e-07, + "loss": 0.2769, + "step": 1949 + }, + { + "epoch": 1.0708401976935749, + "grad_norm": 0.5021622348268454, + "learning_rate": 7.264530623302746e-07, + "loss": 0.2481, + "step": 1950 + }, + { + "epoch": 1.071389346512905, + "grad_norm": 0.44674989073601945, + "learning_rate": 7.261940818416358e-07, + "loss": 0.2557, + "step": 1951 + }, + { + "epoch": 1.071938495332235, + "grad_norm": 0.46583884140733256, + "learning_rate": 7.259350250377007e-07, + "loss": 0.278, + "step": 1952 + }, + { + "epoch": 1.0724876441515652, + "grad_norm": 0.4669746396459837, + "learning_rate": 7.256758920058916e-07, + "loss": 0.2328, + "step": 1953 + }, + { + "epoch": 1.073036792970895, + "grad_norm": 0.4707492711977006, + "learning_rate": 7.254166828336562e-07, + "loss": 0.2644, + "step": 1954 + }, + { + "epoch": 1.0735859417902252, + "grad_norm": 0.3834561654166051, + "learning_rate": 7.251573976084681e-07, + "loss": 0.261, + "step": 1955 + }, + { + "epoch": 1.0741350906095553, + "grad_norm": 0.4828302677996881, + "learning_rate": 7.248980364178269e-07, + "loss": 0.2441, + "step": 1956 + }, + { + "epoch": 1.0746842394288851, + "grad_norm": 0.5274084959211662, + "learning_rate": 7.246385993492566e-07, + "loss": 0.239, + "step": 1957 + }, + { + "epoch": 1.0752333882482152, + "grad_norm": 0.5159718381356101, + "learning_rate": 7.243790864903085e-07, + "loss": 0.2636, + "step": 1958 + }, + { + "epoch": 1.0757825370675453, + "grad_norm": 0.5249058530492751, + "learning_rate": 7.24119497928558e-07, + "loss": 0.2444, + "step": 1959 + }, + { + "epoch": 1.0763316858868754, + "grad_norm": 0.5123617875634998, + "learning_rate": 7.238598337516072e-07, + "loss": 0.2291, + "step": 1960 + }, + { + "epoch": 1.0768808347062053, + "grad_norm": 0.4976553827927659, + "learning_rate": 7.236000940470829e-07, + "loss": 0.2631, + "step": 1961 + }, + { + "epoch": 1.0774299835255354, + "grad_norm": 0.5989997251261129, + "learning_rate": 7.233402789026376e-07, + "loss": 0.2583, + "step": 1962 + }, + { + "epoch": 1.0779791323448655, + "grad_norm": 0.475111591657095, + "learning_rate": 7.230803884059497e-07, + "loss": 0.2492, + "step": 1963 + }, + { + "epoch": 1.0785282811641954, + "grad_norm": 0.4123066196218989, + "learning_rate": 7.22820422644722e-07, + "loss": 0.2771, + "step": 1964 + }, + { + "epoch": 1.0790774299835255, + "grad_norm": 0.47652151034073204, + "learning_rate": 7.225603817066842e-07, + "loss": 0.2666, + "step": 1965 + }, + { + "epoch": 1.0796265788028556, + "grad_norm": 0.46457087787614904, + "learning_rate": 7.223002656795901e-07, + "loss": 0.2641, + "step": 1966 + }, + { + "epoch": 1.0801757276221857, + "grad_norm": 0.47053450897897553, + "learning_rate": 7.220400746512191e-07, + "loss": 0.2472, + "step": 1967 + }, + { + "epoch": 1.0807248764415156, + "grad_norm": 0.5973322780378363, + "learning_rate": 7.217798087093765e-07, + "loss": 0.2961, + "step": 1968 + }, + { + "epoch": 1.0812740252608457, + "grad_norm": 0.4725500720302298, + "learning_rate": 7.21519467941892e-07, + "loss": 0.2802, + "step": 1969 + }, + { + "epoch": 1.0818231740801758, + "grad_norm": 0.4801608388435618, + "learning_rate": 7.212590524366217e-07, + "loss": 0.2658, + "step": 1970 + }, + { + "epoch": 1.0823723228995057, + "grad_norm": 0.483371735435856, + "learning_rate": 7.209985622814456e-07, + "loss": 0.24, + "step": 1971 + }, + { + "epoch": 1.0829214717188358, + "grad_norm": 0.5346929497518315, + "learning_rate": 7.207379975642695e-07, + "loss": 0.2363, + "step": 1972 + }, + { + "epoch": 1.083470620538166, + "grad_norm": 0.48291327494312974, + "learning_rate": 7.20477358373025e-07, + "loss": 0.2688, + "step": 1973 + }, + { + "epoch": 1.084019769357496, + "grad_norm": 0.5059346187117079, + "learning_rate": 7.202166447956677e-07, + "loss": 0.2747, + "step": 1974 + }, + { + "epoch": 1.0845689181768259, + "grad_norm": 0.809059863872899, + "learning_rate": 7.199558569201793e-07, + "loss": 0.4501, + "step": 1975 + }, + { + "epoch": 1.085118066996156, + "grad_norm": 0.5839461010903824, + "learning_rate": 7.196949948345653e-07, + "loss": 0.2674, + "step": 1976 + }, + { + "epoch": 1.085667215815486, + "grad_norm": 0.7100556007052125, + "learning_rate": 7.194340586268578e-07, + "loss": 0.2816, + "step": 1977 + }, + { + "epoch": 1.086216364634816, + "grad_norm": 0.4760876293933374, + "learning_rate": 7.191730483851129e-07, + "loss": 0.2714, + "step": 1978 + }, + { + "epoch": 1.086765513454146, + "grad_norm": 0.5396483034010165, + "learning_rate": 7.189119641974118e-07, + "loss": 0.259, + "step": 1979 + }, + { + "epoch": 1.0873146622734762, + "grad_norm": 0.5500828313729427, + "learning_rate": 7.186508061518612e-07, + "loss": 0.3548, + "step": 1980 + }, + { + "epoch": 1.087863811092806, + "grad_norm": 0.49716425854017776, + "learning_rate": 7.183895743365919e-07, + "loss": 0.2554, + "step": 1981 + }, + { + "epoch": 1.0884129599121362, + "grad_norm": 0.4757884185013021, + "learning_rate": 7.181282688397602e-07, + "loss": 0.2379, + "step": 1982 + }, + { + "epoch": 1.0889621087314663, + "grad_norm": 0.4916505537302754, + "learning_rate": 7.178668897495469e-07, + "loss": 0.27, + "step": 1983 + }, + { + "epoch": 1.0895112575507964, + "grad_norm": 0.4118877508007891, + "learning_rate": 7.176054371541582e-07, + "loss": 0.275, + "step": 1984 + }, + { + "epoch": 1.0900604063701262, + "grad_norm": 0.5379753195409632, + "learning_rate": 7.173439111418243e-07, + "loss": 0.285, + "step": 1985 + }, + { + "epoch": 1.0906095551894563, + "grad_norm": 0.48881307212765185, + "learning_rate": 7.170823118008009e-07, + "loss": 0.2928, + "step": 1986 + }, + { + "epoch": 1.0911587040087865, + "grad_norm": 0.4795189129936405, + "learning_rate": 7.168206392193678e-07, + "loss": 0.245, + "step": 1987 + }, + { + "epoch": 1.0917078528281163, + "grad_norm": 0.44271120457183727, + "learning_rate": 7.165588934858303e-07, + "loss": 0.2397, + "step": 1988 + }, + { + "epoch": 1.0922570016474464, + "grad_norm": 0.47926309831555297, + "learning_rate": 7.162970746885176e-07, + "loss": 0.2293, + "step": 1989 + }, + { + "epoch": 1.0928061504667765, + "grad_norm": 0.44594922292612355, + "learning_rate": 7.16035182915784e-07, + "loss": 0.2368, + "step": 1990 + }, + { + "epoch": 1.0933552992861064, + "grad_norm": 0.6782100494947264, + "learning_rate": 7.157732182560082e-07, + "loss": 0.2882, + "step": 1991 + }, + { + "epoch": 1.0939044481054365, + "grad_norm": 0.6672096767124965, + "learning_rate": 7.15511180797594e-07, + "loss": 0.2713, + "step": 1992 + }, + { + "epoch": 1.0944535969247666, + "grad_norm": 0.431389267378025, + "learning_rate": 7.15249070628969e-07, + "loss": 0.3044, + "step": 1993 + }, + { + "epoch": 1.0950027457440967, + "grad_norm": 0.4249831014754951, + "learning_rate": 7.149868878385859e-07, + "loss": 0.2422, + "step": 1994 + }, + { + "epoch": 1.0955518945634266, + "grad_norm": 0.4857078877857519, + "learning_rate": 7.147246325149218e-07, + "loss": 0.2446, + "step": 1995 + }, + { + "epoch": 1.0961010433827567, + "grad_norm": 0.617692271368372, + "learning_rate": 7.144623047464779e-07, + "loss": 0.2952, + "step": 1996 + }, + { + "epoch": 1.0966501922020868, + "grad_norm": 0.5568761916886986, + "learning_rate": 7.141999046217806e-07, + "loss": 0.2864, + "step": 1997 + }, + { + "epoch": 1.0971993410214167, + "grad_norm": 0.5293561416445784, + "learning_rate": 7.1393743222938e-07, + "loss": 0.2595, + "step": 1998 + }, + { + "epoch": 1.0977484898407468, + "grad_norm": 0.5180200287324501, + "learning_rate": 7.136748876578508e-07, + "loss": 0.2537, + "step": 1999 + }, + { + "epoch": 1.098297638660077, + "grad_norm": 0.3936420295809807, + "learning_rate": 7.134122709957921e-07, + "loss": 0.246, + "step": 2000 + }, + { + "epoch": 1.098297638660077, + "eval_loss": 0.34057578444480896, + "eval_runtime": 18.722, + "eval_samples_per_second": 23.662, + "eval_steps_per_second": 1.015, + "step": 2000 + }, + { + "epoch": 1.098846787479407, + "grad_norm": 0.4214351543698219, + "learning_rate": 7.131495823318278e-07, + "loss": 0.2396, + "step": 2001 + }, + { + "epoch": 1.0993959362987369, + "grad_norm": 0.44422758067504065, + "learning_rate": 7.128868217546051e-07, + "loss": 0.2881, + "step": 2002 + }, + { + "epoch": 1.099945085118067, + "grad_norm": 0.5013125845106579, + "learning_rate": 7.126239893527964e-07, + "loss": 0.2792, + "step": 2003 + }, + { + "epoch": 1.100494233937397, + "grad_norm": 0.533982878969831, + "learning_rate": 7.123610852150975e-07, + "loss": 0.274, + "step": 2004 + }, + { + "epoch": 1.101043382756727, + "grad_norm": 0.4459148061825294, + "learning_rate": 7.120981094302293e-07, + "loss": 0.275, + "step": 2005 + }, + { + "epoch": 1.101592531576057, + "grad_norm": 0.5100661828730729, + "learning_rate": 7.118350620869363e-07, + "loss": 0.2677, + "step": 2006 + }, + { + "epoch": 1.1021416803953872, + "grad_norm": 0.48410171155944226, + "learning_rate": 7.115719432739873e-07, + "loss": 0.2836, + "step": 2007 + }, + { + "epoch": 1.1026908292147173, + "grad_norm": 0.5380579317818759, + "learning_rate": 7.11308753080175e-07, + "loss": 0.283, + "step": 2008 + }, + { + "epoch": 1.1032399780340472, + "grad_norm": 0.46313197561519037, + "learning_rate": 7.11045491594317e-07, + "loss": 0.2858, + "step": 2009 + }, + { + "epoch": 1.1037891268533773, + "grad_norm": 0.4891923392124667, + "learning_rate": 7.107821589052536e-07, + "loss": 0.292, + "step": 2010 + }, + { + "epoch": 1.1043382756727074, + "grad_norm": 0.46855250123693937, + "learning_rate": 7.105187551018502e-07, + "loss": 0.2559, + "step": 2011 + }, + { + "epoch": 1.1048874244920373, + "grad_norm": 0.5241046486898976, + "learning_rate": 7.102552802729958e-07, + "loss": 0.2906, + "step": 2012 + }, + { + "epoch": 1.1054365733113674, + "grad_norm": 0.5115287830834213, + "learning_rate": 7.099917345076037e-07, + "loss": 0.2681, + "step": 2013 + }, + { + "epoch": 1.1059857221306975, + "grad_norm": 0.4389809327921077, + "learning_rate": 7.097281178946109e-07, + "loss": 0.2449, + "step": 2014 + }, + { + "epoch": 1.1065348709500276, + "grad_norm": 0.4197311446414432, + "learning_rate": 7.094644305229778e-07, + "loss": 0.2562, + "step": 2015 + }, + { + "epoch": 1.1070840197693574, + "grad_norm": 0.6275586745185258, + "learning_rate": 7.092006724816896e-07, + "loss": 0.2374, + "step": 2016 + }, + { + "epoch": 1.1076331685886875, + "grad_norm": 0.5791198925530107, + "learning_rate": 7.089368438597548e-07, + "loss": 0.2661, + "step": 2017 + }, + { + "epoch": 1.1081823174080176, + "grad_norm": 0.5080422059573922, + "learning_rate": 7.086729447462057e-07, + "loss": 0.2433, + "step": 2018 + }, + { + "epoch": 1.1087314662273475, + "grad_norm": 0.538816618379428, + "learning_rate": 7.08408975230099e-07, + "loss": 0.2457, + "step": 2019 + }, + { + "epoch": 1.1092806150466776, + "grad_norm": 0.5395042572172819, + "learning_rate": 7.081449354005142e-07, + "loss": 0.2553, + "step": 2020 + }, + { + "epoch": 1.1098297638660077, + "grad_norm": 0.45801502622178425, + "learning_rate": 7.078808253465551e-07, + "loss": 0.2574, + "step": 2021 + }, + { + "epoch": 1.1103789126853378, + "grad_norm": 0.5294022144766461, + "learning_rate": 7.076166451573494e-07, + "loss": 0.261, + "step": 2022 + }, + { + "epoch": 1.1109280615046677, + "grad_norm": 0.4751407036783218, + "learning_rate": 7.073523949220478e-07, + "loss": 0.2569, + "step": 2023 + }, + { + "epoch": 1.1114772103239978, + "grad_norm": 0.40491857274890025, + "learning_rate": 7.070880747298252e-07, + "loss": 0.2538, + "step": 2024 + }, + { + "epoch": 1.112026359143328, + "grad_norm": 0.5685897807916839, + "learning_rate": 7.0682368466988e-07, + "loss": 0.3149, + "step": 2025 + }, + { + "epoch": 1.1125755079626578, + "grad_norm": 0.551199924674557, + "learning_rate": 7.06559224831434e-07, + "loss": 0.2611, + "step": 2026 + }, + { + "epoch": 1.113124656781988, + "grad_norm": 0.5561341930314058, + "learning_rate": 7.062946953037327e-07, + "loss": 0.2711, + "step": 2027 + }, + { + "epoch": 1.113673805601318, + "grad_norm": 0.4435343747176299, + "learning_rate": 7.06030096176045e-07, + "loss": 0.2519, + "step": 2028 + }, + { + "epoch": 1.114222954420648, + "grad_norm": 0.4197566778935351, + "learning_rate": 7.057654275376635e-07, + "loss": 0.2416, + "step": 2029 + }, + { + "epoch": 1.114772103239978, + "grad_norm": 0.4148005145919897, + "learning_rate": 7.055006894779038e-07, + "loss": 0.2129, + "step": 2030 + }, + { + "epoch": 1.115321252059308, + "grad_norm": 0.4406072147515118, + "learning_rate": 7.052358820861058e-07, + "loss": 0.2471, + "step": 2031 + }, + { + "epoch": 1.1158704008786382, + "grad_norm": 0.4715665303241404, + "learning_rate": 7.049710054516316e-07, + "loss": 0.2744, + "step": 2032 + }, + { + "epoch": 1.116419549697968, + "grad_norm": 0.543210091975553, + "learning_rate": 7.047060596638679e-07, + "loss": 0.3029, + "step": 2033 + }, + { + "epoch": 1.1169686985172982, + "grad_norm": 0.44540658979555325, + "learning_rate": 7.044410448122236e-07, + "loss": 0.2495, + "step": 2034 + }, + { + "epoch": 1.1175178473366283, + "grad_norm": 0.4778455656092044, + "learning_rate": 7.041759609861316e-07, + "loss": 0.2196, + "step": 2035 + }, + { + "epoch": 1.1180669961559582, + "grad_norm": 0.4108049274170457, + "learning_rate": 7.039108082750484e-07, + "loss": 0.2855, + "step": 2036 + }, + { + "epoch": 1.1186161449752883, + "grad_norm": 0.6344012446162273, + "learning_rate": 7.036455867684525e-07, + "loss": 0.3361, + "step": 2037 + }, + { + "epoch": 1.1191652937946184, + "grad_norm": 0.47896641824630637, + "learning_rate": 7.033802965558471e-07, + "loss": 0.2488, + "step": 2038 + }, + { + "epoch": 1.1197144426139485, + "grad_norm": 0.4859682210256004, + "learning_rate": 7.031149377267574e-07, + "loss": 0.2696, + "step": 2039 + }, + { + "epoch": 1.1202635914332784, + "grad_norm": 0.6128079625103663, + "learning_rate": 7.028495103707324e-07, + "loss": 0.2487, + "step": 2040 + }, + { + "epoch": 1.1208127402526085, + "grad_norm": 0.5375253310545763, + "learning_rate": 7.025840145773441e-07, + "loss": 0.2953, + "step": 2041 + }, + { + "epoch": 1.1213618890719386, + "grad_norm": 0.5447963110673394, + "learning_rate": 7.023184504361874e-07, + "loss": 0.2275, + "step": 2042 + }, + { + "epoch": 1.1219110378912684, + "grad_norm": 0.46751918113842783, + "learning_rate": 7.020528180368805e-07, + "loss": 0.2649, + "step": 2043 + }, + { + "epoch": 1.1224601867105986, + "grad_norm": 0.5164872267535395, + "learning_rate": 7.017871174690647e-07, + "loss": 0.2354, + "step": 2044 + }, + { + "epoch": 1.1230093355299287, + "grad_norm": 0.37121250720665727, + "learning_rate": 7.015213488224039e-07, + "loss": 0.2606, + "step": 2045 + }, + { + "epoch": 1.1235584843492585, + "grad_norm": 0.5491751102779595, + "learning_rate": 7.012555121865853e-07, + "loss": 0.2766, + "step": 2046 + }, + { + "epoch": 1.1241076331685886, + "grad_norm": 0.46326418719741624, + "learning_rate": 7.009896076513191e-07, + "loss": 0.2743, + "step": 2047 + }, + { + "epoch": 1.1246567819879187, + "grad_norm": 0.3796077874222295, + "learning_rate": 7.00723635306338e-07, + "loss": 0.2438, + "step": 2048 + }, + { + "epoch": 1.1252059308072488, + "grad_norm": 0.45247177404577776, + "learning_rate": 7.004575952413982e-07, + "loss": 0.2703, + "step": 2049 + }, + { + "epoch": 1.1257550796265787, + "grad_norm": 0.511537686308602, + "learning_rate": 7.001914875462784e-07, + "loss": 0.2713, + "step": 2050 + }, + { + "epoch": 1.1263042284459088, + "grad_norm": 0.40394344228233975, + "learning_rate": 6.999253123107798e-07, + "loss": 0.2645, + "step": 2051 + }, + { + "epoch": 1.126853377265239, + "grad_norm": 0.4322337380009016, + "learning_rate": 6.996590696247268e-07, + "loss": 0.2697, + "step": 2052 + }, + { + "epoch": 1.1274025260845688, + "grad_norm": 0.6304146195653357, + "learning_rate": 6.99392759577967e-07, + "loss": 0.2681, + "step": 2053 + }, + { + "epoch": 1.127951674903899, + "grad_norm": 0.5796610147192044, + "learning_rate": 6.991263822603697e-07, + "loss": 0.28, + "step": 2054 + }, + { + "epoch": 1.128500823723229, + "grad_norm": 0.44163012072057845, + "learning_rate": 6.988599377618277e-07, + "loss": 0.243, + "step": 2055 + }, + { + "epoch": 1.1290499725425591, + "grad_norm": 0.7345339727406786, + "learning_rate": 6.985934261722561e-07, + "loss": 0.2687, + "step": 2056 + }, + { + "epoch": 1.129599121361889, + "grad_norm": 0.5530745437452842, + "learning_rate": 6.983268475815925e-07, + "loss": 0.233, + "step": 2057 + }, + { + "epoch": 1.130148270181219, + "grad_norm": 0.42956914146761865, + "learning_rate": 6.98060202079798e-07, + "loss": 0.2514, + "step": 2058 + }, + { + "epoch": 1.1306974190005492, + "grad_norm": 0.394950067161238, + "learning_rate": 6.977934897568551e-07, + "loss": 0.2743, + "step": 2059 + }, + { + "epoch": 1.131246567819879, + "grad_norm": 0.47320784491587936, + "learning_rate": 6.975267107027694e-07, + "loss": 0.2677, + "step": 2060 + }, + { + "epoch": 1.1317957166392092, + "grad_norm": 0.4566836815984222, + "learning_rate": 6.972598650075693e-07, + "loss": 0.2222, + "step": 2061 + }, + { + "epoch": 1.1323448654585393, + "grad_norm": 0.5279458488988885, + "learning_rate": 6.969929527613051e-07, + "loss": 0.2704, + "step": 2062 + }, + { + "epoch": 1.1328940142778694, + "grad_norm": 0.44655930508101255, + "learning_rate": 6.9672597405405e-07, + "loss": 0.2294, + "step": 2063 + }, + { + "epoch": 1.1334431630971993, + "grad_norm": 0.564628631594905, + "learning_rate": 6.964589289758995e-07, + "loss": 0.2675, + "step": 2064 + }, + { + "epoch": 1.1339923119165294, + "grad_norm": 0.3847657136218291, + "learning_rate": 6.961918176169715e-07, + "loss": 0.2264, + "step": 2065 + }, + { + "epoch": 1.1345414607358595, + "grad_norm": 0.43930699845250576, + "learning_rate": 6.959246400674059e-07, + "loss": 0.2445, + "step": 2066 + }, + { + "epoch": 1.1350906095551894, + "grad_norm": 0.47231699866567417, + "learning_rate": 6.956573964173657e-07, + "loss": 0.2174, + "step": 2067 + }, + { + "epoch": 1.1356397583745195, + "grad_norm": 0.47811092606348193, + "learning_rate": 6.953900867570357e-07, + "loss": 0.2703, + "step": 2068 + }, + { + "epoch": 1.1361889071938496, + "grad_norm": 0.5014165167841236, + "learning_rate": 6.951227111766229e-07, + "loss": 0.2793, + "step": 2069 + }, + { + "epoch": 1.1367380560131797, + "grad_norm": 0.4219943590198461, + "learning_rate": 6.948552697663568e-07, + "loss": 0.3047, + "step": 2070 + }, + { + "epoch": 1.1372872048325096, + "grad_norm": 0.5171397486569508, + "learning_rate": 6.94587762616489e-07, + "loss": 0.2813, + "step": 2071 + }, + { + "epoch": 1.1378363536518397, + "grad_norm": 0.4343622223122472, + "learning_rate": 6.943201898172934e-07, + "loss": 0.241, + "step": 2072 + }, + { + "epoch": 1.1383855024711698, + "grad_norm": 0.42913308473769335, + "learning_rate": 6.940525514590657e-07, + "loss": 0.2534, + "step": 2073 + }, + { + "epoch": 1.1389346512904996, + "grad_norm": 0.5067389680375134, + "learning_rate": 6.937848476321244e-07, + "loss": 0.2715, + "step": 2074 + }, + { + "epoch": 1.1394838001098297, + "grad_norm": 0.44136904618348255, + "learning_rate": 6.935170784268097e-07, + "loss": 0.2364, + "step": 2075 + }, + { + "epoch": 1.1400329489291599, + "grad_norm": 0.5461600370571491, + "learning_rate": 6.93249243933483e-07, + "loss": 0.2759, + "step": 2076 + }, + { + "epoch": 1.14058209774849, + "grad_norm": 0.556403684418123, + "learning_rate": 6.929813442425297e-07, + "loss": 0.3059, + "step": 2077 + }, + { + "epoch": 1.1411312465678198, + "grad_norm": 0.49028773016860017, + "learning_rate": 6.927133794443552e-07, + "loss": 0.2658, + "step": 2078 + }, + { + "epoch": 1.14168039538715, + "grad_norm": 0.6142294814576017, + "learning_rate": 6.924453496293883e-07, + "loss": 0.3269, + "step": 2079 + }, + { + "epoch": 1.14222954420648, + "grad_norm": 0.4334593647887194, + "learning_rate": 6.921772548880789e-07, + "loss": 0.2666, + "step": 2080 + }, + { + "epoch": 1.14277869302581, + "grad_norm": 0.5841749149027798, + "learning_rate": 6.919090953108993e-07, + "loss": 0.3252, + "step": 2081 + }, + { + "epoch": 1.14332784184514, + "grad_norm": 0.5172855884529847, + "learning_rate": 6.916408709883432e-07, + "loss": 0.2627, + "step": 2082 + }, + { + "epoch": 1.1438769906644701, + "grad_norm": 0.5713609460606477, + "learning_rate": 6.913725820109266e-07, + "loss": 0.2772, + "step": 2083 + }, + { + "epoch": 1.1444261394838002, + "grad_norm": 0.5302134133778692, + "learning_rate": 6.911042284691872e-07, + "loss": 0.2656, + "step": 2084 + }, + { + "epoch": 1.1449752883031301, + "grad_norm": 0.4973967230332425, + "learning_rate": 6.908358104536843e-07, + "loss": 0.2605, + "step": 2085 + }, + { + "epoch": 1.1455244371224602, + "grad_norm": 0.4673268626434139, + "learning_rate": 6.905673280549993e-07, + "loss": 0.2444, + "step": 2086 + }, + { + "epoch": 1.1460735859417903, + "grad_norm": 0.4682573088269765, + "learning_rate": 6.90298781363735e-07, + "loss": 0.2248, + "step": 2087 + }, + { + "epoch": 1.1466227347611202, + "grad_norm": 0.43816531275429343, + "learning_rate": 6.900301704705158e-07, + "loss": 0.2446, + "step": 2088 + }, + { + "epoch": 1.1471718835804503, + "grad_norm": 0.41736772392728405, + "learning_rate": 6.897614954659883e-07, + "loss": 0.2468, + "step": 2089 + }, + { + "epoch": 1.1477210323997804, + "grad_norm": 0.5686596989297472, + "learning_rate": 6.894927564408202e-07, + "loss": 0.247, + "step": 2090 + }, + { + "epoch": 1.1482701812191103, + "grad_norm": 0.5074497667087635, + "learning_rate": 6.892239534857013e-07, + "loss": 0.2963, + "step": 2091 + }, + { + "epoch": 1.1488193300384404, + "grad_norm": 0.5802737362297152, + "learning_rate": 6.889550866913423e-07, + "loss": 0.276, + "step": 2092 + }, + { + "epoch": 1.1493684788577705, + "grad_norm": 0.47884515587841325, + "learning_rate": 6.88686156148476e-07, + "loss": 0.2751, + "step": 2093 + }, + { + "epoch": 1.1499176276771004, + "grad_norm": 0.4628259021096923, + "learning_rate": 6.884171619478568e-07, + "loss": 0.2487, + "step": 2094 + }, + { + "epoch": 1.1504667764964305, + "grad_norm": 0.4250249064931077, + "learning_rate": 6.881481041802601e-07, + "loss": 0.2659, + "step": 2095 + }, + { + "epoch": 1.1510159253157606, + "grad_norm": 0.446772783883665, + "learning_rate": 6.878789829364828e-07, + "loss": 0.2645, + "step": 2096 + }, + { + "epoch": 1.1515650741350907, + "grad_norm": 0.4952865235990931, + "learning_rate": 6.876097983073437e-07, + "loss": 0.2596, + "step": 2097 + }, + { + "epoch": 1.1521142229544206, + "grad_norm": 0.5780570036531998, + "learning_rate": 6.873405503836827e-07, + "loss": 0.2373, + "step": 2098 + }, + { + "epoch": 1.1526633717737507, + "grad_norm": 0.5874807740289929, + "learning_rate": 6.870712392563611e-07, + "loss": 0.2528, + "step": 2099 + }, + { + "epoch": 1.1532125205930808, + "grad_norm": 0.5298117374415783, + "learning_rate": 6.868018650162612e-07, + "loss": 0.2903, + "step": 2100 + }, + { + "epoch": 1.1537616694124107, + "grad_norm": 0.40908675157594043, + "learning_rate": 6.865324277542869e-07, + "loss": 0.2787, + "step": 2101 + }, + { + "epoch": 1.1543108182317408, + "grad_norm": 0.4934033397721633, + "learning_rate": 6.862629275613637e-07, + "loss": 0.2856, + "step": 2102 + }, + { + "epoch": 1.1548599670510709, + "grad_norm": 0.5388145674356845, + "learning_rate": 6.859933645284376e-07, + "loss": 0.2621, + "step": 2103 + }, + { + "epoch": 1.155409115870401, + "grad_norm": 0.45639083282557674, + "learning_rate": 6.857237387464765e-07, + "loss": 0.2576, + "step": 2104 + }, + { + "epoch": 1.1559582646897308, + "grad_norm": 0.5851889731414385, + "learning_rate": 6.854540503064688e-07, + "loss": 0.2808, + "step": 2105 + }, + { + "epoch": 1.156507413509061, + "grad_norm": 0.43187611981818386, + "learning_rate": 6.851842992994248e-07, + "loss": 0.2435, + "step": 2106 + }, + { + "epoch": 1.157056562328391, + "grad_norm": 0.52972603231515, + "learning_rate": 6.849144858163754e-07, + "loss": 0.2839, + "step": 2107 + }, + { + "epoch": 1.157605711147721, + "grad_norm": 0.3605870813580465, + "learning_rate": 6.846446099483726e-07, + "loss": 0.2373, + "step": 2108 + }, + { + "epoch": 1.158154859967051, + "grad_norm": 0.436749403383961, + "learning_rate": 6.843746717864898e-07, + "loss": 0.2734, + "step": 2109 + }, + { + "epoch": 1.1587040087863811, + "grad_norm": 0.44144694431352005, + "learning_rate": 6.841046714218209e-07, + "loss": 0.2299, + "step": 2110 + }, + { + "epoch": 1.1592531576057112, + "grad_norm": 0.4984618775558761, + "learning_rate": 6.83834608945481e-07, + "loss": 0.2781, + "step": 2111 + }, + { + "epoch": 1.1598023064250411, + "grad_norm": 0.6043965526426754, + "learning_rate": 6.835644844486067e-07, + "loss": 0.2756, + "step": 2112 + }, + { + "epoch": 1.1603514552443712, + "grad_norm": 0.5394594511776718, + "learning_rate": 6.832942980223547e-07, + "loss": 0.2896, + "step": 2113 + }, + { + "epoch": 1.1609006040637013, + "grad_norm": 0.5769961522334025, + "learning_rate": 6.83024049757903e-07, + "loss": 0.2822, + "step": 2114 + }, + { + "epoch": 1.1614497528830312, + "grad_norm": 0.45650487649307786, + "learning_rate": 6.827537397464507e-07, + "loss": 0.2175, + "step": 2115 + }, + { + "epoch": 1.1619989017023613, + "grad_norm": 0.45461813764392106, + "learning_rate": 6.824833680792172e-07, + "loss": 0.3091, + "step": 2116 + }, + { + "epoch": 1.1625480505216914, + "grad_norm": 0.4373880082156897, + "learning_rate": 6.82212934847443e-07, + "loss": 0.3065, + "step": 2117 + }, + { + "epoch": 1.1630971993410215, + "grad_norm": 0.5695126690017801, + "learning_rate": 6.819424401423894e-07, + "loss": 0.2355, + "step": 2118 + }, + { + "epoch": 1.1636463481603514, + "grad_norm": 0.5803660099912648, + "learning_rate": 6.816718840553384e-07, + "loss": 0.2636, + "step": 2119 + }, + { + "epoch": 1.1641954969796815, + "grad_norm": 0.3907169640570221, + "learning_rate": 6.814012666775928e-07, + "loss": 0.2691, + "step": 2120 + }, + { + "epoch": 1.1647446457990116, + "grad_norm": 0.5566375476157985, + "learning_rate": 6.811305881004758e-07, + "loss": 0.2294, + "step": 2121 + }, + { + "epoch": 1.1652937946183415, + "grad_norm": 0.4369089606070346, + "learning_rate": 6.808598484153315e-07, + "loss": 0.2631, + "step": 2122 + }, + { + "epoch": 1.1658429434376716, + "grad_norm": 0.41317936199448335, + "learning_rate": 6.805890477135247e-07, + "loss": 0.2501, + "step": 2123 + }, + { + "epoch": 1.1663920922570017, + "grad_norm": 0.45122740739912137, + "learning_rate": 6.803181860864406e-07, + "loss": 0.2426, + "step": 2124 + }, + { + "epoch": 1.1669412410763318, + "grad_norm": 0.5024729709068514, + "learning_rate": 6.80047263625485e-07, + "loss": 0.2583, + "step": 2125 + }, + { + "epoch": 1.1674903898956617, + "grad_norm": 0.4717435664928225, + "learning_rate": 6.797762804220843e-07, + "loss": 0.2934, + "step": 2126 + }, + { + "epoch": 1.1680395387149918, + "grad_norm": 0.598639358461304, + "learning_rate": 6.795052365676854e-07, + "loss": 0.2626, + "step": 2127 + }, + { + "epoch": 1.1685886875343219, + "grad_norm": 0.5251482151620749, + "learning_rate": 6.792341321537551e-07, + "loss": 0.2678, + "step": 2128 + }, + { + "epoch": 1.1691378363536518, + "grad_norm": 0.41750186755239965, + "learning_rate": 6.78962967271782e-07, + "loss": 0.2587, + "step": 2129 + }, + { + "epoch": 1.1696869851729819, + "grad_norm": 0.43898996882748204, + "learning_rate": 6.786917420132735e-07, + "loss": 0.2399, + "step": 2130 + }, + { + "epoch": 1.170236133992312, + "grad_norm": 0.8178925836590968, + "learning_rate": 6.784204564697587e-07, + "loss": 0.3316, + "step": 2131 + }, + { + "epoch": 1.170785282811642, + "grad_norm": 0.713210199250082, + "learning_rate": 6.781491107327863e-07, + "loss": 0.2887, + "step": 2132 + }, + { + "epoch": 1.171334431630972, + "grad_norm": 0.5199231649885091, + "learning_rate": 6.77877704893925e-07, + "loss": 0.2555, + "step": 2133 + }, + { + "epoch": 1.171883580450302, + "grad_norm": 0.38952773013299286, + "learning_rate": 6.776062390447649e-07, + "loss": 0.2171, + "step": 2134 + }, + { + "epoch": 1.1724327292696322, + "grad_norm": 0.4734649835268734, + "learning_rate": 6.773347132769157e-07, + "loss": 0.2498, + "step": 2135 + }, + { + "epoch": 1.172981878088962, + "grad_norm": 0.5314347504365042, + "learning_rate": 6.77063127682007e-07, + "loss": 0.2612, + "step": 2136 + }, + { + "epoch": 1.1735310269082921, + "grad_norm": 0.5761442175693938, + "learning_rate": 6.767914823516891e-07, + "loss": 0.2756, + "step": 2137 + }, + { + "epoch": 1.1740801757276222, + "grad_norm": 0.6230565960967867, + "learning_rate": 6.765197773776323e-07, + "loss": 0.2802, + "step": 2138 + }, + { + "epoch": 1.1746293245469523, + "grad_norm": 0.5339967050316438, + "learning_rate": 6.76248012851527e-07, + "loss": 0.2896, + "step": 2139 + }, + { + "epoch": 1.1751784733662822, + "grad_norm": 0.4020019474671675, + "learning_rate": 6.759761888650836e-07, + "loss": 0.2788, + "step": 2140 + }, + { + "epoch": 1.1757276221856123, + "grad_norm": 0.4480326148913986, + "learning_rate": 6.75704305510033e-07, + "loss": 0.2719, + "step": 2141 + }, + { + "epoch": 1.1762767710049424, + "grad_norm": 0.5296871559178602, + "learning_rate": 6.754323628781256e-07, + "loss": 0.2692, + "step": 2142 + }, + { + "epoch": 1.1768259198242723, + "grad_norm": 0.49631193645903, + "learning_rate": 6.751603610611321e-07, + "loss": 0.2572, + "step": 2143 + }, + { + "epoch": 1.1773750686436024, + "grad_norm": 0.48103885412321784, + "learning_rate": 6.748883001508428e-07, + "loss": 0.2874, + "step": 2144 + }, + { + "epoch": 1.1779242174629325, + "grad_norm": 0.3762440711682999, + "learning_rate": 6.746161802390686e-07, + "loss": 0.2643, + "step": 2145 + }, + { + "epoch": 1.1784733662822624, + "grad_norm": 0.5509963932256832, + "learning_rate": 6.743440014176397e-07, + "loss": 0.2231, + "step": 2146 + }, + { + "epoch": 1.1790225151015925, + "grad_norm": 0.4922225222351292, + "learning_rate": 6.740717637784066e-07, + "loss": 0.2823, + "step": 2147 + }, + { + "epoch": 1.1795716639209226, + "grad_norm": 0.5191490587212442, + "learning_rate": 6.737994674132394e-07, + "loss": 0.2255, + "step": 2148 + }, + { + "epoch": 1.1801208127402525, + "grad_norm": 0.430135653084065, + "learning_rate": 6.735271124140283e-07, + "loss": 0.2548, + "step": 2149 + }, + { + "epoch": 1.1806699615595826, + "grad_norm": 0.6041380159321779, + "learning_rate": 6.732546988726826e-07, + "loss": 0.261, + "step": 2150 + }, + { + "epoch": 1.1812191103789127, + "grad_norm": 0.7758435926062973, + "learning_rate": 6.729822268811321e-07, + "loss": 0.2759, + "step": 2151 + }, + { + "epoch": 1.1817682591982428, + "grad_norm": 0.5450707824337424, + "learning_rate": 6.727096965313262e-07, + "loss": 0.2911, + "step": 2152 + }, + { + "epoch": 1.1823174080175727, + "grad_norm": 0.47390364020595727, + "learning_rate": 6.724371079152337e-07, + "loss": 0.2626, + "step": 2153 + }, + { + "epoch": 1.1828665568369028, + "grad_norm": 0.6348318611884215, + "learning_rate": 6.721644611248433e-07, + "loss": 0.2291, + "step": 2154 + }, + { + "epoch": 1.1834157056562329, + "grad_norm": 0.6086643592240616, + "learning_rate": 6.71891756252163e-07, + "loss": 0.2545, + "step": 2155 + }, + { + "epoch": 1.1839648544755628, + "grad_norm": 0.4568836736824567, + "learning_rate": 6.716189933892209e-07, + "loss": 0.2641, + "step": 2156 + }, + { + "epoch": 1.1845140032948929, + "grad_norm": 0.45688844961111974, + "learning_rate": 6.713461726280646e-07, + "loss": 0.2594, + "step": 2157 + }, + { + "epoch": 1.185063152114223, + "grad_norm": 0.4574225410506315, + "learning_rate": 6.710732940607605e-07, + "loss": 0.2614, + "step": 2158 + }, + { + "epoch": 1.185612300933553, + "grad_norm": 0.5777129559039402, + "learning_rate": 6.708003577793954e-07, + "loss": 0.2734, + "step": 2159 + }, + { + "epoch": 1.186161449752883, + "grad_norm": 0.5471811744350423, + "learning_rate": 6.705273638760752e-07, + "loss": 0.2598, + "step": 2160 + }, + { + "epoch": 1.186710598572213, + "grad_norm": 0.48434331771972533, + "learning_rate": 6.702543124429253e-07, + "loss": 0.2532, + "step": 2161 + }, + { + "epoch": 1.1872597473915432, + "grad_norm": 0.5328438659996083, + "learning_rate": 6.699812035720906e-07, + "loss": 0.2466, + "step": 2162 + }, + { + "epoch": 1.187808896210873, + "grad_norm": 0.6049055274335498, + "learning_rate": 6.697080373557352e-07, + "loss": 0.2526, + "step": 2163 + }, + { + "epoch": 1.1883580450302031, + "grad_norm": 0.5519016985830592, + "learning_rate": 6.694348138860425e-07, + "loss": 0.2407, + "step": 2164 + }, + { + "epoch": 1.1889071938495333, + "grad_norm": 0.46036100366904986, + "learning_rate": 6.691615332552154e-07, + "loss": 0.2537, + "step": 2165 + }, + { + "epoch": 1.1894563426688634, + "grad_norm": 0.4785366472196232, + "learning_rate": 6.688881955554764e-07, + "loss": 0.261, + "step": 2166 + }, + { + "epoch": 1.1900054914881932, + "grad_norm": 0.5024024129155988, + "learning_rate": 6.686148008790663e-07, + "loss": 0.2688, + "step": 2167 + }, + { + "epoch": 1.1905546403075233, + "grad_norm": 0.4873968713537025, + "learning_rate": 6.683413493182461e-07, + "loss": 0.263, + "step": 2168 + }, + { + "epoch": 1.1911037891268534, + "grad_norm": 0.7152654341361451, + "learning_rate": 6.680678409652957e-07, + "loss": 0.3223, + "step": 2169 + }, + { + "epoch": 1.1916529379461833, + "grad_norm": 0.4766889123590685, + "learning_rate": 6.677942759125136e-07, + "loss": 0.3251, + "step": 2170 + }, + { + "epoch": 1.1922020867655134, + "grad_norm": 0.5314049439535203, + "learning_rate": 6.675206542522188e-07, + "loss": 0.2347, + "step": 2171 + }, + { + "epoch": 1.1927512355848435, + "grad_norm": 0.47337531507436137, + "learning_rate": 6.672469760767477e-07, + "loss": 0.2853, + "step": 2172 + }, + { + "epoch": 1.1933003844041736, + "grad_norm": 0.5995445290549404, + "learning_rate": 6.66973241478457e-07, + "loss": 0.2598, + "step": 2173 + }, + { + "epoch": 1.1938495332235035, + "grad_norm": 0.4356276959607383, + "learning_rate": 6.666994505497221e-07, + "loss": 0.2306, + "step": 2174 + }, + { + "epoch": 1.1943986820428336, + "grad_norm": 0.5211127414669526, + "learning_rate": 6.664256033829369e-07, + "loss": 0.2946, + "step": 2175 + }, + { + "epoch": 1.1949478308621637, + "grad_norm": 0.405545643753939, + "learning_rate": 6.661517000705154e-07, + "loss": 0.2047, + "step": 2176 + }, + { + "epoch": 1.1954969796814936, + "grad_norm": 0.4082555110920402, + "learning_rate": 6.658777407048894e-07, + "loss": 0.2602, + "step": 2177 + }, + { + "epoch": 1.1960461285008237, + "grad_norm": 0.5039976872573545, + "learning_rate": 6.656037253785101e-07, + "loss": 0.2382, + "step": 2178 + }, + { + "epoch": 1.1965952773201538, + "grad_norm": 0.467384634086635, + "learning_rate": 6.653296541838478e-07, + "loss": 0.2694, + "step": 2179 + }, + { + "epoch": 1.197144426139484, + "grad_norm": 0.4733061718436572, + "learning_rate": 6.650555272133912e-07, + "loss": 0.3107, + "step": 2180 + }, + { + "epoch": 1.1976935749588138, + "grad_norm": 0.43902449684695394, + "learning_rate": 6.647813445596483e-07, + "loss": 0.2776, + "step": 2181 + }, + { + "epoch": 1.198242723778144, + "grad_norm": 0.5524664763531649, + "learning_rate": 6.645071063151454e-07, + "loss": 0.2498, + "step": 2182 + }, + { + "epoch": 1.198791872597474, + "grad_norm": 0.5103790371570295, + "learning_rate": 6.64232812572428e-07, + "loss": 0.2571, + "step": 2183 + }, + { + "epoch": 1.1993410214168039, + "grad_norm": 0.399707347690683, + "learning_rate": 6.639584634240602e-07, + "loss": 0.2989, + "step": 2184 + }, + { + "epoch": 1.199890170236134, + "grad_norm": 0.49242600158546507, + "learning_rate": 6.636840589626243e-07, + "loss": 0.27, + "step": 2185 + }, + { + "epoch": 1.200439319055464, + "grad_norm": 0.49997191726719, + "learning_rate": 6.634095992807221e-07, + "loss": 0.2782, + "step": 2186 + }, + { + "epoch": 1.2009884678747942, + "grad_norm": 0.4598548272948519, + "learning_rate": 6.631350844709737e-07, + "loss": 0.2668, + "step": 2187 + }, + { + "epoch": 1.201537616694124, + "grad_norm": 0.6873319689011373, + "learning_rate": 6.628605146260174e-07, + "loss": 0.2462, + "step": 2188 + }, + { + "epoch": 1.2020867655134542, + "grad_norm": 0.3998755946484521, + "learning_rate": 6.625858898385104e-07, + "loss": 0.2939, + "step": 2189 + }, + { + "epoch": 1.2026359143327843, + "grad_norm": 0.5351111947289814, + "learning_rate": 6.623112102011289e-07, + "loss": 0.258, + "step": 2190 + }, + { + "epoch": 1.2031850631521142, + "grad_norm": 0.508085769994925, + "learning_rate": 6.620364758065667e-07, + "loss": 0.2641, + "step": 2191 + }, + { + "epoch": 1.2037342119714443, + "grad_norm": 0.55888243333363, + "learning_rate": 6.617616867475366e-07, + "loss": 0.2585, + "step": 2192 + }, + { + "epoch": 1.2042833607907744, + "grad_norm": 0.4938157902879376, + "learning_rate": 6.614868431167701e-07, + "loss": 0.2292, + "step": 2193 + }, + { + "epoch": 1.2048325096101045, + "grad_norm": 0.5587859685736685, + "learning_rate": 6.612119450070164e-07, + "loss": 0.2579, + "step": 2194 + }, + { + "epoch": 1.2053816584294343, + "grad_norm": 0.47235668111834467, + "learning_rate": 6.609369925110437e-07, + "loss": 0.2392, + "step": 2195 + }, + { + "epoch": 1.2059308072487644, + "grad_norm": 0.5490419319348965, + "learning_rate": 6.606619857216384e-07, + "loss": 0.2449, + "step": 2196 + }, + { + "epoch": 1.2064799560680946, + "grad_norm": 0.4822197201239783, + "learning_rate": 6.603869247316051e-07, + "loss": 0.2786, + "step": 2197 + }, + { + "epoch": 1.2070291048874244, + "grad_norm": 0.5307496384306721, + "learning_rate": 6.601118096337668e-07, + "loss": 0.2261, + "step": 2198 + }, + { + "epoch": 1.2075782537067545, + "grad_norm": 0.5091163573548654, + "learning_rate": 6.598366405209645e-07, + "loss": 0.2693, + "step": 2199 + }, + { + "epoch": 1.2081274025260846, + "grad_norm": 0.42126907648894085, + "learning_rate": 6.595614174860577e-07, + "loss": 0.2272, + "step": 2200 + }, + { + "epoch": 1.2081274025260846, + "eval_loss": 0.3368612229824066, + "eval_runtime": 18.6625, + "eval_samples_per_second": 23.737, + "eval_steps_per_second": 1.018, + "step": 2200 + }, + { + "epoch": 1.2086765513454145, + "grad_norm": 0.43173874800445644, + "learning_rate": 6.59286140621924e-07, + "loss": 0.2622, + "step": 2201 + }, + { + "epoch": 1.2092257001647446, + "grad_norm": 0.5411498413668017, + "learning_rate": 6.590108100214596e-07, + "loss": 0.303, + "step": 2202 + }, + { + "epoch": 1.2097748489840747, + "grad_norm": 0.4462051774365792, + "learning_rate": 6.58735425777578e-07, + "loss": 0.2574, + "step": 2203 + }, + { + "epoch": 1.2103239978034046, + "grad_norm": 0.5938157804849086, + "learning_rate": 6.584599879832112e-07, + "loss": 0.2491, + "step": 2204 + }, + { + "epoch": 1.2108731466227347, + "grad_norm": 0.44387246494698895, + "learning_rate": 6.581844967313097e-07, + "loss": 0.2294, + "step": 2205 + }, + { + "epoch": 1.2114222954420648, + "grad_norm": 0.7786217814585868, + "learning_rate": 6.579089521148412e-07, + "loss": 0.2165, + "step": 2206 + }, + { + "epoch": 1.211971444261395, + "grad_norm": 0.6231069612650889, + "learning_rate": 6.576333542267925e-07, + "loss": 0.2619, + "step": 2207 + }, + { + "epoch": 1.2125205930807248, + "grad_norm": 0.5102568325667478, + "learning_rate": 6.573577031601669e-07, + "loss": 0.2296, + "step": 2208 + }, + { + "epoch": 1.213069741900055, + "grad_norm": 0.5079946725298959, + "learning_rate": 6.570819990079872e-07, + "loss": 0.261, + "step": 2209 + }, + { + "epoch": 1.213618890719385, + "grad_norm": 0.49278078610283893, + "learning_rate": 6.568062418632928e-07, + "loss": 0.2761, + "step": 2210 + }, + { + "epoch": 1.2141680395387149, + "grad_norm": 0.5001108108788075, + "learning_rate": 6.565304318191419e-07, + "loss": 0.2622, + "step": 2211 + }, + { + "epoch": 1.214717188358045, + "grad_norm": 0.5101032372498414, + "learning_rate": 6.562545689686105e-07, + "loss": 0.214, + "step": 2212 + }, + { + "epoch": 1.215266337177375, + "grad_norm": 0.4635276398319039, + "learning_rate": 6.559786534047916e-07, + "loss": 0.2378, + "step": 2213 + }, + { + "epoch": 1.2158154859967052, + "grad_norm": 0.6141265780112877, + "learning_rate": 6.557026852207966e-07, + "loss": 0.2588, + "step": 2214 + }, + { + "epoch": 1.216364634816035, + "grad_norm": 0.4326858319468491, + "learning_rate": 6.554266645097553e-07, + "loss": 0.2495, + "step": 2215 + }, + { + "epoch": 1.2169137836353652, + "grad_norm": 0.4565182435415461, + "learning_rate": 6.551505913648135e-07, + "loss": 0.2408, + "step": 2216 + }, + { + "epoch": 1.2174629324546953, + "grad_norm": 0.4313385886030161, + "learning_rate": 6.548744658791365e-07, + "loss": 0.2864, + "step": 2217 + }, + { + "epoch": 1.2180120812740252, + "grad_norm": 0.42757787582310125, + "learning_rate": 6.545982881459063e-07, + "loss": 0.2772, + "step": 2218 + }, + { + "epoch": 1.2185612300933553, + "grad_norm": 0.6135586508110499, + "learning_rate": 6.543220582583222e-07, + "loss": 0.2618, + "step": 2219 + }, + { + "epoch": 1.2191103789126854, + "grad_norm": 0.47535001423953216, + "learning_rate": 6.540457763096022e-07, + "loss": 0.2447, + "step": 2220 + }, + { + "epoch": 1.2196595277320155, + "grad_norm": 0.45185378186179237, + "learning_rate": 6.537694423929813e-07, + "loss": 0.2389, + "step": 2221 + }, + { + "epoch": 1.2202086765513454, + "grad_norm": 0.46088105353148384, + "learning_rate": 6.534930566017116e-07, + "loss": 0.2662, + "step": 2222 + }, + { + "epoch": 1.2207578253706755, + "grad_norm": 0.5412847710183424, + "learning_rate": 6.532166190290634e-07, + "loss": 0.2229, + "step": 2223 + }, + { + "epoch": 1.2213069741900056, + "grad_norm": 0.5463083189980393, + "learning_rate": 6.52940129768324e-07, + "loss": 0.2247, + "step": 2224 + }, + { + "epoch": 1.2218561230093354, + "grad_norm": 0.49425146294854966, + "learning_rate": 6.526635889127986e-07, + "loss": 0.2384, + "step": 2225 + }, + { + "epoch": 1.2224052718286655, + "grad_norm": 0.4851479181683103, + "learning_rate": 6.523869965558094e-07, + "loss": 0.2432, + "step": 2226 + }, + { + "epoch": 1.2229544206479956, + "grad_norm": 0.5009516478893922, + "learning_rate": 6.521103527906962e-07, + "loss": 0.2311, + "step": 2227 + }, + { + "epoch": 1.2235035694673257, + "grad_norm": 0.717558020742917, + "learning_rate": 6.518336577108159e-07, + "loss": 0.2592, + "step": 2228 + }, + { + "epoch": 1.2240527182866556, + "grad_norm": 0.5183795004182595, + "learning_rate": 6.515569114095431e-07, + "loss": 0.2783, + "step": 2229 + }, + { + "epoch": 1.2246018671059857, + "grad_norm": 0.3866864111750265, + "learning_rate": 6.512801139802694e-07, + "loss": 0.2163, + "step": 2230 + }, + { + "epoch": 1.2251510159253158, + "grad_norm": 0.46290113518458137, + "learning_rate": 6.510032655164037e-07, + "loss": 0.2693, + "step": 2231 + }, + { + "epoch": 1.2257001647446457, + "grad_norm": 0.4272308297689702, + "learning_rate": 6.507263661113723e-07, + "loss": 0.2587, + "step": 2232 + }, + { + "epoch": 1.2262493135639758, + "grad_norm": 0.45163920893859505, + "learning_rate": 6.504494158586183e-07, + "loss": 0.2468, + "step": 2233 + }, + { + "epoch": 1.226798462383306, + "grad_norm": 0.5738383287617221, + "learning_rate": 6.501724148516026e-07, + "loss": 0.2574, + "step": 2234 + }, + { + "epoch": 1.227347611202636, + "grad_norm": 0.45125445739106435, + "learning_rate": 6.498953631838022e-07, + "loss": 0.234, + "step": 2235 + }, + { + "epoch": 1.227896760021966, + "grad_norm": 0.43967210351881736, + "learning_rate": 6.496182609487124e-07, + "loss": 0.2484, + "step": 2236 + }, + { + "epoch": 1.228445908841296, + "grad_norm": 0.5875977907965385, + "learning_rate": 6.493411082398449e-07, + "loss": 0.2629, + "step": 2237 + }, + { + "epoch": 1.2289950576606261, + "grad_norm": 0.5347991927899313, + "learning_rate": 6.490639051507282e-07, + "loss": 0.2297, + "step": 2238 + }, + { + "epoch": 1.229544206479956, + "grad_norm": 0.4881992460120709, + "learning_rate": 6.487866517749087e-07, + "loss": 0.2559, + "step": 2239 + }, + { + "epoch": 1.230093355299286, + "grad_norm": 0.5231473624481398, + "learning_rate": 6.485093482059487e-07, + "loss": 0.2343, + "step": 2240 + }, + { + "epoch": 1.2306425041186162, + "grad_norm": 0.43789417940924563, + "learning_rate": 6.482319945374281e-07, + "loss": 0.2332, + "step": 2241 + }, + { + "epoch": 1.2311916529379463, + "grad_norm": 0.4563740639069909, + "learning_rate": 6.479545908629436e-07, + "loss": 0.2424, + "step": 2242 + }, + { + "epoch": 1.2317408017572762, + "grad_norm": 0.5219025626713153, + "learning_rate": 6.476771372761086e-07, + "loss": 0.2879, + "step": 2243 + }, + { + "epoch": 1.2322899505766063, + "grad_norm": 0.5540590324315269, + "learning_rate": 6.473996338705538e-07, + "loss": 0.2815, + "step": 2244 + }, + { + "epoch": 1.2328390993959364, + "grad_norm": 0.5358830340500076, + "learning_rate": 6.471220807399258e-07, + "loss": 0.2716, + "step": 2245 + }, + { + "epoch": 1.2333882482152663, + "grad_norm": 0.5241034280704541, + "learning_rate": 6.46844477977889e-07, + "loss": 0.2217, + "step": 2246 + }, + { + "epoch": 1.2339373970345964, + "grad_norm": 0.42898456649126404, + "learning_rate": 6.465668256781239e-07, + "loss": 0.2063, + "step": 2247 + }, + { + "epoch": 1.2344865458539265, + "grad_norm": 0.6288596020822125, + "learning_rate": 6.462891239343282e-07, + "loss": 0.2302, + "step": 2248 + }, + { + "epoch": 1.2350356946732564, + "grad_norm": 0.4849890945681238, + "learning_rate": 6.460113728402157e-07, + "loss": 0.2808, + "step": 2249 + }, + { + "epoch": 1.2355848434925865, + "grad_norm": 0.4743214066810768, + "learning_rate": 6.45733572489517e-07, + "loss": 0.216, + "step": 2250 + }, + { + "epoch": 1.2361339923119166, + "grad_norm": 0.4346520676028857, + "learning_rate": 6.454557229759802e-07, + "loss": 0.2658, + "step": 2251 + }, + { + "epoch": 1.2366831411312464, + "grad_norm": 0.4921299582866448, + "learning_rate": 6.451778243933685e-07, + "loss": 0.2489, + "step": 2252 + }, + { + "epoch": 1.2372322899505765, + "grad_norm": 0.5305772672272118, + "learning_rate": 6.448998768354627e-07, + "loss": 0.2622, + "step": 2253 + }, + { + "epoch": 1.2377814387699067, + "grad_norm": 0.544947416318495, + "learning_rate": 6.446218803960602e-07, + "loss": 0.2409, + "step": 2254 + }, + { + "epoch": 1.2383305875892368, + "grad_norm": 0.5296315820812031, + "learning_rate": 6.443438351689741e-07, + "loss": 0.2688, + "step": 2255 + }, + { + "epoch": 1.2388797364085666, + "grad_norm": 0.4260918953344602, + "learning_rate": 6.440657412480348e-07, + "loss": 0.2781, + "step": 2256 + }, + { + "epoch": 1.2394288852278967, + "grad_norm": 0.43246584287674655, + "learning_rate": 6.437875987270883e-07, + "loss": 0.2483, + "step": 2257 + }, + { + "epoch": 1.2399780340472268, + "grad_norm": 0.46545700892403996, + "learning_rate": 6.435094076999979e-07, + "loss": 0.2559, + "step": 2258 + }, + { + "epoch": 1.2405271828665567, + "grad_norm": 0.4977965704907033, + "learning_rate": 6.432311682606424e-07, + "loss": 0.2914, + "step": 2259 + }, + { + "epoch": 1.2410763316858868, + "grad_norm": 0.41980461778097733, + "learning_rate": 6.429528805029178e-07, + "loss": 0.2415, + "step": 2260 + }, + { + "epoch": 1.241625480505217, + "grad_norm": 0.4945950621335048, + "learning_rate": 6.426745445207356e-07, + "loss": 0.2434, + "step": 2261 + }, + { + "epoch": 1.242174629324547, + "grad_norm": 0.5706188656276096, + "learning_rate": 6.423961604080242e-07, + "loss": 0.2085, + "step": 2262 + }, + { + "epoch": 1.242723778143877, + "grad_norm": 0.696326144316046, + "learning_rate": 6.421177282587278e-07, + "loss": 0.3354, + "step": 2263 + }, + { + "epoch": 1.243272926963207, + "grad_norm": 0.43582593221838645, + "learning_rate": 6.418392481668072e-07, + "loss": 0.2611, + "step": 2264 + }, + { + "epoch": 1.2438220757825371, + "grad_norm": 0.4350407387790691, + "learning_rate": 6.415607202262388e-07, + "loss": 0.2276, + "step": 2265 + }, + { + "epoch": 1.244371224601867, + "grad_norm": 0.4842562141798357, + "learning_rate": 6.41282144531016e-07, + "loss": 0.2288, + "step": 2266 + }, + { + "epoch": 1.244920373421197, + "grad_norm": 0.48553133766101747, + "learning_rate": 6.410035211751474e-07, + "loss": 0.247, + "step": 2267 + }, + { + "epoch": 1.2454695222405272, + "grad_norm": 0.47871644827328985, + "learning_rate": 6.407248502526584e-07, + "loss": 0.2323, + "step": 2268 + }, + { + "epoch": 1.2460186710598573, + "grad_norm": 0.41290906913161507, + "learning_rate": 6.4044613185759e-07, + "loss": 0.2436, + "step": 2269 + }, + { + "epoch": 1.2465678198791872, + "grad_norm": 0.439177301848866, + "learning_rate": 6.401673660839996e-07, + "loss": 0.2239, + "step": 2270 + }, + { + "epoch": 1.2471169686985173, + "grad_norm": 0.4698859836528455, + "learning_rate": 6.398885530259603e-07, + "loss": 0.2219, + "step": 2271 + }, + { + "epoch": 1.2476661175178474, + "grad_norm": 0.42057132323746077, + "learning_rate": 6.396096927775608e-07, + "loss": 0.2156, + "step": 2272 + }, + { + "epoch": 1.2482152663371773, + "grad_norm": 0.4235043642535411, + "learning_rate": 6.393307854329069e-07, + "loss": 0.2327, + "step": 2273 + }, + { + "epoch": 1.2487644151565074, + "grad_norm": 0.4592560435417908, + "learning_rate": 6.39051831086119e-07, + "loss": 0.2404, + "step": 2274 + }, + { + "epoch": 1.2493135639758375, + "grad_norm": 0.45954551262002075, + "learning_rate": 6.387728298313343e-07, + "loss": 0.2631, + "step": 2275 + }, + { + "epoch": 1.2498627127951676, + "grad_norm": 0.4521472741078411, + "learning_rate": 6.384937817627052e-07, + "loss": 0.3033, + "step": 2276 + }, + { + "epoch": 1.2504118616144975, + "grad_norm": 0.5170834467265043, + "learning_rate": 6.382146869744001e-07, + "loss": 0.2383, + "step": 2277 + }, + { + "epoch": 1.2509610104338276, + "grad_norm": 0.4056977366801977, + "learning_rate": 6.379355455606036e-07, + "loss": 0.2901, + "step": 2278 + }, + { + "epoch": 1.2515101592531577, + "grad_norm": 0.5621462505062037, + "learning_rate": 6.376563576155149e-07, + "loss": 0.2483, + "step": 2279 + }, + { + "epoch": 1.2520593080724876, + "grad_norm": 0.5759149122716783, + "learning_rate": 6.373771232333504e-07, + "loss": 0.2368, + "step": 2280 + }, + { + "epoch": 1.2526084568918177, + "grad_norm": 0.5583678983849059, + "learning_rate": 6.370978425083411e-07, + "loss": 0.2348, + "step": 2281 + }, + { + "epoch": 1.2531576057111478, + "grad_norm": 0.5609094898378422, + "learning_rate": 6.368185155347338e-07, + "loss": 0.2264, + "step": 2282 + }, + { + "epoch": 1.2537067545304779, + "grad_norm": 0.5260052374881327, + "learning_rate": 6.365391424067915e-07, + "loss": 0.2712, + "step": 2283 + }, + { + "epoch": 1.2542559033498077, + "grad_norm": 0.5451011571179325, + "learning_rate": 6.362597232187917e-07, + "loss": 0.2381, + "step": 2284 + }, + { + "epoch": 1.2548050521691378, + "grad_norm": 0.5356336125938774, + "learning_rate": 6.359802580650287e-07, + "loss": 0.2634, + "step": 2285 + }, + { + "epoch": 1.255354200988468, + "grad_norm": 0.5011731436450628, + "learning_rate": 6.357007470398114e-07, + "loss": 0.2431, + "step": 2286 + }, + { + "epoch": 1.2559033498077978, + "grad_norm": 0.4015543539043109, + "learning_rate": 6.354211902374645e-07, + "loss": 0.2351, + "step": 2287 + }, + { + "epoch": 1.256452498627128, + "grad_norm": 0.47037413804392814, + "learning_rate": 6.351415877523281e-07, + "loss": 0.2684, + "step": 2288 + }, + { + "epoch": 1.257001647446458, + "grad_norm": 0.664137721681803, + "learning_rate": 6.34861939678758e-07, + "loss": 0.3004, + "step": 2289 + }, + { + "epoch": 1.2575507962657881, + "grad_norm": 0.4530660333600251, + "learning_rate": 6.345822461111248e-07, + "loss": 0.246, + "step": 2290 + }, + { + "epoch": 1.258099945085118, + "grad_norm": 0.4742814296200423, + "learning_rate": 6.343025071438147e-07, + "loss": 0.2397, + "step": 2291 + }, + { + "epoch": 1.2586490939044481, + "grad_norm": 0.47323844601798776, + "learning_rate": 6.340227228712296e-07, + "loss": 0.2552, + "step": 2292 + }, + { + "epoch": 1.259198242723778, + "grad_norm": 0.39972734589093006, + "learning_rate": 6.337428933877861e-07, + "loss": 0.2259, + "step": 2293 + }, + { + "epoch": 1.259747391543108, + "grad_norm": 0.497524206282316, + "learning_rate": 6.334630187879167e-07, + "loss": 0.2647, + "step": 2294 + }, + { + "epoch": 1.2602965403624382, + "grad_norm": 0.5203691818206629, + "learning_rate": 6.331830991660685e-07, + "loss": 0.2977, + "step": 2295 + }, + { + "epoch": 1.2608456891817683, + "grad_norm": 0.5492122693014649, + "learning_rate": 6.329031346167041e-07, + "loss": 0.2456, + "step": 2296 + }, + { + "epoch": 1.2613948380010984, + "grad_norm": 0.5250374161432911, + "learning_rate": 6.326231252343012e-07, + "loss": 0.2185, + "step": 2297 + }, + { + "epoch": 1.2619439868204283, + "grad_norm": 0.4622927218160168, + "learning_rate": 6.323430711133527e-07, + "loss": 0.2664, + "step": 2298 + }, + { + "epoch": 1.2624931356397584, + "grad_norm": 0.6355792112496628, + "learning_rate": 6.320629723483665e-07, + "loss": 0.2761, + "step": 2299 + }, + { + "epoch": 1.2630422844590883, + "grad_norm": 0.38958125945190264, + "learning_rate": 6.317828290338659e-07, + "loss": 0.2437, + "step": 2300 + }, + { + "epoch": 1.2635914332784184, + "grad_norm": 0.485900382530048, + "learning_rate": 6.315026412643886e-07, + "loss": 0.2431, + "step": 2301 + }, + { + "epoch": 1.2641405820977485, + "grad_norm": 0.4710573802841512, + "learning_rate": 6.312224091344876e-07, + "loss": 0.2492, + "step": 2302 + }, + { + "epoch": 1.2646897309170786, + "grad_norm": 0.4378029888574416, + "learning_rate": 6.309421327387312e-07, + "loss": 0.2715, + "step": 2303 + }, + { + "epoch": 1.2652388797364087, + "grad_norm": 0.5038211334071603, + "learning_rate": 6.306618121717022e-07, + "loss": 0.2218, + "step": 2304 + }, + { + "epoch": 1.2657880285557386, + "grad_norm": 0.39342863304494163, + "learning_rate": 6.303814475279985e-07, + "loss": 0.2382, + "step": 2305 + }, + { + "epoch": 1.2663371773750687, + "grad_norm": 0.5472122124005581, + "learning_rate": 6.301010389022329e-07, + "loss": 0.2476, + "step": 2306 + }, + { + "epoch": 1.2668863261943986, + "grad_norm": 0.4204149320928284, + "learning_rate": 6.298205863890329e-07, + "loss": 0.2305, + "step": 2307 + }, + { + "epoch": 1.2674354750137287, + "grad_norm": 0.48839023347011995, + "learning_rate": 6.295400900830407e-07, + "loss": 0.2399, + "step": 2308 + }, + { + "epoch": 1.2679846238330588, + "grad_norm": 0.48381023915923504, + "learning_rate": 6.29259550078914e-07, + "loss": 0.2662, + "step": 2309 + }, + { + "epoch": 1.2685337726523889, + "grad_norm": 0.46112448013887636, + "learning_rate": 6.289789664713239e-07, + "loss": 0.2899, + "step": 2310 + }, + { + "epoch": 1.269082921471719, + "grad_norm": 0.5001991882835727, + "learning_rate": 6.286983393549581e-07, + "loss": 0.2586, + "step": 2311 + }, + { + "epoch": 1.2696320702910489, + "grad_norm": 0.5270895575498046, + "learning_rate": 6.28417668824517e-07, + "loss": 0.2145, + "step": 2312 + }, + { + "epoch": 1.270181219110379, + "grad_norm": 0.5899290181009109, + "learning_rate": 6.28136954974717e-07, + "loss": 0.2667, + "step": 2313 + }, + { + "epoch": 1.2707303679297088, + "grad_norm": 0.5022821468700811, + "learning_rate": 6.278561979002886e-07, + "loss": 0.3089, + "step": 2314 + }, + { + "epoch": 1.271279516749039, + "grad_norm": 0.528402273141273, + "learning_rate": 6.275753976959767e-07, + "loss": 0.2709, + "step": 2315 + }, + { + "epoch": 1.271828665568369, + "grad_norm": 0.4359182504560581, + "learning_rate": 6.272945544565416e-07, + "loss": 0.2355, + "step": 2316 + }, + { + "epoch": 1.2723778143876991, + "grad_norm": 0.39775066322042124, + "learning_rate": 6.270136682767571e-07, + "loss": 0.2522, + "step": 2317 + }, + { + "epoch": 1.272926963207029, + "grad_norm": 0.48430113812871545, + "learning_rate": 6.26732739251412e-07, + "loss": 0.216, + "step": 2318 + }, + { + "epoch": 1.2734761120263591, + "grad_norm": 0.5248439607982119, + "learning_rate": 6.264517674753096e-07, + "loss": 0.272, + "step": 2319 + }, + { + "epoch": 1.2740252608456892, + "grad_norm": 0.4589812459543606, + "learning_rate": 6.261707530432676e-07, + "loss": 0.2425, + "step": 2320 + }, + { + "epoch": 1.2745744096650191, + "grad_norm": 0.5294243029520208, + "learning_rate": 6.258896960501177e-07, + "loss": 0.2525, + "step": 2321 + }, + { + "epoch": 1.2751235584843492, + "grad_norm": 0.4806539546141748, + "learning_rate": 6.256085965907065e-07, + "loss": 0.2698, + "step": 2322 + }, + { + "epoch": 1.2756727073036793, + "grad_norm": 0.6216759041487745, + "learning_rate": 6.253274547598948e-07, + "loss": 0.2443, + "step": 2323 + }, + { + "epoch": 1.2762218561230094, + "grad_norm": 0.3662546022104597, + "learning_rate": 6.250462706525574e-07, + "loss": 0.2642, + "step": 2324 + }, + { + "epoch": 1.2767710049423393, + "grad_norm": 0.43009471837584184, + "learning_rate": 6.247650443635837e-07, + "loss": 0.2298, + "step": 2325 + }, + { + "epoch": 1.2773201537616694, + "grad_norm": 0.5503477309160206, + "learning_rate": 6.244837759878773e-07, + "loss": 0.2651, + "step": 2326 + }, + { + "epoch": 1.2778693025809995, + "grad_norm": 0.5037855423476184, + "learning_rate": 6.242024656203556e-07, + "loss": 0.2818, + "step": 2327 + }, + { + "epoch": 1.2784184514003294, + "grad_norm": 0.45081964596908497, + "learning_rate": 6.239211133559509e-07, + "loss": 0.2422, + "step": 2328 + }, + { + "epoch": 1.2789676002196595, + "grad_norm": 0.4384904442778476, + "learning_rate": 6.236397192896089e-07, + "loss": 0.2504, + "step": 2329 + }, + { + "epoch": 1.2795167490389896, + "grad_norm": 0.6192084212713824, + "learning_rate": 6.233582835162896e-07, + "loss": 0.2529, + "step": 2330 + }, + { + "epoch": 1.2800658978583197, + "grad_norm": 0.6199476574200089, + "learning_rate": 6.230768061309679e-07, + "loss": 0.2558, + "step": 2331 + }, + { + "epoch": 1.2806150466776496, + "grad_norm": 0.44252914439257596, + "learning_rate": 6.227952872286313e-07, + "loss": 0.2827, + "step": 2332 + }, + { + "epoch": 1.2811641954969797, + "grad_norm": 0.4212682730812276, + "learning_rate": 6.225137269042824e-07, + "loss": 0.2503, + "step": 2333 + }, + { + "epoch": 1.2817133443163098, + "grad_norm": 0.5047942427404624, + "learning_rate": 6.222321252529375e-07, + "loss": 0.2514, + "step": 2334 + }, + { + "epoch": 1.2822624931356397, + "grad_norm": 0.5303095165684414, + "learning_rate": 6.219504823696264e-07, + "loss": 0.2264, + "step": 2335 + }, + { + "epoch": 1.2828116419549698, + "grad_norm": 0.5949753194922828, + "learning_rate": 6.216687983493938e-07, + "loss": 0.2713, + "step": 2336 + }, + { + "epoch": 1.2833607907742999, + "grad_norm": 0.5717953850256383, + "learning_rate": 6.213870732872971e-07, + "loss": 0.3197, + "step": 2337 + }, + { + "epoch": 1.28390993959363, + "grad_norm": 0.4909180922219964, + "learning_rate": 6.211053072784085e-07, + "loss": 0.27, + "step": 2338 + }, + { + "epoch": 1.2844590884129599, + "grad_norm": 0.4575632010776429, + "learning_rate": 6.208235004178135e-07, + "loss": 0.2818, + "step": 2339 + }, + { + "epoch": 1.28500823723229, + "grad_norm": 0.4638671824982353, + "learning_rate": 6.205416528006116e-07, + "loss": 0.2573, + "step": 2340 + }, + { + "epoch": 1.28555738605162, + "grad_norm": 0.5360334357515462, + "learning_rate": 6.202597645219158e-07, + "loss": 0.2281, + "step": 2341 + }, + { + "epoch": 1.28610653487095, + "grad_norm": 0.4988337616203063, + "learning_rate": 6.199778356768533e-07, + "loss": 0.2774, + "step": 2342 + }, + { + "epoch": 1.28665568369028, + "grad_norm": 0.3520819615679302, + "learning_rate": 6.196958663605645e-07, + "loss": 0.249, + "step": 2343 + }, + { + "epoch": 1.2872048325096102, + "grad_norm": 0.49948810745698624, + "learning_rate": 6.194138566682038e-07, + "loss": 0.2423, + "step": 2344 + }, + { + "epoch": 1.2877539813289403, + "grad_norm": 0.5670759727891223, + "learning_rate": 6.191318066949388e-07, + "loss": 0.2937, + "step": 2345 + }, + { + "epoch": 1.2883031301482701, + "grad_norm": 0.4802610824409761, + "learning_rate": 6.188497165359514e-07, + "loss": 0.2392, + "step": 2346 + }, + { + "epoch": 1.2888522789676002, + "grad_norm": 0.46163550280303034, + "learning_rate": 6.185675862864361e-07, + "loss": 0.2625, + "step": 2347 + }, + { + "epoch": 1.2894014277869301, + "grad_norm": 0.4941057724688069, + "learning_rate": 6.18285416041602e-07, + "loss": 0.2779, + "step": 2348 + }, + { + "epoch": 1.2899505766062602, + "grad_norm": 0.38576050511059606, + "learning_rate": 6.180032058966708e-07, + "loss": 0.276, + "step": 2349 + }, + { + "epoch": 1.2904997254255903, + "grad_norm": 0.5161901157687193, + "learning_rate": 6.177209559468783e-07, + "loss": 0.2367, + "step": 2350 + }, + { + "epoch": 1.2910488742449204, + "grad_norm": 0.5185303933376392, + "learning_rate": 6.174386662874731e-07, + "loss": 0.2016, + "step": 2351 + }, + { + "epoch": 1.2915980230642505, + "grad_norm": 0.3869660677666794, + "learning_rate": 6.171563370137177e-07, + "loss": 0.2188, + "step": 2352 + }, + { + "epoch": 1.2921471718835804, + "grad_norm": 0.5150224562754442, + "learning_rate": 6.168739682208883e-07, + "loss": 0.2823, + "step": 2353 + }, + { + "epoch": 1.2926963207029105, + "grad_norm": 0.48286002695917574, + "learning_rate": 6.165915600042732e-07, + "loss": 0.2515, + "step": 2354 + }, + { + "epoch": 1.2932454695222404, + "grad_norm": 0.608242619759271, + "learning_rate": 6.163091124591754e-07, + "loss": 0.3008, + "step": 2355 + }, + { + "epoch": 1.2937946183415705, + "grad_norm": 0.48636372186267857, + "learning_rate": 6.160266256809101e-07, + "loss": 0.2594, + "step": 2356 + }, + { + "epoch": 1.2943437671609006, + "grad_norm": 0.44609050407570444, + "learning_rate": 6.157440997648066e-07, + "loss": 0.2729, + "step": 2357 + }, + { + "epoch": 1.2948929159802307, + "grad_norm": 0.529375457396473, + "learning_rate": 6.154615348062066e-07, + "loss": 0.2274, + "step": 2358 + }, + { + "epoch": 1.2954420647995608, + "grad_norm": 0.4845060690017235, + "learning_rate": 6.151789309004653e-07, + "loss": 0.2618, + "step": 2359 + }, + { + "epoch": 1.2959912136188907, + "grad_norm": 0.5704097552229095, + "learning_rate": 6.148962881429515e-07, + "loss": 0.2568, + "step": 2360 + }, + { + "epoch": 1.2965403624382208, + "grad_norm": 0.7886896850220623, + "learning_rate": 6.146136066290466e-07, + "loss": 0.2523, + "step": 2361 + }, + { + "epoch": 1.2970895112575507, + "grad_norm": 0.5737922948999047, + "learning_rate": 6.14330886454145e-07, + "loss": 0.283, + "step": 2362 + }, + { + "epoch": 1.2976386600768808, + "grad_norm": 0.5328075772317451, + "learning_rate": 6.140481277136545e-07, + "loss": 0.2308, + "step": 2363 + }, + { + "epoch": 1.2981878088962109, + "grad_norm": 0.43702475603721397, + "learning_rate": 6.137653305029959e-07, + "loss": 0.2577, + "step": 2364 + }, + { + "epoch": 1.298736957715541, + "grad_norm": 0.4837406904190512, + "learning_rate": 6.134824949176025e-07, + "loss": 0.2772, + "step": 2365 + }, + { + "epoch": 1.299286106534871, + "grad_norm": 0.5833756849252187, + "learning_rate": 6.131996210529211e-07, + "loss": 0.2759, + "step": 2366 + }, + { + "epoch": 1.299835255354201, + "grad_norm": 0.61194212598957, + "learning_rate": 6.129167090044112e-07, + "loss": 0.2722, + "step": 2367 + }, + { + "epoch": 1.300384404173531, + "grad_norm": 0.49804604937916336, + "learning_rate": 6.12633758867545e-07, + "loss": 0.2483, + "step": 2368 + }, + { + "epoch": 1.300933552992861, + "grad_norm": 0.5437653024539507, + "learning_rate": 6.123507707378082e-07, + "loss": 0.3221, + "step": 2369 + }, + { + "epoch": 1.301482701812191, + "grad_norm": 0.7506257884529773, + "learning_rate": 6.120677447106985e-07, + "loss": 0.2646, + "step": 2370 + }, + { + "epoch": 1.3020318506315212, + "grad_norm": 0.4401457582562464, + "learning_rate": 6.117846808817265e-07, + "loss": 0.2229, + "step": 2371 + }, + { + "epoch": 1.3025809994508513, + "grad_norm": 0.3913317399340691, + "learning_rate": 6.115015793464166e-07, + "loss": 0.2438, + "step": 2372 + }, + { + "epoch": 1.3031301482701811, + "grad_norm": 0.5488944304946382, + "learning_rate": 6.112184402003043e-07, + "loss": 0.2681, + "step": 2373 + }, + { + "epoch": 1.3036792970895112, + "grad_norm": 0.5214923752537053, + "learning_rate": 6.109352635389393e-07, + "loss": 0.2476, + "step": 2374 + }, + { + "epoch": 1.3042284459088413, + "grad_norm": 0.5386440607410913, + "learning_rate": 6.10652049457883e-07, + "loss": 0.254, + "step": 2375 + }, + { + "epoch": 1.3047775947281712, + "grad_norm": 0.459380186973586, + "learning_rate": 6.103687980527096e-07, + "loss": 0.2572, + "step": 2376 + }, + { + "epoch": 1.3053267435475013, + "grad_norm": 0.44055375050561413, + "learning_rate": 6.100855094190063e-07, + "loss": 0.2573, + "step": 2377 + }, + { + "epoch": 1.3058758923668314, + "grad_norm": 0.556157564051401, + "learning_rate": 6.09802183652372e-07, + "loss": 0.2286, + "step": 2378 + }, + { + "epoch": 1.3064250411861615, + "grad_norm": 0.4735665444049734, + "learning_rate": 6.095188208484196e-07, + "loss": 0.2626, + "step": 2379 + }, + { + "epoch": 1.3069741900054914, + "grad_norm": 0.5276252703781577, + "learning_rate": 6.092354211027728e-07, + "loss": 0.2729, + "step": 2380 + }, + { + "epoch": 1.3075233388248215, + "grad_norm": 0.41114834934992206, + "learning_rate": 6.089519845110691e-07, + "loss": 0.2392, + "step": 2381 + }, + { + "epoch": 1.3080724876441516, + "grad_norm": 0.47880102615558046, + "learning_rate": 6.086685111689578e-07, + "loss": 0.2416, + "step": 2382 + }, + { + "epoch": 1.3086216364634815, + "grad_norm": 0.529909762649163, + "learning_rate": 6.083850011721005e-07, + "loss": 0.2222, + "step": 2383 + }, + { + "epoch": 1.3091707852828116, + "grad_norm": 0.5119668027464246, + "learning_rate": 6.081014546161715e-07, + "loss": 0.2052, + "step": 2384 + }, + { + "epoch": 1.3097199341021417, + "grad_norm": 0.4980503614782221, + "learning_rate": 6.078178715968572e-07, + "loss": 0.2347, + "step": 2385 + }, + { + "epoch": 1.3102690829214718, + "grad_norm": 0.4396007619534081, + "learning_rate": 6.075342522098568e-07, + "loss": 0.2392, + "step": 2386 + }, + { + "epoch": 1.3108182317408017, + "grad_norm": 0.399550761859906, + "learning_rate": 6.072505965508809e-07, + "loss": 0.2772, + "step": 2387 + }, + { + "epoch": 1.3113673805601318, + "grad_norm": 0.5174675878715941, + "learning_rate": 6.069669047156531e-07, + "loss": 0.2655, + "step": 2388 + }, + { + "epoch": 1.311916529379462, + "grad_norm": 0.4691218248171227, + "learning_rate": 6.06683176799909e-07, + "loss": 0.2527, + "step": 2389 + }, + { + "epoch": 1.3124656781987918, + "grad_norm": 0.3996865937564217, + "learning_rate": 6.063994128993962e-07, + "loss": 0.2358, + "step": 2390 + }, + { + "epoch": 1.3130148270181219, + "grad_norm": 0.5172584538084732, + "learning_rate": 6.061156131098747e-07, + "loss": 0.2295, + "step": 2391 + }, + { + "epoch": 1.313563975837452, + "grad_norm": 0.45495144427974676, + "learning_rate": 6.058317775271161e-07, + "loss": 0.2852, + "step": 2392 + }, + { + "epoch": 1.314113124656782, + "grad_norm": 0.46218713079339824, + "learning_rate": 6.055479062469049e-07, + "loss": 0.2768, + "step": 2393 + }, + { + "epoch": 1.314662273476112, + "grad_norm": 0.471574396130733, + "learning_rate": 6.052639993650371e-07, + "loss": 0.235, + "step": 2394 + }, + { + "epoch": 1.315211422295442, + "grad_norm": 0.426589813019802, + "learning_rate": 6.049800569773205e-07, + "loss": 0.2944, + "step": 2395 + }, + { + "epoch": 1.3157605711147722, + "grad_norm": 0.5787804052387165, + "learning_rate": 6.046960791795755e-07, + "loss": 0.2671, + "step": 2396 + }, + { + "epoch": 1.316309719934102, + "grad_norm": 0.696148070546797, + "learning_rate": 6.044120660676341e-07, + "loss": 0.2613, + "step": 2397 + }, + { + "epoch": 1.3168588687534322, + "grad_norm": 0.477230687781496, + "learning_rate": 6.041280177373403e-07, + "loss": 0.2609, + "step": 2398 + }, + { + "epoch": 1.3174080175727623, + "grad_norm": 0.6922259431504914, + "learning_rate": 6.038439342845498e-07, + "loss": 0.2646, + "step": 2399 + }, + { + "epoch": 1.3179571663920924, + "grad_norm": 0.43326734156572505, + "learning_rate": 6.035598158051304e-07, + "loss": 0.25, + "step": 2400 + }, + { + "epoch": 1.3179571663920924, + "eval_loss": 0.3342709541320801, + "eval_runtime": 18.6615, + "eval_samples_per_second": 23.739, + "eval_steps_per_second": 1.018, + "step": 2400 + }, + { + "epoch": 1.3185063152114223, + "grad_norm": 0.5582642862296483, + "learning_rate": 6.032756623949617e-07, + "loss": 0.2237, + "step": 2401 + }, + { + "epoch": 1.3190554640307524, + "grad_norm": 0.6070163777152277, + "learning_rate": 6.029914741499348e-07, + "loss": 0.2473, + "step": 2402 + }, + { + "epoch": 1.3196046128500822, + "grad_norm": 0.46983240480105454, + "learning_rate": 6.027072511659529e-07, + "loss": 0.2637, + "step": 2403 + }, + { + "epoch": 1.3201537616694123, + "grad_norm": 0.5640555729076856, + "learning_rate": 6.024229935389307e-07, + "loss": 0.2846, + "step": 2404 + }, + { + "epoch": 1.3207029104887424, + "grad_norm": 0.4701649920665535, + "learning_rate": 6.021387013647949e-07, + "loss": 0.2348, + "step": 2405 + }, + { + "epoch": 1.3212520593080725, + "grad_norm": 0.413143040151184, + "learning_rate": 6.018543747394832e-07, + "loss": 0.2576, + "step": 2406 + }, + { + "epoch": 1.3218012081274026, + "grad_norm": 0.5772483056703775, + "learning_rate": 6.015700137589456e-07, + "loss": 0.258, + "step": 2407 + }, + { + "epoch": 1.3223503569467325, + "grad_norm": 0.4289702048272331, + "learning_rate": 6.012856185191437e-07, + "loss": 0.2904, + "step": 2408 + }, + { + "epoch": 1.3228995057660626, + "grad_norm": 0.4529811899966002, + "learning_rate": 6.010011891160501e-07, + "loss": 0.2551, + "step": 2409 + }, + { + "epoch": 1.3234486545853925, + "grad_norm": 0.4727635072521221, + "learning_rate": 6.007167256456494e-07, + "loss": 0.2906, + "step": 2410 + }, + { + "epoch": 1.3239978034047226, + "grad_norm": 0.48081553901207286, + "learning_rate": 6.004322282039376e-07, + "loss": 0.2549, + "step": 2411 + }, + { + "epoch": 1.3245469522240527, + "grad_norm": 0.4640237959522978, + "learning_rate": 6.001476968869217e-07, + "loss": 0.2852, + "step": 2412 + }, + { + "epoch": 1.3250961010433828, + "grad_norm": 0.5688770709079247, + "learning_rate": 5.998631317906211e-07, + "loss": 0.282, + "step": 2413 + }, + { + "epoch": 1.325645249862713, + "grad_norm": 0.4770977407895928, + "learning_rate": 5.995785330110655e-07, + "loss": 0.2513, + "step": 2414 + }, + { + "epoch": 1.3261943986820428, + "grad_norm": 0.5102072642963997, + "learning_rate": 5.99293900644297e-07, + "loss": 0.258, + "step": 2415 + }, + { + "epoch": 1.326743547501373, + "grad_norm": 0.47970741903012687, + "learning_rate": 5.990092347863681e-07, + "loss": 0.2415, + "step": 2416 + }, + { + "epoch": 1.3272926963207028, + "grad_norm": 0.4310159790090193, + "learning_rate": 5.987245355333433e-07, + "loss": 0.2532, + "step": 2417 + }, + { + "epoch": 1.327841845140033, + "grad_norm": 0.4263900358912085, + "learning_rate": 5.984398029812982e-07, + "loss": 0.2312, + "step": 2418 + }, + { + "epoch": 1.328390993959363, + "grad_norm": 0.5669634055433357, + "learning_rate": 5.981550372263194e-07, + "loss": 0.2703, + "step": 2419 + }, + { + "epoch": 1.328940142778693, + "grad_norm": 0.3807539676192585, + "learning_rate": 5.978702383645047e-07, + "loss": 0.2191, + "step": 2420 + }, + { + "epoch": 1.329489291598023, + "grad_norm": 0.47136000678226764, + "learning_rate": 5.975854064919633e-07, + "loss": 0.2571, + "step": 2421 + }, + { + "epoch": 1.330038440417353, + "grad_norm": 0.4041068511004413, + "learning_rate": 5.973005417048157e-07, + "loss": 0.2603, + "step": 2422 + }, + { + "epoch": 1.3305875892366832, + "grad_norm": 0.551995659922425, + "learning_rate": 5.97015644099193e-07, + "loss": 0.2459, + "step": 2423 + }, + { + "epoch": 1.331136738056013, + "grad_norm": 0.48285534348477194, + "learning_rate": 5.967307137712379e-07, + "loss": 0.2208, + "step": 2424 + }, + { + "epoch": 1.3316858868753432, + "grad_norm": 0.46239674419313187, + "learning_rate": 5.964457508171035e-07, + "loss": 0.2396, + "step": 2425 + }, + { + "epoch": 1.3322350356946733, + "grad_norm": 0.4840796632741789, + "learning_rate": 5.961607553329546e-07, + "loss": 0.2819, + "step": 2426 + }, + { + "epoch": 1.3327841845140034, + "grad_norm": 0.42200141093854815, + "learning_rate": 5.95875727414967e-07, + "loss": 0.2655, + "step": 2427 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.47428505447130925, + "learning_rate": 5.955906671593264e-07, + "loss": 0.2841, + "step": 2428 + }, + { + "epoch": 1.3338824821526634, + "grad_norm": 0.4781691837979046, + "learning_rate": 5.953055746622304e-07, + "loss": 0.2322, + "step": 2429 + }, + { + "epoch": 1.3344316309719935, + "grad_norm": 0.5036167432226426, + "learning_rate": 5.950204500198875e-07, + "loss": 0.307, + "step": 2430 + }, + { + "epoch": 1.3349807797913233, + "grad_norm": 0.42585647119331005, + "learning_rate": 5.947352933285163e-07, + "loss": 0.2745, + "step": 2431 + }, + { + "epoch": 1.3355299286106534, + "grad_norm": 0.4329544521371594, + "learning_rate": 5.944501046843472e-07, + "loss": 0.2707, + "step": 2432 + }, + { + "epoch": 1.3360790774299836, + "grad_norm": 0.6444549695104927, + "learning_rate": 5.941648841836203e-07, + "loss": 0.3025, + "step": 2433 + }, + { + "epoch": 1.3366282262493137, + "grad_norm": 0.40606083158676226, + "learning_rate": 5.938796319225875e-07, + "loss": 0.2349, + "step": 2434 + }, + { + "epoch": 1.3371773750686435, + "grad_norm": 0.47466180277619985, + "learning_rate": 5.935943479975108e-07, + "loss": 0.2485, + "step": 2435 + }, + { + "epoch": 1.3377265238879736, + "grad_norm": 0.39685006021856656, + "learning_rate": 5.933090325046628e-07, + "loss": 0.241, + "step": 2436 + }, + { + "epoch": 1.3382756727073037, + "grad_norm": 0.492738848161958, + "learning_rate": 5.93023685540327e-07, + "loss": 0.2567, + "step": 2437 + }, + { + "epoch": 1.3388248215266336, + "grad_norm": 0.49493265820804205, + "learning_rate": 5.927383072007977e-07, + "loss": 0.2569, + "step": 2438 + }, + { + "epoch": 1.3393739703459637, + "grad_norm": 0.542842745466424, + "learning_rate": 5.924528975823797e-07, + "loss": 0.2795, + "step": 2439 + }, + { + "epoch": 1.3399231191652938, + "grad_norm": 0.4239838438167338, + "learning_rate": 5.921674567813877e-07, + "loss": 0.2557, + "step": 2440 + }, + { + "epoch": 1.340472267984624, + "grad_norm": 0.4966233028405723, + "learning_rate": 5.91881984894148e-07, + "loss": 0.2496, + "step": 2441 + }, + { + "epoch": 1.3410214168039538, + "grad_norm": 0.5519329968073597, + "learning_rate": 5.915964820169965e-07, + "loss": 0.2565, + "step": 2442 + }, + { + "epoch": 1.341570565623284, + "grad_norm": 0.7372612067644724, + "learning_rate": 5.913109482462799e-07, + "loss": 0.2613, + "step": 2443 + }, + { + "epoch": 1.342119714442614, + "grad_norm": 0.4796087341765942, + "learning_rate": 5.910253836783555e-07, + "loss": 0.2801, + "step": 2444 + }, + { + "epoch": 1.342668863261944, + "grad_norm": 0.4536777578695405, + "learning_rate": 5.907397884095909e-07, + "loss": 0.268, + "step": 2445 + }, + { + "epoch": 1.343218012081274, + "grad_norm": 0.5109361683793235, + "learning_rate": 5.904541625363636e-07, + "loss": 0.264, + "step": 2446 + }, + { + "epoch": 1.343767160900604, + "grad_norm": 0.41996241567495374, + "learning_rate": 5.901685061550622e-07, + "loss": 0.2869, + "step": 2447 + }, + { + "epoch": 1.3443163097199342, + "grad_norm": 0.5003238455776378, + "learning_rate": 5.89882819362085e-07, + "loss": 0.3071, + "step": 2448 + }, + { + "epoch": 1.344865458539264, + "grad_norm": 0.4697344593713444, + "learning_rate": 5.895971022538409e-07, + "loss": 0.2568, + "step": 2449 + }, + { + "epoch": 1.3454146073585942, + "grad_norm": 0.40731067950571975, + "learning_rate": 5.893113549267485e-07, + "loss": 0.2122, + "step": 2450 + }, + { + "epoch": 1.3459637561779243, + "grad_norm": 0.5396908951315318, + "learning_rate": 5.890255774772377e-07, + "loss": 0.2813, + "step": 2451 + }, + { + "epoch": 1.3465129049972542, + "grad_norm": 0.5313718306793825, + "learning_rate": 5.887397700017474e-07, + "loss": 0.3161, + "step": 2452 + }, + { + "epoch": 1.3470620538165843, + "grad_norm": 0.47293868100635245, + "learning_rate": 5.88453932596727e-07, + "loss": 0.2389, + "step": 2453 + }, + { + "epoch": 1.3476112026359144, + "grad_norm": 0.5193843611130018, + "learning_rate": 5.881680653586365e-07, + "loss": 0.2494, + "step": 2454 + }, + { + "epoch": 1.3481603514552445, + "grad_norm": 0.5043662695880984, + "learning_rate": 5.87882168383945e-07, + "loss": 0.2277, + "step": 2455 + }, + { + "epoch": 1.3487095002745744, + "grad_norm": 0.44989104679921166, + "learning_rate": 5.875962417691327e-07, + "loss": 0.2243, + "step": 2456 + }, + { + "epoch": 1.3492586490939045, + "grad_norm": 0.6009330874211384, + "learning_rate": 5.873102856106892e-07, + "loss": 0.2492, + "step": 2457 + }, + { + "epoch": 1.3498077979132344, + "grad_norm": 0.5758412385916694, + "learning_rate": 5.87024300005114e-07, + "loss": 0.2785, + "step": 2458 + }, + { + "epoch": 1.3503569467325645, + "grad_norm": 0.5857406608086171, + "learning_rate": 5.867382850489168e-07, + "loss": 0.3015, + "step": 2459 + }, + { + "epoch": 1.3509060955518946, + "grad_norm": 0.599070986036983, + "learning_rate": 5.864522408386171e-07, + "loss": 0.2387, + "step": 2460 + }, + { + "epoch": 1.3514552443712247, + "grad_norm": 0.5288798150966122, + "learning_rate": 5.861661674707444e-07, + "loss": 0.2274, + "step": 2461 + }, + { + "epoch": 1.3520043931905548, + "grad_norm": 0.4573537222909525, + "learning_rate": 5.858800650418375e-07, + "loss": 0.2865, + "step": 2462 + }, + { + "epoch": 1.3525535420098846, + "grad_norm": 0.5769587325016156, + "learning_rate": 5.85593933648446e-07, + "loss": 0.3223, + "step": 2463 + }, + { + "epoch": 1.3531026908292147, + "grad_norm": 0.5953040386204894, + "learning_rate": 5.853077733871283e-07, + "loss": 0.2422, + "step": 2464 + }, + { + "epoch": 1.3536518396485446, + "grad_norm": 0.5418375818296769, + "learning_rate": 5.850215843544533e-07, + "loss": 0.2603, + "step": 2465 + }, + { + "epoch": 1.3542009884678747, + "grad_norm": 0.5155977732195074, + "learning_rate": 5.847353666469988e-07, + "loss": 0.285, + "step": 2466 + }, + { + "epoch": 1.3547501372872048, + "grad_norm": 0.4692721414597744, + "learning_rate": 5.844491203613531e-07, + "loss": 0.2382, + "step": 2467 + }, + { + "epoch": 1.355299286106535, + "grad_norm": 0.4578483513598431, + "learning_rate": 5.841628455941135e-07, + "loss": 0.2494, + "step": 2468 + }, + { + "epoch": 1.355848434925865, + "grad_norm": 0.45991865497465073, + "learning_rate": 5.838765424418875e-07, + "loss": 0.2398, + "step": 2469 + }, + { + "epoch": 1.356397583745195, + "grad_norm": 0.4556434117858961, + "learning_rate": 5.835902110012916e-07, + "loss": 0.2242, + "step": 2470 + }, + { + "epoch": 1.356946732564525, + "grad_norm": 0.45410712187680624, + "learning_rate": 5.833038513689523e-07, + "loss": 0.2664, + "step": 2471 + }, + { + "epoch": 1.357495881383855, + "grad_norm": 0.5445284448606036, + "learning_rate": 5.830174636415052e-07, + "loss": 0.2714, + "step": 2472 + }, + { + "epoch": 1.358045030203185, + "grad_norm": 0.4794969236306016, + "learning_rate": 5.827310479155959e-07, + "loss": 0.2502, + "step": 2473 + }, + { + "epoch": 1.3585941790225151, + "grad_norm": 0.5442755281048148, + "learning_rate": 5.82444604287879e-07, + "loss": 0.2825, + "step": 2474 + }, + { + "epoch": 1.3591433278418452, + "grad_norm": 0.5662614867465593, + "learning_rate": 5.821581328550184e-07, + "loss": 0.2532, + "step": 2475 + }, + { + "epoch": 1.359692476661175, + "grad_norm": 0.5073370931357127, + "learning_rate": 5.818716337136884e-07, + "loss": 0.2464, + "step": 2476 + }, + { + "epoch": 1.3602416254805052, + "grad_norm": 0.5424531251966654, + "learning_rate": 5.815851069605711e-07, + "loss": 0.2616, + "step": 2477 + }, + { + "epoch": 1.3607907742998353, + "grad_norm": 0.52104895308535, + "learning_rate": 5.812985526923591e-07, + "loss": 0.3444, + "step": 2478 + }, + { + "epoch": 1.3613399231191652, + "grad_norm": 0.4914302070386866, + "learning_rate": 5.810119710057538e-07, + "loss": 0.2669, + "step": 2479 + }, + { + "epoch": 1.3618890719384953, + "grad_norm": 0.49525888737768364, + "learning_rate": 5.807253619974662e-07, + "loss": 0.2473, + "step": 2480 + }, + { + "epoch": 1.3624382207578254, + "grad_norm": 0.45723054270017466, + "learning_rate": 5.804387257642161e-07, + "loss": 0.2381, + "step": 2481 + }, + { + "epoch": 1.3629873695771555, + "grad_norm": 0.6350684233287965, + "learning_rate": 5.801520624027325e-07, + "loss": 0.2564, + "step": 2482 + }, + { + "epoch": 1.3635365183964854, + "grad_norm": 0.6547472277532573, + "learning_rate": 5.79865372009754e-07, + "loss": 0.3224, + "step": 2483 + }, + { + "epoch": 1.3640856672158155, + "grad_norm": 0.6118024936507656, + "learning_rate": 5.795786546820281e-07, + "loss": 0.3172, + "step": 2484 + }, + { + "epoch": 1.3646348160351456, + "grad_norm": 0.562626551495939, + "learning_rate": 5.79291910516311e-07, + "loss": 0.2578, + "step": 2485 + }, + { + "epoch": 1.3651839648544755, + "grad_norm": 0.6253248867173034, + "learning_rate": 5.790051396093685e-07, + "loss": 0.2506, + "step": 2486 + }, + { + "epoch": 1.3657331136738056, + "grad_norm": 0.5364924692535818, + "learning_rate": 5.787183420579751e-07, + "loss": 0.2306, + "step": 2487 + }, + { + "epoch": 1.3662822624931357, + "grad_norm": 0.4977927712948899, + "learning_rate": 5.784315179589147e-07, + "loss": 0.2373, + "step": 2488 + }, + { + "epoch": 1.3668314113124658, + "grad_norm": 0.480445926524663, + "learning_rate": 5.781446674089795e-07, + "loss": 0.2312, + "step": 2489 + }, + { + "epoch": 1.3673805601317957, + "grad_norm": 0.47823587381587335, + "learning_rate": 5.778577905049712e-07, + "loss": 0.2514, + "step": 2490 + }, + { + "epoch": 1.3679297089511258, + "grad_norm": 0.5089134721308354, + "learning_rate": 5.775708873437002e-07, + "loss": 0.2489, + "step": 2491 + }, + { + "epoch": 1.3684788577704559, + "grad_norm": 0.543664384488189, + "learning_rate": 5.772839580219855e-07, + "loss": 0.2633, + "step": 2492 + }, + { + "epoch": 1.3690280065897857, + "grad_norm": 0.5718583790966169, + "learning_rate": 5.769970026366558e-07, + "loss": 0.239, + "step": 2493 + }, + { + "epoch": 1.3695771554091158, + "grad_norm": 0.636404273665429, + "learning_rate": 5.767100212845469e-07, + "loss": 0.2265, + "step": 2494 + }, + { + "epoch": 1.370126304228446, + "grad_norm": 0.43856841466032964, + "learning_rate": 5.764230140625055e-07, + "loss": 0.2565, + "step": 2495 + }, + { + "epoch": 1.370675453047776, + "grad_norm": 0.5263825798160457, + "learning_rate": 5.761359810673854e-07, + "loss": 0.2261, + "step": 2496 + }, + { + "epoch": 1.371224601867106, + "grad_norm": 0.508098245950131, + "learning_rate": 5.758489223960499e-07, + "loss": 0.2793, + "step": 2497 + }, + { + "epoch": 1.371773750686436, + "grad_norm": 0.45843999919833206, + "learning_rate": 5.755618381453705e-07, + "loss": 0.2365, + "step": 2498 + }, + { + "epoch": 1.3723228995057661, + "grad_norm": 0.42731087370538556, + "learning_rate": 5.752747284122278e-07, + "loss": 0.2247, + "step": 2499 + }, + { + "epoch": 1.372872048325096, + "grad_norm": 0.4160735941327004, + "learning_rate": 5.749875932935106e-07, + "loss": 0.2756, + "step": 2500 + }, + { + "epoch": 1.3734211971444261, + "grad_norm": 0.4423198050966064, + "learning_rate": 5.747004328861164e-07, + "loss": 0.2425, + "step": 2501 + }, + { + "epoch": 1.3739703459637562, + "grad_norm": 0.5042669213959131, + "learning_rate": 5.744132472869513e-07, + "loss": 0.2408, + "step": 2502 + }, + { + "epoch": 1.3745194947830863, + "grad_norm": 0.5972578833811647, + "learning_rate": 5.741260365929299e-07, + "loss": 0.2335, + "step": 2503 + }, + { + "epoch": 1.3750686436024162, + "grad_norm": 0.43908733900145513, + "learning_rate": 5.738388009009752e-07, + "loss": 0.2528, + "step": 2504 + }, + { + "epoch": 1.3756177924217463, + "grad_norm": 0.4893645423258682, + "learning_rate": 5.735515403080186e-07, + "loss": 0.2635, + "step": 2505 + }, + { + "epoch": 1.3761669412410764, + "grad_norm": 0.4935391899408243, + "learning_rate": 5.732642549110001e-07, + "loss": 0.2575, + "step": 2506 + }, + { + "epoch": 1.3767160900604063, + "grad_norm": 0.5785809231834036, + "learning_rate": 5.72976944806868e-07, + "loss": 0.2845, + "step": 2507 + }, + { + "epoch": 1.3772652388797364, + "grad_norm": 0.6537242709620882, + "learning_rate": 5.726896100925786e-07, + "loss": 0.3096, + "step": 2508 + }, + { + "epoch": 1.3778143876990665, + "grad_norm": 0.4137915299865373, + "learning_rate": 5.72402250865097e-07, + "loss": 0.2323, + "step": 2509 + }, + { + "epoch": 1.3783635365183966, + "grad_norm": 0.5136053530625145, + "learning_rate": 5.721148672213963e-07, + "loss": 0.2168, + "step": 2510 + }, + { + "epoch": 1.3789126853377265, + "grad_norm": 0.4159143943694344, + "learning_rate": 5.718274592584578e-07, + "loss": 0.2794, + "step": 2511 + }, + { + "epoch": 1.3794618341570566, + "grad_norm": 0.5419850159318301, + "learning_rate": 5.715400270732712e-07, + "loss": 0.2312, + "step": 2512 + }, + { + "epoch": 1.3800109829763865, + "grad_norm": 0.4186260239621793, + "learning_rate": 5.712525707628341e-07, + "loss": 0.2495, + "step": 2513 + }, + { + "epoch": 1.3805601317957166, + "grad_norm": 0.47562755374013893, + "learning_rate": 5.709650904241527e-07, + "loss": 0.263, + "step": 2514 + }, + { + "epoch": 1.3811092806150467, + "grad_norm": 0.5295503656180506, + "learning_rate": 5.70677586154241e-07, + "loss": 0.2605, + "step": 2515 + }, + { + "epoch": 1.3816584294343768, + "grad_norm": 0.46895869887789937, + "learning_rate": 5.703900580501208e-07, + "loss": 0.2222, + "step": 2516 + }, + { + "epoch": 1.3822075782537069, + "grad_norm": 0.4913421000376898, + "learning_rate": 5.701025062088224e-07, + "loss": 0.2349, + "step": 2517 + }, + { + "epoch": 1.3827567270730368, + "grad_norm": 0.803768407645122, + "learning_rate": 5.698149307273842e-07, + "loss": 0.2827, + "step": 2518 + }, + { + "epoch": 1.3833058758923669, + "grad_norm": 0.5402814334114402, + "learning_rate": 5.695273317028519e-07, + "loss": 0.2939, + "step": 2519 + }, + { + "epoch": 1.3838550247116967, + "grad_norm": 0.5907488125202154, + "learning_rate": 5.692397092322799e-07, + "loss": 0.2706, + "step": 2520 + }, + { + "epoch": 1.3844041735310268, + "grad_norm": 0.4638994814653254, + "learning_rate": 5.6895206341273e-07, + "loss": 0.2343, + "step": 2521 + }, + { + "epoch": 1.384953322350357, + "grad_norm": 0.5495487458929103, + "learning_rate": 5.686643943412721e-07, + "loss": 0.2576, + "step": 2522 + }, + { + "epoch": 1.385502471169687, + "grad_norm": 0.576275510503027, + "learning_rate": 5.68376702114984e-07, + "loss": 0.2188, + "step": 2523 + }, + { + "epoch": 1.3860516199890172, + "grad_norm": 0.4657395568890571, + "learning_rate": 5.680889868309512e-07, + "loss": 0.2529, + "step": 2524 + }, + { + "epoch": 1.386600768808347, + "grad_norm": 0.5293121771997737, + "learning_rate": 5.67801248586267e-07, + "loss": 0.2514, + "step": 2525 + }, + { + "epoch": 1.3871499176276771, + "grad_norm": 0.7043331956655084, + "learning_rate": 5.675134874780324e-07, + "loss": 0.3011, + "step": 2526 + }, + { + "epoch": 1.387699066447007, + "grad_norm": 0.5158144099322439, + "learning_rate": 5.672257036033563e-07, + "loss": 0.2636, + "step": 2527 + }, + { + "epoch": 1.3882482152663371, + "grad_norm": 0.4884975221293426, + "learning_rate": 5.66937897059355e-07, + "loss": 0.2743, + "step": 2528 + }, + { + "epoch": 1.3887973640856672, + "grad_norm": 0.45123586354908496, + "learning_rate": 5.666500679431527e-07, + "loss": 0.2593, + "step": 2529 + }, + { + "epoch": 1.3893465129049973, + "grad_norm": 0.4325611101252934, + "learning_rate": 5.663622163518809e-07, + "loss": 0.2467, + "step": 2530 + }, + { + "epoch": 1.3898956617243272, + "grad_norm": 0.43465666319707863, + "learning_rate": 5.660743423826794e-07, + "loss": 0.2574, + "step": 2531 + }, + { + "epoch": 1.3904448105436573, + "grad_norm": 0.5231091803343897, + "learning_rate": 5.657864461326948e-07, + "loss": 0.2795, + "step": 2532 + }, + { + "epoch": 1.3909939593629874, + "grad_norm": 0.5357165792952369, + "learning_rate": 5.654985276990812e-07, + "loss": 0.2226, + "step": 2533 + }, + { + "epoch": 1.3915431081823173, + "grad_norm": 0.509134167848142, + "learning_rate": 5.652105871790007e-07, + "loss": 0.2304, + "step": 2534 + }, + { + "epoch": 1.3920922570016474, + "grad_norm": 0.6021000356961625, + "learning_rate": 5.649226246696227e-07, + "loss": 0.2812, + "step": 2535 + }, + { + "epoch": 1.3926414058209775, + "grad_norm": 0.5313372727157338, + "learning_rate": 5.646346402681239e-07, + "loss": 0.2286, + "step": 2536 + }, + { + "epoch": 1.3931905546403076, + "grad_norm": 0.5913908856058032, + "learning_rate": 5.643466340716884e-07, + "loss": 0.2776, + "step": 2537 + }, + { + "epoch": 1.3937397034596375, + "grad_norm": 0.5329024596754857, + "learning_rate": 5.640586061775076e-07, + "loss": 0.2174, + "step": 2538 + }, + { + "epoch": 1.3942888522789676, + "grad_norm": 0.4571165121305411, + "learning_rate": 5.637705566827805e-07, + "loss": 0.283, + "step": 2539 + }, + { + "epoch": 1.3948380010982977, + "grad_norm": 0.44971009680316143, + "learning_rate": 5.63482485684713e-07, + "loss": 0.2193, + "step": 2540 + }, + { + "epoch": 1.3953871499176276, + "grad_norm": 0.5243972938723075, + "learning_rate": 5.631943932805184e-07, + "loss": 0.2332, + "step": 2541 + }, + { + "epoch": 1.3959362987369577, + "grad_norm": 0.6058049010178508, + "learning_rate": 5.629062795674176e-07, + "loss": 0.2506, + "step": 2542 + }, + { + "epoch": 1.3964854475562878, + "grad_norm": 0.5101434796348728, + "learning_rate": 5.626181446426381e-07, + "loss": 0.2515, + "step": 2543 + }, + { + "epoch": 1.3970345963756179, + "grad_norm": 0.4129566191730494, + "learning_rate": 5.623299886034148e-07, + "loss": 0.248, + "step": 2544 + }, + { + "epoch": 1.3975837451949478, + "grad_norm": 0.4633916935043192, + "learning_rate": 5.620418115469897e-07, + "loss": 0.223, + "step": 2545 + }, + { + "epoch": 1.3981328940142779, + "grad_norm": 0.437336720089803, + "learning_rate": 5.617536135706123e-07, + "loss": 0.2577, + "step": 2546 + }, + { + "epoch": 1.398682042833608, + "grad_norm": 0.5145684666117926, + "learning_rate": 5.614653947715384e-07, + "loss": 0.2439, + "step": 2547 + }, + { + "epoch": 1.3992311916529379, + "grad_norm": 0.42040693609440827, + "learning_rate": 5.611771552470314e-07, + "loss": 0.2589, + "step": 2548 + }, + { + "epoch": 1.399780340472268, + "grad_norm": 0.46380505194661154, + "learning_rate": 5.608888950943615e-07, + "loss": 0.2593, + "step": 2549 + }, + { + "epoch": 1.400329489291598, + "grad_norm": 0.475476396017217, + "learning_rate": 5.60600614410806e-07, + "loss": 0.2959, + "step": 2550 + }, + { + "epoch": 1.4008786381109282, + "grad_norm": 0.4021118305109138, + "learning_rate": 5.603123132936488e-07, + "loss": 0.2564, + "step": 2551 + }, + { + "epoch": 1.401427786930258, + "grad_norm": 0.7449958522114163, + "learning_rate": 5.600239918401809e-07, + "loss": 0.3015, + "step": 2552 + }, + { + "epoch": 1.4019769357495881, + "grad_norm": 0.4923443819161266, + "learning_rate": 5.597356501477004e-07, + "loss": 0.2796, + "step": 2553 + }, + { + "epoch": 1.4025260845689183, + "grad_norm": 0.5396102012256275, + "learning_rate": 5.59447288313512e-07, + "loss": 0.2577, + "step": 2554 + }, + { + "epoch": 1.4030752333882481, + "grad_norm": 0.46616252587884144, + "learning_rate": 5.59158906434927e-07, + "loss": 0.2478, + "step": 2555 + }, + { + "epoch": 1.4036243822075782, + "grad_norm": 0.4940803585234028, + "learning_rate": 5.588705046092635e-07, + "loss": 0.275, + "step": 2556 + }, + { + "epoch": 1.4041735310269083, + "grad_norm": 0.49968851038405493, + "learning_rate": 5.585820829338468e-07, + "loss": 0.2337, + "step": 2557 + }, + { + "epoch": 1.4047226798462384, + "grad_norm": 0.5900768026188389, + "learning_rate": 5.582936415060086e-07, + "loss": 0.2702, + "step": 2558 + }, + { + "epoch": 1.4052718286655683, + "grad_norm": 0.4433791718295024, + "learning_rate": 5.580051804230872e-07, + "loss": 0.2835, + "step": 2559 + }, + { + "epoch": 1.4058209774848984, + "grad_norm": 0.5546085479721924, + "learning_rate": 5.577166997824275e-07, + "loss": 0.2538, + "step": 2560 + }, + { + "epoch": 1.4063701263042283, + "grad_norm": 0.49338905195185423, + "learning_rate": 5.574281996813811e-07, + "loss": 0.26, + "step": 2561 + }, + { + "epoch": 1.4069192751235584, + "grad_norm": 0.43563317633591875, + "learning_rate": 5.571396802173062e-07, + "loss": 0.2182, + "step": 2562 + }, + { + "epoch": 1.4074684239428885, + "grad_norm": 0.5110553028461078, + "learning_rate": 5.568511414875675e-07, + "loss": 0.2453, + "step": 2563 + }, + { + "epoch": 1.4080175727622186, + "grad_norm": 0.47177296049403633, + "learning_rate": 5.565625835895361e-07, + "loss": 0.2822, + "step": 2564 + }, + { + "epoch": 1.4085667215815487, + "grad_norm": 0.5496409664971602, + "learning_rate": 5.562740066205898e-07, + "loss": 0.2947, + "step": 2565 + }, + { + "epoch": 1.4091158704008786, + "grad_norm": 0.39637733569512956, + "learning_rate": 5.559854106781127e-07, + "loss": 0.2709, + "step": 2566 + }, + { + "epoch": 1.4096650192202087, + "grad_norm": 0.6151857303727866, + "learning_rate": 5.556967958594953e-07, + "loss": 0.2568, + "step": 2567 + }, + { + "epoch": 1.4102141680395386, + "grad_norm": 0.5803853484798582, + "learning_rate": 5.55408162262134e-07, + "loss": 0.2518, + "step": 2568 + }, + { + "epoch": 1.4107633168588687, + "grad_norm": 0.4794455894749099, + "learning_rate": 5.551195099834326e-07, + "loss": 0.2411, + "step": 2569 + }, + { + "epoch": 1.4113124656781988, + "grad_norm": 0.7649530136773677, + "learning_rate": 5.548308391208007e-07, + "loss": 0.3087, + "step": 2570 + }, + { + "epoch": 1.411861614497529, + "grad_norm": 0.4666597748023007, + "learning_rate": 5.545421497716533e-07, + "loss": 0.2248, + "step": 2571 + }, + { + "epoch": 1.412410763316859, + "grad_norm": 0.5121423297813796, + "learning_rate": 5.542534420334132e-07, + "loss": 0.2334, + "step": 2572 + }, + { + "epoch": 1.4129599121361889, + "grad_norm": 0.43375497668417196, + "learning_rate": 5.539647160035084e-07, + "loss": 0.2429, + "step": 2573 + }, + { + "epoch": 1.413509060955519, + "grad_norm": 0.5666466346825227, + "learning_rate": 5.536759717793731e-07, + "loss": 0.2768, + "step": 2574 + }, + { + "epoch": 1.4140582097748489, + "grad_norm": 0.4708220637290964, + "learning_rate": 5.53387209458448e-07, + "loss": 0.243, + "step": 2575 + }, + { + "epoch": 1.414607358594179, + "grad_norm": 0.3731129077020286, + "learning_rate": 5.530984291381798e-07, + "loss": 0.2716, + "step": 2576 + }, + { + "epoch": 1.415156507413509, + "grad_norm": 0.4546681364342084, + "learning_rate": 5.52809630916021e-07, + "loss": 0.2629, + "step": 2577 + }, + { + "epoch": 1.4157056562328392, + "grad_norm": 0.6906878348430655, + "learning_rate": 5.525208148894306e-07, + "loss": 0.2748, + "step": 2578 + }, + { + "epoch": 1.4162548050521693, + "grad_norm": 0.46942272826029585, + "learning_rate": 5.522319811558732e-07, + "loss": 0.2676, + "step": 2579 + }, + { + "epoch": 1.4168039538714992, + "grad_norm": 0.48851766663718893, + "learning_rate": 5.519431298128196e-07, + "loss": 0.2085, + "step": 2580 + }, + { + "epoch": 1.4173531026908293, + "grad_norm": 0.504752960820369, + "learning_rate": 5.516542609577467e-07, + "loss": 0.2415, + "step": 2581 + }, + { + "epoch": 1.4179022515101591, + "grad_norm": 0.605966929059181, + "learning_rate": 5.513653746881365e-07, + "loss": 0.2698, + "step": 2582 + }, + { + "epoch": 1.4184514003294892, + "grad_norm": 0.573766959448316, + "learning_rate": 5.510764711014782e-07, + "loss": 0.2603, + "step": 2583 + }, + { + "epoch": 1.4190005491488193, + "grad_norm": 0.5586293384385642, + "learning_rate": 5.507875502952657e-07, + "loss": 0.2489, + "step": 2584 + }, + { + "epoch": 1.4195496979681494, + "grad_norm": 0.546072716851964, + "learning_rate": 5.504986123669993e-07, + "loss": 0.2878, + "step": 2585 + }, + { + "epoch": 1.4200988467874793, + "grad_norm": 0.436997322045323, + "learning_rate": 5.502096574141844e-07, + "loss": 0.2502, + "step": 2586 + }, + { + "epoch": 1.4206479956068094, + "grad_norm": 0.5397995331885239, + "learning_rate": 5.499206855343336e-07, + "loss": 0.2827, + "step": 2587 + }, + { + "epoch": 1.4211971444261395, + "grad_norm": 0.680275200376474, + "learning_rate": 5.496316968249634e-07, + "loss": 0.2992, + "step": 2588 + }, + { + "epoch": 1.4217462932454694, + "grad_norm": 0.4541928028488033, + "learning_rate": 5.493426913835973e-07, + "loss": 0.3032, + "step": 2589 + }, + { + "epoch": 1.4222954420647995, + "grad_norm": 0.5513539341347823, + "learning_rate": 5.490536693077639e-07, + "loss": 0.251, + "step": 2590 + }, + { + "epoch": 1.4228445908841296, + "grad_norm": 0.45338952528699095, + "learning_rate": 5.487646306949973e-07, + "loss": 0.2233, + "step": 2591 + }, + { + "epoch": 1.4233937397034597, + "grad_norm": 0.5300731874273379, + "learning_rate": 5.484755756428378e-07, + "loss": 0.2369, + "step": 2592 + }, + { + "epoch": 1.4239428885227896, + "grad_norm": 0.5631313003508217, + "learning_rate": 5.481865042488303e-07, + "loss": 0.2341, + "step": 2593 + }, + { + "epoch": 1.4244920373421197, + "grad_norm": 0.39539025393663096, + "learning_rate": 5.478974166105261e-07, + "loss": 0.2461, + "step": 2594 + }, + { + "epoch": 1.4250411861614498, + "grad_norm": 0.4185735648095177, + "learning_rate": 5.476083128254817e-07, + "loss": 0.2175, + "step": 2595 + }, + { + "epoch": 1.4255903349807797, + "grad_norm": 0.4015870712824598, + "learning_rate": 5.473191929912586e-07, + "loss": 0.2488, + "step": 2596 + }, + { + "epoch": 1.4261394838001098, + "grad_norm": 0.5286556184113277, + "learning_rate": 5.470300572054246e-07, + "loss": 0.2803, + "step": 2597 + }, + { + "epoch": 1.42668863261944, + "grad_norm": 0.45533696118154177, + "learning_rate": 5.467409055655519e-07, + "loss": 0.26, + "step": 2598 + }, + { + "epoch": 1.42723778143877, + "grad_norm": 0.43986376000385236, + "learning_rate": 5.464517381692188e-07, + "loss": 0.2551, + "step": 2599 + }, + { + "epoch": 1.4277869302580999, + "grad_norm": 0.5337924992622999, + "learning_rate": 5.461625551140085e-07, + "loss": 0.2679, + "step": 2600 + }, + { + "epoch": 1.4277869302580999, + "eval_loss": 0.3316677212715149, + "eval_runtime": 18.662, + "eval_samples_per_second": 23.738, + "eval_steps_per_second": 1.018, + "step": 2600 + }, + { + "epoch": 1.42833607907743, + "grad_norm": 0.506140021645145, + "learning_rate": 5.458733564975097e-07, + "loss": 0.2353, + "step": 2601 + }, + { + "epoch": 1.42888522789676, + "grad_norm": 0.42814061027411326, + "learning_rate": 5.455841424173163e-07, + "loss": 0.2567, + "step": 2602 + }, + { + "epoch": 1.42943437671609, + "grad_norm": 0.5344115049008108, + "learning_rate": 5.452949129710275e-07, + "loss": 0.2852, + "step": 2603 + }, + { + "epoch": 1.42998352553542, + "grad_norm": 0.47070608186687296, + "learning_rate": 5.450056682562473e-07, + "loss": 0.2435, + "step": 2604 + }, + { + "epoch": 1.4305326743547502, + "grad_norm": 0.6258021311931141, + "learning_rate": 5.447164083705852e-07, + "loss": 0.2634, + "step": 2605 + }, + { + "epoch": 1.4310818231740803, + "grad_norm": 0.45701378140338594, + "learning_rate": 5.44427133411656e-07, + "loss": 0.2232, + "step": 2606 + }, + { + "epoch": 1.4316309719934102, + "grad_norm": 0.3758460619300753, + "learning_rate": 5.441378434770793e-07, + "loss": 0.2566, + "step": 2607 + }, + { + "epoch": 1.4321801208127403, + "grad_norm": 0.56930879889863, + "learning_rate": 5.438485386644793e-07, + "loss": 0.2415, + "step": 2608 + }, + { + "epoch": 1.4327292696320704, + "grad_norm": 0.5159955252920533, + "learning_rate": 5.435592190714865e-07, + "loss": 0.2395, + "step": 2609 + }, + { + "epoch": 1.4332784184514002, + "grad_norm": 1.4458408876279631, + "learning_rate": 5.432698847957349e-07, + "loss": 0.2974, + "step": 2610 + }, + { + "epoch": 1.4338275672707304, + "grad_norm": 0.5688117102745841, + "learning_rate": 5.429805359348647e-07, + "loss": 0.2404, + "step": 2611 + }, + { + "epoch": 1.4343767160900605, + "grad_norm": 0.46225900616139803, + "learning_rate": 5.426911725865199e-07, + "loss": 0.2384, + "step": 2612 + }, + { + "epoch": 1.4349258649093906, + "grad_norm": 0.5214826644002363, + "learning_rate": 5.424017948483504e-07, + "loss": 0.2606, + "step": 2613 + }, + { + "epoch": 1.4354750137287204, + "grad_norm": 0.44529699562222247, + "learning_rate": 5.421124028180108e-07, + "loss": 0.2177, + "step": 2614 + }, + { + "epoch": 1.4360241625480505, + "grad_norm": 0.47321200397728697, + "learning_rate": 5.418229965931594e-07, + "loss": 0.2409, + "step": 2615 + }, + { + "epoch": 1.4365733113673804, + "grad_norm": 0.5576254840904888, + "learning_rate": 5.415335762714609e-07, + "loss": 0.2344, + "step": 2616 + }, + { + "epoch": 1.4371224601867105, + "grad_norm": 0.42250710323786533, + "learning_rate": 5.412441419505838e-07, + "loss": 0.2515, + "step": 2617 + }, + { + "epoch": 1.4376716090060406, + "grad_norm": 0.4731986090222987, + "learning_rate": 5.409546937282013e-07, + "loss": 0.2467, + "step": 2618 + }, + { + "epoch": 1.4382207578253707, + "grad_norm": 0.4880657685005266, + "learning_rate": 5.406652317019916e-07, + "loss": 0.2683, + "step": 2619 + }, + { + "epoch": 1.4387699066447008, + "grad_norm": 0.43509023256039153, + "learning_rate": 5.403757559696376e-07, + "loss": 0.261, + "step": 2620 + }, + { + "epoch": 1.4393190554640307, + "grad_norm": 0.6029958800878207, + "learning_rate": 5.400862666288265e-07, + "loss": 0.2525, + "step": 2621 + }, + { + "epoch": 1.4398682042833608, + "grad_norm": 0.47333294480863286, + "learning_rate": 5.397967637772505e-07, + "loss": 0.2344, + "step": 2622 + }, + { + "epoch": 1.4404173531026907, + "grad_norm": 0.5498391862105979, + "learning_rate": 5.39507247512606e-07, + "loss": 0.2805, + "step": 2623 + }, + { + "epoch": 1.4409665019220208, + "grad_norm": 0.5792132187858705, + "learning_rate": 5.392177179325941e-07, + "loss": 0.2677, + "step": 2624 + }, + { + "epoch": 1.441515650741351, + "grad_norm": 0.45634207142877636, + "learning_rate": 5.389281751349205e-07, + "loss": 0.2695, + "step": 2625 + }, + { + "epoch": 1.442064799560681, + "grad_norm": 0.5111337837092467, + "learning_rate": 5.38638619217295e-07, + "loss": 0.2347, + "step": 2626 + }, + { + "epoch": 1.4426139483800111, + "grad_norm": 0.5226584799766982, + "learning_rate": 5.383490502774321e-07, + "loss": 0.2445, + "step": 2627 + }, + { + "epoch": 1.443163097199341, + "grad_norm": 0.6101541611251752, + "learning_rate": 5.38059468413051e-07, + "loss": 0.2609, + "step": 2628 + }, + { + "epoch": 1.443712246018671, + "grad_norm": 0.616957255212788, + "learning_rate": 5.377698737218742e-07, + "loss": 0.2199, + "step": 2629 + }, + { + "epoch": 1.444261394838001, + "grad_norm": 0.5183692949811273, + "learning_rate": 5.374802663016299e-07, + "loss": 0.2941, + "step": 2630 + }, + { + "epoch": 1.444810543657331, + "grad_norm": 0.5860769920445142, + "learning_rate": 5.371906462500499e-07, + "loss": 0.2391, + "step": 2631 + }, + { + "epoch": 1.4453596924766612, + "grad_norm": 0.4543579287778307, + "learning_rate": 5.369010136648698e-07, + "loss": 0.2356, + "step": 2632 + }, + { + "epoch": 1.4459088412959913, + "grad_norm": 0.4858947003640477, + "learning_rate": 5.366113686438304e-07, + "loss": 0.2688, + "step": 2633 + }, + { + "epoch": 1.4464579901153214, + "grad_norm": 0.39022698174573645, + "learning_rate": 5.36321711284676e-07, + "loss": 0.2425, + "step": 2634 + }, + { + "epoch": 1.4470071389346513, + "grad_norm": 0.38506034226710706, + "learning_rate": 5.360320416851552e-07, + "loss": 0.2415, + "step": 2635 + }, + { + "epoch": 1.4475562877539814, + "grad_norm": 0.6389120861997443, + "learning_rate": 5.357423599430212e-07, + "loss": 0.25, + "step": 2636 + }, + { + "epoch": 1.4481054365733113, + "grad_norm": 0.4498904953062231, + "learning_rate": 5.354526661560305e-07, + "loss": 0.2901, + "step": 2637 + }, + { + "epoch": 1.4486545853926414, + "grad_norm": 0.5173101027188047, + "learning_rate": 5.351629604219444e-07, + "loss": 0.2732, + "step": 2638 + }, + { + "epoch": 1.4492037342119715, + "grad_norm": 0.4845016181242439, + "learning_rate": 5.348732428385276e-07, + "loss": 0.2719, + "step": 2639 + }, + { + "epoch": 1.4497528830313016, + "grad_norm": 0.4825193919964431, + "learning_rate": 5.345835135035493e-07, + "loss": 0.2437, + "step": 2640 + }, + { + "epoch": 1.4503020318506314, + "grad_norm": 0.41436395796893055, + "learning_rate": 5.342937725147824e-07, + "loss": 0.2313, + "step": 2641 + }, + { + "epoch": 1.4508511806699615, + "grad_norm": 0.46689583167545246, + "learning_rate": 5.34004019970004e-07, + "loss": 0.2203, + "step": 2642 + }, + { + "epoch": 1.4514003294892917, + "grad_norm": 0.4408505165786209, + "learning_rate": 5.337142559669947e-07, + "loss": 0.2763, + "step": 2643 + }, + { + "epoch": 1.4519494783086215, + "grad_norm": 0.6343247874570068, + "learning_rate": 5.334244806035393e-07, + "loss": 0.2719, + "step": 2644 + }, + { + "epoch": 1.4524986271279516, + "grad_norm": 0.39150453636226124, + "learning_rate": 5.331346939774262e-07, + "loss": 0.2212, + "step": 2645 + }, + { + "epoch": 1.4530477759472817, + "grad_norm": 0.5350730065114141, + "learning_rate": 5.328448961864476e-07, + "loss": 0.2622, + "step": 2646 + }, + { + "epoch": 1.4535969247666118, + "grad_norm": 0.4290653147645335, + "learning_rate": 5.325550873284002e-07, + "loss": 0.2453, + "step": 2647 + }, + { + "epoch": 1.4541460735859417, + "grad_norm": 0.42234693864061046, + "learning_rate": 5.322652675010831e-07, + "loss": 0.2351, + "step": 2648 + }, + { + "epoch": 1.4546952224052718, + "grad_norm": 0.5251760271542221, + "learning_rate": 5.319754368022999e-07, + "loss": 0.257, + "step": 2649 + }, + { + "epoch": 1.455244371224602, + "grad_norm": 0.6055466949052233, + "learning_rate": 5.316855953298581e-07, + "loss": 0.2956, + "step": 2650 + }, + { + "epoch": 1.4557935200439318, + "grad_norm": 0.42472652979370273, + "learning_rate": 5.313957431815683e-07, + "loss": 0.2299, + "step": 2651 + }, + { + "epoch": 1.456342668863262, + "grad_norm": 0.4978694832161189, + "learning_rate": 5.311058804552451e-07, + "loss": 0.2561, + "step": 2652 + }, + { + "epoch": 1.456891817682592, + "grad_norm": 0.5700744751729543, + "learning_rate": 5.308160072487063e-07, + "loss": 0.2548, + "step": 2653 + }, + { + "epoch": 1.4574409665019221, + "grad_norm": 0.5785968744616552, + "learning_rate": 5.305261236597736e-07, + "loss": 0.2384, + "step": 2654 + }, + { + "epoch": 1.457990115321252, + "grad_norm": 0.6136199304080862, + "learning_rate": 5.30236229786272e-07, + "loss": 0.297, + "step": 2655 + }, + { + "epoch": 1.458539264140582, + "grad_norm": 0.50998355629445, + "learning_rate": 5.299463257260298e-07, + "loss": 0.2589, + "step": 2656 + }, + { + "epoch": 1.4590884129599122, + "grad_norm": 0.5930326135534739, + "learning_rate": 5.296564115768791e-07, + "loss": 0.2141, + "step": 2657 + }, + { + "epoch": 1.459637561779242, + "grad_norm": 0.5905736821105259, + "learning_rate": 5.293664874366553e-07, + "loss": 0.2429, + "step": 2658 + }, + { + "epoch": 1.4601867105985722, + "grad_norm": 0.4964043577377883, + "learning_rate": 5.290765534031969e-07, + "loss": 0.3254, + "step": 2659 + }, + { + "epoch": 1.4607358594179023, + "grad_norm": 0.5603752880220949, + "learning_rate": 5.287866095743462e-07, + "loss": 0.2859, + "step": 2660 + }, + { + "epoch": 1.4612850082372324, + "grad_norm": 0.6600462159729269, + "learning_rate": 5.284966560479485e-07, + "loss": 0.3033, + "step": 2661 + }, + { + "epoch": 1.4618341570565623, + "grad_norm": 0.4699250880858402, + "learning_rate": 5.282066929218524e-07, + "loss": 0.2528, + "step": 2662 + }, + { + "epoch": 1.4623833058758924, + "grad_norm": 0.5797770154985005, + "learning_rate": 5.279167202939098e-07, + "loss": 0.2465, + "step": 2663 + }, + { + "epoch": 1.4629324546952225, + "grad_norm": 0.583065554331777, + "learning_rate": 5.276267382619757e-07, + "loss": 0.3069, + "step": 2664 + }, + { + "epoch": 1.4634816035145524, + "grad_norm": 0.5649048337148856, + "learning_rate": 5.273367469239083e-07, + "loss": 0.283, + "step": 2665 + }, + { + "epoch": 1.4640307523338825, + "grad_norm": 0.6121639525245796, + "learning_rate": 5.270467463775691e-07, + "loss": 0.2612, + "step": 2666 + }, + { + "epoch": 1.4645799011532126, + "grad_norm": 0.45398134711720356, + "learning_rate": 5.267567367208227e-07, + "loss": 0.2453, + "step": 2667 + }, + { + "epoch": 1.4651290499725427, + "grad_norm": 0.4282179423529771, + "learning_rate": 5.264667180515365e-07, + "loss": 0.2805, + "step": 2668 + }, + { + "epoch": 1.4656781987918726, + "grad_norm": 0.5531656173222034, + "learning_rate": 5.261766904675813e-07, + "loss": 0.2605, + "step": 2669 + }, + { + "epoch": 1.4662273476112027, + "grad_norm": 0.5565252140594215, + "learning_rate": 5.258866540668305e-07, + "loss": 0.2953, + "step": 2670 + }, + { + "epoch": 1.4667764964305325, + "grad_norm": 0.4665867633885385, + "learning_rate": 5.255966089471607e-07, + "loss": 0.2815, + "step": 2671 + }, + { + "epoch": 1.4673256452498626, + "grad_norm": 0.6569944690830295, + "learning_rate": 5.253065552064517e-07, + "loss": 0.2333, + "step": 2672 + }, + { + "epoch": 1.4678747940691927, + "grad_norm": 0.572554215877364, + "learning_rate": 5.250164929425858e-07, + "loss": 0.2892, + "step": 2673 + }, + { + "epoch": 1.4684239428885228, + "grad_norm": 0.4238094446643601, + "learning_rate": 5.247264222534483e-07, + "loss": 0.3005, + "step": 2674 + }, + { + "epoch": 1.468973091707853, + "grad_norm": 0.40406640471735505, + "learning_rate": 5.244363432369274e-07, + "loss": 0.2355, + "step": 2675 + }, + { + "epoch": 1.4695222405271828, + "grad_norm": 0.43470440123019133, + "learning_rate": 5.241462559909142e-07, + "loss": 0.244, + "step": 2676 + }, + { + "epoch": 1.470071389346513, + "grad_norm": 0.4149616378586847, + "learning_rate": 5.23856160613302e-07, + "loss": 0.2605, + "step": 2677 + }, + { + "epoch": 1.4706205381658428, + "grad_norm": 0.47488965214710716, + "learning_rate": 5.235660572019879e-07, + "loss": 0.2556, + "step": 2678 + }, + { + "epoch": 1.471169686985173, + "grad_norm": 0.4834643430587485, + "learning_rate": 5.23275945854871e-07, + "loss": 0.2627, + "step": 2679 + }, + { + "epoch": 1.471718835804503, + "grad_norm": 0.46479757121507154, + "learning_rate": 5.229858266698527e-07, + "loss": 0.2378, + "step": 2680 + }, + { + "epoch": 1.4722679846238331, + "grad_norm": 0.45106727718447925, + "learning_rate": 5.226956997448381e-07, + "loss": 0.2366, + "step": 2681 + }, + { + "epoch": 1.4728171334431632, + "grad_norm": 0.6245623747590792, + "learning_rate": 5.224055651777341e-07, + "loss": 0.2245, + "step": 2682 + }, + { + "epoch": 1.473366282262493, + "grad_norm": 0.5333705901248235, + "learning_rate": 5.221154230664503e-07, + "loss": 0.2585, + "step": 2683 + }, + { + "epoch": 1.4739154310818232, + "grad_norm": 0.4454217985917844, + "learning_rate": 5.218252735088994e-07, + "loss": 0.2709, + "step": 2684 + }, + { + "epoch": 1.474464579901153, + "grad_norm": 0.492677155289305, + "learning_rate": 5.215351166029958e-07, + "loss": 0.2333, + "step": 2685 + }, + { + "epoch": 1.4750137287204832, + "grad_norm": 0.541343606657762, + "learning_rate": 5.212449524466568e-07, + "loss": 0.2484, + "step": 2686 + }, + { + "epoch": 1.4755628775398133, + "grad_norm": 0.599707183663272, + "learning_rate": 5.209547811378024e-07, + "loss": 0.2643, + "step": 2687 + }, + { + "epoch": 1.4761120263591434, + "grad_norm": 0.4764911838342862, + "learning_rate": 5.206646027743542e-07, + "loss": 0.2394, + "step": 2688 + }, + { + "epoch": 1.4766611751784733, + "grad_norm": 0.702106968173194, + "learning_rate": 5.203744174542373e-07, + "loss": 0.2314, + "step": 2689 + }, + { + "epoch": 1.4772103239978034, + "grad_norm": 0.5433508843543885, + "learning_rate": 5.200842252753783e-07, + "loss": 0.257, + "step": 2690 + }, + { + "epoch": 1.4777594728171335, + "grad_norm": 0.39340349662844637, + "learning_rate": 5.197940263357064e-07, + "loss": 0.2511, + "step": 2691 + }, + { + "epoch": 1.4783086216364634, + "grad_norm": 0.47094471464433973, + "learning_rate": 5.195038207331526e-07, + "loss": 0.2188, + "step": 2692 + }, + { + "epoch": 1.4788577704557935, + "grad_norm": 0.6779354378610623, + "learning_rate": 5.192136085656513e-07, + "loss": 0.2733, + "step": 2693 + }, + { + "epoch": 1.4794069192751236, + "grad_norm": 0.4980524396937037, + "learning_rate": 5.189233899311382e-07, + "loss": 0.2618, + "step": 2694 + }, + { + "epoch": 1.4799560680944537, + "grad_norm": 0.468198827537501, + "learning_rate": 5.186331649275513e-07, + "loss": 0.2579, + "step": 2695 + }, + { + "epoch": 1.4805052169137836, + "grad_norm": 0.6309454274631521, + "learning_rate": 5.183429336528308e-07, + "loss": 0.2329, + "step": 2696 + }, + { + "epoch": 1.4810543657331137, + "grad_norm": 0.6001086497412195, + "learning_rate": 5.18052696204919e-07, + "loss": 0.2316, + "step": 2697 + }, + { + "epoch": 1.4816035145524438, + "grad_norm": 0.5296571386786367, + "learning_rate": 5.177624526817605e-07, + "loss": 0.2621, + "step": 2698 + }, + { + "epoch": 1.4821526633717736, + "grad_norm": 0.5165644413207069, + "learning_rate": 5.174722031813019e-07, + "loss": 0.2635, + "step": 2699 + }, + { + "epoch": 1.4827018121911038, + "grad_norm": 0.4407409227683134, + "learning_rate": 5.171819478014915e-07, + "loss": 0.3084, + "step": 2700 + }, + { + "epoch": 1.4832509610104339, + "grad_norm": 0.46859597156517513, + "learning_rate": 5.1689168664028e-07, + "loss": 0.2609, + "step": 2701 + }, + { + "epoch": 1.483800109829764, + "grad_norm": 0.45523213938016316, + "learning_rate": 5.166014197956197e-07, + "loss": 0.2758, + "step": 2702 + }, + { + "epoch": 1.4843492586490938, + "grad_norm": 0.42492725823899286, + "learning_rate": 5.163111473654649e-07, + "loss": 0.2527, + "step": 2703 + }, + { + "epoch": 1.484898407468424, + "grad_norm": 0.5647787170303656, + "learning_rate": 5.160208694477719e-07, + "loss": 0.2488, + "step": 2704 + }, + { + "epoch": 1.485447556287754, + "grad_norm": 0.5579652793177825, + "learning_rate": 5.157305861404989e-07, + "loss": 0.2609, + "step": 2705 + }, + { + "epoch": 1.485996705107084, + "grad_norm": 0.49343176120630367, + "learning_rate": 5.154402975416059e-07, + "loss": 0.2449, + "step": 2706 + }, + { + "epoch": 1.486545853926414, + "grad_norm": 1.3426944652586155, + "learning_rate": 5.151500037490544e-07, + "loss": 0.3039, + "step": 2707 + }, + { + "epoch": 1.4870950027457441, + "grad_norm": 0.4756923426018933, + "learning_rate": 5.148597048608079e-07, + "loss": 0.2405, + "step": 2708 + }, + { + "epoch": 1.4876441515650742, + "grad_norm": 0.5196741383174757, + "learning_rate": 5.145694009748316e-07, + "loss": 0.2588, + "step": 2709 + }, + { + "epoch": 1.4881933003844041, + "grad_norm": 0.4485991373916076, + "learning_rate": 5.142790921890923e-07, + "loss": 0.2388, + "step": 2710 + }, + { + "epoch": 1.4887424492037342, + "grad_norm": 0.6206980603521447, + "learning_rate": 5.139887786015589e-07, + "loss": 0.2534, + "step": 2711 + }, + { + "epoch": 1.4892915980230643, + "grad_norm": 0.4207048720145005, + "learning_rate": 5.136984603102011e-07, + "loss": 0.2493, + "step": 2712 + }, + { + "epoch": 1.4898407468423942, + "grad_norm": 0.4686330635148352, + "learning_rate": 5.134081374129908e-07, + "loss": 0.2236, + "step": 2713 + }, + { + "epoch": 1.4903898956617243, + "grad_norm": 0.3882104550111546, + "learning_rate": 5.13117810007901e-07, + "loss": 0.2929, + "step": 2714 + }, + { + "epoch": 1.4909390444810544, + "grad_norm": 0.610081166844695, + "learning_rate": 5.128274781929069e-07, + "loss": 0.2396, + "step": 2715 + }, + { + "epoch": 1.4914881933003845, + "grad_norm": 0.4324690403827052, + "learning_rate": 5.125371420659848e-07, + "loss": 0.2799, + "step": 2716 + }, + { + "epoch": 1.4920373421197144, + "grad_norm": 0.44893638495618865, + "learning_rate": 5.122468017251123e-07, + "loss": 0.2493, + "step": 2717 + }, + { + "epoch": 1.4925864909390445, + "grad_norm": 0.6030894086321118, + "learning_rate": 5.119564572682684e-07, + "loss": 0.2486, + "step": 2718 + }, + { + "epoch": 1.4931356397583746, + "grad_norm": 0.5319080072435015, + "learning_rate": 5.116661087934339e-07, + "loss": 0.2611, + "step": 2719 + }, + { + "epoch": 1.4936847885777045, + "grad_norm": 0.4904603796543683, + "learning_rate": 5.113757563985905e-07, + "loss": 0.2143, + "step": 2720 + }, + { + "epoch": 1.4942339373970346, + "grad_norm": 0.3943689438120682, + "learning_rate": 5.110854001817218e-07, + "loss": 0.2338, + "step": 2721 + }, + { + "epoch": 1.4947830862163647, + "grad_norm": 0.4280408818761351, + "learning_rate": 5.107950402408117e-07, + "loss": 0.2532, + "step": 2722 + }, + { + "epoch": 1.4953322350356948, + "grad_norm": 0.4712825699658468, + "learning_rate": 5.105046766738468e-07, + "loss": 0.2693, + "step": 2723 + }, + { + "epoch": 1.4958813838550247, + "grad_norm": 0.6279286985478026, + "learning_rate": 5.102143095788136e-07, + "loss": 0.2684, + "step": 2724 + }, + { + "epoch": 1.4964305326743548, + "grad_norm": 0.4676177476682958, + "learning_rate": 5.099239390537003e-07, + "loss": 0.2419, + "step": 2725 + }, + { + "epoch": 1.4969796814936847, + "grad_norm": 0.4783012679225463, + "learning_rate": 5.096335651964962e-07, + "loss": 0.229, + "step": 2726 + }, + { + "epoch": 1.4975288303130148, + "grad_norm": 0.5479412453284265, + "learning_rate": 5.093431881051923e-07, + "loss": 0.2677, + "step": 2727 + }, + { + "epoch": 1.4980779791323449, + "grad_norm": 0.5287442832379158, + "learning_rate": 5.090528078777796e-07, + "loss": 0.2456, + "step": 2728 + }, + { + "epoch": 1.498627127951675, + "grad_norm": 0.4726957226628252, + "learning_rate": 5.087624246122509e-07, + "loss": 0.2559, + "step": 2729 + }, + { + "epoch": 1.499176276771005, + "grad_norm": 0.6389936118238199, + "learning_rate": 5.084720384065998e-07, + "loss": 0.2526, + "step": 2730 + }, + { + "epoch": 1.499725425590335, + "grad_norm": 0.6294137990739049, + "learning_rate": 5.081816493588209e-07, + "loss": 0.2595, + "step": 2731 + }, + { + "epoch": 1.500274574409665, + "grad_norm": 0.5661325736011077, + "learning_rate": 5.078912575669102e-07, + "loss": 0.2553, + "step": 2732 + }, + { + "epoch": 1.500823723228995, + "grad_norm": 0.44453987711884313, + "learning_rate": 5.076008631288639e-07, + "loss": 0.2507, + "step": 2733 + }, + { + "epoch": 1.501372872048325, + "grad_norm": 0.41941441739540863, + "learning_rate": 5.073104661426795e-07, + "loss": 0.2575, + "step": 2734 + }, + { + "epoch": 1.5019220208676551, + "grad_norm": 0.4024798561247544, + "learning_rate": 5.070200667063552e-07, + "loss": 0.2869, + "step": 2735 + }, + { + "epoch": 1.5024711696869852, + "grad_norm": 0.4851995634394856, + "learning_rate": 5.0672966491789e-07, + "loss": 0.2117, + "step": 2736 + }, + { + "epoch": 1.5030203185063153, + "grad_norm": 0.49586482497733075, + "learning_rate": 5.064392608752842e-07, + "loss": 0.2213, + "step": 2737 + }, + { + "epoch": 1.5035694673256452, + "grad_norm": 0.610983778213739, + "learning_rate": 5.061488546765381e-07, + "loss": 0.2996, + "step": 2738 + }, + { + "epoch": 1.5041186161449753, + "grad_norm": 0.4901422886842051, + "learning_rate": 5.058584464196535e-07, + "loss": 0.2479, + "step": 2739 + }, + { + "epoch": 1.5046677649643052, + "grad_norm": 0.4217227682290988, + "learning_rate": 5.05568036202632e-07, + "loss": 0.2812, + "step": 2740 + }, + { + "epoch": 1.5052169137836353, + "grad_norm": 0.4728839540110898, + "learning_rate": 5.052776241234765e-07, + "loss": 0.2468, + "step": 2741 + }, + { + "epoch": 1.5057660626029654, + "grad_norm": 0.4811207959723812, + "learning_rate": 5.049872102801907e-07, + "loss": 0.2514, + "step": 2742 + }, + { + "epoch": 1.5063152114222955, + "grad_norm": 0.6262813840019746, + "learning_rate": 5.04696794770778e-07, + "loss": 0.2567, + "step": 2743 + }, + { + "epoch": 1.5068643602416256, + "grad_norm": 0.6070475350339779, + "learning_rate": 5.044063776932435e-07, + "loss": 0.2682, + "step": 2744 + }, + { + "epoch": 1.5074135090609555, + "grad_norm": 0.5430928628778702, + "learning_rate": 5.04115959145592e-07, + "loss": 0.3159, + "step": 2745 + }, + { + "epoch": 1.5079626578802856, + "grad_norm": 0.42694634996063924, + "learning_rate": 5.038255392258292e-07, + "loss": 0.28, + "step": 2746 + }, + { + "epoch": 1.5085118066996155, + "grad_norm": 0.4997085437107576, + "learning_rate": 5.035351180319607e-07, + "loss": 0.2726, + "step": 2747 + }, + { + "epoch": 1.5090609555189456, + "grad_norm": 0.491112701998596, + "learning_rate": 5.032446956619933e-07, + "loss": 0.2488, + "step": 2748 + }, + { + "epoch": 1.5096101043382757, + "grad_norm": 0.5624050785288854, + "learning_rate": 5.02954272213934e-07, + "loss": 0.2479, + "step": 2749 + }, + { + "epoch": 1.5101592531576058, + "grad_norm": 0.46811582611774843, + "learning_rate": 5.026638477857898e-07, + "loss": 0.261, + "step": 2750 + }, + { + "epoch": 1.510708401976936, + "grad_norm": 0.5568232500099091, + "learning_rate": 5.023734224755682e-07, + "loss": 0.2668, + "step": 2751 + }, + { + "epoch": 1.5112575507962658, + "grad_norm": 0.504604675938705, + "learning_rate": 5.020829963812772e-07, + "loss": 0.2334, + "step": 2752 + }, + { + "epoch": 1.5118066996155957, + "grad_norm": 0.4298497785727823, + "learning_rate": 5.017925696009246e-07, + "loss": 0.25, + "step": 2753 + }, + { + "epoch": 1.5123558484349258, + "grad_norm": 0.5428490430779151, + "learning_rate": 5.015021422325191e-07, + "loss": 0.2309, + "step": 2754 + }, + { + "epoch": 1.5129049972542559, + "grad_norm": 0.4758247364992933, + "learning_rate": 5.012117143740691e-07, + "loss": 0.2321, + "step": 2755 + }, + { + "epoch": 1.513454146073586, + "grad_norm": 0.6265576010153409, + "learning_rate": 5.009212861235835e-07, + "loss": 0.2389, + "step": 2756 + }, + { + "epoch": 1.514003294892916, + "grad_norm": 0.45926949556654206, + "learning_rate": 5.006308575790705e-07, + "loss": 0.2486, + "step": 2757 + }, + { + "epoch": 1.5145524437122462, + "grad_norm": 0.5247480638344076, + "learning_rate": 5.003404288385398e-07, + "loss": 0.2453, + "step": 2758 + }, + { + "epoch": 1.515101592531576, + "grad_norm": 0.49608735657361896, + "learning_rate": 5.0005e-07, + "loss": 0.2524, + "step": 2759 + }, + { + "epoch": 1.515650741350906, + "grad_norm": 0.5732476954448429, + "learning_rate": 4.997595711614601e-07, + "loss": 0.2566, + "step": 2760 + }, + { + "epoch": 1.516199890170236, + "grad_norm": 0.5339956472299764, + "learning_rate": 4.994691424209294e-07, + "loss": 0.2865, + "step": 2761 + }, + { + "epoch": 1.5167490389895661, + "grad_norm": 0.511260164508659, + "learning_rate": 4.991787138764166e-07, + "loss": 0.2451, + "step": 2762 + }, + { + "epoch": 1.5172981878088962, + "grad_norm": 0.46790112493431585, + "learning_rate": 4.988882856259308e-07, + "loss": 0.2518, + "step": 2763 + }, + { + "epoch": 1.5178473366282264, + "grad_norm": 0.5529417222910061, + "learning_rate": 4.985978577674808e-07, + "loss": 0.2793, + "step": 2764 + }, + { + "epoch": 1.5183964854475562, + "grad_norm": 0.5244153677044221, + "learning_rate": 4.983074303990752e-07, + "loss": 0.2449, + "step": 2765 + }, + { + "epoch": 1.5189456342668863, + "grad_norm": 0.5624430538268501, + "learning_rate": 4.980170036187228e-07, + "loss": 0.2653, + "step": 2766 + }, + { + "epoch": 1.5194947830862162, + "grad_norm": 0.5570505670582001, + "learning_rate": 4.977265775244318e-07, + "loss": 0.2805, + "step": 2767 + }, + { + "epoch": 1.5200439319055463, + "grad_norm": 0.5538178841106489, + "learning_rate": 4.974361522142103e-07, + "loss": 0.3197, + "step": 2768 + }, + { + "epoch": 1.5205930807248764, + "grad_norm": 0.3895195377805498, + "learning_rate": 4.971457277860661e-07, + "loss": 0.2549, + "step": 2769 + }, + { + "epoch": 1.5211422295442065, + "grad_norm": 0.5339255523984222, + "learning_rate": 4.968553043380066e-07, + "loss": 0.2525, + "step": 2770 + }, + { + "epoch": 1.5216913783635366, + "grad_norm": 0.5030742725309424, + "learning_rate": 4.965648819680394e-07, + "loss": 0.2389, + "step": 2771 + }, + { + "epoch": 1.5222405271828665, + "grad_norm": 0.4073478753597533, + "learning_rate": 4.962744607741711e-07, + "loss": 0.2592, + "step": 2772 + }, + { + "epoch": 1.5227896760021966, + "grad_norm": 0.47508916853304656, + "learning_rate": 4.959840408544082e-07, + "loss": 0.2232, + "step": 2773 + }, + { + "epoch": 1.5233388248215265, + "grad_norm": 0.45696601647300056, + "learning_rate": 4.956936223067565e-07, + "loss": 0.2431, + "step": 2774 + }, + { + "epoch": 1.5238879736408566, + "grad_norm": 0.578582244142802, + "learning_rate": 4.954032052292219e-07, + "loss": 0.2869, + "step": 2775 + }, + { + "epoch": 1.5244371224601867, + "grad_norm": 0.4447708933681857, + "learning_rate": 4.951127897198094e-07, + "loss": 0.2543, + "step": 2776 + }, + { + "epoch": 1.5249862712795168, + "grad_norm": 0.5865695691582431, + "learning_rate": 4.948223758765233e-07, + "loss": 0.2434, + "step": 2777 + }, + { + "epoch": 1.525535420098847, + "grad_norm": 0.4724304171462187, + "learning_rate": 4.945319637973682e-07, + "loss": 0.2439, + "step": 2778 + }, + { + "epoch": 1.5260845689181768, + "grad_norm": 0.5146074905577956, + "learning_rate": 4.942415535803467e-07, + "loss": 0.2325, + "step": 2779 + }, + { + "epoch": 1.526633717737507, + "grad_norm": 0.5559587529345579, + "learning_rate": 4.939511453234618e-07, + "loss": 0.2748, + "step": 2780 + }, + { + "epoch": 1.5271828665568368, + "grad_norm": 0.4366947372514167, + "learning_rate": 4.936607391247159e-07, + "loss": 0.2659, + "step": 2781 + }, + { + "epoch": 1.5277320153761669, + "grad_norm": 0.5169265145637658, + "learning_rate": 4.933703350821099e-07, + "loss": 0.2852, + "step": 2782 + }, + { + "epoch": 1.528281164195497, + "grad_norm": 0.616839517904532, + "learning_rate": 4.930799332936451e-07, + "loss": 0.2503, + "step": 2783 + }, + { + "epoch": 1.528830313014827, + "grad_norm": 0.4454459922958409, + "learning_rate": 4.927895338573206e-07, + "loss": 0.2705, + "step": 2784 + }, + { + "epoch": 1.5293794618341572, + "grad_norm": 0.46594004554378604, + "learning_rate": 4.924991368711361e-07, + "loss": 0.2615, + "step": 2785 + }, + { + "epoch": 1.529928610653487, + "grad_norm": 0.4909687962361828, + "learning_rate": 4.922087424330898e-07, + "loss": 0.2449, + "step": 2786 + }, + { + "epoch": 1.5304777594728172, + "grad_norm": 0.7049498724185411, + "learning_rate": 4.919183506411788e-07, + "loss": 0.2905, + "step": 2787 + }, + { + "epoch": 1.531026908292147, + "grad_norm": 0.4732403124777763, + "learning_rate": 4.916279615934001e-07, + "loss": 0.2602, + "step": 2788 + }, + { + "epoch": 1.5315760571114772, + "grad_norm": 0.5182010829093522, + "learning_rate": 4.913375753877492e-07, + "loss": 0.2672, + "step": 2789 + }, + { + "epoch": 1.5321252059308073, + "grad_norm": 0.5026743147169596, + "learning_rate": 4.910471921222205e-07, + "loss": 0.2625, + "step": 2790 + }, + { + "epoch": 1.5326743547501374, + "grad_norm": 0.44690930349256075, + "learning_rate": 4.907568118948077e-07, + "loss": 0.249, + "step": 2791 + }, + { + "epoch": 1.5332235035694675, + "grad_norm": 0.5718200409503246, + "learning_rate": 4.904664348035035e-07, + "loss": 0.2351, + "step": 2792 + }, + { + "epoch": 1.5337726523887973, + "grad_norm": 0.5512620151554342, + "learning_rate": 4.901760609462997e-07, + "loss": 0.2733, + "step": 2793 + }, + { + "epoch": 1.5343218012081274, + "grad_norm": 0.699421534858735, + "learning_rate": 4.898856904211865e-07, + "loss": 0.2359, + "step": 2794 + }, + { + "epoch": 1.5348709500274573, + "grad_norm": 0.509253187397344, + "learning_rate": 4.895953233261532e-07, + "loss": 0.2556, + "step": 2795 + }, + { + "epoch": 1.5354200988467874, + "grad_norm": 0.5551986883696253, + "learning_rate": 4.893049597591881e-07, + "loss": 0.2609, + "step": 2796 + }, + { + "epoch": 1.5359692476661175, + "grad_norm": 0.5507101551011097, + "learning_rate": 4.890145998182782e-07, + "loss": 0.2663, + "step": 2797 + }, + { + "epoch": 1.5365183964854476, + "grad_norm": 0.5518083046074918, + "learning_rate": 4.887242436014094e-07, + "loss": 0.225, + "step": 2798 + }, + { + "epoch": 1.5370675453047777, + "grad_norm": 0.5100213412006098, + "learning_rate": 4.884338912065661e-07, + "loss": 0.2843, + "step": 2799 + }, + { + "epoch": 1.5376166941241076, + "grad_norm": 0.5627764538731767, + "learning_rate": 4.881435427317318e-07, + "loss": 0.259, + "step": 2800 + }, + { + "epoch": 1.5376166941241076, + "eval_loss": 0.3302614986896515, + "eval_runtime": 18.6514, + "eval_samples_per_second": 23.752, + "eval_steps_per_second": 1.019, + "step": 2800 + }, + { + "epoch": 1.5381658429434377, + "grad_norm": 0.5079239392088903, + "learning_rate": 4.878531982748878e-07, + "loss": 0.221, + "step": 2801 + }, + { + "epoch": 1.5387149917627676, + "grad_norm": 0.47189967828123247, + "learning_rate": 4.875628579340152e-07, + "loss": 0.2636, + "step": 2802 + }, + { + "epoch": 1.5392641405820977, + "grad_norm": 0.46554472572458666, + "learning_rate": 4.872725218070929e-07, + "loss": 0.2355, + "step": 2803 + }, + { + "epoch": 1.5398132894014278, + "grad_norm": 0.5274608123664462, + "learning_rate": 4.869821899920989e-07, + "loss": 0.2428, + "step": 2804 + }, + { + "epoch": 1.540362438220758, + "grad_norm": 0.4220397869253295, + "learning_rate": 4.866918625870093e-07, + "loss": 0.2543, + "step": 2805 + }, + { + "epoch": 1.540911587040088, + "grad_norm": 0.5095811165668853, + "learning_rate": 4.864015396897991e-07, + "loss": 0.2538, + "step": 2806 + }, + { + "epoch": 1.541460735859418, + "grad_norm": 0.5697834414711409, + "learning_rate": 4.861112213984412e-07, + "loss": 0.2641, + "step": 2807 + }, + { + "epoch": 1.5420098846787478, + "grad_norm": 0.5035818037724551, + "learning_rate": 4.858209078109075e-07, + "loss": 0.2464, + "step": 2808 + }, + { + "epoch": 1.5425590334980779, + "grad_norm": 0.4456871751261883, + "learning_rate": 4.855305990251683e-07, + "loss": 0.2321, + "step": 2809 + }, + { + "epoch": 1.543108182317408, + "grad_norm": 0.5023359837200629, + "learning_rate": 4.852402951391921e-07, + "loss": 0.2935, + "step": 2810 + }, + { + "epoch": 1.543657331136738, + "grad_norm": 0.5208603097744479, + "learning_rate": 4.849499962509457e-07, + "loss": 0.2673, + "step": 2811 + }, + { + "epoch": 1.5442064799560682, + "grad_norm": 0.4771329519172141, + "learning_rate": 4.846597024583941e-07, + "loss": 0.2294, + "step": 2812 + }, + { + "epoch": 1.5447556287753983, + "grad_norm": 0.4586798953661012, + "learning_rate": 4.843694138595009e-07, + "loss": 0.2713, + "step": 2813 + }, + { + "epoch": 1.5453047775947282, + "grad_norm": 0.4293257092269013, + "learning_rate": 4.840791305522279e-07, + "loss": 0.2615, + "step": 2814 + }, + { + "epoch": 1.545853926414058, + "grad_norm": 0.5131812310994579, + "learning_rate": 4.837888526345351e-07, + "loss": 0.2406, + "step": 2815 + }, + { + "epoch": 1.5464030752333882, + "grad_norm": 0.41261192051933104, + "learning_rate": 4.834985802043805e-07, + "loss": 0.2561, + "step": 2816 + }, + { + "epoch": 1.5469522240527183, + "grad_norm": 0.5330911791964046, + "learning_rate": 4.832083133597201e-07, + "loss": 0.2512, + "step": 2817 + }, + { + "epoch": 1.5475013728720484, + "grad_norm": 0.4793069942382472, + "learning_rate": 4.829180521985084e-07, + "loss": 0.2337, + "step": 2818 + }, + { + "epoch": 1.5480505216913785, + "grad_norm": 0.6861835783449852, + "learning_rate": 4.826277968186981e-07, + "loss": 0.3115, + "step": 2819 + }, + { + "epoch": 1.5485996705107083, + "grad_norm": 0.47167697189091795, + "learning_rate": 4.823375473182394e-07, + "loss": 0.2505, + "step": 2820 + }, + { + "epoch": 1.5491488193300385, + "grad_norm": 0.4335175847578868, + "learning_rate": 4.820473037950809e-07, + "loss": 0.2408, + "step": 2821 + }, + { + "epoch": 1.5496979681493683, + "grad_norm": 0.5367871943997862, + "learning_rate": 4.817570663471693e-07, + "loss": 0.2811, + "step": 2822 + }, + { + "epoch": 1.5502471169686984, + "grad_norm": 0.516667475918073, + "learning_rate": 4.814668350724488e-07, + "loss": 0.2534, + "step": 2823 + }, + { + "epoch": 1.5507962657880285, + "grad_norm": 0.4311751040848791, + "learning_rate": 4.811766100688619e-07, + "loss": 0.2216, + "step": 2824 + }, + { + "epoch": 1.5513454146073586, + "grad_norm": 0.47806329077048054, + "learning_rate": 4.808863914343485e-07, + "loss": 0.2548, + "step": 2825 + }, + { + "epoch": 1.5518945634266887, + "grad_norm": 0.5159309659668649, + "learning_rate": 4.805961792668472e-07, + "loss": 0.2399, + "step": 2826 + }, + { + "epoch": 1.5524437122460186, + "grad_norm": 0.40036973603765497, + "learning_rate": 4.803059736642939e-07, + "loss": 0.243, + "step": 2827 + }, + { + "epoch": 1.5529928610653487, + "grad_norm": 0.4839111243014504, + "learning_rate": 4.800157747246218e-07, + "loss": 0.2011, + "step": 2828 + }, + { + "epoch": 1.5535420098846786, + "grad_norm": 0.5146675444979022, + "learning_rate": 4.797255825457627e-07, + "loss": 0.281, + "step": 2829 + }, + { + "epoch": 1.5540911587040087, + "grad_norm": 0.3958161927252821, + "learning_rate": 4.794353972256456e-07, + "loss": 0.2519, + "step": 2830 + }, + { + "epoch": 1.5546403075233388, + "grad_norm": 0.5641128052728285, + "learning_rate": 4.791452188621977e-07, + "loss": 0.2344, + "step": 2831 + }, + { + "epoch": 1.555189456342669, + "grad_norm": 0.47553884372946037, + "learning_rate": 4.788550475533431e-07, + "loss": 0.2475, + "step": 2832 + }, + { + "epoch": 1.555738605161999, + "grad_norm": 0.5227082372446495, + "learning_rate": 4.785648833970044e-07, + "loss": 0.2716, + "step": 2833 + }, + { + "epoch": 1.556287753981329, + "grad_norm": 0.5457467193642, + "learning_rate": 4.782747264911008e-07, + "loss": 0.2448, + "step": 2834 + }, + { + "epoch": 1.556836902800659, + "grad_norm": 0.519459123520073, + "learning_rate": 4.779845769335496e-07, + "loss": 0.2485, + "step": 2835 + }, + { + "epoch": 1.5573860516199889, + "grad_norm": 0.43250234112820973, + "learning_rate": 4.776944348222659e-07, + "loss": 0.2387, + "step": 2836 + }, + { + "epoch": 1.557935200439319, + "grad_norm": 0.4219763667737606, + "learning_rate": 4.774043002551619e-07, + "loss": 0.2414, + "step": 2837 + }, + { + "epoch": 1.558484349258649, + "grad_norm": 0.49870717394184294, + "learning_rate": 4.771141733301474e-07, + "loss": 0.2549, + "step": 2838 + }, + { + "epoch": 1.5590334980779792, + "grad_norm": 0.5180297132312424, + "learning_rate": 4.7682405414512914e-07, + "loss": 0.2339, + "step": 2839 + }, + { + "epoch": 1.5595826468973093, + "grad_norm": 0.4023740770065276, + "learning_rate": 4.765339427980121e-07, + "loss": 0.2531, + "step": 2840 + }, + { + "epoch": 1.5601317957166392, + "grad_norm": 0.6759746217795632, + "learning_rate": 4.7624383938669795e-07, + "loss": 0.2457, + "step": 2841 + }, + { + "epoch": 1.5606809445359693, + "grad_norm": 0.4582921175709837, + "learning_rate": 4.7595374400908586e-07, + "loss": 0.1942, + "step": 2842 + }, + { + "epoch": 1.5612300933552992, + "grad_norm": 0.4408915341293445, + "learning_rate": 4.7566365676307254e-07, + "loss": 0.24, + "step": 2843 + }, + { + "epoch": 1.5617792421746293, + "grad_norm": 0.5412665892755424, + "learning_rate": 4.753735777465517e-07, + "loss": 0.2497, + "step": 2844 + }, + { + "epoch": 1.5623283909939594, + "grad_norm": 0.5910076589366715, + "learning_rate": 4.750835070574143e-07, + "loss": 0.2483, + "step": 2845 + }, + { + "epoch": 1.5628775398132895, + "grad_norm": 0.47649446864655787, + "learning_rate": 4.747934447935483e-07, + "loss": 0.2403, + "step": 2846 + }, + { + "epoch": 1.5634266886326196, + "grad_norm": 0.5713036156338777, + "learning_rate": 4.745033910528392e-07, + "loss": 0.2355, + "step": 2847 + }, + { + "epoch": 1.5639758374519495, + "grad_norm": 0.5221159320948752, + "learning_rate": 4.742133459331695e-07, + "loss": 0.2497, + "step": 2848 + }, + { + "epoch": 1.5645249862712796, + "grad_norm": 0.4792748310165088, + "learning_rate": 4.739233095324189e-07, + "loss": 0.2677, + "step": 2849 + }, + { + "epoch": 1.5650741350906094, + "grad_norm": 0.5071728225173472, + "learning_rate": 4.736332819484636e-07, + "loss": 0.244, + "step": 2850 + }, + { + "epoch": 1.5656232839099395, + "grad_norm": 0.5242730812459762, + "learning_rate": 4.733432632791774e-07, + "loss": 0.2359, + "step": 2851 + }, + { + "epoch": 1.5661724327292696, + "grad_norm": 0.620705334260142, + "learning_rate": 4.730532536224308e-07, + "loss": 0.2556, + "step": 2852 + }, + { + "epoch": 1.5667215815485998, + "grad_norm": 0.4966527226893142, + "learning_rate": 4.7276325307609167e-07, + "loss": 0.259, + "step": 2853 + }, + { + "epoch": 1.5672707303679299, + "grad_norm": 0.5311382760062338, + "learning_rate": 4.7247326173802443e-07, + "loss": 0.2991, + "step": 2854 + }, + { + "epoch": 1.5678198791872597, + "grad_norm": 0.5417483707486203, + "learning_rate": 4.721832797060904e-07, + "loss": 0.2616, + "step": 2855 + }, + { + "epoch": 1.5683690280065898, + "grad_norm": 0.5628074090904469, + "learning_rate": 4.718933070781476e-07, + "loss": 0.2635, + "step": 2856 + }, + { + "epoch": 1.5689181768259197, + "grad_norm": 0.43905958760313846, + "learning_rate": 4.7160334395205145e-07, + "loss": 0.2276, + "step": 2857 + }, + { + "epoch": 1.5694673256452498, + "grad_norm": 0.4854982643494795, + "learning_rate": 4.713133904256537e-07, + "loss": 0.2446, + "step": 2858 + }, + { + "epoch": 1.57001647446458, + "grad_norm": 0.42413907062845707, + "learning_rate": 4.7102344659680295e-07, + "loss": 0.251, + "step": 2859 + }, + { + "epoch": 1.57056562328391, + "grad_norm": 0.5144845224316664, + "learning_rate": 4.7073351256334485e-07, + "loss": 0.2338, + "step": 2860 + }, + { + "epoch": 1.5711147721032401, + "grad_norm": 0.4153310956832003, + "learning_rate": 4.70443588423121e-07, + "loss": 0.2533, + "step": 2861 + }, + { + "epoch": 1.57166392092257, + "grad_norm": 0.46665532721938946, + "learning_rate": 4.701536742739703e-07, + "loss": 0.2849, + "step": 2862 + }, + { + "epoch": 1.5722130697419, + "grad_norm": 0.5837429769653172, + "learning_rate": 4.698637702137281e-07, + "loss": 0.2501, + "step": 2863 + }, + { + "epoch": 1.57276221856123, + "grad_norm": 0.5773208981833295, + "learning_rate": 4.695738763402263e-07, + "loss": 0.2606, + "step": 2864 + }, + { + "epoch": 1.57331136738056, + "grad_norm": 0.5369744819112152, + "learning_rate": 4.692839927512936e-07, + "loss": 0.2297, + "step": 2865 + }, + { + "epoch": 1.5738605161998902, + "grad_norm": 0.5004406673236037, + "learning_rate": 4.689941195447549e-07, + "loss": 0.261, + "step": 2866 + }, + { + "epoch": 1.5744096650192203, + "grad_norm": 0.5129416586224244, + "learning_rate": 4.6870425681843176e-07, + "loss": 0.2501, + "step": 2867 + }, + { + "epoch": 1.5749588138385504, + "grad_norm": 0.4909208400078727, + "learning_rate": 4.6841440467014196e-07, + "loss": 0.2621, + "step": 2868 + }, + { + "epoch": 1.5755079626578803, + "grad_norm": 0.5365978804718112, + "learning_rate": 4.6812456319770005e-07, + "loss": 0.3015, + "step": 2869 + }, + { + "epoch": 1.5760571114772102, + "grad_norm": 0.5630410598392107, + "learning_rate": 4.6783473249891695e-07, + "loss": 0.2805, + "step": 2870 + }, + { + "epoch": 1.5766062602965403, + "grad_norm": 0.4858405245119762, + "learning_rate": 4.6754491267160003e-07, + "loss": 0.2371, + "step": 2871 + }, + { + "epoch": 1.5771554091158704, + "grad_norm": 0.4915165415925142, + "learning_rate": 4.672551038135523e-07, + "loss": 0.2625, + "step": 2872 + }, + { + "epoch": 1.5777045579352005, + "grad_norm": 0.5179868761116622, + "learning_rate": 4.6696530602257377e-07, + "loss": 0.2886, + "step": 2873 + }, + { + "epoch": 1.5782537067545306, + "grad_norm": 0.5767617509982645, + "learning_rate": 4.666755193964607e-07, + "loss": 0.2261, + "step": 2874 + }, + { + "epoch": 1.5788028555738605, + "grad_norm": 0.5371114574703987, + "learning_rate": 4.663857440330052e-07, + "loss": 0.2732, + "step": 2875 + }, + { + "epoch": 1.5793520043931906, + "grad_norm": 0.5254404425452375, + "learning_rate": 4.660959800299958e-07, + "loss": 0.2201, + "step": 2876 + }, + { + "epoch": 1.5799011532125204, + "grad_norm": 0.5920513853064838, + "learning_rate": 4.658062274852177e-07, + "loss": 0.2594, + "step": 2877 + }, + { + "epoch": 1.5804503020318506, + "grad_norm": 0.45988148115272065, + "learning_rate": 4.655164864964507e-07, + "loss": 0.2548, + "step": 2878 + }, + { + "epoch": 1.5809994508511807, + "grad_norm": 0.4890117316439837, + "learning_rate": 4.6522675716147246e-07, + "loss": 0.2612, + "step": 2879 + }, + { + "epoch": 1.5815485996705108, + "grad_norm": 0.6594903247878485, + "learning_rate": 4.6493703957805577e-07, + "loss": 0.2809, + "step": 2880 + }, + { + "epoch": 1.5820977484898409, + "grad_norm": 0.46956046295488574, + "learning_rate": 4.6464733384396937e-07, + "loss": 0.2512, + "step": 2881 + }, + { + "epoch": 1.5826468973091707, + "grad_norm": 0.5311362838617913, + "learning_rate": 4.643576400569788e-07, + "loss": 0.2458, + "step": 2882 + }, + { + "epoch": 1.5831960461285008, + "grad_norm": 0.4114939896171229, + "learning_rate": 4.6406795831484474e-07, + "loss": 0.2386, + "step": 2883 + }, + { + "epoch": 1.5837451949478307, + "grad_norm": 0.4748702916828854, + "learning_rate": 4.6377828871532406e-07, + "loss": 0.2091, + "step": 2884 + }, + { + "epoch": 1.5842943437671608, + "grad_norm": 0.4888204489371328, + "learning_rate": 4.6348863135616967e-07, + "loss": 0.2239, + "step": 2885 + }, + { + "epoch": 1.584843492586491, + "grad_norm": 0.44056629172269635, + "learning_rate": 4.631989863351301e-07, + "loss": 0.2071, + "step": 2886 + }, + { + "epoch": 1.585392641405821, + "grad_norm": 0.4337837133265149, + "learning_rate": 4.629093537499501e-07, + "loss": 0.2311, + "step": 2887 + }, + { + "epoch": 1.5859417902251511, + "grad_norm": 0.5317649078035698, + "learning_rate": 4.6261973369837e-07, + "loss": 0.259, + "step": 2888 + }, + { + "epoch": 1.586490939044481, + "grad_norm": 0.5759756900852464, + "learning_rate": 4.623301262781257e-07, + "loss": 0.2461, + "step": 2889 + }, + { + "epoch": 1.5870400878638111, + "grad_norm": 0.4703292757241802, + "learning_rate": 4.620405315869491e-07, + "loss": 0.2526, + "step": 2890 + }, + { + "epoch": 1.587589236683141, + "grad_norm": 0.5109377423745527, + "learning_rate": 4.617509497225678e-07, + "loss": 0.2369, + "step": 2891 + }, + { + "epoch": 1.588138385502471, + "grad_norm": 0.4790144687244557, + "learning_rate": 4.61461380782705e-07, + "loss": 0.2666, + "step": 2892 + }, + { + "epoch": 1.5886875343218012, + "grad_norm": 0.42019280365685124, + "learning_rate": 4.6117182486507956e-07, + "loss": 0.2445, + "step": 2893 + }, + { + "epoch": 1.5892366831411313, + "grad_norm": 0.8250619793909048, + "learning_rate": 4.60882282067406e-07, + "loss": 0.2254, + "step": 2894 + }, + { + "epoch": 1.5897858319604614, + "grad_norm": 0.5050084157595379, + "learning_rate": 4.6059275248739403e-07, + "loss": 0.2502, + "step": 2895 + }, + { + "epoch": 1.5903349807797913, + "grad_norm": 0.6483260235065413, + "learning_rate": 4.6030323622274955e-07, + "loss": 0.2665, + "step": 2896 + }, + { + "epoch": 1.5908841295991214, + "grad_norm": 0.9964039036691177, + "learning_rate": 4.600137333711735e-07, + "loss": 0.4358, + "step": 2897 + }, + { + "epoch": 1.5914332784184513, + "grad_norm": 0.42513017428971134, + "learning_rate": 4.5972424403036235e-07, + "loss": 0.238, + "step": 2898 + }, + { + "epoch": 1.5919824272377814, + "grad_norm": 0.4474820186996626, + "learning_rate": 4.5943476829800855e-07, + "loss": 0.2518, + "step": 2899 + }, + { + "epoch": 1.5925315760571115, + "grad_norm": 0.5755466069582273, + "learning_rate": 4.5914530627179874e-07, + "loss": 0.2588, + "step": 2900 + }, + { + "epoch": 1.5930807248764416, + "grad_norm": 0.5813029226034779, + "learning_rate": 4.5885585804941625e-07, + "loss": 0.2491, + "step": 2901 + }, + { + "epoch": 1.5936298736957717, + "grad_norm": 0.5109848564477342, + "learning_rate": 4.5856642372853897e-07, + "loss": 0.2672, + "step": 2902 + }, + { + "epoch": 1.5941790225151016, + "grad_norm": 0.5573530714024166, + "learning_rate": 4.5827700340684033e-07, + "loss": 0.2252, + "step": 2903 + }, + { + "epoch": 1.5947281713344317, + "grad_norm": 0.5316256991857502, + "learning_rate": 4.579875971819892e-07, + "loss": 0.2401, + "step": 2904 + }, + { + "epoch": 1.5952773201537616, + "grad_norm": 0.45888881091440503, + "learning_rate": 4.576982051516494e-07, + "loss": 0.2523, + "step": 2905 + }, + { + "epoch": 1.5958264689730917, + "grad_norm": 0.600405516725246, + "learning_rate": 4.5740882741348003e-07, + "loss": 0.2582, + "step": 2906 + }, + { + "epoch": 1.5963756177924218, + "grad_norm": 0.5128721216329234, + "learning_rate": 4.5711946406513537e-07, + "loss": 0.2937, + "step": 2907 + }, + { + "epoch": 1.5969247666117519, + "grad_norm": 0.4440745041487263, + "learning_rate": 4.56830115204265e-07, + "loss": 0.2338, + "step": 2908 + }, + { + "epoch": 1.597473915431082, + "grad_norm": 0.47602391207624395, + "learning_rate": 4.5654078092851355e-07, + "loss": 0.1956, + "step": 2909 + }, + { + "epoch": 1.5980230642504119, + "grad_norm": 0.49377983855292046, + "learning_rate": 4.562514613355207e-07, + "loss": 0.2515, + "step": 2910 + }, + { + "epoch": 1.598572213069742, + "grad_norm": 0.4529738345543638, + "learning_rate": 4.559621565229209e-07, + "loss": 0.2605, + "step": 2911 + }, + { + "epoch": 1.5991213618890718, + "grad_norm": 0.4692302722186548, + "learning_rate": 4.55672866588344e-07, + "loss": 0.249, + "step": 2912 + }, + { + "epoch": 1.599670510708402, + "grad_norm": 0.4241445715442823, + "learning_rate": 4.553835916294147e-07, + "loss": 0.2417, + "step": 2913 + }, + { + "epoch": 1.600219659527732, + "grad_norm": 0.46085066528851054, + "learning_rate": 4.550943317437527e-07, + "loss": 0.2092, + "step": 2914 + }, + { + "epoch": 1.6007688083470621, + "grad_norm": 0.7157498983630428, + "learning_rate": 4.5480508702897244e-07, + "loss": 0.2866, + "step": 2915 + }, + { + "epoch": 1.6013179571663922, + "grad_norm": 0.47809016786972625, + "learning_rate": 4.545158575826838e-07, + "loss": 0.2467, + "step": 2916 + }, + { + "epoch": 1.6018671059857221, + "grad_norm": 0.5618657293761034, + "learning_rate": 4.5422664350249024e-07, + "loss": 0.2364, + "step": 2917 + }, + { + "epoch": 1.602416254805052, + "grad_norm": 0.43233356556320074, + "learning_rate": 4.539374448859915e-07, + "loss": 0.2402, + "step": 2918 + }, + { + "epoch": 1.6029654036243821, + "grad_norm": 0.7321003951557213, + "learning_rate": 4.536482618307813e-07, + "loss": 0.3063, + "step": 2919 + }, + { + "epoch": 1.6035145524437122, + "grad_norm": 0.43415863774815755, + "learning_rate": 4.5335909443444804e-07, + "loss": 0.2425, + "step": 2920 + }, + { + "epoch": 1.6040637012630423, + "grad_norm": 0.5596418258407864, + "learning_rate": 4.530699427945755e-07, + "loss": 0.2438, + "step": 2921 + }, + { + "epoch": 1.6046128500823724, + "grad_norm": 0.3735831192460479, + "learning_rate": 4.5278080700874135e-07, + "loss": 0.2149, + "step": 2922 + }, + { + "epoch": 1.6051619989017025, + "grad_norm": 0.5462719096950205, + "learning_rate": 4.5249168717451836e-07, + "loss": 0.2358, + "step": 2923 + }, + { + "epoch": 1.6057111477210324, + "grad_norm": 0.4586230731201346, + "learning_rate": 4.522025833894739e-07, + "loss": 0.2801, + "step": 2924 + }, + { + "epoch": 1.6062602965403623, + "grad_norm": 0.5270734219029176, + "learning_rate": 4.519134957511697e-07, + "loss": 0.2956, + "step": 2925 + }, + { + "epoch": 1.6068094453596924, + "grad_norm": 0.4319738323864442, + "learning_rate": 4.516244243571623e-07, + "loss": 0.2584, + "step": 2926 + }, + { + "epoch": 1.6073585941790225, + "grad_norm": 0.5245532255376102, + "learning_rate": 4.5133536930500275e-07, + "loss": 0.2119, + "step": 2927 + }, + { + "epoch": 1.6079077429983526, + "grad_norm": 0.5092278036719082, + "learning_rate": 4.5104633069223623e-07, + "loss": 0.2885, + "step": 2928 + }, + { + "epoch": 1.6084568918176827, + "grad_norm": 0.5176222098633811, + "learning_rate": 4.5075730861640263e-07, + "loss": 0.2648, + "step": 2929 + }, + { + "epoch": 1.6090060406370126, + "grad_norm": 0.47960674212322696, + "learning_rate": 4.504683031750365e-07, + "loss": 0.3016, + "step": 2930 + }, + { + "epoch": 1.6095551894563427, + "grad_norm": 0.6056835781585965, + "learning_rate": 4.5017931446566645e-07, + "loss": 0.2549, + "step": 2931 + }, + { + "epoch": 1.6101043382756726, + "grad_norm": 0.5065387766463945, + "learning_rate": 4.4989034258581554e-07, + "loss": 0.2848, + "step": 2932 + }, + { + "epoch": 1.6106534870950027, + "grad_norm": 0.4253787830554584, + "learning_rate": 4.496013876330009e-07, + "loss": 0.2213, + "step": 2933 + }, + { + "epoch": 1.6112026359143328, + "grad_norm": 0.4924427881477454, + "learning_rate": 4.493124497047343e-07, + "loss": 0.2763, + "step": 2934 + }, + { + "epoch": 1.6117517847336629, + "grad_norm": 0.5894542706753981, + "learning_rate": 4.490235288985218e-07, + "loss": 0.2547, + "step": 2935 + }, + { + "epoch": 1.612300933552993, + "grad_norm": 0.4289044751118621, + "learning_rate": 4.4873462531186336e-07, + "loss": 0.2465, + "step": 2936 + }, + { + "epoch": 1.6128500823723229, + "grad_norm": 0.4284522949866835, + "learning_rate": 4.484457390422533e-07, + "loss": 0.256, + "step": 2937 + }, + { + "epoch": 1.613399231191653, + "grad_norm": 0.46967226013457086, + "learning_rate": 4.4815687018718034e-07, + "loss": 0.2395, + "step": 2938 + }, + { + "epoch": 1.6139483800109828, + "grad_norm": 0.5372992493207225, + "learning_rate": 4.478680188441268e-07, + "loss": 0.2456, + "step": 2939 + }, + { + "epoch": 1.614497528830313, + "grad_norm": 0.4558068624278018, + "learning_rate": 4.475791851105694e-07, + "loss": 0.2242, + "step": 2940 + }, + { + "epoch": 1.615046677649643, + "grad_norm": 0.5065016767222446, + "learning_rate": 4.4729036908397897e-07, + "loss": 0.2326, + "step": 2941 + }, + { + "epoch": 1.6155958264689732, + "grad_norm": 0.49429867215469064, + "learning_rate": 4.470015708618202e-07, + "loss": 0.2504, + "step": 2942 + }, + { + "epoch": 1.6161449752883033, + "grad_norm": 0.5180948292580143, + "learning_rate": 4.4671279054155196e-07, + "loss": 0.2364, + "step": 2943 + }, + { + "epoch": 1.6166941241076331, + "grad_norm": 0.4372385560713874, + "learning_rate": 4.4642402822062693e-07, + "loss": 0.1974, + "step": 2944 + }, + { + "epoch": 1.6172432729269632, + "grad_norm": 0.6174486625859185, + "learning_rate": 4.461352839964916e-07, + "loss": 0.3246, + "step": 2945 + }, + { + "epoch": 1.6177924217462931, + "grad_norm": 0.4371823318333394, + "learning_rate": 4.458465579665866e-07, + "loss": 0.2382, + "step": 2946 + }, + { + "epoch": 1.6183415705656232, + "grad_norm": 0.38280662823048006, + "learning_rate": 4.455578502283465e-07, + "loss": 0.2617, + "step": 2947 + }, + { + "epoch": 1.6188907193849533, + "grad_norm": 0.5114612302213691, + "learning_rate": 4.452691608791994e-07, + "loss": 0.2317, + "step": 2948 + }, + { + "epoch": 1.6194398682042834, + "grad_norm": 0.5964975038278817, + "learning_rate": 4.449804900165673e-07, + "loss": 0.2241, + "step": 2949 + }, + { + "epoch": 1.6199890170236135, + "grad_norm": 0.4855671845593758, + "learning_rate": 4.44691837737866e-07, + "loss": 0.2514, + "step": 2950 + }, + { + "epoch": 1.6205381658429434, + "grad_norm": 0.7675216385164546, + "learning_rate": 4.444032041405049e-07, + "loss": 0.3687, + "step": 2951 + }, + { + "epoch": 1.6210873146622735, + "grad_norm": 0.4842077014594088, + "learning_rate": 4.441145893218873e-07, + "loss": 0.2206, + "step": 2952 + }, + { + "epoch": 1.6216364634816034, + "grad_norm": 0.4759565469441726, + "learning_rate": 4.4382599337941014e-07, + "loss": 0.2446, + "step": 2953 + }, + { + "epoch": 1.6221856123009335, + "grad_norm": 0.45472033527722416, + "learning_rate": 4.435374164104639e-07, + "loss": 0.293, + "step": 2954 + }, + { + "epoch": 1.6227347611202636, + "grad_norm": 0.45031174769397214, + "learning_rate": 4.432488585124326e-07, + "loss": 0.2354, + "step": 2955 + }, + { + "epoch": 1.6232839099395937, + "grad_norm": 0.5195973926986616, + "learning_rate": 4.429603197826938e-07, + "loss": 0.2423, + "step": 2956 + }, + { + "epoch": 1.6238330587589238, + "grad_norm": 0.5096366177397833, + "learning_rate": 4.426718003186189e-07, + "loss": 0.2297, + "step": 2957 + }, + { + "epoch": 1.6243822075782537, + "grad_norm": 0.5905113957478257, + "learning_rate": 4.4238330021757256e-07, + "loss": 0.2547, + "step": 2958 + }, + { + "epoch": 1.6249313563975838, + "grad_norm": 0.4565435197256595, + "learning_rate": 4.420948195769127e-07, + "loss": 0.2542, + "step": 2959 + }, + { + "epoch": 1.6254805052169137, + "grad_norm": 0.455943454240101, + "learning_rate": 4.4180635849399134e-07, + "loss": 0.243, + "step": 2960 + }, + { + "epoch": 1.6260296540362438, + "grad_norm": 0.4164779031807216, + "learning_rate": 4.415179170661532e-07, + "loss": 0.2381, + "step": 2961 + }, + { + "epoch": 1.6265788028555739, + "grad_norm": 0.595340020939723, + "learning_rate": 4.412294953907365e-07, + "loss": 0.2429, + "step": 2962 + }, + { + "epoch": 1.627127951674904, + "grad_norm": 0.6312148140574662, + "learning_rate": 4.4094109356507307e-07, + "loss": 0.2387, + "step": 2963 + }, + { + "epoch": 1.627677100494234, + "grad_norm": 0.4595678941474588, + "learning_rate": 4.40652711686488e-07, + "loss": 0.2327, + "step": 2964 + }, + { + "epoch": 1.628226249313564, + "grad_norm": 0.5612732792189951, + "learning_rate": 4.403643498522996e-07, + "loss": 0.2236, + "step": 2965 + }, + { + "epoch": 1.6287753981328938, + "grad_norm": 0.4130375784010273, + "learning_rate": 4.400760081598191e-07, + "loss": 0.261, + "step": 2966 + }, + { + "epoch": 1.629324546952224, + "grad_norm": 0.80573332259453, + "learning_rate": 4.397876867063512e-07, + "loss": 0.2718, + "step": 2967 + }, + { + "epoch": 1.629873695771554, + "grad_norm": 0.5073023708688011, + "learning_rate": 4.3949938558919403e-07, + "loss": 0.2437, + "step": 2968 + }, + { + "epoch": 1.6304228445908842, + "grad_norm": 0.6603871450293803, + "learning_rate": 4.392111049056385e-07, + "loss": 0.2148, + "step": 2969 + }, + { + "epoch": 1.6309719934102143, + "grad_norm": 0.583614769125041, + "learning_rate": 4.3892284475296857e-07, + "loss": 0.2409, + "step": 2970 + }, + { + "epoch": 1.6315211422295444, + "grad_norm": 0.5203909843693727, + "learning_rate": 4.3863460522846176e-07, + "loss": 0.236, + "step": 2971 + }, + { + "epoch": 1.6320702910488742, + "grad_norm": 0.5894065647013814, + "learning_rate": 4.3834638642938786e-07, + "loss": 0.2718, + "step": 2972 + }, + { + "epoch": 1.6326194398682041, + "grad_norm": 0.4757715836098419, + "learning_rate": 4.3805818845301025e-07, + "loss": 0.277, + "step": 2973 + }, + { + "epoch": 1.6331685886875342, + "grad_norm": 0.6137710197467259, + "learning_rate": 4.3777001139658524e-07, + "loss": 0.2649, + "step": 2974 + }, + { + "epoch": 1.6337177375068643, + "grad_norm": 0.45116394860291625, + "learning_rate": 4.3748185535736196e-07, + "loss": 0.241, + "step": 2975 + }, + { + "epoch": 1.6342668863261944, + "grad_norm": 0.5311190405784784, + "learning_rate": 4.3719372043258254e-07, + "loss": 0.2415, + "step": 2976 + }, + { + "epoch": 1.6348160351455245, + "grad_norm": 0.6620057299676249, + "learning_rate": 4.369056067194815e-07, + "loss": 0.2432, + "step": 2977 + }, + { + "epoch": 1.6353651839648546, + "grad_norm": 0.4879195016935656, + "learning_rate": 4.3661751431528703e-07, + "loss": 0.2338, + "step": 2978 + }, + { + "epoch": 1.6359143327841845, + "grad_norm": 0.5464755861012252, + "learning_rate": 4.3632944331721954e-07, + "loss": 0.2476, + "step": 2979 + }, + { + "epoch": 1.6364634816035144, + "grad_norm": 0.5635139062204975, + "learning_rate": 4.3604139382249224e-07, + "loss": 0.2515, + "step": 2980 + }, + { + "epoch": 1.6370126304228445, + "grad_norm": 0.5224059360813856, + "learning_rate": 4.3575336592831153e-07, + "loss": 0.2449, + "step": 2981 + }, + { + "epoch": 1.6375617792421746, + "grad_norm": 0.5869731629868171, + "learning_rate": 4.3546535973187603e-07, + "loss": 0.2375, + "step": 2982 + }, + { + "epoch": 1.6381109280615047, + "grad_norm": 0.4615835679590841, + "learning_rate": 4.351773753303772e-07, + "loss": 0.2849, + "step": 2983 + }, + { + "epoch": 1.6386600768808348, + "grad_norm": 0.5504512368035249, + "learning_rate": 4.3488941282099927e-07, + "loss": 0.2296, + "step": 2984 + }, + { + "epoch": 1.6392092257001647, + "grad_norm": 0.46984498389223545, + "learning_rate": 4.346014723009188e-07, + "loss": 0.2399, + "step": 2985 + }, + { + "epoch": 1.6397583745194948, + "grad_norm": 0.55724959746297, + "learning_rate": 4.3431355386730536e-07, + "loss": 0.2436, + "step": 2986 + }, + { + "epoch": 1.6403075233388247, + "grad_norm": 0.49595845804379907, + "learning_rate": 4.3402565761732063e-07, + "loss": 0.2256, + "step": 2987 + }, + { + "epoch": 1.6408566721581548, + "grad_norm": 0.5661298030128081, + "learning_rate": 4.337377836481191e-07, + "loss": 0.2426, + "step": 2988 + }, + { + "epoch": 1.6414058209774849, + "grad_norm": 0.5694723374714927, + "learning_rate": 4.334499320568474e-07, + "loss": 0.2575, + "step": 2989 + }, + { + "epoch": 1.641954969796815, + "grad_norm": 0.45196041404453813, + "learning_rate": 4.3316210294064496e-07, + "loss": 0.2122, + "step": 2990 + }, + { + "epoch": 1.642504118616145, + "grad_norm": 0.464440428690339, + "learning_rate": 4.328742963966437e-07, + "loss": 0.2192, + "step": 2991 + }, + { + "epoch": 1.643053267435475, + "grad_norm": 0.532738911522231, + "learning_rate": 4.325865125219675e-07, + "loss": 0.2703, + "step": 2992 + }, + { + "epoch": 1.643602416254805, + "grad_norm": 0.5140227276067537, + "learning_rate": 4.322987514137331e-07, + "loss": 0.2321, + "step": 2993 + }, + { + "epoch": 1.644151565074135, + "grad_norm": 0.5598448935949341, + "learning_rate": 4.320110131690487e-07, + "loss": 0.3058, + "step": 2994 + }, + { + "epoch": 1.644700713893465, + "grad_norm": 0.6395590360057778, + "learning_rate": 4.317232978850159e-07, + "loss": 0.2661, + "step": 2995 + }, + { + "epoch": 1.6452498627127952, + "grad_norm": 0.4245140397625883, + "learning_rate": 4.314356056587279e-07, + "loss": 0.2654, + "step": 2996 + }, + { + "epoch": 1.6457990115321253, + "grad_norm": 0.6458112749863569, + "learning_rate": 4.311479365872699e-07, + "loss": 0.2059, + "step": 2997 + }, + { + "epoch": 1.6463481603514554, + "grad_norm": 0.4576900188920051, + "learning_rate": 4.3086029076772025e-07, + "loss": 0.2247, + "step": 2998 + }, + { + "epoch": 1.6468973091707853, + "grad_norm": 0.4685017041085208, + "learning_rate": 4.305726682971481e-07, + "loss": 0.2431, + "step": 2999 + }, + { + "epoch": 1.6474464579901154, + "grad_norm": 0.5015590452693746, + "learning_rate": 4.302850692726159e-07, + "loss": 0.2525, + "step": 3000 + }, + { + "epoch": 1.6474464579901154, + "eval_loss": 0.32768309116363525, + "eval_runtime": 18.6661, + "eval_samples_per_second": 23.733, + "eval_steps_per_second": 1.018, + "step": 3000 + }, + { + "epoch": 1.6479956068094452, + "grad_norm": 0.5321880247918745, + "learning_rate": 4.2999749379117755e-07, + "loss": 0.226, + "step": 3001 + }, + { + "epoch": 1.6485447556287753, + "grad_norm": 0.5016436617857616, + "learning_rate": 4.2970994194987916e-07, + "loss": 0.2387, + "step": 3002 + }, + { + "epoch": 1.6490939044481054, + "grad_norm": 0.4711429385206377, + "learning_rate": 4.29422413845759e-07, + "loss": 0.2668, + "step": 3003 + }, + { + "epoch": 1.6496430532674355, + "grad_norm": 0.5024320098710312, + "learning_rate": 4.2913490957584725e-07, + "loss": 0.2489, + "step": 3004 + }, + { + "epoch": 1.6501922020867656, + "grad_norm": 0.5101561852971753, + "learning_rate": 4.2884742923716586e-07, + "loss": 0.239, + "step": 3005 + }, + { + "epoch": 1.6507413509060955, + "grad_norm": 0.44747282520626047, + "learning_rate": 4.285599729267289e-07, + "loss": 0.2242, + "step": 3006 + }, + { + "epoch": 1.6512904997254256, + "grad_norm": 0.5446473872220932, + "learning_rate": 4.2827254074154226e-07, + "loss": 0.2545, + "step": 3007 + }, + { + "epoch": 1.6518396485447555, + "grad_norm": 0.4644172645656962, + "learning_rate": 4.279851327786038e-07, + "loss": 0.3082, + "step": 3008 + }, + { + "epoch": 1.6523887973640856, + "grad_norm": 0.38494898532173805, + "learning_rate": 4.276977491349031e-07, + "loss": 0.2493, + "step": 3009 + }, + { + "epoch": 1.6529379461834157, + "grad_norm": 0.5527168698358322, + "learning_rate": 4.274103899074215e-07, + "loss": 0.2193, + "step": 3010 + }, + { + "epoch": 1.6534870950027458, + "grad_norm": 0.5531703585469031, + "learning_rate": 4.27123055193132e-07, + "loss": 0.2681, + "step": 3011 + }, + { + "epoch": 1.654036243822076, + "grad_norm": 0.8028292194172078, + "learning_rate": 4.268357450889998e-07, + "loss": 0.2343, + "step": 3012 + }, + { + "epoch": 1.6545853926414058, + "grad_norm": 0.5005892526439495, + "learning_rate": 4.2654845969198133e-07, + "loss": 0.2488, + "step": 3013 + }, + { + "epoch": 1.655134541460736, + "grad_norm": 0.4240992579928759, + "learning_rate": 4.262611990990247e-07, + "loss": 0.2698, + "step": 3014 + }, + { + "epoch": 1.6556836902800658, + "grad_norm": 0.5390274157023256, + "learning_rate": 4.2597396340707024e-07, + "loss": 0.2347, + "step": 3015 + }, + { + "epoch": 1.656232839099396, + "grad_norm": 0.407457690983785, + "learning_rate": 4.256867527130487e-07, + "loss": 0.2454, + "step": 3016 + }, + { + "epoch": 1.656781987918726, + "grad_norm": 0.567694031133933, + "learning_rate": 4.2539956711388363e-07, + "loss": 0.2617, + "step": 3017 + }, + { + "epoch": 1.657331136738056, + "grad_norm": 0.5975179865111244, + "learning_rate": 4.251124067064895e-07, + "loss": 0.254, + "step": 3018 + }, + { + "epoch": 1.6578802855573862, + "grad_norm": 0.5648313514852326, + "learning_rate": 4.248252715877722e-07, + "loss": 0.2823, + "step": 3019 + }, + { + "epoch": 1.658429434376716, + "grad_norm": 0.48146795297414174, + "learning_rate": 4.245381618546296e-07, + "loss": 0.2656, + "step": 3020 + }, + { + "epoch": 1.658978583196046, + "grad_norm": 0.5884997084633288, + "learning_rate": 4.242510776039501e-07, + "loss": 0.247, + "step": 3021 + }, + { + "epoch": 1.659527732015376, + "grad_norm": 0.40603972320622483, + "learning_rate": 4.2396401893261457e-07, + "loss": 0.282, + "step": 3022 + }, + { + "epoch": 1.6600768808347062, + "grad_norm": 0.4674583345888558, + "learning_rate": 4.2367698593749444e-07, + "loss": 0.2364, + "step": 3023 + }, + { + "epoch": 1.6606260296540363, + "grad_norm": 0.5658847248246237, + "learning_rate": 4.233899787154529e-07, + "loss": 0.2607, + "step": 3024 + }, + { + "epoch": 1.6611751784733664, + "grad_norm": 0.4368981402855396, + "learning_rate": 4.2310299736334435e-07, + "loss": 0.2381, + "step": 3025 + }, + { + "epoch": 1.6617243272926965, + "grad_norm": 0.5643001206864895, + "learning_rate": 4.228160419780145e-07, + "loss": 0.2748, + "step": 3026 + }, + { + "epoch": 1.6622734761120264, + "grad_norm": 0.43976914199520706, + "learning_rate": 4.225291126562999e-07, + "loss": 0.2514, + "step": 3027 + }, + { + "epoch": 1.6628226249313562, + "grad_norm": 0.4494410236378608, + "learning_rate": 4.2224220949502873e-07, + "loss": 0.25, + "step": 3028 + }, + { + "epoch": 1.6633717737506863, + "grad_norm": 0.5527842355106057, + "learning_rate": 4.2195533259102053e-07, + "loss": 0.2795, + "step": 3029 + }, + { + "epoch": 1.6639209225700164, + "grad_norm": 0.5640588007644353, + "learning_rate": 4.2166848204108527e-07, + "loss": 0.2404, + "step": 3030 + }, + { + "epoch": 1.6644700713893466, + "grad_norm": 0.6545184724717047, + "learning_rate": 4.213816579420249e-07, + "loss": 0.2909, + "step": 3031 + }, + { + "epoch": 1.6650192202086767, + "grad_norm": 0.5910175990205282, + "learning_rate": 4.2109486039063155e-07, + "loss": 0.2251, + "step": 3032 + }, + { + "epoch": 1.6655683690280065, + "grad_norm": 0.5683617619745938, + "learning_rate": 4.208080894836891e-07, + "loss": 0.291, + "step": 3033 + }, + { + "epoch": 1.6661175178473366, + "grad_norm": 0.4400071920958627, + "learning_rate": 4.2052134531797195e-07, + "loss": 0.2845, + "step": 3034 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.664955262231542, + "learning_rate": 4.2023462799024594e-07, + "loss": 0.2823, + "step": 3035 + }, + { + "epoch": 1.6672158154859966, + "grad_norm": 0.4348603489456307, + "learning_rate": 4.199479375972673e-07, + "loss": 0.2295, + "step": 3036 + }, + { + "epoch": 1.6677649643053267, + "grad_norm": 0.5921591066388595, + "learning_rate": 4.19661274235784e-07, + "loss": 0.2268, + "step": 3037 + }, + { + "epoch": 1.6683141131246568, + "grad_norm": 0.46590129443327344, + "learning_rate": 4.193746380025338e-07, + "loss": 0.2378, + "step": 3038 + }, + { + "epoch": 1.668863261943987, + "grad_norm": 0.5381341974645318, + "learning_rate": 4.1908802899424613e-07, + "loss": 0.2619, + "step": 3039 + }, + { + "epoch": 1.6694124107633168, + "grad_norm": 0.47844780942334575, + "learning_rate": 4.1880144730764096e-07, + "loss": 0.2662, + "step": 3040 + }, + { + "epoch": 1.669961559582647, + "grad_norm": 0.44402481051341247, + "learning_rate": 4.185148930394288e-07, + "loss": 0.2621, + "step": 3041 + }, + { + "epoch": 1.6705107084019768, + "grad_norm": 0.4071245777064109, + "learning_rate": 4.1822836628631176e-07, + "loss": 0.2404, + "step": 3042 + }, + { + "epoch": 1.671059857221307, + "grad_norm": 0.4723422532810768, + "learning_rate": 4.1794186714498154e-07, + "loss": 0.2385, + "step": 3043 + }, + { + "epoch": 1.671609006040637, + "grad_norm": 0.4566559495746314, + "learning_rate": 4.176553957121211e-07, + "loss": 0.236, + "step": 3044 + }, + { + "epoch": 1.672158154859967, + "grad_norm": 0.4081581679093222, + "learning_rate": 4.17368952084404e-07, + "loss": 0.2489, + "step": 3045 + }, + { + "epoch": 1.6727073036792972, + "grad_norm": 0.5154388675068508, + "learning_rate": 4.1708253635849464e-07, + "loss": 0.2631, + "step": 3046 + }, + { + "epoch": 1.673256452498627, + "grad_norm": 0.4720196217539502, + "learning_rate": 4.167961486310477e-07, + "loss": 0.2293, + "step": 3047 + }, + { + "epoch": 1.6738056013179572, + "grad_norm": 0.3739883659854765, + "learning_rate": 4.165097889987085e-07, + "loss": 0.2627, + "step": 3048 + }, + { + "epoch": 1.674354750137287, + "grad_norm": 0.4986845999349851, + "learning_rate": 4.162234575581126e-07, + "loss": 0.2416, + "step": 3049 + }, + { + "epoch": 1.6749038989566172, + "grad_norm": 0.4291976326128812, + "learning_rate": 4.159371544058864e-07, + "loss": 0.2425, + "step": 3050 + }, + { + "epoch": 1.6754530477759473, + "grad_norm": 0.47218705680666195, + "learning_rate": 4.156508796386469e-07, + "loss": 0.2339, + "step": 3051 + }, + { + "epoch": 1.6760021965952774, + "grad_norm": 0.48858651592646163, + "learning_rate": 4.153646333530012e-07, + "loss": 0.2503, + "step": 3052 + }, + { + "epoch": 1.6765513454146075, + "grad_norm": 0.4245944408227513, + "learning_rate": 4.150784156455469e-07, + "loss": 0.2271, + "step": 3053 + }, + { + "epoch": 1.6771004942339374, + "grad_norm": 0.4755699562336762, + "learning_rate": 4.147922266128718e-07, + "loss": 0.2638, + "step": 3054 + }, + { + "epoch": 1.6776496430532675, + "grad_norm": 0.44170874335767984, + "learning_rate": 4.14506066351554e-07, + "loss": 0.2599, + "step": 3055 + }, + { + "epoch": 1.6781987918725974, + "grad_norm": 0.5659207277110574, + "learning_rate": 4.1421993495816244e-07, + "loss": 0.2634, + "step": 3056 + }, + { + "epoch": 1.6787479406919275, + "grad_norm": 0.41536777601673885, + "learning_rate": 4.1393383252925576e-07, + "loss": 0.219, + "step": 3057 + }, + { + "epoch": 1.6792970895112576, + "grad_norm": 0.5545571789442999, + "learning_rate": 4.1364775916138283e-07, + "loss": 0.2945, + "step": 3058 + }, + { + "epoch": 1.6798462383305877, + "grad_norm": 0.5636197241564115, + "learning_rate": 4.133617149510832e-07, + "loss": 0.3089, + "step": 3059 + }, + { + "epoch": 1.6803953871499178, + "grad_norm": 0.4611510670906123, + "learning_rate": 4.1307569999488594e-07, + "loss": 0.2348, + "step": 3060 + }, + { + "epoch": 1.6809445359692476, + "grad_norm": 0.5169147010074834, + "learning_rate": 4.127897143893108e-07, + "loss": 0.2886, + "step": 3061 + }, + { + "epoch": 1.6814936847885777, + "grad_norm": 0.44274446558278335, + "learning_rate": 4.1250375823086714e-07, + "loss": 0.2342, + "step": 3062 + }, + { + "epoch": 1.6820428336079076, + "grad_norm": 0.4768397816729602, + "learning_rate": 4.1221783161605483e-07, + "loss": 0.2369, + "step": 3063 + }, + { + "epoch": 1.6825919824272377, + "grad_norm": 0.5264918258891385, + "learning_rate": 4.119319346413636e-07, + "loss": 0.2602, + "step": 3064 + }, + { + "epoch": 1.6831411312465678, + "grad_norm": 0.6069052234793488, + "learning_rate": 4.11646067403273e-07, + "loss": 0.2749, + "step": 3065 + }, + { + "epoch": 1.683690280065898, + "grad_norm": 0.42850816209249276, + "learning_rate": 4.113602299982527e-07, + "loss": 0.2609, + "step": 3066 + }, + { + "epoch": 1.684239428885228, + "grad_norm": 0.4603559031022033, + "learning_rate": 4.1107442252276225e-07, + "loss": 0.2471, + "step": 3067 + }, + { + "epoch": 1.684788577704558, + "grad_norm": 0.5446735982756209, + "learning_rate": 4.107886450732513e-07, + "loss": 0.2756, + "step": 3068 + }, + { + "epoch": 1.685337726523888, + "grad_norm": 0.5394376541359203, + "learning_rate": 4.1050289774615916e-07, + "loss": 0.2377, + "step": 3069 + }, + { + "epoch": 1.685886875343218, + "grad_norm": 0.45254873948708396, + "learning_rate": 4.102171806379151e-07, + "loss": 0.2346, + "step": 3070 + }, + { + "epoch": 1.686436024162548, + "grad_norm": 0.5343895926456478, + "learning_rate": 4.099314938449379e-07, + "loss": 0.2701, + "step": 3071 + }, + { + "epoch": 1.6869851729818781, + "grad_norm": 0.5108232989065905, + "learning_rate": 4.0964583746363635e-07, + "loss": 0.2409, + "step": 3072 + }, + { + "epoch": 1.6875343218012082, + "grad_norm": 0.5231078379118408, + "learning_rate": 4.0936021159040915e-07, + "loss": 0.2786, + "step": 3073 + }, + { + "epoch": 1.6880834706205383, + "grad_norm": 0.5152354640813975, + "learning_rate": 4.0907461632164447e-07, + "loss": 0.2575, + "step": 3074 + }, + { + "epoch": 1.6886326194398682, + "grad_norm": 0.5472219437014276, + "learning_rate": 4.087890517537202e-07, + "loss": 0.2332, + "step": 3075 + }, + { + "epoch": 1.689181768259198, + "grad_norm": 0.38895894219446736, + "learning_rate": 4.085035179830036e-07, + "loss": 0.2304, + "step": 3076 + }, + { + "epoch": 1.6897309170785282, + "grad_norm": 0.5220282932782652, + "learning_rate": 4.0821801510585205e-07, + "loss": 0.2685, + "step": 3077 + }, + { + "epoch": 1.6902800658978583, + "grad_norm": 0.5260645108427713, + "learning_rate": 4.079325432186122e-07, + "loss": 0.2433, + "step": 3078 + }, + { + "epoch": 1.6908292147171884, + "grad_norm": 0.5034220690315326, + "learning_rate": 4.076471024176202e-07, + "loss": 0.2558, + "step": 3079 + }, + { + "epoch": 1.6913783635365185, + "grad_norm": 0.5551764237867175, + "learning_rate": 4.07361692799202e-07, + "loss": 0.2574, + "step": 3080 + }, + { + "epoch": 1.6919275123558486, + "grad_norm": 0.5404895842846217, + "learning_rate": 4.070763144596729e-07, + "loss": 0.2607, + "step": 3081 + }, + { + "epoch": 1.6924766611751785, + "grad_norm": 0.5640804741252682, + "learning_rate": 4.067909674953373e-07, + "loss": 0.2453, + "step": 3082 + }, + { + "epoch": 1.6930258099945084, + "grad_norm": 0.5786779535412679, + "learning_rate": 4.0650565200248933e-07, + "loss": 0.2513, + "step": 3083 + }, + { + "epoch": 1.6935749588138385, + "grad_norm": 0.4598450893297382, + "learning_rate": 4.062203680774124e-07, + "loss": 0.2418, + "step": 3084 + }, + { + "epoch": 1.6941241076331686, + "grad_norm": 0.44104779650131115, + "learning_rate": 4.059351158163796e-07, + "loss": 0.2484, + "step": 3085 + }, + { + "epoch": 1.6946732564524987, + "grad_norm": 0.47550985987042466, + "learning_rate": 4.056498953156529e-07, + "loss": 0.238, + "step": 3086 + }, + { + "epoch": 1.6952224052718288, + "grad_norm": 0.6137502707603291, + "learning_rate": 4.053647066714837e-07, + "loss": 0.2755, + "step": 3087 + }, + { + "epoch": 1.6957715540911587, + "grad_norm": 0.5543583922579137, + "learning_rate": 4.0507954998011265e-07, + "loss": 0.2988, + "step": 3088 + }, + { + "epoch": 1.6963207029104888, + "grad_norm": 0.5893322682434013, + "learning_rate": 4.0479442533776955e-07, + "loss": 0.2442, + "step": 3089 + }, + { + "epoch": 1.6968698517298186, + "grad_norm": 0.45276982487131684, + "learning_rate": 4.0450933284067366e-07, + "loss": 0.2465, + "step": 3090 + }, + { + "epoch": 1.6974190005491487, + "grad_norm": 0.5278766670600922, + "learning_rate": 4.042242725850331e-07, + "loss": 0.2502, + "step": 3091 + }, + { + "epoch": 1.6979681493684788, + "grad_norm": 0.4758375739314072, + "learning_rate": 4.0393924466704534e-07, + "loss": 0.2442, + "step": 3092 + }, + { + "epoch": 1.698517298187809, + "grad_norm": 0.5885298342925978, + "learning_rate": 4.0365424918289644e-07, + "loss": 0.2368, + "step": 3093 + }, + { + "epoch": 1.699066447007139, + "grad_norm": 0.5152674847901895, + "learning_rate": 4.0336928622876215e-07, + "loss": 0.2194, + "step": 3094 + }, + { + "epoch": 1.699615595826469, + "grad_norm": 0.49793174083045577, + "learning_rate": 4.0308435590080705e-07, + "loss": 0.2816, + "step": 3095 + }, + { + "epoch": 1.700164744645799, + "grad_norm": 0.4300896716494866, + "learning_rate": 4.0279945829518423e-07, + "loss": 0.231, + "step": 3096 + }, + { + "epoch": 1.700713893465129, + "grad_norm": 0.49250786620509346, + "learning_rate": 4.025145935080368e-07, + "loss": 0.258, + "step": 3097 + }, + { + "epoch": 1.701263042284459, + "grad_norm": 0.5053087407780013, + "learning_rate": 4.0222976163549536e-07, + "loss": 0.2708, + "step": 3098 + }, + { + "epoch": 1.7018121911037891, + "grad_norm": 0.7154528004056316, + "learning_rate": 4.0194496277368075e-07, + "loss": 0.2989, + "step": 3099 + }, + { + "epoch": 1.7023613399231192, + "grad_norm": 0.5934695872602223, + "learning_rate": 4.0166019701870184e-07, + "loss": 0.2905, + "step": 3100 + }, + { + "epoch": 1.7029104887424493, + "grad_norm": 0.5199988431031511, + "learning_rate": 4.0137546446665647e-07, + "loss": 0.2362, + "step": 3101 + }, + { + "epoch": 1.7034596375617792, + "grad_norm": 0.4673650758981907, + "learning_rate": 4.010907652136318e-07, + "loss": 0.2466, + "step": 3102 + }, + { + "epoch": 1.7040087863811093, + "grad_norm": 0.44362318001821655, + "learning_rate": 4.008060993557031e-07, + "loss": 0.2822, + "step": 3103 + }, + { + "epoch": 1.7045579352004392, + "grad_norm": 0.4690346733420638, + "learning_rate": 4.005214669889345e-07, + "loss": 0.2859, + "step": 3104 + }, + { + "epoch": 1.7051070840197693, + "grad_norm": 0.5514408605871878, + "learning_rate": 4.0023686820937904e-07, + "loss": 0.2279, + "step": 3105 + }, + { + "epoch": 1.7056562328390994, + "grad_norm": 0.4465984950657351, + "learning_rate": 3.999523031130782e-07, + "loss": 0.2262, + "step": 3106 + }, + { + "epoch": 1.7062053816584295, + "grad_norm": 0.5291215831599918, + "learning_rate": 3.996677717960624e-07, + "loss": 0.2255, + "step": 3107 + }, + { + "epoch": 1.7067545304777596, + "grad_norm": 0.46027985056910864, + "learning_rate": 3.993832743543506e-07, + "loss": 0.2574, + "step": 3108 + }, + { + "epoch": 1.7073036792970895, + "grad_norm": 0.5165453949960084, + "learning_rate": 3.990988108839499e-07, + "loss": 0.2551, + "step": 3109 + }, + { + "epoch": 1.7078528281164196, + "grad_norm": 0.4490921720324647, + "learning_rate": 3.988143814808562e-07, + "loss": 0.2299, + "step": 3110 + }, + { + "epoch": 1.7084019769357495, + "grad_norm": 0.5796011417052417, + "learning_rate": 3.985299862410542e-07, + "loss": 0.307, + "step": 3111 + }, + { + "epoch": 1.7089511257550796, + "grad_norm": 0.5328523502035938, + "learning_rate": 3.9824562526051676e-07, + "loss": 0.2241, + "step": 3112 + }, + { + "epoch": 1.7095002745744097, + "grad_norm": 0.4315466244748324, + "learning_rate": 3.9796129863520525e-07, + "loss": 0.2432, + "step": 3113 + }, + { + "epoch": 1.7100494233937398, + "grad_norm": 0.5330556906878818, + "learning_rate": 3.976770064610694e-07, + "loss": 0.2579, + "step": 3114 + }, + { + "epoch": 1.7105985722130699, + "grad_norm": 0.6080950836886879, + "learning_rate": 3.973927488340471e-07, + "loss": 0.2288, + "step": 3115 + }, + { + "epoch": 1.7111477210323998, + "grad_norm": 0.45139733692259465, + "learning_rate": 3.971085258500652e-07, + "loss": 0.2501, + "step": 3116 + }, + { + "epoch": 1.7116968698517299, + "grad_norm": 0.6968899326731548, + "learning_rate": 3.9682433760503837e-07, + "loss": 0.2212, + "step": 3117 + }, + { + "epoch": 1.7122460186710597, + "grad_norm": 0.5001687952993538, + "learning_rate": 3.965401841948694e-07, + "loss": 0.232, + "step": 3118 + }, + { + "epoch": 1.7127951674903898, + "grad_norm": 0.42450290855908773, + "learning_rate": 3.9625606571545024e-07, + "loss": 0.2794, + "step": 3119 + }, + { + "epoch": 1.71334431630972, + "grad_norm": 0.41209379030662363, + "learning_rate": 3.959719822626597e-07, + "loss": 0.2359, + "step": 3120 + }, + { + "epoch": 1.71389346512905, + "grad_norm": 0.5675986413898608, + "learning_rate": 3.9568793393236584e-07, + "loss": 0.3001, + "step": 3121 + }, + { + "epoch": 1.7144426139483802, + "grad_norm": 0.5105181315278182, + "learning_rate": 3.9540392082042445e-07, + "loss": 0.2616, + "step": 3122 + }, + { + "epoch": 1.71499176276771, + "grad_norm": 0.49757355196998587, + "learning_rate": 3.9511994302267937e-07, + "loss": 0.2756, + "step": 3123 + }, + { + "epoch": 1.7155409115870401, + "grad_norm": 0.4234705397472543, + "learning_rate": 3.948360006349629e-07, + "loss": 0.2279, + "step": 3124 + }, + { + "epoch": 1.71609006040637, + "grad_norm": 0.45087437876275227, + "learning_rate": 3.945520937530951e-07, + "loss": 0.2425, + "step": 3125 + }, + { + "epoch": 1.7166392092257001, + "grad_norm": 0.3960204243058321, + "learning_rate": 3.942682224728839e-07, + "loss": 0.2591, + "step": 3126 + }, + { + "epoch": 1.7171883580450302, + "grad_norm": 0.5467669386972142, + "learning_rate": 3.9398438689012534e-07, + "loss": 0.2301, + "step": 3127 + }, + { + "epoch": 1.7177375068643603, + "grad_norm": 0.5064991548688178, + "learning_rate": 3.937005871006038e-07, + "loss": 0.2691, + "step": 3128 + }, + { + "epoch": 1.7182866556836904, + "grad_norm": 0.6087739473357225, + "learning_rate": 3.9341682320009094e-07, + "loss": 0.2398, + "step": 3129 + }, + { + "epoch": 1.7188358045030203, + "grad_norm": 0.649749375518376, + "learning_rate": 3.9313309528434693e-07, + "loss": 0.3193, + "step": 3130 + }, + { + "epoch": 1.7193849533223502, + "grad_norm": 0.5693126558938569, + "learning_rate": 3.928494034491192e-07, + "loss": 0.3099, + "step": 3131 + }, + { + "epoch": 1.7199341021416803, + "grad_norm": 0.627447786560114, + "learning_rate": 3.925657477901433e-07, + "loss": 0.2403, + "step": 3132 + }, + { + "epoch": 1.7204832509610104, + "grad_norm": 0.5176273730919496, + "learning_rate": 3.922821284031428e-07, + "loss": 0.2338, + "step": 3133 + }, + { + "epoch": 1.7210323997803405, + "grad_norm": 0.42653543726467824, + "learning_rate": 3.919985453838286e-07, + "loss": 0.2325, + "step": 3134 + }, + { + "epoch": 1.7215815485996706, + "grad_norm": 0.5938867839631261, + "learning_rate": 3.917149988278995e-07, + "loss": 0.2314, + "step": 3135 + }, + { + "epoch": 1.7221306974190007, + "grad_norm": 0.46559949079715124, + "learning_rate": 3.9143148883104245e-07, + "loss": 0.2181, + "step": 3136 + }, + { + "epoch": 1.7226798462383306, + "grad_norm": 0.5826948530045537, + "learning_rate": 3.911480154889308e-07, + "loss": 0.2622, + "step": 3137 + }, + { + "epoch": 1.7232289950576605, + "grad_norm": 0.4790098693673044, + "learning_rate": 3.9086457889722714e-07, + "loss": 0.2267, + "step": 3138 + }, + { + "epoch": 1.7237781438769906, + "grad_norm": 0.5398186957105922, + "learning_rate": 3.9058117915158045e-07, + "loss": 0.2485, + "step": 3139 + }, + { + "epoch": 1.7243272926963207, + "grad_norm": 0.48776148280007303, + "learning_rate": 3.902978163476278e-07, + "loss": 0.2418, + "step": 3140 + }, + { + "epoch": 1.7248764415156508, + "grad_norm": 0.44243516779979836, + "learning_rate": 3.900144905809939e-07, + "loss": 0.2854, + "step": 3141 + }, + { + "epoch": 1.7254255903349809, + "grad_norm": 0.43468136136378044, + "learning_rate": 3.8973120194729047e-07, + "loss": 0.2516, + "step": 3142 + }, + { + "epoch": 1.7259747391543108, + "grad_norm": 0.5795905403945281, + "learning_rate": 3.8944795054211714e-07, + "loss": 0.2697, + "step": 3143 + }, + { + "epoch": 1.7265238879736409, + "grad_norm": 0.501383559017645, + "learning_rate": 3.8916473646106073e-07, + "loss": 0.2557, + "step": 3144 + }, + { + "epoch": 1.7270730367929708, + "grad_norm": 0.5969208261131543, + "learning_rate": 3.888815597996956e-07, + "loss": 0.2819, + "step": 3145 + }, + { + "epoch": 1.7276221856123009, + "grad_norm": 0.6574309701900565, + "learning_rate": 3.8859842065358344e-07, + "loss": 0.2502, + "step": 3146 + }, + { + "epoch": 1.728171334431631, + "grad_norm": 0.47809567820512366, + "learning_rate": 3.8831531911827347e-07, + "loss": 0.2136, + "step": 3147 + }, + { + "epoch": 1.728720483250961, + "grad_norm": 0.4990969370217534, + "learning_rate": 3.8803225528930166e-07, + "loss": 0.2463, + "step": 3148 + }, + { + "epoch": 1.7292696320702912, + "grad_norm": 0.5446776872440375, + "learning_rate": 3.877492292621918e-07, + "loss": 0.266, + "step": 3149 + }, + { + "epoch": 1.729818780889621, + "grad_norm": 0.6854649798885917, + "learning_rate": 3.8746624113245487e-07, + "loss": 0.2274, + "step": 3150 + }, + { + "epoch": 1.7303679297089511, + "grad_norm": 0.4490608893339381, + "learning_rate": 3.871832909955888e-07, + "loss": 0.2257, + "step": 3151 + }, + { + "epoch": 1.730917078528281, + "grad_norm": 0.4650256158242345, + "learning_rate": 3.8690037894707897e-07, + "loss": 0.264, + "step": 3152 + }, + { + "epoch": 1.7314662273476111, + "grad_norm": 0.4402758653056426, + "learning_rate": 3.866175050823975e-07, + "loss": 0.2323, + "step": 3153 + }, + { + "epoch": 1.7320153761669412, + "grad_norm": 0.452233599288595, + "learning_rate": 3.863346694970041e-07, + "loss": 0.239, + "step": 3154 + }, + { + "epoch": 1.7325645249862713, + "grad_norm": 0.4767185355255673, + "learning_rate": 3.8605187228634537e-07, + "loss": 0.2672, + "step": 3155 + }, + { + "epoch": 1.7331136738056014, + "grad_norm": 0.4199317265503855, + "learning_rate": 3.857691135458549e-07, + "loss": 0.2456, + "step": 3156 + }, + { + "epoch": 1.7336628226249313, + "grad_norm": 0.5390364099938132, + "learning_rate": 3.854863933709533e-07, + "loss": 0.2227, + "step": 3157 + }, + { + "epoch": 1.7342119714442614, + "grad_norm": 0.48385503439722066, + "learning_rate": 3.852037118570484e-07, + "loss": 0.2242, + "step": 3158 + }, + { + "epoch": 1.7347611202635913, + "grad_norm": 0.4468525853930528, + "learning_rate": 3.849210690995346e-07, + "loss": 0.2603, + "step": 3159 + }, + { + "epoch": 1.7353102690829214, + "grad_norm": 0.506347106145234, + "learning_rate": 3.846384651937935e-07, + "loss": 0.2465, + "step": 3160 + }, + { + "epoch": 1.7358594179022515, + "grad_norm": 0.47819259152248483, + "learning_rate": 3.843559002351935e-07, + "loss": 0.2278, + "step": 3161 + }, + { + "epoch": 1.7364085667215816, + "grad_norm": 0.5426082721839471, + "learning_rate": 3.840733743190897e-07, + "loss": 0.247, + "step": 3162 + }, + { + "epoch": 1.7369577155409117, + "grad_norm": 0.47794256238335486, + "learning_rate": 3.837908875408246e-07, + "loss": 0.2339, + "step": 3163 + }, + { + "epoch": 1.7375068643602416, + "grad_norm": 0.5178755229089875, + "learning_rate": 3.835084399957267e-07, + "loss": 0.2361, + "step": 3164 + }, + { + "epoch": 1.7380560131795717, + "grad_norm": 0.6085767036546321, + "learning_rate": 3.832260317791118e-07, + "loss": 0.2676, + "step": 3165 + }, + { + "epoch": 1.7386051619989016, + "grad_norm": 0.59198103852928, + "learning_rate": 3.8294366298628205e-07, + "loss": 0.2874, + "step": 3166 + }, + { + "epoch": 1.7391543108182317, + "grad_norm": 0.5707234253565714, + "learning_rate": 3.8266133371252685e-07, + "loss": 0.2579, + "step": 3167 + }, + { + "epoch": 1.7397034596375618, + "grad_norm": 0.40381937534860596, + "learning_rate": 3.8237904405312176e-07, + "loss": 0.2159, + "step": 3168 + }, + { + "epoch": 1.740252608456892, + "grad_norm": 0.48095654262877086, + "learning_rate": 3.820967941033293e-07, + "loss": 0.2842, + "step": 3169 + }, + { + "epoch": 1.740801757276222, + "grad_norm": 0.9403320984223763, + "learning_rate": 3.8181458395839814e-07, + "loss": 0.4686, + "step": 3170 + }, + { + "epoch": 1.7413509060955519, + "grad_norm": 0.5686372139645117, + "learning_rate": 3.8153241371356387e-07, + "loss": 0.2682, + "step": 3171 + }, + { + "epoch": 1.741900054914882, + "grad_norm": 0.5594098645763892, + "learning_rate": 3.8125028346404877e-07, + "loss": 0.2843, + "step": 3172 + }, + { + "epoch": 1.7424492037342119, + "grad_norm": 0.4907330043285017, + "learning_rate": 3.809681933050612e-07, + "loss": 0.2389, + "step": 3173 + }, + { + "epoch": 1.742998352553542, + "grad_norm": 0.5919787133788562, + "learning_rate": 3.806861433317964e-07, + "loss": 0.2304, + "step": 3174 + }, + { + "epoch": 1.743547501372872, + "grad_norm": 0.45987727566740183, + "learning_rate": 3.8040413363943566e-07, + "loss": 0.2383, + "step": 3175 + }, + { + "epoch": 1.7440966501922022, + "grad_norm": 0.4470820653165698, + "learning_rate": 3.801221643231467e-07, + "loss": 0.2254, + "step": 3176 + }, + { + "epoch": 1.7446457990115323, + "grad_norm": 0.5258470182980035, + "learning_rate": 3.7984023547808413e-07, + "loss": 0.2005, + "step": 3177 + }, + { + "epoch": 1.7451949478308622, + "grad_norm": 0.47036590120171595, + "learning_rate": 3.7955834719938846e-07, + "loss": 0.2691, + "step": 3178 + }, + { + "epoch": 1.7457440966501923, + "grad_norm": 0.5159167021826463, + "learning_rate": 3.792764995821864e-07, + "loss": 0.2325, + "step": 3179 + }, + { + "epoch": 1.7462932454695221, + "grad_norm": 0.5269752323244645, + "learning_rate": 3.789946927215915e-07, + "loss": 0.2448, + "step": 3180 + }, + { + "epoch": 1.7468423942888522, + "grad_norm": 0.4794105661476877, + "learning_rate": 3.787129267127029e-07, + "loss": 0.2883, + "step": 3181 + }, + { + "epoch": 1.7473915431081823, + "grad_norm": 0.44920477675043563, + "learning_rate": 3.7843120165060627e-07, + "loss": 0.2428, + "step": 3182 + }, + { + "epoch": 1.7479406919275124, + "grad_norm": 0.6126053014623515, + "learning_rate": 3.781495176303734e-07, + "loss": 0.2475, + "step": 3183 + }, + { + "epoch": 1.7484898407468425, + "grad_norm": 0.578415721406623, + "learning_rate": 3.778678747470625e-07, + "loss": 0.2313, + "step": 3184 + }, + { + "epoch": 1.7490389895661724, + "grad_norm": 0.4291465808488704, + "learning_rate": 3.775862730957176e-07, + "loss": 0.2274, + "step": 3185 + }, + { + "epoch": 1.7495881383855023, + "grad_norm": 0.4346473547603644, + "learning_rate": 3.7730471277136873e-07, + "loss": 0.2413, + "step": 3186 + }, + { + "epoch": 1.7501372872048324, + "grad_norm": 0.44539489679557687, + "learning_rate": 3.7702319386903226e-07, + "loss": 0.2572, + "step": 3187 + }, + { + "epoch": 1.7506864360241625, + "grad_norm": 0.5458412759554239, + "learning_rate": 3.767417164837102e-07, + "loss": 0.2389, + "step": 3188 + }, + { + "epoch": 1.7512355848434926, + "grad_norm": 0.5770531557851083, + "learning_rate": 3.7646028071039116e-07, + "loss": 0.219, + "step": 3189 + }, + { + "epoch": 1.7517847336628227, + "grad_norm": 0.46108234758931604, + "learning_rate": 3.7617888664404913e-07, + "loss": 0.2703, + "step": 3190 + }, + { + "epoch": 1.7523338824821528, + "grad_norm": 0.4912210098982382, + "learning_rate": 3.7589753437964443e-07, + "loss": 0.3073, + "step": 3191 + }, + { + "epoch": 1.7528830313014827, + "grad_norm": 0.7505786550244599, + "learning_rate": 3.7561622401212283e-07, + "loss": 0.2649, + "step": 3192 + }, + { + "epoch": 1.7534321801208126, + "grad_norm": 0.41835237808575687, + "learning_rate": 3.753349556364162e-07, + "loss": 0.2188, + "step": 3193 + }, + { + "epoch": 1.7539813289401427, + "grad_norm": 0.462376610471587, + "learning_rate": 3.7505372934744263e-07, + "loss": 0.2526, + "step": 3194 + }, + { + "epoch": 1.7545304777594728, + "grad_norm": 0.6157864425204377, + "learning_rate": 3.7477254524010523e-07, + "loss": 0.2877, + "step": 3195 + }, + { + "epoch": 1.755079626578803, + "grad_norm": 0.37951275263709683, + "learning_rate": 3.744914034092936e-07, + "loss": 0.257, + "step": 3196 + }, + { + "epoch": 1.755628775398133, + "grad_norm": 0.49268215522244563, + "learning_rate": 3.742103039498823e-07, + "loss": 0.2472, + "step": 3197 + }, + { + "epoch": 1.7561779242174629, + "grad_norm": 0.48769839553429445, + "learning_rate": 3.739292469567325e-07, + "loss": 0.2508, + "step": 3198 + }, + { + "epoch": 1.756727073036793, + "grad_norm": 0.4251522379520149, + "learning_rate": 3.7364823252469033e-07, + "loss": 0.2451, + "step": 3199 + }, + { + "epoch": 1.7572762218561229, + "grad_norm": 0.5453702661324836, + "learning_rate": 3.733672607485879e-07, + "loss": 0.2479, + "step": 3200 + }, + { + "epoch": 1.7572762218561229, + "eval_loss": 0.32651084661483765, + "eval_runtime": 18.6705, + "eval_samples_per_second": 23.727, + "eval_steps_per_second": 1.018, + "step": 3200 + }, + { + "epoch": 1.757825370675453, + "grad_norm": 0.4903260712000596, + "learning_rate": 3.7308633172324283e-07, + "loss": 0.2282, + "step": 3201 + }, + { + "epoch": 1.758374519494783, + "grad_norm": 0.4486805009652084, + "learning_rate": 3.7280544554345846e-07, + "loss": 0.2306, + "step": 3202 + }, + { + "epoch": 1.7589236683141132, + "grad_norm": 0.5809799612838047, + "learning_rate": 3.725246023040232e-07, + "loss": 0.2739, + "step": 3203 + }, + { + "epoch": 1.7594728171334433, + "grad_norm": 0.6060920429058457, + "learning_rate": 3.7224380209971153e-07, + "loss": 0.2624, + "step": 3204 + }, + { + "epoch": 1.7600219659527732, + "grad_norm": 0.5634604983694165, + "learning_rate": 3.7196304502528297e-07, + "loss": 0.2484, + "step": 3205 + }, + { + "epoch": 1.7605711147721033, + "grad_norm": 0.5038282148433267, + "learning_rate": 3.71682331175483e-07, + "loss": 0.3132, + "step": 3206 + }, + { + "epoch": 1.7611202635914331, + "grad_norm": 0.6483740917769496, + "learning_rate": 3.7140166064504205e-07, + "loss": 0.2869, + "step": 3207 + }, + { + "epoch": 1.7616694124107632, + "grad_norm": 0.5641040189469726, + "learning_rate": 3.71121033528676e-07, + "loss": 0.2613, + "step": 3208 + }, + { + "epoch": 1.7622185612300933, + "grad_norm": 0.7041012804385147, + "learning_rate": 3.708404499210862e-07, + "loss": 0.2822, + "step": 3209 + }, + { + "epoch": 1.7627677100494235, + "grad_norm": 1.0419248578923135, + "learning_rate": 3.7055990991695916e-07, + "loss": 0.2868, + "step": 3210 + }, + { + "epoch": 1.7633168588687536, + "grad_norm": 0.46563562491042026, + "learning_rate": 3.702794136109672e-07, + "loss": 0.2698, + "step": 3211 + }, + { + "epoch": 1.7638660076880834, + "grad_norm": 0.4853378410871895, + "learning_rate": 3.6999896109776713e-07, + "loss": 0.2616, + "step": 3212 + }, + { + "epoch": 1.7644151565074135, + "grad_norm": 0.4546893316616232, + "learning_rate": 3.697185524720016e-07, + "loss": 0.2428, + "step": 3213 + }, + { + "epoch": 1.7649643053267434, + "grad_norm": 0.5519007941147956, + "learning_rate": 3.694381878282978e-07, + "loss": 0.2403, + "step": 3214 + }, + { + "epoch": 1.7655134541460735, + "grad_norm": 0.4528682823390998, + "learning_rate": 3.691578672612688e-07, + "loss": 0.265, + "step": 3215 + }, + { + "epoch": 1.7660626029654036, + "grad_norm": 0.4953911294349023, + "learning_rate": 3.6887759086551235e-07, + "loss": 0.2646, + "step": 3216 + }, + { + "epoch": 1.7666117517847337, + "grad_norm": 0.4707344059943829, + "learning_rate": 3.685973587356114e-07, + "loss": 0.228, + "step": 3217 + }, + { + "epoch": 1.7671609006040638, + "grad_norm": 0.5598948318476332, + "learning_rate": 3.6831717096613426e-07, + "loss": 0.281, + "step": 3218 + }, + { + "epoch": 1.7677100494233937, + "grad_norm": 0.5801101030852658, + "learning_rate": 3.6803702765163337e-07, + "loss": 0.2487, + "step": 3219 + }, + { + "epoch": 1.7682591982427238, + "grad_norm": 0.4717302770850839, + "learning_rate": 3.6775692888664723e-07, + "loss": 0.2389, + "step": 3220 + }, + { + "epoch": 1.7688083470620537, + "grad_norm": 0.446720715673289, + "learning_rate": 3.6747687476569883e-07, + "loss": 0.2276, + "step": 3221 + }, + { + "epoch": 1.7693574958813838, + "grad_norm": 0.4158763452649629, + "learning_rate": 3.671968653832959e-07, + "loss": 0.2236, + "step": 3222 + }, + { + "epoch": 1.769906644700714, + "grad_norm": 0.4693401163218621, + "learning_rate": 3.669169008339315e-07, + "loss": 0.2094, + "step": 3223 + }, + { + "epoch": 1.770455793520044, + "grad_norm": 0.48881947504972323, + "learning_rate": 3.6663698121208335e-07, + "loss": 0.2659, + "step": 3224 + }, + { + "epoch": 1.771004942339374, + "grad_norm": 0.48686041646318634, + "learning_rate": 3.663571066122139e-07, + "loss": 0.2526, + "step": 3225 + }, + { + "epoch": 1.771554091158704, + "grad_norm": 0.545055480574814, + "learning_rate": 3.660772771287706e-07, + "loss": 0.2464, + "step": 3226 + }, + { + "epoch": 1.772103239978034, + "grad_norm": 0.4794289708321269, + "learning_rate": 3.6579749285618526e-07, + "loss": 0.2081, + "step": 3227 + }, + { + "epoch": 1.772652388797364, + "grad_norm": 0.507152922032444, + "learning_rate": 3.655177538888753e-07, + "loss": 0.2634, + "step": 3228 + }, + { + "epoch": 1.773201537616694, + "grad_norm": 0.514742983722494, + "learning_rate": 3.652380603212422e-07, + "loss": 0.2687, + "step": 3229 + }, + { + "epoch": 1.7737506864360242, + "grad_norm": 0.5844575171720781, + "learning_rate": 3.6495841224767187e-07, + "loss": 0.3007, + "step": 3230 + }, + { + "epoch": 1.7742998352553543, + "grad_norm": 0.5095379695529898, + "learning_rate": 3.6467880976253546e-07, + "loss": 0.24, + "step": 3231 + }, + { + "epoch": 1.7748489840746844, + "grad_norm": 0.4665299076642937, + "learning_rate": 3.6439925296018855e-07, + "loss": 0.2715, + "step": 3232 + }, + { + "epoch": 1.7753981328940143, + "grad_norm": 0.564394016760433, + "learning_rate": 3.6411974193497124e-07, + "loss": 0.2711, + "step": 3233 + }, + { + "epoch": 1.7759472817133442, + "grad_norm": 0.5983560688762918, + "learning_rate": 3.638402767812081e-07, + "loss": 0.2964, + "step": 3234 + }, + { + "epoch": 1.7764964305326743, + "grad_norm": 0.5658658730567936, + "learning_rate": 3.635608575932087e-07, + "loss": 0.2441, + "step": 3235 + }, + { + "epoch": 1.7770455793520044, + "grad_norm": 0.5308114066814922, + "learning_rate": 3.6328148446526614e-07, + "loss": 0.2381, + "step": 3236 + }, + { + "epoch": 1.7775947281713345, + "grad_norm": 0.5556881448340648, + "learning_rate": 3.6300215749165895e-07, + "loss": 0.2229, + "step": 3237 + }, + { + "epoch": 1.7781438769906646, + "grad_norm": 0.4337866717149387, + "learning_rate": 3.627228767666496e-07, + "loss": 0.2351, + "step": 3238 + }, + { + "epoch": 1.7786930258099947, + "grad_norm": 0.5481610194304144, + "learning_rate": 3.624436423844849e-07, + "loss": 0.2915, + "step": 3239 + }, + { + "epoch": 1.7792421746293245, + "grad_norm": 0.512099984685357, + "learning_rate": 3.621644544393966e-07, + "loss": 0.2603, + "step": 3240 + }, + { + "epoch": 1.7797913234486544, + "grad_norm": 0.5051440651140077, + "learning_rate": 3.6188531302559984e-07, + "loss": 0.2201, + "step": 3241 + }, + { + "epoch": 1.7803404722679845, + "grad_norm": 0.47662284382696746, + "learning_rate": 3.6160621823729476e-07, + "loss": 0.2821, + "step": 3242 + }, + { + "epoch": 1.7808896210873146, + "grad_norm": 0.3649006325811564, + "learning_rate": 3.6132717016866567e-07, + "loss": 0.2186, + "step": 3243 + }, + { + "epoch": 1.7814387699066447, + "grad_norm": 0.5482396022855541, + "learning_rate": 3.6104816891388073e-07, + "loss": 0.2222, + "step": 3244 + }, + { + "epoch": 1.7819879187259748, + "grad_norm": 0.511544758700288, + "learning_rate": 3.60769214567093e-07, + "loss": 0.2574, + "step": 3245 + }, + { + "epoch": 1.782537067545305, + "grad_norm": 0.5246072658336789, + "learning_rate": 3.604903072224391e-07, + "loss": 0.2773, + "step": 3246 + }, + { + "epoch": 1.7830862163646348, + "grad_norm": 0.5869311556169076, + "learning_rate": 3.602114469740399e-07, + "loss": 0.2609, + "step": 3247 + }, + { + "epoch": 1.7836353651839647, + "grad_norm": 0.467989409545172, + "learning_rate": 3.5993263391600037e-07, + "loss": 0.2199, + "step": 3248 + }, + { + "epoch": 1.7841845140032948, + "grad_norm": 0.5192217759404012, + "learning_rate": 3.5965386814240987e-07, + "loss": 0.2379, + "step": 3249 + }, + { + "epoch": 1.784733662822625, + "grad_norm": 0.5701503659754419, + "learning_rate": 3.593751497473416e-07, + "loss": 0.259, + "step": 3250 + }, + { + "epoch": 1.785282811641955, + "grad_norm": 0.5643240402143217, + "learning_rate": 3.5909647882485266e-07, + "loss": 0.2308, + "step": 3251 + }, + { + "epoch": 1.7858319604612851, + "grad_norm": 0.5328041780015402, + "learning_rate": 3.588178554689842e-07, + "loss": 0.2214, + "step": 3252 + }, + { + "epoch": 1.786381109280615, + "grad_norm": 0.5426625576103913, + "learning_rate": 3.585392797737611e-07, + "loss": 0.2166, + "step": 3253 + }, + { + "epoch": 1.786930258099945, + "grad_norm": 0.4940794330177362, + "learning_rate": 3.5826075183319286e-07, + "loss": 0.2693, + "step": 3254 + }, + { + "epoch": 1.787479406919275, + "grad_norm": 0.48448048737137134, + "learning_rate": 3.579822717412722e-07, + "loss": 0.278, + "step": 3255 + }, + { + "epoch": 1.788028555738605, + "grad_norm": 0.5043826849912504, + "learning_rate": 3.5770383959197575e-07, + "loss": 0.2633, + "step": 3256 + }, + { + "epoch": 1.7885777045579352, + "grad_norm": 0.5112786403034757, + "learning_rate": 3.574254554792645e-07, + "loss": 0.2284, + "step": 3257 + }, + { + "epoch": 1.7891268533772653, + "grad_norm": 0.4758405516370161, + "learning_rate": 3.5714711949708226e-07, + "loss": 0.2379, + "step": 3258 + }, + { + "epoch": 1.7896760021965954, + "grad_norm": 0.899476067865799, + "learning_rate": 3.5686883173935763e-07, + "loss": 0.3947, + "step": 3259 + }, + { + "epoch": 1.7902251510159253, + "grad_norm": 0.5641049914931646, + "learning_rate": 3.565905923000022e-07, + "loss": 0.2643, + "step": 3260 + }, + { + "epoch": 1.7907742998352554, + "grad_norm": 0.7342891116262842, + "learning_rate": 3.563124012729116e-07, + "loss": 0.2914, + "step": 3261 + }, + { + "epoch": 1.7913234486545853, + "grad_norm": 0.512582626025038, + "learning_rate": 3.5603425875196534e-07, + "loss": 0.2101, + "step": 3262 + }, + { + "epoch": 1.7918725974739154, + "grad_norm": 0.58488868171998, + "learning_rate": 3.557561648310259e-07, + "loss": 0.2982, + "step": 3263 + }, + { + "epoch": 1.7924217462932455, + "grad_norm": 0.7268295553867393, + "learning_rate": 3.5547811960393985e-07, + "loss": 0.2613, + "step": 3264 + }, + { + "epoch": 1.7929708951125756, + "grad_norm": 0.5802114773102711, + "learning_rate": 3.5520012316453713e-07, + "loss": 0.2425, + "step": 3265 + }, + { + "epoch": 1.7935200439319057, + "grad_norm": 0.5748311928173353, + "learning_rate": 3.549221756066315e-07, + "loss": 0.247, + "step": 3266 + }, + { + "epoch": 1.7940691927512356, + "grad_norm": 0.47113767207193835, + "learning_rate": 3.5464427702401996e-07, + "loss": 0.2442, + "step": 3267 + }, + { + "epoch": 1.7946183415705657, + "grad_norm": 0.40046543758087666, + "learning_rate": 3.54366427510483e-07, + "loss": 0.2387, + "step": 3268 + }, + { + "epoch": 1.7951674903898955, + "grad_norm": 0.5703506825836538, + "learning_rate": 3.5408862715978447e-07, + "loss": 0.2606, + "step": 3269 + }, + { + "epoch": 1.7957166392092256, + "grad_norm": 0.7577000751785343, + "learning_rate": 3.5381087606567186e-07, + "loss": 0.2624, + "step": 3270 + }, + { + "epoch": 1.7962657880285557, + "grad_norm": 0.4937723803464274, + "learning_rate": 3.5353317432187606e-07, + "loss": 0.2512, + "step": 3271 + }, + { + "epoch": 1.7968149368478858, + "grad_norm": 0.5193242962032641, + "learning_rate": 3.53255522022111e-07, + "loss": 0.2516, + "step": 3272 + }, + { + "epoch": 1.797364085667216, + "grad_norm": 0.4701291006188349, + "learning_rate": 3.529779192600743e-07, + "loss": 0.2672, + "step": 3273 + }, + { + "epoch": 1.7979132344865458, + "grad_norm": 0.422034434328806, + "learning_rate": 3.527003661294464e-07, + "loss": 0.2642, + "step": 3274 + }, + { + "epoch": 1.798462383305876, + "grad_norm": 0.4939452338842813, + "learning_rate": 3.524228627238913e-07, + "loss": 0.2201, + "step": 3275 + }, + { + "epoch": 1.7990115321252058, + "grad_norm": 0.5613767994813553, + "learning_rate": 3.5214540913705635e-07, + "loss": 0.2228, + "step": 3276 + }, + { + "epoch": 1.799560680944536, + "grad_norm": 0.562746067724081, + "learning_rate": 3.5186800546257184e-07, + "loss": 0.2581, + "step": 3277 + }, + { + "epoch": 1.800109829763866, + "grad_norm": 0.43113111352859795, + "learning_rate": 3.5159065179405116e-07, + "loss": 0.215, + "step": 3278 + }, + { + "epoch": 1.8006589785831961, + "grad_norm": 0.566018725884266, + "learning_rate": 3.5131334822509134e-07, + "loss": 0.2443, + "step": 3279 + }, + { + "epoch": 1.8012081274025262, + "grad_norm": 0.5677231417463248, + "learning_rate": 3.510360948492716e-07, + "loss": 0.2718, + "step": 3280 + }, + { + "epoch": 1.801757276221856, + "grad_norm": 0.4536306527881557, + "learning_rate": 3.507588917601551e-07, + "loss": 0.2459, + "step": 3281 + }, + { + "epoch": 1.8023064250411862, + "grad_norm": 0.6184050401266717, + "learning_rate": 3.504817390512875e-07, + "loss": 0.2357, + "step": 3282 + }, + { + "epoch": 1.802855573860516, + "grad_norm": 2.1524517582018334, + "learning_rate": 3.502046368161977e-07, + "loss": 0.246, + "step": 3283 + }, + { + "epoch": 1.8034047226798462, + "grad_norm": 0.44682765231633387, + "learning_rate": 3.4992758514839767e-07, + "loss": 0.2236, + "step": 3284 + }, + { + "epoch": 1.8039538714991763, + "grad_norm": 0.5462948191383195, + "learning_rate": 3.496505841413818e-07, + "loss": 0.2359, + "step": 3285 + }, + { + "epoch": 1.8045030203185064, + "grad_norm": 0.4010125054167588, + "learning_rate": 3.4937363388862783e-07, + "loss": 0.2292, + "step": 3286 + }, + { + "epoch": 1.8050521691378365, + "grad_norm": 0.54869646098954, + "learning_rate": 3.4909673448359624e-07, + "loss": 0.2776, + "step": 3287 + }, + { + "epoch": 1.8056013179571664, + "grad_norm": 0.6495084283304535, + "learning_rate": 3.4881988601973055e-07, + "loss": 0.2488, + "step": 3288 + }, + { + "epoch": 1.8061504667764963, + "grad_norm": 0.5258360983492615, + "learning_rate": 3.485430885904569e-07, + "loss": 0.2687, + "step": 3289 + }, + { + "epoch": 1.8066996155958264, + "grad_norm": 0.46800944299671204, + "learning_rate": 3.4826634228918414e-07, + "loss": 0.245, + "step": 3290 + }, + { + "epoch": 1.8072487644151565, + "grad_norm": 0.4299142812589447, + "learning_rate": 3.4798964720930393e-07, + "loss": 0.2449, + "step": 3291 + }, + { + "epoch": 1.8077979132344866, + "grad_norm": 1.2017761612083697, + "learning_rate": 3.477130034441906e-07, + "loss": 0.4559, + "step": 3292 + }, + { + "epoch": 1.8083470620538167, + "grad_norm": 0.5119317605734569, + "learning_rate": 3.4743641108720135e-07, + "loss": 0.233, + "step": 3293 + }, + { + "epoch": 1.8088962108731468, + "grad_norm": 0.5050380547877428, + "learning_rate": 3.471598702316759e-07, + "loss": 0.2697, + "step": 3294 + }, + { + "epoch": 1.8094453596924767, + "grad_norm": 0.5954131961798498, + "learning_rate": 3.468833809709368e-07, + "loss": 0.238, + "step": 3295 + }, + { + "epoch": 1.8099945085118065, + "grad_norm": 0.5516829060282231, + "learning_rate": 3.466069433982884e-07, + "loss": 0.2457, + "step": 3296 + }, + { + "epoch": 1.8105436573311366, + "grad_norm": 0.45441878738749614, + "learning_rate": 3.463305576070188e-07, + "loss": 0.2516, + "step": 3297 + }, + { + "epoch": 1.8110928061504667, + "grad_norm": 0.5462777838903079, + "learning_rate": 3.460542236903977e-07, + "loss": 0.2525, + "step": 3298 + }, + { + "epoch": 1.8116419549697969, + "grad_norm": 0.47309562175546327, + "learning_rate": 3.457779417416776e-07, + "loss": 0.2307, + "step": 3299 + }, + { + "epoch": 1.812191103789127, + "grad_norm": 0.4240047692737509, + "learning_rate": 3.455017118540938e-07, + "loss": 0.2505, + "step": 3300 + }, + { + "epoch": 1.8127402526084568, + "grad_norm": 0.5762786980741748, + "learning_rate": 3.4522553412086353e-07, + "loss": 0.285, + "step": 3301 + }, + { + "epoch": 1.813289401427787, + "grad_norm": 0.5259079217713657, + "learning_rate": 3.4494940863518646e-07, + "loss": 0.2522, + "step": 3302 + }, + { + "epoch": 1.8138385502471168, + "grad_norm": 0.5426784610316483, + "learning_rate": 3.446733354902448e-07, + "loss": 0.2899, + "step": 3303 + }, + { + "epoch": 1.814387699066447, + "grad_norm": 0.4451335710910681, + "learning_rate": 3.443973147792031e-07, + "loss": 0.2415, + "step": 3304 + }, + { + "epoch": 1.814936847885777, + "grad_norm": 0.4617689131159005, + "learning_rate": 3.441213465952084e-07, + "loss": 0.2791, + "step": 3305 + }, + { + "epoch": 1.8154859967051071, + "grad_norm": 0.5168043908442046, + "learning_rate": 3.438454310313896e-07, + "loss": 0.2387, + "step": 3306 + }, + { + "epoch": 1.8160351455244372, + "grad_norm": 0.6146801816851613, + "learning_rate": 3.43569568180858e-07, + "loss": 0.2959, + "step": 3307 + }, + { + "epoch": 1.8165842943437671, + "grad_norm": 0.6069504528694258, + "learning_rate": 3.432937581367073e-07, + "loss": 0.2476, + "step": 3308 + }, + { + "epoch": 1.8171334431630972, + "grad_norm": 0.48388766222356844, + "learning_rate": 3.430180009920129e-07, + "loss": 0.2613, + "step": 3309 + }, + { + "epoch": 1.817682591982427, + "grad_norm": 0.48494068353860936, + "learning_rate": 3.4274229683983304e-07, + "loss": 0.2912, + "step": 3310 + }, + { + "epoch": 1.8182317408017572, + "grad_norm": 0.5403158036794079, + "learning_rate": 3.4246664577320765e-07, + "loss": 0.2287, + "step": 3311 + }, + { + "epoch": 1.8187808896210873, + "grad_norm": 0.5326817938968692, + "learning_rate": 3.421910478851588e-07, + "loss": 0.2312, + "step": 3312 + }, + { + "epoch": 1.8193300384404174, + "grad_norm": 0.4645358000272507, + "learning_rate": 3.4191550326869036e-07, + "loss": 0.2544, + "step": 3313 + }, + { + "epoch": 1.8198791872597475, + "grad_norm": 0.45602159824573074, + "learning_rate": 3.4164001201678875e-07, + "loss": 0.2687, + "step": 3314 + }, + { + "epoch": 1.8204283360790774, + "grad_norm": 0.506816176667018, + "learning_rate": 3.413645742224221e-07, + "loss": 0.2716, + "step": 3315 + }, + { + "epoch": 1.8209774848984075, + "grad_norm": 0.5293502363415263, + "learning_rate": 3.4108918997854033e-07, + "loss": 0.2548, + "step": 3316 + }, + { + "epoch": 1.8215266337177374, + "grad_norm": 0.5408978460323642, + "learning_rate": 3.40813859378076e-07, + "loss": 0.2634, + "step": 3317 + }, + { + "epoch": 1.8220757825370675, + "grad_norm": 0.5542902630733466, + "learning_rate": 3.405385825139424e-07, + "loss": 0.2401, + "step": 3318 + }, + { + "epoch": 1.8226249313563976, + "grad_norm": 0.5583277538987553, + "learning_rate": 3.402633594790357e-07, + "loss": 0.2298, + "step": 3319 + }, + { + "epoch": 1.8231740801757277, + "grad_norm": 0.4737070848412397, + "learning_rate": 3.3998819036623334e-07, + "loss": 0.278, + "step": 3320 + }, + { + "epoch": 1.8237232289950578, + "grad_norm": 0.5411237796452439, + "learning_rate": 3.397130752683948e-07, + "loss": 0.2631, + "step": 3321 + }, + { + "epoch": 1.8242723778143877, + "grad_norm": 0.4800744141929021, + "learning_rate": 3.3943801427836147e-07, + "loss": 0.2619, + "step": 3322 + }, + { + "epoch": 1.8248215266337178, + "grad_norm": 0.5408293775671056, + "learning_rate": 3.3916300748895615e-07, + "loss": 0.2472, + "step": 3323 + }, + { + "epoch": 1.8253706754530477, + "grad_norm": 0.537518396279777, + "learning_rate": 3.388880549929836e-07, + "loss": 0.2422, + "step": 3324 + }, + { + "epoch": 1.8259198242723778, + "grad_norm": 0.5445276380746559, + "learning_rate": 3.3861315688322995e-07, + "loss": 0.2627, + "step": 3325 + }, + { + "epoch": 1.8264689730917079, + "grad_norm": 0.6465367905144461, + "learning_rate": 3.3833831325246327e-07, + "loss": 0.2387, + "step": 3326 + }, + { + "epoch": 1.827018121911038, + "grad_norm": 0.6305545886703103, + "learning_rate": 3.3806352419343334e-07, + "loss": 0.2545, + "step": 3327 + }, + { + "epoch": 1.827567270730368, + "grad_norm": 0.4842206344722125, + "learning_rate": 3.3778878979887125e-07, + "loss": 0.2063, + "step": 3328 + }, + { + "epoch": 1.828116419549698, + "grad_norm": 0.5541104374469734, + "learning_rate": 3.3751411016148963e-07, + "loss": 0.2574, + "step": 3329 + }, + { + "epoch": 1.828665568369028, + "grad_norm": 0.43378948519634647, + "learning_rate": 3.372394853739827e-07, + "loss": 0.2492, + "step": 3330 + }, + { + "epoch": 1.829214717188358, + "grad_norm": 0.40222017261294285, + "learning_rate": 3.3696491552902635e-07, + "loss": 0.2522, + "step": 3331 + }, + { + "epoch": 1.829763866007688, + "grad_norm": 0.5094872750381726, + "learning_rate": 3.3669040071927783e-07, + "loss": 0.2362, + "step": 3332 + }, + { + "epoch": 1.8303130148270181, + "grad_norm": 0.5265832019033582, + "learning_rate": 3.364159410373755e-07, + "loss": 0.2619, + "step": 3333 + }, + { + "epoch": 1.8308621636463482, + "grad_norm": 0.5543041495016467, + "learning_rate": 3.3614153657594006e-07, + "loss": 0.2511, + "step": 3334 + }, + { + "epoch": 1.8314113124656783, + "grad_norm": 0.5691177331659057, + "learning_rate": 3.35867187427572e-07, + "loss": 0.2538, + "step": 3335 + }, + { + "epoch": 1.8319604612850082, + "grad_norm": 0.45534277969542813, + "learning_rate": 3.355928936848546e-07, + "loss": 0.183, + "step": 3336 + }, + { + "epoch": 1.8325096101043383, + "grad_norm": 0.49743842436769414, + "learning_rate": 3.3531865544035184e-07, + "loss": 0.2128, + "step": 3337 + }, + { + "epoch": 1.8330587589236682, + "grad_norm": 0.5037559589518178, + "learning_rate": 3.3504447278660867e-07, + "loss": 0.2175, + "step": 3338 + }, + { + "epoch": 1.8336079077429983, + "grad_norm": 0.4787201459836962, + "learning_rate": 3.347703458161524e-07, + "loss": 0.2323, + "step": 3339 + }, + { + "epoch": 1.8341570565623284, + "grad_norm": 0.4747446443338989, + "learning_rate": 3.3449627462149e-07, + "loss": 0.2302, + "step": 3340 + }, + { + "epoch": 1.8347062053816585, + "grad_norm": 0.4057185011756123, + "learning_rate": 3.342222592951107e-07, + "loss": 0.2408, + "step": 3341 + }, + { + "epoch": 1.8352553542009886, + "grad_norm": 0.5403296029683571, + "learning_rate": 3.339482999294847e-07, + "loss": 0.2215, + "step": 3342 + }, + { + "epoch": 1.8358045030203185, + "grad_norm": 0.5943642653391777, + "learning_rate": 3.3367439661706293e-07, + "loss": 0.2748, + "step": 3343 + }, + { + "epoch": 1.8363536518396484, + "grad_norm": 0.4129034959731085, + "learning_rate": 3.334005494502779e-07, + "loss": 0.2491, + "step": 3344 + }, + { + "epoch": 1.8369028006589785, + "grad_norm": 0.5002790970553266, + "learning_rate": 3.33126758521543e-07, + "loss": 0.2974, + "step": 3345 + }, + { + "epoch": 1.8374519494783086, + "grad_norm": 0.5174459510775207, + "learning_rate": 3.3285302392325233e-07, + "loss": 0.2508, + "step": 3346 + }, + { + "epoch": 1.8380010982976387, + "grad_norm": 0.5027383483532301, + "learning_rate": 3.3257934574778126e-07, + "loss": 0.2524, + "step": 3347 + }, + { + "epoch": 1.8385502471169688, + "grad_norm": 0.5751787865850971, + "learning_rate": 3.323057240874862e-07, + "loss": 0.2245, + "step": 3348 + }, + { + "epoch": 1.839099395936299, + "grad_norm": 0.5875041255322028, + "learning_rate": 3.320321590347044e-07, + "loss": 0.2432, + "step": 3349 + }, + { + "epoch": 1.8396485447556288, + "grad_norm": 0.49012613857855764, + "learning_rate": 3.3175865068175403e-07, + "loss": 0.2516, + "step": 3350 + }, + { + "epoch": 1.8401976935749587, + "grad_norm": 0.47469708730575483, + "learning_rate": 3.3148519912093387e-07, + "loss": 0.2149, + "step": 3351 + }, + { + "epoch": 1.8407468423942888, + "grad_norm": 0.5787021560105725, + "learning_rate": 3.3121180444452373e-07, + "loss": 0.2494, + "step": 3352 + }, + { + "epoch": 1.8412959912136189, + "grad_norm": 0.4949237819687673, + "learning_rate": 3.3093846674478455e-07, + "loss": 0.231, + "step": 3353 + }, + { + "epoch": 1.841845140032949, + "grad_norm": 0.5387514125064902, + "learning_rate": 3.306651861139575e-07, + "loss": 0.2346, + "step": 3354 + }, + { + "epoch": 1.842394288852279, + "grad_norm": 0.6502984452848531, + "learning_rate": 3.303919626442647e-07, + "loss": 0.2576, + "step": 3355 + }, + { + "epoch": 1.842943437671609, + "grad_norm": 0.514665672711255, + "learning_rate": 3.3011879642790947e-07, + "loss": 0.2291, + "step": 3356 + }, + { + "epoch": 1.843492586490939, + "grad_norm": 0.5011657894212321, + "learning_rate": 3.298456875570746e-07, + "loss": 0.2766, + "step": 3357 + }, + { + "epoch": 1.844041735310269, + "grad_norm": 0.4965709040699433, + "learning_rate": 3.2957263612392477e-07, + "loss": 0.244, + "step": 3358 + }, + { + "epoch": 1.844590884129599, + "grad_norm": 0.44766157578846966, + "learning_rate": 3.292996422206047e-07, + "loss": 0.2473, + "step": 3359 + }, + { + "epoch": 1.8451400329489291, + "grad_norm": 0.6333386369232549, + "learning_rate": 3.2902670593923946e-07, + "loss": 0.2303, + "step": 3360 + }, + { + "epoch": 1.8456891817682592, + "grad_norm": 0.46904986279500355, + "learning_rate": 3.287538273719356e-07, + "loss": 0.244, + "step": 3361 + }, + { + "epoch": 1.8462383305875893, + "grad_norm": 0.47952786079284393, + "learning_rate": 3.284810066107791e-07, + "loss": 0.2725, + "step": 3362 + }, + { + "epoch": 1.8467874794069192, + "grad_norm": 0.5417386364279941, + "learning_rate": 3.2820824374783695e-07, + "loss": 0.2637, + "step": 3363 + }, + { + "epoch": 1.8473366282262493, + "grad_norm": 0.5475352527154584, + "learning_rate": 3.2793553887515674e-07, + "loss": 0.2313, + "step": 3364 + }, + { + "epoch": 1.8478857770455792, + "grad_norm": 0.5229136802125284, + "learning_rate": 3.276628920847662e-07, + "loss": 0.2438, + "step": 3365 + }, + { + "epoch": 1.8484349258649093, + "grad_norm": 0.46736828229650823, + "learning_rate": 3.2739030346867377e-07, + "loss": 0.2259, + "step": 3366 + }, + { + "epoch": 1.8489840746842394, + "grad_norm": 0.44848653284047074, + "learning_rate": 3.271177731188679e-07, + "loss": 0.244, + "step": 3367 + }, + { + "epoch": 1.8495332235035695, + "grad_norm": 0.5167429015228152, + "learning_rate": 3.2684530112731746e-07, + "loss": 0.2685, + "step": 3368 + }, + { + "epoch": 1.8500823723228996, + "grad_norm": 0.4507284177451039, + "learning_rate": 3.2657288758597176e-07, + "loss": 0.2377, + "step": 3369 + }, + { + "epoch": 1.8506315211422295, + "grad_norm": 0.5828584903515878, + "learning_rate": 3.263005325867605e-07, + "loss": 0.2954, + "step": 3370 + }, + { + "epoch": 1.8511806699615596, + "grad_norm": 0.713476964102636, + "learning_rate": 3.260282362215933e-07, + "loss": 0.2486, + "step": 3371 + }, + { + "epoch": 1.8517298187808895, + "grad_norm": 0.545337100163429, + "learning_rate": 3.257559985823603e-07, + "loss": 0.2553, + "step": 3372 + }, + { + "epoch": 1.8522789676002196, + "grad_norm": 0.37106111183336316, + "learning_rate": 3.254838197609315e-07, + "loss": 0.2773, + "step": 3373 + }, + { + "epoch": 1.8528281164195497, + "grad_norm": 0.5001622745163616, + "learning_rate": 3.252116998491572e-07, + "loss": 0.2421, + "step": 3374 + }, + { + "epoch": 1.8533772652388798, + "grad_norm": 0.4840128639367088, + "learning_rate": 3.24939638938868e-07, + "loss": 0.2814, + "step": 3375 + }, + { + "epoch": 1.85392641405821, + "grad_norm": 0.3916739295962748, + "learning_rate": 3.246676371218744e-07, + "loss": 0.2504, + "step": 3376 + }, + { + "epoch": 1.8544755628775398, + "grad_norm": 0.4475002867875725, + "learning_rate": 3.2439569448996686e-07, + "loss": 0.2324, + "step": 3377 + }, + { + "epoch": 1.8550247116968699, + "grad_norm": 0.5275915896438861, + "learning_rate": 3.2412381113491623e-07, + "loss": 0.248, + "step": 3378 + }, + { + "epoch": 1.8555738605161998, + "grad_norm": 0.4353998259510274, + "learning_rate": 3.23851987148473e-07, + "loss": 0.2068, + "step": 3379 + }, + { + "epoch": 1.8561230093355299, + "grad_norm": 0.6358772487505907, + "learning_rate": 3.235802226223677e-07, + "loss": 0.2777, + "step": 3380 + }, + { + "epoch": 1.85667215815486, + "grad_norm": 0.6134352381471003, + "learning_rate": 3.233085176483109e-07, + "loss": 0.2779, + "step": 3381 + }, + { + "epoch": 1.85722130697419, + "grad_norm": 0.5711003635531834, + "learning_rate": 3.23036872317993e-07, + "loss": 0.2319, + "step": 3382 + }, + { + "epoch": 1.8577704557935202, + "grad_norm": 0.47049117151547104, + "learning_rate": 3.227652867230843e-07, + "loss": 0.2081, + "step": 3383 + }, + { + "epoch": 1.85831960461285, + "grad_norm": 0.49170460434593816, + "learning_rate": 3.22493760955235e-07, + "loss": 0.2548, + "step": 3384 + }, + { + "epoch": 1.8588687534321802, + "grad_norm": 0.47086735681156167, + "learning_rate": 3.22222295106075e-07, + "loss": 0.2275, + "step": 3385 + }, + { + "epoch": 1.85941790225151, + "grad_norm": 0.49382018819762935, + "learning_rate": 3.2195088926721384e-07, + "loss": 0.2476, + "step": 3386 + }, + { + "epoch": 1.8599670510708401, + "grad_norm": 0.5279623403524477, + "learning_rate": 3.216795435302413e-07, + "loss": 0.2399, + "step": 3387 + }, + { + "epoch": 1.8605161998901703, + "grad_norm": 0.4754473271036646, + "learning_rate": 3.214082579867264e-07, + "loss": 0.259, + "step": 3388 + }, + { + "epoch": 1.8610653487095004, + "grad_norm": 0.45962946804898536, + "learning_rate": 3.2113703272821816e-07, + "loss": 0.2587, + "step": 3389 + }, + { + "epoch": 1.8616144975288305, + "grad_norm": 0.4840917537347479, + "learning_rate": 3.2086586784624487e-07, + "loss": 0.283, + "step": 3390 + }, + { + "epoch": 1.8621636463481603, + "grad_norm": 0.49234760335355526, + "learning_rate": 3.205947634323147e-07, + "loss": 0.2413, + "step": 3391 + }, + { + "epoch": 1.8627127951674904, + "grad_norm": 0.453726410943816, + "learning_rate": 3.2032371957791564e-07, + "loss": 0.2055, + "step": 3392 + }, + { + "epoch": 1.8632619439868203, + "grad_norm": 0.6519542418912082, + "learning_rate": 3.200527363745149e-07, + "loss": 0.2732, + "step": 3393 + }, + { + "epoch": 1.8638110928061504, + "grad_norm": 0.45802618419018054, + "learning_rate": 3.1978181391355916e-07, + "loss": 0.2412, + "step": 3394 + }, + { + "epoch": 1.8643602416254805, + "grad_norm": 0.5231012394047956, + "learning_rate": 3.1951095228647516e-07, + "loss": 0.2564, + "step": 3395 + }, + { + "epoch": 1.8649093904448106, + "grad_norm": 0.6105060454516515, + "learning_rate": 3.1924015158466837e-07, + "loss": 0.2917, + "step": 3396 + }, + { + "epoch": 1.8654585392641407, + "grad_norm": 0.520991227776756, + "learning_rate": 3.189694118995242e-07, + "loss": 0.2467, + "step": 3397 + }, + { + "epoch": 1.8660076880834706, + "grad_norm": 0.4760909224745546, + "learning_rate": 3.186987333224073e-07, + "loss": 0.2304, + "step": 3398 + }, + { + "epoch": 1.8665568369028005, + "grad_norm": 0.5315733801345761, + "learning_rate": 3.1842811594466145e-07, + "loss": 0.2676, + "step": 3399 + }, + { + "epoch": 1.8671059857221306, + "grad_norm": 0.6339503831065317, + "learning_rate": 3.181575598576106e-07, + "loss": 0.2206, + "step": 3400 + }, + { + "epoch": 1.8671059857221306, + "eval_loss": 0.3253461718559265, + "eval_runtime": 18.6946, + "eval_samples_per_second": 23.697, + "eval_steps_per_second": 1.016, + "step": 3400 + }, + { + "epoch": 1.8676551345414607, + "grad_norm": 0.621225604830136, + "learning_rate": 3.1788706515255703e-07, + "loss": 0.2168, + "step": 3401 + }, + { + "epoch": 1.8682042833607908, + "grad_norm": 0.4668938231581029, + "learning_rate": 3.1761663192078285e-07, + "loss": 0.2342, + "step": 3402 + }, + { + "epoch": 1.868753432180121, + "grad_norm": 0.48267350744936544, + "learning_rate": 3.173462602535492e-07, + "loss": 0.2439, + "step": 3403 + }, + { + "epoch": 1.869302580999451, + "grad_norm": 0.4769968008709638, + "learning_rate": 3.170759502420968e-07, + "loss": 0.2246, + "step": 3404 + }, + { + "epoch": 1.869851729818781, + "grad_norm": 0.561999375905934, + "learning_rate": 3.1680570197764523e-07, + "loss": 0.2241, + "step": 3405 + }, + { + "epoch": 1.8704008786381108, + "grad_norm": 0.5396921113371931, + "learning_rate": 3.165355155513934e-07, + "loss": 0.196, + "step": 3406 + }, + { + "epoch": 1.8709500274574409, + "grad_norm": 0.5467559691250319, + "learning_rate": 3.16265391054519e-07, + "loss": 0.2593, + "step": 3407 + }, + { + "epoch": 1.871499176276771, + "grad_norm": 0.7266026109914191, + "learning_rate": 3.159953285781792e-07, + "loss": 0.3206, + "step": 3408 + }, + { + "epoch": 1.872048325096101, + "grad_norm": 0.4534962872981325, + "learning_rate": 3.1572532821351035e-07, + "loss": 0.2563, + "step": 3409 + }, + { + "epoch": 1.8725974739154312, + "grad_norm": 0.4942119231837779, + "learning_rate": 3.1545539005162735e-07, + "loss": 0.2551, + "step": 3410 + }, + { + "epoch": 1.873146622734761, + "grad_norm": 0.5206176123358769, + "learning_rate": 3.151855141836247e-07, + "loss": 0.2449, + "step": 3411 + }, + { + "epoch": 1.8736957715540912, + "grad_norm": 0.5132252752366697, + "learning_rate": 3.149157007005752e-07, + "loss": 0.2412, + "step": 3412 + }, + { + "epoch": 1.874244920373421, + "grad_norm": 0.787143973000236, + "learning_rate": 3.1464594969353115e-07, + "loss": 0.2841, + "step": 3413 + }, + { + "epoch": 1.8747940691927512, + "grad_norm": 0.5094105932039332, + "learning_rate": 3.143762612535236e-07, + "loss": 0.2616, + "step": 3414 + }, + { + "epoch": 1.8753432180120813, + "grad_norm": 0.4942261865413558, + "learning_rate": 3.141066354715625e-07, + "loss": 0.2572, + "step": 3415 + }, + { + "epoch": 1.8758923668314114, + "grad_norm": 0.5080046242839126, + "learning_rate": 3.138370724386362e-07, + "loss": 0.2582, + "step": 3416 + }, + { + "epoch": 1.8764415156507415, + "grad_norm": 0.5377222566433009, + "learning_rate": 3.13567572245713e-07, + "loss": 0.2412, + "step": 3417 + }, + { + "epoch": 1.8769906644700713, + "grad_norm": 0.4442819450254423, + "learning_rate": 3.1329813498373886e-07, + "loss": 0.2378, + "step": 3418 + }, + { + "epoch": 1.8775398132894014, + "grad_norm": 0.46640323843178394, + "learning_rate": 3.1302876074363896e-07, + "loss": 0.25, + "step": 3419 + }, + { + "epoch": 1.8780889621087313, + "grad_norm": 0.49314153111608217, + "learning_rate": 3.127594496163172e-07, + "loss": 0.2343, + "step": 3420 + }, + { + "epoch": 1.8786381109280614, + "grad_norm": 0.5668384776222555, + "learning_rate": 3.124902016926561e-07, + "loss": 0.2524, + "step": 3421 + }, + { + "epoch": 1.8791872597473915, + "grad_norm": 0.6377310032623316, + "learning_rate": 3.122210170635171e-07, + "loss": 0.2618, + "step": 3422 + }, + { + "epoch": 1.8797364085667216, + "grad_norm": 0.5092031029825205, + "learning_rate": 3.1195189581974004e-07, + "loss": 0.2725, + "step": 3423 + }, + { + "epoch": 1.8802855573860517, + "grad_norm": 0.5096082849109681, + "learning_rate": 3.1168283805214326e-07, + "loss": 0.2234, + "step": 3424 + }, + { + "epoch": 1.8808347062053816, + "grad_norm": 0.4481000777424056, + "learning_rate": 3.1141384385152383e-07, + "loss": 0.2336, + "step": 3425 + }, + { + "epoch": 1.8813838550247117, + "grad_norm": 0.6002745810595073, + "learning_rate": 3.111449133086577e-07, + "loss": 0.2937, + "step": 3426 + }, + { + "epoch": 1.8819330038440416, + "grad_norm": 0.44636072584504, + "learning_rate": 3.1087604651429876e-07, + "loss": 0.2439, + "step": 3427 + }, + { + "epoch": 1.8824821526633717, + "grad_norm": 0.5381264811098155, + "learning_rate": 3.106072435591798e-07, + "loss": 0.2427, + "step": 3428 + }, + { + "epoch": 1.8830313014827018, + "grad_norm": 0.5793866095485689, + "learning_rate": 3.103385045340118e-07, + "loss": 0.2389, + "step": 3429 + }, + { + "epoch": 1.883580450302032, + "grad_norm": 0.37894530138232296, + "learning_rate": 3.100698295294842e-07, + "loss": 0.2387, + "step": 3430 + }, + { + "epoch": 1.884129599121362, + "grad_norm": 0.520801339899228, + "learning_rate": 3.0980121863626506e-07, + "loss": 0.3204, + "step": 3431 + }, + { + "epoch": 1.884678747940692, + "grad_norm": 0.49529620861758034, + "learning_rate": 3.095326719450007e-07, + "loss": 0.2405, + "step": 3432 + }, + { + "epoch": 1.885227896760022, + "grad_norm": 0.45988611331257506, + "learning_rate": 3.092641895463157e-07, + "loss": 0.2391, + "step": 3433 + }, + { + "epoch": 1.8857770455793519, + "grad_norm": 0.4217670019251589, + "learning_rate": 3.089957715308128e-07, + "loss": 0.2176, + "step": 3434 + }, + { + "epoch": 1.886326194398682, + "grad_norm": 0.5640913476568804, + "learning_rate": 3.0872741798907337e-07, + "loss": 0.2394, + "step": 3435 + }, + { + "epoch": 1.886875343218012, + "grad_norm": 0.48773088253283076, + "learning_rate": 3.084591290116569e-07, + "loss": 0.2493, + "step": 3436 + }, + { + "epoch": 1.8874244920373422, + "grad_norm": 0.6007741500051827, + "learning_rate": 3.081909046891007e-07, + "loss": 0.2425, + "step": 3437 + }, + { + "epoch": 1.8879736408566723, + "grad_norm": 0.5142447056479005, + "learning_rate": 3.0792274511192103e-07, + "loss": 0.2222, + "step": 3438 + }, + { + "epoch": 1.8885227896760022, + "grad_norm": 0.4078709312343764, + "learning_rate": 3.0765465037061176e-07, + "loss": 0.2398, + "step": 3439 + }, + { + "epoch": 1.8890719384953323, + "grad_norm": 0.4681143045054727, + "learning_rate": 3.0738662055564474e-07, + "loss": 0.2735, + "step": 3440 + }, + { + "epoch": 1.8896210873146622, + "grad_norm": 0.48117129035144396, + "learning_rate": 3.071186557574705e-07, + "loss": 0.239, + "step": 3441 + }, + { + "epoch": 1.8901702361339923, + "grad_norm": 0.4733367001994018, + "learning_rate": 3.068507560665168e-07, + "loss": 0.2005, + "step": 3442 + }, + { + "epoch": 1.8907193849533224, + "grad_norm": 0.4714859656673274, + "learning_rate": 3.0658292157319047e-07, + "loss": 0.2383, + "step": 3443 + }, + { + "epoch": 1.8912685337726525, + "grad_norm": 0.4941764977268472, + "learning_rate": 3.063151523678755e-07, + "loss": 0.2693, + "step": 3444 + }, + { + "epoch": 1.8918176825919826, + "grad_norm": 0.4687583054869288, + "learning_rate": 3.060474485409342e-07, + "loss": 0.2559, + "step": 3445 + }, + { + "epoch": 1.8923668314113125, + "grad_norm": 0.39587052487787744, + "learning_rate": 3.057798101827067e-07, + "loss": 0.2273, + "step": 3446 + }, + { + "epoch": 1.8929159802306426, + "grad_norm": 0.5401863754440832, + "learning_rate": 3.0551223738351095e-07, + "loss": 0.2551, + "step": 3447 + }, + { + "epoch": 1.8934651290499724, + "grad_norm": 0.44288774422393146, + "learning_rate": 3.0524473023364324e-07, + "loss": 0.2147, + "step": 3448 + }, + { + "epoch": 1.8940142778693025, + "grad_norm": 0.4692914808594869, + "learning_rate": 3.0497728882337715e-07, + "loss": 0.24, + "step": 3449 + }, + { + "epoch": 1.8945634266886326, + "grad_norm": 0.4966494669074835, + "learning_rate": 3.0470991324296445e-07, + "loss": 0.2443, + "step": 3450 + }, + { + "epoch": 1.8951125755079627, + "grad_norm": 0.44355694562598424, + "learning_rate": 3.0444260358263427e-07, + "loss": 0.2669, + "step": 3451 + }, + { + "epoch": 1.8956617243272929, + "grad_norm": 0.45874816369881105, + "learning_rate": 3.041753599325941e-07, + "loss": 0.2717, + "step": 3452 + }, + { + "epoch": 1.8962108731466227, + "grad_norm": 0.5612745481377038, + "learning_rate": 3.039081823830286e-07, + "loss": 0.2576, + "step": 3453 + }, + { + "epoch": 1.8967600219659526, + "grad_norm": 0.4401127331815031, + "learning_rate": 3.036410710241004e-07, + "loss": 0.2378, + "step": 3454 + }, + { + "epoch": 1.8973091707852827, + "grad_norm": 0.5590536605079881, + "learning_rate": 3.033740259459501e-07, + "loss": 0.2553, + "step": 3455 + }, + { + "epoch": 1.8978583196046128, + "grad_norm": 0.3964795653782331, + "learning_rate": 3.031070472386949e-07, + "loss": 0.2223, + "step": 3456 + }, + { + "epoch": 1.898407468423943, + "grad_norm": 0.5081343904334528, + "learning_rate": 3.028401349924308e-07, + "loss": 0.2579, + "step": 3457 + }, + { + "epoch": 1.898956617243273, + "grad_norm": 0.6032386389083711, + "learning_rate": 3.025732892972306e-07, + "loss": 0.2557, + "step": 3458 + }, + { + "epoch": 1.8995057660626031, + "grad_norm": 0.6850028431438505, + "learning_rate": 3.0230651024314484e-07, + "loss": 0.2739, + "step": 3459 + }, + { + "epoch": 1.900054914881933, + "grad_norm": 0.491731791959141, + "learning_rate": 3.0203979792020196e-07, + "loss": 0.2309, + "step": 3460 + }, + { + "epoch": 1.900604063701263, + "grad_norm": 0.5856933081034505, + "learning_rate": 3.0177315241840736e-07, + "loss": 0.2584, + "step": 3461 + }, + { + "epoch": 1.901153212520593, + "grad_norm": 0.7204728664034864, + "learning_rate": 3.0150657382774396e-07, + "loss": 0.2644, + "step": 3462 + }, + { + "epoch": 1.901702361339923, + "grad_norm": 0.4154287994693699, + "learning_rate": 3.012400622381724e-07, + "loss": 0.2168, + "step": 3463 + }, + { + "epoch": 1.9022515101592532, + "grad_norm": 0.4574514949048822, + "learning_rate": 3.0097361773963025e-07, + "loss": 0.2398, + "step": 3464 + }, + { + "epoch": 1.9028006589785833, + "grad_norm": 0.5492084644887225, + "learning_rate": 3.00707240422033e-07, + "loss": 0.2025, + "step": 3465 + }, + { + "epoch": 1.9033498077979132, + "grad_norm": 0.385074751717564, + "learning_rate": 3.004409303752731e-07, + "loss": 0.2552, + "step": 3466 + }, + { + "epoch": 1.9038989566172433, + "grad_norm": 0.5363834118827968, + "learning_rate": 3.0017468768922036e-07, + "loss": 0.2806, + "step": 3467 + }, + { + "epoch": 1.9044481054365732, + "grad_norm": 0.5411701116909793, + "learning_rate": 2.999085124537217e-07, + "loss": 0.2507, + "step": 3468 + }, + { + "epoch": 1.9049972542559033, + "grad_norm": 0.5117768984504683, + "learning_rate": 2.9964240475860174e-07, + "loss": 0.2471, + "step": 3469 + }, + { + "epoch": 1.9055464030752334, + "grad_norm": 0.4404353668899281, + "learning_rate": 2.99376364693662e-07, + "loss": 0.2815, + "step": 3470 + }, + { + "epoch": 1.9060955518945635, + "grad_norm": 0.4788918464678227, + "learning_rate": 2.991103923486809e-07, + "loss": 0.2333, + "step": 3471 + }, + { + "epoch": 1.9066447007138936, + "grad_norm": 0.5785813725644967, + "learning_rate": 2.988444878134148e-07, + "loss": 0.2222, + "step": 3472 + }, + { + "epoch": 1.9071938495332235, + "grad_norm": 0.4232710318241545, + "learning_rate": 2.9857865117759607e-07, + "loss": 0.2382, + "step": 3473 + }, + { + "epoch": 1.9077429983525536, + "grad_norm": 0.52991427976696, + "learning_rate": 2.983128825309353e-07, + "loss": 0.2437, + "step": 3474 + }, + { + "epoch": 1.9082921471718834, + "grad_norm": 0.5550804056179012, + "learning_rate": 2.980471819631194e-07, + "loss": 0.2688, + "step": 3475 + }, + { + "epoch": 1.9088412959912135, + "grad_norm": 0.52100450679202, + "learning_rate": 2.9778154956381246e-07, + "loss": 0.2296, + "step": 3476 + }, + { + "epoch": 1.9093904448105437, + "grad_norm": 0.5542332546738085, + "learning_rate": 2.97515985422656e-07, + "loss": 0.2807, + "step": 3477 + }, + { + "epoch": 1.9099395936298738, + "grad_norm": 0.6044029928486673, + "learning_rate": 2.9725048962926757e-07, + "loss": 0.2589, + "step": 3478 + }, + { + "epoch": 1.9104887424492039, + "grad_norm": 0.4866484505258523, + "learning_rate": 2.969850622732426e-07, + "loss": 0.2316, + "step": 3479 + }, + { + "epoch": 1.9110378912685337, + "grad_norm": 0.4953313876800655, + "learning_rate": 2.967197034441529e-07, + "loss": 0.2546, + "step": 3480 + }, + { + "epoch": 1.9115870400878638, + "grad_norm": 0.7121089140791472, + "learning_rate": 2.964544132315473e-07, + "loss": 0.2661, + "step": 3481 + }, + { + "epoch": 1.9121361889071937, + "grad_norm": 0.43189198642144205, + "learning_rate": 2.961891917249516e-07, + "loss": 0.2404, + "step": 3482 + }, + { + "epoch": 1.9126853377265238, + "grad_norm": 0.4650863471224219, + "learning_rate": 2.959240390138683e-07, + "loss": 0.2601, + "step": 3483 + }, + { + "epoch": 1.913234486545854, + "grad_norm": 0.5929447563504545, + "learning_rate": 2.9565895518777647e-07, + "loss": 0.2323, + "step": 3484 + }, + { + "epoch": 1.913783635365184, + "grad_norm": 0.40527071770482453, + "learning_rate": 2.9539394033613216e-07, + "loss": 0.2515, + "step": 3485 + }, + { + "epoch": 1.9143327841845141, + "grad_norm": 0.8125714935134585, + "learning_rate": 2.9512899454836826e-07, + "loss": 0.292, + "step": 3486 + }, + { + "epoch": 1.914881933003844, + "grad_norm": 0.6883562582271736, + "learning_rate": 2.948641179138942e-07, + "loss": 0.3109, + "step": 3487 + }, + { + "epoch": 1.9154310818231741, + "grad_norm": 0.5018155614481865, + "learning_rate": 2.9459931052209617e-07, + "loss": 0.2455, + "step": 3488 + }, + { + "epoch": 1.915980230642504, + "grad_norm": 0.43482533807172197, + "learning_rate": 2.943345724623366e-07, + "loss": 0.2647, + "step": 3489 + }, + { + "epoch": 1.916529379461834, + "grad_norm": 0.5132415898751335, + "learning_rate": 2.940699038239549e-07, + "loss": 0.2127, + "step": 3490 + }, + { + "epoch": 1.9170785282811642, + "grad_norm": 0.45603569157175095, + "learning_rate": 2.938053046962673e-07, + "loss": 0.255, + "step": 3491 + }, + { + "epoch": 1.9176276771004943, + "grad_norm": 0.4516091899983879, + "learning_rate": 2.9354077516856593e-07, + "loss": 0.235, + "step": 3492 + }, + { + "epoch": 1.9181768259198244, + "grad_norm": 0.5579624955390328, + "learning_rate": 2.932763153301199e-07, + "loss": 0.2356, + "step": 3493 + }, + { + "epoch": 1.9187259747391543, + "grad_norm": 0.46280204989417323, + "learning_rate": 2.930119252701748e-07, + "loss": 0.2702, + "step": 3494 + }, + { + "epoch": 1.9192751235584844, + "grad_norm": 0.42000327519408087, + "learning_rate": 2.927476050779522e-07, + "loss": 0.2864, + "step": 3495 + }, + { + "epoch": 1.9198242723778143, + "grad_norm": 0.5388514746794255, + "learning_rate": 2.9248335484265064e-07, + "loss": 0.2892, + "step": 3496 + }, + { + "epoch": 1.9203734211971444, + "grad_norm": 0.5100549631576503, + "learning_rate": 2.922191746534448e-07, + "loss": 0.2065, + "step": 3497 + }, + { + "epoch": 1.9209225700164745, + "grad_norm": 0.5544308013162146, + "learning_rate": 2.9195506459948584e-07, + "loss": 0.2872, + "step": 3498 + }, + { + "epoch": 1.9214717188358046, + "grad_norm": 0.482773426027997, + "learning_rate": 2.9169102476990117e-07, + "loss": 0.241, + "step": 3499 + }, + { + "epoch": 1.9220208676551347, + "grad_norm": 0.5885970173162935, + "learning_rate": 2.9142705525379417e-07, + "loss": 0.2451, + "step": 3500 + }, + { + "epoch": 1.9225700164744646, + "grad_norm": 0.4753095417820037, + "learning_rate": 2.9116315614024524e-07, + "loss": 0.2964, + "step": 3501 + }, + { + "epoch": 1.9231191652937945, + "grad_norm": 0.4821001546327114, + "learning_rate": 2.9089932751831046e-07, + "loss": 0.2287, + "step": 3502 + }, + { + "epoch": 1.9236683141131246, + "grad_norm": 0.5030692832079965, + "learning_rate": 2.906355694770222e-07, + "loss": 0.236, + "step": 3503 + }, + { + "epoch": 1.9242174629324547, + "grad_norm": 0.8496379344842603, + "learning_rate": 2.903718821053891e-07, + "loss": 0.2729, + "step": 3504 + }, + { + "epoch": 1.9247666117517848, + "grad_norm": 0.5785599421220556, + "learning_rate": 2.901082654923962e-07, + "loss": 0.2235, + "step": 3505 + }, + { + "epoch": 1.9253157605711149, + "grad_norm": 0.9781381882079483, + "learning_rate": 2.898447197270041e-07, + "loss": 0.2572, + "step": 3506 + }, + { + "epoch": 1.925864909390445, + "grad_norm": 0.5339835928687925, + "learning_rate": 2.8958124489814984e-07, + "loss": 0.2758, + "step": 3507 + }, + { + "epoch": 1.9264140582097748, + "grad_norm": 0.48705531471153135, + "learning_rate": 2.893178410947466e-07, + "loss": 0.2161, + "step": 3508 + }, + { + "epoch": 1.9269632070291047, + "grad_norm": 0.609102743577628, + "learning_rate": 2.8905450840568315e-07, + "loss": 0.2846, + "step": 3509 + }, + { + "epoch": 1.9275123558484348, + "grad_norm": 0.46646050526114474, + "learning_rate": 2.8879124691982495e-07, + "loss": 0.2427, + "step": 3510 + }, + { + "epoch": 1.928061504667765, + "grad_norm": 0.5085058742071942, + "learning_rate": 2.885280567260127e-07, + "loss": 0.2391, + "step": 3511 + }, + { + "epoch": 1.928610653487095, + "grad_norm": 0.47156540177633205, + "learning_rate": 2.8826493791306385e-07, + "loss": 0.2462, + "step": 3512 + }, + { + "epoch": 1.9291598023064251, + "grad_norm": 0.4961401790197671, + "learning_rate": 2.880018905697707e-07, + "loss": 0.2497, + "step": 3513 + }, + { + "epoch": 1.9297089511257552, + "grad_norm": 0.5758535825905837, + "learning_rate": 2.8773891478490243e-07, + "loss": 0.2568, + "step": 3514 + }, + { + "epoch": 1.9302580999450851, + "grad_norm": 0.539416186952519, + "learning_rate": 2.8747601064720375e-07, + "loss": 0.259, + "step": 3515 + }, + { + "epoch": 1.930807248764415, + "grad_norm": 0.4670349550398961, + "learning_rate": 2.8721317824539506e-07, + "loss": 0.26, + "step": 3516 + }, + { + "epoch": 1.931356397583745, + "grad_norm": 0.46194109371862896, + "learning_rate": 2.869504176681723e-07, + "loss": 0.2583, + "step": 3517 + }, + { + "epoch": 1.9319055464030752, + "grad_norm": 0.49477642024608054, + "learning_rate": 2.866877290042077e-07, + "loss": 0.288, + "step": 3518 + }, + { + "epoch": 1.9324546952224053, + "grad_norm": 0.49374123699041367, + "learning_rate": 2.864251123421493e-07, + "loss": 0.2093, + "step": 3519 + }, + { + "epoch": 1.9330038440417354, + "grad_norm": 0.41737011331805424, + "learning_rate": 2.8616256777062005e-07, + "loss": 0.2337, + "step": 3520 + }, + { + "epoch": 1.9335529928610653, + "grad_norm": 0.48499202992710083, + "learning_rate": 2.8590009537821944e-07, + "loss": 0.278, + "step": 3521 + }, + { + "epoch": 1.9341021416803954, + "grad_norm": 0.5005301142547031, + "learning_rate": 2.856376952535221e-07, + "loss": 0.2182, + "step": 3522 + }, + { + "epoch": 1.9346512904997253, + "grad_norm": 0.46972440779576397, + "learning_rate": 2.8537536748507825e-07, + "loss": 0.185, + "step": 3523 + }, + { + "epoch": 1.9352004393190554, + "grad_norm": 0.4960262447388615, + "learning_rate": 2.8511311216141394e-07, + "loss": 0.2252, + "step": 3524 + }, + { + "epoch": 1.9357495881383855, + "grad_norm": 0.4767517721703788, + "learning_rate": 2.8485092937103097e-07, + "loss": 0.2459, + "step": 3525 + }, + { + "epoch": 1.9362987369577156, + "grad_norm": 0.5033764727629504, + "learning_rate": 2.845888192024059e-07, + "loss": 0.2443, + "step": 3526 + }, + { + "epoch": 1.9368478857770457, + "grad_norm": 0.41232197287742584, + "learning_rate": 2.8432678174399174e-07, + "loss": 0.2736, + "step": 3527 + }, + { + "epoch": 1.9373970345963756, + "grad_norm": 0.5101825855825676, + "learning_rate": 2.8406481708421595e-07, + "loss": 0.2533, + "step": 3528 + }, + { + "epoch": 1.9379461834157057, + "grad_norm": 0.5581123936733484, + "learning_rate": 2.8380292531148245e-07, + "loss": 0.2855, + "step": 3529 + }, + { + "epoch": 1.9384953322350356, + "grad_norm": 0.5321908556518186, + "learning_rate": 2.8354110651416975e-07, + "loss": 0.2003, + "step": 3530 + }, + { + "epoch": 1.9390444810543657, + "grad_norm": 0.4039345443259138, + "learning_rate": 2.8327936078063196e-07, + "loss": 0.2502, + "step": 3531 + }, + { + "epoch": 1.9395936298736958, + "grad_norm": 0.5358782010635976, + "learning_rate": 2.8301768819919915e-07, + "loss": 0.2571, + "step": 3532 + }, + { + "epoch": 1.9401427786930259, + "grad_norm": 0.5140520490582524, + "learning_rate": 2.8275608885817574e-07, + "loss": 0.2803, + "step": 3533 + }, + { + "epoch": 1.940691927512356, + "grad_norm": 0.4766739576154817, + "learning_rate": 2.8249456284584177e-07, + "loss": 0.2745, + "step": 3534 + }, + { + "epoch": 1.9412410763316859, + "grad_norm": 0.49560530532484964, + "learning_rate": 2.822331102504529e-07, + "loss": 0.2792, + "step": 3535 + }, + { + "epoch": 1.941790225151016, + "grad_norm": 0.6728348026040153, + "learning_rate": 2.819717311602398e-07, + "loss": 0.2834, + "step": 3536 + }, + { + "epoch": 1.9423393739703458, + "grad_norm": 0.44623214683764584, + "learning_rate": 2.8171042566340796e-07, + "loss": 0.2562, + "step": 3537 + }, + { + "epoch": 1.942888522789676, + "grad_norm": 0.5329961545712149, + "learning_rate": 2.814491938481388e-07, + "loss": 0.2642, + "step": 3538 + }, + { + "epoch": 1.943437671609006, + "grad_norm": 0.5382481918275226, + "learning_rate": 2.8118803580258813e-07, + "loss": 0.226, + "step": 3539 + }, + { + "epoch": 1.9439868204283361, + "grad_norm": 0.4294502804872192, + "learning_rate": 2.8092695161488707e-07, + "loss": 0.23, + "step": 3540 + }, + { + "epoch": 1.9445359692476663, + "grad_norm": 0.5181966375096064, + "learning_rate": 2.80665941373142e-07, + "loss": 0.1869, + "step": 3541 + }, + { + "epoch": 1.9450851180669961, + "grad_norm": 0.6269697155225653, + "learning_rate": 2.8040500516543463e-07, + "loss": 0.2309, + "step": 3542 + }, + { + "epoch": 1.9456342668863262, + "grad_norm": 0.6137518663724708, + "learning_rate": 2.8014414307982106e-07, + "loss": 0.2249, + "step": 3543 + }, + { + "epoch": 1.9461834157056561, + "grad_norm": 0.5313339834262895, + "learning_rate": 2.798833552043323e-07, + "loss": 0.2905, + "step": 3544 + }, + { + "epoch": 1.9467325645249862, + "grad_norm": 0.47623669426324017, + "learning_rate": 2.796226416269749e-07, + "loss": 0.2482, + "step": 3545 + }, + { + "epoch": 1.9472817133443163, + "grad_norm": 0.4970381567156964, + "learning_rate": 2.793620024357304e-07, + "loss": 0.2247, + "step": 3546 + }, + { + "epoch": 1.9478308621636464, + "grad_norm": 0.5149144769041067, + "learning_rate": 2.791014377185545e-07, + "loss": 0.2254, + "step": 3547 + }, + { + "epoch": 1.9483800109829765, + "grad_norm": 0.4714395363889867, + "learning_rate": 2.788409475633782e-07, + "loss": 0.2171, + "step": 3548 + }, + { + "epoch": 1.9489291598023064, + "grad_norm": 0.5546220391096439, + "learning_rate": 2.7858053205810775e-07, + "loss": 0.2507, + "step": 3549 + }, + { + "epoch": 1.9494783086216365, + "grad_norm": 0.5715019993948425, + "learning_rate": 2.7832019129062354e-07, + "loss": 0.2274, + "step": 3550 + }, + { + "epoch": 1.9500274574409664, + "grad_norm": 0.5221463850932294, + "learning_rate": 2.780599253487809e-07, + "loss": 0.248, + "step": 3551 + }, + { + "epoch": 1.9505766062602965, + "grad_norm": 0.6335872984457526, + "learning_rate": 2.7779973432040985e-07, + "loss": 0.2545, + "step": 3552 + }, + { + "epoch": 1.9511257550796266, + "grad_norm": 0.516797373127701, + "learning_rate": 2.775396182933158e-07, + "loss": 0.2929, + "step": 3553 + }, + { + "epoch": 1.9516749038989567, + "grad_norm": 0.508830977419197, + "learning_rate": 2.7727957735527797e-07, + "loss": 0.2417, + "step": 3554 + }, + { + "epoch": 1.9522240527182868, + "grad_norm": 0.5296110612524638, + "learning_rate": 2.770196115940504e-07, + "loss": 0.2285, + "step": 3555 + }, + { + "epoch": 1.9527732015376167, + "grad_norm": 0.4311907075317863, + "learning_rate": 2.7675972109736246e-07, + "loss": 0.2235, + "step": 3556 + }, + { + "epoch": 1.9533223503569466, + "grad_norm": 0.5738238851652983, + "learning_rate": 2.7649990595291714e-07, + "loss": 0.3177, + "step": 3557 + }, + { + "epoch": 1.9538714991762767, + "grad_norm": 0.5212528638403917, + "learning_rate": 2.762401662483927e-07, + "loss": 0.2731, + "step": 3558 + }, + { + "epoch": 1.9544206479956068, + "grad_norm": 0.5662470614689857, + "learning_rate": 2.759805020714419e-07, + "loss": 0.2553, + "step": 3559 + }, + { + "epoch": 1.9549697968149369, + "grad_norm": 0.478234869839864, + "learning_rate": 2.7572091350969166e-07, + "loss": 0.2559, + "step": 3560 + }, + { + "epoch": 1.955518945634267, + "grad_norm": 0.4569328627608001, + "learning_rate": 2.754614006507433e-07, + "loss": 0.2507, + "step": 3561 + }, + { + "epoch": 1.956068094453597, + "grad_norm": 0.49520417970736014, + "learning_rate": 2.7520196358217316e-07, + "loss": 0.2685, + "step": 3562 + }, + { + "epoch": 1.956617243272927, + "grad_norm": 0.6050404354330664, + "learning_rate": 2.749426023915318e-07, + "loss": 0.2357, + "step": 3563 + }, + { + "epoch": 1.9571663920922568, + "grad_norm": 0.5574042803861196, + "learning_rate": 2.746833171663437e-07, + "loss": 0.2732, + "step": 3564 + }, + { + "epoch": 1.957715540911587, + "grad_norm": 0.3887222643172956, + "learning_rate": 2.744241079941085e-07, + "loss": 0.2374, + "step": 3565 + }, + { + "epoch": 1.958264689730917, + "grad_norm": 0.4481860368134919, + "learning_rate": 2.741649749622992e-07, + "loss": 0.2913, + "step": 3566 + }, + { + "epoch": 1.9588138385502472, + "grad_norm": 0.5775864143543111, + "learning_rate": 2.7390591815836426e-07, + "loss": 0.2871, + "step": 3567 + }, + { + "epoch": 1.9593629873695773, + "grad_norm": 0.4395967867609971, + "learning_rate": 2.736469376697253e-07, + "loss": 0.2116, + "step": 3568 + }, + { + "epoch": 1.9599121361889071, + "grad_norm": 0.47823803480965066, + "learning_rate": 2.733880335837789e-07, + "loss": 0.2255, + "step": 3569 + }, + { + "epoch": 1.9604612850082372, + "grad_norm": 0.5883628932934369, + "learning_rate": 2.7312920598789584e-07, + "loss": 0.2271, + "step": 3570 + }, + { + "epoch": 1.9610104338275671, + "grad_norm": 0.507651767944914, + "learning_rate": 2.728704549694207e-07, + "loss": 0.2408, + "step": 3571 + }, + { + "epoch": 1.9615595826468972, + "grad_norm": 0.5376614816173333, + "learning_rate": 2.7261178061567225e-07, + "loss": 0.2269, + "step": 3572 + }, + { + "epoch": 1.9621087314662273, + "grad_norm": 0.49546209558327126, + "learning_rate": 2.723531830139439e-07, + "loss": 0.2244, + "step": 3573 + }, + { + "epoch": 1.9626578802855574, + "grad_norm": 0.7384842611794925, + "learning_rate": 2.7209466225150247e-07, + "loss": 0.2602, + "step": 3574 + }, + { + "epoch": 1.9632070291048875, + "grad_norm": 0.5869937728032119, + "learning_rate": 2.718362184155894e-07, + "loss": 0.2227, + "step": 3575 + }, + { + "epoch": 1.9637561779242174, + "grad_norm": 0.5177536804884671, + "learning_rate": 2.715778515934201e-07, + "loss": 0.2656, + "step": 3576 + }, + { + "epoch": 1.9643053267435475, + "grad_norm": 0.5577516716783074, + "learning_rate": 2.713195618721837e-07, + "loss": 0.257, + "step": 3577 + }, + { + "epoch": 1.9648544755628774, + "grad_norm": 0.5940239868467071, + "learning_rate": 2.710613493390432e-07, + "loss": 0.2235, + "step": 3578 + }, + { + "epoch": 1.9654036243822075, + "grad_norm": 0.5068552014319503, + "learning_rate": 2.7080321408113615e-07, + "loss": 0.2024, + "step": 3579 + }, + { + "epoch": 1.9659527732015376, + "grad_norm": 0.49311385759555865, + "learning_rate": 2.7054515618557375e-07, + "loss": 0.2476, + "step": 3580 + }, + { + "epoch": 1.9665019220208677, + "grad_norm": 0.4699113785027886, + "learning_rate": 2.702871757394407e-07, + "loss": 0.2094, + "step": 3581 + }, + { + "epoch": 1.9670510708401978, + "grad_norm": 0.47229523733660006, + "learning_rate": 2.700292728297963e-07, + "loss": 0.2264, + "step": 3582 + }, + { + "epoch": 1.9676002196595277, + "grad_norm": 0.4694630373614784, + "learning_rate": 2.697714475436729e-07, + "loss": 0.255, + "step": 3583 + }, + { + "epoch": 1.9681493684788578, + "grad_norm": 0.46771376675508464, + "learning_rate": 2.695136999680773e-07, + "loss": 0.2307, + "step": 3584 + }, + { + "epoch": 1.9686985172981877, + "grad_norm": 0.44396687966745196, + "learning_rate": 2.6925603018998966e-07, + "loss": 0.2485, + "step": 3585 + }, + { + "epoch": 1.9692476661175178, + "grad_norm": 0.44524074931127644, + "learning_rate": 2.6899843829636395e-07, + "loss": 0.2739, + "step": 3586 + }, + { + "epoch": 1.9697968149368479, + "grad_norm": 0.589200210770938, + "learning_rate": 2.6874092437412855e-07, + "loss": 0.2931, + "step": 3587 + }, + { + "epoch": 1.970345963756178, + "grad_norm": 0.47753013862159893, + "learning_rate": 2.68483488510184e-07, + "loss": 0.2236, + "step": 3588 + }, + { + "epoch": 1.970895112575508, + "grad_norm": 0.5227814670826163, + "learning_rate": 2.6822613079140597e-07, + "loss": 0.2315, + "step": 3589 + }, + { + "epoch": 1.971444261394838, + "grad_norm": 0.4266441980724434, + "learning_rate": 2.679688513046433e-07, + "loss": 0.2207, + "step": 3590 + }, + { + "epoch": 1.971993410214168, + "grad_norm": 0.4613314865314026, + "learning_rate": 2.6771165013671785e-07, + "loss": 0.2426, + "step": 3591 + }, + { + "epoch": 1.972542559033498, + "grad_norm": 0.7017836692745851, + "learning_rate": 2.67454527374426e-07, + "loss": 0.2831, + "step": 3592 + }, + { + "epoch": 1.973091707852828, + "grad_norm": 0.6804160652458832, + "learning_rate": 2.6719748310453714e-07, + "loss": 0.25, + "step": 3593 + }, + { + "epoch": 1.9736408566721582, + "grad_norm": 0.5091180316829615, + "learning_rate": 2.669405174137942e-07, + "loss": 0.269, + "step": 3594 + }, + { + "epoch": 1.9741900054914883, + "grad_norm": 0.48702463995146106, + "learning_rate": 2.666836303889134e-07, + "loss": 0.2365, + "step": 3595 + }, + { + "epoch": 1.9747391543108184, + "grad_norm": 0.44661616246263575, + "learning_rate": 2.664268221165848e-07, + "loss": 0.2264, + "step": 3596 + }, + { + "epoch": 1.9752883031301482, + "grad_norm": 0.5647887117299537, + "learning_rate": 2.661700926834719e-07, + "loss": 0.2458, + "step": 3597 + }, + { + "epoch": 1.9758374519494784, + "grad_norm": 0.7156215820950582, + "learning_rate": 2.6591344217621136e-07, + "loss": 0.2926, + "step": 3598 + }, + { + "epoch": 1.9763866007688082, + "grad_norm": 0.5652838793747974, + "learning_rate": 2.6565687068141306e-07, + "loss": 0.2483, + "step": 3599 + }, + { + "epoch": 1.9769357495881383, + "grad_norm": 0.43264227965890273, + "learning_rate": 2.654003782856605e-07, + "loss": 0.2261, + "step": 3600 + }, + { + "epoch": 1.9769357495881383, + "eval_loss": 0.32394054532051086, + "eval_runtime": 18.6725, + "eval_samples_per_second": 23.725, + "eval_steps_per_second": 1.018, + "step": 3600 + }, + { + "epoch": 1.9774848984074684, + "grad_norm": 0.6006732363553414, + "learning_rate": 2.651439650755107e-07, + "loss": 0.2306, + "step": 3601 + }, + { + "epoch": 1.9780340472267985, + "grad_norm": 0.43265428377104304, + "learning_rate": 2.6488763113749316e-07, + "loss": 0.2423, + "step": 3602 + }, + { + "epoch": 1.9785831960461286, + "grad_norm": 0.4812925459313404, + "learning_rate": 2.646313765581116e-07, + "loss": 0.2558, + "step": 3603 + }, + { + "epoch": 1.9791323448654585, + "grad_norm": 0.9687244444160324, + "learning_rate": 2.643752014238427e-07, + "loss": 0.3615, + "step": 3604 + }, + { + "epoch": 1.9796814936847886, + "grad_norm": 0.5550788339791402, + "learning_rate": 2.641191058211353e-07, + "loss": 0.288, + "step": 3605 + }, + { + "epoch": 1.9802306425041185, + "grad_norm": 0.47120334809379805, + "learning_rate": 2.6386308983641265e-07, + "loss": 0.2641, + "step": 3606 + }, + { + "epoch": 1.9807797913234486, + "grad_norm": 0.5795971406042949, + "learning_rate": 2.63607153556071e-07, + "loss": 0.2564, + "step": 3607 + }, + { + "epoch": 1.9813289401427787, + "grad_norm": 0.41810703743001404, + "learning_rate": 2.6335129706647904e-07, + "loss": 0.2483, + "step": 3608 + }, + { + "epoch": 1.9818780889621088, + "grad_norm": 0.5061621132036708, + "learning_rate": 2.630955204539792e-07, + "loss": 0.236, + "step": 3609 + }, + { + "epoch": 1.982427237781439, + "grad_norm": 0.820991755189141, + "learning_rate": 2.628398238048862e-07, + "loss": 0.2757, + "step": 3610 + }, + { + "epoch": 1.9829763866007688, + "grad_norm": 0.5066048790646377, + "learning_rate": 2.625842072054889e-07, + "loss": 0.2445, + "step": 3611 + }, + { + "epoch": 1.9835255354200987, + "grad_norm": 0.4935582575850033, + "learning_rate": 2.623286707420479e-07, + "loss": 0.2539, + "step": 3612 + }, + { + "epoch": 1.9840746842394288, + "grad_norm": 0.5648924305374863, + "learning_rate": 2.6207321450079757e-07, + "loss": 0.2352, + "step": 3613 + }, + { + "epoch": 1.984623833058759, + "grad_norm": 0.45576977983877387, + "learning_rate": 2.6181783856794516e-07, + "loss": 0.228, + "step": 3614 + }, + { + "epoch": 1.985172981878089, + "grad_norm": 0.44968516032300915, + "learning_rate": 2.6156254302967043e-07, + "loss": 0.2313, + "step": 3615 + }, + { + "epoch": 1.985722130697419, + "grad_norm": 0.42093194093827885, + "learning_rate": 2.6130732797212605e-07, + "loss": 0.2325, + "step": 3616 + }, + { + "epoch": 1.9862712795167492, + "grad_norm": 0.4888210529399285, + "learning_rate": 2.61052193481438e-07, + "loss": 0.2778, + "step": 3617 + }, + { + "epoch": 1.986820428336079, + "grad_norm": 0.500435220620874, + "learning_rate": 2.6079713964370476e-07, + "loss": 0.2284, + "step": 3618 + }, + { + "epoch": 1.987369577155409, + "grad_norm": 0.5204022908253535, + "learning_rate": 2.605421665449974e-07, + "loss": 0.2354, + "step": 3619 + }, + { + "epoch": 1.987918725974739, + "grad_norm": 0.5594751912788976, + "learning_rate": 2.602872742713602e-07, + "loss": 0.1998, + "step": 3620 + }, + { + "epoch": 1.9884678747940692, + "grad_norm": 0.48481399736334296, + "learning_rate": 2.600324629088098e-07, + "loss": 0.2534, + "step": 3621 + }, + { + "epoch": 1.9890170236133993, + "grad_norm": 0.5950512650619166, + "learning_rate": 2.597777325433354e-07, + "loss": 0.2515, + "step": 3622 + }, + { + "epoch": 1.9895661724327294, + "grad_norm": 0.4864217013229987, + "learning_rate": 2.5952308326089933e-07, + "loss": 0.2537, + "step": 3623 + }, + { + "epoch": 1.9901153212520593, + "grad_norm": 0.5592803340178949, + "learning_rate": 2.592685151474366e-07, + "loss": 0.2615, + "step": 3624 + }, + { + "epoch": 1.9906644700713894, + "grad_norm": 0.5036800182919009, + "learning_rate": 2.5901402828885405e-07, + "loss": 0.2597, + "step": 3625 + }, + { + "epoch": 1.9912136188907192, + "grad_norm": 0.510648531093211, + "learning_rate": 2.5875962277103215e-07, + "loss": 0.2357, + "step": 3626 + }, + { + "epoch": 1.9917627677100493, + "grad_norm": 0.4747778790674222, + "learning_rate": 2.5850529867982287e-07, + "loss": 0.2186, + "step": 3627 + }, + { + "epoch": 1.9923119165293794, + "grad_norm": 0.5567352724280351, + "learning_rate": 2.582510561010517e-07, + "loss": 0.2797, + "step": 3628 + }, + { + "epoch": 1.9928610653487095, + "grad_norm": 0.45626220406813023, + "learning_rate": 2.5799689512051566e-07, + "loss": 0.2197, + "step": 3629 + }, + { + "epoch": 1.9934102141680397, + "grad_norm": 0.4986417900459531, + "learning_rate": 2.5774281582398505e-07, + "loss": 0.2151, + "step": 3630 + }, + { + "epoch": 1.9939593629873695, + "grad_norm": 0.5717684346993673, + "learning_rate": 2.574888182972024e-07, + "loss": 0.2311, + "step": 3631 + }, + { + "epoch": 1.9945085118066996, + "grad_norm": 0.47070901347542604, + "learning_rate": 2.5723490262588226e-07, + "loss": 0.2663, + "step": 3632 + }, + { + "epoch": 1.9950576606260295, + "grad_norm": 0.5017313535995244, + "learning_rate": 2.569810688957117e-07, + "loss": 0.2418, + "step": 3633 + }, + { + "epoch": 1.9956068094453596, + "grad_norm": 0.6641539261341836, + "learning_rate": 2.567273171923505e-07, + "loss": 0.2822, + "step": 3634 + }, + { + "epoch": 1.9961559582646897, + "grad_norm": 0.5058400435451753, + "learning_rate": 2.5647364760143046e-07, + "loss": 0.247, + "step": 3635 + }, + { + "epoch": 1.9967051070840198, + "grad_norm": 0.5190345756668289, + "learning_rate": 2.5622006020855556e-07, + "loss": 0.2404, + "step": 3636 + }, + { + "epoch": 1.99725425590335, + "grad_norm": 0.4651825098744332, + "learning_rate": 2.559665550993027e-07, + "loss": 0.2277, + "step": 3637 + }, + { + "epoch": 1.9978034047226798, + "grad_norm": 0.5409533537145913, + "learning_rate": 2.5571313235922e-07, + "loss": 0.2398, + "step": 3638 + }, + { + "epoch": 1.99835255354201, + "grad_norm": 0.48219384988012653, + "learning_rate": 2.554597920738282e-07, + "loss": 0.2437, + "step": 3639 + }, + { + "epoch": 1.9989017023613398, + "grad_norm": 0.446420724986946, + "learning_rate": 2.5520653432862067e-07, + "loss": 0.2465, + "step": 3640 + }, + { + "epoch": 1.99945085118067, + "grad_norm": 0.560738757985528, + "learning_rate": 2.549533592090627e-07, + "loss": 0.2333, + "step": 3641 + }, + { + "epoch": 2.0, + "grad_norm": 0.522990454926645, + "learning_rate": 2.547002668005913e-07, + "loss": 0.3024, + "step": 3642 + }, + { + "epoch": 2.00054914881933, + "grad_norm": 0.4913238742866613, + "learning_rate": 2.544472571886156e-07, + "loss": 0.2508, + "step": 3643 + }, + { + "epoch": 2.00109829763866, + "grad_norm": 0.5890426877099678, + "learning_rate": 2.541943304585173e-07, + "loss": 0.2331, + "step": 3644 + }, + { + "epoch": 2.0016474464579903, + "grad_norm": 0.5080251643304994, + "learning_rate": 2.5394148669565e-07, + "loss": 0.2173, + "step": 3645 + }, + { + "epoch": 2.00219659527732, + "grad_norm": 0.44952741736715174, + "learning_rate": 2.5368872598533884e-07, + "loss": 0.2295, + "step": 3646 + }, + { + "epoch": 2.00274574409665, + "grad_norm": 0.524021575698407, + "learning_rate": 2.534360484128815e-07, + "loss": 0.2506, + "step": 3647 + }, + { + "epoch": 2.00329489291598, + "grad_norm": 0.5158759558335935, + "learning_rate": 2.531834540635473e-07, + "loss": 0.2503, + "step": 3648 + }, + { + "epoch": 2.0038440417353103, + "grad_norm": 0.5547007620685842, + "learning_rate": 2.5293094302257757e-07, + "loss": 0.2736, + "step": 3649 + }, + { + "epoch": 2.0043931905546404, + "grad_norm": 0.5083564571381022, + "learning_rate": 2.5267851537518517e-07, + "loss": 0.2623, + "step": 3650 + }, + { + "epoch": 2.0049423393739705, + "grad_norm": 0.4403522819862046, + "learning_rate": 2.524261712065553e-07, + "loss": 0.2615, + "step": 3651 + }, + { + "epoch": 2.0054914881933006, + "grad_norm": 0.4358984123669056, + "learning_rate": 2.5217391060184514e-07, + "loss": 0.2255, + "step": 3652 + }, + { + "epoch": 2.0060406370126302, + "grad_norm": 0.45244034398901245, + "learning_rate": 2.5192173364618305e-07, + "loss": 0.2241, + "step": 3653 + }, + { + "epoch": 2.0065897858319603, + "grad_norm": 0.4788176702233535, + "learning_rate": 2.5166964042466933e-07, + "loss": 0.268, + "step": 3654 + }, + { + "epoch": 2.0071389346512905, + "grad_norm": 0.4612233064431962, + "learning_rate": 2.514176310223765e-07, + "loss": 0.2261, + "step": 3655 + }, + { + "epoch": 2.0076880834706206, + "grad_norm": 0.5659295130333686, + "learning_rate": 2.5116570552434815e-07, + "loss": 0.2846, + "step": 3656 + }, + { + "epoch": 2.0082372322899507, + "grad_norm": 0.4493441227221307, + "learning_rate": 2.5091386401559986e-07, + "loss": 0.2796, + "step": 3657 + }, + { + "epoch": 2.0087863811092808, + "grad_norm": 0.5306724533364384, + "learning_rate": 2.5066210658111925e-07, + "loss": 0.2283, + "step": 3658 + }, + { + "epoch": 2.009335529928611, + "grad_norm": 0.4661457411455569, + "learning_rate": 2.504104333058649e-07, + "loss": 0.2593, + "step": 3659 + }, + { + "epoch": 2.0098846787479405, + "grad_norm": 0.4706345804215834, + "learning_rate": 2.50158844274767e-07, + "loss": 0.222, + "step": 3660 + }, + { + "epoch": 2.0104338275672706, + "grad_norm": 0.36380555660343594, + "learning_rate": 2.499073395727279e-07, + "loss": 0.2238, + "step": 3661 + }, + { + "epoch": 2.0109829763866007, + "grad_norm": 0.514270903659223, + "learning_rate": 2.4965591928462133e-07, + "loss": 0.2471, + "step": 3662 + }, + { + "epoch": 2.011532125205931, + "grad_norm": 0.4595319827290595, + "learning_rate": 2.49404583495292e-07, + "loss": 0.2211, + "step": 3663 + }, + { + "epoch": 2.012081274025261, + "grad_norm": 0.5108482565124504, + "learning_rate": 2.491533322895568e-07, + "loss": 0.2179, + "step": 3664 + }, + { + "epoch": 2.012630422844591, + "grad_norm": 0.4418248432507296, + "learning_rate": 2.4890216575220346e-07, + "loss": 0.1961, + "step": 3665 + }, + { + "epoch": 2.013179571663921, + "grad_norm": 0.5602175894663437, + "learning_rate": 2.486510839679917e-07, + "loss": 0.2121, + "step": 3666 + }, + { + "epoch": 2.013728720483251, + "grad_norm": 0.479482566881761, + "learning_rate": 2.484000870216521e-07, + "loss": 0.2356, + "step": 3667 + }, + { + "epoch": 2.014277869302581, + "grad_norm": 0.5451192328961294, + "learning_rate": 2.48149174997887e-07, + "loss": 0.2199, + "step": 3668 + }, + { + "epoch": 2.014827018121911, + "grad_norm": 0.4751755961725391, + "learning_rate": 2.4789834798137023e-07, + "loss": 0.2187, + "step": 3669 + }, + { + "epoch": 2.015376166941241, + "grad_norm": 0.4247030647692184, + "learning_rate": 2.476476060567464e-07, + "loss": 0.2448, + "step": 3670 + }, + { + "epoch": 2.015925315760571, + "grad_norm": 0.4889571587715578, + "learning_rate": 2.4739694930863154e-07, + "loss": 0.2401, + "step": 3671 + }, + { + "epoch": 2.0164744645799013, + "grad_norm": 0.5445404840598219, + "learning_rate": 2.471463778216134e-07, + "loss": 0.2522, + "step": 3672 + }, + { + "epoch": 2.017023613399231, + "grad_norm": 0.4251242325513694, + "learning_rate": 2.4689589168025025e-07, + "loss": 0.2479, + "step": 3673 + }, + { + "epoch": 2.017572762218561, + "grad_norm": 0.49828441205774543, + "learning_rate": 2.466454909690722e-07, + "loss": 0.2422, + "step": 3674 + }, + { + "epoch": 2.018121911037891, + "grad_norm": 0.42696144251965235, + "learning_rate": 2.463951757725804e-07, + "loss": 0.2272, + "step": 3675 + }, + { + "epoch": 2.0186710598572213, + "grad_norm": 0.5634285239560771, + "learning_rate": 2.461449461752468e-07, + "loss": 0.2193, + "step": 3676 + }, + { + "epoch": 2.0192202086765514, + "grad_norm": 0.5673945137309203, + "learning_rate": 2.458948022615144e-07, + "loss": 0.2329, + "step": 3677 + }, + { + "epoch": 2.0197693574958815, + "grad_norm": 0.5791208428472995, + "learning_rate": 2.456447441157979e-07, + "loss": 0.2709, + "step": 3678 + }, + { + "epoch": 2.0203185063152116, + "grad_norm": 0.521930087551579, + "learning_rate": 2.453947718224829e-07, + "loss": 0.249, + "step": 3679 + }, + { + "epoch": 2.0208676551345413, + "grad_norm": 0.4568367999451592, + "learning_rate": 2.4514488546592537e-07, + "loss": 0.2173, + "step": 3680 + }, + { + "epoch": 2.0214168039538714, + "grad_norm": 0.4385463580231092, + "learning_rate": 2.448950851304531e-07, + "loss": 0.1912, + "step": 3681 + }, + { + "epoch": 2.0219659527732015, + "grad_norm": 0.5235594459331956, + "learning_rate": 2.446453709003643e-07, + "loss": 0.2277, + "step": 3682 + }, + { + "epoch": 2.0225151015925316, + "grad_norm": 0.45791817816673974, + "learning_rate": 2.443957428599285e-07, + "loss": 0.2309, + "step": 3683 + }, + { + "epoch": 2.0230642504118617, + "grad_norm": 0.4945584772320004, + "learning_rate": 2.441462010933857e-07, + "loss": 0.2668, + "step": 3684 + }, + { + "epoch": 2.0236133992311918, + "grad_norm": 0.499310769481328, + "learning_rate": 2.4389674568494716e-07, + "loss": 0.2126, + "step": 3685 + }, + { + "epoch": 2.024162548050522, + "grad_norm": 0.524986527507172, + "learning_rate": 2.436473767187954e-07, + "loss": 0.2744, + "step": 3686 + }, + { + "epoch": 2.0247116968698515, + "grad_norm": 0.47052143792597523, + "learning_rate": 2.433980942790824e-07, + "loss": 0.2469, + "step": 3687 + }, + { + "epoch": 2.0252608456891816, + "grad_norm": 0.5317884135434877, + "learning_rate": 2.431488984499322e-07, + "loss": 0.2246, + "step": 3688 + }, + { + "epoch": 2.0258099945085117, + "grad_norm": 0.5329157293653187, + "learning_rate": 2.428997893154393e-07, + "loss": 0.2407, + "step": 3689 + }, + { + "epoch": 2.026359143327842, + "grad_norm": 0.5024132645160868, + "learning_rate": 2.4265076695966873e-07, + "loss": 0.3134, + "step": 3690 + }, + { + "epoch": 2.026908292147172, + "grad_norm": 0.5931602083943923, + "learning_rate": 2.4240183146665636e-07, + "loss": 0.2293, + "step": 3691 + }, + { + "epoch": 2.027457440966502, + "grad_norm": 0.5750470369442647, + "learning_rate": 2.42152982920409e-07, + "loss": 0.2907, + "step": 3692 + }, + { + "epoch": 2.028006589785832, + "grad_norm": 0.5384362284972389, + "learning_rate": 2.4190422140490353e-07, + "loss": 0.289, + "step": 3693 + }, + { + "epoch": 2.028555738605162, + "grad_norm": 0.6641661829926486, + "learning_rate": 2.4165554700408784e-07, + "loss": 0.236, + "step": 3694 + }, + { + "epoch": 2.029104887424492, + "grad_norm": 0.5166578794021188, + "learning_rate": 2.414069598018804e-07, + "loss": 0.2813, + "step": 3695 + }, + { + "epoch": 2.029654036243822, + "grad_norm": 0.5265678694848518, + "learning_rate": 2.4115845988217057e-07, + "loss": 0.2142, + "step": 3696 + }, + { + "epoch": 2.030203185063152, + "grad_norm": 0.47410386001414817, + "learning_rate": 2.409100473288175e-07, + "loss": 0.2498, + "step": 3697 + }, + { + "epoch": 2.030752333882482, + "grad_norm": 0.48895341463331876, + "learning_rate": 2.4066172222565136e-07, + "loss": 0.2118, + "step": 3698 + }, + { + "epoch": 2.0313014827018123, + "grad_norm": 0.4741926194621475, + "learning_rate": 2.404134846564727e-07, + "loss": 0.2523, + "step": 3699 + }, + { + "epoch": 2.0318506315211424, + "grad_norm": 0.5511789681328455, + "learning_rate": 2.401653347050529e-07, + "loss": 0.2604, + "step": 3700 + }, + { + "epoch": 2.032399780340472, + "grad_norm": 0.48089572061100344, + "learning_rate": 2.3991727245513293e-07, + "loss": 0.2568, + "step": 3701 + }, + { + "epoch": 2.032948929159802, + "grad_norm": 0.6666944723346373, + "learning_rate": 2.3966929799042484e-07, + "loss": 0.2271, + "step": 3702 + }, + { + "epoch": 2.0334980779791323, + "grad_norm": 0.4396406877108257, + "learning_rate": 2.3942141139461136e-07, + "loss": 0.2485, + "step": 3703 + }, + { + "epoch": 2.0340472267984624, + "grad_norm": 0.40949006014845785, + "learning_rate": 2.391736127513443e-07, + "loss": 0.2624, + "step": 3704 + }, + { + "epoch": 2.0345963756177925, + "grad_norm": 0.5933709531190807, + "learning_rate": 2.389259021442469e-07, + "loss": 0.2334, + "step": 3705 + }, + { + "epoch": 2.0351455244371226, + "grad_norm": 0.4982454202896799, + "learning_rate": 2.3867827965691256e-07, + "loss": 0.2179, + "step": 3706 + }, + { + "epoch": 2.0356946732564527, + "grad_norm": 0.5901651404969978, + "learning_rate": 2.3843074537290435e-07, + "loss": 0.2415, + "step": 3707 + }, + { + "epoch": 2.0362438220757824, + "grad_norm": 0.5306472157925395, + "learning_rate": 2.381832993757564e-07, + "loss": 0.2338, + "step": 3708 + }, + { + "epoch": 2.0367929708951125, + "grad_norm": 0.6982664677160196, + "learning_rate": 2.3793594174897228e-07, + "loss": 0.3125, + "step": 3709 + }, + { + "epoch": 2.0373421197144426, + "grad_norm": 0.53708927366282, + "learning_rate": 2.3768867257602638e-07, + "loss": 0.2446, + "step": 3710 + }, + { + "epoch": 2.0378912685337727, + "grad_norm": 0.5591794106420714, + "learning_rate": 2.3744149194036255e-07, + "loss": 0.2632, + "step": 3711 + }, + { + "epoch": 2.0384404173531028, + "grad_norm": 0.5474616318706089, + "learning_rate": 2.3719439992539537e-07, + "loss": 0.2802, + "step": 3712 + }, + { + "epoch": 2.038989566172433, + "grad_norm": 0.4685594000787345, + "learning_rate": 2.3694739661450942e-07, + "loss": 0.2537, + "step": 3713 + }, + { + "epoch": 2.039538714991763, + "grad_norm": 0.5497578317726087, + "learning_rate": 2.3670048209105916e-07, + "loss": 0.2472, + "step": 3714 + }, + { + "epoch": 2.0400878638110926, + "grad_norm": 0.4759317287317202, + "learning_rate": 2.3645365643836883e-07, + "loss": 0.2422, + "step": 3715 + }, + { + "epoch": 2.0406370126304227, + "grad_norm": 0.543921832696685, + "learning_rate": 2.362069197397333e-07, + "loss": 0.2381, + "step": 3716 + }, + { + "epoch": 2.041186161449753, + "grad_norm": 0.6741609031094414, + "learning_rate": 2.3596027207841718e-07, + "loss": 0.293, + "step": 3717 + }, + { + "epoch": 2.041735310269083, + "grad_norm": 0.5048027411023959, + "learning_rate": 2.3571371353765465e-07, + "loss": 0.2727, + "step": 3718 + }, + { + "epoch": 2.042284459088413, + "grad_norm": 0.4948460004611238, + "learning_rate": 2.3546724420065052e-07, + "loss": 0.2934, + "step": 3719 + }, + { + "epoch": 2.042833607907743, + "grad_norm": 0.47749876201506164, + "learning_rate": 2.3522086415057892e-07, + "loss": 0.2441, + "step": 3720 + }, + { + "epoch": 2.0433827567270733, + "grad_norm": 0.4855937909171848, + "learning_rate": 2.3497457347058383e-07, + "loss": 0.264, + "step": 3721 + }, + { + "epoch": 2.043931905546403, + "grad_norm": 0.5566052795249073, + "learning_rate": 2.347283722437795e-07, + "loss": 0.2005, + "step": 3722 + }, + { + "epoch": 2.044481054365733, + "grad_norm": 0.49769009933463193, + "learning_rate": 2.3448226055324988e-07, + "loss": 0.2423, + "step": 3723 + }, + { + "epoch": 2.045030203185063, + "grad_norm": 0.45765774246114027, + "learning_rate": 2.3423623848204838e-07, + "loss": 0.227, + "step": 3724 + }, + { + "epoch": 2.0455793520043932, + "grad_norm": 0.4684956250690747, + "learning_rate": 2.339903061131986e-07, + "loss": 0.2497, + "step": 3725 + }, + { + "epoch": 2.0461285008237233, + "grad_norm": 0.4850885699692227, + "learning_rate": 2.3374446352969334e-07, + "loss": 0.2531, + "step": 3726 + }, + { + "epoch": 2.0466776496430534, + "grad_norm": 0.5762772916999483, + "learning_rate": 2.3349871081449584e-07, + "loss": 0.2483, + "step": 3727 + }, + { + "epoch": 2.047226798462383, + "grad_norm": 0.9858405558390863, + "learning_rate": 2.3325304805053813e-07, + "loss": 0.3359, + "step": 3728 + }, + { + "epoch": 2.047775947281713, + "grad_norm": 0.5405403754986029, + "learning_rate": 2.3300747532072259e-07, + "loss": 0.2844, + "step": 3729 + }, + { + "epoch": 2.0483250961010433, + "grad_norm": 0.5968157813374904, + "learning_rate": 2.3276199270792115e-07, + "loss": 0.2556, + "step": 3730 + }, + { + "epoch": 2.0488742449203734, + "grad_norm": 0.4559604562747986, + "learning_rate": 2.3251660029497493e-07, + "loss": 0.2613, + "step": 3731 + }, + { + "epoch": 2.0494233937397035, + "grad_norm": 0.6698311755335101, + "learning_rate": 2.3227129816469465e-07, + "loss": 0.3045, + "step": 3732 + }, + { + "epoch": 2.0499725425590336, + "grad_norm": 0.4872289244983262, + "learning_rate": 2.3202608639986094e-07, + "loss": 0.2128, + "step": 3733 + }, + { + "epoch": 2.0505216913783637, + "grad_norm": 0.5772283582422382, + "learning_rate": 2.3178096508322396e-07, + "loss": 0.2739, + "step": 3734 + }, + { + "epoch": 2.0510708401976934, + "grad_norm": 0.584930686828448, + "learning_rate": 2.3153593429750263e-07, + "loss": 0.2598, + "step": 3735 + }, + { + "epoch": 2.0516199890170235, + "grad_norm": 0.47094955435821995, + "learning_rate": 2.3129099412538632e-07, + "loss": 0.2283, + "step": 3736 + }, + { + "epoch": 2.0521691378363536, + "grad_norm": 0.45991486335144016, + "learning_rate": 2.310461446495331e-07, + "loss": 0.2389, + "step": 3737 + }, + { + "epoch": 2.0527182866556837, + "grad_norm": 0.57405142005613, + "learning_rate": 2.3080138595257034e-07, + "loss": 0.2794, + "step": 3738 + }, + { + "epoch": 2.053267435475014, + "grad_norm": 0.4832220333411837, + "learning_rate": 2.3055671811709545e-07, + "loss": 0.2083, + "step": 3739 + }, + { + "epoch": 2.053816584294344, + "grad_norm": 0.4814050010028005, + "learning_rate": 2.303121412256749e-07, + "loss": 0.2296, + "step": 3740 + }, + { + "epoch": 2.054365733113674, + "grad_norm": 0.47238033740331364, + "learning_rate": 2.3006765536084415e-07, + "loss": 0.2336, + "step": 3741 + }, + { + "epoch": 2.0549148819330036, + "grad_norm": 0.48366892342414747, + "learning_rate": 2.298232606051081e-07, + "loss": 0.2692, + "step": 3742 + }, + { + "epoch": 2.0554640307523337, + "grad_norm": 0.4576644957872836, + "learning_rate": 2.2957895704094107e-07, + "loss": 0.2244, + "step": 3743 + }, + { + "epoch": 2.056013179571664, + "grad_norm": 0.544131458687228, + "learning_rate": 2.2933474475078672e-07, + "loss": 0.2591, + "step": 3744 + }, + { + "epoch": 2.056562328390994, + "grad_norm": 0.52715649908787, + "learning_rate": 2.2909062381705738e-07, + "loss": 0.2651, + "step": 3745 + }, + { + "epoch": 2.057111477210324, + "grad_norm": 0.6577435472712102, + "learning_rate": 2.28846594322135e-07, + "loss": 0.2148, + "step": 3746 + }, + { + "epoch": 2.057660626029654, + "grad_norm": 0.49543446218548004, + "learning_rate": 2.286026563483707e-07, + "loss": 0.2645, + "step": 3747 + }, + { + "epoch": 2.0582097748489843, + "grad_norm": 0.48652722294534334, + "learning_rate": 2.2835880997808452e-07, + "loss": 0.2218, + "step": 3748 + }, + { + "epoch": 2.058758923668314, + "grad_norm": 0.5418631667098508, + "learning_rate": 2.2811505529356525e-07, + "loss": 0.2675, + "step": 3749 + }, + { + "epoch": 2.059308072487644, + "grad_norm": 0.5638116910798441, + "learning_rate": 2.2787139237707142e-07, + "loss": 0.2651, + "step": 3750 + }, + { + "epoch": 2.059857221306974, + "grad_norm": 0.47532490014027007, + "learning_rate": 2.276278213108305e-07, + "loss": 0.2527, + "step": 3751 + }, + { + "epoch": 2.0604063701263042, + "grad_norm": 0.5046365348091819, + "learning_rate": 2.2738434217703845e-07, + "loss": 0.2678, + "step": 3752 + }, + { + "epoch": 2.0609555189456343, + "grad_norm": 0.4684628862107452, + "learning_rate": 2.2714095505786043e-07, + "loss": 0.2808, + "step": 3753 + }, + { + "epoch": 2.0615046677649644, + "grad_norm": 0.49971409070025663, + "learning_rate": 2.2689766003543092e-07, + "loss": 0.266, + "step": 3754 + }, + { + "epoch": 2.0620538165842945, + "grad_norm": 0.5453090647639149, + "learning_rate": 2.266544571918527e-07, + "loss": 0.2372, + "step": 3755 + }, + { + "epoch": 2.062602965403624, + "grad_norm": 0.46607772246298784, + "learning_rate": 2.2641134660919794e-07, + "loss": 0.2122, + "step": 3756 + }, + { + "epoch": 2.0631521142229543, + "grad_norm": 0.5628090209722687, + "learning_rate": 2.2616832836950768e-07, + "loss": 0.24, + "step": 3757 + }, + { + "epoch": 2.0637012630422844, + "grad_norm": 0.4282866311731098, + "learning_rate": 2.2592540255479147e-07, + "loss": 0.2517, + "step": 3758 + }, + { + "epoch": 2.0642504118616145, + "grad_norm": 0.6044630348501255, + "learning_rate": 2.256825692470276e-07, + "loss": 0.2344, + "step": 3759 + }, + { + "epoch": 2.0647995606809446, + "grad_norm": 0.5774481563162135, + "learning_rate": 2.2543982852816358e-07, + "loss": 0.2576, + "step": 3760 + }, + { + "epoch": 2.0653487095002747, + "grad_norm": 0.45519867540968767, + "learning_rate": 2.2519718048011563e-07, + "loss": 0.2087, + "step": 3761 + }, + { + "epoch": 2.065897858319605, + "grad_norm": 0.5417368969411511, + "learning_rate": 2.2495462518476815e-07, + "loss": 0.2109, + "step": 3762 + }, + { + "epoch": 2.0664470071389345, + "grad_norm": 0.3908792725996102, + "learning_rate": 2.24712162723975e-07, + "loss": 0.2117, + "step": 3763 + }, + { + "epoch": 2.0669961559582646, + "grad_norm": 0.5074637351207244, + "learning_rate": 2.2446979317955798e-07, + "loss": 0.2663, + "step": 3764 + }, + { + "epoch": 2.0675453047775947, + "grad_norm": 0.48395573281234366, + "learning_rate": 2.2422751663330825e-07, + "loss": 0.2516, + "step": 3765 + }, + { + "epoch": 2.068094453596925, + "grad_norm": 0.50185767696027, + "learning_rate": 2.2398533316698473e-07, + "loss": 0.2139, + "step": 3766 + }, + { + "epoch": 2.068643602416255, + "grad_norm": 0.39385996109314064, + "learning_rate": 2.237432428623158e-07, + "loss": 0.2435, + "step": 3767 + }, + { + "epoch": 2.069192751235585, + "grad_norm": 0.5282952753837166, + "learning_rate": 2.23501245800998e-07, + "loss": 0.2591, + "step": 3768 + }, + { + "epoch": 2.069741900054915, + "grad_norm": 0.5051684722163543, + "learning_rate": 2.232593420646964e-07, + "loss": 0.2451, + "step": 3769 + }, + { + "epoch": 2.0702910488742448, + "grad_norm": 0.5759166501310068, + "learning_rate": 2.2301753173504435e-07, + "loss": 0.2071, + "step": 3770 + }, + { + "epoch": 2.070840197693575, + "grad_norm": 0.5554354398589165, + "learning_rate": 2.2277581489364427e-07, + "loss": 0.2418, + "step": 3771 + }, + { + "epoch": 2.071389346512905, + "grad_norm": 0.6074187367943809, + "learning_rate": 2.225341916220664e-07, + "loss": 0.2464, + "step": 3772 + }, + { + "epoch": 2.071938495332235, + "grad_norm": 0.5100792478591916, + "learning_rate": 2.2229266200184982e-07, + "loss": 0.2188, + "step": 3773 + }, + { + "epoch": 2.072487644151565, + "grad_norm": 0.4474649241624674, + "learning_rate": 2.2205122611450203e-07, + "loss": 0.2158, + "step": 3774 + }, + { + "epoch": 2.0730367929708953, + "grad_norm": 0.5056464396208762, + "learning_rate": 2.2180988404149858e-07, + "loss": 0.2181, + "step": 3775 + }, + { + "epoch": 2.073585941790225, + "grad_norm": 0.4524101987449746, + "learning_rate": 2.2156863586428345e-07, + "loss": 0.2192, + "step": 3776 + }, + { + "epoch": 2.074135090609555, + "grad_norm": 0.4821130558052258, + "learning_rate": 2.213274816642691e-07, + "loss": 0.2227, + "step": 3777 + }, + { + "epoch": 2.074684239428885, + "grad_norm": 0.5370172777596177, + "learning_rate": 2.2108642152283632e-07, + "loss": 0.2448, + "step": 3778 + }, + { + "epoch": 2.0752333882482152, + "grad_norm": 0.49649226010859776, + "learning_rate": 2.2084545552133377e-07, + "loss": 0.2188, + "step": 3779 + }, + { + "epoch": 2.0757825370675453, + "grad_norm": 0.5363380279314732, + "learning_rate": 2.2060458374107887e-07, + "loss": 0.2503, + "step": 3780 + }, + { + "epoch": 2.0763316858868754, + "grad_norm": 0.5222857862752548, + "learning_rate": 2.203638062633567e-07, + "loss": 0.2589, + "step": 3781 + }, + { + "epoch": 2.0768808347062055, + "grad_norm": 0.4330076218078724, + "learning_rate": 2.2012312316942114e-07, + "loss": 0.1966, + "step": 3782 + }, + { + "epoch": 2.077429983525535, + "grad_norm": 0.41076589476972525, + "learning_rate": 2.1988253454049338e-07, + "loss": 0.2269, + "step": 3783 + }, + { + "epoch": 2.0779791323448653, + "grad_norm": 0.4842394243581011, + "learning_rate": 2.1964204045776354e-07, + "loss": 0.2263, + "step": 3784 + }, + { + "epoch": 2.0785282811641954, + "grad_norm": 0.5536101759418829, + "learning_rate": 2.1940164100238987e-07, + "loss": 0.2244, + "step": 3785 + }, + { + "epoch": 2.0790774299835255, + "grad_norm": 0.5055241417302644, + "learning_rate": 2.1916133625549752e-07, + "loss": 0.2547, + "step": 3786 + }, + { + "epoch": 2.0796265788028556, + "grad_norm": 0.6893800562974802, + "learning_rate": 2.189211262981809e-07, + "loss": 0.2134, + "step": 3787 + }, + { + "epoch": 2.0801757276221857, + "grad_norm": 0.45605780828887804, + "learning_rate": 2.1868101121150215e-07, + "loss": 0.2271, + "step": 3788 + }, + { + "epoch": 2.080724876441516, + "grad_norm": 0.5305595435770265, + "learning_rate": 2.1844099107649098e-07, + "loss": 0.2451, + "step": 3789 + }, + { + "epoch": 2.0812740252608455, + "grad_norm": 0.4558087135623228, + "learning_rate": 2.1820106597414552e-07, + "loss": 0.2343, + "step": 3790 + }, + { + "epoch": 2.0818231740801756, + "grad_norm": 0.4866574220451204, + "learning_rate": 2.1796123598543176e-07, + "loss": 0.2667, + "step": 3791 + }, + { + "epoch": 2.0823723228995057, + "grad_norm": 0.4521099309667922, + "learning_rate": 2.1772150119128337e-07, + "loss": 0.2898, + "step": 3792 + }, + { + "epoch": 2.082921471718836, + "grad_norm": 0.47755378233209367, + "learning_rate": 2.1748186167260182e-07, + "loss": 0.2671, + "step": 3793 + }, + { + "epoch": 2.083470620538166, + "grad_norm": 0.5074995119966317, + "learning_rate": 2.1724231751025682e-07, + "loss": 0.2196, + "step": 3794 + }, + { + "epoch": 2.084019769357496, + "grad_norm": 0.5878029375941489, + "learning_rate": 2.1700286878508575e-07, + "loss": 0.2571, + "step": 3795 + }, + { + "epoch": 2.084568918176826, + "grad_norm": 0.3962019074412491, + "learning_rate": 2.1676351557789374e-07, + "loss": 0.2601, + "step": 3796 + }, + { + "epoch": 2.0851180669961558, + "grad_norm": 0.5069977074525809, + "learning_rate": 2.1652425796945342e-07, + "loss": 0.2313, + "step": 3797 + }, + { + "epoch": 2.085667215815486, + "grad_norm": 0.48424608577258116, + "learning_rate": 2.1628509604050555e-07, + "loss": 0.2629, + "step": 3798 + }, + { + "epoch": 2.086216364634816, + "grad_norm": 0.43422893700090626, + "learning_rate": 2.1604602987175869e-07, + "loss": 0.2253, + "step": 3799 + }, + { + "epoch": 2.086765513454146, + "grad_norm": 0.4378541739451928, + "learning_rate": 2.1580705954388853e-07, + "loss": 0.2195, + "step": 3800 + }, + { + "epoch": 2.086765513454146, + "eval_loss": 0.32376018166542053, + "eval_runtime": 18.6635, + "eval_samples_per_second": 23.736, + "eval_steps_per_second": 1.018, + "step": 3800 + }, + { + "epoch": 2.087314662273476, + "grad_norm": 0.44835065325415924, + "learning_rate": 2.155681851375389e-07, + "loss": 0.2464, + "step": 3801 + }, + { + "epoch": 2.0878638110928063, + "grad_norm": 0.4203034524653535, + "learning_rate": 2.1532940673332145e-07, + "loss": 0.2427, + "step": 3802 + }, + { + "epoch": 2.0884129599121364, + "grad_norm": 0.5156345649781823, + "learning_rate": 2.150907244118144e-07, + "loss": 0.1809, + "step": 3803 + }, + { + "epoch": 2.088962108731466, + "grad_norm": 0.4733299234827269, + "learning_rate": 2.1485213825356465e-07, + "loss": 0.2605, + "step": 3804 + }, + { + "epoch": 2.089511257550796, + "grad_norm": 0.43428342259449787, + "learning_rate": 2.1461364833908639e-07, + "loss": 0.2136, + "step": 3805 + }, + { + "epoch": 2.0900604063701262, + "grad_norm": 0.4493739008877683, + "learning_rate": 2.1437525474886072e-07, + "loss": 0.2291, + "step": 3806 + }, + { + "epoch": 2.0906095551894563, + "grad_norm": 0.49859453829732686, + "learning_rate": 2.1413695756333722e-07, + "loss": 0.2744, + "step": 3807 + }, + { + "epoch": 2.0911587040087865, + "grad_norm": 0.4666041875816198, + "learning_rate": 2.138987568629319e-07, + "loss": 0.2166, + "step": 3808 + }, + { + "epoch": 2.0917078528281166, + "grad_norm": 0.47040347298971286, + "learning_rate": 2.1366065272802916e-07, + "loss": 0.2462, + "step": 3809 + }, + { + "epoch": 2.0922570016474467, + "grad_norm": 0.5370374026317133, + "learning_rate": 2.1342264523898002e-07, + "loss": 0.2566, + "step": 3810 + }, + { + "epoch": 2.0928061504667763, + "grad_norm": 0.43451435409501626, + "learning_rate": 2.131847344761034e-07, + "loss": 0.2608, + "step": 3811 + }, + { + "epoch": 2.0933552992861064, + "grad_norm": 0.48277301974793396, + "learning_rate": 2.129469205196856e-07, + "loss": 0.2288, + "step": 3812 + }, + { + "epoch": 2.0939044481054365, + "grad_norm": 0.5986344646388713, + "learning_rate": 2.1270920344997992e-07, + "loss": 0.2645, + "step": 3813 + }, + { + "epoch": 2.0944535969247666, + "grad_norm": 0.5817618984398569, + "learning_rate": 2.1247158334720682e-07, + "loss": 0.2352, + "step": 3814 + }, + { + "epoch": 2.0950027457440967, + "grad_norm": 0.49920736071095206, + "learning_rate": 2.1223406029155464e-07, + "loss": 0.2961, + "step": 3815 + }, + { + "epoch": 2.095551894563427, + "grad_norm": 0.5042657754474339, + "learning_rate": 2.119966343631788e-07, + "loss": 0.2604, + "step": 3816 + }, + { + "epoch": 2.096101043382757, + "grad_norm": 0.6238536057071339, + "learning_rate": 2.117593056422014e-07, + "loss": 0.223, + "step": 3817 + }, + { + "epoch": 2.0966501922020866, + "grad_norm": 0.48103575923231945, + "learning_rate": 2.1152207420871258e-07, + "loss": 0.2703, + "step": 3818 + }, + { + "epoch": 2.0971993410214167, + "grad_norm": 0.4944920323125504, + "learning_rate": 2.1128494014276896e-07, + "loss": 0.2668, + "step": 3819 + }, + { + "epoch": 2.097748489840747, + "grad_norm": 1.1698406483269248, + "learning_rate": 2.1104790352439438e-07, + "loss": 0.4154, + "step": 3820 + }, + { + "epoch": 2.098297638660077, + "grad_norm": 0.5817018983620151, + "learning_rate": 2.1081096443358012e-07, + "loss": 0.2501, + "step": 3821 + }, + { + "epoch": 2.098846787479407, + "grad_norm": 0.5829269175913295, + "learning_rate": 2.105741229502847e-07, + "loss": 0.2451, + "step": 3822 + }, + { + "epoch": 2.099395936298737, + "grad_norm": 0.54771538874892, + "learning_rate": 2.103373791544329e-07, + "loss": 0.246, + "step": 3823 + }, + { + "epoch": 2.099945085118067, + "grad_norm": 0.4884146239005686, + "learning_rate": 2.1010073312591745e-07, + "loss": 0.2794, + "step": 3824 + }, + { + "epoch": 2.100494233937397, + "grad_norm": 0.4313857458398654, + "learning_rate": 2.0986418494459728e-07, + "loss": 0.2601, + "step": 3825 + }, + { + "epoch": 2.101043382756727, + "grad_norm": 0.5389038758455438, + "learning_rate": 2.096277346902991e-07, + "loss": 0.2417, + "step": 3826 + }, + { + "epoch": 2.101592531576057, + "grad_norm": 0.44093859615205977, + "learning_rate": 2.0939138244281573e-07, + "loss": 0.2352, + "step": 3827 + }, + { + "epoch": 2.102141680395387, + "grad_norm": 0.5182439145384664, + "learning_rate": 2.0915512828190753e-07, + "loss": 0.2662, + "step": 3828 + }, + { + "epoch": 2.1026908292147173, + "grad_norm": 0.46981164250174345, + "learning_rate": 2.0891897228730185e-07, + "loss": 0.249, + "step": 3829 + }, + { + "epoch": 2.1032399780340474, + "grad_norm": 0.5498105127851545, + "learning_rate": 2.0868291453869236e-07, + "loss": 0.2696, + "step": 3830 + }, + { + "epoch": 2.1037891268533775, + "grad_norm": 0.5089161702593171, + "learning_rate": 2.084469551157397e-07, + "loss": 0.2374, + "step": 3831 + }, + { + "epoch": 2.104338275672707, + "grad_norm": 0.5217421166675897, + "learning_rate": 2.082110940980717e-07, + "loss": 0.2213, + "step": 3832 + }, + { + "epoch": 2.1048874244920373, + "grad_norm": 0.4910412197627618, + "learning_rate": 2.0797533156528289e-07, + "loss": 0.2286, + "step": 3833 + }, + { + "epoch": 2.1054365733113674, + "grad_norm": 0.4612081084092483, + "learning_rate": 2.0773966759693407e-07, + "loss": 0.2322, + "step": 3834 + }, + { + "epoch": 2.1059857221306975, + "grad_norm": 0.4837269622859846, + "learning_rate": 2.0750410227255355e-07, + "loss": 0.2537, + "step": 3835 + }, + { + "epoch": 2.1065348709500276, + "grad_norm": 0.49360714895182956, + "learning_rate": 2.0726863567163574e-07, + "loss": 0.2692, + "step": 3836 + }, + { + "epoch": 2.1070840197693577, + "grad_norm": 0.6477791973519866, + "learning_rate": 2.0703326787364184e-07, + "loss": 0.2729, + "step": 3837 + }, + { + "epoch": 2.1076331685886873, + "grad_norm": 0.4402285014292362, + "learning_rate": 2.0679799895799984e-07, + "loss": 0.2245, + "step": 3838 + }, + { + "epoch": 2.1081823174080174, + "grad_norm": 0.4973315046775512, + "learning_rate": 2.0656282900410465e-07, + "loss": 0.2357, + "step": 3839 + }, + { + "epoch": 2.1087314662273475, + "grad_norm": 0.479704554789677, + "learning_rate": 2.0632775809131726e-07, + "loss": 0.2585, + "step": 3840 + }, + { + "epoch": 2.1092806150466776, + "grad_norm": 0.4869899078769164, + "learning_rate": 2.0609278629896518e-07, + "loss": 0.2636, + "step": 3841 + }, + { + "epoch": 2.1098297638660077, + "grad_norm": 0.5585917548592232, + "learning_rate": 2.05857913706343e-07, + "loss": 0.2112, + "step": 3842 + }, + { + "epoch": 2.110378912685338, + "grad_norm": 0.710217625113275, + "learning_rate": 2.056231403927117e-07, + "loss": 0.244, + "step": 3843 + }, + { + "epoch": 2.110928061504668, + "grad_norm": 0.39836715100691994, + "learning_rate": 2.053884664372983e-07, + "loss": 0.2284, + "step": 3844 + }, + { + "epoch": 2.1114772103239976, + "grad_norm": 0.4458689999105017, + "learning_rate": 2.0515389191929678e-07, + "loss": 0.2548, + "step": 3845 + }, + { + "epoch": 2.1120263591433277, + "grad_norm": 0.5247679999941065, + "learning_rate": 2.0491941691786757e-07, + "loss": 0.2369, + "step": 3846 + }, + { + "epoch": 2.112575507962658, + "grad_norm": 0.5087362219602194, + "learning_rate": 2.0468504151213714e-07, + "loss": 0.2411, + "step": 3847 + }, + { + "epoch": 2.113124656781988, + "grad_norm": 0.5468988295400588, + "learning_rate": 2.0445076578119845e-07, + "loss": 0.2032, + "step": 3848 + }, + { + "epoch": 2.113673805601318, + "grad_norm": 0.47950365843329795, + "learning_rate": 2.0421658980411106e-07, + "loss": 0.2104, + "step": 3849 + }, + { + "epoch": 2.114222954420648, + "grad_norm": 0.3730845942111509, + "learning_rate": 2.0398251365990088e-07, + "loss": 0.2192, + "step": 3850 + }, + { + "epoch": 2.114772103239978, + "grad_norm": 0.45984650798458504, + "learning_rate": 2.0374853742755986e-07, + "loss": 0.2162, + "step": 3851 + }, + { + "epoch": 2.115321252059308, + "grad_norm": 0.4534019289876038, + "learning_rate": 2.035146611860462e-07, + "loss": 0.2434, + "step": 3852 + }, + { + "epoch": 2.115870400878638, + "grad_norm": 0.46751695961336376, + "learning_rate": 2.0328088501428477e-07, + "loss": 0.233, + "step": 3853 + }, + { + "epoch": 2.116419549697968, + "grad_norm": 0.4765868265720958, + "learning_rate": 2.0304720899116616e-07, + "loss": 0.2156, + "step": 3854 + }, + { + "epoch": 2.116968698517298, + "grad_norm": 0.4621921873096323, + "learning_rate": 2.0281363319554756e-07, + "loss": 0.2551, + "step": 3855 + }, + { + "epoch": 2.1175178473366283, + "grad_norm": 0.5497337891261651, + "learning_rate": 2.0258015770625238e-07, + "loss": 0.2263, + "step": 3856 + }, + { + "epoch": 2.1180669961559584, + "grad_norm": 0.5482509629339043, + "learning_rate": 2.0234678260206976e-07, + "loss": 0.2734, + "step": 3857 + }, + { + "epoch": 2.1186161449752885, + "grad_norm": 0.4433565742325993, + "learning_rate": 2.0211350796175513e-07, + "loss": 0.2204, + "step": 3858 + }, + { + "epoch": 2.119165293794618, + "grad_norm": 0.5116139811620837, + "learning_rate": 2.018803338640302e-07, + "loss": 0.2278, + "step": 3859 + }, + { + "epoch": 2.1197144426139483, + "grad_norm": 0.48212694774789133, + "learning_rate": 2.0164726038758276e-07, + "loss": 0.2534, + "step": 3860 + }, + { + "epoch": 2.1202635914332784, + "grad_norm": 0.5388688753747408, + "learning_rate": 2.0141428761106629e-07, + "loss": 0.204, + "step": 3861 + }, + { + "epoch": 2.1208127402526085, + "grad_norm": 0.48245621013780615, + "learning_rate": 2.0118141561310085e-07, + "loss": 0.2798, + "step": 3862 + }, + { + "epoch": 2.1213618890719386, + "grad_norm": 0.4260629640231019, + "learning_rate": 2.0094864447227168e-07, + "loss": 0.239, + "step": 3863 + }, + { + "epoch": 2.1219110378912687, + "grad_norm": 0.5264671645008749, + "learning_rate": 2.0071597426713094e-07, + "loss": 0.2631, + "step": 3864 + }, + { + "epoch": 2.1224601867105988, + "grad_norm": 0.5708440471622848, + "learning_rate": 2.0048340507619593e-07, + "loss": 0.2464, + "step": 3865 + }, + { + "epoch": 2.1230093355299284, + "grad_norm": 0.5607639535724869, + "learning_rate": 2.002509369779502e-07, + "loss": 0.2247, + "step": 3866 + }, + { + "epoch": 2.1235584843492585, + "grad_norm": 0.4672443926232953, + "learning_rate": 2.0001857005084348e-07, + "loss": 0.2548, + "step": 3867 + }, + { + "epoch": 2.1241076331685886, + "grad_norm": 0.3878676680814944, + "learning_rate": 1.9978630437329086e-07, + "loss": 0.2814, + "step": 3868 + }, + { + "epoch": 2.1246567819879187, + "grad_norm": 0.5503157690141373, + "learning_rate": 1.9955414002367327e-07, + "loss": 0.2204, + "step": 3869 + }, + { + "epoch": 2.125205930807249, + "grad_norm": 0.5181351304480719, + "learning_rate": 1.9932207708033785e-07, + "loss": 0.2632, + "step": 3870 + }, + { + "epoch": 2.125755079626579, + "grad_norm": 0.5040983743257278, + "learning_rate": 1.990901156215971e-07, + "loss": 0.2393, + "step": 3871 + }, + { + "epoch": 2.1263042284459086, + "grad_norm": 0.39817991322482477, + "learning_rate": 1.988582557257296e-07, + "loss": 0.2627, + "step": 3872 + }, + { + "epoch": 2.1268533772652387, + "grad_norm": 0.5551303017650082, + "learning_rate": 1.9862649747097967e-07, + "loss": 0.2556, + "step": 3873 + }, + { + "epoch": 2.127402526084569, + "grad_norm": 0.4220738732677619, + "learning_rate": 1.9839484093555707e-07, + "loss": 0.2284, + "step": 3874 + }, + { + "epoch": 2.127951674903899, + "grad_norm": 0.47052582757112177, + "learning_rate": 1.9816328619763706e-07, + "loss": 0.2609, + "step": 3875 + }, + { + "epoch": 2.128500823723229, + "grad_norm": 0.4152980893138041, + "learning_rate": 1.979318333353611e-07, + "loss": 0.2511, + "step": 3876 + }, + { + "epoch": 2.129049972542559, + "grad_norm": 0.49090859663218683, + "learning_rate": 1.9770048242683616e-07, + "loss": 0.2538, + "step": 3877 + }, + { + "epoch": 2.1295991213618892, + "grad_norm": 0.48114834075147034, + "learning_rate": 1.9746923355013425e-07, + "loss": 0.2674, + "step": 3878 + }, + { + "epoch": 2.130148270181219, + "grad_norm": 0.503727489866788, + "learning_rate": 1.9723808678329377e-07, + "loss": 0.2595, + "step": 3879 + }, + { + "epoch": 2.130697419000549, + "grad_norm": 0.5008553988693354, + "learning_rate": 1.9700704220431785e-07, + "loss": 0.2448, + "step": 3880 + }, + { + "epoch": 2.131246567819879, + "grad_norm": 0.49410063409296384, + "learning_rate": 1.967760998911759e-07, + "loss": 0.2815, + "step": 3881 + }, + { + "epoch": 2.131795716639209, + "grad_norm": 0.497035474782331, + "learning_rate": 1.9654525992180203e-07, + "loss": 0.2366, + "step": 3882 + }, + { + "epoch": 2.1323448654585393, + "grad_norm": 0.5782493695908826, + "learning_rate": 1.9631452237409648e-07, + "loss": 0.2298, + "step": 3883 + }, + { + "epoch": 2.1328940142778694, + "grad_norm": 0.4393680726477153, + "learning_rate": 1.9608388732592495e-07, + "loss": 0.285, + "step": 3884 + }, + { + "epoch": 2.1334431630971995, + "grad_norm": 0.4928816193499844, + "learning_rate": 1.9585335485511763e-07, + "loss": 0.2413, + "step": 3885 + }, + { + "epoch": 2.133992311916529, + "grad_norm": 0.5164242630041197, + "learning_rate": 1.9562292503947107e-07, + "loss": 0.265, + "step": 3886 + }, + { + "epoch": 2.1345414607358593, + "grad_norm": 0.6824778741241071, + "learning_rate": 1.9539259795674698e-07, + "loss": 0.2297, + "step": 3887 + }, + { + "epoch": 2.1350906095551894, + "grad_norm": 0.5217338025973458, + "learning_rate": 1.9516237368467194e-07, + "loss": 0.2346, + "step": 3888 + }, + { + "epoch": 2.1356397583745195, + "grad_norm": 0.6202267849174579, + "learning_rate": 1.9493225230093831e-07, + "loss": 0.2259, + "step": 3889 + }, + { + "epoch": 2.1361889071938496, + "grad_norm": 0.47296869159058896, + "learning_rate": 1.9470223388320386e-07, + "loss": 0.2318, + "step": 3890 + }, + { + "epoch": 2.1367380560131797, + "grad_norm": 0.5412293417627534, + "learning_rate": 1.9447231850909103e-07, + "loss": 0.242, + "step": 3891 + }, + { + "epoch": 2.13728720483251, + "grad_norm": 0.4307234486208026, + "learning_rate": 1.942425062561877e-07, + "loss": 0.2326, + "step": 3892 + }, + { + "epoch": 2.1378363536518394, + "grad_norm": 0.491847468059347, + "learning_rate": 1.9401279720204712e-07, + "loss": 0.2477, + "step": 3893 + }, + { + "epoch": 2.1383855024711695, + "grad_norm": 0.4957298364097486, + "learning_rate": 1.937831914241879e-07, + "loss": 0.2552, + "step": 3894 + }, + { + "epoch": 2.1389346512904996, + "grad_norm": 0.597020614326026, + "learning_rate": 1.935536890000933e-07, + "loss": 0.2489, + "step": 3895 + }, + { + "epoch": 2.1394838001098297, + "grad_norm": 0.425252952318177, + "learning_rate": 1.9332429000721178e-07, + "loss": 0.2408, + "step": 3896 + }, + { + "epoch": 2.14003294892916, + "grad_norm": 0.5941925186049234, + "learning_rate": 1.9309499452295727e-07, + "loss": 0.3189, + "step": 3897 + }, + { + "epoch": 2.14058209774849, + "grad_norm": 0.4277229126926347, + "learning_rate": 1.9286580262470858e-07, + "loss": 0.2522, + "step": 3898 + }, + { + "epoch": 2.14113124656782, + "grad_norm": 0.4731538800259954, + "learning_rate": 1.9263671438980938e-07, + "loss": 0.2911, + "step": 3899 + }, + { + "epoch": 2.1416803953871497, + "grad_norm": 0.5085825572256085, + "learning_rate": 1.9240772989556855e-07, + "loss": 0.2495, + "step": 3900 + }, + { + "epoch": 2.14222954420648, + "grad_norm": 1.217562512470521, + "learning_rate": 1.9217884921926027e-07, + "loss": 0.4166, + "step": 3901 + }, + { + "epoch": 2.14277869302581, + "grad_norm": 0.43371609794287097, + "learning_rate": 1.919500724381227e-07, + "loss": 0.2491, + "step": 3902 + }, + { + "epoch": 2.14332784184514, + "grad_norm": 0.4917208768632998, + "learning_rate": 1.9172139962935987e-07, + "loss": 0.2538, + "step": 3903 + }, + { + "epoch": 2.14387699066447, + "grad_norm": 0.45810023421376667, + "learning_rate": 1.9149283087014064e-07, + "loss": 0.2746, + "step": 3904 + }, + { + "epoch": 2.1444261394838002, + "grad_norm": 0.4372699453223447, + "learning_rate": 1.912643662375982e-07, + "loss": 0.2112, + "step": 3905 + }, + { + "epoch": 2.1449752883031303, + "grad_norm": 0.43792039356987905, + "learning_rate": 1.9103600580883106e-07, + "loss": 0.2409, + "step": 3906 + }, + { + "epoch": 2.14552443712246, + "grad_norm": 0.5609103810209406, + "learning_rate": 1.908077496609027e-07, + "loss": 0.276, + "step": 3907 + }, + { + "epoch": 2.14607358594179, + "grad_norm": 0.42324528721595817, + "learning_rate": 1.9057959787084098e-07, + "loss": 0.2607, + "step": 3908 + }, + { + "epoch": 2.14662273476112, + "grad_norm": 0.4715374997362142, + "learning_rate": 1.9035155051563847e-07, + "loss": 0.2111, + "step": 3909 + }, + { + "epoch": 2.1471718835804503, + "grad_norm": 0.49983634210228417, + "learning_rate": 1.9012360767225304e-07, + "loss": 0.2451, + "step": 3910 + }, + { + "epoch": 2.1477210323997804, + "grad_norm": 0.4661224660618886, + "learning_rate": 1.898957694176071e-07, + "loss": 0.2685, + "step": 3911 + }, + { + "epoch": 2.1482701812191105, + "grad_norm": 0.4300289303413506, + "learning_rate": 1.8966803582858745e-07, + "loss": 0.2542, + "step": 3912 + }, + { + "epoch": 2.1488193300384406, + "grad_norm": 0.4619576387584257, + "learning_rate": 1.894404069820457e-07, + "loss": 0.2225, + "step": 3913 + }, + { + "epoch": 2.1493684788577703, + "grad_norm": 0.5031134581390188, + "learning_rate": 1.8921288295479842e-07, + "loss": 0.1993, + "step": 3914 + }, + { + "epoch": 2.1499176276771004, + "grad_norm": 0.47178946077145767, + "learning_rate": 1.8898546382362663e-07, + "loss": 0.2321, + "step": 3915 + }, + { + "epoch": 2.1504667764964305, + "grad_norm": 0.543018371515395, + "learning_rate": 1.8875814966527565e-07, + "loss": 0.2398, + "step": 3916 + }, + { + "epoch": 2.1510159253157606, + "grad_norm": 0.49361401099524627, + "learning_rate": 1.885309405564559e-07, + "loss": 0.2135, + "step": 3917 + }, + { + "epoch": 2.1515650741350907, + "grad_norm": 0.5388751454699288, + "learning_rate": 1.8830383657384193e-07, + "loss": 0.2438, + "step": 3918 + }, + { + "epoch": 2.152114222954421, + "grad_norm": 0.5655443709542992, + "learning_rate": 1.8807683779407294e-07, + "loss": 0.2579, + "step": 3919 + }, + { + "epoch": 2.152663371773751, + "grad_norm": 0.46217653882996385, + "learning_rate": 1.8784994429375265e-07, + "loss": 0.2658, + "step": 3920 + }, + { + "epoch": 2.1532125205930805, + "grad_norm": 0.5998136405598767, + "learning_rate": 1.8762315614944943e-07, + "loss": 0.2581, + "step": 3921 + }, + { + "epoch": 2.1537616694124107, + "grad_norm": 0.417790543709564, + "learning_rate": 1.8739647343769571e-07, + "loss": 0.223, + "step": 3922 + }, + { + "epoch": 2.1543108182317408, + "grad_norm": 0.5439570688326004, + "learning_rate": 1.8716989623498882e-07, + "loss": 0.2454, + "step": 3923 + }, + { + "epoch": 2.154859967051071, + "grad_norm": 0.38858703529285105, + "learning_rate": 1.8694342461778987e-07, + "loss": 0.2413, + "step": 3924 + }, + { + "epoch": 2.155409115870401, + "grad_norm": 0.5126797149447225, + "learning_rate": 1.8671705866252507e-07, + "loss": 0.202, + "step": 3925 + }, + { + "epoch": 2.155958264689731, + "grad_norm": 0.3872490208690947, + "learning_rate": 1.8649079844558418e-07, + "loss": 0.2347, + "step": 3926 + }, + { + "epoch": 2.156507413509061, + "grad_norm": 0.5346932465994957, + "learning_rate": 1.8626464404332194e-07, + "loss": 0.2743, + "step": 3927 + }, + { + "epoch": 2.157056562328391, + "grad_norm": 0.5104704753214021, + "learning_rate": 1.8603859553205726e-07, + "loss": 0.2579, + "step": 3928 + }, + { + "epoch": 2.157605711147721, + "grad_norm": 0.44627563518851293, + "learning_rate": 1.8581265298807296e-07, + "loss": 0.2235, + "step": 3929 + }, + { + "epoch": 2.158154859967051, + "grad_norm": 0.5252819958388432, + "learning_rate": 1.8558681648761622e-07, + "loss": 0.2501, + "step": 3930 + }, + { + "epoch": 2.158704008786381, + "grad_norm": 0.5322887516973847, + "learning_rate": 1.8536108610689877e-07, + "loss": 0.211, + "step": 3931 + }, + { + "epoch": 2.1592531576057112, + "grad_norm": 0.4791201013389418, + "learning_rate": 1.8513546192209635e-07, + "loss": 0.2299, + "step": 3932 + }, + { + "epoch": 2.1598023064250413, + "grad_norm": 0.48616022178910523, + "learning_rate": 1.8490994400934848e-07, + "loss": 0.2426, + "step": 3933 + }, + { + "epoch": 2.1603514552443714, + "grad_norm": 0.5260594505768331, + "learning_rate": 1.8468453244475954e-07, + "loss": 0.2564, + "step": 3934 + }, + { + "epoch": 2.160900604063701, + "grad_norm": 0.4914406655976773, + "learning_rate": 1.8445922730439746e-07, + "loss": 0.2643, + "step": 3935 + }, + { + "epoch": 2.161449752883031, + "grad_norm": 0.5328852501050172, + "learning_rate": 1.8423402866429425e-07, + "loss": 0.2216, + "step": 3936 + }, + { + "epoch": 2.1619989017023613, + "grad_norm": 0.4707301182259253, + "learning_rate": 1.8400893660044627e-07, + "loss": 0.272, + "step": 3937 + }, + { + "epoch": 2.1625480505216914, + "grad_norm": 0.5433831647293316, + "learning_rate": 1.8378395118881397e-07, + "loss": 0.2202, + "step": 3938 + }, + { + "epoch": 2.1630971993410215, + "grad_norm": 0.42325050341266984, + "learning_rate": 1.8355907250532147e-07, + "loss": 0.2303, + "step": 3939 + }, + { + "epoch": 2.1636463481603516, + "grad_norm": 0.5721251469300657, + "learning_rate": 1.833343006258571e-07, + "loss": 0.2647, + "step": 3940 + }, + { + "epoch": 2.1641954969796817, + "grad_norm": 0.46529600474690497, + "learning_rate": 1.8310963562627295e-07, + "loss": 0.2635, + "step": 3941 + }, + { + "epoch": 2.1647446457990114, + "grad_norm": 0.563222643702606, + "learning_rate": 1.8288507758238547e-07, + "loss": 0.2419, + "step": 3942 + }, + { + "epoch": 2.1652937946183415, + "grad_norm": 0.4081828292050501, + "learning_rate": 1.826606265699744e-07, + "loss": 0.2625, + "step": 3943 + }, + { + "epoch": 2.1658429434376716, + "grad_norm": 0.4530398764917165, + "learning_rate": 1.824362826647838e-07, + "loss": 0.2732, + "step": 3944 + }, + { + "epoch": 2.1663920922570017, + "grad_norm": 0.5222420863608187, + "learning_rate": 1.8221204594252177e-07, + "loss": 0.2733, + "step": 3945 + }, + { + "epoch": 2.166941241076332, + "grad_norm": 0.6618122020250323, + "learning_rate": 1.8198791647885958e-07, + "loss": 0.2984, + "step": 3946 + }, + { + "epoch": 2.167490389895662, + "grad_norm": 0.5269650152741272, + "learning_rate": 1.8176389434943267e-07, + "loss": 0.2706, + "step": 3947 + }, + { + "epoch": 2.168039538714992, + "grad_norm": 0.5129607360245605, + "learning_rate": 1.8153997962984046e-07, + "loss": 0.254, + "step": 3948 + }, + { + "epoch": 2.1685886875343217, + "grad_norm": 0.4855314552044702, + "learning_rate": 1.8131617239564572e-07, + "loss": 0.2485, + "step": 3949 + }, + { + "epoch": 2.1691378363536518, + "grad_norm": 0.49262213174779745, + "learning_rate": 1.8109247272237514e-07, + "loss": 0.2591, + "step": 3950 + }, + { + "epoch": 2.169686985172982, + "grad_norm": 0.44830683718731024, + "learning_rate": 1.808688806855195e-07, + "loss": 0.2602, + "step": 3951 + }, + { + "epoch": 2.170236133992312, + "grad_norm": 0.6136236495959323, + "learning_rate": 1.8064539636053255e-07, + "loss": 0.2747, + "step": 3952 + }, + { + "epoch": 2.170785282811642, + "grad_norm": 0.672191262207508, + "learning_rate": 1.8042201982283185e-07, + "loss": 0.3057, + "step": 3953 + }, + { + "epoch": 2.171334431630972, + "grad_norm": 0.447855063441009, + "learning_rate": 1.8019875114779902e-07, + "loss": 0.2123, + "step": 3954 + }, + { + "epoch": 2.171883580450302, + "grad_norm": 0.5069398622137821, + "learning_rate": 1.7997559041077915e-07, + "loss": 0.2471, + "step": 3955 + }, + { + "epoch": 2.172432729269632, + "grad_norm": 0.5361294945466532, + "learning_rate": 1.7975253768708054e-07, + "loss": 0.2365, + "step": 3956 + }, + { + "epoch": 2.172981878088962, + "grad_norm": 0.6224601248513151, + "learning_rate": 1.7952959305197513e-07, + "loss": 0.215, + "step": 3957 + }, + { + "epoch": 2.173531026908292, + "grad_norm": 0.5168587742842273, + "learning_rate": 1.7930675658069868e-07, + "loss": 0.2629, + "step": 3958 + }, + { + "epoch": 2.1740801757276222, + "grad_norm": 0.5258211594824284, + "learning_rate": 1.790840283484505e-07, + "loss": 0.223, + "step": 3959 + }, + { + "epoch": 2.1746293245469523, + "grad_norm": 0.41397066435051055, + "learning_rate": 1.7886140843039277e-07, + "loss": 0.2183, + "step": 3960 + }, + { + "epoch": 2.1751784733662825, + "grad_norm": 0.5916820698763788, + "learning_rate": 1.7863889690165168e-07, + "loss": 0.262, + "step": 3961 + }, + { + "epoch": 2.175727622185612, + "grad_norm": 0.44775479293884324, + "learning_rate": 1.7841649383731686e-07, + "loss": 0.2783, + "step": 3962 + }, + { + "epoch": 2.176276771004942, + "grad_norm": 0.4606341896234673, + "learning_rate": 1.7819419931244104e-07, + "loss": 0.2333, + "step": 3963 + }, + { + "epoch": 2.1768259198242723, + "grad_norm": 0.5536635371358996, + "learning_rate": 1.7797201340204019e-07, + "loss": 0.2304, + "step": 3964 + }, + { + "epoch": 2.1773750686436024, + "grad_norm": 0.486297622134742, + "learning_rate": 1.7774993618109423e-07, + "loss": 0.2248, + "step": 3965 + }, + { + "epoch": 2.1779242174629325, + "grad_norm": 0.523951370415108, + "learning_rate": 1.7752796772454567e-07, + "loss": 0.2684, + "step": 3966 + }, + { + "epoch": 2.1784733662822626, + "grad_norm": 0.4708976868499049, + "learning_rate": 1.773061081073011e-07, + "loss": 0.2529, + "step": 3967 + }, + { + "epoch": 2.1790225151015927, + "grad_norm": 0.5273083356613111, + "learning_rate": 1.7708435740422958e-07, + "loss": 0.254, + "step": 3968 + }, + { + "epoch": 2.1795716639209224, + "grad_norm": 0.5195379750212576, + "learning_rate": 1.7686271569016418e-07, + "loss": 0.2283, + "step": 3969 + }, + { + "epoch": 2.1801208127402525, + "grad_norm": 1.1111378467366022, + "learning_rate": 1.7664118303990036e-07, + "loss": 0.2481, + "step": 3970 + }, + { + "epoch": 2.1806699615595826, + "grad_norm": 0.5094539722800087, + "learning_rate": 1.764197595281975e-07, + "loss": 0.2321, + "step": 3971 + }, + { + "epoch": 2.1812191103789127, + "grad_norm": 0.5543417892199277, + "learning_rate": 1.7619844522977807e-07, + "loss": 0.257, + "step": 3972 + }, + { + "epoch": 2.181768259198243, + "grad_norm": 0.4368811340571751, + "learning_rate": 1.7597724021932723e-07, + "loss": 0.2554, + "step": 3973 + }, + { + "epoch": 2.182317408017573, + "grad_norm": 0.5215848285605968, + "learning_rate": 1.7575614457149336e-07, + "loss": 0.219, + "step": 3974 + }, + { + "epoch": 2.182866556836903, + "grad_norm": 0.48589017065286566, + "learning_rate": 1.755351583608884e-07, + "loss": 0.2306, + "step": 3975 + }, + { + "epoch": 2.1834157056562327, + "grad_norm": 0.5080207906688253, + "learning_rate": 1.7531428166208705e-07, + "loss": 0.2147, + "step": 3976 + }, + { + "epoch": 2.1839648544755628, + "grad_norm": 0.44408588542997157, + "learning_rate": 1.7509351454962684e-07, + "loss": 0.2345, + "step": 3977 + }, + { + "epoch": 2.184514003294893, + "grad_norm": 0.5653806576790686, + "learning_rate": 1.748728570980088e-07, + "loss": 0.2128, + "step": 3978 + }, + { + "epoch": 2.185063152114223, + "grad_norm": 0.3957641151179787, + "learning_rate": 1.7465230938169658e-07, + "loss": 0.2201, + "step": 3979 + }, + { + "epoch": 2.185612300933553, + "grad_norm": 0.5583549334857763, + "learning_rate": 1.7443187147511676e-07, + "loss": 0.2166, + "step": 3980 + }, + { + "epoch": 2.186161449752883, + "grad_norm": 0.5544191422615505, + "learning_rate": 1.7421154345265905e-07, + "loss": 0.2723, + "step": 3981 + }, + { + "epoch": 2.186710598572213, + "grad_norm": 0.5012992234043202, + "learning_rate": 1.7399132538867637e-07, + "loss": 0.2341, + "step": 3982 + }, + { + "epoch": 2.187259747391543, + "grad_norm": 0.49661256429628026, + "learning_rate": 1.7377121735748376e-07, + "loss": 0.22, + "step": 3983 + }, + { + "epoch": 2.187808896210873, + "grad_norm": 0.5815286108172486, + "learning_rate": 1.7355121943335991e-07, + "loss": 0.2687, + "step": 3984 + }, + { + "epoch": 2.188358045030203, + "grad_norm": 0.47338165728106446, + "learning_rate": 1.7333133169054572e-07, + "loss": 0.2523, + "step": 3985 + }, + { + "epoch": 2.1889071938495333, + "grad_norm": 0.4074535547435078, + "learning_rate": 1.7311155420324557e-07, + "loss": 0.2485, + "step": 3986 + }, + { + "epoch": 2.1894563426688634, + "grad_norm": 0.5842482722695632, + "learning_rate": 1.7289188704562588e-07, + "loss": 0.2753, + "step": 3987 + }, + { + "epoch": 2.1900054914881935, + "grad_norm": 0.503630989785303, + "learning_rate": 1.7267233029181638e-07, + "loss": 0.232, + "step": 3988 + }, + { + "epoch": 2.190554640307523, + "grad_norm": 0.5080407192677118, + "learning_rate": 1.7245288401590955e-07, + "loss": 0.2207, + "step": 3989 + }, + { + "epoch": 2.191103789126853, + "grad_norm": 0.44232842031396075, + "learning_rate": 1.7223354829196025e-07, + "loss": 0.2272, + "step": 3990 + }, + { + "epoch": 2.1916529379461833, + "grad_norm": 0.5701332853466663, + "learning_rate": 1.720143231939861e-07, + "loss": 0.2555, + "step": 3991 + }, + { + "epoch": 2.1922020867655134, + "grad_norm": 0.5214866224133593, + "learning_rate": 1.7179520879596768e-07, + "loss": 0.2368, + "step": 3992 + }, + { + "epoch": 2.1927512355848435, + "grad_norm": 0.4714742898345364, + "learning_rate": 1.7157620517184806e-07, + "loss": 0.2475, + "step": 3993 + }, + { + "epoch": 2.1933003844041736, + "grad_norm": 0.5685763605519386, + "learning_rate": 1.713573123955327e-07, + "loss": 0.2593, + "step": 3994 + }, + { + "epoch": 2.1938495332235037, + "grad_norm": 0.6082992119483821, + "learning_rate": 1.7113853054089006e-07, + "loss": 0.2467, + "step": 3995 + }, + { + "epoch": 2.1943986820428334, + "grad_norm": 0.4996313225914673, + "learning_rate": 1.7091985968175087e-07, + "loss": 0.257, + "step": 3996 + }, + { + "epoch": 2.1949478308621635, + "grad_norm": 0.4858029386986306, + "learning_rate": 1.7070129989190832e-07, + "loss": 0.2402, + "step": 3997 + }, + { + "epoch": 2.1954969796814936, + "grad_norm": 0.5147496668729445, + "learning_rate": 1.7048285124511844e-07, + "loss": 0.2684, + "step": 3998 + }, + { + "epoch": 2.1960461285008237, + "grad_norm": 0.6161496259139179, + "learning_rate": 1.7026451381509976e-07, + "loss": 0.21, + "step": 3999 + }, + { + "epoch": 2.196595277320154, + "grad_norm": 0.4815709985299374, + "learning_rate": 1.70046287675533e-07, + "loss": 0.2623, + "step": 4000 + }, + { + "epoch": 2.196595277320154, + "eval_loss": 0.322488009929657, + "eval_runtime": 18.6785, + "eval_samples_per_second": 23.717, + "eval_steps_per_second": 1.017, + "step": 4000 + }, + { + "epoch": 2.197144426139484, + "grad_norm": 0.6414869548260358, + "learning_rate": 1.6982817290006112e-07, + "loss": 0.2525, + "step": 4001 + }, + { + "epoch": 2.197693574958814, + "grad_norm": 0.4105625253986696, + "learning_rate": 1.696101695622902e-07, + "loss": 0.2442, + "step": 4002 + }, + { + "epoch": 2.1982427237781437, + "grad_norm": 0.4620457673593199, + "learning_rate": 1.6939227773578836e-07, + "loss": 0.2443, + "step": 4003 + }, + { + "epoch": 2.1987918725974738, + "grad_norm": 0.5645562413226259, + "learning_rate": 1.6917449749408576e-07, + "loss": 0.2851, + "step": 4004 + }, + { + "epoch": 2.199341021416804, + "grad_norm": 0.4889406909450005, + "learning_rate": 1.6895682891067544e-07, + "loss": 0.2739, + "step": 4005 + }, + { + "epoch": 2.199890170236134, + "grad_norm": 0.6484999147281099, + "learning_rate": 1.687392720590126e-07, + "loss": 0.2605, + "step": 4006 + }, + { + "epoch": 2.200439319055464, + "grad_norm": 0.5146071446875392, + "learning_rate": 1.6852182701251455e-07, + "loss": 0.2317, + "step": 4007 + }, + { + "epoch": 2.200988467874794, + "grad_norm": 0.41168765729540324, + "learning_rate": 1.683044938445608e-07, + "loss": 0.2818, + "step": 4008 + }, + { + "epoch": 2.2015376166941243, + "grad_norm": 0.539069737040092, + "learning_rate": 1.680872726284934e-07, + "loss": 0.2349, + "step": 4009 + }, + { + "epoch": 2.202086765513454, + "grad_norm": 0.4869059414028793, + "learning_rate": 1.6787016343761678e-07, + "loss": 0.2308, + "step": 4010 + }, + { + "epoch": 2.202635914332784, + "grad_norm": 0.5087358252541315, + "learning_rate": 1.6765316634519707e-07, + "loss": 0.2546, + "step": 4011 + }, + { + "epoch": 2.203185063152114, + "grad_norm": 0.45386299329004964, + "learning_rate": 1.6743628142446264e-07, + "loss": 0.2121, + "step": 4012 + }, + { + "epoch": 2.2037342119714443, + "grad_norm": 0.5280596873106801, + "learning_rate": 1.6721950874860454e-07, + "loss": 0.2551, + "step": 4013 + }, + { + "epoch": 2.2042833607907744, + "grad_norm": 1.02622346579664, + "learning_rate": 1.670028483907751e-07, + "loss": 0.4209, + "step": 4014 + }, + { + "epoch": 2.2048325096101045, + "grad_norm": 0.43413318763392755, + "learning_rate": 1.6678630042408952e-07, + "loss": 0.2374, + "step": 4015 + }, + { + "epoch": 2.2053816584294346, + "grad_norm": 0.43318741293930946, + "learning_rate": 1.6656986492162478e-07, + "loss": 0.2331, + "step": 4016 + }, + { + "epoch": 2.2059308072487642, + "grad_norm": 0.489768975704594, + "learning_rate": 1.6635354195641985e-07, + "loss": 0.2421, + "step": 4017 + }, + { + "epoch": 2.2064799560680943, + "grad_norm": 0.46233173815457396, + "learning_rate": 1.6613733160147554e-07, + "loss": 0.2585, + "step": 4018 + }, + { + "epoch": 2.2070291048874244, + "grad_norm": 0.5509875085989772, + "learning_rate": 1.6592123392975505e-07, + "loss": 0.2445, + "step": 4019 + }, + { + "epoch": 2.2075782537067545, + "grad_norm": 0.520179801161191, + "learning_rate": 1.6570524901418356e-07, + "loss": 0.2334, + "step": 4020 + }, + { + "epoch": 2.2081274025260846, + "grad_norm": 0.5163865066842153, + "learning_rate": 1.654893769276477e-07, + "loss": 0.2159, + "step": 4021 + }, + { + "epoch": 2.2086765513454147, + "grad_norm": 0.4984183004108988, + "learning_rate": 1.652736177429966e-07, + "loss": 0.3069, + "step": 4022 + }, + { + "epoch": 2.209225700164745, + "grad_norm": 0.42803332023888097, + "learning_rate": 1.6505797153304082e-07, + "loss": 0.2599, + "step": 4023 + }, + { + "epoch": 2.2097748489840745, + "grad_norm": 0.5260417229374884, + "learning_rate": 1.6484243837055327e-07, + "loss": 0.2287, + "step": 4024 + }, + { + "epoch": 2.2103239978034046, + "grad_norm": 0.5136301521855394, + "learning_rate": 1.6462701832826814e-07, + "loss": 0.2382, + "step": 4025 + }, + { + "epoch": 2.2108731466227347, + "grad_norm": 0.5088282106460268, + "learning_rate": 1.6441171147888187e-07, + "loss": 0.2285, + "step": 4026 + }, + { + "epoch": 2.211422295442065, + "grad_norm": 0.45164567314033205, + "learning_rate": 1.6419651789505285e-07, + "loss": 0.2727, + "step": 4027 + }, + { + "epoch": 2.211971444261395, + "grad_norm": 0.603651328225906, + "learning_rate": 1.639814376494008e-07, + "loss": 0.2463, + "step": 4028 + }, + { + "epoch": 2.212520593080725, + "grad_norm": 0.485303946701434, + "learning_rate": 1.6376647081450717e-07, + "loss": 0.235, + "step": 4029 + }, + { + "epoch": 2.213069741900055, + "grad_norm": 0.6459471769862432, + "learning_rate": 1.6355161746291568e-07, + "loss": 0.2565, + "step": 4030 + }, + { + "epoch": 2.213618890719385, + "grad_norm": 0.6934801056751915, + "learning_rate": 1.633368776671311e-07, + "loss": 0.3024, + "step": 4031 + }, + { + "epoch": 2.214168039538715, + "grad_norm": 0.577355114181319, + "learning_rate": 1.6312225149962038e-07, + "loss": 0.2506, + "step": 4032 + }, + { + "epoch": 2.214717188358045, + "grad_norm": 0.5525564996174658, + "learning_rate": 1.6290773903281215e-07, + "loss": 0.2512, + "step": 4033 + }, + { + "epoch": 2.215266337177375, + "grad_norm": 0.5917190480644012, + "learning_rate": 1.626933403390962e-07, + "loss": 0.2301, + "step": 4034 + }, + { + "epoch": 2.215815485996705, + "grad_norm": 0.5862396722785724, + "learning_rate": 1.624790554908241e-07, + "loss": 0.2596, + "step": 4035 + }, + { + "epoch": 2.2163646348160353, + "grad_norm": 0.5124609754820473, + "learning_rate": 1.622648845603092e-07, + "loss": 0.3049, + "step": 4036 + }, + { + "epoch": 2.2169137836353654, + "grad_norm": 0.5090848377694975, + "learning_rate": 1.6205082761982656e-07, + "loss": 0.2601, + "step": 4037 + }, + { + "epoch": 2.217462932454695, + "grad_norm": 0.5332019121630734, + "learning_rate": 1.6183688474161207e-07, + "loss": 0.247, + "step": 4038 + }, + { + "epoch": 2.218012081274025, + "grad_norm": 0.4836118264097785, + "learning_rate": 1.61623055997864e-07, + "loss": 0.2577, + "step": 4039 + }, + { + "epoch": 2.2185612300933553, + "grad_norm": 0.4831484679955604, + "learning_rate": 1.6140934146074122e-07, + "loss": 0.2323, + "step": 4040 + }, + { + "epoch": 2.2191103789126854, + "grad_norm": 0.4385352786071897, + "learning_rate": 1.6119574120236496e-07, + "loss": 0.2439, + "step": 4041 + }, + { + "epoch": 2.2196595277320155, + "grad_norm": 0.44689419860704854, + "learning_rate": 1.6098225529481705e-07, + "loss": 0.2456, + "step": 4042 + }, + { + "epoch": 2.2202086765513456, + "grad_norm": 0.5279272153579984, + "learning_rate": 1.6076888381014133e-07, + "loss": 0.2743, + "step": 4043 + }, + { + "epoch": 2.2207578253706757, + "grad_norm": 0.47364791781897286, + "learning_rate": 1.6055562682034306e-07, + "loss": 0.2756, + "step": 4044 + }, + { + "epoch": 2.2213069741900053, + "grad_norm": 0.4418247722099023, + "learning_rate": 1.6034248439738808e-07, + "loss": 0.2634, + "step": 4045 + }, + { + "epoch": 2.2218561230093354, + "grad_norm": 0.6459587624664268, + "learning_rate": 1.601294566132043e-07, + "loss": 0.2784, + "step": 4046 + }, + { + "epoch": 2.2224052718286655, + "grad_norm": 0.5170173724091901, + "learning_rate": 1.5991654353968095e-07, + "loss": 0.2439, + "step": 4047 + }, + { + "epoch": 2.2229544206479956, + "grad_norm": 0.46719571459521797, + "learning_rate": 1.597037452486681e-07, + "loss": 0.2325, + "step": 4048 + }, + { + "epoch": 2.2235035694673257, + "grad_norm": 0.6445040020296987, + "learning_rate": 1.5949106181197745e-07, + "loss": 0.2374, + "step": 4049 + }, + { + "epoch": 2.224052718286656, + "grad_norm": 0.646825191451552, + "learning_rate": 1.59278493301382e-07, + "loss": 0.2676, + "step": 4050 + }, + { + "epoch": 2.224601867105986, + "grad_norm": 0.4673100278924, + "learning_rate": 1.5906603978861559e-07, + "loss": 0.2476, + "step": 4051 + }, + { + "epoch": 2.2251510159253156, + "grad_norm": 0.5407872055791325, + "learning_rate": 1.5885370134537327e-07, + "loss": 0.2062, + "step": 4052 + }, + { + "epoch": 2.2257001647446457, + "grad_norm": 0.4708126322166367, + "learning_rate": 1.5864147804331166e-07, + "loss": 0.239, + "step": 4053 + }, + { + "epoch": 2.226249313563976, + "grad_norm": 0.5738989256146745, + "learning_rate": 1.5842936995404848e-07, + "loss": 0.235, + "step": 4054 + }, + { + "epoch": 2.226798462383306, + "grad_norm": 0.4213264483050754, + "learning_rate": 1.5821737714916222e-07, + "loss": 0.2569, + "step": 4055 + }, + { + "epoch": 2.227347611202636, + "grad_norm": 0.4920217189253605, + "learning_rate": 1.5800549970019243e-07, + "loss": 0.2559, + "step": 4056 + }, + { + "epoch": 2.227896760021966, + "grad_norm": 0.49807406052357484, + "learning_rate": 1.5779373767864017e-07, + "loss": 0.2486, + "step": 4057 + }, + { + "epoch": 2.228445908841296, + "grad_norm": 0.41236021888645474, + "learning_rate": 1.5758209115596746e-07, + "loss": 0.2685, + "step": 4058 + }, + { + "epoch": 2.228995057660626, + "grad_norm": 0.5765654989844405, + "learning_rate": 1.5737056020359682e-07, + "loss": 0.2405, + "step": 4059 + }, + { + "epoch": 2.229544206479956, + "grad_norm": 0.4717565799493322, + "learning_rate": 1.5715914489291244e-07, + "loss": 0.2028, + "step": 4060 + }, + { + "epoch": 2.230093355299286, + "grad_norm": 0.43428493381873695, + "learning_rate": 1.5694784529525938e-07, + "loss": 0.2317, + "step": 4061 + }, + { + "epoch": 2.230642504118616, + "grad_norm": 0.44660722205331316, + "learning_rate": 1.5673666148194295e-07, + "loss": 0.226, + "step": 4062 + }, + { + "epoch": 2.2311916529379463, + "grad_norm": 0.7357898715805092, + "learning_rate": 1.565255935242302e-07, + "loss": 0.2154, + "step": 4063 + }, + { + "epoch": 2.2317408017572764, + "grad_norm": 0.4598953633135372, + "learning_rate": 1.563146414933489e-07, + "loss": 0.241, + "step": 4064 + }, + { + "epoch": 2.232289950576606, + "grad_norm": 0.4815475185276992, + "learning_rate": 1.5610380546048723e-07, + "loss": 0.2458, + "step": 4065 + }, + { + "epoch": 2.232839099395936, + "grad_norm": 0.46722349964631205, + "learning_rate": 1.5589308549679504e-07, + "loss": 0.2958, + "step": 4066 + }, + { + "epoch": 2.2333882482152663, + "grad_norm": 0.4280929222132165, + "learning_rate": 1.5568248167338217e-07, + "loss": 0.2783, + "step": 4067 + }, + { + "epoch": 2.2339373970345964, + "grad_norm": 0.6224497554969846, + "learning_rate": 1.5547199406131993e-07, + "loss": 0.2571, + "step": 4068 + }, + { + "epoch": 2.2344865458539265, + "grad_norm": 0.689987558103597, + "learning_rate": 1.5526162273163983e-07, + "loss": 0.2701, + "step": 4069 + }, + { + "epoch": 2.2350356946732566, + "grad_norm": 0.4536144589688099, + "learning_rate": 1.5505136775533463e-07, + "loss": 0.2664, + "step": 4070 + }, + { + "epoch": 2.2355848434925867, + "grad_norm": 0.44457846028632664, + "learning_rate": 1.548412292033578e-07, + "loss": 0.2524, + "step": 4071 + }, + { + "epoch": 2.2361339923119163, + "grad_norm": 0.5144545991105619, + "learning_rate": 1.5463120714662322e-07, + "loss": 0.2293, + "step": 4072 + }, + { + "epoch": 2.2366831411312464, + "grad_norm": 0.494227605491344, + "learning_rate": 1.5442130165600538e-07, + "loss": 0.2399, + "step": 4073 + }, + { + "epoch": 2.2372322899505765, + "grad_norm": 0.5653225293394155, + "learning_rate": 1.5421151280233982e-07, + "loss": 0.2508, + "step": 4074 + }, + { + "epoch": 2.2377814387699067, + "grad_norm": 0.4994693054488046, + "learning_rate": 1.5400184065642272e-07, + "loss": 0.2268, + "step": 4075 + }, + { + "epoch": 2.2383305875892368, + "grad_norm": 0.4689069691469193, + "learning_rate": 1.5379228528901043e-07, + "loss": 0.2931, + "step": 4076 + }, + { + "epoch": 2.238879736408567, + "grad_norm": 0.6373272585422195, + "learning_rate": 1.5358284677082042e-07, + "loss": 0.2347, + "step": 4077 + }, + { + "epoch": 2.239428885227897, + "grad_norm": 0.5542172172071798, + "learning_rate": 1.5337352517253032e-07, + "loss": 0.2371, + "step": 4078 + }, + { + "epoch": 2.2399780340472266, + "grad_norm": 0.53591832041364, + "learning_rate": 1.5316432056477836e-07, + "loss": 0.2347, + "step": 4079 + }, + { + "epoch": 2.2405271828665567, + "grad_norm": 0.5276035884714745, + "learning_rate": 1.5295523301816346e-07, + "loss": 0.2234, + "step": 4080 + }, + { + "epoch": 2.241076331685887, + "grad_norm": 0.41176842871437136, + "learning_rate": 1.527462626032452e-07, + "loss": 0.2099, + "step": 4081 + }, + { + "epoch": 2.241625480505217, + "grad_norm": 0.4666858806452414, + "learning_rate": 1.5253740939054306e-07, + "loss": 0.2212, + "step": 4082 + }, + { + "epoch": 2.242174629324547, + "grad_norm": 0.4843293028575849, + "learning_rate": 1.5232867345053764e-07, + "loss": 0.2774, + "step": 4083 + }, + { + "epoch": 2.242723778143877, + "grad_norm": 0.5509966873108286, + "learning_rate": 1.5212005485366918e-07, + "loss": 0.259, + "step": 4084 + }, + { + "epoch": 2.2432729269632072, + "grad_norm": 0.5969395767924995, + "learning_rate": 1.5191155367033924e-07, + "loss": 0.2954, + "step": 4085 + }, + { + "epoch": 2.243822075782537, + "grad_norm": 0.6112912825644582, + "learning_rate": 1.5170316997090892e-07, + "loss": 0.2618, + "step": 4086 + }, + { + "epoch": 2.244371224601867, + "grad_norm": 0.47695269373134713, + "learning_rate": 1.5149490382570017e-07, + "loss": 0.2208, + "step": 4087 + }, + { + "epoch": 2.244920373421197, + "grad_norm": 0.5348327908424828, + "learning_rate": 1.5128675530499537e-07, + "loss": 0.285, + "step": 4088 + }, + { + "epoch": 2.245469522240527, + "grad_norm": 0.49428017049523315, + "learning_rate": 1.5107872447903681e-07, + "loss": 0.2247, + "step": 4089 + }, + { + "epoch": 2.2460186710598573, + "grad_norm": 0.5260675279454065, + "learning_rate": 1.5087081141802696e-07, + "loss": 0.277, + "step": 4090 + }, + { + "epoch": 2.2465678198791874, + "grad_norm": 0.4333691550355185, + "learning_rate": 1.5066301619212916e-07, + "loss": 0.2315, + "step": 4091 + }, + { + "epoch": 2.247116968698517, + "grad_norm": 0.46681953633637757, + "learning_rate": 1.5045533887146663e-07, + "loss": 0.2437, + "step": 4092 + }, + { + "epoch": 2.247666117517847, + "grad_norm": 0.43472436170194234, + "learning_rate": 1.5024777952612255e-07, + "loss": 0.2199, + "step": 4093 + }, + { + "epoch": 2.2482152663371773, + "grad_norm": 0.3900325184868594, + "learning_rate": 1.500403382261409e-07, + "loss": 0.2391, + "step": 4094 + }, + { + "epoch": 2.2487644151565074, + "grad_norm": 0.4997800989330255, + "learning_rate": 1.4983301504152536e-07, + "loss": 0.2198, + "step": 4095 + }, + { + "epoch": 2.2493135639758375, + "grad_norm": 0.48044212882901877, + "learning_rate": 1.4962581004223954e-07, + "loss": 0.2373, + "step": 4096 + }, + { + "epoch": 2.2498627127951676, + "grad_norm": 0.6828704663809463, + "learning_rate": 1.4941872329820787e-07, + "loss": 0.2848, + "step": 4097 + }, + { + "epoch": 2.2504118616144977, + "grad_norm": 0.4851853837271703, + "learning_rate": 1.4921175487931452e-07, + "loss": 0.2422, + "step": 4098 + }, + { + "epoch": 2.2509610104338273, + "grad_norm": 0.4223467889212768, + "learning_rate": 1.490049048554035e-07, + "loss": 0.2341, + "step": 4099 + }, + { + "epoch": 2.2515101592531575, + "grad_norm": 0.804796153082989, + "learning_rate": 1.4879817329627905e-07, + "loss": 0.2767, + "step": 4100 + }, + { + "epoch": 2.2520593080724876, + "grad_norm": 0.5008326743721045, + "learning_rate": 1.4859156027170557e-07, + "loss": 0.2691, + "step": 4101 + }, + { + "epoch": 2.2526084568918177, + "grad_norm": 0.4789662682726989, + "learning_rate": 1.4838506585140746e-07, + "loss": 0.241, + "step": 4102 + }, + { + "epoch": 2.2531576057111478, + "grad_norm": 0.5204002515437153, + "learning_rate": 1.481786901050687e-07, + "loss": 0.2855, + "step": 4103 + }, + { + "epoch": 2.253706754530478, + "grad_norm": 0.44641206706778724, + "learning_rate": 1.4797243310233368e-07, + "loss": 0.2435, + "step": 4104 + }, + { + "epoch": 2.254255903349808, + "grad_norm": 0.4257416843597193, + "learning_rate": 1.4776629491280663e-07, + "loss": 0.2354, + "step": 4105 + }, + { + "epoch": 2.2548050521691376, + "grad_norm": 0.40645721899295234, + "learning_rate": 1.4756027560605144e-07, + "loss": 0.2239, + "step": 4106 + }, + { + "epoch": 2.2553542009884677, + "grad_norm": 0.5542036143621838, + "learning_rate": 1.4735437525159197e-07, + "loss": 0.2051, + "step": 4107 + }, + { + "epoch": 2.255903349807798, + "grad_norm": 0.5349724416344412, + "learning_rate": 1.4714859391891208e-07, + "loss": 0.2252, + "step": 4108 + }, + { + "epoch": 2.256452498627128, + "grad_norm": 0.5051684785400459, + "learning_rate": 1.4694293167745558e-07, + "loss": 0.2852, + "step": 4109 + }, + { + "epoch": 2.257001647446458, + "grad_norm": 0.469247869430049, + "learning_rate": 1.4673738859662574e-07, + "loss": 0.2155, + "step": 4110 + }, + { + "epoch": 2.257550796265788, + "grad_norm": 0.5772923371348587, + "learning_rate": 1.4653196474578557e-07, + "loss": 0.2255, + "step": 4111 + }, + { + "epoch": 2.2580999450851182, + "grad_norm": 0.4824546686346936, + "learning_rate": 1.4632666019425845e-07, + "loss": 0.2101, + "step": 4112 + }, + { + "epoch": 2.258649093904448, + "grad_norm": 0.5374394105576086, + "learning_rate": 1.461214750113267e-07, + "loss": 0.2365, + "step": 4113 + }, + { + "epoch": 2.259198242723778, + "grad_norm": 0.42661273111054115, + "learning_rate": 1.4591640926623304e-07, + "loss": 0.2303, + "step": 4114 + }, + { + "epoch": 2.259747391543108, + "grad_norm": 0.49170551201387735, + "learning_rate": 1.4571146302817958e-07, + "loss": 0.2261, + "step": 4115 + }, + { + "epoch": 2.260296540362438, + "grad_norm": 0.4792400377244397, + "learning_rate": 1.4550663636632815e-07, + "loss": 0.2247, + "step": 4116 + }, + { + "epoch": 2.2608456891817683, + "grad_norm": 0.5174771700880091, + "learning_rate": 1.4530192934979993e-07, + "loss": 0.2932, + "step": 4117 + }, + { + "epoch": 2.2613948380010984, + "grad_norm": 0.48536259945038496, + "learning_rate": 1.450973420476762e-07, + "loss": 0.2744, + "step": 4118 + }, + { + "epoch": 2.2619439868204285, + "grad_norm": 0.47287259796667797, + "learning_rate": 1.448928745289978e-07, + "loss": 0.2601, + "step": 4119 + }, + { + "epoch": 2.262493135639758, + "grad_norm": 0.5177657418212257, + "learning_rate": 1.446885268627646e-07, + "loss": 0.2379, + "step": 4120 + }, + { + "epoch": 2.2630422844590883, + "grad_norm": 0.6116402132194474, + "learning_rate": 1.4448429911793683e-07, + "loss": 0.2399, + "step": 4121 + }, + { + "epoch": 2.2635914332784184, + "grad_norm": 0.48870409654320696, + "learning_rate": 1.4428019136343343e-07, + "loss": 0.2247, + "step": 4122 + }, + { + "epoch": 2.2641405820977485, + "grad_norm": 0.43384224228435914, + "learning_rate": 1.4407620366813365e-07, + "loss": 0.2178, + "step": 4123 + }, + { + "epoch": 2.2646897309170786, + "grad_norm": 0.6020925406231517, + "learning_rate": 1.438723361008754e-07, + "loss": 0.2423, + "step": 4124 + }, + { + "epoch": 2.2652388797364087, + "grad_norm": 0.46034038408160577, + "learning_rate": 1.436685887304567e-07, + "loss": 0.2195, + "step": 4125 + }, + { + "epoch": 2.265788028555739, + "grad_norm": 0.4027509383441927, + "learning_rate": 1.4346496162563496e-07, + "loss": 0.2132, + "step": 4126 + }, + { + "epoch": 2.2663371773750685, + "grad_norm": 0.4291865889254372, + "learning_rate": 1.432614548551266e-07, + "loss": 0.2573, + "step": 4127 + }, + { + "epoch": 2.2668863261943986, + "grad_norm": 0.4991068444154207, + "learning_rate": 1.4305806848760748e-07, + "loss": 0.2392, + "step": 4128 + }, + { + "epoch": 2.2674354750137287, + "grad_norm": 0.526838059294347, + "learning_rate": 1.4285480259171346e-07, + "loss": 0.2817, + "step": 4129 + }, + { + "epoch": 2.2679846238330588, + "grad_norm": 0.50959453292258, + "learning_rate": 1.426516572360388e-07, + "loss": 0.2276, + "step": 4130 + }, + { + "epoch": 2.268533772652389, + "grad_norm": 0.5983390269755672, + "learning_rate": 1.4244863248913789e-07, + "loss": 0.2743, + "step": 4131 + }, + { + "epoch": 2.269082921471719, + "grad_norm": 0.4584851259788919, + "learning_rate": 1.4224572841952415e-07, + "loss": 0.2313, + "step": 4132 + }, + { + "epoch": 2.269632070291049, + "grad_norm": 0.5201587455828883, + "learning_rate": 1.4204294509567013e-07, + "loss": 0.2858, + "step": 4133 + }, + { + "epoch": 2.2701812191103787, + "grad_norm": 0.41876205462654115, + "learning_rate": 1.4184028258600756e-07, + "loss": 0.2502, + "step": 4134 + }, + { + "epoch": 2.270730367929709, + "grad_norm": 0.4640150705458623, + "learning_rate": 1.4163774095892772e-07, + "loss": 0.273, + "step": 4135 + }, + { + "epoch": 2.271279516749039, + "grad_norm": 0.46891782931303666, + "learning_rate": 1.414353202827811e-07, + "loss": 0.2476, + "step": 4136 + }, + { + "epoch": 2.271828665568369, + "grad_norm": 0.4752361435919389, + "learning_rate": 1.4123302062587685e-07, + "loss": 0.2374, + "step": 4137 + }, + { + "epoch": 2.272377814387699, + "grad_norm": 0.6349830260268547, + "learning_rate": 1.4103084205648407e-07, + "loss": 0.2081, + "step": 4138 + }, + { + "epoch": 2.2729269632070292, + "grad_norm": 0.6378449238164671, + "learning_rate": 1.408287846428303e-07, + "loss": 0.2051, + "step": 4139 + }, + { + "epoch": 2.2734761120263594, + "grad_norm": 0.5101686674793757, + "learning_rate": 1.4062684845310263e-07, + "loss": 0.2736, + "step": 4140 + }, + { + "epoch": 2.274025260845689, + "grad_norm": 0.4879750356050871, + "learning_rate": 1.4042503355544686e-07, + "loss": 0.2576, + "step": 4141 + }, + { + "epoch": 2.274574409665019, + "grad_norm": 0.5910931952269277, + "learning_rate": 1.4022334001796823e-07, + "loss": 0.281, + "step": 4142 + }, + { + "epoch": 2.275123558484349, + "grad_norm": 0.4879267299939271, + "learning_rate": 1.4002176790873118e-07, + "loss": 0.2519, + "step": 4143 + }, + { + "epoch": 2.2756727073036793, + "grad_norm": 0.5600512166164386, + "learning_rate": 1.398203172957583e-07, + "loss": 0.2039, + "step": 4144 + }, + { + "epoch": 2.2762218561230094, + "grad_norm": 0.4623371108049366, + "learning_rate": 1.3961898824703198e-07, + "loss": 0.2435, + "step": 4145 + }, + { + "epoch": 2.2767710049423395, + "grad_norm": 0.4657295018169696, + "learning_rate": 1.3941778083049355e-07, + "loss": 0.2361, + "step": 4146 + }, + { + "epoch": 2.2773201537616696, + "grad_norm": 0.3943734515947322, + "learning_rate": 1.3921669511404282e-07, + "loss": 0.2495, + "step": 4147 + }, + { + "epoch": 2.2778693025809993, + "grad_norm": 0.5030704399097244, + "learning_rate": 1.3901573116553891e-07, + "loss": 0.2137, + "step": 4148 + }, + { + "epoch": 2.2784184514003294, + "grad_norm": 0.5123039149905788, + "learning_rate": 1.3881488905279994e-07, + "loss": 0.2701, + "step": 4149 + }, + { + "epoch": 2.2789676002196595, + "grad_norm": 1.1897659495246475, + "learning_rate": 1.3861416884360257e-07, + "loss": 0.2341, + "step": 4150 + }, + { + "epoch": 2.2795167490389896, + "grad_norm": 0.49501193114645126, + "learning_rate": 1.3841357060568228e-07, + "loss": 0.2319, + "step": 4151 + }, + { + "epoch": 2.2800658978583197, + "grad_norm": 0.5728811954742753, + "learning_rate": 1.382130944067338e-07, + "loss": 0.2336, + "step": 4152 + }, + { + "epoch": 2.28061504667765, + "grad_norm": 0.5020682990748071, + "learning_rate": 1.3801274031441057e-07, + "loss": 0.2859, + "step": 4153 + }, + { + "epoch": 2.28116419549698, + "grad_norm": 0.4705747603793455, + "learning_rate": 1.378125083963246e-07, + "loss": 0.2467, + "step": 4154 + }, + { + "epoch": 2.2817133443163096, + "grad_norm": 0.4840372994927695, + "learning_rate": 1.3761239872004663e-07, + "loss": 0.2232, + "step": 4155 + }, + { + "epoch": 2.2822624931356397, + "grad_norm": 0.4573782643179784, + "learning_rate": 1.3741241135310638e-07, + "loss": 0.2246, + "step": 4156 + }, + { + "epoch": 2.2828116419549698, + "grad_norm": 0.48441613925674715, + "learning_rate": 1.372125463629924e-07, + "loss": 0.2308, + "step": 4157 + }, + { + "epoch": 2.2833607907743, + "grad_norm": 0.6238382663211087, + "learning_rate": 1.3701280381715151e-07, + "loss": 0.2608, + "step": 4158 + }, + { + "epoch": 2.28390993959363, + "grad_norm": 0.4027952861337082, + "learning_rate": 1.3681318378298963e-07, + "loss": 0.2332, + "step": 4159 + }, + { + "epoch": 2.28445908841296, + "grad_norm": 0.4280092420858081, + "learning_rate": 1.366136863278714e-07, + "loss": 0.2535, + "step": 4160 + }, + { + "epoch": 2.28500823723229, + "grad_norm": 0.4679994580993365, + "learning_rate": 1.3641431151911932e-07, + "loss": 0.2402, + "step": 4161 + }, + { + "epoch": 2.28555738605162, + "grad_norm": 0.5013205493799293, + "learning_rate": 1.3621505942401523e-07, + "loss": 0.2244, + "step": 4162 + }, + { + "epoch": 2.28610653487095, + "grad_norm": 0.5671218638074996, + "learning_rate": 1.3601593010979964e-07, + "loss": 0.2505, + "step": 4163 + }, + { + "epoch": 2.28665568369028, + "grad_norm": 0.5536695446176169, + "learning_rate": 1.35816923643671e-07, + "loss": 0.2236, + "step": 4164 + }, + { + "epoch": 2.28720483250961, + "grad_norm": 0.598254009050342, + "learning_rate": 1.3561804009278698e-07, + "loss": 0.2456, + "step": 4165 + }, + { + "epoch": 2.2877539813289403, + "grad_norm": 0.5059548541358106, + "learning_rate": 1.354192795242632e-07, + "loss": 0.2131, + "step": 4166 + }, + { + "epoch": 2.2883031301482704, + "grad_norm": 0.6420163200377998, + "learning_rate": 1.352206420051742e-07, + "loss": 0.1971, + "step": 4167 + }, + { + "epoch": 2.2888522789676005, + "grad_norm": 0.5120859935343264, + "learning_rate": 1.3502212760255262e-07, + "loss": 0.2105, + "step": 4168 + }, + { + "epoch": 2.28940142778693, + "grad_norm": 0.4237407187151969, + "learning_rate": 1.3482373638338991e-07, + "loss": 0.2348, + "step": 4169 + }, + { + "epoch": 2.2899505766062602, + "grad_norm": 0.5369619556933241, + "learning_rate": 1.3462546841463595e-07, + "loss": 0.2431, + "step": 4170 + }, + { + "epoch": 2.2904997254255903, + "grad_norm": 0.4841101372581998, + "learning_rate": 1.3442732376319868e-07, + "loss": 0.2372, + "step": 4171 + }, + { + "epoch": 2.2910488742449204, + "grad_norm": 0.4577820813197618, + "learning_rate": 1.3422930249594447e-07, + "loss": 0.2743, + "step": 4172 + }, + { + "epoch": 2.2915980230642505, + "grad_norm": 0.47536115409390084, + "learning_rate": 1.3403140467969833e-07, + "loss": 0.2825, + "step": 4173 + }, + { + "epoch": 2.2921471718835806, + "grad_norm": 0.46715608398591546, + "learning_rate": 1.338336303812438e-07, + "loss": 0.2347, + "step": 4174 + }, + { + "epoch": 2.2926963207029103, + "grad_norm": 0.539137596474045, + "learning_rate": 1.336359796673219e-07, + "loss": 0.2302, + "step": 4175 + }, + { + "epoch": 2.2932454695222404, + "grad_norm": 0.5050491248706318, + "learning_rate": 1.3343845260463288e-07, + "loss": 0.2821, + "step": 4176 + }, + { + "epoch": 2.2937946183415705, + "grad_norm": 0.5095488283924023, + "learning_rate": 1.3324104925983468e-07, + "loss": 0.2427, + "step": 4177 + }, + { + "epoch": 2.2943437671609006, + "grad_norm": 0.45178748063899143, + "learning_rate": 1.3304376969954354e-07, + "loss": 0.2393, + "step": 4178 + }, + { + "epoch": 2.2948929159802307, + "grad_norm": 0.4152125291904169, + "learning_rate": 1.3284661399033408e-07, + "loss": 0.2588, + "step": 4179 + }, + { + "epoch": 2.295442064799561, + "grad_norm": 0.4989093197992777, + "learning_rate": 1.3264958219873937e-07, + "loss": 0.2323, + "step": 4180 + }, + { + "epoch": 2.2959912136188905, + "grad_norm": 0.4718507509496132, + "learning_rate": 1.3245267439124998e-07, + "loss": 0.2469, + "step": 4181 + }, + { + "epoch": 2.2965403624382206, + "grad_norm": 0.5042173308827593, + "learning_rate": 1.322558906343154e-07, + "loss": 0.2469, + "step": 4182 + }, + { + "epoch": 2.2970895112575507, + "grad_norm": 0.5479451490372311, + "learning_rate": 1.3205923099434264e-07, + "loss": 0.2674, + "step": 4183 + }, + { + "epoch": 2.297638660076881, + "grad_norm": 0.48121238253968684, + "learning_rate": 1.318626955376973e-07, + "loss": 0.2754, + "step": 4184 + }, + { + "epoch": 2.298187808896211, + "grad_norm": 0.4613830281941604, + "learning_rate": 1.3166628433070255e-07, + "loss": 0.2644, + "step": 4185 + }, + { + "epoch": 2.298736957715541, + "grad_norm": 0.4497349512087476, + "learning_rate": 1.3146999743964013e-07, + "loss": 0.2197, + "step": 4186 + }, + { + "epoch": 2.299286106534871, + "grad_norm": 0.5615817774770319, + "learning_rate": 1.3127383493074981e-07, + "loss": 0.2507, + "step": 4187 + }, + { + "epoch": 2.2998352553542007, + "grad_norm": 0.4706206430873418, + "learning_rate": 1.31077796870229e-07, + "loss": 0.2641, + "step": 4188 + }, + { + "epoch": 2.300384404173531, + "grad_norm": 0.46509667714035746, + "learning_rate": 1.3088188332423322e-07, + "loss": 0.2251, + "step": 4189 + }, + { + "epoch": 2.300933552992861, + "grad_norm": 0.42678614100547047, + "learning_rate": 1.3068609435887611e-07, + "loss": 0.2259, + "step": 4190 + }, + { + "epoch": 2.301482701812191, + "grad_norm": 0.5081430522107518, + "learning_rate": 1.3049043004022956e-07, + "loss": 0.2709, + "step": 4191 + }, + { + "epoch": 2.302031850631521, + "grad_norm": 0.4473636918398846, + "learning_rate": 1.3029489043432267e-07, + "loss": 0.2467, + "step": 4192 + }, + { + "epoch": 2.3025809994508513, + "grad_norm": 0.44088709699424494, + "learning_rate": 1.3009947560714317e-07, + "loss": 0.238, + "step": 4193 + }, + { + "epoch": 2.3031301482701814, + "grad_norm": 0.46544621864065105, + "learning_rate": 1.2990418562463617e-07, + "loss": 0.2472, + "step": 4194 + }, + { + "epoch": 2.303679297089511, + "grad_norm": 0.4953664352897843, + "learning_rate": 1.297090205527048e-07, + "loss": 0.2749, + "step": 4195 + }, + { + "epoch": 2.304228445908841, + "grad_norm": 0.44324776675655786, + "learning_rate": 1.2951398045721013e-07, + "loss": 0.2695, + "step": 4196 + }, + { + "epoch": 2.3047775947281712, + "grad_norm": 0.5218129026698359, + "learning_rate": 1.2931906540397115e-07, + "loss": 0.2444, + "step": 4197 + }, + { + "epoch": 2.3053267435475013, + "grad_norm": 0.5470852378369458, + "learning_rate": 1.2912427545876448e-07, + "loss": 0.2496, + "step": 4198 + }, + { + "epoch": 2.3058758923668314, + "grad_norm": 0.4819022869823857, + "learning_rate": 1.289296106873243e-07, + "loss": 0.2459, + "step": 4199 + }, + { + "epoch": 2.3064250411861615, + "grad_norm": 0.5023444791148095, + "learning_rate": 1.2873507115534297e-07, + "loss": 0.2213, + "step": 4200 + }, + { + "epoch": 2.3064250411861615, + "eval_loss": 0.32245030999183655, + "eval_runtime": 18.6676, + "eval_samples_per_second": 23.731, + "eval_steps_per_second": 1.018, + "step": 4200 + }, + { + "epoch": 2.3069741900054916, + "grad_norm": 0.5028076886002473, + "learning_rate": 1.2854065692847057e-07, + "loss": 0.215, + "step": 4201 + }, + { + "epoch": 2.3075233388248213, + "grad_norm": 0.5222683948325819, + "learning_rate": 1.2834636807231442e-07, + "loss": 0.2789, + "step": 4202 + }, + { + "epoch": 2.3080724876441514, + "grad_norm": 0.4420205263254049, + "learning_rate": 1.2815220465244004e-07, + "loss": 0.2355, + "step": 4203 + }, + { + "epoch": 2.3086216364634815, + "grad_norm": 0.6417395145747327, + "learning_rate": 1.279581667343705e-07, + "loss": 0.3134, + "step": 4204 + }, + { + "epoch": 2.3091707852828116, + "grad_norm": 0.4976515160196446, + "learning_rate": 1.2776425438358644e-07, + "loss": 0.258, + "step": 4205 + }, + { + "epoch": 2.3097199341021417, + "grad_norm": 0.4119084196110188, + "learning_rate": 1.2757046766552583e-07, + "loss": 0.2363, + "step": 4206 + }, + { + "epoch": 2.310269082921472, + "grad_norm": 0.5520819338288431, + "learning_rate": 1.2737680664558474e-07, + "loss": 0.2691, + "step": 4207 + }, + { + "epoch": 2.310818231740802, + "grad_norm": 0.5878284488854353, + "learning_rate": 1.2718327138911692e-07, + "loss": 0.2321, + "step": 4208 + }, + { + "epoch": 2.3113673805601316, + "grad_norm": 0.4695433374773004, + "learning_rate": 1.2698986196143308e-07, + "loss": 0.2258, + "step": 4209 + }, + { + "epoch": 2.3119165293794617, + "grad_norm": 0.4606583882657425, + "learning_rate": 1.2679657842780164e-07, + "loss": 0.2397, + "step": 4210 + }, + { + "epoch": 2.312465678198792, + "grad_norm": 0.6718002381521354, + "learning_rate": 1.2660342085344904e-07, + "loss": 0.288, + "step": 4211 + }, + { + "epoch": 2.313014827018122, + "grad_norm": 0.599579526968771, + "learning_rate": 1.264103893035585e-07, + "loss": 0.2629, + "step": 4212 + }, + { + "epoch": 2.313563975837452, + "grad_norm": 0.662819334016803, + "learning_rate": 1.2621748384327125e-07, + "loss": 0.1986, + "step": 4213 + }, + { + "epoch": 2.314113124656782, + "grad_norm": 0.5326928265387822, + "learning_rate": 1.2602470453768582e-07, + "loss": 0.2403, + "step": 4214 + }, + { + "epoch": 2.314662273476112, + "grad_norm": 0.5305556106293511, + "learning_rate": 1.258320514518581e-07, + "loss": 0.2355, + "step": 4215 + }, + { + "epoch": 2.315211422295442, + "grad_norm": 0.44449672727673184, + "learning_rate": 1.256395246508012e-07, + "loss": 0.2361, + "step": 4216 + }, + { + "epoch": 2.315760571114772, + "grad_norm": 0.5026382580085097, + "learning_rate": 1.2544712419948597e-07, + "loss": 0.2485, + "step": 4217 + }, + { + "epoch": 2.316309719934102, + "grad_norm": 0.4567895108347193, + "learning_rate": 1.2525485016284066e-07, + "loss": 0.2275, + "step": 4218 + }, + { + "epoch": 2.316858868753432, + "grad_norm": 0.49035864016220915, + "learning_rate": 1.2506270260575034e-07, + "loss": 0.2487, + "step": 4219 + }, + { + "epoch": 2.3174080175727623, + "grad_norm": 0.4639398044704909, + "learning_rate": 1.2487068159305802e-07, + "loss": 0.2056, + "step": 4220 + }, + { + "epoch": 2.3179571663920924, + "grad_norm": 0.49657167218952303, + "learning_rate": 1.2467878718956345e-07, + "loss": 0.2729, + "step": 4221 + }, + { + "epoch": 2.3185063152114225, + "grad_norm": 0.5182790923584928, + "learning_rate": 1.2448701946002416e-07, + "loss": 0.2255, + "step": 4222 + }, + { + "epoch": 2.319055464030752, + "grad_norm": 0.45015121881861003, + "learning_rate": 1.2429537846915446e-07, + "loss": 0.2386, + "step": 4223 + }, + { + "epoch": 2.3196046128500822, + "grad_norm": 1.1108999488654705, + "learning_rate": 1.241038642816263e-07, + "loss": 0.2267, + "step": 4224 + }, + { + "epoch": 2.3201537616694123, + "grad_norm": 0.4938133111052092, + "learning_rate": 1.2391247696206871e-07, + "loss": 0.2223, + "step": 4225 + }, + { + "epoch": 2.3207029104887424, + "grad_norm": 0.5120856297107359, + "learning_rate": 1.237212165750678e-07, + "loss": 0.2604, + "step": 4226 + }, + { + "epoch": 2.3212520593080725, + "grad_norm": 0.4763940134692922, + "learning_rate": 1.235300831851667e-07, + "loss": 0.2249, + "step": 4227 + }, + { + "epoch": 2.3218012081274026, + "grad_norm": 0.5368401775453835, + "learning_rate": 1.2333907685686626e-07, + "loss": 0.2599, + "step": 4228 + }, + { + "epoch": 2.3223503569467328, + "grad_norm": 0.40448978934655716, + "learning_rate": 1.2314819765462365e-07, + "loss": 0.2459, + "step": 4229 + }, + { + "epoch": 2.3228995057660624, + "grad_norm": 0.5502038566417293, + "learning_rate": 1.229574456428539e-07, + "loss": 0.2701, + "step": 4230 + }, + { + "epoch": 2.3234486545853925, + "grad_norm": 0.6532108981027288, + "learning_rate": 1.2276682088592874e-07, + "loss": 0.2563, + "step": 4231 + }, + { + "epoch": 2.3239978034047226, + "grad_norm": 0.4854354415425811, + "learning_rate": 1.2257632344817694e-07, + "loss": 0.2718, + "step": 4232 + }, + { + "epoch": 2.3245469522240527, + "grad_norm": 0.45894294338083286, + "learning_rate": 1.2238595339388425e-07, + "loss": 0.2459, + "step": 4233 + }, + { + "epoch": 2.325096101043383, + "grad_norm": 0.5349834782133368, + "learning_rate": 1.221957107872937e-07, + "loss": 0.2448, + "step": 4234 + }, + { + "epoch": 2.325645249862713, + "grad_norm": 0.5371325491267095, + "learning_rate": 1.2200559569260526e-07, + "loss": 0.2224, + "step": 4235 + }, + { + "epoch": 2.326194398682043, + "grad_norm": 0.7260262052011621, + "learning_rate": 1.218156081739755e-07, + "loss": 0.2927, + "step": 4236 + }, + { + "epoch": 2.3267435475013727, + "grad_norm": 0.4847790014182052, + "learning_rate": 1.216257482955185e-07, + "loss": 0.2366, + "step": 4237 + }, + { + "epoch": 2.327292696320703, + "grad_norm": 0.4921433509960056, + "learning_rate": 1.2143601612130463e-07, + "loss": 0.259, + "step": 4238 + }, + { + "epoch": 2.327841845140033, + "grad_norm": 0.47500745214077006, + "learning_rate": 1.2124641171536192e-07, + "loss": 0.24, + "step": 4239 + }, + { + "epoch": 2.328390993959363, + "grad_norm": 0.5335704117403829, + "learning_rate": 1.2105693514167447e-07, + "loss": 0.213, + "step": 4240 + }, + { + "epoch": 2.328940142778693, + "grad_norm": 0.4364405302540622, + "learning_rate": 1.2086758646418388e-07, + "loss": 0.2679, + "step": 4241 + }, + { + "epoch": 2.329489291598023, + "grad_norm": 0.4609656074571149, + "learning_rate": 1.2067836574678852e-07, + "loss": 0.2616, + "step": 4242 + }, + { + "epoch": 2.3300384404173533, + "grad_norm": 0.5191173781151636, + "learning_rate": 1.2048927305334293e-07, + "loss": 0.2272, + "step": 4243 + }, + { + "epoch": 2.330587589236683, + "grad_norm": 0.46160447828026485, + "learning_rate": 1.203003084476592e-07, + "loss": 0.2217, + "step": 4244 + }, + { + "epoch": 2.331136738056013, + "grad_norm": 0.5488737327224031, + "learning_rate": 1.2011147199350604e-07, + "loss": 0.2557, + "step": 4245 + }, + { + "epoch": 2.331685886875343, + "grad_norm": 0.6538087947424733, + "learning_rate": 1.1992276375460852e-07, + "loss": 0.2326, + "step": 4246 + }, + { + "epoch": 2.3322350356946733, + "grad_norm": 0.4730574495749438, + "learning_rate": 1.1973418379464894e-07, + "loss": 0.2372, + "step": 4247 + }, + { + "epoch": 2.3327841845140034, + "grad_norm": 0.5177884979699207, + "learning_rate": 1.1954573217726606e-07, + "loss": 0.2656, + "step": 4248 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.5921690084074351, + "learning_rate": 1.1935740896605537e-07, + "loss": 0.2662, + "step": 4249 + }, + { + "epoch": 2.3338824821526636, + "grad_norm": 0.5201907411818603, + "learning_rate": 1.1916921422456882e-07, + "loss": 0.2207, + "step": 4250 + }, + { + "epoch": 2.3344316309719932, + "grad_norm": 0.5260692883393193, + "learning_rate": 1.1898114801631536e-07, + "loss": 0.2384, + "step": 4251 + }, + { + "epoch": 2.3349807797913233, + "grad_norm": 0.5756611759536572, + "learning_rate": 1.1879321040476047e-07, + "loss": 0.2512, + "step": 4252 + }, + { + "epoch": 2.3355299286106534, + "grad_norm": 0.4889136823495688, + "learning_rate": 1.1860540145332616e-07, + "loss": 0.2206, + "step": 4253 + }, + { + "epoch": 2.3360790774299836, + "grad_norm": 0.42958622210826153, + "learning_rate": 1.1841772122539078e-07, + "loss": 0.2953, + "step": 4254 + }, + { + "epoch": 2.3366282262493137, + "grad_norm": 0.44851739303000837, + "learning_rate": 1.1823016978428967e-07, + "loss": 0.2457, + "step": 4255 + }, + { + "epoch": 2.3371773750686438, + "grad_norm": 0.47484907975911395, + "learning_rate": 1.1804274719331467e-07, + "loss": 0.2373, + "step": 4256 + }, + { + "epoch": 2.337726523887974, + "grad_norm": 0.4319152563531555, + "learning_rate": 1.1785545351571377e-07, + "loss": 0.2239, + "step": 4257 + }, + { + "epoch": 2.3382756727073035, + "grad_norm": 0.41076212400247253, + "learning_rate": 1.1766828881469174e-07, + "loss": 0.2251, + "step": 4258 + }, + { + "epoch": 2.3388248215266336, + "grad_norm": 0.5131923747593286, + "learning_rate": 1.1748125315341012e-07, + "loss": 0.2626, + "step": 4259 + }, + { + "epoch": 2.3393739703459637, + "grad_norm": 0.4577463645190812, + "learning_rate": 1.1729434659498595e-07, + "loss": 0.2418, + "step": 4260 + }, + { + "epoch": 2.339923119165294, + "grad_norm": 0.6556110706574263, + "learning_rate": 1.1710756920249362e-07, + "loss": 0.2544, + "step": 4261 + }, + { + "epoch": 2.340472267984624, + "grad_norm": 0.5940060863553352, + "learning_rate": 1.1692092103896368e-07, + "loss": 0.2513, + "step": 4262 + }, + { + "epoch": 2.341021416803954, + "grad_norm": 0.46886447583609353, + "learning_rate": 1.1673440216738284e-07, + "loss": 0.2249, + "step": 4263 + }, + { + "epoch": 2.341570565623284, + "grad_norm": 0.5204159100522225, + "learning_rate": 1.1654801265069461e-07, + "loss": 0.2189, + "step": 4264 + }, + { + "epoch": 2.342119714442614, + "grad_norm": 0.4714101620279995, + "learning_rate": 1.1636175255179827e-07, + "loss": 0.2488, + "step": 4265 + }, + { + "epoch": 2.342668863261944, + "grad_norm": 0.46709236563612133, + "learning_rate": 1.1617562193354997e-07, + "loss": 0.2556, + "step": 4266 + }, + { + "epoch": 2.343218012081274, + "grad_norm": 0.4485807640582403, + "learning_rate": 1.1598962085876179e-07, + "loss": 0.2493, + "step": 4267 + }, + { + "epoch": 2.343767160900604, + "grad_norm": 0.4828530596839069, + "learning_rate": 1.1580374939020224e-07, + "loss": 0.2736, + "step": 4268 + }, + { + "epoch": 2.344316309719934, + "grad_norm": 0.5000389152064297, + "learning_rate": 1.1561800759059631e-07, + "loss": 0.244, + "step": 4269 + }, + { + "epoch": 2.3448654585392643, + "grad_norm": 0.4523261706748118, + "learning_rate": 1.1543239552262491e-07, + "loss": 0.2279, + "step": 4270 + }, + { + "epoch": 2.3454146073585944, + "grad_norm": 0.5014126950733165, + "learning_rate": 1.1524691324892504e-07, + "loss": 0.2257, + "step": 4271 + }, + { + "epoch": 2.345963756177924, + "grad_norm": 0.433614731458899, + "learning_rate": 1.150615608320903e-07, + "loss": 0.2479, + "step": 4272 + }, + { + "epoch": 2.346512904997254, + "grad_norm": 0.5294204203885148, + "learning_rate": 1.148763383346705e-07, + "loss": 0.2661, + "step": 4273 + }, + { + "epoch": 2.3470620538165843, + "grad_norm": 0.6317452629057796, + "learning_rate": 1.146912458191711e-07, + "loss": 0.2397, + "step": 4274 + }, + { + "epoch": 2.3476112026359144, + "grad_norm": 0.6496618431978495, + "learning_rate": 1.1450628334805424e-07, + "loss": 0.3358, + "step": 4275 + }, + { + "epoch": 2.3481603514552445, + "grad_norm": 0.5387620894767209, + "learning_rate": 1.1432145098373784e-07, + "loss": 0.2644, + "step": 4276 + }, + { + "epoch": 2.3487095002745746, + "grad_norm": 0.5069993642692732, + "learning_rate": 1.1413674878859586e-07, + "loss": 0.2929, + "step": 4277 + }, + { + "epoch": 2.3492586490939047, + "grad_norm": 0.420093953231744, + "learning_rate": 1.1395217682495869e-07, + "loss": 0.2621, + "step": 4278 + }, + { + "epoch": 2.3498077979132344, + "grad_norm": 0.5087446544423593, + "learning_rate": 1.1376773515511264e-07, + "loss": 0.276, + "step": 4279 + }, + { + "epoch": 2.3503569467325645, + "grad_norm": 0.5749831515642492, + "learning_rate": 1.1358342384129964e-07, + "loss": 0.2166, + "step": 4280 + }, + { + "epoch": 2.3509060955518946, + "grad_norm": 0.4719869175860627, + "learning_rate": 1.1339924294571836e-07, + "loss": 0.2641, + "step": 4281 + }, + { + "epoch": 2.3514552443712247, + "grad_norm": 0.400708437347147, + "learning_rate": 1.1321519253052279e-07, + "loss": 0.2407, + "step": 4282 + }, + { + "epoch": 2.3520043931905548, + "grad_norm": 0.43711396043717626, + "learning_rate": 1.1303127265782336e-07, + "loss": 0.2181, + "step": 4283 + }, + { + "epoch": 2.352553542009885, + "grad_norm": 0.54790020943926, + "learning_rate": 1.1284748338968601e-07, + "loss": 0.2371, + "step": 4284 + }, + { + "epoch": 2.3531026908292145, + "grad_norm": 0.5012709329124785, + "learning_rate": 1.1266382478813301e-07, + "loss": 0.2269, + "step": 4285 + }, + { + "epoch": 2.3536518396485446, + "grad_norm": 0.38092286863274005, + "learning_rate": 1.1248029691514248e-07, + "loss": 0.2236, + "step": 4286 + }, + { + "epoch": 2.3542009884678747, + "grad_norm": 0.47856482859649163, + "learning_rate": 1.1229689983264818e-07, + "loss": 0.2545, + "step": 4287 + }, + { + "epoch": 2.354750137287205, + "grad_norm": 0.6555891944072414, + "learning_rate": 1.121136336025397e-07, + "loss": 0.2283, + "step": 4288 + }, + { + "epoch": 2.355299286106535, + "grad_norm": 0.6117859112792935, + "learning_rate": 1.119304982866629e-07, + "loss": 0.2879, + "step": 4289 + }, + { + "epoch": 2.355848434925865, + "grad_norm": 0.5068529822453939, + "learning_rate": 1.117474939468192e-07, + "loss": 0.2465, + "step": 4290 + }, + { + "epoch": 2.3563975837451947, + "grad_norm": 0.4535440561125151, + "learning_rate": 1.1156462064476561e-07, + "loss": 0.2202, + "step": 4291 + }, + { + "epoch": 2.356946732564525, + "grad_norm": 0.5119287633543099, + "learning_rate": 1.1138187844221538e-07, + "loss": 0.2665, + "step": 4292 + }, + { + "epoch": 2.357495881383855, + "grad_norm": 0.5412579399554837, + "learning_rate": 1.1119926740083718e-07, + "loss": 0.2119, + "step": 4293 + }, + { + "epoch": 2.358045030203185, + "grad_norm": 0.38685315237206247, + "learning_rate": 1.1101678758225536e-07, + "loss": 0.2533, + "step": 4294 + }, + { + "epoch": 2.358594179022515, + "grad_norm": 0.47099203630115455, + "learning_rate": 1.1083443904805026e-07, + "loss": 0.2561, + "step": 4295 + }, + { + "epoch": 2.359143327841845, + "grad_norm": 0.6391279683496444, + "learning_rate": 1.1065222185975791e-07, + "loss": 0.2463, + "step": 4296 + }, + { + "epoch": 2.3596924766611753, + "grad_norm": 0.5486280607492743, + "learning_rate": 1.1047013607886977e-07, + "loss": 0.2534, + "step": 4297 + }, + { + "epoch": 2.360241625480505, + "grad_norm": 0.4241321222380373, + "learning_rate": 1.1028818176683295e-07, + "loss": 0.2747, + "step": 4298 + }, + { + "epoch": 2.360790774299835, + "grad_norm": 0.5938147227404399, + "learning_rate": 1.101063589850505e-07, + "loss": 0.3289, + "step": 4299 + }, + { + "epoch": 2.361339923119165, + "grad_norm": 0.5139592711795388, + "learning_rate": 1.0992466779488099e-07, + "loss": 0.2682, + "step": 4300 + }, + { + "epoch": 2.3618890719384953, + "grad_norm": 0.44562065599471057, + "learning_rate": 1.0974310825763829e-07, + "loss": 0.253, + "step": 4301 + }, + { + "epoch": 2.3624382207578254, + "grad_norm": 0.527512526247087, + "learning_rate": 1.0956168043459215e-07, + "loss": 0.2395, + "step": 4302 + }, + { + "epoch": 2.3629873695771555, + "grad_norm": 0.49398659460407146, + "learning_rate": 1.093803843869679e-07, + "loss": 0.2223, + "step": 4303 + }, + { + "epoch": 2.3635365183964856, + "grad_norm": 0.5210338927778311, + "learning_rate": 1.0919922017594612e-07, + "loss": 0.2948, + "step": 4304 + }, + { + "epoch": 2.3640856672158153, + "grad_norm": 0.49469326074013886, + "learning_rate": 1.0901818786266303e-07, + "loss": 0.2489, + "step": 4305 + }, + { + "epoch": 2.3646348160351454, + "grad_norm": 0.3976455802712967, + "learning_rate": 1.088372875082104e-07, + "loss": 0.2786, + "step": 4306 + }, + { + "epoch": 2.3651839648544755, + "grad_norm": 0.4604244835348604, + "learning_rate": 1.0865651917363561e-07, + "loss": 0.2306, + "step": 4307 + }, + { + "epoch": 2.3657331136738056, + "grad_norm": 0.5098586663669838, + "learning_rate": 1.0847588291994118e-07, + "loss": 0.2363, + "step": 4308 + }, + { + "epoch": 2.3662822624931357, + "grad_norm": 0.4770138024860164, + "learning_rate": 1.0829537880808503e-07, + "loss": 0.234, + "step": 4309 + }, + { + "epoch": 2.3668314113124658, + "grad_norm": 0.4544513169824785, + "learning_rate": 1.0811500689898097e-07, + "loss": 0.2674, + "step": 4310 + }, + { + "epoch": 2.367380560131796, + "grad_norm": 0.41911644776361084, + "learning_rate": 1.0793476725349751e-07, + "loss": 0.2227, + "step": 4311 + }, + { + "epoch": 2.3679297089511255, + "grad_norm": 0.40929455911116724, + "learning_rate": 1.0775465993245913e-07, + "loss": 0.2235, + "step": 4312 + }, + { + "epoch": 2.3684788577704556, + "grad_norm": 0.4778856964032043, + "learning_rate": 1.075746849966455e-07, + "loss": 0.2316, + "step": 4313 + }, + { + "epoch": 2.3690280065897857, + "grad_norm": 0.6766860883109573, + "learning_rate": 1.0739484250679135e-07, + "loss": 0.3291, + "step": 4314 + }, + { + "epoch": 2.369577155409116, + "grad_norm": 0.5104888238273152, + "learning_rate": 1.0721513252358688e-07, + "loss": 0.2885, + "step": 4315 + }, + { + "epoch": 2.370126304228446, + "grad_norm": 0.5276061311172372, + "learning_rate": 1.0703555510767761e-07, + "loss": 0.247, + "step": 4316 + }, + { + "epoch": 2.370675453047776, + "grad_norm": 0.45460748097813075, + "learning_rate": 1.068561103196645e-07, + "loss": 0.2388, + "step": 4317 + }, + { + "epoch": 2.371224601867106, + "grad_norm": 0.49649655949071936, + "learning_rate": 1.0667679822010326e-07, + "loss": 0.248, + "step": 4318 + }, + { + "epoch": 2.371773750686436, + "grad_norm": 0.5078286189025655, + "learning_rate": 1.0649761886950542e-07, + "loss": 0.2349, + "step": 4319 + }, + { + "epoch": 2.372322899505766, + "grad_norm": 0.4909254745863682, + "learning_rate": 1.063185723283371e-07, + "loss": 0.2359, + "step": 4320 + }, + { + "epoch": 2.372872048325096, + "grad_norm": 0.5059202788011038, + "learning_rate": 1.0613965865702015e-07, + "loss": 0.1924, + "step": 4321 + }, + { + "epoch": 2.373421197144426, + "grad_norm": 0.3974481013083046, + "learning_rate": 1.059608779159312e-07, + "loss": 0.2288, + "step": 4322 + }, + { + "epoch": 2.3739703459637562, + "grad_norm": 0.4573274318098116, + "learning_rate": 1.0578223016540212e-07, + "loss": 0.2637, + "step": 4323 + }, + { + "epoch": 2.3745194947830863, + "grad_norm": 0.45358793399633246, + "learning_rate": 1.0560371546572027e-07, + "loss": 0.2269, + "step": 4324 + }, + { + "epoch": 2.3750686436024164, + "grad_norm": 0.7731302920356223, + "learning_rate": 1.0542533387712747e-07, + "loss": 0.2488, + "step": 4325 + }, + { + "epoch": 2.375617792421746, + "grad_norm": 0.588222509849646, + "learning_rate": 1.052470854598209e-07, + "loss": 0.2218, + "step": 4326 + }, + { + "epoch": 2.376166941241076, + "grad_norm": 0.4793681619176967, + "learning_rate": 1.0506897027395312e-07, + "loss": 0.2213, + "step": 4327 + }, + { + "epoch": 2.3767160900604063, + "grad_norm": 0.5120876375606997, + "learning_rate": 1.048909883796311e-07, + "loss": 0.2425, + "step": 4328 + }, + { + "epoch": 2.3772652388797364, + "grad_norm": 0.531751945029049, + "learning_rate": 1.047131398369174e-07, + "loss": 0.2301, + "step": 4329 + }, + { + "epoch": 2.3778143876990665, + "grad_norm": 0.43461029557694114, + "learning_rate": 1.045354247058294e-07, + "loss": 0.2735, + "step": 4330 + }, + { + "epoch": 2.3783635365183966, + "grad_norm": 0.631674505967079, + "learning_rate": 1.0435784304633932e-07, + "loss": 0.2589, + "step": 4331 + }, + { + "epoch": 2.3789126853377267, + "grad_norm": 0.3914681156675498, + "learning_rate": 1.0418039491837425e-07, + "loss": 0.246, + "step": 4332 + }, + { + "epoch": 2.3794618341570564, + "grad_norm": 0.4413235226073941, + "learning_rate": 1.0400308038181661e-07, + "loss": 0.2, + "step": 4333 + }, + { + "epoch": 2.3800109829763865, + "grad_norm": 0.49201735353071807, + "learning_rate": 1.0382589949650357e-07, + "loss": 0.2672, + "step": 4334 + }, + { + "epoch": 2.3805601317957166, + "grad_norm": 0.5373055270318895, + "learning_rate": 1.0364885232222695e-07, + "loss": 0.2239, + "step": 4335 + }, + { + "epoch": 2.3811092806150467, + "grad_norm": 0.486141598801213, + "learning_rate": 1.0347193891873385e-07, + "loss": 0.198, + "step": 4336 + }, + { + "epoch": 2.381658429434377, + "grad_norm": 0.6010151478710597, + "learning_rate": 1.0329515934572584e-07, + "loss": 0.2476, + "step": 4337 + }, + { + "epoch": 2.382207578253707, + "grad_norm": 0.4759674124733145, + "learning_rate": 1.0311851366285973e-07, + "loss": 0.2297, + "step": 4338 + }, + { + "epoch": 2.382756727073037, + "grad_norm": 0.6590453929601003, + "learning_rate": 1.0294200192974665e-07, + "loss": 0.2399, + "step": 4339 + }, + { + "epoch": 2.3833058758923666, + "grad_norm": 0.44851794214500884, + "learning_rate": 1.0276562420595296e-07, + "loss": 0.2418, + "step": 4340 + }, + { + "epoch": 2.3838550247116967, + "grad_norm": 0.512476302877383, + "learning_rate": 1.0258938055099996e-07, + "loss": 0.2339, + "step": 4341 + }, + { + "epoch": 2.384404173531027, + "grad_norm": 0.4426795224852948, + "learning_rate": 1.0241327102436273e-07, + "loss": 0.2931, + "step": 4342 + }, + { + "epoch": 2.384953322350357, + "grad_norm": 0.4880026757668009, + "learning_rate": 1.0223729568547216e-07, + "loss": 0.2113, + "step": 4343 + }, + { + "epoch": 2.385502471169687, + "grad_norm": 0.5599612822022351, + "learning_rate": 1.020614545937135e-07, + "loss": 0.2274, + "step": 4344 + }, + { + "epoch": 2.386051619989017, + "grad_norm": 0.5012609443804696, + "learning_rate": 1.0188574780842638e-07, + "loss": 0.2711, + "step": 4345 + }, + { + "epoch": 2.3866007688083473, + "grad_norm": 0.5387146725539798, + "learning_rate": 1.0171017538890549e-07, + "loss": 0.2556, + "step": 4346 + }, + { + "epoch": 2.387149917627677, + "grad_norm": 0.5983525041283249, + "learning_rate": 1.0153473739440018e-07, + "loss": 0.2585, + "step": 4347 + }, + { + "epoch": 2.387699066447007, + "grad_norm": 0.5029789926137569, + "learning_rate": 1.0135943388411421e-07, + "loss": 0.208, + "step": 4348 + }, + { + "epoch": 2.388248215266337, + "grad_norm": 0.48430980156286985, + "learning_rate": 1.0118426491720595e-07, + "loss": 0.2597, + "step": 4349 + }, + { + "epoch": 2.3887973640856672, + "grad_norm": 0.4665689250511465, + "learning_rate": 1.0100923055278854e-07, + "loss": 0.2246, + "step": 4350 + }, + { + "epoch": 2.3893465129049973, + "grad_norm": 0.5592559655570383, + "learning_rate": 1.0083433084992975e-07, + "loss": 0.2664, + "step": 4351 + }, + { + "epoch": 2.3898956617243274, + "grad_norm": 0.6540075497942756, + "learning_rate": 1.0065956586765175e-07, + "loss": 0.2521, + "step": 4352 + }, + { + "epoch": 2.3904448105436575, + "grad_norm": 0.47202705992815247, + "learning_rate": 1.0048493566493109e-07, + "loss": 0.2563, + "step": 4353 + }, + { + "epoch": 2.390993959362987, + "grad_norm": 0.5767527481812633, + "learning_rate": 1.0031044030069913e-07, + "loss": 0.2849, + "step": 4354 + }, + { + "epoch": 2.3915431081823173, + "grad_norm": 0.46186204981589096, + "learning_rate": 1.0013607983384179e-07, + "loss": 0.2412, + "step": 4355 + }, + { + "epoch": 2.3920922570016474, + "grad_norm": 0.46172287020894826, + "learning_rate": 9.996185432319904e-08, + "loss": 0.2357, + "step": 4356 + }, + { + "epoch": 2.3926414058209775, + "grad_norm": 0.543553376912218, + "learning_rate": 9.978776382756572e-08, + "loss": 0.227, + "step": 4357 + }, + { + "epoch": 2.3931905546403076, + "grad_norm": 0.41105971152644444, + "learning_rate": 9.961380840569121e-08, + "loss": 0.2489, + "step": 4358 + }, + { + "epoch": 2.3937397034596377, + "grad_norm": 0.4974158564396423, + "learning_rate": 9.943998811627856e-08, + "loss": 0.2483, + "step": 4359 + }, + { + "epoch": 2.394288852278968, + "grad_norm": 0.5365922929244143, + "learning_rate": 9.926630301798592e-08, + "loss": 0.2906, + "step": 4360 + }, + { + "epoch": 2.3948380010982975, + "grad_norm": 0.4947522001725725, + "learning_rate": 9.909275316942574e-08, + "loss": 0.2398, + "step": 4361 + }, + { + "epoch": 2.3953871499176276, + "grad_norm": 0.5752732662987095, + "learning_rate": 9.891933862916449e-08, + "loss": 0.2288, + "step": 4362 + }, + { + "epoch": 2.3959362987369577, + "grad_norm": 0.3997133026869859, + "learning_rate": 9.874605945572346e-08, + "loss": 0.2253, + "step": 4363 + }, + { + "epoch": 2.396485447556288, + "grad_norm": 0.45715600577386994, + "learning_rate": 9.857291570757764e-08, + "loss": 0.2227, + "step": 4364 + }, + { + "epoch": 2.397034596375618, + "grad_norm": 0.4213770743715131, + "learning_rate": 9.839990744315699e-08, + "loss": 0.2402, + "step": 4365 + }, + { + "epoch": 2.397583745194948, + "grad_norm": 0.5166226279311078, + "learning_rate": 9.82270347208452e-08, + "loss": 0.2539, + "step": 4366 + }, + { + "epoch": 2.398132894014278, + "grad_norm": 0.452688891661523, + "learning_rate": 9.805429759898045e-08, + "loss": 0.2656, + "step": 4367 + }, + { + "epoch": 2.3986820428336078, + "grad_norm": 0.4586291992283334, + "learning_rate": 9.788169613585539e-08, + "loss": 0.2349, + "step": 4368 + }, + { + "epoch": 2.399231191652938, + "grad_norm": 0.4045984015829321, + "learning_rate": 9.770923038971654e-08, + "loss": 0.2479, + "step": 4369 + }, + { + "epoch": 2.399780340472268, + "grad_norm": 0.5521762898576388, + "learning_rate": 9.75369004187645e-08, + "loss": 0.2596, + "step": 4370 + }, + { + "epoch": 2.400329489291598, + "grad_norm": 0.419191957298065, + "learning_rate": 9.736470628115454e-08, + "loss": 0.2386, + "step": 4371 + }, + { + "epoch": 2.400878638110928, + "grad_norm": 0.4252823717825137, + "learning_rate": 9.71926480349959e-08, + "loss": 0.2214, + "step": 4372 + }, + { + "epoch": 2.4014277869302583, + "grad_norm": 0.5684931772677159, + "learning_rate": 9.70207257383516e-08, + "loss": 0.2426, + "step": 4373 + }, + { + "epoch": 2.4019769357495884, + "grad_norm": 0.4485266747106593, + "learning_rate": 9.684893944923945e-08, + "loss": 0.2338, + "step": 4374 + }, + { + "epoch": 2.402526084568918, + "grad_norm": 0.5105006286040609, + "learning_rate": 9.667728922563079e-08, + "loss": 0.2348, + "step": 4375 + }, + { + "epoch": 2.403075233388248, + "grad_norm": 0.577179913354242, + "learning_rate": 9.650577512545107e-08, + "loss": 0.2186, + "step": 4376 + }, + { + "epoch": 2.4036243822075782, + "grad_norm": 0.5205406358972823, + "learning_rate": 9.633439720658025e-08, + "loss": 0.2059, + "step": 4377 + }, + { + "epoch": 2.4041735310269083, + "grad_norm": 0.5416767468489839, + "learning_rate": 9.616315552685206e-08, + "loss": 0.2527, + "step": 4378 + }, + { + "epoch": 2.4047226798462384, + "grad_norm": 0.5383829047059119, + "learning_rate": 9.599205014405403e-08, + "loss": 0.2653, + "step": 4379 + }, + { + "epoch": 2.4052718286655685, + "grad_norm": 0.5205052997060304, + "learning_rate": 9.582108111592828e-08, + "loss": 0.2469, + "step": 4380 + }, + { + "epoch": 2.4058209774848986, + "grad_norm": 0.563233640111495, + "learning_rate": 9.565024850017018e-08, + "loss": 0.2469, + "step": 4381 + }, + { + "epoch": 2.4063701263042283, + "grad_norm": 0.4728240975570711, + "learning_rate": 9.547955235442973e-08, + "loss": 0.2205, + "step": 4382 + }, + { + "epoch": 2.4069192751235584, + "grad_norm": 0.4895681046957997, + "learning_rate": 9.530899273631037e-08, + "loss": 0.205, + "step": 4383 + }, + { + "epoch": 2.4074684239428885, + "grad_norm": 0.44901135203918346, + "learning_rate": 9.513856970336978e-08, + "loss": 0.2338, + "step": 4384 + }, + { + "epoch": 2.4080175727622186, + "grad_norm": 0.5643198771423413, + "learning_rate": 9.496828331311964e-08, + "loss": 0.2368, + "step": 4385 + }, + { + "epoch": 2.4085667215815487, + "grad_norm": 0.6805435945224101, + "learning_rate": 9.479813362302514e-08, + "loss": 0.301, + "step": 4386 + }, + { + "epoch": 2.409115870400879, + "grad_norm": 0.49187417084380314, + "learning_rate": 9.462812069050539e-08, + "loss": 0.2706, + "step": 4387 + }, + { + "epoch": 2.409665019220209, + "grad_norm": 0.4268395573369364, + "learning_rate": 9.445824457293372e-08, + "loss": 0.2374, + "step": 4388 + }, + { + "epoch": 2.4102141680395386, + "grad_norm": 0.46956514558084034, + "learning_rate": 9.428850532763706e-08, + "loss": 0.2406, + "step": 4389 + }, + { + "epoch": 2.4107633168588687, + "grad_norm": 0.5022831073057764, + "learning_rate": 9.411890301189598e-08, + "loss": 0.234, + "step": 4390 + }, + { + "epoch": 2.411312465678199, + "grad_norm": 0.3807012479632907, + "learning_rate": 9.394943768294525e-08, + "loss": 0.2192, + "step": 4391 + }, + { + "epoch": 2.411861614497529, + "grad_norm": 0.5172628021923034, + "learning_rate": 9.378010939797307e-08, + "loss": 0.2502, + "step": 4392 + }, + { + "epoch": 2.412410763316859, + "grad_norm": 0.46136877782017105, + "learning_rate": 9.361091821412134e-08, + "loss": 0.2591, + "step": 4393 + }, + { + "epoch": 2.412959912136189, + "grad_norm": 0.48927396447017557, + "learning_rate": 9.3441864188486e-08, + "loss": 0.2353, + "step": 4394 + }, + { + "epoch": 2.4135090609555188, + "grad_norm": 0.46291028976332893, + "learning_rate": 9.327294737811666e-08, + "loss": 0.2353, + "step": 4395 + }, + { + "epoch": 2.414058209774849, + "grad_norm": 0.4997470876740817, + "learning_rate": 9.31041678400164e-08, + "loss": 0.2547, + "step": 4396 + }, + { + "epoch": 2.414607358594179, + "grad_norm": 0.46639621530332454, + "learning_rate": 9.293552563114196e-08, + "loss": 0.2083, + "step": 4397 + }, + { + "epoch": 2.415156507413509, + "grad_norm": 0.49796551252809607, + "learning_rate": 9.276702080840402e-08, + "loss": 0.2186, + "step": 4398 + }, + { + "epoch": 2.415705656232839, + "grad_norm": 0.45295016261714366, + "learning_rate": 9.259865342866681e-08, + "loss": 0.2436, + "step": 4399 + }, + { + "epoch": 2.4162548050521693, + "grad_norm": 0.5644035711929897, + "learning_rate": 9.243042354874802e-08, + "loss": 0.2199, + "step": 4400 + }, + { + "epoch": 2.4162548050521693, + "eval_loss": 0.3218434751033783, + "eval_runtime": 18.8179, + "eval_samples_per_second": 23.541, + "eval_steps_per_second": 1.01, + "step": 4400 + }, + { + "epoch": 2.416803953871499, + "grad_norm": 0.4954053405517015, + "learning_rate": 9.226233122541902e-08, + "loss": 0.249, + "step": 4401 + }, + { + "epoch": 2.417353102690829, + "grad_norm": 0.5466308595919291, + "learning_rate": 9.209437651540493e-08, + "loss": 0.2358, + "step": 4402 + }, + { + "epoch": 2.417902251510159, + "grad_norm": 0.58704118126291, + "learning_rate": 9.19265594753842e-08, + "loss": 0.2745, + "step": 4403 + }, + { + "epoch": 2.4184514003294892, + "grad_norm": 0.5497120333793641, + "learning_rate": 9.175888016198873e-08, + "loss": 0.2349, + "step": 4404 + }, + { + "epoch": 2.4190005491488193, + "grad_norm": 0.4464225928039183, + "learning_rate": 9.159133863180433e-08, + "loss": 0.2645, + "step": 4405 + }, + { + "epoch": 2.4195496979681494, + "grad_norm": 0.44321208222461583, + "learning_rate": 9.142393494136993e-08, + "loss": 0.2287, + "step": 4406 + }, + { + "epoch": 2.4200988467874796, + "grad_norm": 0.6709706945677903, + "learning_rate": 9.125666914717822e-08, + "loss": 0.2722, + "step": 4407 + }, + { + "epoch": 2.420647995606809, + "grad_norm": 0.5212296637611338, + "learning_rate": 9.10895413056753e-08, + "loss": 0.2794, + "step": 4408 + }, + { + "epoch": 2.4211971444261393, + "grad_norm": 0.46196723960508035, + "learning_rate": 9.092255147326056e-08, + "loss": 0.2835, + "step": 4409 + }, + { + "epoch": 2.4217462932454694, + "grad_norm": 0.4501118265794812, + "learning_rate": 9.075569970628686e-08, + "loss": 0.2357, + "step": 4410 + }, + { + "epoch": 2.4222954420647995, + "grad_norm": 0.6753508987401065, + "learning_rate": 9.058898606106053e-08, + "loss": 0.2913, + "step": 4411 + }, + { + "epoch": 2.4228445908841296, + "grad_norm": 0.6366039990706125, + "learning_rate": 9.04224105938415e-08, + "loss": 0.2695, + "step": 4412 + }, + { + "epoch": 2.4233937397034597, + "grad_norm": 0.4195244742532176, + "learning_rate": 9.02559733608427e-08, + "loss": 0.2555, + "step": 4413 + }, + { + "epoch": 2.42394288852279, + "grad_norm": 0.524655756756528, + "learning_rate": 9.008967441823042e-08, + "loss": 0.2124, + "step": 4414 + }, + { + "epoch": 2.4244920373421195, + "grad_norm": 0.5548239347341882, + "learning_rate": 8.992351382212459e-08, + "loss": 0.2168, + "step": 4415 + }, + { + "epoch": 2.4250411861614496, + "grad_norm": 0.5828240840118025, + "learning_rate": 8.975749162859838e-08, + "loss": 0.2384, + "step": 4416 + }, + { + "epoch": 2.4255903349807797, + "grad_norm": 0.5240522025761735, + "learning_rate": 8.959160789367792e-08, + "loss": 0.2792, + "step": 4417 + }, + { + "epoch": 2.42613948380011, + "grad_norm": 0.5088922393469666, + "learning_rate": 8.942586267334307e-08, + "loss": 0.2841, + "step": 4418 + }, + { + "epoch": 2.42668863261944, + "grad_norm": 0.43944622161327423, + "learning_rate": 8.926025602352669e-08, + "loss": 0.2244, + "step": 4419 + }, + { + "epoch": 2.42723778143877, + "grad_norm": 0.4738918514059997, + "learning_rate": 8.909478800011494e-08, + "loss": 0.2439, + "step": 4420 + }, + { + "epoch": 2.4277869302581, + "grad_norm": 0.5248901979871129, + "learning_rate": 8.8929458658947e-08, + "loss": 0.2057, + "step": 4421 + }, + { + "epoch": 2.4283360790774298, + "grad_norm": 0.4695585077638441, + "learning_rate": 8.876426805581569e-08, + "loss": 0.2308, + "step": 4422 + }, + { + "epoch": 2.42888522789676, + "grad_norm": 0.4761902535846165, + "learning_rate": 8.85992162464665e-08, + "loss": 0.2591, + "step": 4423 + }, + { + "epoch": 2.42943437671609, + "grad_norm": 0.4752856968165827, + "learning_rate": 8.843430328659858e-08, + "loss": 0.256, + "step": 4424 + }, + { + "epoch": 2.42998352553542, + "grad_norm": 0.48433398470774713, + "learning_rate": 8.826952923186375e-08, + "loss": 0.2796, + "step": 4425 + }, + { + "epoch": 2.43053267435475, + "grad_norm": 0.5097786327147819, + "learning_rate": 8.810489413786743e-08, + "loss": 0.2361, + "step": 4426 + }, + { + "epoch": 2.4310818231740803, + "grad_norm": 0.49916075192925374, + "learning_rate": 8.794039806016759e-08, + "loss": 0.2463, + "step": 4427 + }, + { + "epoch": 2.4316309719934104, + "grad_norm": 0.4966817761403099, + "learning_rate": 8.777604105427581e-08, + "loss": 0.2215, + "step": 4428 + }, + { + "epoch": 2.43218012081274, + "grad_norm": 0.5574000482092913, + "learning_rate": 8.761182317565659e-08, + "loss": 0.2385, + "step": 4429 + }, + { + "epoch": 2.43272926963207, + "grad_norm": 0.4475154018773907, + "learning_rate": 8.744774447972733e-08, + "loss": 0.2652, + "step": 4430 + }, + { + "epoch": 2.4332784184514002, + "grad_norm": 0.5247334845105581, + "learning_rate": 8.728380502185838e-08, + "loss": 0.2308, + "step": 4431 + }, + { + "epoch": 2.4338275672707304, + "grad_norm": 0.5204925237892747, + "learning_rate": 8.712000485737344e-08, + "loss": 0.2527, + "step": 4432 + }, + { + "epoch": 2.4343767160900605, + "grad_norm": 0.5186614406128908, + "learning_rate": 8.695634404154914e-08, + "loss": 0.2481, + "step": 4433 + }, + { + "epoch": 2.4349258649093906, + "grad_norm": 0.42919351804571243, + "learning_rate": 8.679282262961467e-08, + "loss": 0.249, + "step": 4434 + }, + { + "epoch": 2.4354750137287207, + "grad_norm": 0.5287714075267016, + "learning_rate": 8.662944067675274e-08, + "loss": 0.2586, + "step": 4435 + }, + { + "epoch": 2.4360241625480503, + "grad_norm": 0.5175462565875877, + "learning_rate": 8.646619823809872e-08, + "loss": 0.2156, + "step": 4436 + }, + { + "epoch": 2.4365733113673804, + "grad_norm": 0.4817574722613448, + "learning_rate": 8.630309536874068e-08, + "loss": 0.2366, + "step": 4437 + }, + { + "epoch": 2.4371224601867105, + "grad_norm": 0.5957092456174624, + "learning_rate": 8.614013212372002e-08, + "loss": 0.257, + "step": 4438 + }, + { + "epoch": 2.4376716090060406, + "grad_norm": 0.5695344567779025, + "learning_rate": 8.597730855803093e-08, + "loss": 0.2312, + "step": 4439 + }, + { + "epoch": 2.4382207578253707, + "grad_norm": 0.5328666390892294, + "learning_rate": 8.581462472662011e-08, + "loss": 0.2982, + "step": 4440 + }, + { + "epoch": 2.438769906644701, + "grad_norm": 0.5243554604630376, + "learning_rate": 8.56520806843876e-08, + "loss": 0.2625, + "step": 4441 + }, + { + "epoch": 2.439319055464031, + "grad_norm": 0.4682395946228307, + "learning_rate": 8.548967648618579e-08, + "loss": 0.2602, + "step": 4442 + }, + { + "epoch": 2.4398682042833606, + "grad_norm": 0.5506235333576627, + "learning_rate": 8.53274121868204e-08, + "loss": 0.2365, + "step": 4443 + }, + { + "epoch": 2.4404173531026907, + "grad_norm": 0.5556772014264871, + "learning_rate": 8.51652878410494e-08, + "loss": 0.2424, + "step": 4444 + }, + { + "epoch": 2.440966501922021, + "grad_norm": 0.4626338906764464, + "learning_rate": 8.500330350358385e-08, + "loss": 0.2306, + "step": 4445 + }, + { + "epoch": 2.441515650741351, + "grad_norm": 0.39699238188640135, + "learning_rate": 8.484145922908761e-08, + "loss": 0.2548, + "step": 4446 + }, + { + "epoch": 2.442064799560681, + "grad_norm": 0.4829888207451511, + "learning_rate": 8.46797550721771e-08, + "loss": 0.2334, + "step": 4447 + }, + { + "epoch": 2.442613948380011, + "grad_norm": 0.5386809218348696, + "learning_rate": 8.451819108742143e-08, + "loss": 0.2187, + "step": 4448 + }, + { + "epoch": 2.443163097199341, + "grad_norm": 0.5237659146626669, + "learning_rate": 8.435676732934246e-08, + "loss": 0.2248, + "step": 4449 + }, + { + "epoch": 2.443712246018671, + "grad_norm": 0.5034164044930222, + "learning_rate": 8.419548385241503e-08, + "loss": 0.229, + "step": 4450 + }, + { + "epoch": 2.444261394838001, + "grad_norm": 0.5076426120158689, + "learning_rate": 8.403434071106605e-08, + "loss": 0.2304, + "step": 4451 + }, + { + "epoch": 2.444810543657331, + "grad_norm": 0.48753968449021995, + "learning_rate": 8.38733379596757e-08, + "loss": 0.2532, + "step": 4452 + }, + { + "epoch": 2.445359692476661, + "grad_norm": 0.523920882184603, + "learning_rate": 8.371247565257629e-08, + "loss": 0.2769, + "step": 4453 + }, + { + "epoch": 2.4459088412959913, + "grad_norm": 0.43749291452132416, + "learning_rate": 8.355175384405283e-08, + "loss": 0.2764, + "step": 4454 + }, + { + "epoch": 2.4464579901153214, + "grad_norm": 0.4863054375290364, + "learning_rate": 8.339117258834317e-08, + "loss": 0.2379, + "step": 4455 + }, + { + "epoch": 2.4470071389346515, + "grad_norm": 0.5243852634954214, + "learning_rate": 8.323073193963758e-08, + "loss": 0.2111, + "step": 4456 + }, + { + "epoch": 2.447556287753981, + "grad_norm": 0.7103775551260478, + "learning_rate": 8.307043195207887e-08, + "loss": 0.2828, + "step": 4457 + }, + { + "epoch": 2.4481054365733113, + "grad_norm": 0.5806254059960709, + "learning_rate": 8.291027267976216e-08, + "loss": 0.192, + "step": 4458 + }, + { + "epoch": 2.4486545853926414, + "grad_norm": 0.5596410581920414, + "learning_rate": 8.275025417673548e-08, + "loss": 0.3013, + "step": 4459 + }, + { + "epoch": 2.4492037342119715, + "grad_norm": 0.4114491355966605, + "learning_rate": 8.259037649699932e-08, + "loss": 0.2291, + "step": 4460 + }, + { + "epoch": 2.4497528830313016, + "grad_norm": 2.0832545864842773, + "learning_rate": 8.243063969450624e-08, + "loss": 0.256, + "step": 4461 + }, + { + "epoch": 2.4503020318506317, + "grad_norm": 0.44952515865386644, + "learning_rate": 8.22710438231616e-08, + "loss": 0.2542, + "step": 4462 + }, + { + "epoch": 2.4508511806699618, + "grad_norm": 0.4124600747021196, + "learning_rate": 8.21115889368233e-08, + "loss": 0.2271, + "step": 4463 + }, + { + "epoch": 2.4514003294892914, + "grad_norm": 0.47810416982513626, + "learning_rate": 8.195227508930136e-08, + "loss": 0.2556, + "step": 4464 + }, + { + "epoch": 2.4519494783086215, + "grad_norm": 0.4388783779008914, + "learning_rate": 8.179310233435819e-08, + "loss": 0.2416, + "step": 4465 + }, + { + "epoch": 2.4524986271279516, + "grad_norm": 0.6032316472019654, + "learning_rate": 8.163407072570892e-08, + "loss": 0.3062, + "step": 4466 + }, + { + "epoch": 2.4530477759472817, + "grad_norm": 0.4790239740382794, + "learning_rate": 8.147518031702092e-08, + "loss": 0.2549, + "step": 4467 + }, + { + "epoch": 2.453596924766612, + "grad_norm": 0.480721351821681, + "learning_rate": 8.131643116191371e-08, + "loss": 0.251, + "step": 4468 + }, + { + "epoch": 2.454146073585942, + "grad_norm": 0.4607013919199864, + "learning_rate": 8.115782331395924e-08, + "loss": 0.2279, + "step": 4469 + }, + { + "epoch": 2.454695222405272, + "grad_norm": 0.6073723304130222, + "learning_rate": 8.099935682668194e-08, + "loss": 0.2358, + "step": 4470 + }, + { + "epoch": 2.4552443712246017, + "grad_norm": 0.4751717768485519, + "learning_rate": 8.084103175355832e-08, + "loss": 0.2275, + "step": 4471 + }, + { + "epoch": 2.455793520043932, + "grad_norm": 0.47306168264036774, + "learning_rate": 8.068284814801725e-08, + "loss": 0.2093, + "step": 4472 + }, + { + "epoch": 2.456342668863262, + "grad_norm": 0.5320540279394275, + "learning_rate": 8.052480606344001e-08, + "loss": 0.2317, + "step": 4473 + }, + { + "epoch": 2.456891817682592, + "grad_norm": 0.5957291137148731, + "learning_rate": 8.036690555315996e-08, + "loss": 0.2292, + "step": 4474 + }, + { + "epoch": 2.457440966501922, + "grad_norm": 0.5102692225089511, + "learning_rate": 8.020914667046244e-08, + "loss": 0.2226, + "step": 4475 + }, + { + "epoch": 2.4579901153212522, + "grad_norm": 0.5950801030992386, + "learning_rate": 8.00515294685855e-08, + "loss": 0.2628, + "step": 4476 + }, + { + "epoch": 2.4585392641405823, + "grad_norm": 0.49189330529355263, + "learning_rate": 7.989405400071921e-08, + "loss": 0.2717, + "step": 4477 + }, + { + "epoch": 2.459088412959912, + "grad_norm": 0.4903939322800323, + "learning_rate": 7.97367203200055e-08, + "loss": 0.2349, + "step": 4478 + }, + { + "epoch": 2.459637561779242, + "grad_norm": 0.538097157514057, + "learning_rate": 7.957952847953895e-08, + "loss": 0.267, + "step": 4479 + }, + { + "epoch": 2.460186710598572, + "grad_norm": 0.5781107241347448, + "learning_rate": 7.94224785323657e-08, + "loss": 0.2512, + "step": 4480 + }, + { + "epoch": 2.4607358594179023, + "grad_norm": 0.48795582583770775, + "learning_rate": 7.926557053148471e-08, + "loss": 0.2625, + "step": 4481 + }, + { + "epoch": 2.4612850082372324, + "grad_norm": 0.6214673506142165, + "learning_rate": 7.910880452984625e-08, + "loss": 0.2736, + "step": 4482 + }, + { + "epoch": 2.4618341570565625, + "grad_norm": 0.48320027033809915, + "learning_rate": 7.895218058035325e-08, + "loss": 0.2472, + "step": 4483 + }, + { + "epoch": 2.4623833058758926, + "grad_norm": 0.47121023996135325, + "learning_rate": 7.879569873586071e-08, + "loss": 0.2093, + "step": 4484 + }, + { + "epoch": 2.4629324546952223, + "grad_norm": 0.4884511373491773, + "learning_rate": 7.863935904917524e-08, + "loss": 0.2336, + "step": 4485 + }, + { + "epoch": 2.4634816035145524, + "grad_norm": 0.4668186580042982, + "learning_rate": 7.848316157305574e-08, + "loss": 0.2444, + "step": 4486 + }, + { + "epoch": 2.4640307523338825, + "grad_norm": 0.6922833591825045, + "learning_rate": 7.832710636021325e-08, + "loss": 0.2328, + "step": 4487 + }, + { + "epoch": 2.4645799011532126, + "grad_norm": 0.5211608502495902, + "learning_rate": 7.817119346331042e-08, + "loss": 0.2208, + "step": 4488 + }, + { + "epoch": 2.4651290499725427, + "grad_norm": 0.5028986802921136, + "learning_rate": 7.801542293496228e-08, + "loss": 0.2701, + "step": 4489 + }, + { + "epoch": 2.4656781987918728, + "grad_norm": 0.49736401459194174, + "learning_rate": 7.785979482773573e-08, + "loss": 0.2339, + "step": 4490 + }, + { + "epoch": 2.466227347611203, + "grad_norm": 0.7339926705720224, + "learning_rate": 7.770430919414939e-08, + "loss": 0.2853, + "step": 4491 + }, + { + "epoch": 2.4667764964305325, + "grad_norm": 0.46659383366906854, + "learning_rate": 7.754896608667385e-08, + "loss": 0.257, + "step": 4492 + }, + { + "epoch": 2.4673256452498626, + "grad_norm": 0.5086141346490957, + "learning_rate": 7.739376555773176e-08, + "loss": 0.2006, + "step": 4493 + }, + { + "epoch": 2.4678747940691927, + "grad_norm": 0.5174660375398773, + "learning_rate": 7.723870765969778e-08, + "loss": 0.2473, + "step": 4494 + }, + { + "epoch": 2.468423942888523, + "grad_norm": 0.49977753171792233, + "learning_rate": 7.708379244489792e-08, + "loss": 0.2357, + "step": 4495 + }, + { + "epoch": 2.468973091707853, + "grad_norm": 0.48136542085855316, + "learning_rate": 7.692901996561063e-08, + "loss": 0.2814, + "step": 4496 + }, + { + "epoch": 2.469522240527183, + "grad_norm": 0.5758850227474778, + "learning_rate": 7.677439027406562e-08, + "loss": 0.2767, + "step": 4497 + }, + { + "epoch": 2.4700713893465127, + "grad_norm": 0.4671251903692303, + "learning_rate": 7.66199034224451e-08, + "loss": 0.2701, + "step": 4498 + }, + { + "epoch": 2.470620538165843, + "grad_norm": 0.5008892223899206, + "learning_rate": 7.646555946288228e-08, + "loss": 0.2265, + "step": 4499 + }, + { + "epoch": 2.471169686985173, + "grad_norm": 0.5094438666476996, + "learning_rate": 7.63113584474628e-08, + "loss": 0.2837, + "step": 4500 + }, + { + "epoch": 2.471718835804503, + "grad_norm": 0.5499901741684797, + "learning_rate": 7.615730042822398e-08, + "loss": 0.2597, + "step": 4501 + }, + { + "epoch": 2.472267984623833, + "grad_norm": 0.5113366532756419, + "learning_rate": 7.600338545715435e-08, + "loss": 0.227, + "step": 4502 + }, + { + "epoch": 2.4728171334431632, + "grad_norm": 0.43814711456255245, + "learning_rate": 7.584961358619464e-08, + "loss": 0.2647, + "step": 4503 + }, + { + "epoch": 2.473366282262493, + "grad_norm": 0.5650533464833986, + "learning_rate": 7.569598486723742e-08, + "loss": 0.2449, + "step": 4504 + }, + { + "epoch": 2.473915431081823, + "grad_norm": 0.5229518966054156, + "learning_rate": 7.55424993521265e-08, + "loss": 0.2533, + "step": 4505 + }, + { + "epoch": 2.474464579901153, + "grad_norm": 0.5845494639846216, + "learning_rate": 7.538915709265764e-08, + "loss": 0.2337, + "step": 4506 + }, + { + "epoch": 2.475013728720483, + "grad_norm": 0.47235258697002974, + "learning_rate": 7.523595814057832e-08, + "loss": 0.2312, + "step": 4507 + }, + { + "epoch": 2.4755628775398133, + "grad_norm": 0.6198207237108116, + "learning_rate": 7.508290254758745e-08, + "loss": 0.2642, + "step": 4508 + }, + { + "epoch": 2.4761120263591434, + "grad_norm": 0.5605875853866593, + "learning_rate": 7.49299903653355e-08, + "loss": 0.247, + "step": 4509 + }, + { + "epoch": 2.4766611751784735, + "grad_norm": 0.5745619828258272, + "learning_rate": 7.477722164542491e-08, + "loss": 0.2588, + "step": 4510 + }, + { + "epoch": 2.477210323997803, + "grad_norm": 0.5176779724039913, + "learning_rate": 7.46245964394095e-08, + "loss": 0.228, + "step": 4511 + }, + { + "epoch": 2.4777594728171333, + "grad_norm": 0.4248294792666142, + "learning_rate": 7.447211479879459e-08, + "loss": 0.2342, + "step": 4512 + }, + { + "epoch": 2.4783086216364634, + "grad_norm": 0.44726024423781924, + "learning_rate": 7.431977677503703e-08, + "loss": 0.2133, + "step": 4513 + }, + { + "epoch": 2.4788577704557935, + "grad_norm": 0.3952718732118721, + "learning_rate": 7.416758241954532e-08, + "loss": 0.2748, + "step": 4514 + }, + { + "epoch": 2.4794069192751236, + "grad_norm": 0.4332565378001954, + "learning_rate": 7.401553178367965e-08, + "loss": 0.2284, + "step": 4515 + }, + { + "epoch": 2.4799560680944537, + "grad_norm": 0.4947101319545579, + "learning_rate": 7.386362491875126e-08, + "loss": 0.2641, + "step": 4516 + }, + { + "epoch": 2.480505216913784, + "grad_norm": 0.49513473950212356, + "learning_rate": 7.371186187602318e-08, + "loss": 0.2427, + "step": 4517 + }, + { + "epoch": 2.4810543657331134, + "grad_norm": 0.3983030302307297, + "learning_rate": 7.356024270671012e-08, + "loss": 0.2258, + "step": 4518 + }, + { + "epoch": 2.4816035145524435, + "grad_norm": 0.4788093986685573, + "learning_rate": 7.34087674619775e-08, + "loss": 0.2549, + "step": 4519 + }, + { + "epoch": 2.4821526633717736, + "grad_norm": 0.5697330974536536, + "learning_rate": 7.32574361929429e-08, + "loss": 0.2366, + "step": 4520 + }, + { + "epoch": 2.4827018121911038, + "grad_norm": 0.440195295438078, + "learning_rate": 7.310624895067508e-08, + "loss": 0.2142, + "step": 4521 + }, + { + "epoch": 2.483250961010434, + "grad_norm": 0.9897982550390857, + "learning_rate": 7.295520578619398e-08, + "loss": 0.2756, + "step": 4522 + }, + { + "epoch": 2.483800109829764, + "grad_norm": 0.46753288558942485, + "learning_rate": 7.280430675047138e-08, + "loss": 0.2139, + "step": 4523 + }, + { + "epoch": 2.484349258649094, + "grad_norm": 0.5184665839066971, + "learning_rate": 7.265355189442982e-08, + "loss": 0.2265, + "step": 4524 + }, + { + "epoch": 2.4848984074684237, + "grad_norm": 0.4402984143153301, + "learning_rate": 7.250294126894384e-08, + "loss": 0.2255, + "step": 4525 + }, + { + "epoch": 2.485447556287754, + "grad_norm": 0.5200456503794312, + "learning_rate": 7.235247492483868e-08, + "loss": 0.2238, + "step": 4526 + }, + { + "epoch": 2.485996705107084, + "grad_norm": 0.4625367661801, + "learning_rate": 7.220215291289126e-08, + "loss": 0.2514, + "step": 4527 + }, + { + "epoch": 2.486545853926414, + "grad_norm": 0.5023040369718166, + "learning_rate": 7.205197528382993e-08, + "loss": 0.2306, + "step": 4528 + }, + { + "epoch": 2.487095002745744, + "grad_norm": 0.5331758263751909, + "learning_rate": 7.190194208833401e-08, + "loss": 0.2359, + "step": 4529 + }, + { + "epoch": 2.4876441515650742, + "grad_norm": 0.516711920347689, + "learning_rate": 7.175205337703395e-08, + "loss": 0.2352, + "step": 4530 + }, + { + "epoch": 2.4881933003844043, + "grad_norm": 0.5026366629981299, + "learning_rate": 7.160230920051183e-08, + "loss": 0.2371, + "step": 4531 + }, + { + "epoch": 2.488742449203734, + "grad_norm": 0.45576839945889447, + "learning_rate": 7.145270960930095e-08, + "loss": 0.2472, + "step": 4532 + }, + { + "epoch": 2.489291598023064, + "grad_norm": 0.6569633844071732, + "learning_rate": 7.130325465388541e-08, + "loss": 0.258, + "step": 4533 + }, + { + "epoch": 2.489840746842394, + "grad_norm": 0.5096238614233091, + "learning_rate": 7.115394438470095e-08, + "loss": 0.2183, + "step": 4534 + }, + { + "epoch": 2.4903898956617243, + "grad_norm": 0.5313299516427737, + "learning_rate": 7.100477885213421e-08, + "loss": 0.2261, + "step": 4535 + }, + { + "epoch": 2.4909390444810544, + "grad_norm": 0.474860027829337, + "learning_rate": 7.085575810652291e-08, + "loss": 0.2645, + "step": 4536 + }, + { + "epoch": 2.4914881933003845, + "grad_norm": 0.5323962353379724, + "learning_rate": 7.070688219815618e-08, + "loss": 0.2313, + "step": 4537 + }, + { + "epoch": 2.4920373421197146, + "grad_norm": 0.4636722831600837, + "learning_rate": 7.05581511772743e-08, + "loss": 0.3081, + "step": 4538 + }, + { + "epoch": 2.4925864909390443, + "grad_norm": 0.5509361312060048, + "learning_rate": 7.040956509406825e-08, + "loss": 0.235, + "step": 4539 + }, + { + "epoch": 2.4931356397583744, + "grad_norm": 0.471863275186115, + "learning_rate": 7.026112399868062e-08, + "loss": 0.2668, + "step": 4540 + }, + { + "epoch": 2.4936847885777045, + "grad_norm": 0.4683989622672536, + "learning_rate": 7.011282794120456e-08, + "loss": 0.2156, + "step": 4541 + }, + { + "epoch": 2.4942339373970346, + "grad_norm": 0.5776108222735982, + "learning_rate": 6.996467697168477e-08, + "loss": 0.2069, + "step": 4542 + }, + { + "epoch": 2.4947830862163647, + "grad_norm": 0.5306033669221574, + "learning_rate": 6.981667114011653e-08, + "loss": 0.2751, + "step": 4543 + }, + { + "epoch": 2.495332235035695, + "grad_norm": 0.5634408895800741, + "learning_rate": 6.966881049644639e-08, + "loss": 0.2308, + "step": 4544 + }, + { + "epoch": 2.495881383855025, + "grad_norm": 0.6693938448580908, + "learning_rate": 6.952109509057203e-08, + "loss": 0.2405, + "step": 4545 + }, + { + "epoch": 2.4964305326743546, + "grad_norm": 0.5068423685233001, + "learning_rate": 6.937352497234187e-08, + "loss": 0.2572, + "step": 4546 + }, + { + "epoch": 2.4969796814936847, + "grad_norm": 0.5496883168462887, + "learning_rate": 6.922610019155528e-08, + "loss": 0.2522, + "step": 4547 + }, + { + "epoch": 2.4975288303130148, + "grad_norm": 0.4884693918673925, + "learning_rate": 6.90788207979628e-08, + "loss": 0.2322, + "step": 4548 + }, + { + "epoch": 2.498077979132345, + "grad_norm": 0.5531472933888497, + "learning_rate": 6.893168684126583e-08, + "loss": 0.2279, + "step": 4549 + }, + { + "epoch": 2.498627127951675, + "grad_norm": 0.5063406033870022, + "learning_rate": 6.878469837111653e-08, + "loss": 0.2613, + "step": 4550 + }, + { + "epoch": 2.499176276771005, + "grad_norm": 0.6110378821811832, + "learning_rate": 6.863785543711825e-08, + "loss": 0.2879, + "step": 4551 + }, + { + "epoch": 2.499725425590335, + "grad_norm": 0.4579330647645518, + "learning_rate": 6.849115808882504e-08, + "loss": 0.2259, + "step": 4552 + }, + { + "epoch": 2.500274574409665, + "grad_norm": 0.46921298050231774, + "learning_rate": 6.834460637574168e-08, + "loss": 0.2349, + "step": 4553 + }, + { + "epoch": 2.500823723228995, + "grad_norm": 0.46107669884783736, + "learning_rate": 6.819820034732414e-08, + "loss": 0.2683, + "step": 4554 + }, + { + "epoch": 2.501372872048325, + "grad_norm": 0.386687713141255, + "learning_rate": 6.805194005297904e-08, + "loss": 0.2367, + "step": 4555 + }, + { + "epoch": 2.501922020867655, + "grad_norm": 0.4647654784828155, + "learning_rate": 6.790582554206392e-08, + "loss": 0.2556, + "step": 4556 + }, + { + "epoch": 2.5024711696869852, + "grad_norm": 0.5675260963879627, + "learning_rate": 6.775985686388675e-08, + "loss": 0.2563, + "step": 4557 + }, + { + "epoch": 2.5030203185063153, + "grad_norm": 0.47829580131106575, + "learning_rate": 6.761403406770683e-08, + "loss": 0.244, + "step": 4558 + }, + { + "epoch": 2.5035694673256454, + "grad_norm": 0.5103853474552995, + "learning_rate": 6.746835720273404e-08, + "loss": 0.2347, + "step": 4559 + }, + { + "epoch": 2.504118616144975, + "grad_norm": 0.5431709889759623, + "learning_rate": 6.732282631812872e-08, + "loss": 0.2478, + "step": 4560 + }, + { + "epoch": 2.504667764964305, + "grad_norm": 0.4391728531090908, + "learning_rate": 6.717744146300231e-08, + "loss": 0.2269, + "step": 4561 + }, + { + "epoch": 2.5052169137836353, + "grad_norm": 0.5637558717523288, + "learning_rate": 6.703220268641694e-08, + "loss": 0.2337, + "step": 4562 + }, + { + "epoch": 2.5057660626029654, + "grad_norm": 0.4756145350254178, + "learning_rate": 6.688711003738521e-08, + "loss": 0.2665, + "step": 4563 + }, + { + "epoch": 2.5063152114222955, + "grad_norm": 0.5487562631567913, + "learning_rate": 6.674216356487053e-08, + "loss": 0.2268, + "step": 4564 + }, + { + "epoch": 2.5068643602416256, + "grad_norm": 0.6230405643413375, + "learning_rate": 6.6597363317787e-08, + "loss": 0.2716, + "step": 4565 + }, + { + "epoch": 2.5074135090609557, + "grad_norm": 0.5576654051290235, + "learning_rate": 6.645270934499952e-08, + "loss": 0.2295, + "step": 4566 + }, + { + "epoch": 2.5079626578802854, + "grad_norm": 0.572549915909029, + "learning_rate": 6.630820169532339e-08, + "loss": 0.2414, + "step": 4567 + }, + { + "epoch": 2.5085118066996155, + "grad_norm": 0.5535643744964316, + "learning_rate": 6.616384041752447e-08, + "loss": 0.2632, + "step": 4568 + }, + { + "epoch": 2.5090609555189456, + "grad_norm": 0.44145340255293003, + "learning_rate": 6.601962556031963e-08, + "loss": 0.2191, + "step": 4569 + }, + { + "epoch": 2.5096101043382757, + "grad_norm": 0.43349912844193744, + "learning_rate": 6.587555717237578e-08, + "loss": 0.2384, + "step": 4570 + }, + { + "epoch": 2.510159253157606, + "grad_norm": 0.42865721939646434, + "learning_rate": 6.573163530231091e-08, + "loss": 0.2324, + "step": 4571 + }, + { + "epoch": 2.510708401976936, + "grad_norm": 0.5355449374415615, + "learning_rate": 6.55878599986934e-08, + "loss": 0.2199, + "step": 4572 + }, + { + "epoch": 2.511257550796266, + "grad_norm": 0.4551018733552217, + "learning_rate": 6.544423131004196e-08, + "loss": 0.2192, + "step": 4573 + }, + { + "epoch": 2.5118066996155957, + "grad_norm": 0.5076103463348527, + "learning_rate": 6.530074928482596e-08, + "loss": 0.2167, + "step": 4574 + }, + { + "epoch": 2.5123558484349258, + "grad_norm": 0.48417642715467457, + "learning_rate": 6.515741397146532e-08, + "loss": 0.2504, + "step": 4575 + }, + { + "epoch": 2.512904997254256, + "grad_norm": 0.456682360057578, + "learning_rate": 6.501422541833066e-08, + "loss": 0.2271, + "step": 4576 + }, + { + "epoch": 2.513454146073586, + "grad_norm": 0.44263984504166337, + "learning_rate": 6.487118367374251e-08, + "loss": 0.2292, + "step": 4577 + }, + { + "epoch": 2.514003294892916, + "grad_norm": 0.4579839038792415, + "learning_rate": 6.47282887859724e-08, + "loss": 0.253, + "step": 4578 + }, + { + "epoch": 2.514552443712246, + "grad_norm": 0.4514081843179174, + "learning_rate": 6.458554080324198e-08, + "loss": 0.2496, + "step": 4579 + }, + { + "epoch": 2.5151015925315763, + "grad_norm": 0.5035294651778898, + "learning_rate": 6.444293977372356e-08, + "loss": 0.2579, + "step": 4580 + }, + { + "epoch": 2.515650741350906, + "grad_norm": 0.5353868280447703, + "learning_rate": 6.430048574553959e-08, + "loss": 0.2311, + "step": 4581 + }, + { + "epoch": 2.516199890170236, + "grad_norm": 0.4951859746805475, + "learning_rate": 6.415817876676311e-08, + "loss": 0.2053, + "step": 4582 + }, + { + "epoch": 2.516749038989566, + "grad_norm": 0.564127433527795, + "learning_rate": 6.401601888541754e-08, + "loss": 0.2079, + "step": 4583 + }, + { + "epoch": 2.5172981878088962, + "grad_norm": 0.46829716616200456, + "learning_rate": 6.387400614947661e-08, + "loss": 0.2189, + "step": 4584 + }, + { + "epoch": 2.5178473366282264, + "grad_norm": 0.48694046259194307, + "learning_rate": 6.373214060686422e-08, + "loss": 0.2331, + "step": 4585 + }, + { + "epoch": 2.518396485447556, + "grad_norm": 0.49583766450853206, + "learning_rate": 6.359042230545498e-08, + "loss": 0.2606, + "step": 4586 + }, + { + "epoch": 2.5189456342668866, + "grad_norm": 0.5135448151667028, + "learning_rate": 6.344885129307338e-08, + "loss": 0.2444, + "step": 4587 + }, + { + "epoch": 2.519494783086216, + "grad_norm": 0.5133435337744062, + "learning_rate": 6.330742761749455e-08, + "loss": 0.2577, + "step": 4588 + }, + { + "epoch": 2.5200439319055463, + "grad_norm": 0.5072700573587068, + "learning_rate": 6.31661513264438e-08, + "loss": 0.2866, + "step": 4589 + }, + { + "epoch": 2.5205930807248764, + "grad_norm": 0.5292344438892093, + "learning_rate": 6.302502246759667e-08, + "loss": 0.2819, + "step": 4590 + }, + { + "epoch": 2.5211422295442065, + "grad_norm": 0.6788155184018672, + "learning_rate": 6.288404108857883e-08, + "loss": 0.2721, + "step": 4591 + }, + { + "epoch": 2.5216913783635366, + "grad_norm": 0.5735647635235057, + "learning_rate": 6.274320723696639e-08, + "loss": 0.2397, + "step": 4592 + }, + { + "epoch": 2.5222405271828663, + "grad_norm": 0.5417002077065098, + "learning_rate": 6.260252096028565e-08, + "loss": 0.2361, + "step": 4593 + }, + { + "epoch": 2.522789676002197, + "grad_norm": 0.5061394958599946, + "learning_rate": 6.246198230601283e-08, + "loss": 0.2136, + "step": 4594 + }, + { + "epoch": 2.5233388248215265, + "grad_norm": 0.41952177618486725, + "learning_rate": 6.232159132157487e-08, + "loss": 0.2506, + "step": 4595 + }, + { + "epoch": 2.5238879736408566, + "grad_norm": 0.4056545649175082, + "learning_rate": 6.21813480543482e-08, + "loss": 0.2027, + "step": 4596 + }, + { + "epoch": 2.5244371224601867, + "grad_norm": 0.4372681392455932, + "learning_rate": 6.20412525516601e-08, + "loss": 0.2481, + "step": 4597 + }, + { + "epoch": 2.524986271279517, + "grad_norm": 0.4966730329918373, + "learning_rate": 6.190130486078729e-08, + "loss": 0.2302, + "step": 4598 + }, + { + "epoch": 2.525535420098847, + "grad_norm": 0.44463476211589104, + "learning_rate": 6.176150502895718e-08, + "loss": 0.2184, + "step": 4599 + }, + { + "epoch": 2.5260845689181766, + "grad_norm": 0.4848012548769994, + "learning_rate": 6.162185310334725e-08, + "loss": 0.2305, + "step": 4600 + }, + { + "epoch": 2.5260845689181766, + "eval_loss": 0.3213394284248352, + "eval_runtime": 18.6927, + "eval_samples_per_second": 23.699, + "eval_steps_per_second": 1.016, + "step": 4600 + }, + { + "epoch": 2.526633717737507, + "grad_norm": 0.5031891557484789, + "learning_rate": 6.148234913108445e-08, + "loss": 0.273, + "step": 4601 + }, + { + "epoch": 2.5271828665568368, + "grad_norm": 0.5066358923837541, + "learning_rate": 6.134299315924644e-08, + "loss": 0.257, + "step": 4602 + }, + { + "epoch": 2.527732015376167, + "grad_norm": 0.5450729969251152, + "learning_rate": 6.120378523486075e-08, + "loss": 0.2321, + "step": 4603 + }, + { + "epoch": 2.528281164195497, + "grad_norm": 0.5417822891997048, + "learning_rate": 6.106472540490486e-08, + "loss": 0.2501, + "step": 4604 + }, + { + "epoch": 2.528830313014827, + "grad_norm": 0.46206438555126045, + "learning_rate": 6.092581371630639e-08, + "loss": 0.2099, + "step": 4605 + }, + { + "epoch": 2.529379461834157, + "grad_norm": 0.5361145957176399, + "learning_rate": 6.078705021594296e-08, + "loss": 0.2152, + "step": 4606 + }, + { + "epoch": 2.529928610653487, + "grad_norm": 0.5258311917250434, + "learning_rate": 6.064843495064211e-08, + "loss": 0.2296, + "step": 4607 + }, + { + "epoch": 2.5304777594728174, + "grad_norm": 0.6302793171276116, + "learning_rate": 6.050996796718128e-08, + "loss": 0.2523, + "step": 4608 + }, + { + "epoch": 2.531026908292147, + "grad_norm": 0.5163400990954325, + "learning_rate": 6.037164931228799e-08, + "loss": 0.2511, + "step": 4609 + }, + { + "epoch": 2.531576057111477, + "grad_norm": 0.46544326232452143, + "learning_rate": 6.023347903263991e-08, + "loss": 0.2833, + "step": 4610 + }, + { + "epoch": 2.5321252059308073, + "grad_norm": 0.5379793448658092, + "learning_rate": 6.009545717486425e-08, + "loss": 0.2336, + "step": 4611 + }, + { + "epoch": 2.5326743547501374, + "grad_norm": 0.59740990034428, + "learning_rate": 5.995758378553819e-08, + "loss": 0.2679, + "step": 4612 + }, + { + "epoch": 2.5332235035694675, + "grad_norm": 0.47440094686162837, + "learning_rate": 5.981985891118909e-08, + "loss": 0.2265, + "step": 4613 + }, + { + "epoch": 2.533772652388797, + "grad_norm": 0.5154349656381697, + "learning_rate": 5.968228259829405e-08, + "loss": 0.2215, + "step": 4614 + }, + { + "epoch": 2.5343218012081277, + "grad_norm": 0.6560178788666248, + "learning_rate": 5.9544854893279796e-08, + "loss": 0.2166, + "step": 4615 + }, + { + "epoch": 2.5348709500274573, + "grad_norm": 0.5391996347459551, + "learning_rate": 5.940757584252331e-08, + "loss": 0.2352, + "step": 4616 + }, + { + "epoch": 2.5354200988467874, + "grad_norm": 0.5210384854916712, + "learning_rate": 5.9270445492351334e-08, + "loss": 0.2268, + "step": 4617 + }, + { + "epoch": 2.5359692476661175, + "grad_norm": 0.42573991189010507, + "learning_rate": 5.913346388903995e-08, + "loss": 0.2274, + "step": 4618 + }, + { + "epoch": 2.5365183964854476, + "grad_norm": 0.6428833072681643, + "learning_rate": 5.899663107881555e-08, + "loss": 0.2418, + "step": 4619 + }, + { + "epoch": 2.5370675453047777, + "grad_norm": 0.5093942390967556, + "learning_rate": 5.885994710785435e-08, + "loss": 0.2504, + "step": 4620 + }, + { + "epoch": 2.5376166941241074, + "grad_norm": 0.5011740978857232, + "learning_rate": 5.87234120222819e-08, + "loss": 0.2806, + "step": 4621 + }, + { + "epoch": 2.538165842943438, + "grad_norm": 0.47394706730958225, + "learning_rate": 5.858702586817389e-08, + "loss": 0.2634, + "step": 4622 + }, + { + "epoch": 2.5387149917627676, + "grad_norm": 0.5010001792865919, + "learning_rate": 5.8450788691555573e-08, + "loss": 0.2732, + "step": 4623 + }, + { + "epoch": 2.5392641405820977, + "grad_norm": 0.6443936850229762, + "learning_rate": 5.831470053840211e-08, + "loss": 0.251, + "step": 4624 + }, + { + "epoch": 2.539813289401428, + "grad_norm": 0.4062283201118275, + "learning_rate": 5.8178761454637974e-08, + "loss": 0.2398, + "step": 4625 + }, + { + "epoch": 2.540362438220758, + "grad_norm": 0.4725602560706316, + "learning_rate": 5.80429714861378e-08, + "loss": 0.2129, + "step": 4626 + }, + { + "epoch": 2.540911587040088, + "grad_norm": 0.548604268347584, + "learning_rate": 5.790733067872573e-08, + "loss": 0.2901, + "step": 4627 + }, + { + "epoch": 2.5414607358594177, + "grad_norm": 0.645901346883235, + "learning_rate": 5.777183907817548e-08, + "loss": 0.2554, + "step": 4628 + }, + { + "epoch": 2.5420098846787478, + "grad_norm": 0.4765275483505522, + "learning_rate": 5.7636496730210375e-08, + "loss": 0.2305, + "step": 4629 + }, + { + "epoch": 2.542559033498078, + "grad_norm": 0.4588742452096127, + "learning_rate": 5.750130368050355e-08, + "loss": 0.2453, + "step": 4630 + }, + { + "epoch": 2.543108182317408, + "grad_norm": 0.556414516005863, + "learning_rate": 5.736625997467776e-08, + "loss": 0.2749, + "step": 4631 + }, + { + "epoch": 2.543657331136738, + "grad_norm": 0.5022395623924697, + "learning_rate": 5.723136565830512e-08, + "loss": 0.2529, + "step": 4632 + }, + { + "epoch": 2.544206479956068, + "grad_norm": 0.5592360813869147, + "learning_rate": 5.709662077690764e-08, + "loss": 0.2516, + "step": 4633 + }, + { + "epoch": 2.5447556287753983, + "grad_norm": 0.5079009995072159, + "learning_rate": 5.696202537595674e-08, + "loss": 0.2762, + "step": 4634 + }, + { + "epoch": 2.545304777594728, + "grad_norm": 0.4681104189612712, + "learning_rate": 5.6827579500873206e-08, + "loss": 0.2282, + "step": 4635 + }, + { + "epoch": 2.545853926414058, + "grad_norm": 0.4630116772608012, + "learning_rate": 5.669328319702776e-08, + "loss": 0.2606, + "step": 4636 + }, + { + "epoch": 2.546403075233388, + "grad_norm": 0.4641448645647968, + "learning_rate": 5.6559136509740504e-08, + "loss": 0.2422, + "step": 4637 + }, + { + "epoch": 2.5469522240527183, + "grad_norm": 0.4312021971699116, + "learning_rate": 5.642513948428081e-08, + "loss": 0.2585, + "step": 4638 + }, + { + "epoch": 2.5475013728720484, + "grad_norm": 0.5139453143354953, + "learning_rate": 5.629129216586796e-08, + "loss": 0.2611, + "step": 4639 + }, + { + "epoch": 2.5480505216913785, + "grad_norm": 0.5612319951795401, + "learning_rate": 5.6157594599670265e-08, + "loss": 0.2578, + "step": 4640 + }, + { + "epoch": 2.5485996705107086, + "grad_norm": 0.48901502797507146, + "learning_rate": 5.6024046830805954e-08, + "loss": 0.2813, + "step": 4641 + }, + { + "epoch": 2.5491488193300382, + "grad_norm": 0.4744618146143959, + "learning_rate": 5.589064890434224e-08, + "loss": 0.2327, + "step": 4642 + }, + { + "epoch": 2.5496979681493683, + "grad_norm": 0.39310686310390536, + "learning_rate": 5.575740086529616e-08, + "loss": 0.2264, + "step": 4643 + }, + { + "epoch": 2.5502471169686984, + "grad_norm": 0.5184786477437552, + "learning_rate": 5.562430275863407e-08, + "loss": 0.2336, + "step": 4644 + }, + { + "epoch": 2.5507962657880285, + "grad_norm": 0.5133046584505918, + "learning_rate": 5.5491354629271554e-08, + "loss": 0.2698, + "step": 4645 + }, + { + "epoch": 2.5513454146073586, + "grad_norm": 0.6326811499528321, + "learning_rate": 5.535855652207374e-08, + "loss": 0.2969, + "step": 4646 + }, + { + "epoch": 2.5518945634266887, + "grad_norm": 0.540072706259425, + "learning_rate": 5.5225908481855065e-08, + "loss": 0.2561, + "step": 4647 + }, + { + "epoch": 2.552443712246019, + "grad_norm": 0.6264760229681093, + "learning_rate": 5.5093410553379516e-08, + "loss": 0.2761, + "step": 4648 + }, + { + "epoch": 2.5529928610653485, + "grad_norm": 0.5268250905941013, + "learning_rate": 5.496106278136001e-08, + "loss": 0.2334, + "step": 4649 + }, + { + "epoch": 2.5535420098846786, + "grad_norm": 0.484754195266986, + "learning_rate": 5.4828865210459326e-08, + "loss": 0.2151, + "step": 4650 + }, + { + "epoch": 2.5540911587040087, + "grad_norm": 0.4467375174246866, + "learning_rate": 5.469681788528906e-08, + "loss": 0.2266, + "step": 4651 + }, + { + "epoch": 2.554640307523339, + "grad_norm": 0.48980020237373173, + "learning_rate": 5.456492085041034e-08, + "loss": 0.2452, + "step": 4652 + }, + { + "epoch": 2.555189456342669, + "grad_norm": 0.5705675482473022, + "learning_rate": 5.44331741503336e-08, + "loss": 0.2252, + "step": 4653 + }, + { + "epoch": 2.555738605161999, + "grad_norm": 0.43588192716967844, + "learning_rate": 5.4301577829518615e-08, + "loss": 0.2514, + "step": 4654 + }, + { + "epoch": 2.556287753981329, + "grad_norm": 0.5239870541830424, + "learning_rate": 5.417013193237414e-08, + "loss": 0.2655, + "step": 4655 + }, + { + "epoch": 2.556836902800659, + "grad_norm": 0.5352064399550495, + "learning_rate": 5.4038836503258285e-08, + "loss": 0.2694, + "step": 4656 + }, + { + "epoch": 2.557386051619989, + "grad_norm": 0.5399455872232076, + "learning_rate": 5.3907691586478485e-08, + "loss": 0.2137, + "step": 4657 + }, + { + "epoch": 2.557935200439319, + "grad_norm": 0.4376944943348027, + "learning_rate": 5.3776697226291435e-08, + "loss": 0.2221, + "step": 4658 + }, + { + "epoch": 2.558484349258649, + "grad_norm": 0.4495307972598976, + "learning_rate": 5.364585346690275e-08, + "loss": 0.2598, + "step": 4659 + }, + { + "epoch": 2.559033498077979, + "grad_norm": 0.5656563703152465, + "learning_rate": 5.3515160352467354e-08, + "loss": 0.2271, + "step": 4660 + }, + { + "epoch": 2.5595826468973093, + "grad_norm": 0.42037946863955317, + "learning_rate": 5.338461792708956e-08, + "loss": 0.219, + "step": 4661 + }, + { + "epoch": 2.5601317957166394, + "grad_norm": 0.4156499751036329, + "learning_rate": 5.325422623482255e-08, + "loss": 0.2247, + "step": 4662 + }, + { + "epoch": 2.560680944535969, + "grad_norm": 0.48706999344370866, + "learning_rate": 5.31239853196685e-08, + "loss": 0.2364, + "step": 4663 + }, + { + "epoch": 2.561230093355299, + "grad_norm": 0.5788069674855978, + "learning_rate": 5.299389522557912e-08, + "loss": 0.2436, + "step": 4664 + }, + { + "epoch": 2.5617792421746293, + "grad_norm": 0.5347028951822241, + "learning_rate": 5.2863955996455136e-08, + "loss": 0.225, + "step": 4665 + }, + { + "epoch": 2.5623283909939594, + "grad_norm": 0.6792519261978894, + "learning_rate": 5.2734167676146027e-08, + "loss": 0.2513, + "step": 4666 + }, + { + "epoch": 2.5628775398132895, + "grad_norm": 0.46016879879885514, + "learning_rate": 5.260453030845064e-08, + "loss": 0.2385, + "step": 4667 + }, + { + "epoch": 2.5634266886326196, + "grad_norm": 0.5288548738347421, + "learning_rate": 5.247504393711682e-08, + "loss": 0.2986, + "step": 4668 + }, + { + "epoch": 2.5639758374519497, + "grad_norm": 0.48035546757544517, + "learning_rate": 5.234570860584144e-08, + "loss": 0.2351, + "step": 4669 + }, + { + "epoch": 2.5645249862712793, + "grad_norm": 0.650647669868789, + "learning_rate": 5.2216524358270344e-08, + "loss": 0.257, + "step": 4670 + }, + { + "epoch": 2.5650741350906094, + "grad_norm": 0.49146587420418614, + "learning_rate": 5.208749123799865e-08, + "loss": 0.2471, + "step": 4671 + }, + { + "epoch": 2.5656232839099395, + "grad_norm": 0.4767436034446234, + "learning_rate": 5.195860928857022e-08, + "loss": 0.2227, + "step": 4672 + }, + { + "epoch": 2.5661724327292696, + "grad_norm": 0.4915045283337026, + "learning_rate": 5.182987855347775e-08, + "loss": 0.233, + "step": 4673 + }, + { + "epoch": 2.5667215815485998, + "grad_norm": 0.45810460134085823, + "learning_rate": 5.1701299076163244e-08, + "loss": 0.2387, + "step": 4674 + }, + { + "epoch": 2.56727073036793, + "grad_norm": 0.423622698497717, + "learning_rate": 5.157287090001777e-08, + "loss": 0.2157, + "step": 4675 + }, + { + "epoch": 2.56781987918726, + "grad_norm": 0.7069934117936212, + "learning_rate": 5.144459406838069e-08, + "loss": 0.2594, + "step": 4676 + }, + { + "epoch": 2.5683690280065896, + "grad_norm": 0.5397225037934974, + "learning_rate": 5.131646862454112e-08, + "loss": 0.2596, + "step": 4677 + }, + { + "epoch": 2.5689181768259197, + "grad_norm": 0.45142993986312696, + "learning_rate": 5.1188494611736313e-08, + "loss": 0.2111, + "step": 4678 + }, + { + "epoch": 2.56946732564525, + "grad_norm": 0.4386055649080123, + "learning_rate": 5.106067207315311e-08, + "loss": 0.2494, + "step": 4679 + }, + { + "epoch": 2.57001647446458, + "grad_norm": 0.5212506057678268, + "learning_rate": 5.09330010519266e-08, + "loss": 0.2512, + "step": 4680 + }, + { + "epoch": 2.57056562328391, + "grad_norm": 0.5749257055246332, + "learning_rate": 5.080548159114125e-08, + "loss": 0.24, + "step": 4681 + }, + { + "epoch": 2.57111477210324, + "grad_norm": 0.5165471932835463, + "learning_rate": 5.0678113733830195e-08, + "loss": 0.2049, + "step": 4682 + }, + { + "epoch": 2.5716639209225702, + "grad_norm": 0.6501241332802173, + "learning_rate": 5.0550897522975344e-08, + "loss": 0.2429, + "step": 4683 + }, + { + "epoch": 2.5722130697419, + "grad_norm": 0.5005552885572347, + "learning_rate": 5.04238330015074e-08, + "loss": 0.2196, + "step": 4684 + }, + { + "epoch": 2.57276221856123, + "grad_norm": 0.568246674109851, + "learning_rate": 5.029692021230605e-08, + "loss": 0.2459, + "step": 4685 + }, + { + "epoch": 2.57331136738056, + "grad_norm": 0.4251384491598092, + "learning_rate": 5.017015919819963e-08, + "loss": 0.2445, + "step": 4686 + }, + { + "epoch": 2.57386051619989, + "grad_norm": 0.5083118640870605, + "learning_rate": 5.0043550001965305e-08, + "loss": 0.2054, + "step": 4687 + }, + { + "epoch": 2.5744096650192203, + "grad_norm": 0.5184652791660596, + "learning_rate": 4.991709266632919e-08, + "loss": 0.2327, + "step": 4688 + }, + { + "epoch": 2.5749588138385504, + "grad_norm": 0.6602559013259608, + "learning_rate": 4.979078723396576e-08, + "loss": 0.2591, + "step": 4689 + }, + { + "epoch": 2.5755079626578805, + "grad_norm": 0.4260702856673771, + "learning_rate": 4.966463374749848e-08, + "loss": 0.2385, + "step": 4690 + }, + { + "epoch": 2.57605711147721, + "grad_norm": 0.4855338421035337, + "learning_rate": 4.953863224949954e-08, + "loss": 0.2511, + "step": 4691 + }, + { + "epoch": 2.5766062602965403, + "grad_norm": 0.5094669445022787, + "learning_rate": 4.9412782782489954e-08, + "loss": 0.2434, + "step": 4692 + }, + { + "epoch": 2.5771554091158704, + "grad_norm": 0.4382320243575517, + "learning_rate": 4.9287085388938994e-08, + "loss": 0.2498, + "step": 4693 + }, + { + "epoch": 2.5777045579352005, + "grad_norm": 0.7268016991155437, + "learning_rate": 4.916154011126514e-08, + "loss": 0.2322, + "step": 4694 + }, + { + "epoch": 2.5782537067545306, + "grad_norm": 0.36841152629993473, + "learning_rate": 4.9036146991835066e-08, + "loss": 0.232, + "step": 4695 + }, + { + "epoch": 2.5788028555738602, + "grad_norm": 0.4556278474869672, + "learning_rate": 4.8910906072964606e-08, + "loss": 0.2111, + "step": 4696 + }, + { + "epoch": 2.579352004393191, + "grad_norm": 0.3859393500165948, + "learning_rate": 4.8785817396917735e-08, + "loss": 0.2254, + "step": 4697 + }, + { + "epoch": 2.5799011532125204, + "grad_norm": 0.5499506639583974, + "learning_rate": 4.8660881005907347e-08, + "loss": 0.2276, + "step": 4698 + }, + { + "epoch": 2.5804503020318506, + "grad_norm": 0.4730504136293765, + "learning_rate": 4.8536096942095054e-08, + "loss": 0.2353, + "step": 4699 + }, + { + "epoch": 2.5809994508511807, + "grad_norm": 0.4406221867805422, + "learning_rate": 4.8411465247590505e-08, + "loss": 0.2538, + "step": 4700 + }, + { + "epoch": 2.5815485996705108, + "grad_norm": 0.5484447179927151, + "learning_rate": 4.828698596445252e-08, + "loss": 0.2506, + "step": 4701 + }, + { + "epoch": 2.582097748489841, + "grad_norm": 0.46067474930657737, + "learning_rate": 4.816265913468834e-08, + "loss": 0.2573, + "step": 4702 + }, + { + "epoch": 2.5826468973091705, + "grad_norm": 0.6997940313212557, + "learning_rate": 4.803848480025355e-08, + "loss": 0.2418, + "step": 4703 + }, + { + "epoch": 2.583196046128501, + "grad_norm": 0.5189259824433953, + "learning_rate": 4.7914463003052436e-08, + "loss": 0.2464, + "step": 4704 + }, + { + "epoch": 2.5837451949478307, + "grad_norm": 0.4341819655441293, + "learning_rate": 4.7790593784937875e-08, + "loss": 0.2612, + "step": 4705 + }, + { + "epoch": 2.584294343767161, + "grad_norm": 0.4773838832455015, + "learning_rate": 4.766687718771114e-08, + "loss": 0.2646, + "step": 4706 + }, + { + "epoch": 2.584843492586491, + "grad_norm": 0.5527088749553231, + "learning_rate": 4.754331325312193e-08, + "loss": 0.2632, + "step": 4707 + }, + { + "epoch": 2.585392641405821, + "grad_norm": 0.4594578033926344, + "learning_rate": 4.741990202286855e-08, + "loss": 0.2148, + "step": 4708 + }, + { + "epoch": 2.585941790225151, + "grad_norm": 0.5046396883730685, + "learning_rate": 4.729664353859786e-08, + "loss": 0.2456, + "step": 4709 + }, + { + "epoch": 2.586490939044481, + "grad_norm": 0.4518190799723593, + "learning_rate": 4.7173537841904974e-08, + "loss": 0.2155, + "step": 4710 + }, + { + "epoch": 2.5870400878638113, + "grad_norm": 0.4370217874947079, + "learning_rate": 4.7050584974333445e-08, + "loss": 0.2427, + "step": 4711 + }, + { + "epoch": 2.587589236683141, + "grad_norm": 0.4905549012073179, + "learning_rate": 4.692778497737542e-08, + "loss": 0.2397, + "step": 4712 + }, + { + "epoch": 2.588138385502471, + "grad_norm": 0.5202777342920942, + "learning_rate": 4.6805137892471515e-08, + "loss": 0.2467, + "step": 4713 + }, + { + "epoch": 2.588687534321801, + "grad_norm": 0.46490505046190356, + "learning_rate": 4.6682643761010297e-08, + "loss": 0.2185, + "step": 4714 + }, + { + "epoch": 2.5892366831411313, + "grad_norm": 0.4497853482569377, + "learning_rate": 4.656030262432923e-08, + "loss": 0.264, + "step": 4715 + }, + { + "epoch": 2.5897858319604614, + "grad_norm": 0.6418503318297589, + "learning_rate": 4.6438114523714044e-08, + "loss": 0.284, + "step": 4716 + }, + { + "epoch": 2.590334980779791, + "grad_norm": 0.649968508941332, + "learning_rate": 4.631607950039841e-08, + "loss": 0.2906, + "step": 4717 + }, + { + "epoch": 2.5908841295991216, + "grad_norm": 0.7254143065533668, + "learning_rate": 4.619419759556482e-08, + "loss": 0.3622, + "step": 4718 + }, + { + "epoch": 2.5914332784184513, + "grad_norm": 0.5590200146163397, + "learning_rate": 4.607246885034403e-08, + "loss": 0.252, + "step": 4719 + }, + { + "epoch": 2.5919824272377814, + "grad_norm": 0.5707436091864629, + "learning_rate": 4.59508933058148e-08, + "loss": 0.2312, + "step": 4720 + }, + { + "epoch": 2.5925315760571115, + "grad_norm": 0.4954821586055, + "learning_rate": 4.5829471003004586e-08, + "loss": 0.2565, + "step": 4721 + }, + { + "epoch": 2.5930807248764416, + "grad_norm": 0.45626611841852777, + "learning_rate": 4.570820198288873e-08, + "loss": 0.2607, + "step": 4722 + }, + { + "epoch": 2.5936298736957717, + "grad_norm": 0.6896882682186795, + "learning_rate": 4.5587086286391287e-08, + "loss": 0.2407, + "step": 4723 + }, + { + "epoch": 2.5941790225151014, + "grad_norm": 0.5087293344070736, + "learning_rate": 4.546612395438416e-08, + "loss": 0.218, + "step": 4724 + }, + { + "epoch": 2.594728171334432, + "grad_norm": 0.4757263257198551, + "learning_rate": 4.53453150276878e-08, + "loss": 0.2718, + "step": 4725 + }, + { + "epoch": 2.5952773201537616, + "grad_norm": 0.4343287030943466, + "learning_rate": 4.5224659547070764e-08, + "loss": 0.2257, + "step": 4726 + }, + { + "epoch": 2.5958264689730917, + "grad_norm": 0.5822430903768527, + "learning_rate": 4.510415755324978e-08, + "loss": 0.2655, + "step": 4727 + }, + { + "epoch": 2.5963756177924218, + "grad_norm": 0.494715238234534, + "learning_rate": 4.498380908688981e-08, + "loss": 0.2276, + "step": 4728 + }, + { + "epoch": 2.596924766611752, + "grad_norm": 0.4957113386494103, + "learning_rate": 4.486361418860402e-08, + "loss": 0.2247, + "step": 4729 + }, + { + "epoch": 2.597473915431082, + "grad_norm": 0.47788379642563933, + "learning_rate": 4.474357289895391e-08, + "loss": 0.2196, + "step": 4730 + }, + { + "epoch": 2.5980230642504116, + "grad_norm": 0.5128436610885865, + "learning_rate": 4.46236852584488e-08, + "loss": 0.2266, + "step": 4731 + }, + { + "epoch": 2.598572213069742, + "grad_norm": 0.5149207240709779, + "learning_rate": 4.45039513075465e-08, + "loss": 0.2008, + "step": 4732 + }, + { + "epoch": 2.599121361889072, + "grad_norm": 0.49792792860810636, + "learning_rate": 4.4384371086652805e-08, + "loss": 0.2163, + "step": 4733 + }, + { + "epoch": 2.599670510708402, + "grad_norm": 0.5099988112118381, + "learning_rate": 4.42649446361214e-08, + "loss": 0.2804, + "step": 4734 + }, + { + "epoch": 2.600219659527732, + "grad_norm": 0.5140093471043612, + "learning_rate": 4.414567199625458e-08, + "loss": 0.2403, + "step": 4735 + }, + { + "epoch": 2.600768808347062, + "grad_norm": 0.4783320401528292, + "learning_rate": 4.402655320730243e-08, + "loss": 0.2408, + "step": 4736 + }, + { + "epoch": 2.6013179571663922, + "grad_norm": 0.5582262996127252, + "learning_rate": 4.3907588309462944e-08, + "loss": 0.2571, + "step": 4737 + }, + { + "epoch": 2.601867105985722, + "grad_norm": 0.43664375354722884, + "learning_rate": 4.378877734288272e-08, + "loss": 0.2281, + "step": 4738 + }, + { + "epoch": 2.602416254805052, + "grad_norm": 0.43676963304185085, + "learning_rate": 4.367012034765573e-08, + "loss": 0.2338, + "step": 4739 + }, + { + "epoch": 2.602965403624382, + "grad_norm": 0.3930096124133771, + "learning_rate": 4.355161736382471e-08, + "loss": 0.2659, + "step": 4740 + }, + { + "epoch": 2.603514552443712, + "grad_norm": 0.48572630598438443, + "learning_rate": 4.343326843137966e-08, + "loss": 0.2497, + "step": 4741 + }, + { + "epoch": 2.6040637012630423, + "grad_norm": 0.5787132558029076, + "learning_rate": 4.3315073590259265e-08, + "loss": 0.2879, + "step": 4742 + }, + { + "epoch": 2.6046128500823724, + "grad_norm": 0.4619929764939395, + "learning_rate": 4.3197032880349886e-08, + "loss": 0.2735, + "step": 4743 + }, + { + "epoch": 2.6051619989017025, + "grad_norm": 0.6160254005259093, + "learning_rate": 4.3079146341485904e-08, + "loss": 0.2964, + "step": 4744 + }, + { + "epoch": 2.605711147721032, + "grad_norm": 0.43864630226171125, + "learning_rate": 4.2961414013449516e-08, + "loss": 0.2438, + "step": 4745 + }, + { + "epoch": 2.6062602965403623, + "grad_norm": 0.4938834227245852, + "learning_rate": 4.284383593597123e-08, + "loss": 0.2499, + "step": 4746 + }, + { + "epoch": 2.6068094453596924, + "grad_norm": 0.47847280336475584, + "learning_rate": 4.2726412148729344e-08, + "loss": 0.2195, + "step": 4747 + }, + { + "epoch": 2.6073585941790225, + "grad_norm": 0.5337392924298261, + "learning_rate": 4.2609142691349867e-08, + "loss": 0.2505, + "step": 4748 + }, + { + "epoch": 2.6079077429983526, + "grad_norm": 0.4751169068365289, + "learning_rate": 4.249202760340717e-08, + "loss": 0.2691, + "step": 4749 + }, + { + "epoch": 2.6084568918176827, + "grad_norm": 0.4896171720270328, + "learning_rate": 4.237506692442308e-08, + "loss": 0.2549, + "step": 4750 + }, + { + "epoch": 2.609006040637013, + "grad_norm": 0.42036685652941597, + "learning_rate": 4.225826069386756e-08, + "loss": 0.2641, + "step": 4751 + }, + { + "epoch": 2.6095551894563425, + "grad_norm": 0.5197911504008706, + "learning_rate": 4.2141608951158385e-08, + "loss": 0.2571, + "step": 4752 + }, + { + "epoch": 2.6101043382756726, + "grad_norm": 0.4530523762933118, + "learning_rate": 4.2025111735661376e-08, + "loss": 0.2398, + "step": 4753 + }, + { + "epoch": 2.6106534870950027, + "grad_norm": 0.5978128843061661, + "learning_rate": 4.1908769086689935e-08, + "loss": 0.2974, + "step": 4754 + }, + { + "epoch": 2.6112026359143328, + "grad_norm": 0.5174108726268848, + "learning_rate": 4.1792581043505403e-08, + "loss": 0.2148, + "step": 4755 + }, + { + "epoch": 2.611751784733663, + "grad_norm": 0.4386146138526397, + "learning_rate": 4.167654764531692e-08, + "loss": 0.2327, + "step": 4756 + }, + { + "epoch": 2.612300933552993, + "grad_norm": 0.5184461066340873, + "learning_rate": 4.156066893128165e-08, + "loss": 0.255, + "step": 4757 + }, + { + "epoch": 2.612850082372323, + "grad_norm": 0.43807923241645813, + "learning_rate": 4.144494494050422e-08, + "loss": 0.2169, + "step": 4758 + }, + { + "epoch": 2.6133992311916527, + "grad_norm": 0.4776469085295862, + "learning_rate": 4.132937571203732e-08, + "loss": 0.2354, + "step": 4759 + }, + { + "epoch": 2.613948380010983, + "grad_norm": 0.3983504999517481, + "learning_rate": 4.121396128488129e-08, + "loss": 0.2149, + "step": 4760 + }, + { + "epoch": 2.614497528830313, + "grad_norm": 0.47691603814494976, + "learning_rate": 4.1098701697984256e-08, + "loss": 0.2291, + "step": 4761 + }, + { + "epoch": 2.615046677649643, + "grad_norm": 0.8825558310205681, + "learning_rate": 4.098359699024197e-08, + "loss": 0.2106, + "step": 4762 + }, + { + "epoch": 2.615595826468973, + "grad_norm": 0.5530698801960924, + "learning_rate": 4.0868647200498155e-08, + "loss": 0.2109, + "step": 4763 + }, + { + "epoch": 2.6161449752883033, + "grad_norm": 0.43491063384156153, + "learning_rate": 4.075385236754416e-08, + "loss": 0.2567, + "step": 4764 + }, + { + "epoch": 2.6166941241076334, + "grad_norm": 0.46522762582385974, + "learning_rate": 4.063921253011891e-08, + "loss": 0.2291, + "step": 4765 + }, + { + "epoch": 2.617243272926963, + "grad_norm": 0.41670362278975864, + "learning_rate": 4.0524727726909093e-08, + "loss": 0.2355, + "step": 4766 + }, + { + "epoch": 2.617792421746293, + "grad_norm": 0.6314867618161334, + "learning_rate": 4.041039799654926e-08, + "loss": 0.2786, + "step": 4767 + }, + { + "epoch": 2.618341570565623, + "grad_norm": 0.4983037001286197, + "learning_rate": 4.029622337762135e-08, + "loss": 0.2376, + "step": 4768 + }, + { + "epoch": 2.6188907193849533, + "grad_norm": 0.4913778373908841, + "learning_rate": 4.0182203908655134e-08, + "loss": 0.2642, + "step": 4769 + }, + { + "epoch": 2.6194398682042834, + "grad_norm": 0.49607649863674436, + "learning_rate": 4.0068339628128046e-08, + "loss": 0.2653, + "step": 4770 + }, + { + "epoch": 2.6199890170236135, + "grad_norm": 0.5196374994707973, + "learning_rate": 3.9954630574465054e-08, + "loss": 0.2773, + "step": 4771 + }, + { + "epoch": 2.6205381658429436, + "grad_norm": 0.5866207636399032, + "learning_rate": 3.984107678603867e-08, + "loss": 0.2545, + "step": 4772 + }, + { + "epoch": 2.6210873146622733, + "grad_norm": 0.5520044519370948, + "learning_rate": 3.9727678301169195e-08, + "loss": 0.1979, + "step": 4773 + }, + { + "epoch": 2.6216364634816034, + "grad_norm": 0.6095799394588446, + "learning_rate": 3.961443515812452e-08, + "loss": 0.2776, + "step": 4774 + }, + { + "epoch": 2.6221856123009335, + "grad_norm": 0.5071296505031105, + "learning_rate": 3.9501347395119845e-08, + "loss": 0.261, + "step": 4775 + }, + { + "epoch": 2.6227347611202636, + "grad_norm": 0.4315683716276093, + "learning_rate": 3.938841505031834e-08, + "loss": 0.2052, + "step": 4776 + }, + { + "epoch": 2.6232839099395937, + "grad_norm": 0.4441442619975704, + "learning_rate": 3.927563816183032e-08, + "loss": 0.2209, + "step": 4777 + }, + { + "epoch": 2.623833058758924, + "grad_norm": 0.47724451620799896, + "learning_rate": 3.916301676771402e-08, + "loss": 0.222, + "step": 4778 + }, + { + "epoch": 2.624382207578254, + "grad_norm": 0.5163301849893983, + "learning_rate": 3.905055090597479e-08, + "loss": 0.2557, + "step": 4779 + }, + { + "epoch": 2.6249313563975836, + "grad_norm": 0.48395737176672, + "learning_rate": 3.8938240614565865e-08, + "loss": 0.2508, + "step": 4780 + }, + { + "epoch": 2.6254805052169137, + "grad_norm": 0.6187439087642276, + "learning_rate": 3.882608593138787e-08, + "loss": 0.2993, + "step": 4781 + }, + { + "epoch": 2.6260296540362438, + "grad_norm": 0.48989836631521816, + "learning_rate": 3.8714086894288776e-08, + "loss": 0.2502, + "step": 4782 + }, + { + "epoch": 2.626578802855574, + "grad_norm": 0.5692960320177147, + "learning_rate": 3.860224354106408e-08, + "loss": 0.2318, + "step": 4783 + }, + { + "epoch": 2.627127951674904, + "grad_norm": 0.5825133502871823, + "learning_rate": 3.8490555909456963e-08, + "loss": 0.2688, + "step": 4784 + }, + { + "epoch": 2.627677100494234, + "grad_norm": 0.6675933587545213, + "learning_rate": 3.8379024037157744e-08, + "loss": 0.248, + "step": 4785 + }, + { + "epoch": 2.628226249313564, + "grad_norm": 0.4833019451073019, + "learning_rate": 3.8267647961804316e-08, + "loss": 0.2178, + "step": 4786 + }, + { + "epoch": 2.628775398132894, + "grad_norm": 0.48857225437207225, + "learning_rate": 3.81564277209822e-08, + "loss": 0.2697, + "step": 4787 + }, + { + "epoch": 2.629324546952224, + "grad_norm": 0.43774263443358197, + "learning_rate": 3.804536335222398e-08, + "loss": 0.2222, + "step": 4788 + }, + { + "epoch": 2.629873695771554, + "grad_norm": 0.5054812119234828, + "learning_rate": 3.7934454893009723e-08, + "loss": 0.2537, + "step": 4789 + }, + { + "epoch": 2.630422844590884, + "grad_norm": 0.41473094609152666, + "learning_rate": 3.782370238076696e-08, + "loss": 0.2233, + "step": 4790 + }, + { + "epoch": 2.6309719934102143, + "grad_norm": 0.5206399353226098, + "learning_rate": 3.771310585287077e-08, + "loss": 0.2208, + "step": 4791 + }, + { + "epoch": 2.6315211422295444, + "grad_norm": 0.694570190374752, + "learning_rate": 3.7602665346643236e-08, + "loss": 0.2797, + "step": 4792 + }, + { + "epoch": 2.6320702910488745, + "grad_norm": 0.5165322160565938, + "learning_rate": 3.749238089935403e-08, + "loss": 0.2581, + "step": 4793 + }, + { + "epoch": 2.632619439868204, + "grad_norm": 0.5109998318786616, + "learning_rate": 3.738225254822e-08, + "loss": 0.223, + "step": 4794 + }, + { + "epoch": 2.6331685886875342, + "grad_norm": 1.0135762538631525, + "learning_rate": 3.7272280330405584e-08, + "loss": 0.4277, + "step": 4795 + }, + { + "epoch": 2.6337177375068643, + "grad_norm": 0.5786264229954179, + "learning_rate": 3.716246428302215e-08, + "loss": 0.2366, + "step": 4796 + }, + { + "epoch": 2.6342668863261944, + "grad_norm": 0.5173131435774923, + "learning_rate": 3.705280444312863e-08, + "loss": 0.2482, + "step": 4797 + }, + { + "epoch": 2.6348160351455245, + "grad_norm": 0.5550825757669052, + "learning_rate": 3.6943300847731443e-08, + "loss": 0.2704, + "step": 4798 + }, + { + "epoch": 2.6353651839648546, + "grad_norm": 0.4867997213383429, + "learning_rate": 3.6833953533783546e-08, + "loss": 0.2413, + "step": 4799 + }, + { + "epoch": 2.6359143327841847, + "grad_norm": 0.4490853152675713, + "learning_rate": 3.6724762538185905e-08, + "loss": 0.2229, + "step": 4800 + }, + { + "epoch": 2.6359143327841847, + "eval_loss": 0.32113179564476013, + "eval_runtime": 21.4684, + "eval_samples_per_second": 20.635, + "eval_steps_per_second": 0.885, + "step": 4800 + } + ], + "logging_steps": 1.0, + "max_steps": 5463, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 200, + "total_flos": 1249908039892992.0, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}