diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,121877 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 17405, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.0507381496559225, + "learning_rate": 3.824091778202677e-08, + "loss": 0.2942, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.8070182416007133, + "learning_rate": 7.648183556405354e-08, + "loss": 0.3872, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1.80041143859621, + "learning_rate": 1.1472275334608032e-07, + "loss": 0.5905, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 0.8120071191832767, + "learning_rate": 1.5296367112810708e-07, + "loss": 0.2618, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 0.5949392601041631, + "learning_rate": 1.9120458891013387e-07, + "loss": 0.2642, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 4.055790990377875, + "learning_rate": 2.2944550669216063e-07, + "loss": 0.7541, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 0.7613072929404066, + "learning_rate": 2.676864244741874e-07, + "loss": 0.2649, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 1.304037252213996, + "learning_rate": 3.0592734225621416e-07, + "loss": 0.5504, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 0.9141654125014961, + "learning_rate": 3.441682600382409e-07, + "loss": 0.3869, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 0.7609269558117047, + "learning_rate": 3.8240917782026774e-07, + "loss": 0.2751, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.7955995450796811, + "learning_rate": 4.206500956022945e-07, + "loss": 0.1885, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 1.8612661473785725, + "learning_rate": 4.5889101338432127e-07, + "loss": 0.585, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 0.7748274547715579, + "learning_rate": 4.97131931166348e-07, + "loss": 0.3195, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 0.8639653689032786, + "learning_rate": 5.353728489483748e-07, + "loss": 0.3883, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 2.5884579720855796, + "learning_rate": 5.736137667304016e-07, + "loss": 0.785, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 1.8703771449042337, + "learning_rate": 6.118546845124283e-07, + "loss": 0.3993, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 0.6961613428356354, + "learning_rate": 6.500956022944552e-07, + "loss": 0.2686, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 0.971276714643732, + "learning_rate": 6.883365200764818e-07, + "loss": 0.3745, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 1.4283398141820636, + "learning_rate": 7.265774378585087e-07, + "loss": 0.4011, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 0.9170881209785016, + "learning_rate": 7.648183556405355e-07, + "loss": 0.3542, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.9135336035316859, + "learning_rate": 8.030592734225621e-07, + "loss": 0.3697, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 1.0476506879512788, + "learning_rate": 8.41300191204589e-07, + "loss": 0.3371, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 0.6115452493182032, + "learning_rate": 8.795411089866157e-07, + "loss": 0.2105, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 1.67676855881491, + "learning_rate": 9.177820267686425e-07, + "loss": 0.5603, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 0.7202407335460682, + "learning_rate": 9.560229445506693e-07, + "loss": 0.3293, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.9865131037247048, + "learning_rate": 9.94263862332696e-07, + "loss": 0.41, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 2.4535881440886347, + "learning_rate": 1.0325047801147228e-06, + "loss": 0.5634, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 1.0947257185323476, + "learning_rate": 1.0707456978967496e-06, + "loss": 0.3149, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 1.0129147168499806, + "learning_rate": 1.1089866156787763e-06, + "loss": 0.3681, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 0.6576020756810448, + "learning_rate": 1.1472275334608031e-06, + "loss": 0.2731, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.9047392383713737, + "learning_rate": 1.1854684512428299e-06, + "loss": 0.3647, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 1.4541026438021523, + "learning_rate": 1.2237093690248566e-06, + "loss": 0.4479, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 0.8653286599987811, + "learning_rate": 1.2619502868068834e-06, + "loss": 0.3584, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.121457616803094, + "learning_rate": 1.3001912045889104e-06, + "loss": 0.3729, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 0.8277941941005295, + "learning_rate": 1.3384321223709371e-06, + "loss": 0.3191, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.7443085270016412, + "learning_rate": 1.3766730401529637e-06, + "loss": 0.3493, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 1.1585945317118969, + "learning_rate": 1.4149139579349905e-06, + "loss": 0.3186, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.5269750513842424, + "learning_rate": 1.4531548757170174e-06, + "loss": 0.3761, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 2.837589017463728, + "learning_rate": 1.4913957934990442e-06, + "loss": 0.7611, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 1.1361050627404332, + "learning_rate": 1.529636711281071e-06, + "loss": 0.1478, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.9717312245736287, + "learning_rate": 1.5678776290630975e-06, + "loss": 0.3778, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 1.6502427723791129, + "learning_rate": 1.6061185468451243e-06, + "loss": 0.5467, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 0.6014184946161949, + "learning_rate": 1.6443594646271512e-06, + "loss": 0.1958, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 1.7347266365889769, + "learning_rate": 1.682600382409178e-06, + "loss": 0.4705, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 0.9784708341611575, + "learning_rate": 1.7208413001912048e-06, + "loss": 0.3925, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.8155632192584802, + "learning_rate": 1.7590822179732313e-06, + "loss": 0.2767, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 1.6297697128120328, + "learning_rate": 1.7973231357552585e-06, + "loss": 0.4902, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 2.0192981344155907, + "learning_rate": 1.835564053537285e-06, + "loss": 0.5705, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 0.6176671121632008, + "learning_rate": 1.8738049713193118e-06, + "loss": 0.2417, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 1.715246719483139, + "learning_rate": 1.9120458891013386e-06, + "loss": 0.2814, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 3.0156791314075595, + "learning_rate": 1.950286806883365e-06, + "loss": 0.8249, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 1.656560840965785, + "learning_rate": 1.988527724665392e-06, + "loss": 0.5013, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 1.1577287645486485, + "learning_rate": 2.026768642447419e-06, + "loss": 0.2761, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 3.0509099785445395, + "learning_rate": 2.0650095602294456e-06, + "loss": 0.6112, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 0.9025595249195338, + "learning_rate": 2.103250478011472e-06, + "loss": 0.1648, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 0.9417125486809438, + "learning_rate": 2.141491395793499e-06, + "loss": 0.325, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 0.9816339235493463, + "learning_rate": 2.179732313575526e-06, + "loss": 0.39, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 1.7398045274330625, + "learning_rate": 2.2179732313575527e-06, + "loss": 0.495, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 1.1507513410092913, + "learning_rate": 2.2562141491395797e-06, + "loss": 0.3634, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 4.505952386635836, + "learning_rate": 2.2944550669216062e-06, + "loss": 0.7398, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 0.706667753638737, + "learning_rate": 2.332695984703633e-06, + "loss": 0.2919, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 0.702597844899516, + "learning_rate": 2.3709369024856597e-06, + "loss": 0.2103, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 7.783725003259625, + "learning_rate": 2.4091778202676867e-06, + "loss": 0.853, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 1.3779057437038236, + "learning_rate": 2.4474187380497133e-06, + "loss": 0.4548, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 2.4971913730088304, + "learning_rate": 2.4856596558317402e-06, + "loss": 0.3769, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 2.077225117758281, + "learning_rate": 2.523900573613767e-06, + "loss": 0.4513, + "step": 66 + }, + { + "epoch": 0.0, + "grad_norm": 1.7469593011509847, + "learning_rate": 2.5621414913957938e-06, + "loss": 0.1754, + "step": 67 + }, + { + "epoch": 0.0, + "grad_norm": 3.1701958149020584, + "learning_rate": 2.6003824091778207e-06, + "loss": 0.381, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 1.0667732033492074, + "learning_rate": 2.6386233269598473e-06, + "loss": 0.365, + "step": 69 + }, + { + "epoch": 0.0, + "grad_norm": 2.805103712753199, + "learning_rate": 2.6768642447418743e-06, + "loss": 0.5996, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 1.1103519370245432, + "learning_rate": 2.7151051625239004e-06, + "loss": 0.2977, + "step": 71 + }, + { + "epoch": 0.0, + "grad_norm": 0.9741205038133375, + "learning_rate": 2.7533460803059274e-06, + "loss": 0.3441, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 3.3389588104138266, + "learning_rate": 2.7915869980879544e-06, + "loss": 0.4686, + "step": 73 + }, + { + "epoch": 0.0, + "grad_norm": 1.1622466550114607, + "learning_rate": 2.829827915869981e-06, + "loss": 0.2293, + "step": 74 + }, + { + "epoch": 0.0, + "grad_norm": 1.8095204360677337, + "learning_rate": 2.868068833652008e-06, + "loss": 0.4982, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 2.216426651097327, + "learning_rate": 2.906309751434035e-06, + "loss": 0.4559, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 1.7137062684636206, + "learning_rate": 2.9445506692160614e-06, + "loss": 0.3426, + "step": 77 + }, + { + "epoch": 0.0, + "grad_norm": 3.186090472631493, + "learning_rate": 2.9827915869980884e-06, + "loss": 0.6149, + "step": 78 + }, + { + "epoch": 0.0, + "grad_norm": 1.2457594513024053, + "learning_rate": 3.021032504780115e-06, + "loss": 0.2545, + "step": 79 + }, + { + "epoch": 0.0, + "grad_norm": 1.3233143755012595, + "learning_rate": 3.059273422562142e-06, + "loss": 0.3427, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 1.9962990513631782, + "learning_rate": 3.097514340344169e-06, + "loss": 0.4061, + "step": 81 + }, + { + "epoch": 0.0, + "grad_norm": 2.1748881777328464, + "learning_rate": 3.135755258126195e-06, + "loss": 0.4804, + "step": 82 + }, + { + "epoch": 0.0, + "grad_norm": 1.1349511100419203, + "learning_rate": 3.173996175908222e-06, + "loss": 0.3407, + "step": 83 + }, + { + "epoch": 0.0, + "grad_norm": 0.8325230383368618, + "learning_rate": 3.2122370936902485e-06, + "loss": 0.2679, + "step": 84 + }, + { + "epoch": 0.0, + "grad_norm": 1.4409698819778003, + "learning_rate": 3.2504780114722755e-06, + "loss": 0.2756, + "step": 85 + }, + { + "epoch": 0.0, + "grad_norm": 0.7717083869066103, + "learning_rate": 3.2887189292543025e-06, + "loss": 0.2771, + "step": 86 + }, + { + "epoch": 0.0, + "grad_norm": 3.0396106154008105, + "learning_rate": 3.326959847036329e-06, + "loss": 0.5461, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 0.8349352537444616, + "learning_rate": 3.365200764818356e-06, + "loss": 0.3778, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 0.8634730104834349, + "learning_rate": 3.4034416826003826e-06, + "loss": 0.26, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 1.8073245544233498, + "learning_rate": 3.4416826003824095e-06, + "loss": 0.3411, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 2.7074868343906857, + "learning_rate": 3.4799235181644365e-06, + "loss": 0.7505, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 1.4009581352720348, + "learning_rate": 3.5181644359464626e-06, + "loss": 0.3037, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 2.7581392814988335, + "learning_rate": 3.5564053537284896e-06, + "loss": 0.4342, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 4.586880035707701, + "learning_rate": 3.594646271510517e-06, + "loss": 0.95, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 0.9134436895106094, + "learning_rate": 3.632887189292543e-06, + "loss": 0.1859, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 3.5368674428633557, + "learning_rate": 3.67112810707457e-06, + "loss": 0.4042, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.695940105136588, + "learning_rate": 3.7093690248565967e-06, + "loss": 0.4439, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 1.4933772907830718, + "learning_rate": 3.7476099426386236e-06, + "loss": 0.3166, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 4.929994478426711, + "learning_rate": 3.7858508604206506e-06, + "loss": 0.6366, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.0588657191516118, + "learning_rate": 3.824091778202677e-06, + "loss": 0.3998, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.8299018542448147, + "learning_rate": 3.862332695984704e-06, + "loss": 0.3014, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 0.894972655187633, + "learning_rate": 3.90057361376673e-06, + "loss": 0.152, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 3.0056612950240744, + "learning_rate": 3.938814531548758e-06, + "loss": 0.5743, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 0.9439871853559727, + "learning_rate": 3.977055449330784e-06, + "loss": 0.292, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.0959590718704717, + "learning_rate": 4.015296367112811e-06, + "loss": 0.3727, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 2.2912607966735963, + "learning_rate": 4.053537284894838e-06, + "loss": 0.7559, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.9171050201317897, + "learning_rate": 4.091778202676865e-06, + "loss": 0.314, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.6284229206544971, + "learning_rate": 4.130019120458891e-06, + "loss": 0.2165, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 2.212927916169031, + "learning_rate": 4.168260038240919e-06, + "loss": 0.6106, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 0.8099374882393748, + "learning_rate": 4.206500956022944e-06, + "loss": 0.3248, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.655364301227268, + "learning_rate": 4.244741873804972e-06, + "loss": 0.5664, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.4636213484496585, + "learning_rate": 4.282982791586998e-06, + "loss": 0.393, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 1.0063990268097915, + "learning_rate": 4.321223709369025e-06, + "loss": 0.3236, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 0.6371885352644112, + "learning_rate": 4.359464627151052e-06, + "loss": 0.1923, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 3.490580634292843, + "learning_rate": 4.397705544933079e-06, + "loss": 0.5172, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.8466201174343196, + "learning_rate": 4.435946462715105e-06, + "loss": 0.3246, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.5069924896763471, + "learning_rate": 4.474187380497133e-06, + "loss": 0.3987, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.72546443790111, + "learning_rate": 4.512428298279159e-06, + "loss": 0.6617, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 2.2270235639211564, + "learning_rate": 4.550669216061186e-06, + "loss": 0.3173, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.611271995853063, + "learning_rate": 4.5889101338432124e-06, + "loss": 0.2566, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 2.1591179991848053, + "learning_rate": 4.627151051625239e-06, + "loss": 0.4692, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 0.8409326010763974, + "learning_rate": 4.665391969407266e-06, + "loss": 0.2925, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 1.6635869940410941, + "learning_rate": 4.703632887189293e-06, + "loss": 0.4966, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.8489652670343713, + "learning_rate": 4.7418738049713195e-06, + "loss": 0.3289, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.7948026475678698, + "learning_rate": 4.780114722753346e-06, + "loss": 0.3079, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.62879378549951, + "learning_rate": 4.8183556405353734e-06, + "loss": 0.2514, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.9267329931460127, + "learning_rate": 4.8565965583174e-06, + "loss": 0.575, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.6930803429078386, + "learning_rate": 4.8948374760994265e-06, + "loss": 0.2828, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.567288278653447, + "learning_rate": 4.933078393881454e-06, + "loss": 0.5037, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 2.4508660819457764, + "learning_rate": 4.9713193116634805e-06, + "loss": 0.8513, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.7264682519312051, + "learning_rate": 5.009560229445507e-06, + "loss": 0.2465, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.826742636113808, + "learning_rate": 5.047801147227534e-06, + "loss": 0.3116, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.1392276722179755, + "learning_rate": 5.086042065009561e-06, + "loss": 0.3726, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.0376511756469218, + "learning_rate": 5.1242829827915875e-06, + "loss": 0.2853, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 2.1718548855836737, + "learning_rate": 5.162523900573614e-06, + "loss": 0.8045, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.8519765854258867, + "learning_rate": 5.2007648183556415e-06, + "loss": 0.4227, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.8055693275531942, + "learning_rate": 5.239005736137668e-06, + "loss": 0.2194, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 1.7267804162988287, + "learning_rate": 5.277246653919695e-06, + "loss": 0.4669, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 0.5457146032547153, + "learning_rate": 5.315487571701722e-06, + "loss": 0.2702, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 0.8774674725605524, + "learning_rate": 5.3537284894837486e-06, + "loss": 0.3322, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.150336747102144, + "learning_rate": 5.391969407265774e-06, + "loss": 0.3492, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 2.1991018274354253, + "learning_rate": 5.430210325047801e-06, + "loss": 0.7824, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.2228654899985612, + "learning_rate": 5.468451242829829e-06, + "loss": 0.4072, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 0.7038931034059297, + "learning_rate": 5.506692160611855e-06, + "loss": 0.3396, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.745883442870367, + "learning_rate": 5.544933078393881e-06, + "loss": 0.6713, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.5065291039901288, + "learning_rate": 5.583173996175909e-06, + "loss": 0.2111, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 0.89516023920091, + "learning_rate": 5.621414913957935e-06, + "loss": 0.3344, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 1.5215200346193276, + "learning_rate": 5.659655831739962e-06, + "loss": 0.4512, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.0561066764454057, + "learning_rate": 5.697896749521989e-06, + "loss": 0.348, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 1.7077571805921623, + "learning_rate": 5.736137667304016e-06, + "loss": 0.44, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.7032590857528236, + "learning_rate": 5.774378585086042e-06, + "loss": 0.3231, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 0.6939036942549888, + "learning_rate": 5.81261950286807e-06, + "loss": 0.2651, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 1.23769484190234, + "learning_rate": 5.850860420650096e-06, + "loss": 0.4505, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 3.57505503947449, + "learning_rate": 5.889101338432123e-06, + "loss": 0.5387, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 1.4496022682391447, + "learning_rate": 5.927342256214149e-06, + "loss": 0.4709, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.9171040848212857, + "learning_rate": 5.965583173996177e-06, + "loss": 0.3718, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.3430140069069527, + "learning_rate": 6.003824091778203e-06, + "loss": 0.2232, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 0.7080554277096157, + "learning_rate": 6.04206500956023e-06, + "loss": 0.2103, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 1.2894404724722592, + "learning_rate": 6.080305927342257e-06, + "loss": 0.3918, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 3.1163562975594825, + "learning_rate": 6.118546845124284e-06, + "loss": 0.4489, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.8601545568123237, + "learning_rate": 6.15678776290631e-06, + "loss": 0.427, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 0.9819232651964342, + "learning_rate": 6.195028680688338e-06, + "loss": 0.4107, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 1.0574104719168467, + "learning_rate": 6.233269598470364e-06, + "loss": 0.3337, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 0.7953279060143272, + "learning_rate": 6.27151051625239e-06, + "loss": 0.2119, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 1.0482843536337256, + "learning_rate": 6.3097514340344166e-06, + "loss": 0.3817, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 2.1370186037298056, + "learning_rate": 6.347992351816444e-06, + "loss": 0.6306, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 1.6915570774478592, + "learning_rate": 6.3862332695984705e-06, + "loss": 0.3499, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 0.9773007045626713, + "learning_rate": 6.424474187380497e-06, + "loss": 0.369, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 2.6951767139469633, + "learning_rate": 6.4627151051625245e-06, + "loss": 0.7238, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 0.6679450780611562, + "learning_rate": 6.500956022944551e-06, + "loss": 0.1775, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 1.2440364112811422, + "learning_rate": 6.539196940726578e-06, + "loss": 0.4651, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.4532282158517489, + "learning_rate": 6.577437858508605e-06, + "loss": 0.4368, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 1.0484087595525342, + "learning_rate": 6.6156787762906315e-06, + "loss": 0.2313, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 1.2959855414896784, + "learning_rate": 6.653919694072658e-06, + "loss": 0.3893, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 1.149727668633913, + "learning_rate": 6.6921606118546855e-06, + "loss": 0.4034, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 2.3990532753289666, + "learning_rate": 6.730401529636712e-06, + "loss": 0.7575, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 0.7216334237931397, + "learning_rate": 6.768642447418739e-06, + "loss": 0.2945, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 1.6587270290307061, + "learning_rate": 6.806883365200765e-06, + "loss": 0.5882, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 0.7186727610904583, + "learning_rate": 6.8451242829827925e-06, + "loss": 0.2788, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 0.7580278664452965, + "learning_rate": 6.883365200764819e-06, + "loss": 0.2313, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 3.3484131561448582, + "learning_rate": 6.921606118546846e-06, + "loss": 0.7148, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 1.2480514471359154, + "learning_rate": 6.959847036328873e-06, + "loss": 0.5018, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 0.853569911541321, + "learning_rate": 6.9980879541109e-06, + "loss": 0.334, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 1.5935129539425128, + "learning_rate": 7.036328871892925e-06, + "loss": 0.484, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 0.8264510735195749, + "learning_rate": 7.0745697896749535e-06, + "loss": 0.2185, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 0.9399844092870507, + "learning_rate": 7.112810707456979e-06, + "loss": 0.3235, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 0.7297204562340768, + "learning_rate": 7.151051625239006e-06, + "loss": 0.3819, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 1.2104810033539928, + "learning_rate": 7.189292543021034e-06, + "loss": 0.4596, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 0.931742528076528, + "learning_rate": 7.22753346080306e-06, + "loss": 0.4061, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 1.490075769645383, + "learning_rate": 7.265774378585086e-06, + "loss": 0.5611, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 0.8152941819336589, + "learning_rate": 7.304015296367113e-06, + "loss": 0.3674, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 0.710166833510888, + "learning_rate": 7.34225621414914e-06, + "loss": 0.2668, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 1.568404819733602, + "learning_rate": 7.380497131931167e-06, + "loss": 0.1746, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 1.2671387794567825, + "learning_rate": 7.418738049713193e-06, + "loss": 0.519, + "step": 194 + }, + { + "epoch": 0.01, + "grad_norm": 0.7997462545674416, + "learning_rate": 7.456978967495221e-06, + "loss": 0.3353, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 0.9409506841315287, + "learning_rate": 7.495219885277247e-06, + "loss": 0.3744, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 1.0262626181568872, + "learning_rate": 7.533460803059274e-06, + "loss": 0.1996, + "step": 197 + }, + { + "epoch": 0.01, + "grad_norm": 0.6961696439810705, + "learning_rate": 7.571701720841301e-06, + "loss": 0.3293, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 0.805884804921434, + "learning_rate": 7.609942638623328e-06, + "loss": 0.3308, + "step": 199 + }, + { + "epoch": 0.01, + "grad_norm": 0.94997387801345, + "learning_rate": 7.648183556405354e-06, + "loss": 0.3929, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 0.7291239386927754, + "learning_rate": 7.686424474187381e-06, + "loss": 0.3934, + "step": 201 + }, + { + "epoch": 0.01, + "grad_norm": 2.004028030890788, + "learning_rate": 7.724665391969407e-06, + "loss": 0.8334, + "step": 202 + }, + { + "epoch": 0.01, + "grad_norm": 0.7999130816704142, + "learning_rate": 7.762906309751434e-06, + "loss": 0.3051, + "step": 203 + }, + { + "epoch": 0.01, + "grad_norm": 0.5985667826912873, + "learning_rate": 7.80114722753346e-06, + "loss": 0.2605, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 1.1179195289862098, + "learning_rate": 7.839388145315489e-06, + "loss": 0.2994, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 1.0573596138813663, + "learning_rate": 7.877629063097515e-06, + "loss": 0.4686, + "step": 206 + }, + { + "epoch": 0.01, + "grad_norm": 1.024477703497844, + "learning_rate": 7.915869980879542e-06, + "loss": 0.3344, + "step": 207 + }, + { + "epoch": 0.01, + "grad_norm": 0.8805912272772863, + "learning_rate": 7.954110898661568e-06, + "loss": 0.4153, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 1.8453276392969327, + "learning_rate": 7.992351816443595e-06, + "loss": 0.3134, + "step": 209 + }, + { + "epoch": 0.01, + "grad_norm": 0.6630727990199321, + "learning_rate": 8.030592734225622e-06, + "loss": 0.278, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 1.593524418186559, + "learning_rate": 8.06883365200765e-06, + "loss": 0.3308, + "step": 211 + }, + { + "epoch": 0.01, + "grad_norm": 1.5867712659122044, + "learning_rate": 8.107074569789676e-06, + "loss": 0.5022, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 1.858433894348274, + "learning_rate": 8.145315487571703e-06, + "loss": 0.3147, + "step": 213 + }, + { + "epoch": 0.01, + "grad_norm": 1.9048694714090977, + "learning_rate": 8.18355640535373e-06, + "loss": 0.8704, + "step": 214 + }, + { + "epoch": 0.01, + "grad_norm": 0.795882586881069, + "learning_rate": 8.221797323135756e-06, + "loss": 0.3508, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 1.2363337807852626, + "learning_rate": 8.260038240917783e-06, + "loss": 0.2234, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 1.2961495565006664, + "learning_rate": 8.298279158699809e-06, + "loss": 0.3844, + "step": 217 + }, + { + "epoch": 0.01, + "grad_norm": 1.7121762384248471, + "learning_rate": 8.336520076481837e-06, + "loss": 0.6158, + "step": 218 + }, + { + "epoch": 0.01, + "grad_norm": 0.7273021014242879, + "learning_rate": 8.374760994263862e-06, + "loss": 0.2802, + "step": 219 + }, + { + "epoch": 0.01, + "grad_norm": 2.6273244509030462, + "learning_rate": 8.413001912045889e-06, + "loss": 0.7777, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 1.4955213875868227, + "learning_rate": 8.451242829827917e-06, + "loss": 0.7094, + "step": 221 + }, + { + "epoch": 0.01, + "grad_norm": 1.0844734774581914, + "learning_rate": 8.489483747609944e-06, + "loss": 0.2128, + "step": 222 + }, + { + "epoch": 0.01, + "grad_norm": 0.7637627517102057, + "learning_rate": 8.52772466539197e-06, + "loss": 0.416, + "step": 223 + }, + { + "epoch": 0.01, + "grad_norm": 1.5085068537732527, + "learning_rate": 8.565965583173997e-06, + "loss": 0.4051, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 0.6779193752161682, + "learning_rate": 8.604206500956023e-06, + "loss": 0.248, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 3.324560462924011, + "learning_rate": 8.64244741873805e-06, + "loss": 0.823, + "step": 226 + }, + { + "epoch": 0.01, + "grad_norm": 0.7247870815802524, + "learning_rate": 8.680688336520076e-06, + "loss": 0.3926, + "step": 227 + }, + { + "epoch": 0.01, + "grad_norm": 0.874007650243524, + "learning_rate": 8.718929254302105e-06, + "loss": 0.3379, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 1.4143629669283038, + "learning_rate": 8.757170172084131e-06, + "loss": 0.3663, + "step": 229 + }, + { + "epoch": 0.01, + "grad_norm": 1.164683195417105, + "learning_rate": 8.795411089866158e-06, + "loss": 0.2845, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 0.5604708590835067, + "learning_rate": 8.833652007648184e-06, + "loss": 0.3083, + "step": 231 + }, + { + "epoch": 0.01, + "grad_norm": 1.196489898146485, + "learning_rate": 8.87189292543021e-06, + "loss": 0.4396, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 2.3549774616852077, + "learning_rate": 8.910133843212237e-06, + "loss": 0.7806, + "step": 233 + }, + { + "epoch": 0.01, + "grad_norm": 0.8069917514152943, + "learning_rate": 8.948374760994266e-06, + "loss": 0.3283, + "step": 234 + }, + { + "epoch": 0.01, + "grad_norm": 0.8060819660517314, + "learning_rate": 8.986615678776292e-06, + "loss": 0.4036, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 1.6268257383806288, + "learning_rate": 9.024856596558319e-06, + "loss": 0.2297, + "step": 236 + }, + { + "epoch": 0.01, + "grad_norm": 0.8055104715664961, + "learning_rate": 9.063097514340345e-06, + "loss": 0.325, + "step": 237 + }, + { + "epoch": 0.01, + "grad_norm": 1.922017240792222, + "learning_rate": 9.101338432122372e-06, + "loss": 0.6324, + "step": 238 + }, + { + "epoch": 0.01, + "grad_norm": 1.1570165829521175, + "learning_rate": 9.139579349904398e-06, + "loss": 0.4164, + "step": 239 + }, + { + "epoch": 0.01, + "grad_norm": 1.0882648871617964, + "learning_rate": 9.177820267686425e-06, + "loss": 0.3779, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 2.0650365945242295, + "learning_rate": 9.216061185468453e-06, + "loss": 0.6302, + "step": 241 + }, + { + "epoch": 0.01, + "grad_norm": 0.7876721236295158, + "learning_rate": 9.254302103250478e-06, + "loss": 0.2161, + "step": 242 + }, + { + "epoch": 0.01, + "grad_norm": 0.877348325878266, + "learning_rate": 9.292543021032505e-06, + "loss": 0.3256, + "step": 243 + }, + { + "epoch": 0.01, + "grad_norm": 1.2294123749330266, + "learning_rate": 9.330783938814533e-06, + "loss": 0.4789, + "step": 244 + }, + { + "epoch": 0.01, + "grad_norm": 2.0693063329445263, + "learning_rate": 9.36902485659656e-06, + "loss": 0.5469, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 0.8369746366963294, + "learning_rate": 9.407265774378586e-06, + "loss": 0.342, + "step": 246 + }, + { + "epoch": 0.01, + "grad_norm": 0.9630683957243528, + "learning_rate": 9.445506692160612e-06, + "loss": 0.3704, + "step": 247 + }, + { + "epoch": 0.01, + "grad_norm": 0.5500498119249109, + "learning_rate": 9.483747609942639e-06, + "loss": 0.0887, + "step": 248 + }, + { + "epoch": 0.01, + "grad_norm": 0.7378523176182469, + "learning_rate": 9.521988527724666e-06, + "loss": 0.3357, + "step": 249 + }, + { + "epoch": 0.01, + "grad_norm": 1.2327162595508403, + "learning_rate": 9.560229445506692e-06, + "loss": 0.4636, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 0.9886430366490976, + "learning_rate": 9.59847036328872e-06, + "loss": 0.416, + "step": 251 + }, + { + "epoch": 0.01, + "grad_norm": 0.9459014328163804, + "learning_rate": 9.636711281070747e-06, + "loss": 0.3347, + "step": 252 + }, + { + "epoch": 0.01, + "grad_norm": 1.4714918256305243, + "learning_rate": 9.674952198852773e-06, + "loss": 0.6434, + "step": 253 + }, + { + "epoch": 0.01, + "grad_norm": 0.6094277496484368, + "learning_rate": 9.7131931166348e-06, + "loss": 0.2764, + "step": 254 + }, + { + "epoch": 0.01, + "grad_norm": 0.6710565760041877, + "learning_rate": 9.751434034416827e-06, + "loss": 0.242, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 2.07823406563584, + "learning_rate": 9.789674952198853e-06, + "loss": 0.8126, + "step": 256 + }, + { + "epoch": 0.01, + "grad_norm": 1.7712955553118728, + "learning_rate": 9.827915869980881e-06, + "loss": 0.5811, + "step": 257 + }, + { + "epoch": 0.01, + "grad_norm": 0.8150028793715761, + "learning_rate": 9.866156787762908e-06, + "loss": 0.2496, + "step": 258 + }, + { + "epoch": 0.01, + "grad_norm": 0.7771432411433113, + "learning_rate": 9.904397705544934e-06, + "loss": 0.3765, + "step": 259 + }, + { + "epoch": 0.01, + "grad_norm": 0.6160523614998065, + "learning_rate": 9.942638623326961e-06, + "loss": 0.2061, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 0.7046157767914728, + "learning_rate": 9.980879541108988e-06, + "loss": 0.232, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 2.149024983560915, + "learning_rate": 1.0019120458891014e-05, + "loss": 0.4943, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 1.2286307024685774, + "learning_rate": 1.005736137667304e-05, + "loss": 0.4477, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 1.097112552920304, + "learning_rate": 1.0095602294455067e-05, + "loss": 0.3751, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 0.7614533976101915, + "learning_rate": 1.0133843212237095e-05, + "loss": 0.3311, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 0.7984309540612244, + "learning_rate": 1.0172084130019122e-05, + "loss": 0.3827, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 0.564765501949692, + "learning_rate": 1.0210325047801149e-05, + "loss": 0.213, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 1.75929373969915, + "learning_rate": 1.0248565965583175e-05, + "loss": 0.3951, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 2.166353414260506, + "learning_rate": 1.0286806883365202e-05, + "loss": 0.6514, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 0.49891390595289997, + "learning_rate": 1.0325047801147228e-05, + "loss": 0.2669, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 0.7155232121133612, + "learning_rate": 1.0363288718929255e-05, + "loss": 0.3295, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 2.4276693546072683, + "learning_rate": 1.0401529636711283e-05, + "loss": 0.8387, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 0.7231598135654922, + "learning_rate": 1.043977055449331e-05, + "loss": 0.3431, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 1.7399633483574997, + "learning_rate": 1.0478011472275336e-05, + "loss": 0.4369, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 1.0433473187882643, + "learning_rate": 1.0516252390057363e-05, + "loss": 0.4318, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 0.4793959174200006, + "learning_rate": 1.055449330783939e-05, + "loss": 0.2016, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 1.1899034899997953, + "learning_rate": 1.0592734225621416e-05, + "loss": 0.4827, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 0.7622063676502636, + "learning_rate": 1.0630975143403444e-05, + "loss": 0.3748, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 1.0017165788816116, + "learning_rate": 1.066921606118547e-05, + "loss": 0.4185, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 0.8309753387936472, + "learning_rate": 1.0707456978967497e-05, + "loss": 0.4142, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 0.6895111232491048, + "learning_rate": 1.0745697896749524e-05, + "loss": 0.211, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 0.6991545205239938, + "learning_rate": 1.0783938814531549e-05, + "loss": 0.3342, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 0.8430434837260139, + "learning_rate": 1.0822179732313575e-05, + "loss": 0.3487, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 2.4751180199401044, + "learning_rate": 1.0860420650095602e-05, + "loss": 0.5592, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 1.1958526321793117, + "learning_rate": 1.0898661567877632e-05, + "loss": 0.4458, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 0.893083908512126, + "learning_rate": 1.0936902485659658e-05, + "loss": 0.3897, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 1.1128175053171323, + "learning_rate": 1.0975143403441683e-05, + "loss": 0.3729, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 0.7310765740074888, + "learning_rate": 1.101338432122371e-05, + "loss": 0.1865, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 1.146099557909946, + "learning_rate": 1.1051625239005736e-05, + "loss": 0.3986, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 0.8860784783630814, + "learning_rate": 1.1089866156787763e-05, + "loss": 0.4463, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.7447706908798476, + "learning_rate": 1.1128107074569791e-05, + "loss": 0.3275, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 1.202436710568476, + "learning_rate": 1.1166347992351817e-05, + "loss": 0.5282, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 1.7152994728714328, + "learning_rate": 1.1204588910133844e-05, + "loss": 0.7841, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 0.5435181164029416, + "learning_rate": 1.124282982791587e-05, + "loss": 0.2422, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 0.6497594633770308, + "learning_rate": 1.1281070745697897e-05, + "loss": 0.315, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 1.4737099800720082, + "learning_rate": 1.1319311663479924e-05, + "loss": 0.6034, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 1.60322244237985, + "learning_rate": 1.135755258126195e-05, + "loss": 0.3813, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 0.7002085464698281, + "learning_rate": 1.1395793499043978e-05, + "loss": 0.3664, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 0.951906463609564, + "learning_rate": 1.1434034416826005e-05, + "loss": 0.4852, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 0.44217514977603756, + "learning_rate": 1.1472275334608032e-05, + "loss": 0.1206, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 0.6930363078552553, + "learning_rate": 1.1510516252390058e-05, + "loss": 0.3479, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 1.5346406725473178, + "learning_rate": 1.1548757170172085e-05, + "loss": 0.4308, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 1.83215837021332, + "learning_rate": 1.1586998087954111e-05, + "loss": 0.5532, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 0.6617004606774994, + "learning_rate": 1.162523900573614e-05, + "loss": 0.2918, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 1.7033213061876957, + "learning_rate": 1.1663479923518166e-05, + "loss": 0.8174, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 0.5124769881897817, + "learning_rate": 1.1701720841300193e-05, + "loss": 0.2797, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 0.5573421129903319, + "learning_rate": 1.1739961759082219e-05, + "loss": 0.2435, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 1.936707015756441, + "learning_rate": 1.1778202676864246e-05, + "loss": 0.587, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 1.101229402299148, + "learning_rate": 1.1816443594646272e-05, + "loss": 0.5648, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 0.5718313616620087, + "learning_rate": 1.1854684512428299e-05, + "loss": 0.2873, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 0.9736432054583405, + "learning_rate": 1.1892925430210327e-05, + "loss": 0.4525, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 0.5300437727042384, + "learning_rate": 1.1931166347992354e-05, + "loss": 0.1757, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 0.6818941883625804, + "learning_rate": 1.196940726577438e-05, + "loss": 0.2652, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 1.0449214685551322, + "learning_rate": 1.2007648183556407e-05, + "loss": 0.4432, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 1.3218968880735233, + "learning_rate": 1.2045889101338433e-05, + "loss": 0.5656, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 0.675172558207587, + "learning_rate": 1.208413001912046e-05, + "loss": 0.3515, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 1.9376915018087, + "learning_rate": 1.2122370936902486e-05, + "loss": 0.6026, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 0.7527951764348736, + "learning_rate": 1.2160611854684515e-05, + "loss": 0.39, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 0.6271551344942625, + "learning_rate": 1.2198852772466541e-05, + "loss": 0.3249, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 0.9520118512963162, + "learning_rate": 1.2237093690248568e-05, + "loss": 0.2872, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 0.9489835344276659, + "learning_rate": 1.2275334608030594e-05, + "loss": 0.4268, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 0.7608542124414743, + "learning_rate": 1.231357552581262e-05, + "loss": 0.3252, + "step": 322 + }, + { + "epoch": 0.02, + "grad_norm": 4.06054704906878, + "learning_rate": 1.2351816443594646e-05, + "loss": 0.692, + "step": 323 + }, + { + "epoch": 0.02, + "grad_norm": 1.0376904064881367, + "learning_rate": 1.2390057361376676e-05, + "loss": 0.4878, + "step": 324 + }, + { + "epoch": 0.02, + "grad_norm": 0.7936001862726354, + "learning_rate": 1.2428298279158702e-05, + "loss": 0.3846, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 0.7655996437533465, + "learning_rate": 1.2466539196940729e-05, + "loss": 0.2969, + "step": 326 + }, + { + "epoch": 0.02, + "grad_norm": 1.6075196309967965, + "learning_rate": 1.2504780114722753e-05, + "loss": 0.322, + "step": 327 + }, + { + "epoch": 0.02, + "grad_norm": 0.8333976096028658, + "learning_rate": 1.254302103250478e-05, + "loss": 0.3605, + "step": 328 + }, + { + "epoch": 0.02, + "grad_norm": 2.395249474296435, + "learning_rate": 1.2581261950286807e-05, + "loss": 0.8674, + "step": 329 + }, + { + "epoch": 0.02, + "grad_norm": 0.9426907172159265, + "learning_rate": 1.2619502868068833e-05, + "loss": 0.3545, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 0.6597426619503529, + "learning_rate": 1.2657743785850863e-05, + "loss": 0.3462, + "step": 331 + }, + { + "epoch": 0.02, + "grad_norm": 0.9277589366404783, + "learning_rate": 1.2695984703632888e-05, + "loss": 0.4374, + "step": 332 + }, + { + "epoch": 0.02, + "grad_norm": 0.49886983704319643, + "learning_rate": 1.2734225621414914e-05, + "loss": 0.2108, + "step": 333 + }, + { + "epoch": 0.02, + "grad_norm": 0.6541306009678697, + "learning_rate": 1.2772466539196941e-05, + "loss": 0.3265, + "step": 334 + }, + { + "epoch": 0.02, + "grad_norm": 1.7330589458418397, + "learning_rate": 1.2810707456978968e-05, + "loss": 0.765, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 1.9292808339282705, + "learning_rate": 1.2848948374760994e-05, + "loss": 0.5198, + "step": 336 + }, + { + "epoch": 0.02, + "grad_norm": 0.8238173474277146, + "learning_rate": 1.2887189292543022e-05, + "loss": 0.313, + "step": 337 + }, + { + "epoch": 0.02, + "grad_norm": 0.7391319260856145, + "learning_rate": 1.2925430210325049e-05, + "loss": 0.3138, + "step": 338 + }, + { + "epoch": 0.02, + "grad_norm": 0.8080684350177465, + "learning_rate": 1.2963671128107076e-05, + "loss": 0.3086, + "step": 339 + }, + { + "epoch": 0.02, + "grad_norm": 0.7345914153185755, + "learning_rate": 1.3001912045889102e-05, + "loss": 0.2706, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 1.9255928748149223, + "learning_rate": 1.3040152963671129e-05, + "loss": 0.7527, + "step": 341 + }, + { + "epoch": 0.02, + "grad_norm": 1.2808433824506942, + "learning_rate": 1.3078393881453155e-05, + "loss": 0.4384, + "step": 342 + }, + { + "epoch": 0.02, + "grad_norm": 0.7058183166356123, + "learning_rate": 1.3116634799235182e-05, + "loss": 0.3182, + "step": 343 + }, + { + "epoch": 0.02, + "grad_norm": 2.226462284510022, + "learning_rate": 1.315487571701721e-05, + "loss": 0.7395, + "step": 344 + }, + { + "epoch": 0.02, + "grad_norm": 0.6742925329478396, + "learning_rate": 1.3193116634799237e-05, + "loss": 0.2795, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 0.7439587116812453, + "learning_rate": 1.3231357552581263e-05, + "loss": 0.2749, + "step": 346 + }, + { + "epoch": 0.02, + "grad_norm": 1.4361449975745808, + "learning_rate": 1.326959847036329e-05, + "loss": 0.7106, + "step": 347 + }, + { + "epoch": 0.02, + "grad_norm": 1.2439492090019573, + "learning_rate": 1.3307839388145316e-05, + "loss": 0.589, + "step": 348 + }, + { + "epoch": 0.02, + "grad_norm": 0.7435609340200998, + "learning_rate": 1.3346080305927343e-05, + "loss": 0.2519, + "step": 349 + }, + { + "epoch": 0.02, + "grad_norm": 0.7689761834441273, + "learning_rate": 1.3384321223709371e-05, + "loss": 0.3878, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 0.604279488549852, + "learning_rate": 1.3422562141491398e-05, + "loss": 0.2728, + "step": 351 + }, + { + "epoch": 0.02, + "grad_norm": 0.6789890550766914, + "learning_rate": 1.3460803059273424e-05, + "loss": 0.3179, + "step": 352 + }, + { + "epoch": 0.02, + "grad_norm": 1.115636152118617, + "learning_rate": 1.349904397705545e-05, + "loss": 0.4138, + "step": 353 + }, + { + "epoch": 0.02, + "grad_norm": 0.6232254061695315, + "learning_rate": 1.3537284894837477e-05, + "loss": 0.3321, + "step": 354 + }, + { + "epoch": 0.02, + "grad_norm": 0.6478378710624862, + "learning_rate": 1.3575525812619504e-05, + "loss": 0.2918, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 0.6760264548742129, + "learning_rate": 1.361376673040153e-05, + "loss": 0.2258, + "step": 356 + }, + { + "epoch": 0.02, + "grad_norm": 0.8021332779117495, + "learning_rate": 1.3652007648183559e-05, + "loss": 0.4408, + "step": 357 + }, + { + "epoch": 0.02, + "grad_norm": 0.6232210579121872, + "learning_rate": 1.3690248565965585e-05, + "loss": 0.3153, + "step": 358 + }, + { + "epoch": 0.02, + "grad_norm": 1.737205410962497, + "learning_rate": 1.3728489483747612e-05, + "loss": 0.5192, + "step": 359 + }, + { + "epoch": 0.02, + "grad_norm": 0.882470126979341, + "learning_rate": 1.3766730401529638e-05, + "loss": 0.428, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 0.6041133303431246, + "learning_rate": 1.3804971319311665e-05, + "loss": 0.3065, + "step": 361 + }, + { + "epoch": 0.02, + "grad_norm": 0.8041421990760149, + "learning_rate": 1.3843212237093691e-05, + "loss": 0.3698, + "step": 362 + }, + { + "epoch": 0.02, + "grad_norm": 1.7265607031963566, + "learning_rate": 1.388145315487572e-05, + "loss": 0.7857, + "step": 363 + }, + { + "epoch": 0.02, + "grad_norm": 0.8204545366840523, + "learning_rate": 1.3919694072657746e-05, + "loss": 0.3723, + "step": 364 + }, + { + "epoch": 0.02, + "grad_norm": 1.8724560011222986, + "learning_rate": 1.3957934990439773e-05, + "loss": 0.579, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 0.49392242007190124, + "learning_rate": 1.39961759082218e-05, + "loss": 0.2412, + "step": 366 + }, + { + "epoch": 0.02, + "grad_norm": 0.6500233155836509, + "learning_rate": 1.4034416826003826e-05, + "loss": 0.3158, + "step": 367 + }, + { + "epoch": 0.02, + "grad_norm": 1.0068899962526328, + "learning_rate": 1.407265774378585e-05, + "loss": 0.3929, + "step": 368 + }, + { + "epoch": 0.02, + "grad_norm": 1.1908103503569025, + "learning_rate": 1.4110898661567877e-05, + "loss": 0.3623, + "step": 369 + }, + { + "epoch": 0.02, + "grad_norm": 0.6938542623801782, + "learning_rate": 1.4149139579349907e-05, + "loss": 0.3063, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 2.1507116005912006, + "learning_rate": 1.4187380497131934e-05, + "loss": 0.5147, + "step": 371 + }, + { + "epoch": 0.02, + "grad_norm": 0.7129151557497264, + "learning_rate": 1.4225621414913958e-05, + "loss": 0.1442, + "step": 372 + }, + { + "epoch": 0.02, + "grad_norm": 0.5561459063539664, + "learning_rate": 1.4263862332695985e-05, + "loss": 0.3316, + "step": 373 + }, + { + "epoch": 0.02, + "grad_norm": 0.7795311581768043, + "learning_rate": 1.4302103250478012e-05, + "loss": 0.3971, + "step": 374 + }, + { + "epoch": 0.02, + "grad_norm": 2.1573702444835594, + "learning_rate": 1.4340344168260038e-05, + "loss": 0.5285, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 0.8271672714492758, + "learning_rate": 1.4378585086042068e-05, + "loss": 0.346, + "step": 376 + }, + { + "epoch": 0.02, + "grad_norm": 1.22345199872424, + "learning_rate": 1.4416826003824093e-05, + "loss": 0.4845, + "step": 377 + }, + { + "epoch": 0.02, + "grad_norm": 0.6311654742572094, + "learning_rate": 1.445506692160612e-05, + "loss": 0.3056, + "step": 378 + }, + { + "epoch": 0.02, + "grad_norm": 0.48625752015949253, + "learning_rate": 1.4493307839388146e-05, + "loss": 0.2126, + "step": 379 + }, + { + "epoch": 0.02, + "grad_norm": 1.9799198948538195, + "learning_rate": 1.4531548757170173e-05, + "loss": 0.7895, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 0.8516318849459286, + "learning_rate": 1.4569789674952199e-05, + "loss": 0.4181, + "step": 381 + }, + { + "epoch": 0.02, + "grad_norm": 1.014567135790311, + "learning_rate": 1.4608030592734226e-05, + "loss": 0.3468, + "step": 382 + }, + { + "epoch": 0.02, + "grad_norm": 0.8579154771118827, + "learning_rate": 1.4646271510516254e-05, + "loss": 0.4599, + "step": 383 + }, + { + "epoch": 0.02, + "grad_norm": 0.6148793445042039, + "learning_rate": 1.468451242829828e-05, + "loss": 0.234, + "step": 384 + }, + { + "epoch": 0.02, + "grad_norm": 0.5488181146615484, + "learning_rate": 1.4722753346080307e-05, + "loss": 0.2141, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 0.6799560781486946, + "learning_rate": 1.4760994263862334e-05, + "loss": 0.3486, + "step": 386 + }, + { + "epoch": 0.02, + "grad_norm": 1.3813615427326338, + "learning_rate": 1.479923518164436e-05, + "loss": 0.5956, + "step": 387 + }, + { + "epoch": 0.02, + "grad_norm": 0.6414359509532044, + "learning_rate": 1.4837476099426387e-05, + "loss": 0.2277, + "step": 388 + }, + { + "epoch": 0.02, + "grad_norm": 0.586932100761519, + "learning_rate": 1.4875717017208415e-05, + "loss": 0.3545, + "step": 389 + }, + { + "epoch": 0.02, + "grad_norm": 0.7838674501697059, + "learning_rate": 1.4913957934990441e-05, + "loss": 0.2907, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 0.5658830728766918, + "learning_rate": 1.4952198852772468e-05, + "loss": 0.2235, + "step": 391 + }, + { + "epoch": 0.02, + "grad_norm": 0.8068619501663652, + "learning_rate": 1.4990439770554495e-05, + "loss": 0.344, + "step": 392 + }, + { + "epoch": 0.02, + "grad_norm": 0.6596708569982442, + "learning_rate": 1.5028680688336521e-05, + "loss": 0.3774, + "step": 393 + }, + { + "epoch": 0.02, + "grad_norm": 0.8793793597200517, + "learning_rate": 1.5066921606118548e-05, + "loss": 0.4466, + "step": 394 + }, + { + "epoch": 0.02, + "grad_norm": 1.0600744438699607, + "learning_rate": 1.5105162523900574e-05, + "loss": 0.3287, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 0.7919519835895109, + "learning_rate": 1.5143403441682602e-05, + "loss": 0.3511, + "step": 396 + }, + { + "epoch": 0.02, + "grad_norm": 0.5578491432818139, + "learning_rate": 1.5181644359464629e-05, + "loss": 0.2575, + "step": 397 + }, + { + "epoch": 0.02, + "grad_norm": 0.8105475118217892, + "learning_rate": 1.5219885277246656e-05, + "loss": 0.3245, + "step": 398 + }, + { + "epoch": 0.02, + "grad_norm": 0.7809252479896669, + "learning_rate": 1.5258126195028682e-05, + "loss": 0.5919, + "step": 399 + }, + { + "epoch": 0.02, + "grad_norm": 0.6766528930681027, + "learning_rate": 1.529636711281071e-05, + "loss": 0.3405, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 0.6551308795259978, + "learning_rate": 1.5334608030592735e-05, + "loss": 0.3199, + "step": 401 + }, + { + "epoch": 0.02, + "grad_norm": 0.6205927231666513, + "learning_rate": 1.5372848948374762e-05, + "loss": 0.2926, + "step": 402 + }, + { + "epoch": 0.02, + "grad_norm": 0.6434102165393524, + "learning_rate": 1.541108986615679e-05, + "loss": 0.2168, + "step": 403 + }, + { + "epoch": 0.02, + "grad_norm": 0.7035727497301221, + "learning_rate": 1.5449330783938815e-05, + "loss": 0.3766, + "step": 404 + }, + { + "epoch": 0.02, + "grad_norm": 0.6351981901932477, + "learning_rate": 1.548757170172084e-05, + "loss": 0.4042, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 1.0733492412828631, + "learning_rate": 1.5525812619502868e-05, + "loss": 0.3453, + "step": 406 + }, + { + "epoch": 0.02, + "grad_norm": 0.6931124633556552, + "learning_rate": 1.5564053537284895e-05, + "loss": 0.4334, + "step": 407 + }, + { + "epoch": 0.02, + "grad_norm": 1.344580603509804, + "learning_rate": 1.560229445506692e-05, + "loss": 0.3942, + "step": 408 + }, + { + "epoch": 0.02, + "grad_norm": 0.48351949036648106, + "learning_rate": 1.564053537284895e-05, + "loss": 0.2785, + "step": 409 + }, + { + "epoch": 0.02, + "grad_norm": 0.5724541069062071, + "learning_rate": 1.5678776290630978e-05, + "loss": 0.3334, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 1.0690880110758512, + "learning_rate": 1.5717017208413004e-05, + "loss": 0.5032, + "step": 411 + }, + { + "epoch": 0.02, + "grad_norm": 0.8955745333838802, + "learning_rate": 1.575525812619503e-05, + "loss": 0.4174, + "step": 412 + }, + { + "epoch": 0.02, + "grad_norm": 0.6489343273710856, + "learning_rate": 1.5793499043977057e-05, + "loss": 0.354, + "step": 413 + }, + { + "epoch": 0.02, + "grad_norm": 2.00140719177433, + "learning_rate": 1.5831739961759084e-05, + "loss": 0.6871, + "step": 414 + }, + { + "epoch": 0.02, + "grad_norm": 0.9468441119751057, + "learning_rate": 1.586998087954111e-05, + "loss": 0.3439, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 0.6811167520988824, + "learning_rate": 1.5908221797323137e-05, + "loss": 0.4035, + "step": 416 + }, + { + "epoch": 0.02, + "grad_norm": 0.5067638312443203, + "learning_rate": 1.5946462715105163e-05, + "loss": 0.2935, + "step": 417 + }, + { + "epoch": 0.02, + "grad_norm": 0.7214352543145547, + "learning_rate": 1.598470363288719e-05, + "loss": 0.2881, + "step": 418 + }, + { + "epoch": 0.02, + "grad_norm": 1.0508862135463248, + "learning_rate": 1.6022944550669217e-05, + "loss": 0.4995, + "step": 419 + }, + { + "epoch": 0.02, + "grad_norm": 1.8285100746936782, + "learning_rate": 1.6061185468451243e-05, + "loss": 0.6615, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 0.5708789432768756, + "learning_rate": 1.609942638623327e-05, + "loss": 0.3237, + "step": 421 + }, + { + "epoch": 0.02, + "grad_norm": 0.7903225965702565, + "learning_rate": 1.61376673040153e-05, + "loss": 0.3891, + "step": 422 + }, + { + "epoch": 0.02, + "grad_norm": 0.5582795431672651, + "learning_rate": 1.6175908221797326e-05, + "loss": 0.2004, + "step": 423 + }, + { + "epoch": 0.02, + "grad_norm": 0.8407191661050721, + "learning_rate": 1.6214149139579353e-05, + "loss": 0.312, + "step": 424 + }, + { + "epoch": 0.02, + "grad_norm": 0.6198087613324328, + "learning_rate": 1.625239005736138e-05, + "loss": 0.3276, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 1.8207898222982781, + "learning_rate": 1.6290630975143406e-05, + "loss": 0.7767, + "step": 426 + }, + { + "epoch": 0.02, + "grad_norm": 1.330115693764209, + "learning_rate": 1.6328871892925432e-05, + "loss": 0.5159, + "step": 427 + }, + { + "epoch": 0.02, + "grad_norm": 0.6387056248865568, + "learning_rate": 1.636711281070746e-05, + "loss": 0.3419, + "step": 428 + }, + { + "epoch": 0.02, + "grad_norm": 0.6215273109764146, + "learning_rate": 1.6405353728489485e-05, + "loss": 0.3152, + "step": 429 + }, + { + "epoch": 0.02, + "grad_norm": 0.9514248403782344, + "learning_rate": 1.6443594646271512e-05, + "loss": 0.2815, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 0.9310589290777825, + "learning_rate": 1.648183556405354e-05, + "loss": 0.3134, + "step": 431 + }, + { + "epoch": 0.02, + "grad_norm": 1.858149168694772, + "learning_rate": 1.6520076481835565e-05, + "loss": 0.5817, + "step": 432 + }, + { + "epoch": 0.02, + "grad_norm": 0.7691230463430312, + "learning_rate": 1.655831739961759e-05, + "loss": 0.3825, + "step": 433 + }, + { + "epoch": 0.02, + "grad_norm": 0.7411479882760116, + "learning_rate": 1.6596558317399618e-05, + "loss": 0.2704, + "step": 434 + }, + { + "epoch": 0.02, + "grad_norm": 1.1171597084891796, + "learning_rate": 1.6634799235181648e-05, + "loss": 0.1792, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 1.0198428305122083, + "learning_rate": 1.6673040152963675e-05, + "loss": 0.4587, + "step": 436 + }, + { + "epoch": 0.03, + "grad_norm": 0.8646954669117713, + "learning_rate": 1.67112810707457e-05, + "loss": 0.3177, + "step": 437 + }, + { + "epoch": 0.03, + "grad_norm": 2.2919448984679605, + "learning_rate": 1.6749521988527724e-05, + "loss": 0.6591, + "step": 438 + }, + { + "epoch": 0.03, + "grad_norm": 1.385420612343266, + "learning_rate": 1.678776290630975e-05, + "loss": 0.5364, + "step": 439 + }, + { + "epoch": 0.03, + "grad_norm": 0.8656076895746075, + "learning_rate": 1.6826003824091778e-05, + "loss": 0.3426, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 0.8693095612262008, + "learning_rate": 1.6864244741873804e-05, + "loss": 0.2902, + "step": 441 + }, + { + "epoch": 0.03, + "grad_norm": 2.203916729584415, + "learning_rate": 1.6902485659655834e-05, + "loss": 0.9356, + "step": 442 + }, + { + "epoch": 0.03, + "grad_norm": 0.6401954440906067, + "learning_rate": 1.694072657743786e-05, + "loss": 0.2328, + "step": 443 + }, + { + "epoch": 0.03, + "grad_norm": 1.2732766268188778, + "learning_rate": 1.6978967495219887e-05, + "loss": 0.4489, + "step": 444 + }, + { + "epoch": 0.03, + "grad_norm": 1.0743702431986437, + "learning_rate": 1.7017208413001914e-05, + "loss": 0.4417, + "step": 445 + }, + { + "epoch": 0.03, + "grad_norm": 0.7493088642138847, + "learning_rate": 1.705544933078394e-05, + "loss": 0.3438, + "step": 446 + }, + { + "epoch": 0.03, + "grad_norm": 1.872577130088869, + "learning_rate": 1.7093690248565967e-05, + "loss": 0.5092, + "step": 447 + }, + { + "epoch": 0.03, + "grad_norm": 1.6800381906510835, + "learning_rate": 1.7131931166347993e-05, + "loss": 0.3587, + "step": 448 + }, + { + "epoch": 0.03, + "grad_norm": 0.6966844101543027, + "learning_rate": 1.717017208413002e-05, + "loss": 0.3088, + "step": 449 + }, + { + "epoch": 0.03, + "grad_norm": 1.2041520310637208, + "learning_rate": 1.7208413001912046e-05, + "loss": 0.5373, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 1.1907325057349623, + "learning_rate": 1.7246653919694073e-05, + "loss": 0.3502, + "step": 451 + }, + { + "epoch": 0.03, + "grad_norm": 0.9197893826285918, + "learning_rate": 1.72848948374761e-05, + "loss": 0.3349, + "step": 452 + }, + { + "epoch": 0.03, + "grad_norm": 0.8202204456029969, + "learning_rate": 1.7323135755258126e-05, + "loss": 0.391, + "step": 453 + }, + { + "epoch": 0.03, + "grad_norm": 1.0635885255387458, + "learning_rate": 1.7361376673040153e-05, + "loss": 0.4402, + "step": 454 + }, + { + "epoch": 0.03, + "grad_norm": 0.7312803167255396, + "learning_rate": 1.7399617590822183e-05, + "loss": 0.341, + "step": 455 + }, + { + "epoch": 0.03, + "grad_norm": 0.7672739780107467, + "learning_rate": 1.743785850860421e-05, + "loss": 0.3814, + "step": 456 + }, + { + "epoch": 0.03, + "grad_norm": 0.7107829663539708, + "learning_rate": 1.7476099426386236e-05, + "loss": 0.2131, + "step": 457 + }, + { + "epoch": 0.03, + "grad_norm": 0.7640513015983967, + "learning_rate": 1.7514340344168262e-05, + "loss": 0.3358, + "step": 458 + }, + { + "epoch": 0.03, + "grad_norm": 1.4799590833631586, + "learning_rate": 1.755258126195029e-05, + "loss": 0.7856, + "step": 459 + }, + { + "epoch": 0.03, + "grad_norm": 0.6436605240448509, + "learning_rate": 1.7590822179732315e-05, + "loss": 0.3497, + "step": 460 + }, + { + "epoch": 0.03, + "grad_norm": 0.7464987117639431, + "learning_rate": 1.7629063097514342e-05, + "loss": 0.3435, + "step": 461 + }, + { + "epoch": 0.03, + "grad_norm": 0.7696930422997337, + "learning_rate": 1.766730401529637e-05, + "loss": 0.4322, + "step": 462 + }, + { + "epoch": 0.03, + "grad_norm": 0.6025265599658696, + "learning_rate": 1.7705544933078395e-05, + "loss": 0.1584, + "step": 463 + }, + { + "epoch": 0.03, + "grad_norm": 0.6378754667711887, + "learning_rate": 1.774378585086042e-05, + "loss": 0.3131, + "step": 464 + }, + { + "epoch": 0.03, + "grad_norm": 0.6729866894407964, + "learning_rate": 1.7782026768642448e-05, + "loss": 0.3856, + "step": 465 + }, + { + "epoch": 0.03, + "grad_norm": 0.935070447059843, + "learning_rate": 1.7820267686424475e-05, + "loss": 0.6001, + "step": 466 + }, + { + "epoch": 0.03, + "grad_norm": 0.5333343068447588, + "learning_rate": 1.78585086042065e-05, + "loss": 0.3228, + "step": 467 + }, + { + "epoch": 0.03, + "grad_norm": 0.9091879424576893, + "learning_rate": 1.789674952198853e-05, + "loss": 0.4292, + "step": 468 + }, + { + "epoch": 0.03, + "grad_norm": 0.5387821924240094, + "learning_rate": 1.7934990439770558e-05, + "loss": 0.2639, + "step": 469 + }, + { + "epoch": 0.03, + "grad_norm": 0.5311447953582863, + "learning_rate": 1.7973231357552584e-05, + "loss": 0.2444, + "step": 470 + }, + { + "epoch": 0.03, + "grad_norm": 1.395307363682624, + "learning_rate": 1.801147227533461e-05, + "loss": 0.7412, + "step": 471 + }, + { + "epoch": 0.03, + "grad_norm": 0.6504582290964441, + "learning_rate": 1.8049713193116637e-05, + "loss": 0.4336, + "step": 472 + }, + { + "epoch": 0.03, + "grad_norm": 0.5715139902093684, + "learning_rate": 1.8087954110898664e-05, + "loss": 0.2398, + "step": 473 + }, + { + "epoch": 0.03, + "grad_norm": 1.125542602171137, + "learning_rate": 1.812619502868069e-05, + "loss": 0.6667, + "step": 474 + }, + { + "epoch": 0.03, + "grad_norm": 0.4982371633361562, + "learning_rate": 1.8164435946462717e-05, + "loss": 0.237, + "step": 475 + }, + { + "epoch": 0.03, + "grad_norm": 0.5691266935810649, + "learning_rate": 1.8202676864244744e-05, + "loss": 0.2451, + "step": 476 + }, + { + "epoch": 0.03, + "grad_norm": 0.7030820342200949, + "learning_rate": 1.824091778202677e-05, + "loss": 0.4042, + "step": 477 + }, + { + "epoch": 0.03, + "grad_norm": 1.3941896853612525, + "learning_rate": 1.8279158699808797e-05, + "loss": 0.5841, + "step": 478 + }, + { + "epoch": 0.03, + "grad_norm": 0.6743046891263709, + "learning_rate": 1.8317399617590823e-05, + "loss": 0.3182, + "step": 479 + }, + { + "epoch": 0.03, + "grad_norm": 0.750355710951829, + "learning_rate": 1.835564053537285e-05, + "loss": 0.3391, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 0.46562694341915173, + "learning_rate": 1.839388145315488e-05, + "loss": 0.2027, + "step": 481 + }, + { + "epoch": 0.03, + "grad_norm": 0.6383543781218285, + "learning_rate": 1.8432122370936906e-05, + "loss": 0.3138, + "step": 482 + }, + { + "epoch": 0.03, + "grad_norm": 2.128120166212137, + "learning_rate": 1.847036328871893e-05, + "loss": 0.5683, + "step": 483 + }, + { + "epoch": 0.03, + "grad_norm": 0.7605870557660157, + "learning_rate": 1.8508604206500956e-05, + "loss": 0.4408, + "step": 484 + }, + { + "epoch": 0.03, + "grad_norm": 0.6354556583007445, + "learning_rate": 1.8546845124282983e-05, + "loss": 0.3147, + "step": 485 + }, + { + "epoch": 0.03, + "grad_norm": 2.9816014322679436, + "learning_rate": 1.858508604206501e-05, + "loss": 0.5888, + "step": 486 + }, + { + "epoch": 0.03, + "grad_norm": 0.5289682974012722, + "learning_rate": 1.8623326959847036e-05, + "loss": 0.2015, + "step": 487 + }, + { + "epoch": 0.03, + "grad_norm": 0.6236586560186679, + "learning_rate": 1.8661567877629066e-05, + "loss": 0.3256, + "step": 488 + }, + { + "epoch": 0.03, + "grad_norm": 0.6494916866609775, + "learning_rate": 1.8699808795411092e-05, + "loss": 0.3929, + "step": 489 + }, + { + "epoch": 0.03, + "grad_norm": 1.1871808128906545, + "learning_rate": 1.873804971319312e-05, + "loss": 0.6403, + "step": 490 + }, + { + "epoch": 0.03, + "grad_norm": 0.7214128916285852, + "learning_rate": 1.8776290630975145e-05, + "loss": 0.3347, + "step": 491 + }, + { + "epoch": 0.03, + "grad_norm": 0.6602266191900171, + "learning_rate": 1.8814531548757172e-05, + "loss": 0.3601, + "step": 492 + }, + { + "epoch": 0.03, + "grad_norm": 0.694604817799548, + "learning_rate": 1.88527724665392e-05, + "loss": 0.1621, + "step": 493 + }, + { + "epoch": 0.03, + "grad_norm": 0.7076032930955318, + "learning_rate": 1.8891013384321225e-05, + "loss": 0.3279, + "step": 494 + }, + { + "epoch": 0.03, + "grad_norm": 0.8666075990621492, + "learning_rate": 1.892925430210325e-05, + "loss": 0.4989, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 0.7678193345063739, + "learning_rate": 1.8967495219885278e-05, + "loss": 0.3314, + "step": 496 + }, + { + "epoch": 0.03, + "grad_norm": 0.555716909316726, + "learning_rate": 1.9005736137667305e-05, + "loss": 0.3502, + "step": 497 + }, + { + "epoch": 0.03, + "grad_norm": 0.8106958346923805, + "learning_rate": 1.904397705544933e-05, + "loss": 0.4847, + "step": 498 + }, + { + "epoch": 0.03, + "grad_norm": 0.5347451642599353, + "learning_rate": 1.9082217973231358e-05, + "loss": 0.1286, + "step": 499 + }, + { + "epoch": 0.03, + "grad_norm": 0.6613152718234083, + "learning_rate": 1.9120458891013384e-05, + "loss": 0.3141, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 0.9116473642098698, + "learning_rate": 1.9158699808795414e-05, + "loss": 0.4658, + "step": 501 + }, + { + "epoch": 0.03, + "grad_norm": 1.199682601869777, + "learning_rate": 1.919694072657744e-05, + "loss": 0.6147, + "step": 502 + }, + { + "epoch": 0.03, + "grad_norm": 0.7314781117930744, + "learning_rate": 1.9235181644359467e-05, + "loss": 0.2652, + "step": 503 + }, + { + "epoch": 0.03, + "grad_norm": 0.8325395903217065, + "learning_rate": 1.9273422562141494e-05, + "loss": 0.3995, + "step": 504 + }, + { + "epoch": 0.03, + "grad_norm": 0.5062483839186255, + "learning_rate": 1.931166347992352e-05, + "loss": 0.3077, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 0.5967051191367326, + "learning_rate": 1.9349904397705547e-05, + "loss": 0.3258, + "step": 506 + }, + { + "epoch": 0.03, + "grad_norm": 0.8272368056777742, + "learning_rate": 1.9388145315487573e-05, + "loss": 0.3995, + "step": 507 + }, + { + "epoch": 0.03, + "grad_norm": 0.9304286527087062, + "learning_rate": 1.94263862332696e-05, + "loss": 0.3921, + "step": 508 + }, + { + "epoch": 0.03, + "grad_norm": 0.5272587543969978, + "learning_rate": 1.9464627151051627e-05, + "loss": 0.2309, + "step": 509 + }, + { + "epoch": 0.03, + "grad_norm": 0.8486834700401805, + "learning_rate": 1.9502868068833653e-05, + "loss": 0.5258, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 0.6631668030453906, + "learning_rate": 1.954110898661568e-05, + "loss": 0.4544, + "step": 511 + }, + { + "epoch": 0.03, + "grad_norm": 0.763621465095935, + "learning_rate": 1.9579349904397706e-05, + "loss": 0.2702, + "step": 512 + }, + { + "epoch": 0.03, + "grad_norm": 0.5529369486422537, + "learning_rate": 1.9617590822179733e-05, + "loss": 0.3651, + "step": 513 + }, + { + "epoch": 0.03, + "grad_norm": 0.648527669233175, + "learning_rate": 1.9655831739961763e-05, + "loss": 0.3392, + "step": 514 + }, + { + "epoch": 0.03, + "grad_norm": 0.8333363603111317, + "learning_rate": 1.969407265774379e-05, + "loss": 0.4271, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 0.9578624303870017, + "learning_rate": 1.9732313575525816e-05, + "loss": 0.3442, + "step": 516 + }, + { + "epoch": 0.03, + "grad_norm": 0.9338089452068773, + "learning_rate": 1.9770554493307842e-05, + "loss": 0.5805, + "step": 517 + }, + { + "epoch": 0.03, + "grad_norm": 0.7261028256374711, + "learning_rate": 1.980879541108987e-05, + "loss": 0.4576, + "step": 518 + }, + { + "epoch": 0.03, + "grad_norm": 0.6206482079476885, + "learning_rate": 1.9847036328871892e-05, + "loss": 0.2308, + "step": 519 + }, + { + "epoch": 0.03, + "grad_norm": 0.5731488669023147, + "learning_rate": 1.9885277246653922e-05, + "loss": 0.3147, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 0.5715431597204902, + "learning_rate": 1.992351816443595e-05, + "loss": 0.3604, + "step": 521 + }, + { + "epoch": 0.03, + "grad_norm": 0.7894908009717267, + "learning_rate": 1.9961759082217975e-05, + "loss": 0.3063, + "step": 522 + }, + { + "epoch": 0.03, + "grad_norm": 0.8229915510438305, + "learning_rate": 2e-05, + "loss": 0.4539, + "step": 523 + }, + { + "epoch": 0.03, + "grad_norm": 0.6027096957912251, + "learning_rate": 1.9999999826850218e-05, + "loss": 0.3497, + "step": 524 + }, + { + "epoch": 0.03, + "grad_norm": 0.9160865998931952, + "learning_rate": 1.999999930740087e-05, + "loss": 0.2652, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 0.4656131298936089, + "learning_rate": 1.9999998441651974e-05, + "loss": 0.224, + "step": 526 + }, + { + "epoch": 0.03, + "grad_norm": 1.0868401681216302, + "learning_rate": 1.999999722960356e-05, + "loss": 0.5024, + "step": 527 + }, + { + "epoch": 0.03, + "grad_norm": 1.0309672847824662, + "learning_rate": 1.9999995671255675e-05, + "loss": 0.3227, + "step": 528 + }, + { + "epoch": 0.03, + "grad_norm": 0.9537024074650399, + "learning_rate": 1.999999376660837e-05, + "loss": 0.5217, + "step": 529 + }, + { + "epoch": 0.03, + "grad_norm": 0.7989227071893219, + "learning_rate": 1.9999991515661712e-05, + "loss": 0.3248, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 0.5587548459372644, + "learning_rate": 1.9999988918415777e-05, + "loss": 0.2575, + "step": 531 + }, + { + "epoch": 0.03, + "grad_norm": 0.6038947163659312, + "learning_rate": 1.9999985974870653e-05, + "loss": 0.2639, + "step": 532 + }, + { + "epoch": 0.03, + "grad_norm": 0.9005065511749109, + "learning_rate": 1.999998268502645e-05, + "loss": 0.424, + "step": 533 + }, + { + "epoch": 0.03, + "grad_norm": 0.7791223727687504, + "learning_rate": 1.9999979048883275e-05, + "loss": 0.3973, + "step": 534 + }, + { + "epoch": 0.03, + "grad_norm": 1.049952558614156, + "learning_rate": 1.999997506644125e-05, + "loss": 0.402, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 0.654247607069547, + "learning_rate": 1.9999970737700526e-05, + "loss": 0.3599, + "step": 536 + }, + { + "epoch": 0.03, + "grad_norm": 0.794701933156306, + "learning_rate": 1.999996606266124e-05, + "loss": 0.3839, + "step": 537 + }, + { + "epoch": 0.03, + "grad_norm": 0.7277135121566177, + "learning_rate": 1.999996104132356e-05, + "loss": 0.235, + "step": 538 + }, + { + "epoch": 0.03, + "grad_norm": 0.4981718868298558, + "learning_rate": 1.9999955673687663e-05, + "loss": 0.2761, + "step": 539 + }, + { + "epoch": 0.03, + "grad_norm": 0.7402595774332386, + "learning_rate": 1.999994995975373e-05, + "loss": 0.3425, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 1.5576818896204434, + "learning_rate": 1.9999943899521955e-05, + "loss": 0.587, + "step": 541 + }, + { + "epoch": 0.03, + "grad_norm": 0.5357370858034539, + "learning_rate": 1.9999937492992558e-05, + "loss": 0.2118, + "step": 542 + }, + { + "epoch": 0.03, + "grad_norm": 0.8512678621652553, + "learning_rate": 1.9999930740165755e-05, + "loss": 0.3902, + "step": 543 + }, + { + "epoch": 0.03, + "grad_norm": 0.8553424623445812, + "learning_rate": 1.999992364104178e-05, + "loss": 0.3927, + "step": 544 + }, + { + "epoch": 0.03, + "grad_norm": 1.5979085965888817, + "learning_rate": 1.9999916195620875e-05, + "loss": 0.3931, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 0.7901713482230103, + "learning_rate": 1.9999908403903307e-05, + "loss": 0.3896, + "step": 546 + }, + { + "epoch": 0.03, + "grad_norm": 0.394859846631288, + "learning_rate": 1.999990026588934e-05, + "loss": 0.2883, + "step": 547 + }, + { + "epoch": 0.03, + "grad_norm": 0.7640357579682913, + "learning_rate": 1.9999891781579256e-05, + "loss": 0.3455, + "step": 548 + }, + { + "epoch": 0.03, + "grad_norm": 0.6769055305092657, + "learning_rate": 1.9999882950973352e-05, + "loss": 0.3481, + "step": 549 + }, + { + "epoch": 0.03, + "grad_norm": 1.9577307589064314, + "learning_rate": 1.999987377407193e-05, + "loss": 0.7917, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 0.9439456813870318, + "learning_rate": 1.9999864250875305e-05, + "loss": 0.2947, + "step": 551 + }, + { + "epoch": 0.03, + "grad_norm": 0.8175729706645952, + "learning_rate": 1.9999854381383817e-05, + "loss": 0.3907, + "step": 552 + }, + { + "epoch": 0.03, + "grad_norm": 0.4732231542439032, + "learning_rate": 1.99998441655978e-05, + "loss": 0.2284, + "step": 553 + }, + { + "epoch": 0.03, + "grad_norm": 1.238787974808822, + "learning_rate": 1.999983360351761e-05, + "loss": 0.5136, + "step": 554 + }, + { + "epoch": 0.03, + "grad_norm": 0.7050079922045303, + "learning_rate": 1.999982269514361e-05, + "loss": 0.3216, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 0.8573120183532486, + "learning_rate": 1.9999811440476182e-05, + "loss": 0.3806, + "step": 556 + }, + { + "epoch": 0.03, + "grad_norm": 1.2215743690492187, + "learning_rate": 1.999979983951571e-05, + "loss": 0.5398, + "step": 557 + }, + { + "epoch": 0.03, + "grad_norm": 0.6446022868528241, + "learning_rate": 1.9999787892262605e-05, + "loss": 0.3098, + "step": 558 + }, + { + "epoch": 0.03, + "grad_norm": 0.5002653329984211, + "learning_rate": 1.9999775598717276e-05, + "loss": 0.2903, + "step": 559 + }, + { + "epoch": 0.03, + "grad_norm": 0.5895344073161108, + "learning_rate": 1.9999762958880145e-05, + "loss": 0.2155, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 0.8267429050787638, + "learning_rate": 1.9999749972751653e-05, + "loss": 0.3351, + "step": 561 + }, + { + "epoch": 0.03, + "grad_norm": 1.353467415655348, + "learning_rate": 1.999973664033225e-05, + "loss": 0.5793, + "step": 562 + }, + { + "epoch": 0.03, + "grad_norm": 0.7361197465689773, + "learning_rate": 1.9999722961622395e-05, + "loss": 0.3886, + "step": 563 + }, + { + "epoch": 0.03, + "grad_norm": 0.5447246086308706, + "learning_rate": 1.9999708936622564e-05, + "loss": 0.2858, + "step": 564 + }, + { + "epoch": 0.03, + "grad_norm": 0.5984252294712458, + "learning_rate": 1.9999694565333246e-05, + "loss": 0.1902, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 1.482940990942458, + "learning_rate": 1.999967984775493e-05, + "loss": 0.724, + "step": 566 + }, + { + "epoch": 0.03, + "grad_norm": 0.6741795289929788, + "learning_rate": 1.9999664783888138e-05, + "loss": 0.3405, + "step": 567 + }, + { + "epoch": 0.03, + "grad_norm": 0.9182428191621349, + "learning_rate": 1.9999649373733377e-05, + "loss": 0.3617, + "step": 568 + }, + { + "epoch": 0.03, + "grad_norm": 1.6389573056553957, + "learning_rate": 1.9999633617291196e-05, + "loss": 0.6102, + "step": 569 + }, + { + "epoch": 0.03, + "grad_norm": 0.6475024165740241, + "learning_rate": 1.9999617514562125e-05, + "loss": 0.3249, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 0.7182517478278182, + "learning_rate": 1.9999601065546733e-05, + "loss": 0.3586, + "step": 571 + }, + { + "epoch": 0.03, + "grad_norm": 0.6687765532978037, + "learning_rate": 1.9999584270245588e-05, + "loss": 0.206, + "step": 572 + }, + { + "epoch": 0.03, + "grad_norm": 0.7089634046898335, + "learning_rate": 1.9999567128659267e-05, + "loss": 0.3192, + "step": 573 + }, + { + "epoch": 0.03, + "grad_norm": 2.260767215457327, + "learning_rate": 1.9999549640788368e-05, + "loss": 0.5278, + "step": 574 + }, + { + "epoch": 0.03, + "grad_norm": 0.6480748864054804, + "learning_rate": 1.9999531806633493e-05, + "loss": 0.3919, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 0.675673550540879, + "learning_rate": 1.9999513626195265e-05, + "loss": 0.3158, + "step": 576 + }, + { + "epoch": 0.03, + "grad_norm": 1.6538901519919962, + "learning_rate": 1.9999495099474306e-05, + "loss": 0.831, + "step": 577 + }, + { + "epoch": 0.03, + "grad_norm": 0.5293004891675508, + "learning_rate": 1.9999476226471265e-05, + "loss": 0.1482, + "step": 578 + }, + { + "epoch": 0.03, + "grad_norm": 0.9542678819942969, + "learning_rate": 1.999945700718679e-05, + "loss": 0.3594, + "step": 579 + }, + { + "epoch": 0.03, + "grad_norm": 1.3111617705092098, + "learning_rate": 1.9999437441621547e-05, + "loss": 0.4239, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 2.0926709671371593, + "learning_rate": 1.9999417529776218e-05, + "loss": 0.5737, + "step": 581 + }, + { + "epoch": 0.03, + "grad_norm": 0.6708490869863095, + "learning_rate": 1.999939727165149e-05, + "loss": 0.3331, + "step": 582 + }, + { + "epoch": 0.03, + "grad_norm": 0.7622221883483998, + "learning_rate": 1.9999376667248068e-05, + "loss": 0.3782, + "step": 583 + }, + { + "epoch": 0.03, + "grad_norm": 0.7441804908580606, + "learning_rate": 1.9999355716566655e-05, + "loss": 0.1226, + "step": 584 + }, + { + "epoch": 0.03, + "grad_norm": 0.899774384946839, + "learning_rate": 1.999933441960799e-05, + "loss": 0.3422, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 3.4082682727776334, + "learning_rate": 1.9999312776372798e-05, + "loss": 0.6145, + "step": 586 + }, + { + "epoch": 0.03, + "grad_norm": 0.9534436469990148, + "learning_rate": 1.9999290786861837e-05, + "loss": 0.3507, + "step": 587 + }, + { + "epoch": 0.03, + "grad_norm": 0.7050693202744416, + "learning_rate": 1.999926845107587e-05, + "loss": 0.3421, + "step": 588 + }, + { + "epoch": 0.03, + "grad_norm": 1.7025023466205644, + "learning_rate": 1.9999245769015663e-05, + "loss": 0.7674, + "step": 589 + }, + { + "epoch": 0.03, + "grad_norm": 0.7588867665045335, + "learning_rate": 1.9999222740682004e-05, + "loss": 0.2988, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 0.8053056561388221, + "learning_rate": 1.9999199366075694e-05, + "loss": 0.279, + "step": 591 + }, + { + "epoch": 0.03, + "grad_norm": 2.7201821145047025, + "learning_rate": 1.9999175645197537e-05, + "loss": 0.6306, + "step": 592 + }, + { + "epoch": 0.03, + "grad_norm": 1.6922097540301058, + "learning_rate": 1.9999151578048357e-05, + "loss": 0.806, + "step": 593 + }, + { + "epoch": 0.03, + "grad_norm": 0.5154629765050703, + "learning_rate": 1.9999127164628992e-05, + "loss": 0.2397, + "step": 594 + }, + { + "epoch": 0.03, + "grad_norm": 0.6208505950553166, + "learning_rate": 1.999910240494028e-05, + "loss": 0.3727, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 0.6884024496186021, + "learning_rate": 1.9999077298983084e-05, + "loss": 0.2778, + "step": 596 + }, + { + "epoch": 0.03, + "grad_norm": 0.7202762821630867, + "learning_rate": 1.9999051846758267e-05, + "loss": 0.2434, + "step": 597 + }, + { + "epoch": 0.03, + "grad_norm": 2.2942921881997282, + "learning_rate": 1.999902604826672e-05, + "loss": 0.7034, + "step": 598 + }, + { + "epoch": 0.03, + "grad_norm": 0.772582124815898, + "learning_rate": 1.9998999903509326e-05, + "loss": 0.4107, + "step": 599 + }, + { + "epoch": 0.03, + "grad_norm": 0.6564284208707677, + "learning_rate": 1.9998973412487e-05, + "loss": 0.2485, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 4.5218916723217415, + "learning_rate": 1.9998946575200652e-05, + "loss": 0.6996, + "step": 601 + }, + { + "epoch": 0.03, + "grad_norm": 1.2246507040544916, + "learning_rate": 1.9998919391651214e-05, + "loss": 0.5521, + "step": 602 + }, + { + "epoch": 0.03, + "grad_norm": 0.6937961374682375, + "learning_rate": 1.9998891861839627e-05, + "loss": 0.3197, + "step": 603 + }, + { + "epoch": 0.03, + "grad_norm": 1.0318409532385129, + "learning_rate": 1.9998863985766845e-05, + "loss": 0.2606, + "step": 604 + }, + { + "epoch": 0.03, + "grad_norm": 2.722572887274754, + "learning_rate": 1.9998835763433836e-05, + "loss": 0.8884, + "step": 605 + }, + { + "epoch": 0.03, + "grad_norm": 0.6300069061706091, + "learning_rate": 1.999880719484157e-05, + "loss": 0.3101, + "step": 606 + }, + { + "epoch": 0.03, + "grad_norm": 0.6983482017923085, + "learning_rate": 1.9998778279991042e-05, + "loss": 0.3086, + "step": 607 + }, + { + "epoch": 0.03, + "grad_norm": 2.0707704364945454, + "learning_rate": 1.999874901888325e-05, + "loss": 0.6135, + "step": 608 + }, + { + "epoch": 0.03, + "grad_norm": 0.5749567826655014, + "learning_rate": 1.9998719411519215e-05, + "loss": 0.3335, + "step": 609 + }, + { + "epoch": 0.04, + "grad_norm": 0.5182988783531453, + "learning_rate": 1.9998689457899955e-05, + "loss": 0.1555, + "step": 610 + }, + { + "epoch": 0.04, + "grad_norm": 0.7617833054443183, + "learning_rate": 1.999865915802651e-05, + "loss": 0.4108, + "step": 611 + }, + { + "epoch": 0.04, + "grad_norm": 1.1526576959541612, + "learning_rate": 1.9998628511899925e-05, + "loss": 0.4166, + "step": 612 + }, + { + "epoch": 0.04, + "grad_norm": 0.6571422467528628, + "learning_rate": 1.999859751952127e-05, + "loss": 0.3332, + "step": 613 + }, + { + "epoch": 0.04, + "grad_norm": 0.7513551490147077, + "learning_rate": 1.9998566180891606e-05, + "loss": 0.4341, + "step": 614 + }, + { + "epoch": 0.04, + "grad_norm": 0.5806459633657569, + "learning_rate": 1.9998534496012026e-05, + "loss": 0.3158, + "step": 615 + }, + { + "epoch": 0.04, + "grad_norm": 1.6232848400792341, + "learning_rate": 1.9998502464883632e-05, + "loss": 0.2476, + "step": 616 + }, + { + "epoch": 0.04, + "grad_norm": 2.5384024602749533, + "learning_rate": 1.9998470087507522e-05, + "loss": 0.6709, + "step": 617 + }, + { + "epoch": 0.04, + "grad_norm": 0.6685859795334497, + "learning_rate": 1.9998437363884825e-05, + "loss": 0.3515, + "step": 618 + }, + { + "epoch": 0.04, + "grad_norm": 0.5741696913240106, + "learning_rate": 1.999840429401667e-05, + "loss": 0.3636, + "step": 619 + }, + { + "epoch": 0.04, + "grad_norm": 1.5049238219312997, + "learning_rate": 1.9998370877904208e-05, + "loss": 0.5154, + "step": 620 + }, + { + "epoch": 0.04, + "grad_norm": 0.672421234183969, + "learning_rate": 1.9998337115548588e-05, + "loss": 0.2922, + "step": 621 + }, + { + "epoch": 0.04, + "grad_norm": 2.2798101602612184, + "learning_rate": 1.999830300695099e-05, + "loss": 0.3201, + "step": 622 + }, + { + "epoch": 0.04, + "grad_norm": 0.6970423165330458, + "learning_rate": 1.9998268552112586e-05, + "loss": 0.3156, + "step": 623 + }, + { + "epoch": 0.04, + "grad_norm": 0.4610523510184624, + "learning_rate": 1.999823375103457e-05, + "loss": 0.2628, + "step": 624 + }, + { + "epoch": 0.04, + "grad_norm": 2.7809585438545255, + "learning_rate": 1.9998198603718148e-05, + "loss": 0.6929, + "step": 625 + }, + { + "epoch": 0.04, + "grad_norm": 0.7009702901425052, + "learning_rate": 1.9998163110164543e-05, + "loss": 0.4038, + "step": 626 + }, + { + "epoch": 0.04, + "grad_norm": 0.7154276273247365, + "learning_rate": 1.9998127270374975e-05, + "loss": 0.3343, + "step": 627 + }, + { + "epoch": 0.04, + "grad_norm": 0.6401189026507259, + "learning_rate": 1.999809108435069e-05, + "loss": 0.2824, + "step": 628 + }, + { + "epoch": 0.04, + "grad_norm": 1.1197570166445445, + "learning_rate": 1.9998054552092943e-05, + "loss": 0.3704, + "step": 629 + }, + { + "epoch": 0.04, + "grad_norm": 0.6645133821988366, + "learning_rate": 1.9998017673602996e-05, + "loss": 0.2726, + "step": 630 + }, + { + "epoch": 0.04, + "grad_norm": 0.6906799629741556, + "learning_rate": 1.9997980448882125e-05, + "loss": 0.3588, + "step": 631 + }, + { + "epoch": 0.04, + "grad_norm": 1.227238559775126, + "learning_rate": 1.9997942877931624e-05, + "loss": 0.6336, + "step": 632 + }, + { + "epoch": 0.04, + "grad_norm": 0.7603717426281402, + "learning_rate": 1.9997904960752794e-05, + "loss": 0.2539, + "step": 633 + }, + { + "epoch": 0.04, + "grad_norm": 0.7780626216278012, + "learning_rate": 1.999786669734694e-05, + "loss": 0.4715, + "step": 634 + }, + { + "epoch": 0.04, + "grad_norm": 0.5482206816162322, + "learning_rate": 1.999782808771539e-05, + "loss": 0.3319, + "step": 635 + }, + { + "epoch": 0.04, + "grad_norm": 0.7702010294483638, + "learning_rate": 1.999778913185949e-05, + "loss": 0.2772, + "step": 636 + }, + { + "epoch": 0.04, + "grad_norm": 0.5466923355560228, + "learning_rate": 1.9997749829780577e-05, + "loss": 0.3879, + "step": 637 + }, + { + "epoch": 0.04, + "grad_norm": 0.7200638224194983, + "learning_rate": 1.9997710181480018e-05, + "loss": 0.381, + "step": 638 + }, + { + "epoch": 0.04, + "grad_norm": 0.47588589352893357, + "learning_rate": 1.9997670186959187e-05, + "loss": 0.288, + "step": 639 + }, + { + "epoch": 0.04, + "grad_norm": 0.8159078143196873, + "learning_rate": 1.999762984621947e-05, + "loss": 0.4299, + "step": 640 + }, + { + "epoch": 0.04, + "grad_norm": 0.764774529343326, + "learning_rate": 1.9997589159262255e-05, + "loss": 0.3122, + "step": 641 + }, + { + "epoch": 0.04, + "grad_norm": 0.5095595388066965, + "learning_rate": 1.9997548126088963e-05, + "loss": 0.3581, + "step": 642 + }, + { + "epoch": 0.04, + "grad_norm": 0.5936175913635908, + "learning_rate": 1.9997506746701006e-05, + "loss": 0.3294, + "step": 643 + }, + { + "epoch": 0.04, + "grad_norm": 0.5773573972412904, + "learning_rate": 1.9997465021099818e-05, + "loss": 0.4026, + "step": 644 + }, + { + "epoch": 0.04, + "grad_norm": 0.5790983443211878, + "learning_rate": 1.9997422949286852e-05, + "loss": 0.3016, + "step": 645 + }, + { + "epoch": 0.04, + "grad_norm": 1.5202482266217976, + "learning_rate": 1.9997380531263555e-05, + "loss": 0.3345, + "step": 646 + }, + { + "epoch": 0.04, + "grad_norm": 0.5326923128194373, + "learning_rate": 1.99973377670314e-05, + "loss": 0.4049, + "step": 647 + }, + { + "epoch": 0.04, + "grad_norm": 0.6871631996017432, + "learning_rate": 1.999729465659187e-05, + "loss": 0.4462, + "step": 648 + }, + { + "epoch": 0.04, + "grad_norm": 0.5156786584531907, + "learning_rate": 1.9997251199946456e-05, + "loss": 0.2645, + "step": 649 + }, + { + "epoch": 0.04, + "grad_norm": 0.523957794870545, + "learning_rate": 1.999720739709666e-05, + "loss": 0.3247, + "step": 650 + }, + { + "epoch": 0.04, + "grad_norm": 1.1519934314835216, + "learning_rate": 1.9997163248044008e-05, + "loss": 0.4499, + "step": 651 + }, + { + "epoch": 0.04, + "grad_norm": 0.714868485827706, + "learning_rate": 1.9997118752790016e-05, + "loss": 0.2821, + "step": 652 + }, + { + "epoch": 0.04, + "grad_norm": 0.732881875664929, + "learning_rate": 1.9997073911336234e-05, + "loss": 0.5607, + "step": 653 + }, + { + "epoch": 0.04, + "grad_norm": 0.6503986849828309, + "learning_rate": 1.9997028723684213e-05, + "loss": 0.3507, + "step": 654 + }, + { + "epoch": 0.04, + "grad_norm": 0.5726787275180806, + "learning_rate": 1.999698318983552e-05, + "loss": 0.3602, + "step": 655 + }, + { + "epoch": 0.04, + "grad_norm": 0.5050293657924345, + "learning_rate": 1.9996937309791722e-05, + "loss": 0.1513, + "step": 656 + }, + { + "epoch": 0.04, + "grad_norm": 0.8105239826073464, + "learning_rate": 1.999689108355442e-05, + "loss": 0.4691, + "step": 657 + }, + { + "epoch": 0.04, + "grad_norm": 0.5513704345716917, + "learning_rate": 1.9996844511125205e-05, + "loss": 0.3814, + "step": 658 + }, + { + "epoch": 0.04, + "grad_norm": 0.6100721289861497, + "learning_rate": 1.9996797592505703e-05, + "loss": 0.4041, + "step": 659 + }, + { + "epoch": 0.04, + "grad_norm": 0.6755649661583865, + "learning_rate": 1.9996750327697523e-05, + "loss": 0.4135, + "step": 660 + }, + { + "epoch": 0.04, + "grad_norm": 0.5909747328001143, + "learning_rate": 1.999670271670231e-05, + "loss": 0.3755, + "step": 661 + }, + { + "epoch": 0.04, + "grad_norm": 0.5530198259517983, + "learning_rate": 1.9996654759521713e-05, + "loss": 0.2282, + "step": 662 + }, + { + "epoch": 0.04, + "grad_norm": 1.0659525428191399, + "learning_rate": 1.999660645615739e-05, + "loss": 0.2963, + "step": 663 + }, + { + "epoch": 0.04, + "grad_norm": 0.6884011137041578, + "learning_rate": 1.9996557806611017e-05, + "loss": 0.375, + "step": 664 + }, + { + "epoch": 0.04, + "grad_norm": 1.2005303148264228, + "learning_rate": 1.9996508810884277e-05, + "loss": 0.5657, + "step": 665 + }, + { + "epoch": 0.04, + "grad_norm": 0.7004073880089609, + "learning_rate": 1.9996459468978865e-05, + "loss": 0.2783, + "step": 666 + }, + { + "epoch": 0.04, + "grad_norm": 0.6849488869877197, + "learning_rate": 1.9996409780896495e-05, + "loss": 0.3791, + "step": 667 + }, + { + "epoch": 0.04, + "grad_norm": 0.9692201762247729, + "learning_rate": 1.999635974663888e-05, + "loss": 0.2391, + "step": 668 + }, + { + "epoch": 0.04, + "grad_norm": 1.3591328381927894, + "learning_rate": 1.999630936620776e-05, + "loss": 0.3121, + "step": 669 + }, + { + "epoch": 0.04, + "grad_norm": 0.6988627891771934, + "learning_rate": 1.9996258639604874e-05, + "loss": 0.3539, + "step": 670 + }, + { + "epoch": 0.04, + "grad_norm": 0.9556786113936547, + "learning_rate": 1.999620756683198e-05, + "loss": 0.4806, + "step": 671 + }, + { + "epoch": 0.04, + "grad_norm": 1.4342440011179771, + "learning_rate": 1.999615614789085e-05, + "loss": 0.3508, + "step": 672 + }, + { + "epoch": 0.04, + "grad_norm": 0.6267491039851717, + "learning_rate": 1.9996104382783266e-05, + "loss": 0.3627, + "step": 673 + }, + { + "epoch": 0.04, + "grad_norm": 0.6388301817445797, + "learning_rate": 1.9996052271511012e-05, + "loss": 0.2443, + "step": 674 + }, + { + "epoch": 0.04, + "grad_norm": 0.6126860481430475, + "learning_rate": 1.99959998140759e-05, + "loss": 0.2574, + "step": 675 + }, + { + "epoch": 0.04, + "grad_norm": 0.6335639107595146, + "learning_rate": 1.9995947010479744e-05, + "loss": 0.4149, + "step": 676 + }, + { + "epoch": 0.04, + "grad_norm": 1.009173507203124, + "learning_rate": 1.999589386072437e-05, + "loss": 0.6275, + "step": 677 + }, + { + "epoch": 0.04, + "grad_norm": 0.5334355696559099, + "learning_rate": 1.9995840364811627e-05, + "loss": 0.3687, + "step": 678 + }, + { + "epoch": 0.04, + "grad_norm": 0.5264094885639136, + "learning_rate": 1.999578652274336e-05, + "loss": 0.2631, + "step": 679 + }, + { + "epoch": 0.04, + "grad_norm": 0.7406091669303169, + "learning_rate": 1.9995732334521432e-05, + "loss": 0.2581, + "step": 680 + }, + { + "epoch": 0.04, + "grad_norm": 0.9294914520950267, + "learning_rate": 1.9995677800147728e-05, + "loss": 0.5048, + "step": 681 + }, + { + "epoch": 0.04, + "grad_norm": 0.5268582006547304, + "learning_rate": 1.9995622919624127e-05, + "loss": 0.2644, + "step": 682 + }, + { + "epoch": 0.04, + "grad_norm": 0.848890819064537, + "learning_rate": 1.9995567692952537e-05, + "loss": 0.4564, + "step": 683 + }, + { + "epoch": 0.04, + "grad_norm": 0.9587797278208916, + "learning_rate": 1.9995512120134867e-05, + "loss": 0.5373, + "step": 684 + }, + { + "epoch": 0.04, + "grad_norm": 0.5326515080037058, + "learning_rate": 1.9995456201173044e-05, + "loss": 0.2536, + "step": 685 + }, + { + "epoch": 0.04, + "grad_norm": 0.4332630514103188, + "learning_rate": 1.9995399936069e-05, + "loss": 0.2911, + "step": 686 + }, + { + "epoch": 0.04, + "grad_norm": 0.8241672572613798, + "learning_rate": 1.9995343324824686e-05, + "loss": 0.3667, + "step": 687 + }, + { + "epoch": 0.04, + "grad_norm": 0.5755331586063459, + "learning_rate": 1.9995286367442062e-05, + "loss": 0.3133, + "step": 688 + }, + { + "epoch": 0.04, + "grad_norm": 1.406266965308756, + "learning_rate": 1.9995229063923104e-05, + "loss": 0.8357, + "step": 689 + }, + { + "epoch": 0.04, + "grad_norm": 0.5461542233249763, + "learning_rate": 1.9995171414269793e-05, + "loss": 0.3926, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 0.5332452855766027, + "learning_rate": 1.999511341848412e-05, + "loss": 0.3142, + "step": 691 + }, + { + "epoch": 0.04, + "grad_norm": 0.5558375093163075, + "learning_rate": 1.9995055076568107e-05, + "loss": 0.1537, + "step": 692 + }, + { + "epoch": 0.04, + "grad_norm": 0.5096980092738956, + "learning_rate": 1.999499638852376e-05, + "loss": 0.3855, + "step": 693 + }, + { + "epoch": 0.04, + "grad_norm": 0.585061570533692, + "learning_rate": 1.999493735435312e-05, + "loss": 0.3167, + "step": 694 + }, + { + "epoch": 0.04, + "grad_norm": 1.3571760544795732, + "learning_rate": 1.999487797405823e-05, + "loss": 0.5282, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 1.1375980474442728, + "learning_rate": 1.9994818247641147e-05, + "loss": 0.5273, + "step": 696 + }, + { + "epoch": 0.04, + "grad_norm": 0.6087244752271485, + "learning_rate": 1.9994758175103935e-05, + "loss": 0.3229, + "step": 697 + }, + { + "epoch": 0.04, + "grad_norm": 0.6091437037486682, + "learning_rate": 1.999469775644868e-05, + "loss": 0.316, + "step": 698 + }, + { + "epoch": 0.04, + "grad_norm": 1.0768666202960535, + "learning_rate": 1.999463699167747e-05, + "loss": 0.5091, + "step": 699 + }, + { + "epoch": 0.04, + "grad_norm": 0.6652825079939793, + "learning_rate": 1.999457588079241e-05, + "loss": 0.3182, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 0.5327514124177343, + "learning_rate": 1.9994514423795616e-05, + "loss": 0.1642, + "step": 701 + }, + { + "epoch": 0.04, + "grad_norm": 0.6094186828307505, + "learning_rate": 1.9994452620689218e-05, + "loss": 0.3472, + "step": 702 + }, + { + "epoch": 0.04, + "grad_norm": 0.5620751646923052, + "learning_rate": 1.999439047147536e-05, + "loss": 0.3312, + "step": 703 + }, + { + "epoch": 0.04, + "grad_norm": 0.9069035528958392, + "learning_rate": 1.9994327976156184e-05, + "loss": 0.6049, + "step": 704 + }, + { + "epoch": 0.04, + "grad_norm": 0.6506383954312657, + "learning_rate": 1.9994265134733862e-05, + "loss": 0.397, + "step": 705 + }, + { + "epoch": 0.04, + "grad_norm": 0.45557859200370937, + "learning_rate": 1.999420194721057e-05, + "loss": 0.3243, + "step": 706 + }, + { + "epoch": 0.04, + "grad_norm": 0.48790557123093586, + "learning_rate": 1.999413841358849e-05, + "loss": 0.2508, + "step": 707 + }, + { + "epoch": 0.04, + "grad_norm": 1.776312246851432, + "learning_rate": 1.999407453386983e-05, + "loss": 0.3304, + "step": 708 + }, + { + "epoch": 0.04, + "grad_norm": 0.4977790934365835, + "learning_rate": 1.99940103080568e-05, + "loss": 0.3172, + "step": 709 + }, + { + "epoch": 0.04, + "grad_norm": 0.6252880128089249, + "learning_rate": 1.999394573615162e-05, + "loss": 0.3972, + "step": 710 + }, + { + "epoch": 0.04, + "grad_norm": 1.1498258098687408, + "learning_rate": 1.999388081815653e-05, + "loss": 0.5164, + "step": 711 + }, + { + "epoch": 0.04, + "grad_norm": 0.892056556618054, + "learning_rate": 1.9993815554073778e-05, + "loss": 0.2629, + "step": 712 + }, + { + "epoch": 0.04, + "grad_norm": 1.6940538088084591, + "learning_rate": 1.999374994390562e-05, + "loss": 0.7976, + "step": 713 + }, + { + "epoch": 0.04, + "grad_norm": 0.500147764955215, + "learning_rate": 1.9993683987654333e-05, + "loss": 0.2667, + "step": 714 + }, + { + "epoch": 0.04, + "grad_norm": 0.560974892337304, + "learning_rate": 1.99936176853222e-05, + "loss": 0.3075, + "step": 715 + }, + { + "epoch": 0.04, + "grad_norm": 1.183879772416323, + "learning_rate": 1.9993551036911514e-05, + "loss": 0.5875, + "step": 716 + }, + { + "epoch": 0.04, + "grad_norm": 0.5553161207454865, + "learning_rate": 1.9993484042424588e-05, + "loss": 0.3873, + "step": 717 + }, + { + "epoch": 0.04, + "grad_norm": 0.4812560176886251, + "learning_rate": 1.9993416701863736e-05, + "loss": 0.2393, + "step": 718 + }, + { + "epoch": 0.04, + "grad_norm": 1.3210916333620273, + "learning_rate": 1.9993349015231297e-05, + "loss": 0.8372, + "step": 719 + }, + { + "epoch": 0.04, + "grad_norm": 0.5269059279825339, + "learning_rate": 1.999328098252961e-05, + "loss": 0.2579, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 0.6285141235409536, + "learning_rate": 1.9993212603761032e-05, + "loss": 0.2367, + "step": 721 + }, + { + "epoch": 0.04, + "grad_norm": 0.564077425147305, + "learning_rate": 1.9993143878927933e-05, + "loss": 0.413, + "step": 722 + }, + { + "epoch": 0.04, + "grad_norm": 1.5407667855885592, + "learning_rate": 1.999307480803269e-05, + "loss": 0.6998, + "step": 723 + }, + { + "epoch": 0.04, + "grad_norm": 0.44099260765567994, + "learning_rate": 1.9993005391077694e-05, + "loss": 0.2509, + "step": 724 + }, + { + "epoch": 0.04, + "grad_norm": 0.6939970930509446, + "learning_rate": 1.999293562806535e-05, + "loss": 0.4579, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 0.5590999653786924, + "learning_rate": 1.999286551899808e-05, + "loss": 0.2503, + "step": 726 + }, + { + "epoch": 0.04, + "grad_norm": 0.4594339329962618, + "learning_rate": 1.9992795063878304e-05, + "loss": 0.2918, + "step": 727 + }, + { + "epoch": 0.04, + "grad_norm": 0.6897597721436182, + "learning_rate": 1.9992724262708466e-05, + "loss": 0.5618, + "step": 728 + }, + { + "epoch": 0.04, + "grad_norm": 0.6934232532251842, + "learning_rate": 1.9992653115491015e-05, + "loss": 0.4563, + "step": 729 + }, + { + "epoch": 0.04, + "grad_norm": 0.457017970343513, + "learning_rate": 1.9992581622228418e-05, + "loss": 0.3187, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 1.331623318724993, + "learning_rate": 1.9992509782923148e-05, + "loss": 0.5542, + "step": 731 + }, + { + "epoch": 0.04, + "grad_norm": 0.5444148013267605, + "learning_rate": 1.9992437597577692e-05, + "loss": 0.3955, + "step": 732 + }, + { + "epoch": 0.04, + "grad_norm": 0.4182804082754334, + "learning_rate": 1.9992365066194554e-05, + "loss": 0.3364, + "step": 733 + }, + { + "epoch": 0.04, + "grad_norm": 0.4630481339095031, + "learning_rate": 1.9992292188776244e-05, + "loss": 0.2866, + "step": 734 + }, + { + "epoch": 0.04, + "grad_norm": 1.576845422602978, + "learning_rate": 1.9992218965325283e-05, + "loss": 0.6871, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 0.8118373704300686, + "learning_rate": 1.999214539584421e-05, + "loss": 0.369, + "step": 736 + }, + { + "epoch": 0.04, + "grad_norm": 0.5701701832074956, + "learning_rate": 1.9992071480335574e-05, + "loss": 0.3343, + "step": 737 + }, + { + "epoch": 0.04, + "grad_norm": 0.592621937478156, + "learning_rate": 1.9991997218801925e-05, + "loss": 0.4622, + "step": 738 + }, + { + "epoch": 0.04, + "grad_norm": 0.5980367247552338, + "learning_rate": 1.999192261124585e-05, + "loss": 0.3063, + "step": 739 + }, + { + "epoch": 0.04, + "grad_norm": 0.40888204345849005, + "learning_rate": 1.9991847657669922e-05, + "loss": 0.2738, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 0.6537608743254626, + "learning_rate": 1.9991772358076738e-05, + "loss": 0.3047, + "step": 741 + }, + { + "epoch": 0.04, + "grad_norm": 0.5674094793046328, + "learning_rate": 1.999169671246891e-05, + "loss": 0.3443, + "step": 742 + }, + { + "epoch": 0.04, + "grad_norm": 0.6513678869487316, + "learning_rate": 1.999162072084905e-05, + "loss": 0.4706, + "step": 743 + }, + { + "epoch": 0.04, + "grad_norm": 0.7395606121601905, + "learning_rate": 1.9991544383219796e-05, + "loss": 0.4724, + "step": 744 + }, + { + "epoch": 0.04, + "grad_norm": 0.4922112233983982, + "learning_rate": 1.999146769958379e-05, + "loss": 0.312, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 0.4493305840997312, + "learning_rate": 1.999139066994369e-05, + "loss": 0.2616, + "step": 746 + }, + { + "epoch": 0.04, + "grad_norm": 1.197209300931008, + "learning_rate": 1.9991313294302155e-05, + "loss": 0.4788, + "step": 747 + }, + { + "epoch": 0.04, + "grad_norm": 0.4994103566217086, + "learning_rate": 1.9991235572661874e-05, + "loss": 0.3094, + "step": 748 + }, + { + "epoch": 0.04, + "grad_norm": 1.6447067429291433, + "learning_rate": 1.9991157505025534e-05, + "loss": 0.9033, + "step": 749 + }, + { + "epoch": 0.04, + "grad_norm": 0.5606319354875383, + "learning_rate": 1.999107909139584e-05, + "loss": 0.3895, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 0.5734727183073857, + "learning_rate": 1.9991000331775506e-05, + "loss": 0.3423, + "step": 751 + }, + { + "epoch": 0.04, + "grad_norm": 0.569467318382504, + "learning_rate": 1.999092122616726e-05, + "loss": 0.2506, + "step": 752 + }, + { + "epoch": 0.04, + "grad_norm": 0.5087244938226237, + "learning_rate": 1.9990841774573843e-05, + "loss": 0.331, + "step": 753 + }, + { + "epoch": 0.04, + "grad_norm": 0.34419700613207677, + "learning_rate": 1.9990761976998e-05, + "loss": 0.0829, + "step": 754 + }, + { + "epoch": 0.04, + "grad_norm": 0.5802236112684132, + "learning_rate": 1.9990681833442503e-05, + "loss": 0.3881, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 0.9103632693548557, + "learning_rate": 1.9990601343910126e-05, + "loss": 0.6007, + "step": 756 + }, + { + "epoch": 0.04, + "grad_norm": 0.45808385012447494, + "learning_rate": 1.999052050840365e-05, + "loss": 0.2621, + "step": 757 + }, + { + "epoch": 0.04, + "grad_norm": 0.4230696341008969, + "learning_rate": 1.9990439326925882e-05, + "loss": 0.2886, + "step": 758 + }, + { + "epoch": 0.04, + "grad_norm": 0.78737798009791, + "learning_rate": 1.9990357799479626e-05, + "loss": 0.342, + "step": 759 + }, + { + "epoch": 0.04, + "grad_norm": 0.5573592556824364, + "learning_rate": 1.9990275926067713e-05, + "loss": 0.2124, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 0.6546682929758058, + "learning_rate": 1.9990193706692972e-05, + "loss": 0.3749, + "step": 761 + }, + { + "epoch": 0.04, + "grad_norm": 1.533380714290272, + "learning_rate": 1.9990111141358252e-05, + "loss": 0.6221, + "step": 762 + }, + { + "epoch": 0.04, + "grad_norm": 0.5379176222315797, + "learning_rate": 1.9990028230066413e-05, + "loss": 0.2695, + "step": 763 + }, + { + "epoch": 0.04, + "grad_norm": 0.4715225267416928, + "learning_rate": 1.998994497282033e-05, + "loss": 0.252, + "step": 764 + }, + { + "epoch": 0.04, + "grad_norm": 0.4811990499682349, + "learning_rate": 1.9989861369622877e-05, + "loss": 0.2896, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 0.886555411853447, + "learning_rate": 1.9989777420476954e-05, + "loss": 0.4155, + "step": 766 + }, + { + "epoch": 0.04, + "grad_norm": 0.9790462773253676, + "learning_rate": 1.998969312538547e-05, + "loss": 0.3598, + "step": 767 + }, + { + "epoch": 0.04, + "grad_norm": 2.1859233395253255, + "learning_rate": 1.9989608484351343e-05, + "loss": 0.676, + "step": 768 + }, + { + "epoch": 0.04, + "grad_norm": 0.5154902003353229, + "learning_rate": 1.9989523497377505e-05, + "loss": 0.3098, + "step": 769 + }, + { + "epoch": 0.04, + "grad_norm": 0.6483670299556568, + "learning_rate": 1.9989438164466896e-05, + "loss": 0.3439, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 0.39362482228028733, + "learning_rate": 1.9989352485622472e-05, + "loss": 0.1467, + "step": 771 + }, + { + "epoch": 0.04, + "grad_norm": 1.74006668147056, + "learning_rate": 1.9989266460847207e-05, + "loss": 0.4542, + "step": 772 + }, + { + "epoch": 0.04, + "grad_norm": 0.8813615902844071, + "learning_rate": 1.998918009014407e-05, + "loss": 0.3356, + "step": 773 + }, + { + "epoch": 0.04, + "grad_norm": 1.841448242464731, + "learning_rate": 1.9989093373516053e-05, + "loss": 0.7304, + "step": 774 + }, + { + "epoch": 0.04, + "grad_norm": 1.0736502504903374, + "learning_rate": 1.9989006310966162e-05, + "loss": 0.4557, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 0.5013227982869481, + "learning_rate": 1.9988918902497417e-05, + "loss": 0.2518, + "step": 776 + }, + { + "epoch": 0.04, + "grad_norm": 0.5610738984393089, + "learning_rate": 1.998883114811284e-05, + "loss": 0.3094, + "step": 777 + }, + { + "epoch": 0.04, + "grad_norm": 1.8664247677141819, + "learning_rate": 1.9988743047815465e-05, + "loss": 0.4851, + "step": 778 + }, + { + "epoch": 0.04, + "grad_norm": 1.1921957110147678, + "learning_rate": 1.9988654601608354e-05, + "loss": 0.4159, + "step": 779 + }, + { + "epoch": 0.04, + "grad_norm": 1.8570412985712648, + "learning_rate": 1.998856580949456e-05, + "loss": 0.6539, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 0.5304787388263474, + "learning_rate": 1.998847667147716e-05, + "loss": 0.3169, + "step": 781 + }, + { + "epoch": 0.04, + "grad_norm": 0.6695906793491527, + "learning_rate": 1.9988387187559245e-05, + "loss": 0.3891, + "step": 782 + }, + { + "epoch": 0.04, + "grad_norm": 0.4991033760441603, + "learning_rate": 1.9988297357743906e-05, + "loss": 0.1626, + "step": 783 + }, + { + "epoch": 0.05, + "grad_norm": 1.6294509961056738, + "learning_rate": 1.9988207182034264e-05, + "loss": 0.4898, + "step": 784 + }, + { + "epoch": 0.05, + "grad_norm": 0.7317674592716401, + "learning_rate": 1.9988116660433435e-05, + "loss": 0.3378, + "step": 785 + }, + { + "epoch": 0.05, + "grad_norm": 1.1238616873997904, + "learning_rate": 1.9988025792944558e-05, + "loss": 0.3753, + "step": 786 + }, + { + "epoch": 0.05, + "grad_norm": 0.7119062202083866, + "learning_rate": 1.9987934579570776e-05, + "loss": 0.3211, + "step": 787 + }, + { + "epoch": 0.05, + "grad_norm": 0.7562474683264186, + "learning_rate": 1.9987843020315248e-05, + "loss": 0.3854, + "step": 788 + }, + { + "epoch": 0.05, + "grad_norm": 0.5043603296184589, + "learning_rate": 1.9987751115181147e-05, + "loss": 0.3109, + "step": 789 + }, + { + "epoch": 0.05, + "grad_norm": 1.0146693352319747, + "learning_rate": 1.9987658864171653e-05, + "loss": 0.5063, + "step": 790 + }, + { + "epoch": 0.05, + "grad_norm": 0.42337892738407523, + "learning_rate": 1.9987566267289963e-05, + "loss": 0.2544, + "step": 791 + }, + { + "epoch": 0.05, + "grad_norm": 1.2475416323106774, + "learning_rate": 1.9987473324539285e-05, + "loss": 0.7866, + "step": 792 + }, + { + "epoch": 0.05, + "grad_norm": 0.7025050670031587, + "learning_rate": 1.998738003592283e-05, + "loss": 0.3059, + "step": 793 + }, + { + "epoch": 0.05, + "grad_norm": 0.5384616703375238, + "learning_rate": 1.9987286401443838e-05, + "loss": 0.3314, + "step": 794 + }, + { + "epoch": 0.05, + "grad_norm": 1.0796972648685363, + "learning_rate": 1.9987192421105546e-05, + "loss": 0.5532, + "step": 795 + }, + { + "epoch": 0.05, + "grad_norm": 0.34313687983856905, + "learning_rate": 1.998709809491121e-05, + "loss": 0.1962, + "step": 796 + }, + { + "epoch": 0.05, + "grad_norm": 0.3985139635769342, + "learning_rate": 1.9987003422864094e-05, + "loss": 0.2577, + "step": 797 + }, + { + "epoch": 0.05, + "grad_norm": 1.6243134259537804, + "learning_rate": 1.998690840496748e-05, + "loss": 0.8693, + "step": 798 + }, + { + "epoch": 0.05, + "grad_norm": 0.7373614038171744, + "learning_rate": 1.9986813041224662e-05, + "loss": 0.2976, + "step": 799 + }, + { + "epoch": 0.05, + "grad_norm": 0.6195132227874984, + "learning_rate": 1.9986717331638935e-05, + "loss": 0.3861, + "step": 800 + }, + { + "epoch": 0.05, + "grad_norm": 0.5595292667728948, + "learning_rate": 1.9986621276213616e-05, + "loss": 0.3883, + "step": 801 + }, + { + "epoch": 0.05, + "grad_norm": 0.5434227894915622, + "learning_rate": 1.998652487495203e-05, + "loss": 0.2684, + "step": 802 + }, + { + "epoch": 0.05, + "grad_norm": 0.4591481879778902, + "learning_rate": 1.998642812785752e-05, + "loss": 0.2694, + "step": 803 + }, + { + "epoch": 0.05, + "grad_norm": 1.6614380862805616, + "learning_rate": 1.998633103493343e-05, + "loss": 0.8139, + "step": 804 + }, + { + "epoch": 0.05, + "grad_norm": 0.4670262324947827, + "learning_rate": 1.998623359618313e-05, + "loss": 0.3484, + "step": 805 + }, + { + "epoch": 0.05, + "grad_norm": 0.564486768316231, + "learning_rate": 1.9986135811609983e-05, + "loss": 0.2648, + "step": 806 + }, + { + "epoch": 0.05, + "grad_norm": 1.053932627292119, + "learning_rate": 1.998603768121739e-05, + "loss": 0.5622, + "step": 807 + }, + { + "epoch": 0.05, + "grad_norm": 0.4672012049877789, + "learning_rate": 1.9985939205008734e-05, + "loss": 0.3168, + "step": 808 + }, + { + "epoch": 0.05, + "grad_norm": 0.4293014248611022, + "learning_rate": 1.998584038298744e-05, + "loss": 0.2512, + "step": 809 + }, + { + "epoch": 0.05, + "grad_norm": 0.6436970530457229, + "learning_rate": 1.998574121515692e-05, + "loss": 0.3514, + "step": 810 + }, + { + "epoch": 0.05, + "grad_norm": 1.0177245118126792, + "learning_rate": 1.998564170152061e-05, + "loss": 0.5856, + "step": 811 + }, + { + "epoch": 0.05, + "grad_norm": 0.45703131510777767, + "learning_rate": 1.9985541842081957e-05, + "loss": 0.3059, + "step": 812 + }, + { + "epoch": 0.05, + "grad_norm": 0.48741084543212293, + "learning_rate": 1.9985441636844424e-05, + "loss": 0.413, + "step": 813 + }, + { + "epoch": 0.05, + "grad_norm": 1.4244781136880331, + "learning_rate": 1.998534108581147e-05, + "loss": 0.6553, + "step": 814 + }, + { + "epoch": 0.05, + "grad_norm": 0.46494993418916686, + "learning_rate": 1.998524018898659e-05, + "loss": 0.2524, + "step": 815 + }, + { + "epoch": 0.05, + "grad_norm": 0.5302682621937632, + "learning_rate": 1.9985138946373266e-05, + "loss": 0.3199, + "step": 816 + }, + { + "epoch": 0.05, + "grad_norm": 0.43980037601885463, + "learning_rate": 1.9985037357975013e-05, + "loss": 0.3068, + "step": 817 + }, + { + "epoch": 0.05, + "grad_norm": 0.504431850220822, + "learning_rate": 1.9984935423795345e-05, + "loss": 0.3338, + "step": 818 + }, + { + "epoch": 0.05, + "grad_norm": 0.8249104939952835, + "learning_rate": 1.998483314383779e-05, + "loss": 0.4644, + "step": 819 + }, + { + "epoch": 0.05, + "grad_norm": 0.5695000365147458, + "learning_rate": 1.9984730518105897e-05, + "loss": 0.3879, + "step": 820 + }, + { + "epoch": 0.05, + "grad_norm": 0.4473843111395704, + "learning_rate": 1.9984627546603214e-05, + "loss": 0.3239, + "step": 821 + }, + { + "epoch": 0.05, + "grad_norm": 0.3833543869741045, + "learning_rate": 1.9984524229333307e-05, + "loss": 0.1974, + "step": 822 + }, + { + "epoch": 0.05, + "grad_norm": 0.4942323075738801, + "learning_rate": 1.9984420566299756e-05, + "loss": 0.347, + "step": 823 + }, + { + "epoch": 0.05, + "grad_norm": 0.5670416194035222, + "learning_rate": 1.998431655750615e-05, + "loss": 0.3483, + "step": 824 + }, + { + "epoch": 0.05, + "grad_norm": 0.5051832663125587, + "learning_rate": 1.998421220295609e-05, + "loss": 0.3617, + "step": 825 + }, + { + "epoch": 0.05, + "grad_norm": 1.6905171954763243, + "learning_rate": 1.9984107502653193e-05, + "loss": 0.7626, + "step": 826 + }, + { + "epoch": 0.05, + "grad_norm": 0.4875542594639041, + "learning_rate": 1.9984002456601082e-05, + "loss": 0.2914, + "step": 827 + }, + { + "epoch": 0.05, + "grad_norm": 0.6393314261217024, + "learning_rate": 1.9983897064803396e-05, + "loss": 0.4494, + "step": 828 + }, + { + "epoch": 0.05, + "grad_norm": 0.5626873081314722, + "learning_rate": 1.9983791327263782e-05, + "loss": 0.4125, + "step": 829 + }, + { + "epoch": 0.05, + "grad_norm": 0.4740622156552829, + "learning_rate": 1.9983685243985905e-05, + "loss": 0.3176, + "step": 830 + }, + { + "epoch": 0.05, + "grad_norm": 0.45140349812898456, + "learning_rate": 1.9983578814973437e-05, + "loss": 0.2087, + "step": 831 + }, + { + "epoch": 0.05, + "grad_norm": 0.582391697868439, + "learning_rate": 1.9983472040230063e-05, + "loss": 0.3153, + "step": 832 + }, + { + "epoch": 0.05, + "grad_norm": 0.504699200239562, + "learning_rate": 1.998336491975948e-05, + "loss": 0.3052, + "step": 833 + }, + { + "epoch": 0.05, + "grad_norm": 1.6280795918613231, + "learning_rate": 1.9983257453565402e-05, + "loss": 0.8253, + "step": 834 + }, + { + "epoch": 0.05, + "grad_norm": 0.538834623854235, + "learning_rate": 1.9983149641651546e-05, + "loss": 0.3547, + "step": 835 + }, + { + "epoch": 0.05, + "grad_norm": 0.661409036567593, + "learning_rate": 1.998304148402165e-05, + "loss": 0.3122, + "step": 836 + }, + { + "epoch": 0.05, + "grad_norm": 0.48427368476034816, + "learning_rate": 1.9982932980679455e-05, + "loss": 0.2972, + "step": 837 + }, + { + "epoch": 0.05, + "grad_norm": 1.8611669536442395, + "learning_rate": 1.998282413162872e-05, + "loss": 0.5054, + "step": 838 + }, + { + "epoch": 0.05, + "grad_norm": 0.5038129864771659, + "learning_rate": 1.9982714936873215e-05, + "loss": 0.3187, + "step": 839 + }, + { + "epoch": 0.05, + "grad_norm": 0.6294109039118858, + "learning_rate": 1.998260539641672e-05, + "loss": 0.4348, + "step": 840 + }, + { + "epoch": 0.05, + "grad_norm": 0.582952653039707, + "learning_rate": 1.998249551026303e-05, + "loss": 0.4443, + "step": 841 + }, + { + "epoch": 0.05, + "grad_norm": 0.5631283283567661, + "learning_rate": 1.998238527841595e-05, + "loss": 0.2381, + "step": 842 + }, + { + "epoch": 0.05, + "grad_norm": 0.40488497045596294, + "learning_rate": 1.9982274700879295e-05, + "loss": 0.1833, + "step": 843 + }, + { + "epoch": 0.05, + "grad_norm": 0.6296325473623197, + "learning_rate": 1.9982163777656902e-05, + "loss": 0.3783, + "step": 844 + }, + { + "epoch": 0.05, + "grad_norm": 0.5638688872776851, + "learning_rate": 1.9982052508752605e-05, + "loss": 0.2327, + "step": 845 + }, + { + "epoch": 0.05, + "grad_norm": 0.9837522407153001, + "learning_rate": 1.998194089417025e-05, + "loss": 0.5804, + "step": 846 + }, + { + "epoch": 0.05, + "grad_norm": 0.8229817506353978, + "learning_rate": 1.9981828933913722e-05, + "loss": 0.5947, + "step": 847 + }, + { + "epoch": 0.05, + "grad_norm": 0.5743285905374217, + "learning_rate": 1.9981716627986882e-05, + "loss": 0.2498, + "step": 848 + }, + { + "epoch": 0.05, + "grad_norm": 0.440183500729955, + "learning_rate": 1.9981603976393625e-05, + "loss": 0.2566, + "step": 849 + }, + { + "epoch": 0.05, + "grad_norm": 1.769652766589286, + "learning_rate": 1.9981490979137853e-05, + "loss": 0.8859, + "step": 850 + }, + { + "epoch": 0.05, + "grad_norm": 0.5989281680159326, + "learning_rate": 1.9981377636223477e-05, + "loss": 0.2521, + "step": 851 + }, + { + "epoch": 0.05, + "grad_norm": 1.0097503020108294, + "learning_rate": 1.998126394765442e-05, + "loss": 0.4525, + "step": 852 + }, + { + "epoch": 0.05, + "grad_norm": 0.7673720570657228, + "learning_rate": 1.9981149913434626e-05, + "loss": 0.4724, + "step": 853 + }, + { + "epoch": 0.05, + "grad_norm": 0.4981975468603891, + "learning_rate": 1.9981035533568035e-05, + "loss": 0.3007, + "step": 854 + }, + { + "epoch": 0.05, + "grad_norm": 0.340649236428933, + "learning_rate": 1.998092080805862e-05, + "loss": 0.1132, + "step": 855 + }, + { + "epoch": 0.05, + "grad_norm": 0.652425187067407, + "learning_rate": 1.9980805736910337e-05, + "loss": 0.4076, + "step": 856 + }, + { + "epoch": 0.05, + "grad_norm": 0.5959681924395248, + "learning_rate": 1.9980690320127188e-05, + "loss": 0.3472, + "step": 857 + }, + { + "epoch": 0.05, + "grad_norm": 1.4708566945755925, + "learning_rate": 1.998057455771316e-05, + "loss": 0.4343, + "step": 858 + }, + { + "epoch": 0.05, + "grad_norm": 1.6946441906172074, + "learning_rate": 1.9980458449672263e-05, + "loss": 0.7589, + "step": 859 + }, + { + "epoch": 0.05, + "grad_norm": 0.546655269731215, + "learning_rate": 1.998034199600852e-05, + "loss": 0.3238, + "step": 860 + }, + { + "epoch": 0.05, + "grad_norm": 0.4237690098340433, + "learning_rate": 1.9980225196725964e-05, + "loss": 0.2079, + "step": 861 + }, + { + "epoch": 0.05, + "grad_norm": 1.1400927331869855, + "learning_rate": 1.998010805182864e-05, + "loss": 0.5551, + "step": 862 + }, + { + "epoch": 0.05, + "grad_norm": 0.5594124097189067, + "learning_rate": 1.9979990561320597e-05, + "loss": 0.3234, + "step": 863 + }, + { + "epoch": 0.05, + "grad_norm": 0.8241251910035258, + "learning_rate": 1.9979872725205915e-05, + "loss": 0.3545, + "step": 864 + }, + { + "epoch": 0.05, + "grad_norm": 1.6342272397960347, + "learning_rate": 1.997975454348867e-05, + "loss": 0.7562, + "step": 865 + }, + { + "epoch": 0.05, + "grad_norm": 0.5442992828218223, + "learning_rate": 1.9979636016172952e-05, + "loss": 0.3479, + "step": 866 + }, + { + "epoch": 0.05, + "grad_norm": 1.0426475164986881, + "learning_rate": 1.9979517143262867e-05, + "loss": 0.2534, + "step": 867 + }, + { + "epoch": 0.05, + "grad_norm": 0.8241038141038709, + "learning_rate": 1.9979397924762537e-05, + "loss": 0.3189, + "step": 868 + }, + { + "epoch": 0.05, + "grad_norm": 0.8868676701867368, + "learning_rate": 1.9979278360676082e-05, + "loss": 0.3811, + "step": 869 + }, + { + "epoch": 0.05, + "grad_norm": 0.8775413046921993, + "learning_rate": 1.9979158451007648e-05, + "loss": 0.4496, + "step": 870 + }, + { + "epoch": 0.05, + "grad_norm": 1.841188215520176, + "learning_rate": 1.9979038195761386e-05, + "loss": 0.5085, + "step": 871 + }, + { + "epoch": 0.05, + "grad_norm": 0.46194012023312764, + "learning_rate": 1.997891759494146e-05, + "loss": 0.2981, + "step": 872 + }, + { + "epoch": 0.05, + "grad_norm": 0.7829547374288088, + "learning_rate": 1.9978796648552045e-05, + "loss": 0.415, + "step": 873 + }, + { + "epoch": 0.05, + "grad_norm": 0.7663806220882737, + "learning_rate": 1.9978675356597334e-05, + "loss": 0.2796, + "step": 874 + }, + { + "epoch": 0.05, + "grad_norm": 0.6505647168387442, + "learning_rate": 1.9978553719081523e-05, + "loss": 0.3577, + "step": 875 + }, + { + "epoch": 0.05, + "grad_norm": 0.6693024697212153, + "learning_rate": 1.997843173600883e-05, + "loss": 0.3511, + "step": 876 + }, + { + "epoch": 0.05, + "grad_norm": 1.4101677187766488, + "learning_rate": 1.997830940738347e-05, + "loss": 0.5603, + "step": 877 + }, + { + "epoch": 0.05, + "grad_norm": 0.8793601862533259, + "learning_rate": 1.9978186733209686e-05, + "loss": 0.3397, + "step": 878 + }, + { + "epoch": 0.05, + "grad_norm": 0.587777319443702, + "learning_rate": 1.997806371349172e-05, + "loss": 0.2368, + "step": 879 + }, + { + "epoch": 0.05, + "grad_norm": 0.5639544073283919, + "learning_rate": 1.9977940348233845e-05, + "loss": 0.4186, + "step": 880 + }, + { + "epoch": 0.05, + "grad_norm": 0.7457185631167834, + "learning_rate": 1.997781663744032e-05, + "loss": 0.2677, + "step": 881 + }, + { + "epoch": 0.05, + "grad_norm": 0.6447444340438591, + "learning_rate": 1.9977692581115436e-05, + "loss": 0.3951, + "step": 882 + }, + { + "epoch": 0.05, + "grad_norm": 1.6502688235687761, + "learning_rate": 1.9977568179263484e-05, + "loss": 0.8296, + "step": 883 + }, + { + "epoch": 0.05, + "grad_norm": 0.4514393605711949, + "learning_rate": 1.9977443431888778e-05, + "loss": 0.2656, + "step": 884 + }, + { + "epoch": 0.05, + "grad_norm": 1.0553305711117273, + "learning_rate": 1.9977318338995632e-05, + "loss": 0.4572, + "step": 885 + }, + { + "epoch": 0.05, + "grad_norm": 0.8407200121591809, + "learning_rate": 1.9977192900588385e-05, + "loss": 0.5764, + "step": 886 + }, + { + "epoch": 0.05, + "grad_norm": 0.47881812578845484, + "learning_rate": 1.9977067116671374e-05, + "loss": 0.1567, + "step": 887 + }, + { + "epoch": 0.05, + "grad_norm": 0.4873118349825849, + "learning_rate": 1.9976940987248956e-05, + "loss": 0.3673, + "step": 888 + }, + { + "epoch": 0.05, + "grad_norm": 1.4079005001201457, + "learning_rate": 1.9976814512325503e-05, + "loss": 0.8054, + "step": 889 + }, + { + "epoch": 0.05, + "grad_norm": 0.5093928927192002, + "learning_rate": 1.9976687691905394e-05, + "loss": 0.2287, + "step": 890 + }, + { + "epoch": 0.05, + "grad_norm": 0.7518705232066628, + "learning_rate": 1.9976560525993015e-05, + "loss": 0.4441, + "step": 891 + }, + { + "epoch": 0.05, + "grad_norm": 0.6356126660781017, + "learning_rate": 1.9976433014592776e-05, + "loss": 0.4113, + "step": 892 + }, + { + "epoch": 0.05, + "grad_norm": 0.52789164129456, + "learning_rate": 1.9976305157709092e-05, + "loss": 0.2405, + "step": 893 + }, + { + "epoch": 0.05, + "grad_norm": 0.6451124335109072, + "learning_rate": 1.9976176955346392e-05, + "loss": 0.3365, + "step": 894 + }, + { + "epoch": 0.05, + "grad_norm": 0.4228472921247635, + "learning_rate": 1.9976048407509107e-05, + "loss": 0.3027, + "step": 895 + }, + { + "epoch": 0.05, + "grad_norm": 0.5781676443680558, + "learning_rate": 1.99759195142017e-05, + "loss": 0.3938, + "step": 896 + }, + { + "epoch": 0.05, + "grad_norm": 0.5641365163135668, + "learning_rate": 1.9975790275428625e-05, + "loss": 0.3254, + "step": 897 + }, + { + "epoch": 0.05, + "grad_norm": 0.5717025160344602, + "learning_rate": 1.9975660691194365e-05, + "loss": 0.4194, + "step": 898 + }, + { + "epoch": 0.05, + "grad_norm": 1.0582116646632875, + "learning_rate": 1.99755307615034e-05, + "loss": 0.5222, + "step": 899 + }, + { + "epoch": 0.05, + "grad_norm": 0.4722208730822841, + "learning_rate": 1.997540048636024e-05, + "loss": 0.2897, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 0.6038363034287668, + "learning_rate": 1.997526986576938e-05, + "loss": 0.3378, + "step": 901 + }, + { + "epoch": 0.05, + "grad_norm": 0.5519184370509896, + "learning_rate": 1.9975138899735366e-05, + "loss": 0.3994, + "step": 902 + }, + { + "epoch": 0.05, + "grad_norm": 0.4720499415776914, + "learning_rate": 1.9975007588262715e-05, + "loss": 0.3905, + "step": 903 + }, + { + "epoch": 0.05, + "grad_norm": 0.5569224560379596, + "learning_rate": 1.9974875931355977e-05, + "loss": 0.3047, + "step": 904 + }, + { + "epoch": 0.05, + "grad_norm": 0.7713492034663683, + "learning_rate": 1.9974743929019717e-05, + "loss": 0.3841, + "step": 905 + }, + { + "epoch": 0.05, + "grad_norm": 0.5967554524752199, + "learning_rate": 1.99746115812585e-05, + "loss": 0.3936, + "step": 906 + }, + { + "epoch": 0.05, + "grad_norm": 0.4020352157362454, + "learning_rate": 1.997447888807692e-05, + "loss": 0.1904, + "step": 907 + }, + { + "epoch": 0.05, + "grad_norm": 0.4793739647988955, + "learning_rate": 1.997434584947956e-05, + "loss": 0.3606, + "step": 908 + }, + { + "epoch": 0.05, + "grad_norm": 0.5324891885968183, + "learning_rate": 1.9974212465471037e-05, + "loss": 0.3638, + "step": 909 + }, + { + "epoch": 0.05, + "grad_norm": 0.711921743120288, + "learning_rate": 1.9974078736055963e-05, + "loss": 0.4732, + "step": 910 + }, + { + "epoch": 0.05, + "grad_norm": 0.4754960806126236, + "learning_rate": 1.997394466123897e-05, + "loss": 0.3441, + "step": 911 + }, + { + "epoch": 0.05, + "grad_norm": 0.4828296923573561, + "learning_rate": 1.99738102410247e-05, + "loss": 0.3559, + "step": 912 + }, + { + "epoch": 0.05, + "grad_norm": 0.4723973954273907, + "learning_rate": 1.9973675475417814e-05, + "loss": 0.1828, + "step": 913 + }, + { + "epoch": 0.05, + "grad_norm": 0.9490921519164291, + "learning_rate": 1.9973540364422973e-05, + "loss": 0.4723, + "step": 914 + }, + { + "epoch": 0.05, + "grad_norm": 0.5314162635369581, + "learning_rate": 1.997340490804486e-05, + "loss": 0.3647, + "step": 915 + }, + { + "epoch": 0.05, + "grad_norm": 0.48861096170235063, + "learning_rate": 1.9973269106288163e-05, + "loss": 0.387, + "step": 916 + }, + { + "epoch": 0.05, + "grad_norm": 0.635391914689005, + "learning_rate": 1.997313295915759e-05, + "loss": 0.3044, + "step": 917 + }, + { + "epoch": 0.05, + "grad_norm": 0.46713708953393285, + "learning_rate": 1.9972996466657846e-05, + "loss": 0.3306, + "step": 918 + }, + { + "epoch": 0.05, + "grad_norm": 0.478474730302629, + "learning_rate": 1.9972859628793663e-05, + "loss": 0.3477, + "step": 919 + }, + { + "epoch": 0.05, + "grad_norm": 0.8346975964677359, + "learning_rate": 1.9972722445569782e-05, + "loss": 0.3668, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 0.44496567184720826, + "learning_rate": 1.997258491699095e-05, + "loss": 0.2714, + "step": 921 + }, + { + "epoch": 0.05, + "grad_norm": 1.387562812852257, + "learning_rate": 1.9972447043061933e-05, + "loss": 0.7089, + "step": 922 + }, + { + "epoch": 0.05, + "grad_norm": 0.4650130501105177, + "learning_rate": 1.9972308823787504e-05, + "loss": 0.2542, + "step": 923 + }, + { + "epoch": 0.05, + "grad_norm": 0.49044841015158935, + "learning_rate": 1.9972170259172444e-05, + "loss": 0.3412, + "step": 924 + }, + { + "epoch": 0.05, + "grad_norm": 0.9751351988159547, + "learning_rate": 1.9972031349221563e-05, + "loss": 0.585, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 0.4007795809187631, + "learning_rate": 1.9971892093939663e-05, + "loss": 0.2822, + "step": 926 + }, + { + "epoch": 0.05, + "grad_norm": 0.4016861009078226, + "learning_rate": 1.9971752493331568e-05, + "loss": 0.2736, + "step": 927 + }, + { + "epoch": 0.05, + "grad_norm": 0.5568432011062416, + "learning_rate": 1.9971612547402116e-05, + "loss": 0.3634, + "step": 928 + }, + { + "epoch": 0.05, + "grad_norm": 0.8775709744925506, + "learning_rate": 1.9971472256156147e-05, + "loss": 0.507, + "step": 929 + }, + { + "epoch": 0.05, + "grad_norm": 0.5513163113977712, + "learning_rate": 1.997133161959852e-05, + "loss": 0.2609, + "step": 930 + }, + { + "epoch": 0.05, + "grad_norm": 0.5854383825688924, + "learning_rate": 1.9971190637734113e-05, + "loss": 0.3781, + "step": 931 + }, + { + "epoch": 0.05, + "grad_norm": 0.5414854941618819, + "learning_rate": 1.99710493105678e-05, + "loss": 0.3819, + "step": 932 + }, + { + "epoch": 0.05, + "grad_norm": 0.36193619268358984, + "learning_rate": 1.9970907638104483e-05, + "loss": 0.1874, + "step": 933 + }, + { + "epoch": 0.05, + "grad_norm": 1.0396706344808258, + "learning_rate": 1.9970765620349058e-05, + "loss": 0.6366, + "step": 934 + }, + { + "epoch": 0.05, + "grad_norm": 0.5805743491039917, + "learning_rate": 1.997062325730645e-05, + "loss": 0.36, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 0.4610257475867486, + "learning_rate": 1.997048054898159e-05, + "loss": 0.2515, + "step": 936 + }, + { + "epoch": 0.05, + "grad_norm": 1.3399968664903208, + "learning_rate": 1.997033749537941e-05, + "loss": 0.5433, + "step": 937 + }, + { + "epoch": 0.05, + "grad_norm": 0.5558017309777695, + "learning_rate": 1.9970194096504877e-05, + "loss": 0.2575, + "step": 938 + }, + { + "epoch": 0.05, + "grad_norm": 0.47350587576542036, + "learning_rate": 1.9970050352362952e-05, + "loss": 0.2149, + "step": 939 + }, + { + "epoch": 0.05, + "grad_norm": 0.8210880489033746, + "learning_rate": 1.996990626295861e-05, + "loss": 0.4412, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 1.5633944668244224, + "learning_rate": 1.9969761828296843e-05, + "loss": 0.8621, + "step": 941 + }, + { + "epoch": 0.05, + "grad_norm": 0.5279281352946498, + "learning_rate": 1.9969617048382653e-05, + "loss": 0.3227, + "step": 942 + }, + { + "epoch": 0.05, + "grad_norm": 0.5300481438173281, + "learning_rate": 1.996947192322105e-05, + "loss": 0.3458, + "step": 943 + }, + { + "epoch": 0.05, + "grad_norm": 0.7143815231199013, + "learning_rate": 1.9969326452817068e-05, + "loss": 0.4565, + "step": 944 + }, + { + "epoch": 0.05, + "grad_norm": 0.5568644549729681, + "learning_rate": 1.9969180637175737e-05, + "loss": 0.3141, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 0.4280076221992522, + "learning_rate": 1.9969034476302108e-05, + "loss": 0.0969, + "step": 946 + }, + { + "epoch": 0.05, + "grad_norm": 0.5319356976602545, + "learning_rate": 1.996888797020125e-05, + "loss": 0.3553, + "step": 947 + }, + { + "epoch": 0.05, + "grad_norm": 0.5872685383034628, + "learning_rate": 1.9968741118878224e-05, + "loss": 0.3226, + "step": 948 + }, + { + "epoch": 0.05, + "grad_norm": 1.0190650467630764, + "learning_rate": 1.9968593922338125e-05, + "loss": 0.4937, + "step": 949 + }, + { + "epoch": 0.05, + "grad_norm": 0.6087067671856272, + "learning_rate": 1.9968446380586045e-05, + "loss": 0.3468, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 0.5114568446529008, + "learning_rate": 1.9968298493627096e-05, + "loss": 0.2784, + "step": 951 + }, + { + "epoch": 0.05, + "grad_norm": 0.40876696190949857, + "learning_rate": 1.99681502614664e-05, + "loss": 0.2605, + "step": 952 + }, + { + "epoch": 0.05, + "grad_norm": 1.1545115092345783, + "learning_rate": 1.9968001684109086e-05, + "loss": 0.6098, + "step": 953 + }, + { + "epoch": 0.05, + "grad_norm": 0.48476631935801145, + "learning_rate": 1.9967852761560304e-05, + "loss": 0.3156, + "step": 954 + }, + { + "epoch": 0.05, + "grad_norm": 0.4489912788177858, + "learning_rate": 1.996770349382521e-05, + "loss": 0.3845, + "step": 955 + }, + { + "epoch": 0.05, + "grad_norm": 0.8817666801720644, + "learning_rate": 1.9967553880908973e-05, + "loss": 0.3831, + "step": 956 + }, + { + "epoch": 0.05, + "grad_norm": 0.5005095482186253, + "learning_rate": 1.996740392281677e-05, + "loss": 0.3326, + "step": 957 + }, + { + "epoch": 0.06, + "grad_norm": 0.41208570683327056, + "learning_rate": 1.9967253619553805e-05, + "loss": 0.1964, + "step": 958 + }, + { + "epoch": 0.06, + "grad_norm": 0.9145023410100701, + "learning_rate": 1.996710297112527e-05, + "loss": 0.3778, + "step": 959 + }, + { + "epoch": 0.06, + "grad_norm": 0.5094709013843017, + "learning_rate": 1.9966951977536387e-05, + "loss": 0.3345, + "step": 960 + }, + { + "epoch": 0.06, + "grad_norm": 1.1238884116633348, + "learning_rate": 1.996680063879239e-05, + "loss": 0.6331, + "step": 961 + }, + { + "epoch": 0.06, + "grad_norm": 0.6022319418093043, + "learning_rate": 1.9966648954898515e-05, + "loss": 0.3348, + "step": 962 + }, + { + "epoch": 0.06, + "grad_norm": 0.5012059527869338, + "learning_rate": 1.9966496925860014e-05, + "loss": 0.2811, + "step": 963 + }, + { + "epoch": 0.06, + "grad_norm": 0.34219376584073213, + "learning_rate": 1.996634455168215e-05, + "loss": 0.2451, + "step": 964 + }, + { + "epoch": 0.06, + "grad_norm": 1.2382160129106405, + "learning_rate": 1.9966191832370208e-05, + "loss": 0.4533, + "step": 965 + }, + { + "epoch": 0.06, + "grad_norm": 0.5785626838524301, + "learning_rate": 1.9966038767929468e-05, + "loss": 0.3016, + "step": 966 + }, + { + "epoch": 0.06, + "grad_norm": 0.5612011179574358, + "learning_rate": 1.9965885358365234e-05, + "loss": 0.3872, + "step": 967 + }, + { + "epoch": 0.06, + "grad_norm": 1.4325677209096839, + "learning_rate": 1.996573160368282e-05, + "loss": 0.6171, + "step": 968 + }, + { + "epoch": 0.06, + "grad_norm": 0.4714099776950298, + "learning_rate": 1.996557750388755e-05, + "loss": 0.2267, + "step": 969 + }, + { + "epoch": 0.06, + "grad_norm": 0.45563509639747796, + "learning_rate": 1.996542305898476e-05, + "loss": 0.2878, + "step": 970 + }, + { + "epoch": 0.06, + "grad_norm": 0.5373298863023417, + "learning_rate": 1.9965268268979794e-05, + "loss": 0.357, + "step": 971 + }, + { + "epoch": 0.06, + "grad_norm": 0.588654271043847, + "learning_rate": 1.996511313387802e-05, + "loss": 0.2835, + "step": 972 + }, + { + "epoch": 0.06, + "grad_norm": 1.4807502575156875, + "learning_rate": 1.9964957653684804e-05, + "loss": 0.8575, + "step": 973 + }, + { + "epoch": 0.06, + "grad_norm": 1.4442004066106364, + "learning_rate": 1.9964801828405536e-05, + "loss": 0.6195, + "step": 974 + }, + { + "epoch": 0.06, + "grad_norm": 0.4943797589189964, + "learning_rate": 1.9964645658045607e-05, + "loss": 0.2719, + "step": 975 + }, + { + "epoch": 0.06, + "grad_norm": 0.39577055533039807, + "learning_rate": 1.9964489142610426e-05, + "loss": 0.1685, + "step": 976 + }, + { + "epoch": 0.06, + "grad_norm": 1.1935907452878245, + "learning_rate": 1.996433228210542e-05, + "loss": 0.5387, + "step": 977 + }, + { + "epoch": 0.06, + "grad_norm": 0.5879709174870033, + "learning_rate": 1.996417507653601e-05, + "loss": 0.2531, + "step": 978 + }, + { + "epoch": 0.06, + "grad_norm": 0.5808913193155929, + "learning_rate": 1.9964017525907646e-05, + "loss": 0.365, + "step": 979 + }, + { + "epoch": 0.06, + "grad_norm": 1.8732982117118, + "learning_rate": 1.9963859630225786e-05, + "loss": 0.7228, + "step": 980 + }, + { + "epoch": 0.06, + "grad_norm": 0.5626726887011435, + "learning_rate": 1.9963701389495896e-05, + "loss": 0.3331, + "step": 981 + }, + { + "epoch": 0.06, + "grad_norm": 0.77938332796842, + "learning_rate": 1.9963542803723452e-05, + "loss": 0.4066, + "step": 982 + }, + { + "epoch": 0.06, + "grad_norm": 0.42358202474178924, + "learning_rate": 1.996338387291395e-05, + "loss": 0.2655, + "step": 983 + }, + { + "epoch": 0.06, + "grad_norm": 0.7147571092797101, + "learning_rate": 1.9963224597072896e-05, + "loss": 0.3894, + "step": 984 + }, + { + "epoch": 0.06, + "grad_norm": 0.6968938961710522, + "learning_rate": 1.99630649762058e-05, + "loss": 0.3112, + "step": 985 + }, + { + "epoch": 0.06, + "grad_norm": 0.6853819380607388, + "learning_rate": 1.996290501031819e-05, + "loss": 0.4494, + "step": 986 + }, + { + "epoch": 0.06, + "grad_norm": 0.4657733992528829, + "learning_rate": 1.996274469941561e-05, + "loss": 0.317, + "step": 987 + }, + { + "epoch": 0.06, + "grad_norm": 0.9112005971743489, + "learning_rate": 1.9962584043503616e-05, + "loss": 0.4292, + "step": 988 + }, + { + "epoch": 0.06, + "grad_norm": 0.49945616956948385, + "learning_rate": 1.9962423042587756e-05, + "loss": 0.2444, + "step": 989 + }, + { + "epoch": 0.06, + "grad_norm": 0.5901638109218509, + "learning_rate": 1.996226169667362e-05, + "loss": 0.3631, + "step": 990 + }, + { + "epoch": 0.06, + "grad_norm": 0.5898119279938763, + "learning_rate": 1.9962100005766783e-05, + "loss": 0.3696, + "step": 991 + }, + { + "epoch": 0.06, + "grad_norm": 0.37546046574179853, + "learning_rate": 1.9961937969872858e-05, + "loss": 0.0755, + "step": 992 + }, + { + "epoch": 0.06, + "grad_norm": 0.538046268884345, + "learning_rate": 1.996177558899745e-05, + "loss": 0.3463, + "step": 993 + }, + { + "epoch": 0.06, + "grad_norm": 0.7582586586626521, + "learning_rate": 1.9961612863146175e-05, + "loss": 0.4278, + "step": 994 + }, + { + "epoch": 0.06, + "grad_norm": 0.4647624246924399, + "learning_rate": 1.9961449792324677e-05, + "loss": 0.2752, + "step": 995 + }, + { + "epoch": 0.06, + "grad_norm": 0.46627636277037593, + "learning_rate": 1.9961286376538607e-05, + "loss": 0.2918, + "step": 996 + }, + { + "epoch": 0.06, + "grad_norm": 1.4470087570646943, + "learning_rate": 1.996112261579361e-05, + "loss": 0.8131, + "step": 997 + }, + { + "epoch": 0.06, + "grad_norm": 0.40123913886301504, + "learning_rate": 1.9960958510095373e-05, + "loss": 0.248, + "step": 998 + }, + { + "epoch": 0.06, + "grad_norm": 0.48846965331054437, + "learning_rate": 1.9960794059449564e-05, + "loss": 0.3432, + "step": 999 + }, + { + "epoch": 0.06, + "grad_norm": 0.6562528678683638, + "learning_rate": 1.996062926386189e-05, + "loss": 0.4336, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 0.9824380086974319, + "learning_rate": 1.996046412333805e-05, + "loss": 0.472, + "step": 1001 + }, + { + "epoch": 0.06, + "grad_norm": 0.70434564261008, + "learning_rate": 1.996029863788377e-05, + "loss": 0.335, + "step": 1002 + }, + { + "epoch": 0.06, + "grad_norm": 0.47245421383507813, + "learning_rate": 1.9960132807504772e-05, + "loss": 0.3284, + "step": 1003 + }, + { + "epoch": 0.06, + "grad_norm": 0.39877422949668473, + "learning_rate": 1.9959966632206804e-05, + "loss": 0.2873, + "step": 1004 + }, + { + "epoch": 0.06, + "grad_norm": 0.5732104169024447, + "learning_rate": 1.995980011199562e-05, + "loss": 0.3251, + "step": 1005 + }, + { + "epoch": 0.06, + "grad_norm": 0.5576190638879068, + "learning_rate": 1.9959633246876987e-05, + "loss": 0.3643, + "step": 1006 + }, + { + "epoch": 0.06, + "grad_norm": 0.48385803317161563, + "learning_rate": 1.995946603685668e-05, + "loss": 0.3651, + "step": 1007 + }, + { + "epoch": 0.06, + "grad_norm": 0.46973803118307117, + "learning_rate": 1.99592984819405e-05, + "loss": 0.2235, + "step": 1008 + }, + { + "epoch": 0.06, + "grad_norm": 0.31711744184779644, + "learning_rate": 1.9959130582134234e-05, + "loss": 0.2126, + "step": 1009 + }, + { + "epoch": 0.06, + "grad_norm": 0.6060670327723078, + "learning_rate": 1.995896233744371e-05, + "loss": 0.4179, + "step": 1010 + }, + { + "epoch": 0.06, + "grad_norm": 0.39326789263335804, + "learning_rate": 1.9958793747874744e-05, + "loss": 0.2994, + "step": 1011 + }, + { + "epoch": 0.06, + "grad_norm": 0.6352828886575169, + "learning_rate": 1.995862481343318e-05, + "loss": 0.4093, + "step": 1012 + }, + { + "epoch": 0.06, + "grad_norm": 1.117282848504116, + "learning_rate": 1.9958455534124867e-05, + "loss": 0.6806, + "step": 1013 + }, + { + "epoch": 0.06, + "grad_norm": 0.38513507477952136, + "learning_rate": 1.9958285909955668e-05, + "loss": 0.2518, + "step": 1014 + }, + { + "epoch": 0.06, + "grad_norm": 0.49782104906396, + "learning_rate": 1.9958115940931454e-05, + "loss": 0.3526, + "step": 1015 + }, + { + "epoch": 0.06, + "grad_norm": 0.5610251919492635, + "learning_rate": 1.9957945627058115e-05, + "loss": 0.3765, + "step": 1016 + }, + { + "epoch": 0.06, + "grad_norm": 0.349097678917595, + "learning_rate": 1.995777496834155e-05, + "loss": 0.2702, + "step": 1017 + }, + { + "epoch": 0.06, + "grad_norm": 0.6490771901337339, + "learning_rate": 1.9957603964787662e-05, + "loss": 0.2902, + "step": 1018 + }, + { + "epoch": 0.06, + "grad_norm": 0.5038053547197864, + "learning_rate": 1.9957432616402377e-05, + "loss": 0.4018, + "step": 1019 + }, + { + "epoch": 0.06, + "grad_norm": 1.0330461815808312, + "learning_rate": 1.995726092319163e-05, + "loss": 0.4524, + "step": 1020 + }, + { + "epoch": 0.06, + "grad_norm": 0.4900679690379944, + "learning_rate": 1.9957088885161366e-05, + "loss": 0.3125, + "step": 1021 + }, + { + "epoch": 0.06, + "grad_norm": 0.46117236213253127, + "learning_rate": 1.9956916502317537e-05, + "loss": 0.395, + "step": 1022 + }, + { + "epoch": 0.06, + "grad_norm": 0.4574938839171714, + "learning_rate": 1.9956743774666124e-05, + "loss": 0.2827, + "step": 1023 + }, + { + "epoch": 0.06, + "grad_norm": 0.36527000688924915, + "learning_rate": 1.99565707022131e-05, + "loss": 0.2012, + "step": 1024 + }, + { + "epoch": 0.06, + "grad_norm": 1.771745956925933, + "learning_rate": 1.995639728496446e-05, + "loss": 0.859, + "step": 1025 + }, + { + "epoch": 0.06, + "grad_norm": 0.5436455796015267, + "learning_rate": 1.9956223522926212e-05, + "loss": 0.3379, + "step": 1026 + }, + { + "epoch": 0.06, + "grad_norm": 0.5224236889719983, + "learning_rate": 1.995604941610437e-05, + "loss": 0.3158, + "step": 1027 + }, + { + "epoch": 0.06, + "grad_norm": 0.8151837330528591, + "learning_rate": 1.9955874964504964e-05, + "loss": 0.5744, + "step": 1028 + }, + { + "epoch": 0.06, + "grad_norm": 0.31072521368728423, + "learning_rate": 1.995570016813404e-05, + "loss": 0.158, + "step": 1029 + }, + { + "epoch": 0.06, + "grad_norm": 0.6735818235853589, + "learning_rate": 1.995552502699764e-05, + "loss": 0.3841, + "step": 1030 + }, + { + "epoch": 0.06, + "grad_norm": 0.6394142288022795, + "learning_rate": 1.9955349541101844e-05, + "loss": 0.3293, + "step": 1031 + }, + { + "epoch": 0.06, + "grad_norm": 0.7060434285177977, + "learning_rate": 1.995517371045272e-05, + "loss": 0.4121, + "step": 1032 + }, + { + "epoch": 0.06, + "grad_norm": 0.7891476065439303, + "learning_rate": 1.9954997535056354e-05, + "loss": 0.361, + "step": 1033 + }, + { + "epoch": 0.06, + "grad_norm": 0.6378419743565197, + "learning_rate": 1.9954821014918857e-05, + "loss": 0.3563, + "step": 1034 + }, + { + "epoch": 0.06, + "grad_norm": 0.48004431946828835, + "learning_rate": 1.995464415004633e-05, + "loss": 0.2107, + "step": 1035 + }, + { + "epoch": 0.06, + "grad_norm": 0.3995032608655963, + "learning_rate": 1.9954466940444913e-05, + "loss": 0.2438, + "step": 1036 + }, + { + "epoch": 0.06, + "grad_norm": 1.1159782193087138, + "learning_rate": 1.9954289386120728e-05, + "loss": 0.5159, + "step": 1037 + }, + { + "epoch": 0.06, + "grad_norm": 0.5618332940945832, + "learning_rate": 1.995411148707993e-05, + "loss": 0.3351, + "step": 1038 + }, + { + "epoch": 0.06, + "grad_norm": 0.5215455455748942, + "learning_rate": 1.995393324332868e-05, + "loss": 0.3607, + "step": 1039 + }, + { + "epoch": 0.06, + "grad_norm": 0.8208653345275203, + "learning_rate": 1.9953754654873148e-05, + "loss": 0.5031, + "step": 1040 + }, + { + "epoch": 0.06, + "grad_norm": 0.3994022765006455, + "learning_rate": 1.995357572171952e-05, + "loss": 0.1645, + "step": 1041 + }, + { + "epoch": 0.06, + "grad_norm": 0.49644502143567504, + "learning_rate": 1.9953396443873996e-05, + "loss": 0.322, + "step": 1042 + }, + { + "epoch": 0.06, + "grad_norm": 0.7034835367330016, + "learning_rate": 1.995321682134278e-05, + "loss": 0.433, + "step": 1043 + }, + { + "epoch": 0.06, + "grad_norm": 0.7908560910972711, + "learning_rate": 1.995303685413209e-05, + "loss": 0.4042, + "step": 1044 + }, + { + "epoch": 0.06, + "grad_norm": 0.49210532169863536, + "learning_rate": 1.9952856542248168e-05, + "loss": 0.3592, + "step": 1045 + }, + { + "epoch": 0.06, + "grad_norm": 0.646689735534446, + "learning_rate": 1.995267588569725e-05, + "loss": 0.4115, + "step": 1046 + }, + { + "epoch": 0.06, + "grad_norm": 0.4048654510347636, + "learning_rate": 1.9952494884485593e-05, + "loss": 0.2289, + "step": 1047 + }, + { + "epoch": 0.06, + "grad_norm": 0.4411817060201455, + "learning_rate": 1.9952313538619467e-05, + "loss": 0.2335, + "step": 1048 + }, + { + "epoch": 0.06, + "grad_norm": 1.2620590338316189, + "learning_rate": 1.995213184810515e-05, + "loss": 0.7683, + "step": 1049 + }, + { + "epoch": 0.06, + "grad_norm": 0.5329777387061266, + "learning_rate": 1.9951949812948933e-05, + "loss": 0.3357, + "step": 1050 + }, + { + "epoch": 0.06, + "grad_norm": 0.5525441866753129, + "learning_rate": 1.9951767433157126e-05, + "loss": 0.3914, + "step": 1051 + }, + { + "epoch": 0.06, + "grad_norm": 1.0809939876830499, + "learning_rate": 1.9951584708736038e-05, + "loss": 0.7235, + "step": 1052 + }, + { + "epoch": 0.06, + "grad_norm": 0.3968852627552618, + "learning_rate": 1.9951401639692e-05, + "loss": 0.1826, + "step": 1053 + }, + { + "epoch": 0.06, + "grad_norm": 0.4945020554203922, + "learning_rate": 1.9951218226031354e-05, + "loss": 0.3181, + "step": 1054 + }, + { + "epoch": 0.06, + "grad_norm": 0.42878235628024847, + "learning_rate": 1.9951034467760446e-05, + "loss": 0.3214, + "step": 1055 + }, + { + "epoch": 0.06, + "grad_norm": 0.904246221547212, + "learning_rate": 1.995085036488564e-05, + "loss": 0.5193, + "step": 1056 + }, + { + "epoch": 0.06, + "grad_norm": 0.4788553603759133, + "learning_rate": 1.9950665917413318e-05, + "loss": 0.2582, + "step": 1057 + }, + { + "epoch": 0.06, + "grad_norm": 0.497482325015419, + "learning_rate": 1.995048112534986e-05, + "loss": 0.3624, + "step": 1058 + }, + { + "epoch": 0.06, + "grad_norm": 1.1997009804190655, + "learning_rate": 1.995029598870167e-05, + "loss": 0.6325, + "step": 1059 + }, + { + "epoch": 0.06, + "grad_norm": 0.3760169516809768, + "learning_rate": 1.995011050747516e-05, + "loss": 0.1641, + "step": 1060 + }, + { + "epoch": 0.06, + "grad_norm": 1.0218299290087096, + "learning_rate": 1.994992468167675e-05, + "loss": 0.5896, + "step": 1061 + }, + { + "epoch": 0.06, + "grad_norm": 0.5325934080764194, + "learning_rate": 1.9949738511312872e-05, + "loss": 0.4082, + "step": 1062 + }, + { + "epoch": 0.06, + "grad_norm": 0.5189018963919375, + "learning_rate": 1.994955199638998e-05, + "loss": 0.2595, + "step": 1063 + }, + { + "epoch": 0.06, + "grad_norm": 1.8985392734797513, + "learning_rate": 1.994936513691453e-05, + "loss": 0.8639, + "step": 1064 + }, + { + "epoch": 0.06, + "grad_norm": 0.616301320543746, + "learning_rate": 1.9949177932892997e-05, + "loss": 0.4081, + "step": 1065 + }, + { + "epoch": 0.06, + "grad_norm": 0.43255047491592685, + "learning_rate": 1.9948990384331853e-05, + "loss": 0.2952, + "step": 1066 + }, + { + "epoch": 0.06, + "grad_norm": 0.4972852098105318, + "learning_rate": 1.9948802491237608e-05, + "loss": 0.2134, + "step": 1067 + }, + { + "epoch": 0.06, + "grad_norm": 0.7883153357673944, + "learning_rate": 1.994861425361675e-05, + "loss": 0.525, + "step": 1068 + }, + { + "epoch": 0.06, + "grad_norm": 0.6785269587723624, + "learning_rate": 1.9948425671475816e-05, + "loss": 0.3279, + "step": 1069 + }, + { + "epoch": 0.06, + "grad_norm": 0.5178554482526371, + "learning_rate": 1.9948236744821327e-05, + "loss": 0.2993, + "step": 1070 + }, + { + "epoch": 0.06, + "grad_norm": 0.9167086626332699, + "learning_rate": 1.994804747365983e-05, + "loss": 0.4278, + "step": 1071 + }, + { + "epoch": 0.06, + "grad_norm": 0.5356860464606096, + "learning_rate": 1.994785785799787e-05, + "loss": 0.3144, + "step": 1072 + }, + { + "epoch": 0.06, + "grad_norm": 0.4967397434179642, + "learning_rate": 1.9947667897842027e-05, + "loss": 0.2778, + "step": 1073 + }, + { + "epoch": 0.06, + "grad_norm": 0.6967124996922901, + "learning_rate": 1.994747759319887e-05, + "loss": 0.3812, + "step": 1074 + }, + { + "epoch": 0.06, + "grad_norm": 0.48475894899954397, + "learning_rate": 1.994728694407499e-05, + "loss": 0.2961, + "step": 1075 + }, + { + "epoch": 0.06, + "grad_norm": 0.9083581496874299, + "learning_rate": 1.9947095950476992e-05, + "loss": 0.5082, + "step": 1076 + }, + { + "epoch": 0.06, + "grad_norm": 0.7444906820012983, + "learning_rate": 1.994690461241149e-05, + "loss": 0.3969, + "step": 1077 + }, + { + "epoch": 0.06, + "grad_norm": 0.46270242209941526, + "learning_rate": 1.994671292988511e-05, + "loss": 0.3147, + "step": 1078 + }, + { + "epoch": 0.06, + "grad_norm": 0.7264460965731782, + "learning_rate": 1.9946520902904485e-05, + "loss": 0.5275, + "step": 1079 + }, + { + "epoch": 0.06, + "grad_norm": 0.43704367289000573, + "learning_rate": 1.994632853147627e-05, + "loss": 0.1736, + "step": 1080 + }, + { + "epoch": 0.06, + "grad_norm": 0.5335354016333684, + "learning_rate": 1.9946135815607128e-05, + "loss": 0.3071, + "step": 1081 + }, + { + "epoch": 0.06, + "grad_norm": 0.4954050549137195, + "learning_rate": 1.9945942755303727e-05, + "loss": 0.2834, + "step": 1082 + }, + { + "epoch": 0.06, + "grad_norm": 1.0935397207678366, + "learning_rate": 1.994574935057276e-05, + "loss": 0.4421, + "step": 1083 + }, + { + "epoch": 0.06, + "grad_norm": 0.438437072971696, + "learning_rate": 1.994555560142092e-05, + "loss": 0.3371, + "step": 1084 + }, + { + "epoch": 0.06, + "grad_norm": 0.8748799090587484, + "learning_rate": 1.9945361507854914e-05, + "loss": 0.5721, + "step": 1085 + }, + { + "epoch": 0.06, + "grad_norm": 0.49367308799806964, + "learning_rate": 1.9945167069881468e-05, + "loss": 0.3301, + "step": 1086 + }, + { + "epoch": 0.06, + "grad_norm": 0.5471671685840018, + "learning_rate": 1.9944972287507316e-05, + "loss": 0.2885, + "step": 1087 + }, + { + "epoch": 0.06, + "grad_norm": 0.44722860542267706, + "learning_rate": 1.99447771607392e-05, + "loss": 0.2388, + "step": 1088 + }, + { + "epoch": 0.06, + "grad_norm": 0.4399451723523636, + "learning_rate": 1.9944581689583878e-05, + "loss": 0.3063, + "step": 1089 + }, + { + "epoch": 0.06, + "grad_norm": 0.49743000266769444, + "learning_rate": 1.994438587404812e-05, + "loss": 0.3283, + "step": 1090 + }, + { + "epoch": 0.06, + "grad_norm": 1.2104431749854316, + "learning_rate": 1.994418971413871e-05, + "loss": 0.5652, + "step": 1091 + }, + { + "epoch": 0.06, + "grad_norm": 1.2263233603048742, + "learning_rate": 1.994399320986243e-05, + "loss": 0.7256, + "step": 1092 + }, + { + "epoch": 0.06, + "grad_norm": 0.5740931856541661, + "learning_rate": 1.99437963612261e-05, + "loss": 0.2438, + "step": 1093 + }, + { + "epoch": 0.06, + "grad_norm": 0.39197406731229295, + "learning_rate": 1.9943599168236526e-05, + "loss": 0.2557, + "step": 1094 + }, + { + "epoch": 0.06, + "grad_norm": 1.245863493813506, + "learning_rate": 1.9943401630900543e-05, + "loss": 0.5469, + "step": 1095 + }, + { + "epoch": 0.06, + "grad_norm": 0.5865781893361174, + "learning_rate": 1.9943203749224986e-05, + "loss": 0.3007, + "step": 1096 + }, + { + "epoch": 0.06, + "grad_norm": 1.3374424837824597, + "learning_rate": 1.9943005523216713e-05, + "loss": 0.581, + "step": 1097 + }, + { + "epoch": 0.06, + "grad_norm": 0.5545592888985182, + "learning_rate": 1.9942806952882587e-05, + "loss": 0.3663, + "step": 1098 + }, + { + "epoch": 0.06, + "grad_norm": 0.4807505633477306, + "learning_rate": 1.994260803822948e-05, + "loss": 0.2451, + "step": 1099 + }, + { + "epoch": 0.06, + "grad_norm": 0.3769247168053469, + "learning_rate": 1.994240877926429e-05, + "loss": 0.2144, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 0.7380917848688296, + "learning_rate": 1.994220917599391e-05, + "loss": 0.4029, + "step": 1101 + }, + { + "epoch": 0.06, + "grad_norm": 0.5737493430661074, + "learning_rate": 1.994200922842525e-05, + "loss": 0.3119, + "step": 1102 + }, + { + "epoch": 0.06, + "grad_norm": 2.063385576846166, + "learning_rate": 1.994180893656524e-05, + "loss": 0.8398, + "step": 1103 + }, + { + "epoch": 0.06, + "grad_norm": 1.3920732373420577, + "learning_rate": 1.9941608300420815e-05, + "loss": 0.7834, + "step": 1104 + }, + { + "epoch": 0.06, + "grad_norm": 0.7518563438767537, + "learning_rate": 1.9941407319998918e-05, + "loss": 0.3335, + "step": 1105 + }, + { + "epoch": 0.06, + "grad_norm": 0.45728680330538174, + "learning_rate": 1.9941205995306517e-05, + "loss": 0.2271, + "step": 1106 + }, + { + "epoch": 0.06, + "grad_norm": 0.7047347617250996, + "learning_rate": 1.994100432635058e-05, + "loss": 0.5542, + "step": 1107 + }, + { + "epoch": 0.06, + "grad_norm": 0.7164373543084449, + "learning_rate": 1.9940802313138092e-05, + "loss": 0.3288, + "step": 1108 + }, + { + "epoch": 0.06, + "grad_norm": 0.4610040084120822, + "learning_rate": 1.994059995567604e-05, + "loss": 0.2943, + "step": 1109 + }, + { + "epoch": 0.06, + "grad_norm": 0.6608778183845305, + "learning_rate": 1.9940397253971447e-05, + "loss": 0.3997, + "step": 1110 + }, + { + "epoch": 0.06, + "grad_norm": 0.6010430641256433, + "learning_rate": 1.9940194208031322e-05, + "loss": 0.3541, + "step": 1111 + }, + { + "epoch": 0.06, + "grad_norm": 0.5892069559400017, + "learning_rate": 1.9939990817862696e-05, + "loss": 0.3829, + "step": 1112 + }, + { + "epoch": 0.06, + "grad_norm": 0.39423776153888224, + "learning_rate": 1.9939787083472616e-05, + "loss": 0.3134, + "step": 1113 + }, + { + "epoch": 0.06, + "grad_norm": 0.3783860241639249, + "learning_rate": 1.993958300486814e-05, + "loss": 0.2465, + "step": 1114 + }, + { + "epoch": 0.06, + "grad_norm": 0.5857598729183383, + "learning_rate": 1.9939378582056332e-05, + "loss": 0.3123, + "step": 1115 + }, + { + "epoch": 0.06, + "grad_norm": 1.451790292271993, + "learning_rate": 1.993917381504427e-05, + "loss": 0.8534, + "step": 1116 + }, + { + "epoch": 0.06, + "grad_norm": 0.43312088945702426, + "learning_rate": 1.9938968703839045e-05, + "loss": 0.3252, + "step": 1117 + }, + { + "epoch": 0.06, + "grad_norm": 0.5774867354453822, + "learning_rate": 1.9938763248447762e-05, + "loss": 0.3247, + "step": 1118 + }, + { + "epoch": 0.06, + "grad_norm": 0.5606158673409164, + "learning_rate": 1.9938557448877536e-05, + "loss": 0.3491, + "step": 1119 + }, + { + "epoch": 0.06, + "grad_norm": 0.3791222671011608, + "learning_rate": 1.9938351305135492e-05, + "loss": 0.235, + "step": 1120 + }, + { + "epoch": 0.06, + "grad_norm": 1.9555671938828716, + "learning_rate": 1.993814481722877e-05, + "loss": 0.8035, + "step": 1121 + }, + { + "epoch": 0.06, + "grad_norm": 0.5317081120210266, + "learning_rate": 1.9937937985164518e-05, + "loss": 0.3285, + "step": 1122 + }, + { + "epoch": 0.06, + "grad_norm": 0.7753229601610528, + "learning_rate": 1.9937730808949905e-05, + "loss": 0.3667, + "step": 1123 + }, + { + "epoch": 0.06, + "grad_norm": 0.5828111764073042, + "learning_rate": 1.99375232885921e-05, + "loss": 0.4137, + "step": 1124 + }, + { + "epoch": 0.06, + "grad_norm": 0.39500188850271334, + "learning_rate": 1.9937315424098288e-05, + "loss": 0.291, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 0.41533959678148713, + "learning_rate": 1.9937107215475673e-05, + "loss": 0.2114, + "step": 1126 + }, + { + "epoch": 0.06, + "grad_norm": 0.5307863364980268, + "learning_rate": 1.9936898662731463e-05, + "loss": 0.3774, + "step": 1127 + }, + { + "epoch": 0.06, + "grad_norm": 1.1478798383437205, + "learning_rate": 1.9936689765872878e-05, + "loss": 0.5261, + "step": 1128 + }, + { + "epoch": 0.06, + "grad_norm": 0.46423088674225926, + "learning_rate": 1.9936480524907154e-05, + "loss": 0.3142, + "step": 1129 + }, + { + "epoch": 0.06, + "grad_norm": 0.4766471909599253, + "learning_rate": 1.9936270939841536e-05, + "loss": 0.3808, + "step": 1130 + }, + { + "epoch": 0.06, + "grad_norm": 1.155229588534361, + "learning_rate": 1.9936061010683285e-05, + "loss": 0.5931, + "step": 1131 + }, + { + "epoch": 0.07, + "grad_norm": 0.3543521865584727, + "learning_rate": 1.9935850737439667e-05, + "loss": 0.134, + "step": 1132 + }, + { + "epoch": 0.07, + "grad_norm": 0.4931860315991214, + "learning_rate": 1.9935640120117965e-05, + "loss": 0.3337, + "step": 1133 + }, + { + "epoch": 0.07, + "grad_norm": 1.390135619937858, + "learning_rate": 1.9935429158725475e-05, + "loss": 0.8683, + "step": 1134 + }, + { + "epoch": 0.07, + "grad_norm": 0.5975249328681871, + "learning_rate": 1.9935217853269497e-05, + "loss": 0.3176, + "step": 1135 + }, + { + "epoch": 0.07, + "grad_norm": 0.625255175254, + "learning_rate": 1.9935006203757354e-05, + "loss": 0.4517, + "step": 1136 + }, + { + "epoch": 0.07, + "grad_norm": 0.5236283609437692, + "learning_rate": 1.9934794210196374e-05, + "loss": 0.3806, + "step": 1137 + }, + { + "epoch": 0.07, + "grad_norm": 0.25258242792276325, + "learning_rate": 1.9934581872593893e-05, + "loss": 0.1451, + "step": 1138 + }, + { + "epoch": 0.07, + "grad_norm": 0.6981887311269585, + "learning_rate": 1.9934369190957275e-05, + "loss": 0.4007, + "step": 1139 + }, + { + "epoch": 0.07, + "grad_norm": 0.8433075775942684, + "learning_rate": 1.9934156165293878e-05, + "loss": 0.6068, + "step": 1140 + }, + { + "epoch": 0.07, + "grad_norm": 0.4589248024137913, + "learning_rate": 1.9933942795611075e-05, + "loss": 0.2664, + "step": 1141 + }, + { + "epoch": 0.07, + "grad_norm": 0.5854364839569616, + "learning_rate": 1.9933729081916266e-05, + "loss": 0.4127, + "step": 1142 + }, + { + "epoch": 0.07, + "grad_norm": 1.1202946340701216, + "learning_rate": 1.9933515024216844e-05, + "loss": 0.6557, + "step": 1143 + }, + { + "epoch": 0.07, + "grad_norm": 0.32203550195598013, + "learning_rate": 1.9933300622520225e-05, + "loss": 0.1755, + "step": 1144 + }, + { + "epoch": 0.07, + "grad_norm": 0.47969552743697114, + "learning_rate": 1.9933085876833833e-05, + "loss": 0.2807, + "step": 1145 + }, + { + "epoch": 0.07, + "grad_norm": 1.0268504627365178, + "learning_rate": 1.99328707871651e-05, + "loss": 0.6002, + "step": 1146 + }, + { + "epoch": 0.07, + "grad_norm": 0.6890581144690542, + "learning_rate": 1.9932655353521483e-05, + "loss": 0.4666, + "step": 1147 + }, + { + "epoch": 0.07, + "grad_norm": 0.5003936995109405, + "learning_rate": 1.9932439575910436e-05, + "loss": 0.2906, + "step": 1148 + }, + { + "epoch": 0.07, + "grad_norm": 0.5008695250916106, + "learning_rate": 1.9932223454339435e-05, + "loss": 0.3429, + "step": 1149 + }, + { + "epoch": 0.07, + "grad_norm": 0.43940406343985966, + "learning_rate": 1.993200698881596e-05, + "loss": 0.2448, + "step": 1150 + }, + { + "epoch": 0.07, + "grad_norm": 0.37846887106776983, + "learning_rate": 1.9931790179347514e-05, + "loss": 0.2156, + "step": 1151 + }, + { + "epoch": 0.07, + "grad_norm": 1.9187364850397228, + "learning_rate": 1.99315730259416e-05, + "loss": 0.6074, + "step": 1152 + }, + { + "epoch": 0.07, + "grad_norm": 0.4227557946650834, + "learning_rate": 1.9931355528605738e-05, + "loss": 0.3318, + "step": 1153 + }, + { + "epoch": 0.07, + "grad_norm": 0.6158280113417645, + "learning_rate": 1.993113768734746e-05, + "loss": 0.3654, + "step": 1154 + }, + { + "epoch": 0.07, + "grad_norm": 1.1925768363660976, + "learning_rate": 1.9930919502174312e-05, + "loss": 0.4372, + "step": 1155 + }, + { + "epoch": 0.07, + "grad_norm": 0.2978244213918772, + "learning_rate": 1.993070097309385e-05, + "loss": 0.1655, + "step": 1156 + }, + { + "epoch": 0.07, + "grad_norm": 0.4938521213731747, + "learning_rate": 1.9930482100113642e-05, + "loss": 0.3306, + "step": 1157 + }, + { + "epoch": 0.07, + "grad_norm": 1.026122384333359, + "learning_rate": 1.9930262883241265e-05, + "loss": 0.4183, + "step": 1158 + }, + { + "epoch": 0.07, + "grad_norm": 0.9371568067117475, + "learning_rate": 1.9930043322484306e-05, + "loss": 0.552, + "step": 1159 + }, + { + "epoch": 0.07, + "grad_norm": 0.5631135620050378, + "learning_rate": 1.992982341785038e-05, + "loss": 0.3326, + "step": 1160 + }, + { + "epoch": 0.07, + "grad_norm": 0.5588638465425702, + "learning_rate": 1.9929603169347095e-05, + "loss": 0.3178, + "step": 1161 + }, + { + "epoch": 0.07, + "grad_norm": 1.096666482734466, + "learning_rate": 1.9929382576982076e-05, + "loss": 0.4737, + "step": 1162 + }, + { + "epoch": 0.07, + "grad_norm": 0.3927720395948921, + "learning_rate": 1.9929161640762968e-05, + "loss": 0.281, + "step": 1163 + }, + { + "epoch": 0.07, + "grad_norm": 0.7510863349671362, + "learning_rate": 1.992894036069742e-05, + "loss": 0.5086, + "step": 1164 + }, + { + "epoch": 0.07, + "grad_norm": 0.6070472615776253, + "learning_rate": 1.992871873679309e-05, + "loss": 0.3358, + "step": 1165 + }, + { + "epoch": 0.07, + "grad_norm": 0.4624380279404961, + "learning_rate": 1.9928496769057662e-05, + "loss": 0.2936, + "step": 1166 + }, + { + "epoch": 0.07, + "grad_norm": 2.0190631314339442, + "learning_rate": 1.9928274457498818e-05, + "loss": 0.7872, + "step": 1167 + }, + { + "epoch": 0.07, + "grad_norm": 0.4957965334493548, + "learning_rate": 1.9928051802124252e-05, + "loss": 0.3326, + "step": 1168 + }, + { + "epoch": 0.07, + "grad_norm": 0.47560370552216386, + "learning_rate": 1.9927828802941683e-05, + "loss": 0.3471, + "step": 1169 + }, + { + "epoch": 0.07, + "grad_norm": 0.5550644991870759, + "learning_rate": 1.9927605459958825e-05, + "loss": 0.3929, + "step": 1170 + }, + { + "epoch": 0.07, + "grad_norm": 0.4000080669193282, + "learning_rate": 1.992738177318342e-05, + "loss": 0.1703, + "step": 1171 + }, + { + "epoch": 0.07, + "grad_norm": 0.6353595781264442, + "learning_rate": 1.992715774262321e-05, + "loss": 0.3254, + "step": 1172 + }, + { + "epoch": 0.07, + "grad_norm": 0.5679976742869749, + "learning_rate": 1.992693336828596e-05, + "loss": 0.3704, + "step": 1173 + }, + { + "epoch": 0.07, + "grad_norm": 0.5135060035524516, + "learning_rate": 1.9926708650179426e-05, + "loss": 0.2752, + "step": 1174 + }, + { + "epoch": 0.07, + "grad_norm": 0.5082286635920554, + "learning_rate": 1.9926483588311402e-05, + "loss": 0.3722, + "step": 1175 + }, + { + "epoch": 0.07, + "grad_norm": 0.7864428636362328, + "learning_rate": 1.9926258182689677e-05, + "loss": 0.5978, + "step": 1176 + }, + { + "epoch": 0.07, + "grad_norm": 0.4977399864736148, + "learning_rate": 1.992603243332206e-05, + "loss": 0.2923, + "step": 1177 + }, + { + "epoch": 0.07, + "grad_norm": 0.3823089546041058, + "learning_rate": 1.9925806340216365e-05, + "loss": 0.2497, + "step": 1178 + }, + { + "epoch": 0.07, + "grad_norm": 0.5603124203509675, + "learning_rate": 1.9925579903380425e-05, + "loss": 0.3242, + "step": 1179 + }, + { + "epoch": 0.07, + "grad_norm": 0.4985678408576438, + "learning_rate": 1.9925353122822077e-05, + "loss": 0.4399, + "step": 1180 + }, + { + "epoch": 0.07, + "grad_norm": 0.37733182664093273, + "learning_rate": 1.992512599854918e-05, + "loss": 0.269, + "step": 1181 + }, + { + "epoch": 0.07, + "grad_norm": 1.1088873935769934, + "learning_rate": 1.9924898530569594e-05, + "loss": 0.6713, + "step": 1182 + }, + { + "epoch": 0.07, + "grad_norm": 0.5461847080526667, + "learning_rate": 1.99246707188912e-05, + "loss": 0.3439, + "step": 1183 + }, + { + "epoch": 0.07, + "grad_norm": 0.3354047490782291, + "learning_rate": 1.9924442563521885e-05, + "loss": 0.1924, + "step": 1184 + }, + { + "epoch": 0.07, + "grad_norm": 0.5354214457370254, + "learning_rate": 1.992421406446955e-05, + "loss": 0.3413, + "step": 1185 + }, + { + "epoch": 0.07, + "grad_norm": 0.9941015836479209, + "learning_rate": 1.9923985221742112e-05, + "loss": 0.5636, + "step": 1186 + }, + { + "epoch": 0.07, + "grad_norm": 0.4010314605169855, + "learning_rate": 1.992375603534749e-05, + "loss": 0.2394, + "step": 1187 + }, + { + "epoch": 0.07, + "grad_norm": 1.578704678511741, + "learning_rate": 1.9923526505293623e-05, + "loss": 0.844, + "step": 1188 + }, + { + "epoch": 0.07, + "grad_norm": 0.5145164289755612, + "learning_rate": 1.9923296631588462e-05, + "loss": 0.3961, + "step": 1189 + }, + { + "epoch": 0.07, + "grad_norm": 0.42736151507104164, + "learning_rate": 1.9923066414239965e-05, + "loss": 0.2347, + "step": 1190 + }, + { + "epoch": 0.07, + "grad_norm": 0.39926855460341876, + "learning_rate": 1.9922835853256103e-05, + "loss": 0.2363, + "step": 1191 + }, + { + "epoch": 0.07, + "grad_norm": 0.6873331025968474, + "learning_rate": 1.9922604948644865e-05, + "loss": 0.4264, + "step": 1192 + }, + { + "epoch": 0.07, + "grad_norm": 0.5172055478516355, + "learning_rate": 1.992237370041424e-05, + "loss": 0.3225, + "step": 1193 + }, + { + "epoch": 0.07, + "grad_norm": 0.5461826946326311, + "learning_rate": 1.9922142108572245e-05, + "loss": 0.3776, + "step": 1194 + }, + { + "epoch": 0.07, + "grad_norm": 0.8171086797305573, + "learning_rate": 1.9921910173126894e-05, + "loss": 0.545, + "step": 1195 + }, + { + "epoch": 0.07, + "grad_norm": 0.545195871554347, + "learning_rate": 1.9921677894086217e-05, + "loss": 0.3055, + "step": 1196 + }, + { + "epoch": 0.07, + "grad_norm": 0.296707240427443, + "learning_rate": 1.9921445271458263e-05, + "loss": 0.2576, + "step": 1197 + }, + { + "epoch": 0.07, + "grad_norm": 1.0695270171865263, + "learning_rate": 1.992121230525109e-05, + "loss": 0.5985, + "step": 1198 + }, + { + "epoch": 0.07, + "grad_norm": 0.5373484107110249, + "learning_rate": 1.992097899547276e-05, + "loss": 0.3029, + "step": 1199 + }, + { + "epoch": 0.07, + "grad_norm": 0.521812372352844, + "learning_rate": 1.992074534213135e-05, + "loss": 0.3079, + "step": 1200 + }, + { + "epoch": 0.07, + "grad_norm": 1.3838297456653332, + "learning_rate": 1.9920511345234956e-05, + "loss": 0.7985, + "step": 1201 + }, + { + "epoch": 0.07, + "grad_norm": 0.48375303096121197, + "learning_rate": 1.9920277004791682e-05, + "loss": 0.3086, + "step": 1202 + }, + { + "epoch": 0.07, + "grad_norm": 0.552106262015009, + "learning_rate": 1.992004232080964e-05, + "loss": 0.3058, + "step": 1203 + }, + { + "epoch": 0.07, + "grad_norm": 0.5531874559226082, + "learning_rate": 1.9919807293296963e-05, + "loss": 0.3231, + "step": 1204 + }, + { + "epoch": 0.07, + "grad_norm": 0.5455190994044464, + "learning_rate": 1.9919571922261784e-05, + "loss": 0.3043, + "step": 1205 + }, + { + "epoch": 0.07, + "grad_norm": 1.2944476032556527, + "learning_rate": 1.9919336207712258e-05, + "loss": 0.7407, + "step": 1206 + }, + { + "epoch": 0.07, + "grad_norm": 0.9246456693780484, + "learning_rate": 1.991910014965654e-05, + "loss": 0.5139, + "step": 1207 + }, + { + "epoch": 0.07, + "grad_norm": 0.5754654948616781, + "learning_rate": 1.9918863748102818e-05, + "loss": 0.2991, + "step": 1208 + }, + { + "epoch": 0.07, + "grad_norm": 0.5043664025461552, + "learning_rate": 1.9918627003059266e-05, + "loss": 0.3907, + "step": 1209 + }, + { + "epoch": 0.07, + "grad_norm": 0.3706019964753908, + "learning_rate": 1.9918389914534086e-05, + "loss": 0.2143, + "step": 1210 + }, + { + "epoch": 0.07, + "grad_norm": 0.5300152129471692, + "learning_rate": 1.9918152482535494e-05, + "loss": 0.3016, + "step": 1211 + }, + { + "epoch": 0.07, + "grad_norm": 0.5248817577305702, + "learning_rate": 1.9917914707071703e-05, + "loss": 0.4038, + "step": 1212 + }, + { + "epoch": 0.07, + "grad_norm": 0.617387402551657, + "learning_rate": 1.991767658815096e-05, + "loss": 0.3454, + "step": 1213 + }, + { + "epoch": 0.07, + "grad_norm": 0.4336728307512205, + "learning_rate": 1.9917438125781497e-05, + "loss": 0.2924, + "step": 1214 + }, + { + "epoch": 0.07, + "grad_norm": 0.72894687779715, + "learning_rate": 1.991719931997158e-05, + "loss": 0.5171, + "step": 1215 + }, + { + "epoch": 0.07, + "grad_norm": 0.3394246258776419, + "learning_rate": 1.9916960170729475e-05, + "loss": 0.2009, + "step": 1216 + }, + { + "epoch": 0.07, + "grad_norm": 0.44296100608782896, + "learning_rate": 1.9916720678063467e-05, + "loss": 0.3119, + "step": 1217 + }, + { + "epoch": 0.07, + "grad_norm": 1.2136487371261702, + "learning_rate": 1.9916480841981853e-05, + "loss": 0.6634, + "step": 1218 + }, + { + "epoch": 0.07, + "grad_norm": 0.9793411528882585, + "learning_rate": 1.991624066249293e-05, + "loss": 0.6157, + "step": 1219 + }, + { + "epoch": 0.07, + "grad_norm": 0.3788574100278588, + "learning_rate": 1.9916000139605013e-05, + "loss": 0.2773, + "step": 1220 + }, + { + "epoch": 0.07, + "grad_norm": 0.5509266148859141, + "learning_rate": 1.9915759273326447e-05, + "loss": 0.4402, + "step": 1221 + }, + { + "epoch": 0.07, + "grad_norm": 0.35834376031323334, + "learning_rate": 1.9915518063665556e-05, + "loss": 0.1633, + "step": 1222 + }, + { + "epoch": 0.07, + "grad_norm": 0.4119437399996039, + "learning_rate": 1.991527651063071e-05, + "loss": 0.2417, + "step": 1223 + }, + { + "epoch": 0.07, + "grad_norm": 0.6280447549338211, + "learning_rate": 1.9915034614230256e-05, + "loss": 0.4287, + "step": 1224 + }, + { + "epoch": 0.07, + "grad_norm": 0.5800209981404687, + "learning_rate": 1.9914792374472584e-05, + "loss": 0.4533, + "step": 1225 + }, + { + "epoch": 0.07, + "grad_norm": 0.43318599109249784, + "learning_rate": 1.991454979136608e-05, + "loss": 0.262, + "step": 1226 + }, + { + "epoch": 0.07, + "grad_norm": 0.8602780715330212, + "learning_rate": 1.991430686491914e-05, + "loss": 0.5644, + "step": 1227 + }, + { + "epoch": 0.07, + "grad_norm": 0.37876834389922714, + "learning_rate": 1.9914063595140184e-05, + "loss": 0.3045, + "step": 1228 + }, + { + "epoch": 0.07, + "grad_norm": 0.41026526676860525, + "learning_rate": 1.9913819982037627e-05, + "loss": 0.2101, + "step": 1229 + }, + { + "epoch": 0.07, + "grad_norm": 0.5216085347059072, + "learning_rate": 1.991357602561991e-05, + "loss": 0.2894, + "step": 1230 + }, + { + "epoch": 0.07, + "grad_norm": 0.9685343917475587, + "learning_rate": 1.9913331725895485e-05, + "loss": 0.5665, + "step": 1231 + }, + { + "epoch": 0.07, + "grad_norm": 0.49616526614313705, + "learning_rate": 1.9913087082872808e-05, + "loss": 0.3046, + "step": 1232 + }, + { + "epoch": 0.07, + "grad_norm": 0.40009900721312874, + "learning_rate": 1.9912842096560348e-05, + "loss": 0.2986, + "step": 1233 + }, + { + "epoch": 0.07, + "grad_norm": 0.6011999434725922, + "learning_rate": 1.9912596766966598e-05, + "loss": 0.3332, + "step": 1234 + }, + { + "epoch": 0.07, + "grad_norm": 0.3754282911061259, + "learning_rate": 1.9912351094100043e-05, + "loss": 0.2408, + "step": 1235 + }, + { + "epoch": 0.07, + "grad_norm": 0.4790995862974184, + "learning_rate": 1.99121050779692e-05, + "loss": 0.313, + "step": 1236 + }, + { + "epoch": 0.07, + "grad_norm": 0.9519799944694786, + "learning_rate": 1.9911858718582583e-05, + "loss": 0.5728, + "step": 1237 + }, + { + "epoch": 0.07, + "grad_norm": 0.48524535965068866, + "learning_rate": 1.9911612015948726e-05, + "loss": 0.3642, + "step": 1238 + }, + { + "epoch": 0.07, + "grad_norm": 0.6463784806450834, + "learning_rate": 1.9911364970076167e-05, + "loss": 0.2903, + "step": 1239 + }, + { + "epoch": 0.07, + "grad_norm": 0.5487653396668991, + "learning_rate": 1.9911117580973468e-05, + "loss": 0.3872, + "step": 1240 + }, + { + "epoch": 0.07, + "grad_norm": 0.28116441246936236, + "learning_rate": 1.9910869848649192e-05, + "loss": 0.1866, + "step": 1241 + }, + { + "epoch": 0.07, + "grad_norm": 0.7251262440666021, + "learning_rate": 1.991062177311192e-05, + "loss": 0.4662, + "step": 1242 + }, + { + "epoch": 0.07, + "grad_norm": 1.4629712657903426, + "learning_rate": 1.9910373354370245e-05, + "loss": 0.504, + "step": 1243 + }, + { + "epoch": 0.07, + "grad_norm": 0.4424282521343179, + "learning_rate": 1.9910124592432763e-05, + "loss": 0.3082, + "step": 1244 + }, + { + "epoch": 0.07, + "grad_norm": 0.7088073521939233, + "learning_rate": 1.9909875487308096e-05, + "loss": 0.4297, + "step": 1245 + }, + { + "epoch": 0.07, + "grad_norm": 0.45330422783598706, + "learning_rate": 1.9909626039004862e-05, + "loss": 0.1286, + "step": 1246 + }, + { + "epoch": 0.07, + "grad_norm": 0.6475879211850505, + "learning_rate": 1.990937624753171e-05, + "loss": 0.3348, + "step": 1247 + }, + { + "epoch": 0.07, + "grad_norm": 0.5626112796455328, + "learning_rate": 1.990912611289728e-05, + "loss": 0.3295, + "step": 1248 + }, + { + "epoch": 0.07, + "grad_norm": 2.169279671276457, + "learning_rate": 1.9908875635110244e-05, + "loss": 0.5194, + "step": 1249 + }, + { + "epoch": 0.07, + "grad_norm": 0.7257332887197935, + "learning_rate": 1.990862481417927e-05, + "loss": 0.4001, + "step": 1250 + }, + { + "epoch": 0.07, + "grad_norm": 0.6408089696432202, + "learning_rate": 1.990837365011304e-05, + "loss": 0.3905, + "step": 1251 + }, + { + "epoch": 0.07, + "grad_norm": 0.628381425763534, + "learning_rate": 1.9908122142920262e-05, + "loss": 0.2872, + "step": 1252 + }, + { + "epoch": 0.07, + "grad_norm": 0.393935167434499, + "learning_rate": 1.9907870292609633e-05, + "loss": 0.1497, + "step": 1253 + }, + { + "epoch": 0.07, + "grad_norm": 1.3577178025888306, + "learning_rate": 1.990761809918989e-05, + "loss": 0.4176, + "step": 1254 + }, + { + "epoch": 0.07, + "grad_norm": 2.0334997177234264, + "learning_rate": 1.9907365562669753e-05, + "loss": 0.6454, + "step": 1255 + }, + { + "epoch": 0.07, + "grad_norm": 0.4657792094873402, + "learning_rate": 1.9907112683057974e-05, + "loss": 0.2523, + "step": 1256 + }, + { + "epoch": 0.07, + "grad_norm": 0.6638005020345322, + "learning_rate": 1.9906859460363307e-05, + "loss": 0.4164, + "step": 1257 + }, + { + "epoch": 0.07, + "grad_norm": 1.2400648252392563, + "learning_rate": 1.9906605894594525e-05, + "loss": 0.6687, + "step": 1258 + }, + { + "epoch": 0.07, + "grad_norm": 0.5219896184442447, + "learning_rate": 1.990635198576041e-05, + "loss": 0.2149, + "step": 1259 + }, + { + "epoch": 0.07, + "grad_norm": 0.7292828300759986, + "learning_rate": 1.9906097733869746e-05, + "loss": 0.3419, + "step": 1260 + }, + { + "epoch": 0.07, + "grad_norm": 2.141928613905644, + "learning_rate": 1.990584313893135e-05, + "loss": 0.5934, + "step": 1261 + }, + { + "epoch": 0.07, + "grad_norm": 0.352641454123304, + "learning_rate": 1.9905588200954025e-05, + "loss": 0.1808, + "step": 1262 + }, + { + "epoch": 0.07, + "grad_norm": 0.743654610148312, + "learning_rate": 1.990533291994661e-05, + "loss": 0.4418, + "step": 1263 + }, + { + "epoch": 0.07, + "grad_norm": 0.5783868326461529, + "learning_rate": 1.9905077295917944e-05, + "loss": 0.3728, + "step": 1264 + }, + { + "epoch": 0.07, + "grad_norm": 0.6927545700509946, + "learning_rate": 1.9904821328876873e-05, + "loss": 0.2807, + "step": 1265 + }, + { + "epoch": 0.07, + "grad_norm": 0.9029660267620941, + "learning_rate": 1.9904565018832267e-05, + "loss": 0.4061, + "step": 1266 + }, + { + "epoch": 0.07, + "grad_norm": 0.48832856958609416, + "learning_rate": 1.9904308365792998e-05, + "loss": 0.3221, + "step": 1267 + }, + { + "epoch": 0.07, + "grad_norm": 0.6071012810126546, + "learning_rate": 1.9904051369767958e-05, + "loss": 0.3727, + "step": 1268 + }, + { + "epoch": 0.07, + "grad_norm": 0.4333485493904293, + "learning_rate": 1.9903794030766047e-05, + "loss": 0.216, + "step": 1269 + }, + { + "epoch": 0.07, + "grad_norm": 0.8550028384302811, + "learning_rate": 1.9903536348796172e-05, + "loss": 0.5802, + "step": 1270 + }, + { + "epoch": 0.07, + "grad_norm": 0.5444160978702124, + "learning_rate": 1.9903278323867262e-05, + "loss": 0.502, + "step": 1271 + }, + { + "epoch": 0.07, + "grad_norm": 0.49056768853210037, + "learning_rate": 1.9903019955988246e-05, + "loss": 0.2712, + "step": 1272 + }, + { + "epoch": 0.07, + "grad_norm": 1.1139660490114864, + "learning_rate": 1.9902761245168078e-05, + "loss": 0.659, + "step": 1273 + }, + { + "epoch": 0.07, + "grad_norm": 0.36944374947304287, + "learning_rate": 1.990250219141571e-05, + "loss": 0.1944, + "step": 1274 + }, + { + "epoch": 0.07, + "grad_norm": 0.4744817910147028, + "learning_rate": 1.990224279474012e-05, + "loss": 0.2431, + "step": 1275 + }, + { + "epoch": 0.07, + "grad_norm": 0.5063465239101601, + "learning_rate": 1.990198305515029e-05, + "loss": 0.3569, + "step": 1276 + }, + { + "epoch": 0.07, + "grad_norm": 0.8148200431071357, + "learning_rate": 1.9901722972655207e-05, + "loss": 0.4337, + "step": 1277 + }, + { + "epoch": 0.07, + "grad_norm": 0.4741481971958385, + "learning_rate": 1.990146254726389e-05, + "loss": 0.3101, + "step": 1278 + }, + { + "epoch": 0.07, + "grad_norm": 0.5962509814169538, + "learning_rate": 1.9901201778985344e-05, + "loss": 0.4426, + "step": 1279 + }, + { + "epoch": 0.07, + "grad_norm": 0.38573360540688933, + "learning_rate": 1.9900940667828606e-05, + "loss": 0.259, + "step": 1280 + }, + { + "epoch": 0.07, + "grad_norm": 0.40658105084541624, + "learning_rate": 1.9900679213802724e-05, + "loss": 0.2474, + "step": 1281 + }, + { + "epoch": 0.07, + "grad_norm": 0.7175034272163154, + "learning_rate": 1.9900417416916742e-05, + "loss": 0.4537, + "step": 1282 + }, + { + "epoch": 0.07, + "grad_norm": 0.5669166964944146, + "learning_rate": 1.9900155277179734e-05, + "loss": 0.3968, + "step": 1283 + }, + { + "epoch": 0.07, + "grad_norm": 0.7815627889065767, + "learning_rate": 1.989989279460077e-05, + "loss": 0.313, + "step": 1284 + }, + { + "epoch": 0.07, + "grad_norm": 0.9383698484651365, + "learning_rate": 1.989962996918895e-05, + "loss": 0.5734, + "step": 1285 + }, + { + "epoch": 0.07, + "grad_norm": 0.4976846772814327, + "learning_rate": 1.9899366800953367e-05, + "loss": 0.2426, + "step": 1286 + }, + { + "epoch": 0.07, + "grad_norm": 0.38644317778329124, + "learning_rate": 1.9899103289903137e-05, + "loss": 0.2545, + "step": 1287 + }, + { + "epoch": 0.07, + "grad_norm": 0.545966418703407, + "learning_rate": 1.9898839436047384e-05, + "loss": 0.3516, + "step": 1288 + }, + { + "epoch": 0.07, + "grad_norm": 0.8579484617086817, + "learning_rate": 1.989857523939525e-05, + "loss": 0.4972, + "step": 1289 + }, + { + "epoch": 0.07, + "grad_norm": 0.455372961518016, + "learning_rate": 1.9898310699955884e-05, + "loss": 0.3282, + "step": 1290 + }, + { + "epoch": 0.07, + "grad_norm": 0.6335961270391565, + "learning_rate": 1.989804581773844e-05, + "loss": 0.3331, + "step": 1291 + }, + { + "epoch": 0.07, + "grad_norm": 0.50902091626044, + "learning_rate": 1.989778059275209e-05, + "loss": 0.3618, + "step": 1292 + }, + { + "epoch": 0.07, + "grad_norm": 0.30150229710949894, + "learning_rate": 1.989751502500603e-05, + "loss": 0.2083, + "step": 1293 + }, + { + "epoch": 0.07, + "grad_norm": 0.9675402387132627, + "learning_rate": 1.9897249114509453e-05, + "loss": 0.533, + "step": 1294 + }, + { + "epoch": 0.07, + "grad_norm": 0.5243771162575476, + "learning_rate": 1.9896982861271558e-05, + "loss": 0.3046, + "step": 1295 + }, + { + "epoch": 0.07, + "grad_norm": 0.5198603235923646, + "learning_rate": 1.9896716265301577e-05, + "loss": 0.3362, + "step": 1296 + }, + { + "epoch": 0.07, + "grad_norm": 1.1509175904091664, + "learning_rate": 1.9896449326608734e-05, + "loss": 0.8126, + "step": 1297 + }, + { + "epoch": 0.07, + "grad_norm": 0.23527713331733763, + "learning_rate": 1.9896182045202278e-05, + "loss": 0.1337, + "step": 1298 + }, + { + "epoch": 0.07, + "grad_norm": 0.5617916958589548, + "learning_rate": 1.989591442109146e-05, + "loss": 0.3786, + "step": 1299 + }, + { + "epoch": 0.07, + "grad_norm": 0.5253914577139904, + "learning_rate": 1.9895646454285558e-05, + "loss": 0.4054, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 0.3003271411328984, + "learning_rate": 1.989537814479384e-05, + "loss": 0.1277, + "step": 1301 + }, + { + "epoch": 0.07, + "grad_norm": 0.4684808438512099, + "learning_rate": 1.9895109492625604e-05, + "loss": 0.2995, + "step": 1302 + }, + { + "epoch": 0.07, + "grad_norm": 0.5034967279795296, + "learning_rate": 1.9894840497790154e-05, + "loss": 0.3855, + "step": 1303 + }, + { + "epoch": 0.07, + "grad_norm": 0.6015096789873883, + "learning_rate": 1.98945711602968e-05, + "loss": 0.4516, + "step": 1304 + }, + { + "epoch": 0.07, + "grad_norm": 0.43202696608905033, + "learning_rate": 1.9894301480154873e-05, + "loss": 0.2953, + "step": 1305 + }, + { + "epoch": 0.08, + "grad_norm": 0.39747979018767193, + "learning_rate": 1.989403145737371e-05, + "loss": 0.2653, + "step": 1306 + }, + { + "epoch": 0.08, + "grad_norm": 0.5408971085490359, + "learning_rate": 1.989376109196266e-05, + "loss": 0.3496, + "step": 1307 + }, + { + "epoch": 0.08, + "grad_norm": 0.4058095462488843, + "learning_rate": 1.9893490383931095e-05, + "loss": 0.2257, + "step": 1308 + }, + { + "epoch": 0.08, + "grad_norm": 1.6788930294125746, + "learning_rate": 1.989321933328838e-05, + "loss": 0.8471, + "step": 1309 + }, + { + "epoch": 0.08, + "grad_norm": 0.7626511562337264, + "learning_rate": 1.9892947940043906e-05, + "loss": 0.5442, + "step": 1310 + }, + { + "epoch": 0.08, + "grad_norm": 0.5049748058118498, + "learning_rate": 1.989267620420707e-05, + "loss": 0.2766, + "step": 1311 + }, + { + "epoch": 0.08, + "grad_norm": 0.7666799027992355, + "learning_rate": 1.9892404125787283e-05, + "loss": 0.438, + "step": 1312 + }, + { + "epoch": 0.08, + "grad_norm": 0.3823035156163941, + "learning_rate": 1.9892131704793968e-05, + "loss": 0.2047, + "step": 1313 + }, + { + "epoch": 0.08, + "grad_norm": 0.47813218026484006, + "learning_rate": 1.9891858941236554e-05, + "loss": 0.2231, + "step": 1314 + }, + { + "epoch": 0.08, + "grad_norm": 0.573911958015997, + "learning_rate": 1.989158583512449e-05, + "loss": 0.377, + "step": 1315 + }, + { + "epoch": 0.08, + "grad_norm": 0.7540145368248665, + "learning_rate": 1.989131238646724e-05, + "loss": 0.4964, + "step": 1316 + }, + { + "epoch": 0.08, + "grad_norm": 0.534381050112668, + "learning_rate": 1.989103859527426e-05, + "loss": 0.3059, + "step": 1317 + }, + { + "epoch": 0.08, + "grad_norm": 0.4482636946041032, + "learning_rate": 1.9890764461555044e-05, + "loss": 0.2821, + "step": 1318 + }, + { + "epoch": 0.08, + "grad_norm": 0.43344662744253243, + "learning_rate": 1.9890489985319077e-05, + "loss": 0.268, + "step": 1319 + }, + { + "epoch": 0.08, + "grad_norm": 0.5202040997266357, + "learning_rate": 1.989021516657587e-05, + "loss": 0.2804, + "step": 1320 + }, + { + "epoch": 0.08, + "grad_norm": 1.2201274898081262, + "learning_rate": 1.9889940005334932e-05, + "loss": 0.5302, + "step": 1321 + }, + { + "epoch": 0.08, + "grad_norm": 0.8568685374679005, + "learning_rate": 1.98896645016058e-05, + "loss": 0.5544, + "step": 1322 + }, + { + "epoch": 0.08, + "grad_norm": 0.45238693312134165, + "learning_rate": 1.9889388655398015e-05, + "loss": 0.3015, + "step": 1323 + }, + { + "epoch": 0.08, + "grad_norm": 0.45007359982134754, + "learning_rate": 1.9889112466721122e-05, + "loss": 0.3122, + "step": 1324 + }, + { + "epoch": 0.08, + "grad_norm": 0.3702787792276908, + "learning_rate": 1.9888835935584686e-05, + "loss": 0.2158, + "step": 1325 + }, + { + "epoch": 0.08, + "grad_norm": 0.44701134918728447, + "learning_rate": 1.9888559061998294e-05, + "loss": 0.2895, + "step": 1326 + }, + { + "epoch": 0.08, + "grad_norm": 0.5825053647444507, + "learning_rate": 1.9888281845971522e-05, + "loss": 0.3207, + "step": 1327 + }, + { + "epoch": 0.08, + "grad_norm": 1.9820095173697092, + "learning_rate": 1.988800428751398e-05, + "loss": 0.6004, + "step": 1328 + }, + { + "epoch": 0.08, + "grad_norm": 0.46305327622259507, + "learning_rate": 1.988772638663527e-05, + "loss": 0.3056, + "step": 1329 + }, + { + "epoch": 0.08, + "grad_norm": 1.3268004204954384, + "learning_rate": 1.9887448143345022e-05, + "loss": 0.6153, + "step": 1330 + }, + { + "epoch": 0.08, + "grad_norm": 0.3606881840853597, + "learning_rate": 1.988716955765287e-05, + "loss": 0.2031, + "step": 1331 + }, + { + "epoch": 0.08, + "grad_norm": 0.45448857925172526, + "learning_rate": 1.988689062956846e-05, + "loss": 0.2976, + "step": 1332 + }, + { + "epoch": 0.08, + "grad_norm": 0.8464056685167116, + "learning_rate": 1.9886611359101455e-05, + "loss": 0.4598, + "step": 1333 + }, + { + "epoch": 0.08, + "grad_norm": 0.8824103472910778, + "learning_rate": 1.9886331746261523e-05, + "loss": 0.4065, + "step": 1334 + }, + { + "epoch": 0.08, + "grad_norm": 0.4968644088356621, + "learning_rate": 1.988605179105835e-05, + "loss": 0.3155, + "step": 1335 + }, + { + "epoch": 0.08, + "grad_norm": 0.5104609846598314, + "learning_rate": 1.9885771493501625e-05, + "loss": 0.3783, + "step": 1336 + }, + { + "epoch": 0.08, + "grad_norm": 0.28315552118650017, + "learning_rate": 1.9885490853601058e-05, + "loss": 0.1439, + "step": 1337 + }, + { + "epoch": 0.08, + "grad_norm": 0.441602883979966, + "learning_rate": 1.988520987136637e-05, + "loss": 0.3015, + "step": 1338 + }, + { + "epoch": 0.08, + "grad_norm": 0.4708608501945979, + "learning_rate": 1.9884928546807286e-05, + "loss": 0.3942, + "step": 1339 + }, + { + "epoch": 0.08, + "grad_norm": 0.7510346237626402, + "learning_rate": 1.9884646879933555e-05, + "loss": 0.5117, + "step": 1340 + }, + { + "epoch": 0.08, + "grad_norm": 0.48347633854447297, + "learning_rate": 1.9884364870754925e-05, + "loss": 0.2751, + "step": 1341 + }, + { + "epoch": 0.08, + "grad_norm": 1.5360278255553699, + "learning_rate": 1.988408251928117e-05, + "loss": 0.6755, + "step": 1342 + }, + { + "epoch": 0.08, + "grad_norm": 0.38517016065839627, + "learning_rate": 1.9883799825522056e-05, + "loss": 0.2931, + "step": 1343 + }, + { + "epoch": 0.08, + "grad_norm": 0.36798370244603723, + "learning_rate": 1.988351678948738e-05, + "loss": 0.1964, + "step": 1344 + }, + { + "epoch": 0.08, + "grad_norm": 0.8098718422245624, + "learning_rate": 1.9883233411186947e-05, + "loss": 0.5179, + "step": 1345 + }, + { + "epoch": 0.08, + "grad_norm": 1.1491988511215931, + "learning_rate": 1.9882949690630563e-05, + "loss": 0.634, + "step": 1346 + }, + { + "epoch": 0.08, + "grad_norm": 0.38455018173049227, + "learning_rate": 1.9882665627828054e-05, + "loss": 0.2628, + "step": 1347 + }, + { + "epoch": 0.08, + "grad_norm": 2.134503438141174, + "learning_rate": 1.988238122278926e-05, + "loss": 0.6759, + "step": 1348 + }, + { + "epoch": 0.08, + "grad_norm": 0.45620155579332167, + "learning_rate": 1.9882096475524032e-05, + "loss": 0.3142, + "step": 1349 + }, + { + "epoch": 0.08, + "grad_norm": 0.6149984382187618, + "learning_rate": 1.988181138604223e-05, + "loss": 0.2522, + "step": 1350 + }, + { + "epoch": 0.08, + "grad_norm": 0.534257884253937, + "learning_rate": 1.988152595435372e-05, + "loss": 0.3617, + "step": 1351 + }, + { + "epoch": 0.08, + "grad_norm": 0.6039783722918978, + "learning_rate": 1.9881240180468394e-05, + "loss": 0.3739, + "step": 1352 + }, + { + "epoch": 0.08, + "grad_norm": 0.5317008736940727, + "learning_rate": 1.988095406439614e-05, + "loss": 0.1945, + "step": 1353 + }, + { + "epoch": 0.08, + "grad_norm": 0.7544906573407119, + "learning_rate": 1.9880667606146878e-05, + "loss": 0.4314, + "step": 1354 + }, + { + "epoch": 0.08, + "grad_norm": 0.461252463212955, + "learning_rate": 1.988038080573052e-05, + "loss": 0.3904, + "step": 1355 + }, + { + "epoch": 0.08, + "grad_norm": 0.6937852377504348, + "learning_rate": 1.9880093663157e-05, + "loss": 0.3911, + "step": 1356 + }, + { + "epoch": 0.08, + "grad_norm": 0.5038983872790718, + "learning_rate": 1.9879806178436258e-05, + "loss": 0.355, + "step": 1357 + }, + { + "epoch": 0.08, + "grad_norm": 0.6390378111434681, + "learning_rate": 1.9879518351578254e-05, + "loss": 0.2838, + "step": 1358 + }, + { + "epoch": 0.08, + "grad_norm": 0.38546418075325506, + "learning_rate": 1.9879230182592958e-05, + "loss": 0.3129, + "step": 1359 + }, + { + "epoch": 0.08, + "grad_norm": 0.49492171454772504, + "learning_rate": 1.9878941671490342e-05, + "loss": 0.2142, + "step": 1360 + }, + { + "epoch": 0.08, + "grad_norm": 0.8425537889343386, + "learning_rate": 1.9878652818280402e-05, + "loss": 0.5642, + "step": 1361 + }, + { + "epoch": 0.08, + "grad_norm": 0.4003643201454913, + "learning_rate": 1.9878363622973137e-05, + "loss": 0.3595, + "step": 1362 + }, + { + "epoch": 0.08, + "grad_norm": 0.43574709207441126, + "learning_rate": 1.987807408557857e-05, + "loss": 0.2916, + "step": 1363 + }, + { + "epoch": 0.08, + "grad_norm": 0.517603021457836, + "learning_rate": 1.987778420610672e-05, + "loss": 0.3182, + "step": 1364 + }, + { + "epoch": 0.08, + "grad_norm": 0.40541367749597956, + "learning_rate": 1.9877493984567623e-05, + "loss": 0.2377, + "step": 1365 + }, + { + "epoch": 0.08, + "grad_norm": 0.595146514308601, + "learning_rate": 1.9877203420971338e-05, + "loss": 0.3073, + "step": 1366 + }, + { + "epoch": 0.08, + "grad_norm": 0.3996742522387982, + "learning_rate": 1.9876912515327925e-05, + "loss": 0.3877, + "step": 1367 + }, + { + "epoch": 0.08, + "grad_norm": 0.6372451758270241, + "learning_rate": 1.9876621267647452e-05, + "loss": 0.4087, + "step": 1368 + }, + { + "epoch": 0.08, + "grad_norm": 0.5014611845347008, + "learning_rate": 1.9876329677940015e-05, + "loss": 0.3651, + "step": 1369 + }, + { + "epoch": 0.08, + "grad_norm": 0.4638886549372142, + "learning_rate": 1.9876037746215703e-05, + "loss": 0.3335, + "step": 1370 + }, + { + "epoch": 0.08, + "grad_norm": 0.5265298020277424, + "learning_rate": 1.9875745472484627e-05, + "loss": 0.2185, + "step": 1371 + }, + { + "epoch": 0.08, + "grad_norm": 0.38274955304444164, + "learning_rate": 1.987545285675691e-05, + "loss": 0.261, + "step": 1372 + }, + { + "epoch": 0.08, + "grad_norm": 0.7594647538424038, + "learning_rate": 1.9875159899042685e-05, + "loss": 0.4655, + "step": 1373 + }, + { + "epoch": 0.08, + "grad_norm": 0.653896701468227, + "learning_rate": 1.98748665993521e-05, + "loss": 0.3104, + "step": 1374 + }, + { + "epoch": 0.08, + "grad_norm": 0.39040508633732796, + "learning_rate": 1.987457295769531e-05, + "loss": 0.3225, + "step": 1375 + }, + { + "epoch": 0.08, + "grad_norm": 1.0234810635667073, + "learning_rate": 1.9874278974082482e-05, + "loss": 0.5349, + "step": 1376 + }, + { + "epoch": 0.08, + "grad_norm": 0.34396246269087427, + "learning_rate": 1.9873984648523796e-05, + "loss": 0.1595, + "step": 1377 + }, + { + "epoch": 0.08, + "grad_norm": 0.5609481420866197, + "learning_rate": 1.9873689981029445e-05, + "loss": 0.3817, + "step": 1378 + }, + { + "epoch": 0.08, + "grad_norm": 0.5179111274026793, + "learning_rate": 1.9873394971609636e-05, + "loss": 0.3453, + "step": 1379 + }, + { + "epoch": 0.08, + "grad_norm": 0.46525695462885736, + "learning_rate": 1.9873099620274585e-05, + "loss": 0.327, + "step": 1380 + }, + { + "epoch": 0.08, + "grad_norm": 0.6728477544118068, + "learning_rate": 1.987280392703452e-05, + "loss": 0.433, + "step": 1381 + }, + { + "epoch": 0.08, + "grad_norm": 0.6129052674102026, + "learning_rate": 1.987250789189968e-05, + "loss": 0.437, + "step": 1382 + }, + { + "epoch": 0.08, + "grad_norm": 0.33943298098310193, + "learning_rate": 1.987221151488031e-05, + "loss": 0.215, + "step": 1383 + }, + { + "epoch": 0.08, + "grad_norm": 0.4695780101175386, + "learning_rate": 1.9871914795986683e-05, + "loss": 0.2768, + "step": 1384 + }, + { + "epoch": 0.08, + "grad_norm": 0.9439529862803306, + "learning_rate": 1.987161773522907e-05, + "loss": 0.5373, + "step": 1385 + }, + { + "epoch": 0.08, + "grad_norm": 0.4313605518517955, + "learning_rate": 1.9871320332617762e-05, + "loss": 0.2314, + "step": 1386 + }, + { + "epoch": 0.08, + "grad_norm": 0.4968256366599863, + "learning_rate": 1.9871022588163057e-05, + "loss": 0.3409, + "step": 1387 + }, + { + "epoch": 0.08, + "grad_norm": 1.4379547696004578, + "learning_rate": 1.987072450187526e-05, + "loss": 0.8263, + "step": 1388 + }, + { + "epoch": 0.08, + "grad_norm": 0.3114063006115645, + "learning_rate": 1.98704260737647e-05, + "loss": 0.117, + "step": 1389 + }, + { + "epoch": 0.08, + "grad_norm": 0.5394045691022546, + "learning_rate": 1.9870127303841708e-05, + "loss": 0.393, + "step": 1390 + }, + { + "epoch": 0.08, + "grad_norm": 0.6326035735789947, + "learning_rate": 1.9869828192116634e-05, + "loss": 0.3016, + "step": 1391 + }, + { + "epoch": 0.08, + "grad_norm": 0.73994607381943, + "learning_rate": 1.986952873859983e-05, + "loss": 0.4724, + "step": 1392 + }, + { + "epoch": 0.08, + "grad_norm": 0.43056343619915194, + "learning_rate": 1.9869228943301677e-05, + "loss": 0.2817, + "step": 1393 + }, + { + "epoch": 0.08, + "grad_norm": 0.6526090883250378, + "learning_rate": 1.9868928806232545e-05, + "loss": 0.4252, + "step": 1394 + }, + { + "epoch": 0.08, + "grad_norm": 0.5255751811418452, + "learning_rate": 1.9868628327402833e-05, + "loss": 0.3748, + "step": 1395 + }, + { + "epoch": 0.08, + "grad_norm": 0.3704563536878913, + "learning_rate": 1.9868327506822948e-05, + "loss": 0.3383, + "step": 1396 + }, + { + "epoch": 0.08, + "grad_norm": 0.4162983542173049, + "learning_rate": 1.9868026344503307e-05, + "loss": 0.1772, + "step": 1397 + }, + { + "epoch": 0.08, + "grad_norm": 0.5122113190467039, + "learning_rate": 1.9867724840454336e-05, + "loss": 0.3366, + "step": 1398 + }, + { + "epoch": 0.08, + "grad_norm": 0.46115541335438065, + "learning_rate": 1.986742299468648e-05, + "loss": 0.2477, + "step": 1399 + }, + { + "epoch": 0.08, + "grad_norm": 3.5151180809976266, + "learning_rate": 1.9867120807210188e-05, + "loss": 0.7334, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 2.158542189747334, + "learning_rate": 1.9866818278035926e-05, + "loss": 0.4279, + "step": 1401 + }, + { + "epoch": 0.08, + "grad_norm": 0.6994009052744022, + "learning_rate": 1.9866515407174174e-05, + "loss": 0.3324, + "step": 1402 + }, + { + "epoch": 0.08, + "grad_norm": 1.0880865146791605, + "learning_rate": 1.9866212194635414e-05, + "loss": 0.2424, + "step": 1403 + }, + { + "epoch": 0.08, + "grad_norm": 1.9025391903348476, + "learning_rate": 1.986590864043015e-05, + "loss": 0.4065, + "step": 1404 + }, + { + "epoch": 0.08, + "grad_norm": 1.2725994136331205, + "learning_rate": 1.98656047445689e-05, + "loss": 0.3837, + "step": 1405 + }, + { + "epoch": 0.08, + "grad_norm": 1.1275735184080333, + "learning_rate": 1.9865300507062177e-05, + "loss": 0.3384, + "step": 1406 + }, + { + "epoch": 0.08, + "grad_norm": 0.6833289165964801, + "learning_rate": 1.986499592792052e-05, + "loss": 0.501, + "step": 1407 + }, + { + "epoch": 0.08, + "grad_norm": 1.7700107274879846, + "learning_rate": 1.9864691007154486e-05, + "loss": 0.3731, + "step": 1408 + }, + { + "epoch": 0.08, + "grad_norm": 2.4943906235284774, + "learning_rate": 1.9864385744774618e-05, + "loss": 0.1505, + "step": 1409 + }, + { + "epoch": 0.08, + "grad_norm": 0.7558856050651058, + "learning_rate": 1.9864080140791503e-05, + "loss": 0.3196, + "step": 1410 + }, + { + "epoch": 0.08, + "grad_norm": 0.9313219061687138, + "learning_rate": 1.9863774195215714e-05, + "loss": 0.359, + "step": 1411 + }, + { + "epoch": 0.08, + "grad_norm": 1.319474351714657, + "learning_rate": 1.9863467908057846e-05, + "loss": 0.4746, + "step": 1412 + }, + { + "epoch": 0.08, + "grad_norm": 0.7970602847741426, + "learning_rate": 1.986316127932851e-05, + "loss": 0.4742, + "step": 1413 + }, + { + "epoch": 0.08, + "grad_norm": 2.602170573188123, + "learning_rate": 1.9862854309038324e-05, + "loss": 0.3773, + "step": 1414 + }, + { + "epoch": 0.08, + "grad_norm": 0.9078365075980418, + "learning_rate": 1.9862546997197917e-05, + "loss": 0.2455, + "step": 1415 + }, + { + "epoch": 0.08, + "grad_norm": 0.6767700019137007, + "learning_rate": 1.9862239343817932e-05, + "loss": 0.2878, + "step": 1416 + }, + { + "epoch": 0.08, + "grad_norm": 3.0559677425033875, + "learning_rate": 1.9861931348909024e-05, + "loss": 0.3493, + "step": 1417 + }, + { + "epoch": 0.08, + "grad_norm": 1.0023429161123776, + "learning_rate": 1.9861623012481853e-05, + "loss": 0.4367, + "step": 1418 + }, + { + "epoch": 0.08, + "grad_norm": 0.9368686272084427, + "learning_rate": 1.9861314334547105e-05, + "loss": 0.3789, + "step": 1419 + }, + { + "epoch": 0.08, + "grad_norm": 0.9117183827974491, + "learning_rate": 1.9861005315115466e-05, + "loss": 0.3442, + "step": 1420 + }, + { + "epoch": 0.08, + "grad_norm": 1.3748896935498363, + "learning_rate": 1.9860695954197635e-05, + "loss": 0.3071, + "step": 1421 + }, + { + "epoch": 0.08, + "grad_norm": 0.6790630927567458, + "learning_rate": 1.9860386251804327e-05, + "loss": 0.2388, + "step": 1422 + }, + { + "epoch": 0.08, + "grad_norm": 0.8602801769926192, + "learning_rate": 1.9860076207946268e-05, + "loss": 0.352, + "step": 1423 + }, + { + "epoch": 0.08, + "grad_norm": 1.4437574717264712, + "learning_rate": 1.9859765822634194e-05, + "loss": 0.5806, + "step": 1424 + }, + { + "epoch": 0.08, + "grad_norm": 0.8911016493029239, + "learning_rate": 1.9859455095878853e-05, + "loss": 0.4489, + "step": 1425 + }, + { + "epoch": 0.08, + "grad_norm": 0.8138852682470921, + "learning_rate": 1.9859144027691006e-05, + "loss": 0.3231, + "step": 1426 + }, + { + "epoch": 0.08, + "grad_norm": 1.3279680757064087, + "learning_rate": 1.9858832618081427e-05, + "loss": 0.2975, + "step": 1427 + }, + { + "epoch": 0.08, + "grad_norm": 0.8824804769850708, + "learning_rate": 1.9858520867060897e-05, + "loss": 0.1917, + "step": 1428 + }, + { + "epoch": 0.08, + "grad_norm": 0.7736102829235555, + "learning_rate": 1.9858208774640213e-05, + "loss": 0.3659, + "step": 1429 + }, + { + "epoch": 0.08, + "grad_norm": 1.003301610323338, + "learning_rate": 1.9857896340830182e-05, + "loss": 0.4005, + "step": 1430 + }, + { + "epoch": 0.08, + "grad_norm": 1.6699557262792868, + "learning_rate": 1.9857583565641627e-05, + "loss": 0.6112, + "step": 1431 + }, + { + "epoch": 0.08, + "grad_norm": 0.7784077780986679, + "learning_rate": 1.9857270449085378e-05, + "loss": 0.2576, + "step": 1432 + }, + { + "epoch": 0.08, + "grad_norm": 1.7558353459824572, + "learning_rate": 1.9856956991172272e-05, + "loss": 0.7202, + "step": 1433 + }, + { + "epoch": 0.08, + "grad_norm": 0.8726908894911565, + "learning_rate": 1.9856643191913173e-05, + "loss": 0.3263, + "step": 1434 + }, + { + "epoch": 0.08, + "grad_norm": 0.6394845806469756, + "learning_rate": 1.9856329051318942e-05, + "loss": 0.2556, + "step": 1435 + }, + { + "epoch": 0.08, + "grad_norm": 2.364157046676915, + "learning_rate": 1.9856014569400463e-05, + "loss": 0.5657, + "step": 1436 + }, + { + "epoch": 0.08, + "grad_norm": 0.6999329889210596, + "learning_rate": 1.985569974616862e-05, + "loss": 0.3046, + "step": 1437 + }, + { + "epoch": 0.08, + "grad_norm": 0.8304526967619219, + "learning_rate": 1.9855384581634322e-05, + "loss": 0.2348, + "step": 1438 + }, + { + "epoch": 0.08, + "grad_norm": 0.6726255986180282, + "learning_rate": 1.985506907580848e-05, + "loss": 0.4075, + "step": 1439 + }, + { + "epoch": 0.08, + "grad_norm": 0.6573471554110082, + "learning_rate": 1.9854753228702016e-05, + "loss": 0.3843, + "step": 1440 + }, + { + "epoch": 0.08, + "grad_norm": 0.5886305659040076, + "learning_rate": 1.9854437040325872e-05, + "loss": 0.234, + "step": 1441 + }, + { + "epoch": 0.08, + "grad_norm": 0.7112981770432031, + "learning_rate": 1.9854120510691e-05, + "loss": 0.4008, + "step": 1442 + }, + { + "epoch": 0.08, + "grad_norm": 0.6036621461010034, + "learning_rate": 1.9853803639808357e-05, + "loss": 0.2906, + "step": 1443 + }, + { + "epoch": 0.08, + "grad_norm": 0.5678057149957291, + "learning_rate": 1.9853486427688918e-05, + "loss": 0.3099, + "step": 1444 + }, + { + "epoch": 0.08, + "grad_norm": 0.5707284738168328, + "learning_rate": 1.9853168874343665e-05, + "loss": 0.3151, + "step": 1445 + }, + { + "epoch": 0.08, + "grad_norm": 0.6543721185475424, + "learning_rate": 1.98528509797836e-05, + "loss": 0.4543, + "step": 1446 + }, + { + "epoch": 0.08, + "grad_norm": 0.5152518450197626, + "learning_rate": 1.985253274401973e-05, + "loss": 0.3259, + "step": 1447 + }, + { + "epoch": 0.08, + "grad_norm": 0.7630087118038243, + "learning_rate": 1.985221416706307e-05, + "loss": 0.5161, + "step": 1448 + }, + { + "epoch": 0.08, + "grad_norm": 0.6039997332315664, + "learning_rate": 1.9851895248924662e-05, + "loss": 0.2395, + "step": 1449 + }, + { + "epoch": 0.08, + "grad_norm": 0.5091314902898457, + "learning_rate": 1.9851575989615545e-05, + "loss": 0.3048, + "step": 1450 + }, + { + "epoch": 0.08, + "grad_norm": 1.4961746423041666, + "learning_rate": 1.9851256389146774e-05, + "loss": 0.089, + "step": 1451 + }, + { + "epoch": 0.08, + "grad_norm": 0.8407751460559815, + "learning_rate": 1.985093644752942e-05, + "loss": 0.5311, + "step": 1452 + }, + { + "epoch": 0.08, + "grad_norm": 0.5369866218744758, + "learning_rate": 1.9850616164774556e-05, + "loss": 0.3195, + "step": 1453 + }, + { + "epoch": 0.08, + "grad_norm": 0.6396346126247083, + "learning_rate": 1.985029554089328e-05, + "loss": 0.3142, + "step": 1454 + }, + { + "epoch": 0.08, + "grad_norm": 0.6021130089990159, + "learning_rate": 1.9849974575896695e-05, + "loss": 0.2219, + "step": 1455 + }, + { + "epoch": 0.08, + "grad_norm": 0.8167722928601668, + "learning_rate": 1.984965326979591e-05, + "loss": 0.3154, + "step": 1456 + }, + { + "epoch": 0.08, + "grad_norm": 0.6078781381764322, + "learning_rate": 1.984933162260206e-05, + "loss": 0.4259, + "step": 1457 + }, + { + "epoch": 0.08, + "grad_norm": 0.5111068434766219, + "learning_rate": 1.9849009634326275e-05, + "loss": 0.3663, + "step": 1458 + }, + { + "epoch": 0.08, + "grad_norm": 0.491679245588098, + "learning_rate": 1.984868730497971e-05, + "loss": 0.3022, + "step": 1459 + }, + { + "epoch": 0.08, + "grad_norm": 0.6657778643771437, + "learning_rate": 1.9848364634573533e-05, + "loss": 0.461, + "step": 1460 + }, + { + "epoch": 0.08, + "grad_norm": 0.4499622319663323, + "learning_rate": 1.984804162311891e-05, + "loss": 0.1674, + "step": 1461 + }, + { + "epoch": 0.08, + "grad_norm": 0.43775680064572114, + "learning_rate": 1.9847718270627022e-05, + "loss": 0.2935, + "step": 1462 + }, + { + "epoch": 0.08, + "grad_norm": 0.9724835720183442, + "learning_rate": 1.9847394577109083e-05, + "loss": 0.6095, + "step": 1463 + }, + { + "epoch": 0.08, + "grad_norm": 0.6832214021917391, + "learning_rate": 1.984707054257629e-05, + "loss": 0.4568, + "step": 1464 + }, + { + "epoch": 0.08, + "grad_norm": 0.553754646433227, + "learning_rate": 1.9846746167039864e-05, + "loss": 0.2975, + "step": 1465 + }, + { + "epoch": 0.08, + "grad_norm": 0.4793911200155322, + "learning_rate": 1.9846421450511045e-05, + "loss": 0.3857, + "step": 1466 + }, + { + "epoch": 0.08, + "grad_norm": 0.553399104439865, + "learning_rate": 1.9846096393001074e-05, + "loss": 0.1008, + "step": 1467 + }, + { + "epoch": 0.08, + "grad_norm": 0.4279760960702405, + "learning_rate": 1.984577099452121e-05, + "loss": 0.2942, + "step": 1468 + }, + { + "epoch": 0.08, + "grad_norm": 1.0356861473365662, + "learning_rate": 1.984544525508272e-05, + "loss": 0.5771, + "step": 1469 + }, + { + "epoch": 0.08, + "grad_norm": 0.4577243781894562, + "learning_rate": 1.9845119174696882e-05, + "loss": 0.4015, + "step": 1470 + }, + { + "epoch": 0.08, + "grad_norm": 0.42424685772485127, + "learning_rate": 1.984479275337499e-05, + "loss": 0.2457, + "step": 1471 + }, + { + "epoch": 0.08, + "grad_norm": 1.4301253843484638, + "learning_rate": 1.984446599112835e-05, + "loss": 0.838, + "step": 1472 + }, + { + "epoch": 0.08, + "grad_norm": 0.3919599300947234, + "learning_rate": 1.9844138887968273e-05, + "loss": 0.304, + "step": 1473 + }, + { + "epoch": 0.08, + "grad_norm": 0.3932132520222729, + "learning_rate": 1.9843811443906093e-05, + "loss": 0.1543, + "step": 1474 + }, + { + "epoch": 0.08, + "grad_norm": 0.6901366164674826, + "learning_rate": 1.9843483658953148e-05, + "loss": 0.4529, + "step": 1475 + }, + { + "epoch": 0.08, + "grad_norm": 0.7955472019255666, + "learning_rate": 1.9843155533120782e-05, + "loss": 0.5426, + "step": 1476 + }, + { + "epoch": 0.08, + "grad_norm": 0.47244127096043176, + "learning_rate": 1.9842827066420366e-05, + "loss": 0.2129, + "step": 1477 + }, + { + "epoch": 0.08, + "grad_norm": 0.4441680857479455, + "learning_rate": 1.9842498258863274e-05, + "loss": 0.362, + "step": 1478 + }, + { + "epoch": 0.08, + "grad_norm": 0.5322606042879695, + "learning_rate": 1.9842169110460885e-05, + "loss": 0.295, + "step": 1479 + }, + { + "epoch": 0.09, + "grad_norm": 0.404128094447175, + "learning_rate": 1.9841839621224606e-05, + "loss": 0.2315, + "step": 1480 + }, + { + "epoch": 0.09, + "grad_norm": 0.5781529263016978, + "learning_rate": 1.9841509791165847e-05, + "loss": 0.3367, + "step": 1481 + }, + { + "epoch": 0.09, + "grad_norm": 0.6389770846973686, + "learning_rate": 1.9841179620296022e-05, + "loss": 0.4023, + "step": 1482 + }, + { + "epoch": 0.09, + "grad_norm": 0.5240438297258404, + "learning_rate": 1.9840849108626574e-05, + "loss": 0.3152, + "step": 1483 + }, + { + "epoch": 0.09, + "grad_norm": 0.6238085399437978, + "learning_rate": 1.984051825616894e-05, + "loss": 0.34, + "step": 1484 + }, + { + "epoch": 0.09, + "grad_norm": 0.5489870116923643, + "learning_rate": 1.9840187062934583e-05, + "loss": 0.3452, + "step": 1485 + }, + { + "epoch": 0.09, + "grad_norm": 0.3663527210417489, + "learning_rate": 1.9839855528934972e-05, + "loss": 0.2432, + "step": 1486 + }, + { + "epoch": 0.09, + "grad_norm": 0.6521194126080185, + "learning_rate": 1.983952365418159e-05, + "loss": 0.3821, + "step": 1487 + }, + { + "epoch": 0.09, + "grad_norm": 0.48007821958494157, + "learning_rate": 1.9839191438685922e-05, + "loss": 0.3019, + "step": 1488 + }, + { + "epoch": 0.09, + "grad_norm": 0.5161949916231795, + "learning_rate": 1.9838858882459483e-05, + "loss": 0.3056, + "step": 1489 + }, + { + "epoch": 0.09, + "grad_norm": 0.5390200772732368, + "learning_rate": 1.9838525985513783e-05, + "loss": 0.2989, + "step": 1490 + }, + { + "epoch": 0.09, + "grad_norm": 0.8864658034838391, + "learning_rate": 1.9838192747860345e-05, + "loss": 0.6144, + "step": 1491 + }, + { + "epoch": 0.09, + "grad_norm": 0.5180367210317303, + "learning_rate": 1.9837859169510723e-05, + "loss": 0.4018, + "step": 1492 + }, + { + "epoch": 0.09, + "grad_norm": 0.4120010395143323, + "learning_rate": 1.9837525250476454e-05, + "loss": 0.2938, + "step": 1493 + }, + { + "epoch": 0.09, + "grad_norm": 0.40632841819630106, + "learning_rate": 1.9837190990769115e-05, + "loss": 0.2472, + "step": 1494 + }, + { + "epoch": 0.09, + "grad_norm": 0.7107951744200354, + "learning_rate": 1.9836856390400273e-05, + "loss": 0.3808, + "step": 1495 + }, + { + "epoch": 0.09, + "grad_norm": 0.5256133147742968, + "learning_rate": 1.9836521449381515e-05, + "loss": 0.3597, + "step": 1496 + }, + { + "epoch": 0.09, + "grad_norm": 0.43642958130027093, + "learning_rate": 1.9836186167724443e-05, + "loss": 0.3331, + "step": 1497 + }, + { + "epoch": 0.09, + "grad_norm": 0.6079517801611382, + "learning_rate": 1.983585054544067e-05, + "loss": 0.4103, + "step": 1498 + }, + { + "epoch": 0.09, + "grad_norm": 0.46354758872951146, + "learning_rate": 1.9835514582541812e-05, + "loss": 0.3661, + "step": 1499 + }, + { + "epoch": 0.09, + "grad_norm": 0.2972644570786185, + "learning_rate": 1.983517827903951e-05, + "loss": 0.1053, + "step": 1500 + }, + { + "epoch": 0.09, + "grad_norm": 0.3795868350126114, + "learning_rate": 1.9834841634945402e-05, + "loss": 0.2982, + "step": 1501 + }, + { + "epoch": 0.09, + "grad_norm": 0.5592247863268277, + "learning_rate": 1.9834504650271157e-05, + "loss": 0.3494, + "step": 1502 + }, + { + "epoch": 0.09, + "grad_norm": 0.6832464565355145, + "learning_rate": 1.9834167325028436e-05, + "loss": 0.4326, + "step": 1503 + }, + { + "epoch": 0.09, + "grad_norm": 0.45112271937913245, + "learning_rate": 1.9833829659228923e-05, + "loss": 0.3454, + "step": 1504 + }, + { + "epoch": 0.09, + "grad_norm": 0.6484508195836393, + "learning_rate": 1.983349165288431e-05, + "loss": 0.3971, + "step": 1505 + }, + { + "epoch": 0.09, + "grad_norm": 0.3254094644109379, + "learning_rate": 1.983315330600631e-05, + "loss": 0.2295, + "step": 1506 + }, + { + "epoch": 0.09, + "grad_norm": 0.45949992766591424, + "learning_rate": 1.983281461860663e-05, + "loss": 0.2378, + "step": 1507 + }, + { + "epoch": 0.09, + "grad_norm": 0.6729297248346193, + "learning_rate": 1.9832475590697e-05, + "loss": 0.4185, + "step": 1508 + }, + { + "epoch": 0.09, + "grad_norm": 0.5529243330352177, + "learning_rate": 1.9832136222289168e-05, + "loss": 0.3886, + "step": 1509 + }, + { + "epoch": 0.09, + "grad_norm": 0.6263171618684181, + "learning_rate": 1.983179651339488e-05, + "loss": 0.3395, + "step": 1510 + }, + { + "epoch": 0.09, + "grad_norm": 0.4995660700715559, + "learning_rate": 1.9831456464025897e-05, + "loss": 0.3677, + "step": 1511 + }, + { + "epoch": 0.09, + "grad_norm": 0.32612760199585944, + "learning_rate": 1.9831116074194006e-05, + "loss": 0.1966, + "step": 1512 + }, + { + "epoch": 0.09, + "grad_norm": 0.507043580890754, + "learning_rate": 1.9830775343910984e-05, + "loss": 0.2792, + "step": 1513 + }, + { + "epoch": 0.09, + "grad_norm": 0.5548136660748703, + "learning_rate": 1.9830434273188636e-05, + "loss": 0.3315, + "step": 1514 + }, + { + "epoch": 0.09, + "grad_norm": 1.1903386244763454, + "learning_rate": 1.9830092862038773e-05, + "loss": 0.5327, + "step": 1515 + }, + { + "epoch": 0.09, + "grad_norm": 0.35362906803195593, + "learning_rate": 1.9829751110473215e-05, + "loss": 0.0803, + "step": 1516 + }, + { + "epoch": 0.09, + "grad_norm": 0.38747733634272685, + "learning_rate": 1.98294090185038e-05, + "loss": 0.3157, + "step": 1517 + }, + { + "epoch": 0.09, + "grad_norm": 0.3734505271613158, + "learning_rate": 1.9829066586142375e-05, + "loss": 0.2002, + "step": 1518 + }, + { + "epoch": 0.09, + "grad_norm": 0.9309793973511313, + "learning_rate": 1.982872381340079e-05, + "loss": 0.5253, + "step": 1519 + }, + { + "epoch": 0.09, + "grad_norm": 0.8832068734492745, + "learning_rate": 1.982838070029093e-05, + "loss": 0.3373, + "step": 1520 + }, + { + "epoch": 0.09, + "grad_norm": 0.5060812038856458, + "learning_rate": 1.9828037246824664e-05, + "loss": 0.3777, + "step": 1521 + }, + { + "epoch": 0.09, + "grad_norm": 0.7545887148384214, + "learning_rate": 1.9827693453013892e-05, + "loss": 0.5325, + "step": 1522 + }, + { + "epoch": 0.09, + "grad_norm": 0.3855633161515275, + "learning_rate": 1.982734931887052e-05, + "loss": 0.2315, + "step": 1523 + }, + { + "epoch": 0.09, + "grad_norm": 0.3295751571087598, + "learning_rate": 1.982700484440646e-05, + "loss": 0.2219, + "step": 1524 + }, + { + "epoch": 0.09, + "grad_norm": 0.4242518325493269, + "learning_rate": 1.982666002963365e-05, + "loss": 0.3222, + "step": 1525 + }, + { + "epoch": 0.09, + "grad_norm": 0.4857458074762703, + "learning_rate": 1.982631487456402e-05, + "loss": 0.3091, + "step": 1526 + }, + { + "epoch": 0.09, + "grad_norm": 0.8730410681906142, + "learning_rate": 1.9825969379209533e-05, + "loss": 0.5321, + "step": 1527 + }, + { + "epoch": 0.09, + "grad_norm": 0.8632641640956932, + "learning_rate": 1.9825623543582145e-05, + "loss": 0.4488, + "step": 1528 + }, + { + "epoch": 0.09, + "grad_norm": 0.38414737870140997, + "learning_rate": 1.982527736769384e-05, + "loss": 0.2726, + "step": 1529 + }, + { + "epoch": 0.09, + "grad_norm": 0.4745151571052424, + "learning_rate": 1.9824930851556604e-05, + "loss": 0.3034, + "step": 1530 + }, + { + "epoch": 0.09, + "grad_norm": 0.9141713621897183, + "learning_rate": 1.982458399518243e-05, + "loss": 0.5761, + "step": 1531 + }, + { + "epoch": 0.09, + "grad_norm": 0.4592062472149273, + "learning_rate": 1.9824236798583338e-05, + "loss": 0.3236, + "step": 1532 + }, + { + "epoch": 0.09, + "grad_norm": 0.32912152990907256, + "learning_rate": 1.9823889261771346e-05, + "loss": 0.2727, + "step": 1533 + }, + { + "epoch": 0.09, + "grad_norm": 0.9264614208288615, + "learning_rate": 1.9823541384758492e-05, + "loss": 0.4646, + "step": 1534 + }, + { + "epoch": 0.09, + "grad_norm": 0.43380933167627445, + "learning_rate": 1.982319316755682e-05, + "loss": 0.3267, + "step": 1535 + }, + { + "epoch": 0.09, + "grad_norm": 0.3434369284109887, + "learning_rate": 1.9822844610178394e-05, + "loss": 0.1541, + "step": 1536 + }, + { + "epoch": 0.09, + "grad_norm": 0.42805356727710603, + "learning_rate": 1.982249571263528e-05, + "loss": 0.35, + "step": 1537 + }, + { + "epoch": 0.09, + "grad_norm": 0.46768921780410705, + "learning_rate": 1.9822146474939563e-05, + "loss": 0.3597, + "step": 1538 + }, + { + "epoch": 0.09, + "grad_norm": 0.3205738141307784, + "learning_rate": 1.9821796897103334e-05, + "loss": 0.171, + "step": 1539 + }, + { + "epoch": 0.09, + "grad_norm": 0.4918143379538728, + "learning_rate": 1.98214469791387e-05, + "loss": 0.3352, + "step": 1540 + }, + { + "epoch": 0.09, + "grad_norm": 0.38168005214386574, + "learning_rate": 1.9821096721057787e-05, + "loss": 0.3347, + "step": 1541 + }, + { + "epoch": 0.09, + "grad_norm": 0.9996924570187796, + "learning_rate": 1.982074612287271e-05, + "loss": 0.4523, + "step": 1542 + }, + { + "epoch": 0.09, + "grad_norm": 0.532892984371382, + "learning_rate": 1.982039518459562e-05, + "loss": 0.4597, + "step": 1543 + }, + { + "epoch": 0.09, + "grad_norm": 0.4556488663844575, + "learning_rate": 1.9820043906238667e-05, + "loss": 0.3112, + "step": 1544 + }, + { + "epoch": 0.09, + "grad_norm": 0.5333818960745033, + "learning_rate": 1.9819692287814014e-05, + "loss": 0.3715, + "step": 1545 + }, + { + "epoch": 0.09, + "grad_norm": 0.3205794074373295, + "learning_rate": 1.981934032933384e-05, + "loss": 0.1031, + "step": 1546 + }, + { + "epoch": 0.09, + "grad_norm": 0.5161864800464732, + "learning_rate": 1.981898803081033e-05, + "loss": 0.3052, + "step": 1547 + }, + { + "epoch": 0.09, + "grad_norm": 0.7795833825785495, + "learning_rate": 1.981863539225569e-05, + "loss": 0.5345, + "step": 1548 + }, + { + "epoch": 0.09, + "grad_norm": 0.4615357704091355, + "learning_rate": 1.9818282413682127e-05, + "loss": 0.3508, + "step": 1549 + }, + { + "epoch": 0.09, + "grad_norm": 0.3946497104947286, + "learning_rate": 1.981792909510187e-05, + "loss": 0.2904, + "step": 1550 + }, + { + "epoch": 0.09, + "grad_norm": 0.3454321623744058, + "learning_rate": 1.9817575436527147e-05, + "loss": 0.2079, + "step": 1551 + }, + { + "epoch": 0.09, + "grad_norm": 0.4495745099321259, + "learning_rate": 1.981722143797021e-05, + "loss": 0.2907, + "step": 1552 + }, + { + "epoch": 0.09, + "grad_norm": 0.45876952116799197, + "learning_rate": 1.9816867099443314e-05, + "loss": 0.3153, + "step": 1553 + }, + { + "epoch": 0.09, + "grad_norm": 1.1485942385955623, + "learning_rate": 1.9816512420958734e-05, + "loss": 0.5895, + "step": 1554 + }, + { + "epoch": 0.09, + "grad_norm": 0.9054974881219597, + "learning_rate": 1.9816157402528753e-05, + "loss": 0.5669, + "step": 1555 + }, + { + "epoch": 0.09, + "grad_norm": 0.4243340339417857, + "learning_rate": 1.9815802044165663e-05, + "loss": 0.2367, + "step": 1556 + }, + { + "epoch": 0.09, + "grad_norm": 0.5642073908469096, + "learning_rate": 1.981544634588177e-05, + "loss": 0.3691, + "step": 1557 + }, + { + "epoch": 0.09, + "grad_norm": 0.46111573851136795, + "learning_rate": 1.9815090307689392e-05, + "loss": 0.2229, + "step": 1558 + }, + { + "epoch": 0.09, + "grad_norm": 0.42590791206403383, + "learning_rate": 1.9814733929600857e-05, + "loss": 0.2272, + "step": 1559 + }, + { + "epoch": 0.09, + "grad_norm": 1.9421334937751409, + "learning_rate": 1.981437721162851e-05, + "loss": 0.5379, + "step": 1560 + }, + { + "epoch": 0.09, + "grad_norm": 0.561819349052626, + "learning_rate": 1.98140201537847e-05, + "loss": 0.3477, + "step": 1561 + }, + { + "epoch": 0.09, + "grad_norm": 0.4240617607652223, + "learning_rate": 1.9813662756081794e-05, + "loss": 0.2214, + "step": 1562 + }, + { + "epoch": 0.09, + "grad_norm": 1.0596622839127672, + "learning_rate": 1.9813305018532172e-05, + "loss": 0.7146, + "step": 1563 + }, + { + "epoch": 0.09, + "grad_norm": 0.3903654621157037, + "learning_rate": 1.981294694114822e-05, + "loss": 0.2457, + "step": 1564 + }, + { + "epoch": 0.09, + "grad_norm": 0.4339515383696826, + "learning_rate": 1.9812588523942334e-05, + "loss": 0.2166, + "step": 1565 + }, + { + "epoch": 0.09, + "grad_norm": 1.650174074607824, + "learning_rate": 1.981222976692693e-05, + "loss": 0.5498, + "step": 1566 + }, + { + "epoch": 0.09, + "grad_norm": 1.2077256521150712, + "learning_rate": 1.981187067011443e-05, + "loss": 0.6577, + "step": 1567 + }, + { + "epoch": 0.09, + "grad_norm": 0.6294600660337998, + "learning_rate": 1.9811511233517275e-05, + "loss": 0.313, + "step": 1568 + }, + { + "epoch": 0.09, + "grad_norm": 0.560575934466919, + "learning_rate": 1.9811151457147904e-05, + "loss": 0.29, + "step": 1569 + }, + { + "epoch": 0.09, + "grad_norm": 0.3676919839173703, + "learning_rate": 1.981079134101878e-05, + "loss": 0.214, + "step": 1570 + }, + { + "epoch": 0.09, + "grad_norm": 0.5136664835618114, + "learning_rate": 1.9810430885142377e-05, + "loss": 0.312, + "step": 1571 + }, + { + "epoch": 0.09, + "grad_norm": 1.4254358022411515, + "learning_rate": 1.981007008953117e-05, + "loss": 0.4594, + "step": 1572 + }, + { + "epoch": 0.09, + "grad_norm": 0.7586444608485484, + "learning_rate": 1.9809708954197658e-05, + "loss": 0.3544, + "step": 1573 + }, + { + "epoch": 0.09, + "grad_norm": 0.5827592667394669, + "learning_rate": 1.980934747915435e-05, + "loss": 0.2916, + "step": 1574 + }, + { + "epoch": 0.09, + "grad_norm": 1.6997799572006387, + "learning_rate": 1.9808985664413757e-05, + "loss": 0.4934, + "step": 1575 + }, + { + "epoch": 0.09, + "grad_norm": 0.34869623686568857, + "learning_rate": 1.9808623509988415e-05, + "loss": 0.2492, + "step": 1576 + }, + { + "epoch": 0.09, + "grad_norm": 0.7051869049693643, + "learning_rate": 1.980826101589086e-05, + "loss": 0.3112, + "step": 1577 + }, + { + "epoch": 0.09, + "grad_norm": 2.375367808092143, + "learning_rate": 1.980789818213365e-05, + "loss": 0.4905, + "step": 1578 + }, + { + "epoch": 0.09, + "grad_norm": 1.6868012547392646, + "learning_rate": 1.9807535008729347e-05, + "loss": 0.8162, + "step": 1579 + }, + { + "epoch": 0.09, + "grad_norm": 0.5252690640932575, + "learning_rate": 1.980717149569053e-05, + "loss": 0.3133, + "step": 1580 + }, + { + "epoch": 0.09, + "grad_norm": 0.467302492898827, + "learning_rate": 1.9806807643029786e-05, + "loss": 0.3452, + "step": 1581 + }, + { + "epoch": 0.09, + "grad_norm": 1.0265829484598235, + "learning_rate": 1.9806443450759715e-05, + "loss": 0.3143, + "step": 1582 + }, + { + "epoch": 0.09, + "grad_norm": 0.5886112657839901, + "learning_rate": 1.9806078918892925e-05, + "loss": 0.34, + "step": 1583 + }, + { + "epoch": 0.09, + "grad_norm": 0.5920413529763409, + "learning_rate": 1.9805714047442045e-05, + "loss": 0.3122, + "step": 1584 + }, + { + "epoch": 0.09, + "grad_norm": 0.6764117396792537, + "learning_rate": 1.9805348836419712e-05, + "loss": 0.3036, + "step": 1585 + }, + { + "epoch": 0.09, + "grad_norm": 0.5594852265265499, + "learning_rate": 1.9804983285838567e-05, + "loss": 0.3569, + "step": 1586 + }, + { + "epoch": 0.09, + "grad_norm": 0.6844695489408091, + "learning_rate": 1.9804617395711275e-05, + "loss": 0.4128, + "step": 1587 + }, + { + "epoch": 0.09, + "grad_norm": 0.6307190369431338, + "learning_rate": 1.9804251166050505e-05, + "loss": 0.3458, + "step": 1588 + }, + { + "epoch": 0.09, + "grad_norm": 0.41952195036620993, + "learning_rate": 1.9803884596868937e-05, + "loss": 0.3168, + "step": 1589 + }, + { + "epoch": 0.09, + "grad_norm": 0.3453562469345302, + "learning_rate": 1.9803517688179264e-05, + "loss": 0.2204, + "step": 1590 + }, + { + "epoch": 0.09, + "grad_norm": 1.0262728387774869, + "learning_rate": 1.9803150439994202e-05, + "loss": 0.5397, + "step": 1591 + }, + { + "epoch": 0.09, + "grad_norm": 0.5369925676511108, + "learning_rate": 1.9802782852326456e-05, + "loss": 0.3018, + "step": 1592 + }, + { + "epoch": 0.09, + "grad_norm": 0.7179121095737457, + "learning_rate": 1.9802414925188766e-05, + "loss": 0.3874, + "step": 1593 + }, + { + "epoch": 0.09, + "grad_norm": 0.7147170188690286, + "learning_rate": 1.9802046658593867e-05, + "loss": 0.5446, + "step": 1594 + }, + { + "epoch": 0.09, + "grad_norm": 0.4854192777883944, + "learning_rate": 1.9801678052554512e-05, + "loss": 0.2607, + "step": 1595 + }, + { + "epoch": 0.09, + "grad_norm": 0.3895561134200103, + "learning_rate": 1.9801309107083465e-05, + "loss": 0.2836, + "step": 1596 + }, + { + "epoch": 0.09, + "grad_norm": 0.3859364875104156, + "learning_rate": 1.9800939822193512e-05, + "loss": 0.2896, + "step": 1597 + }, + { + "epoch": 0.09, + "grad_norm": 0.4179295750829518, + "learning_rate": 1.980057019789743e-05, + "loss": 0.2141, + "step": 1598 + }, + { + "epoch": 0.09, + "grad_norm": 0.5931917930493955, + "learning_rate": 1.9800200234208022e-05, + "loss": 0.3975, + "step": 1599 + }, + { + "epoch": 0.09, + "grad_norm": 0.5242128316638529, + "learning_rate": 1.9799829931138107e-05, + "loss": 0.4053, + "step": 1600 + }, + { + "epoch": 0.09, + "grad_norm": 0.4126511092046684, + "learning_rate": 1.9799459288700498e-05, + "loss": 0.2093, + "step": 1601 + }, + { + "epoch": 0.09, + "grad_norm": 0.3701533373340219, + "learning_rate": 1.9799088306908035e-05, + "loss": 0.2427, + "step": 1602 + }, + { + "epoch": 0.09, + "grad_norm": 1.1308744292919486, + "learning_rate": 1.9798716985773567e-05, + "loss": 0.6998, + "step": 1603 + }, + { + "epoch": 0.09, + "grad_norm": 0.3758134428810142, + "learning_rate": 1.9798345325309952e-05, + "loss": 0.2527, + "step": 1604 + }, + { + "epoch": 0.09, + "grad_norm": 0.5785694558525246, + "learning_rate": 1.9797973325530058e-05, + "loss": 0.4437, + "step": 1605 + }, + { + "epoch": 0.09, + "grad_norm": 0.7455283306651675, + "learning_rate": 1.9797600986446773e-05, + "loss": 0.5741, + "step": 1606 + }, + { + "epoch": 0.09, + "grad_norm": 0.49548617080975527, + "learning_rate": 1.9797228308072982e-05, + "loss": 0.3097, + "step": 1607 + }, + { + "epoch": 0.09, + "grad_norm": 0.399945434741605, + "learning_rate": 1.97968552904216e-05, + "loss": 0.2833, + "step": 1608 + }, + { + "epoch": 0.09, + "grad_norm": 0.3260983798124931, + "learning_rate": 1.9796481933505535e-05, + "loss": 0.1926, + "step": 1609 + }, + { + "epoch": 0.09, + "grad_norm": 0.5046829217920206, + "learning_rate": 1.9796108237337724e-05, + "loss": 0.3538, + "step": 1610 + }, + { + "epoch": 0.09, + "grad_norm": 0.588275165004359, + "learning_rate": 1.979573420193111e-05, + "loss": 0.3805, + "step": 1611 + }, + { + "epoch": 0.09, + "grad_norm": 0.45169272979663194, + "learning_rate": 1.9795359827298643e-05, + "loss": 0.3902, + "step": 1612 + }, + { + "epoch": 0.09, + "grad_norm": 0.45282344751971243, + "learning_rate": 1.979498511345328e-05, + "loss": 0.3399, + "step": 1613 + }, + { + "epoch": 0.09, + "grad_norm": 0.3358407517147635, + "learning_rate": 1.9794610060408007e-05, + "loss": 0.1913, + "step": 1614 + }, + { + "epoch": 0.09, + "grad_norm": 0.468574580020193, + "learning_rate": 1.979423466817581e-05, + "loss": 0.3122, + "step": 1615 + }, + { + "epoch": 0.09, + "grad_norm": 0.49239486490812656, + "learning_rate": 1.9793858936769683e-05, + "loss": 0.3336, + "step": 1616 + }, + { + "epoch": 0.09, + "grad_norm": 0.4719865931595738, + "learning_rate": 1.9793482866202645e-05, + "loss": 0.3269, + "step": 1617 + }, + { + "epoch": 0.09, + "grad_norm": 0.8462916216082507, + "learning_rate": 1.9793106456487717e-05, + "loss": 0.5964, + "step": 1618 + }, + { + "epoch": 0.09, + "grad_norm": 0.7204368957434376, + "learning_rate": 1.9792729707637935e-05, + "loss": 0.3242, + "step": 1619 + }, + { + "epoch": 0.09, + "grad_norm": 0.36900163439375716, + "learning_rate": 1.979235261966634e-05, + "loss": 0.323, + "step": 1620 + }, + { + "epoch": 0.09, + "grad_norm": 0.3250448460505923, + "learning_rate": 1.9791975192586e-05, + "loss": 0.1878, + "step": 1621 + }, + { + "epoch": 0.09, + "grad_norm": 0.7114004359000305, + "learning_rate": 1.9791597426409973e-05, + "loss": 0.4203, + "step": 1622 + }, + { + "epoch": 0.09, + "grad_norm": 0.48779469367220873, + "learning_rate": 1.9791219321151356e-05, + "loss": 0.4049, + "step": 1623 + }, + { + "epoch": 0.09, + "grad_norm": 0.33688741860596855, + "learning_rate": 1.979084087682323e-05, + "loss": 0.2401, + "step": 1624 + }, + { + "epoch": 0.09, + "grad_norm": 0.6457978863500597, + "learning_rate": 1.9790462093438707e-05, + "loss": 0.3773, + "step": 1625 + }, + { + "epoch": 0.09, + "grad_norm": 0.43997114105386825, + "learning_rate": 1.9790082971010903e-05, + "loss": 0.3352, + "step": 1626 + }, + { + "epoch": 0.09, + "grad_norm": 0.8578347622033227, + "learning_rate": 1.9789703509552947e-05, + "loss": 0.4459, + "step": 1627 + }, + { + "epoch": 0.09, + "grad_norm": 0.3320817142902381, + "learning_rate": 1.978932370907798e-05, + "loss": 0.3051, + "step": 1628 + }, + { + "epoch": 0.09, + "grad_norm": 0.46309576333499364, + "learning_rate": 1.978894356959915e-05, + "loss": 0.4221, + "step": 1629 + }, + { + "epoch": 0.09, + "grad_norm": 0.23451118651819808, + "learning_rate": 1.978856309112963e-05, + "loss": 0.1448, + "step": 1630 + }, + { + "epoch": 0.09, + "grad_norm": 0.4679183307690478, + "learning_rate": 1.978818227368259e-05, + "loss": 0.2926, + "step": 1631 + }, + { + "epoch": 0.09, + "grad_norm": 0.4429604060875303, + "learning_rate": 1.9787801117271213e-05, + "loss": 0.3402, + "step": 1632 + }, + { + "epoch": 0.09, + "grad_norm": 0.8594224955091179, + "learning_rate": 1.978741962190871e-05, + "loss": 0.6281, + "step": 1633 + }, + { + "epoch": 0.09, + "grad_norm": 0.6065069575331038, + "learning_rate": 1.9787037787608287e-05, + "loss": 0.3928, + "step": 1634 + }, + { + "epoch": 0.09, + "grad_norm": 0.42824444304880876, + "learning_rate": 1.9786655614383163e-05, + "loss": 0.3295, + "step": 1635 + }, + { + "epoch": 0.09, + "grad_norm": 0.30050373194288177, + "learning_rate": 1.978627310224658e-05, + "loss": 0.2354, + "step": 1636 + }, + { + "epoch": 0.09, + "grad_norm": 0.5963019421892746, + "learning_rate": 1.9785890251211777e-05, + "loss": 0.1868, + "step": 1637 + }, + { + "epoch": 0.09, + "grad_norm": 0.400518818725669, + "learning_rate": 1.9785507061292017e-05, + "loss": 0.2862, + "step": 1638 + }, + { + "epoch": 0.09, + "grad_norm": 0.9336374686602577, + "learning_rate": 1.978512353250057e-05, + "loss": 0.5577, + "step": 1639 + }, + { + "epoch": 0.09, + "grad_norm": 0.35757051249823835, + "learning_rate": 1.978473966485071e-05, + "loss": 0.2641, + "step": 1640 + }, + { + "epoch": 0.09, + "grad_norm": 0.4439429484546262, + "learning_rate": 1.978435545835574e-05, + "loss": 0.3602, + "step": 1641 + }, + { + "epoch": 0.09, + "grad_norm": 0.3921207587665929, + "learning_rate": 1.978397091302896e-05, + "loss": 0.1743, + "step": 1642 + }, + { + "epoch": 0.09, + "grad_norm": 0.7618586521311463, + "learning_rate": 1.9783586028883688e-05, + "loss": 0.4513, + "step": 1643 + }, + { + "epoch": 0.09, + "grad_norm": 0.3751913533863891, + "learning_rate": 1.9783200805933252e-05, + "loss": 0.2761, + "step": 1644 + }, + { + "epoch": 0.09, + "grad_norm": 0.7805939473702508, + "learning_rate": 1.9782815244190997e-05, + "loss": 0.5491, + "step": 1645 + }, + { + "epoch": 0.09, + "grad_norm": 0.6195714209412733, + "learning_rate": 1.9782429343670267e-05, + "loss": 0.5169, + "step": 1646 + }, + { + "epoch": 0.09, + "grad_norm": 0.4918147444122574, + "learning_rate": 1.978204310438443e-05, + "loss": 0.2578, + "step": 1647 + }, + { + "epoch": 0.09, + "grad_norm": 0.3435562889326771, + "learning_rate": 1.9781656526346863e-05, + "loss": 0.2275, + "step": 1648 + }, + { + "epoch": 0.09, + "grad_norm": 0.7237826939442035, + "learning_rate": 1.9781269609570945e-05, + "loss": 0.3185, + "step": 1649 + }, + { + "epoch": 0.09, + "grad_norm": 0.5178707759432076, + "learning_rate": 1.978088235407009e-05, + "loss": 0.2934, + "step": 1650 + }, + { + "epoch": 0.09, + "grad_norm": 0.7923454047135651, + "learning_rate": 1.978049475985769e-05, + "loss": 0.4562, + "step": 1651 + }, + { + "epoch": 0.09, + "grad_norm": 0.5428509403531181, + "learning_rate": 1.9780106826947184e-05, + "loss": 0.3805, + "step": 1652 + }, + { + "epoch": 0.09, + "grad_norm": 0.45702446485133047, + "learning_rate": 1.9779718555351997e-05, + "loss": 0.2591, + "step": 1653 + }, + { + "epoch": 0.1, + "grad_norm": 0.36296431359883, + "learning_rate": 1.9779329945085578e-05, + "loss": 0.2195, + "step": 1654 + }, + { + "epoch": 0.1, + "grad_norm": 0.6704714680728022, + "learning_rate": 1.9778940996161382e-05, + "loss": 0.4751, + "step": 1655 + }, + { + "epoch": 0.1, + "grad_norm": 0.43102537372482086, + "learning_rate": 1.9778551708592883e-05, + "loss": 0.2816, + "step": 1656 + }, + { + "epoch": 0.1, + "grad_norm": 0.7987627103272095, + "learning_rate": 1.9778162082393554e-05, + "loss": 0.4378, + "step": 1657 + }, + { + "epoch": 0.1, + "grad_norm": 1.4586157829032407, + "learning_rate": 1.9777772117576893e-05, + "loss": 0.8086, + "step": 1658 + }, + { + "epoch": 0.1, + "grad_norm": 0.5097364905331209, + "learning_rate": 1.977738181415641e-05, + "loss": 0.2658, + "step": 1659 + }, + { + "epoch": 0.1, + "grad_norm": 0.5945672810230304, + "learning_rate": 1.977699117214561e-05, + "loss": 0.2941, + "step": 1660 + }, + { + "epoch": 0.1, + "grad_norm": 0.3399558203328304, + "learning_rate": 1.9776600191558025e-05, + "loss": 0.2018, + "step": 1661 + }, + { + "epoch": 0.1, + "grad_norm": 0.6445674144621936, + "learning_rate": 1.9776208872407202e-05, + "loss": 0.3421, + "step": 1662 + }, + { + "epoch": 0.1, + "grad_norm": 0.7888347334580998, + "learning_rate": 1.9775817214706682e-05, + "loss": 0.3533, + "step": 1663 + }, + { + "epoch": 0.1, + "grad_norm": 0.7188794356920819, + "learning_rate": 1.977542521847003e-05, + "loss": 0.3961, + "step": 1664 + }, + { + "epoch": 0.1, + "grad_norm": 0.5233002822851152, + "learning_rate": 1.9775032883710826e-05, + "loss": 0.3212, + "step": 1665 + }, + { + "epoch": 0.1, + "grad_norm": 0.22069561834964382, + "learning_rate": 1.9774640210442654e-05, + "loss": 0.1167, + "step": 1666 + }, + { + "epoch": 0.1, + "grad_norm": 0.4538299309152852, + "learning_rate": 1.977424719867911e-05, + "loss": 0.3369, + "step": 1667 + }, + { + "epoch": 0.1, + "grad_norm": 0.5837962589537384, + "learning_rate": 1.9773853848433806e-05, + "loss": 0.3332, + "step": 1668 + }, + { + "epoch": 0.1, + "grad_norm": 1.2822390523629652, + "learning_rate": 1.9773460159720365e-05, + "loss": 0.499, + "step": 1669 + }, + { + "epoch": 0.1, + "grad_norm": 0.6098981703977231, + "learning_rate": 1.977306613255242e-05, + "loss": 0.3464, + "step": 1670 + }, + { + "epoch": 0.1, + "grad_norm": 0.5010711098035411, + "learning_rate": 1.977267176694361e-05, + "loss": 0.3058, + "step": 1671 + }, + { + "epoch": 0.1, + "grad_norm": 0.4940495865152775, + "learning_rate": 1.97722770629076e-05, + "loss": 0.3481, + "step": 1672 + }, + { + "epoch": 0.1, + "grad_norm": 0.6839400527952039, + "learning_rate": 1.9771882020458055e-05, + "loss": 0.3003, + "step": 1673 + }, + { + "epoch": 0.1, + "grad_norm": 0.4901359588398224, + "learning_rate": 1.9771486639608657e-05, + "loss": 0.3097, + "step": 1674 + }, + { + "epoch": 0.1, + "grad_norm": 0.38335500215663265, + "learning_rate": 1.9771090920373096e-05, + "loss": 0.2784, + "step": 1675 + }, + { + "epoch": 0.1, + "grad_norm": 1.0457668872874353, + "learning_rate": 1.9770694862765077e-05, + "loss": 0.6059, + "step": 1676 + }, + { + "epoch": 0.1, + "grad_norm": 0.4529475843166634, + "learning_rate": 1.9770298466798316e-05, + "loss": 0.2996, + "step": 1677 + }, + { + "epoch": 0.1, + "grad_norm": 0.9647389731875299, + "learning_rate": 1.9769901732486542e-05, + "loss": 0.545, + "step": 1678 + }, + { + "epoch": 0.1, + "grad_norm": 0.4138833322271508, + "learning_rate": 1.9769504659843486e-05, + "loss": 0.3303, + "step": 1679 + }, + { + "epoch": 0.1, + "grad_norm": 0.37714688186515205, + "learning_rate": 1.9769107248882904e-05, + "loss": 0.2794, + "step": 1680 + }, + { + "epoch": 0.1, + "grad_norm": 0.539646531653099, + "learning_rate": 1.976870949961856e-05, + "loss": 0.2455, + "step": 1681 + }, + { + "epoch": 0.1, + "grad_norm": 1.9706385236557056, + "learning_rate": 1.9768311412064224e-05, + "loss": 0.8027, + "step": 1682 + }, + { + "epoch": 0.1, + "grad_norm": 0.4506680655535206, + "learning_rate": 1.9767912986233685e-05, + "loss": 0.2213, + "step": 1683 + }, + { + "epoch": 0.1, + "grad_norm": 0.4818504309643764, + "learning_rate": 1.976751422214074e-05, + "loss": 0.3729, + "step": 1684 + }, + { + "epoch": 0.1, + "grad_norm": 0.6867809896350154, + "learning_rate": 1.9767115119799197e-05, + "loss": 0.5068, + "step": 1685 + }, + { + "epoch": 0.1, + "grad_norm": 0.4081668174851612, + "learning_rate": 1.9766715679222875e-05, + "loss": 0.2312, + "step": 1686 + }, + { + "epoch": 0.1, + "grad_norm": 0.3395615177013707, + "learning_rate": 1.9766315900425613e-05, + "loss": 0.2423, + "step": 1687 + }, + { + "epoch": 0.1, + "grad_norm": 1.7966060680798426, + "learning_rate": 1.976591578342125e-05, + "loss": 0.8855, + "step": 1688 + }, + { + "epoch": 0.1, + "grad_norm": 0.380188460807513, + "learning_rate": 1.9765515328223644e-05, + "loss": 0.2262, + "step": 1689 + }, + { + "epoch": 0.1, + "grad_norm": 0.7703079045710871, + "learning_rate": 1.9765114534846662e-05, + "loss": 0.5256, + "step": 1690 + }, + { + "epoch": 0.1, + "grad_norm": 0.5143745240081559, + "learning_rate": 1.9764713403304183e-05, + "loss": 0.3954, + "step": 1691 + }, + { + "epoch": 0.1, + "grad_norm": 0.46836814781278047, + "learning_rate": 1.97643119336101e-05, + "loss": 0.2407, + "step": 1692 + }, + { + "epoch": 0.1, + "grad_norm": 0.3690528313572445, + "learning_rate": 1.976391012577831e-05, + "loss": 0.1872, + "step": 1693 + }, + { + "epoch": 0.1, + "grad_norm": 1.2780455908200286, + "learning_rate": 1.9763507979822737e-05, + "loss": 0.7456, + "step": 1694 + }, + { + "epoch": 0.1, + "grad_norm": 0.4214444867559315, + "learning_rate": 1.97631054957573e-05, + "loss": 0.2931, + "step": 1695 + }, + { + "epoch": 0.1, + "grad_norm": 0.5189345658634478, + "learning_rate": 1.9762702673595943e-05, + "loss": 0.3177, + "step": 1696 + }, + { + "epoch": 0.1, + "grad_norm": 0.880757909211329, + "learning_rate": 1.9762299513352604e-05, + "loss": 0.5314, + "step": 1697 + }, + { + "epoch": 0.1, + "grad_norm": 0.4845554317665074, + "learning_rate": 1.976189601504126e-05, + "loss": 0.2937, + "step": 1698 + }, + { + "epoch": 0.1, + "grad_norm": 0.30040774511408, + "learning_rate": 1.9761492178675876e-05, + "loss": 0.1939, + "step": 1699 + }, + { + "epoch": 0.1, + "grad_norm": 1.1642293277701257, + "learning_rate": 1.9761088004270435e-05, + "loss": 0.7888, + "step": 1700 + }, + { + "epoch": 0.1, + "grad_norm": 0.43107411175540156, + "learning_rate": 1.976068349183894e-05, + "loss": 0.2908, + "step": 1701 + }, + { + "epoch": 0.1, + "grad_norm": 0.7767979561549465, + "learning_rate": 1.9760278641395395e-05, + "loss": 0.447, + "step": 1702 + }, + { + "epoch": 0.1, + "grad_norm": 0.562402325785457, + "learning_rate": 1.9759873452953816e-05, + "loss": 0.3703, + "step": 1703 + }, + { + "epoch": 0.1, + "grad_norm": 0.4995224833626151, + "learning_rate": 1.9759467926528242e-05, + "loss": 0.3042, + "step": 1704 + }, + { + "epoch": 0.1, + "grad_norm": 0.27285547576376873, + "learning_rate": 1.975906206213271e-05, + "loss": 0.1633, + "step": 1705 + }, + { + "epoch": 0.1, + "grad_norm": 0.9079324937335543, + "learning_rate": 1.9758655859781282e-05, + "loss": 0.6932, + "step": 1706 + }, + { + "epoch": 0.1, + "grad_norm": 0.40677426359666474, + "learning_rate": 1.975824931948802e-05, + "loss": 0.2871, + "step": 1707 + }, + { + "epoch": 0.1, + "grad_norm": 0.45580786137028356, + "learning_rate": 1.975784244126701e-05, + "loss": 0.3893, + "step": 1708 + }, + { + "epoch": 0.1, + "grad_norm": 0.9194512800758327, + "learning_rate": 1.9757435225132325e-05, + "loss": 0.3548, + "step": 1709 + }, + { + "epoch": 0.1, + "grad_norm": 0.44115743053683315, + "learning_rate": 1.9757027671098083e-05, + "loss": 0.3031, + "step": 1710 + }, + { + "epoch": 0.1, + "grad_norm": 0.2825713405110046, + "learning_rate": 1.9756619779178393e-05, + "loss": 0.2427, + "step": 1711 + }, + { + "epoch": 0.1, + "grad_norm": 1.1849133538579537, + "learning_rate": 1.9756211549387378e-05, + "loss": 0.4944, + "step": 1712 + }, + { + "epoch": 0.1, + "grad_norm": 0.36690700948942095, + "learning_rate": 1.975580298173918e-05, + "loss": 0.3057, + "step": 1713 + }, + { + "epoch": 0.1, + "grad_norm": 0.7245805282332856, + "learning_rate": 1.975539407624794e-05, + "loss": 0.5556, + "step": 1714 + }, + { + "epoch": 0.1, + "grad_norm": 0.4612175857613643, + "learning_rate": 1.9754984832927825e-05, + "loss": 0.3085, + "step": 1715 + }, + { + "epoch": 0.1, + "grad_norm": 0.44090499684018897, + "learning_rate": 1.9754575251793006e-05, + "loss": 0.3089, + "step": 1716 + }, + { + "epoch": 0.1, + "grad_norm": 0.5288877897785201, + "learning_rate": 1.975416533285766e-05, + "loss": 0.3071, + "step": 1717 + }, + { + "epoch": 0.1, + "grad_norm": 1.3106501640407626, + "learning_rate": 1.9753755076135994e-05, + "loss": 0.587, + "step": 1718 + }, + { + "epoch": 0.1, + "grad_norm": 0.3528842608221892, + "learning_rate": 1.9753344481642205e-05, + "loss": 0.2396, + "step": 1719 + }, + { + "epoch": 0.1, + "grad_norm": 0.6296848852224964, + "learning_rate": 1.975293354939052e-05, + "loss": 0.4465, + "step": 1720 + }, + { + "epoch": 0.1, + "grad_norm": 0.5365509567706639, + "learning_rate": 1.975252227939516e-05, + "loss": 0.2626, + "step": 1721 + }, + { + "epoch": 0.1, + "grad_norm": 0.3821923286062335, + "learning_rate": 1.9752110671670375e-05, + "loss": 0.1844, + "step": 1722 + }, + { + "epoch": 0.1, + "grad_norm": 0.40328934838267844, + "learning_rate": 1.975169872623042e-05, + "loss": 0.3276, + "step": 1723 + }, + { + "epoch": 0.1, + "grad_norm": 0.9932472840383428, + "learning_rate": 1.9751286443089555e-05, + "loss": 0.5172, + "step": 1724 + }, + { + "epoch": 0.1, + "grad_norm": 0.39449755648213397, + "learning_rate": 1.975087382226206e-05, + "loss": 0.2579, + "step": 1725 + }, + { + "epoch": 0.1, + "grad_norm": 0.34791684251849897, + "learning_rate": 1.9750460863762225e-05, + "loss": 0.2135, + "step": 1726 + }, + { + "epoch": 0.1, + "grad_norm": 0.4824628574743985, + "learning_rate": 1.9750047567604348e-05, + "loss": 0.3734, + "step": 1727 + }, + { + "epoch": 0.1, + "grad_norm": 0.45556148548025616, + "learning_rate": 1.9749633933802743e-05, + "loss": 0.22, + "step": 1728 + }, + { + "epoch": 0.1, + "grad_norm": 0.6722527683545421, + "learning_rate": 1.9749219962371736e-05, + "loss": 0.4172, + "step": 1729 + }, + { + "epoch": 0.1, + "grad_norm": 1.3213518952540688, + "learning_rate": 1.9748805653325657e-05, + "loss": 0.5264, + "step": 1730 + }, + { + "epoch": 0.1, + "grad_norm": 0.43129412639939235, + "learning_rate": 1.974839100667886e-05, + "loss": 0.3006, + "step": 1731 + }, + { + "epoch": 0.1, + "grad_norm": 0.5744145659725003, + "learning_rate": 1.97479760224457e-05, + "loss": 0.3189, + "step": 1732 + }, + { + "epoch": 0.1, + "grad_norm": 0.28489126228750195, + "learning_rate": 1.9747560700640552e-05, + "loss": 0.1805, + "step": 1733 + }, + { + "epoch": 0.1, + "grad_norm": 0.4691600450533372, + "learning_rate": 1.97471450412778e-05, + "loss": 0.2848, + "step": 1734 + }, + { + "epoch": 0.1, + "grad_norm": 0.48444296804977377, + "learning_rate": 1.9746729044371826e-05, + "loss": 0.2677, + "step": 1735 + }, + { + "epoch": 0.1, + "grad_norm": 1.6659386268209997, + "learning_rate": 1.9746312709937047e-05, + "loss": 0.5739, + "step": 1736 + }, + { + "epoch": 0.1, + "grad_norm": 0.4936719526225396, + "learning_rate": 1.974589603798788e-05, + "loss": 0.3041, + "step": 1737 + }, + { + "epoch": 0.1, + "grad_norm": 0.34597533311036405, + "learning_rate": 1.974547902853875e-05, + "loss": 0.2043, + "step": 1738 + }, + { + "epoch": 0.1, + "grad_norm": 0.38018595031990254, + "learning_rate": 1.9745061681604104e-05, + "loss": 0.3154, + "step": 1739 + }, + { + "epoch": 0.1, + "grad_norm": 0.8001416151657222, + "learning_rate": 1.974464399719839e-05, + "loss": 0.461, + "step": 1740 + }, + { + "epoch": 0.1, + "grad_norm": 0.5315768234902671, + "learning_rate": 1.974422597533607e-05, + "loss": 0.3053, + "step": 1741 + }, + { + "epoch": 0.1, + "grad_norm": 0.5469926220469847, + "learning_rate": 1.9743807616031624e-05, + "loss": 0.3805, + "step": 1742 + }, + { + "epoch": 0.1, + "grad_norm": 0.7185184120264385, + "learning_rate": 1.974338891929954e-05, + "loss": 0.3472, + "step": 1743 + }, + { + "epoch": 0.1, + "grad_norm": 0.5697328626613448, + "learning_rate": 1.9742969885154318e-05, + "loss": 0.3864, + "step": 1744 + }, + { + "epoch": 0.1, + "grad_norm": 0.2519361023605971, + "learning_rate": 1.974255051361047e-05, + "loss": 0.076, + "step": 1745 + }, + { + "epoch": 0.1, + "grad_norm": 0.5995345198599, + "learning_rate": 1.974213080468251e-05, + "loss": 0.3818, + "step": 1746 + }, + { + "epoch": 0.1, + "grad_norm": 0.40605771267747187, + "learning_rate": 1.9741710758384985e-05, + "loss": 0.3281, + "step": 1747 + }, + { + "epoch": 0.1, + "grad_norm": 0.852458852226233, + "learning_rate": 1.9741290374732434e-05, + "loss": 0.4804, + "step": 1748 + }, + { + "epoch": 0.1, + "grad_norm": 0.6862694898087204, + "learning_rate": 1.9740869653739413e-05, + "loss": 0.4156, + "step": 1749 + }, + { + "epoch": 0.1, + "grad_norm": 0.5487997218358596, + "learning_rate": 1.97404485954205e-05, + "loss": 0.3663, + "step": 1750 + }, + { + "epoch": 0.1, + "grad_norm": 0.303352834458015, + "learning_rate": 1.974002719979027e-05, + "loss": 0.1821, + "step": 1751 + }, + { + "epoch": 0.1, + "grad_norm": 0.5481110458066554, + "learning_rate": 1.973960546686331e-05, + "loss": 0.3175, + "step": 1752 + }, + { + "epoch": 0.1, + "grad_norm": 0.6217773627623907, + "learning_rate": 1.973918339665424e-05, + "loss": 0.4053, + "step": 1753 + }, + { + "epoch": 0.1, + "grad_norm": 0.4157060592887071, + "learning_rate": 1.9738760989177665e-05, + "loss": 0.3342, + "step": 1754 + }, + { + "epoch": 0.1, + "grad_norm": 0.48771030560903766, + "learning_rate": 1.9738338244448214e-05, + "loss": 0.3222, + "step": 1755 + }, + { + "epoch": 0.1, + "grad_norm": 0.5874946088172913, + "learning_rate": 1.9737915162480527e-05, + "loss": 0.4358, + "step": 1756 + }, + { + "epoch": 0.1, + "grad_norm": 0.37947933594438027, + "learning_rate": 1.973749174328926e-05, + "loss": 0.2179, + "step": 1757 + }, + { + "epoch": 0.1, + "grad_norm": 0.4789152218360017, + "learning_rate": 1.9737067986889072e-05, + "loss": 0.3133, + "step": 1758 + }, + { + "epoch": 0.1, + "grad_norm": 0.45954057561314693, + "learning_rate": 1.973664389329464e-05, + "loss": 0.3215, + "step": 1759 + }, + { + "epoch": 0.1, + "grad_norm": 1.0575391871829922, + "learning_rate": 1.9736219462520645e-05, + "loss": 0.6064, + "step": 1760 + }, + { + "epoch": 0.1, + "grad_norm": 0.5316241369605608, + "learning_rate": 1.973579469458179e-05, + "loss": 0.2216, + "step": 1761 + }, + { + "epoch": 0.1, + "grad_norm": 0.5200064785449022, + "learning_rate": 1.9735369589492786e-05, + "loss": 0.3547, + "step": 1762 + }, + { + "epoch": 0.1, + "grad_norm": 0.3661703504092863, + "learning_rate": 1.973494414726835e-05, + "loss": 0.2464, + "step": 1763 + }, + { + "epoch": 0.1, + "grad_norm": 0.6885337064692816, + "learning_rate": 1.9734518367923216e-05, + "loss": 0.3472, + "step": 1764 + }, + { + "epoch": 0.1, + "grad_norm": 0.44627825575802316, + "learning_rate": 1.973409225147213e-05, + "loss": 0.3789, + "step": 1765 + }, + { + "epoch": 0.1, + "grad_norm": 0.5653737927716992, + "learning_rate": 1.973366579792985e-05, + "loss": 0.3915, + "step": 1766 + }, + { + "epoch": 0.1, + "grad_norm": 0.47244377328827575, + "learning_rate": 1.9733239007311137e-05, + "loss": 0.2816, + "step": 1767 + }, + { + "epoch": 0.1, + "grad_norm": 0.5118989988502627, + "learning_rate": 1.973281187963078e-05, + "loss": 0.3149, + "step": 1768 + }, + { + "epoch": 0.1, + "grad_norm": 0.8961397366295146, + "learning_rate": 1.9732384414903562e-05, + "loss": 0.5801, + "step": 1769 + }, + { + "epoch": 0.1, + "grad_norm": 0.40915889685555634, + "learning_rate": 1.9731956613144297e-05, + "loss": 0.3385, + "step": 1770 + }, + { + "epoch": 0.1, + "grad_norm": 0.36489091879616176, + "learning_rate": 1.9731528474367787e-05, + "loss": 0.1888, + "step": 1771 + }, + { + "epoch": 0.1, + "grad_norm": 0.4653678958356583, + "learning_rate": 1.9731099998588865e-05, + "loss": 0.3244, + "step": 1772 + }, + { + "epoch": 0.1, + "grad_norm": 0.7889078900037176, + "learning_rate": 1.973067118582237e-05, + "loss": 0.4141, + "step": 1773 + }, + { + "epoch": 0.1, + "grad_norm": 0.48457358345367246, + "learning_rate": 1.973024203608315e-05, + "loss": 0.2676, + "step": 1774 + }, + { + "epoch": 0.1, + "grad_norm": 0.5849351746372535, + "learning_rate": 1.9729812549386066e-05, + "loss": 0.3811, + "step": 1775 + }, + { + "epoch": 0.1, + "grad_norm": 0.6100840543617714, + "learning_rate": 1.9729382725745997e-05, + "loss": 0.3586, + "step": 1776 + }, + { + "epoch": 0.1, + "grad_norm": 0.32169239000575955, + "learning_rate": 1.9728952565177817e-05, + "loss": 0.1859, + "step": 1777 + }, + { + "epoch": 0.1, + "grad_norm": 0.4926537571689009, + "learning_rate": 1.972852206769643e-05, + "loss": 0.3794, + "step": 1778 + }, + { + "epoch": 0.1, + "grad_norm": 1.4956203678206292, + "learning_rate": 1.972809123331674e-05, + "loss": 0.8704, + "step": 1779 + }, + { + "epoch": 0.1, + "grad_norm": 0.432781530735122, + "learning_rate": 1.972766006205367e-05, + "loss": 0.2187, + "step": 1780 + }, + { + "epoch": 0.1, + "grad_norm": 0.8836133604422183, + "learning_rate": 1.9727228553922152e-05, + "loss": 0.5072, + "step": 1781 + }, + { + "epoch": 0.1, + "grad_norm": 0.6000939456264764, + "learning_rate": 1.9726796708937125e-05, + "loss": 0.3967, + "step": 1782 + }, + { + "epoch": 0.1, + "grad_norm": 0.36428401062594407, + "learning_rate": 1.972636452711355e-05, + "loss": 0.2402, + "step": 1783 + }, + { + "epoch": 0.1, + "grad_norm": 0.35233509057631474, + "learning_rate": 1.9725932008466383e-05, + "loss": 0.1392, + "step": 1784 + }, + { + "epoch": 0.1, + "grad_norm": 0.7541938557788923, + "learning_rate": 1.9725499153010613e-05, + "loss": 0.4919, + "step": 1785 + }, + { + "epoch": 0.1, + "grad_norm": 0.4648704692571199, + "learning_rate": 1.9725065960761225e-05, + "loss": 0.3193, + "step": 1786 + }, + { + "epoch": 0.1, + "grad_norm": 0.7643519431832295, + "learning_rate": 1.9724632431733223e-05, + "loss": 0.3782, + "step": 1787 + }, + { + "epoch": 0.1, + "grad_norm": 0.4383417741200626, + "learning_rate": 1.9724198565941616e-05, + "loss": 0.2711, + "step": 1788 + }, + { + "epoch": 0.1, + "grad_norm": 0.5752027956523929, + "learning_rate": 1.972376436340143e-05, + "loss": 0.3578, + "step": 1789 + }, + { + "epoch": 0.1, + "grad_norm": 0.3491638947718094, + "learning_rate": 1.9723329824127703e-05, + "loss": 0.2435, + "step": 1790 + }, + { + "epoch": 0.1, + "grad_norm": 0.9210790563154199, + "learning_rate": 1.9722894948135485e-05, + "loss": 0.5047, + "step": 1791 + }, + { + "epoch": 0.1, + "grad_norm": 0.4745370515957307, + "learning_rate": 1.972245973543983e-05, + "loss": 0.3136, + "step": 1792 + }, + { + "epoch": 0.1, + "grad_norm": 0.8457754702291259, + "learning_rate": 1.9722024186055812e-05, + "loss": 0.4412, + "step": 1793 + }, + { + "epoch": 0.1, + "grad_norm": 0.5232986705641869, + "learning_rate": 1.9721588299998516e-05, + "loss": 0.3358, + "step": 1794 + }, + { + "epoch": 0.1, + "grad_norm": 0.43026069963986585, + "learning_rate": 1.9721152077283038e-05, + "loss": 0.2955, + "step": 1795 + }, + { + "epoch": 0.1, + "grad_norm": 0.3170651313977208, + "learning_rate": 1.9720715517924476e-05, + "loss": 0.2067, + "step": 1796 + }, + { + "epoch": 0.1, + "grad_norm": 1.055213009657008, + "learning_rate": 1.9720278621937954e-05, + "loss": 0.4355, + "step": 1797 + }, + { + "epoch": 0.1, + "grad_norm": 0.39566676103877046, + "learning_rate": 1.9719841389338605e-05, + "loss": 0.2776, + "step": 1798 + }, + { + "epoch": 0.1, + "grad_norm": 0.45559232573659136, + "learning_rate": 1.9719403820141563e-05, + "loss": 0.4138, + "step": 1799 + }, + { + "epoch": 0.1, + "grad_norm": 0.9887763688604866, + "learning_rate": 1.971896591436199e-05, + "loss": 0.2925, + "step": 1800 + }, + { + "epoch": 0.1, + "grad_norm": 0.3638885992702867, + "learning_rate": 1.971852767201504e-05, + "loss": 0.2546, + "step": 1801 + }, + { + "epoch": 0.1, + "grad_norm": 0.6125066940881474, + "learning_rate": 1.97180890931159e-05, + "loss": 0.4316, + "step": 1802 + }, + { + "epoch": 0.1, + "grad_norm": 0.645523555658483, + "learning_rate": 1.971765017767975e-05, + "loss": 0.3526, + "step": 1803 + }, + { + "epoch": 0.1, + "grad_norm": 0.48292600807911545, + "learning_rate": 1.9717210925721792e-05, + "loss": 0.3628, + "step": 1804 + }, + { + "epoch": 0.1, + "grad_norm": 0.510701994227429, + "learning_rate": 1.9716771337257235e-05, + "loss": 0.2944, + "step": 1805 + }, + { + "epoch": 0.1, + "grad_norm": 0.512257770671276, + "learning_rate": 1.9716331412301304e-05, + "loss": 0.3456, + "step": 1806 + }, + { + "epoch": 0.1, + "grad_norm": 0.3723557207434571, + "learning_rate": 1.971589115086924e-05, + "loss": 0.2188, + "step": 1807 + }, + { + "epoch": 0.1, + "grad_norm": 0.4776975335292782, + "learning_rate": 1.971545055297628e-05, + "loss": 0.3271, + "step": 1808 + }, + { + "epoch": 0.1, + "grad_norm": 0.4671253964539537, + "learning_rate": 1.971500961863768e-05, + "loss": 0.4177, + "step": 1809 + }, + { + "epoch": 0.1, + "grad_norm": 0.4226489406088548, + "learning_rate": 1.9714568347868722e-05, + "loss": 0.2196, + "step": 1810 + }, + { + "epoch": 0.1, + "grad_norm": 0.36354839297692376, + "learning_rate": 1.9714126740684677e-05, + "loss": 0.3186, + "step": 1811 + }, + { + "epoch": 0.1, + "grad_norm": 1.4618017892562303, + "learning_rate": 1.9713684797100843e-05, + "loss": 0.7304, + "step": 1812 + }, + { + "epoch": 0.1, + "grad_norm": 0.35343600195296193, + "learning_rate": 1.9713242517132518e-05, + "loss": 0.2205, + "step": 1813 + }, + { + "epoch": 0.1, + "grad_norm": 0.4373763331330048, + "learning_rate": 1.9712799900795026e-05, + "loss": 0.3891, + "step": 1814 + }, + { + "epoch": 0.1, + "grad_norm": 0.7587560756410427, + "learning_rate": 1.971235694810369e-05, + "loss": 0.5362, + "step": 1815 + }, + { + "epoch": 0.1, + "grad_norm": 0.38486193172658234, + "learning_rate": 1.971191365907385e-05, + "loss": 0.2663, + "step": 1816 + }, + { + "epoch": 0.1, + "grad_norm": 0.3515214902888066, + "learning_rate": 1.9711470033720856e-05, + "loss": 0.168, + "step": 1817 + }, + { + "epoch": 0.1, + "grad_norm": 0.48100670484562935, + "learning_rate": 1.9711026072060077e-05, + "loss": 0.3562, + "step": 1818 + }, + { + "epoch": 0.1, + "grad_norm": 0.39630337446911107, + "learning_rate": 1.971058177410688e-05, + "loss": 0.2998, + "step": 1819 + }, + { + "epoch": 0.1, + "grad_norm": 0.8004872736078943, + "learning_rate": 1.9710137139876653e-05, + "loss": 0.4614, + "step": 1820 + }, + { + "epoch": 0.1, + "grad_norm": 0.49674968425439975, + "learning_rate": 1.9709692169384794e-05, + "loss": 0.4202, + "step": 1821 + }, + { + "epoch": 0.1, + "grad_norm": 0.3863426434060569, + "learning_rate": 1.9709246862646712e-05, + "loss": 0.2811, + "step": 1822 + }, + { + "epoch": 0.1, + "grad_norm": 0.22159405934145307, + "learning_rate": 1.9708801219677832e-05, + "loss": 0.0938, + "step": 1823 + }, + { + "epoch": 0.1, + "grad_norm": 1.116757279703663, + "learning_rate": 1.970835524049358e-05, + "loss": 0.6516, + "step": 1824 + }, + { + "epoch": 0.1, + "grad_norm": 0.4811222290577782, + "learning_rate": 1.9707908925109405e-05, + "loss": 0.3042, + "step": 1825 + }, + { + "epoch": 0.1, + "grad_norm": 0.4106501961078986, + "learning_rate": 1.9707462273540762e-05, + "loss": 0.3154, + "step": 1826 + }, + { + "epoch": 0.1, + "grad_norm": 0.8876173475980776, + "learning_rate": 1.9707015285803117e-05, + "loss": 0.5606, + "step": 1827 + }, + { + "epoch": 0.11, + "grad_norm": 0.4763770328566096, + "learning_rate": 1.9706567961911952e-05, + "loss": 0.3034, + "step": 1828 + }, + { + "epoch": 0.11, + "grad_norm": 0.2731838409066813, + "learning_rate": 1.9706120301882755e-05, + "loss": 0.1925, + "step": 1829 + }, + { + "epoch": 0.11, + "grad_norm": 0.6050226311696162, + "learning_rate": 1.9705672305731027e-05, + "loss": 0.3795, + "step": 1830 + }, + { + "epoch": 0.11, + "grad_norm": 0.5254253951284472, + "learning_rate": 1.9705223973472288e-05, + "loss": 0.3244, + "step": 1831 + }, + { + "epoch": 0.11, + "grad_norm": 0.7309290009909968, + "learning_rate": 1.9704775305122057e-05, + "loss": 0.4473, + "step": 1832 + }, + { + "epoch": 0.11, + "grad_norm": 0.5142034091004006, + "learning_rate": 1.9704326300695874e-05, + "loss": 0.3584, + "step": 1833 + }, + { + "epoch": 0.11, + "grad_norm": 0.41208408396675505, + "learning_rate": 1.9703876960209292e-05, + "loss": 0.2997, + "step": 1834 + }, + { + "epoch": 0.11, + "grad_norm": 0.4215025386501301, + "learning_rate": 1.970342728367787e-05, + "loss": 0.207, + "step": 1835 + }, + { + "epoch": 0.11, + "grad_norm": 1.1215430532038166, + "learning_rate": 1.9702977271117172e-05, + "loss": 0.3586, + "step": 1836 + }, + { + "epoch": 0.11, + "grad_norm": 0.40290356090073365, + "learning_rate": 1.9702526922542788e-05, + "loss": 0.3035, + "step": 1837 + }, + { + "epoch": 0.11, + "grad_norm": 0.4444886784774347, + "learning_rate": 1.970207623797032e-05, + "loss": 0.367, + "step": 1838 + }, + { + "epoch": 0.11, + "grad_norm": 0.7960973226307128, + "learning_rate": 1.9701625217415366e-05, + "loss": 0.4945, + "step": 1839 + }, + { + "epoch": 0.11, + "grad_norm": 0.40419674822028395, + "learning_rate": 1.9701173860893547e-05, + "loss": 0.2819, + "step": 1840 + }, + { + "epoch": 0.11, + "grad_norm": 0.3928574273644804, + "learning_rate": 1.9700722168420493e-05, + "loss": 0.1876, + "step": 1841 + }, + { + "epoch": 0.11, + "grad_norm": 0.6832258064420349, + "learning_rate": 1.9700270140011852e-05, + "loss": 0.3169, + "step": 1842 + }, + { + "epoch": 0.11, + "grad_norm": 0.5857771512420725, + "learning_rate": 1.969981777568327e-05, + "loss": 0.3352, + "step": 1843 + }, + { + "epoch": 0.11, + "grad_norm": 0.4742736573868894, + "learning_rate": 1.969936507545042e-05, + "loss": 0.3776, + "step": 1844 + }, + { + "epoch": 0.11, + "grad_norm": 0.448882200382126, + "learning_rate": 1.969891203932897e-05, + "loss": 0.3702, + "step": 1845 + }, + { + "epoch": 0.11, + "grad_norm": 0.3997430046247253, + "learning_rate": 1.9698458667334616e-05, + "loss": 0.2028, + "step": 1846 + }, + { + "epoch": 0.11, + "grad_norm": 0.42185324820247727, + "learning_rate": 1.969800495948305e-05, + "loss": 0.2546, + "step": 1847 + }, + { + "epoch": 0.11, + "grad_norm": 0.581614305117135, + "learning_rate": 1.9697550915789992e-05, + "loss": 0.3695, + "step": 1848 + }, + { + "epoch": 0.11, + "grad_norm": 0.46363413685206445, + "learning_rate": 1.9697096536271166e-05, + "loss": 0.2585, + "step": 1849 + }, + { + "epoch": 0.11, + "grad_norm": 0.42718596651710894, + "learning_rate": 1.96966418209423e-05, + "loss": 0.3225, + "step": 1850 + }, + { + "epoch": 0.11, + "grad_norm": 1.4101683380916672, + "learning_rate": 1.9696186769819146e-05, + "loss": 0.8429, + "step": 1851 + }, + { + "epoch": 0.11, + "grad_norm": 0.406650262457706, + "learning_rate": 1.9695731382917463e-05, + "loss": 0.2515, + "step": 1852 + }, + { + "epoch": 0.11, + "grad_norm": 0.2883048378866706, + "learning_rate": 1.9695275660253017e-05, + "loss": 0.236, + "step": 1853 + }, + { + "epoch": 0.11, + "grad_norm": 0.5570129462774673, + "learning_rate": 1.9694819601841593e-05, + "loss": 0.4075, + "step": 1854 + }, + { + "epoch": 0.11, + "grad_norm": 0.6608487821497272, + "learning_rate": 1.969436320769898e-05, + "loss": 0.324, + "step": 1855 + }, + { + "epoch": 0.11, + "grad_norm": 0.695684745170112, + "learning_rate": 1.969390647784099e-05, + "loss": 0.4257, + "step": 1856 + }, + { + "epoch": 0.11, + "grad_norm": 0.42950676535452104, + "learning_rate": 1.9693449412283435e-05, + "loss": 0.3572, + "step": 1857 + }, + { + "epoch": 0.11, + "grad_norm": 0.422128890021055, + "learning_rate": 1.9692992011042143e-05, + "loss": 0.2884, + "step": 1858 + }, + { + "epoch": 0.11, + "grad_norm": 0.4976931054505805, + "learning_rate": 1.969253427413295e-05, + "loss": 0.3108, + "step": 1859 + }, + { + "epoch": 0.11, + "grad_norm": 0.42506504553438845, + "learning_rate": 1.969207620157172e-05, + "loss": 0.2968, + "step": 1860 + }, + { + "epoch": 0.11, + "grad_norm": 0.44845845302893456, + "learning_rate": 1.9691617793374304e-05, + "loss": 0.2973, + "step": 1861 + }, + { + "epoch": 0.11, + "grad_norm": 0.463892625466207, + "learning_rate": 1.969115904955658e-05, + "loss": 0.2725, + "step": 1862 + }, + { + "epoch": 0.11, + "grad_norm": 1.2968732447910472, + "learning_rate": 1.9690699970134436e-05, + "loss": 0.7922, + "step": 1863 + }, + { + "epoch": 0.11, + "grad_norm": 0.7591209376006144, + "learning_rate": 1.9690240555123767e-05, + "loss": 0.4119, + "step": 1864 + }, + { + "epoch": 0.11, + "grad_norm": 0.37942803845107786, + "learning_rate": 1.9689780804540487e-05, + "loss": 0.2586, + "step": 1865 + }, + { + "epoch": 0.11, + "grad_norm": 0.5185336534105881, + "learning_rate": 1.968932071840051e-05, + "loss": 0.4203, + "step": 1866 + }, + { + "epoch": 0.11, + "grad_norm": 0.41377909230155135, + "learning_rate": 1.968886029671978e-05, + "loss": 0.2385, + "step": 1867 + }, + { + "epoch": 0.11, + "grad_norm": 0.3525014148154284, + "learning_rate": 1.968839953951423e-05, + "loss": 0.2215, + "step": 1868 + }, + { + "epoch": 0.11, + "grad_norm": 0.4768967276948403, + "learning_rate": 1.968793844679982e-05, + "loss": 0.3546, + "step": 1869 + }, + { + "epoch": 0.11, + "grad_norm": 0.5977294660884478, + "learning_rate": 1.9687477018592517e-05, + "loss": 0.3633, + "step": 1870 + }, + { + "epoch": 0.11, + "grad_norm": 0.4148338563966621, + "learning_rate": 1.9687015254908305e-05, + "loss": 0.375, + "step": 1871 + }, + { + "epoch": 0.11, + "grad_norm": 0.5783402829003099, + "learning_rate": 1.968655315576317e-05, + "loss": 0.4202, + "step": 1872 + }, + { + "epoch": 0.11, + "grad_norm": 0.3409097705300037, + "learning_rate": 1.9686090721173118e-05, + "loss": 0.2646, + "step": 1873 + }, + { + "epoch": 0.11, + "grad_norm": 0.3731720064413754, + "learning_rate": 1.9685627951154154e-05, + "loss": 0.259, + "step": 1874 + }, + { + "epoch": 0.11, + "grad_norm": 1.056227121717924, + "learning_rate": 1.9685164845722317e-05, + "loss": 0.6084, + "step": 1875 + }, + { + "epoch": 0.11, + "grad_norm": 0.47496672745326424, + "learning_rate": 1.9684701404893635e-05, + "loss": 0.331, + "step": 1876 + }, + { + "epoch": 0.11, + "grad_norm": 0.6892217331540745, + "learning_rate": 1.9684237628684162e-05, + "loss": 0.4158, + "step": 1877 + }, + { + "epoch": 0.11, + "grad_norm": 0.5381495786374161, + "learning_rate": 1.9683773517109954e-05, + "loss": 0.3442, + "step": 1878 + }, + { + "epoch": 0.11, + "grad_norm": 0.4442211799150528, + "learning_rate": 1.9683309070187088e-05, + "loss": 0.2373, + "step": 1879 + }, + { + "epoch": 0.11, + "grad_norm": 0.4516614938674143, + "learning_rate": 1.9682844287931643e-05, + "loss": 0.2681, + "step": 1880 + }, + { + "epoch": 0.11, + "grad_norm": 0.46034638626235896, + "learning_rate": 1.9682379170359717e-05, + "loss": 0.379, + "step": 1881 + }, + { + "epoch": 0.11, + "grad_norm": 0.7113109005722336, + "learning_rate": 1.9681913717487418e-05, + "loss": 0.2923, + "step": 1882 + }, + { + "epoch": 0.11, + "grad_norm": 0.46398582128192567, + "learning_rate": 1.9681447929330864e-05, + "loss": 0.343, + "step": 1883 + }, + { + "epoch": 0.11, + "grad_norm": 0.8005165026287019, + "learning_rate": 1.9680981805906182e-05, + "loss": 0.5295, + "step": 1884 + }, + { + "epoch": 0.11, + "grad_norm": 0.5318756913666476, + "learning_rate": 1.968051534722952e-05, + "loss": 0.222, + "step": 1885 + }, + { + "epoch": 0.11, + "grad_norm": 0.36430519799466327, + "learning_rate": 1.968004855331702e-05, + "loss": 0.2523, + "step": 1886 + }, + { + "epoch": 0.11, + "grad_norm": 1.379727729095134, + "learning_rate": 1.9679581424184862e-05, + "loss": 0.803, + "step": 1887 + }, + { + "epoch": 0.11, + "grad_norm": 0.6338746679719071, + "learning_rate": 1.9679113959849213e-05, + "loss": 0.3052, + "step": 1888 + }, + { + "epoch": 0.11, + "grad_norm": 0.436401101542336, + "learning_rate": 1.9678646160326268e-05, + "loss": 0.3473, + "step": 1889 + }, + { + "epoch": 0.11, + "grad_norm": 0.9931177098972608, + "learning_rate": 1.9678178025632217e-05, + "loss": 0.5409, + "step": 1890 + }, + { + "epoch": 0.11, + "grad_norm": 0.4048200278519992, + "learning_rate": 1.9677709555783278e-05, + "loss": 0.1736, + "step": 1891 + }, + { + "epoch": 0.11, + "grad_norm": 0.5199305893202164, + "learning_rate": 1.9677240750795677e-05, + "loss": 0.3528, + "step": 1892 + }, + { + "epoch": 0.11, + "grad_norm": 0.45682355339831787, + "learning_rate": 1.967677161068564e-05, + "loss": 0.3718, + "step": 1893 + }, + { + "epoch": 0.11, + "grad_norm": 0.6801158394462933, + "learning_rate": 1.9676302135469424e-05, + "loss": 0.3543, + "step": 1894 + }, + { + "epoch": 0.11, + "grad_norm": 0.4675467552847163, + "learning_rate": 1.9675832325163277e-05, + "loss": 0.2873, + "step": 1895 + }, + { + "epoch": 0.11, + "grad_norm": 1.21108403861959, + "learning_rate": 1.9675362179783472e-05, + "loss": 0.6525, + "step": 1896 + }, + { + "epoch": 0.11, + "grad_norm": 0.4128238832269528, + "learning_rate": 1.967489169934629e-05, + "loss": 0.3157, + "step": 1897 + }, + { + "epoch": 0.11, + "grad_norm": 0.4572471040151339, + "learning_rate": 1.9674420883868032e-05, + "loss": 0.2685, + "step": 1898 + }, + { + "epoch": 0.11, + "grad_norm": 0.5102926952782346, + "learning_rate": 1.9673949733364987e-05, + "loss": 0.3365, + "step": 1899 + }, + { + "epoch": 0.11, + "grad_norm": 0.801037699421989, + "learning_rate": 1.9673478247853482e-05, + "loss": 0.5427, + "step": 1900 + }, + { + "epoch": 0.11, + "grad_norm": 0.40519304528153915, + "learning_rate": 1.967300642734984e-05, + "loss": 0.2522, + "step": 1901 + }, + { + "epoch": 0.11, + "grad_norm": 0.43817917000824186, + "learning_rate": 1.96725342718704e-05, + "loss": 0.2323, + "step": 1902 + }, + { + "epoch": 0.11, + "grad_norm": 1.0546181162648953, + "learning_rate": 1.967206178143152e-05, + "loss": 0.6465, + "step": 1903 + }, + { + "epoch": 0.11, + "grad_norm": 0.40293228795698843, + "learning_rate": 1.967158895604955e-05, + "loss": 0.2422, + "step": 1904 + }, + { + "epoch": 0.11, + "grad_norm": 0.4840118868245216, + "learning_rate": 1.967111579574087e-05, + "loss": 0.3647, + "step": 1905 + }, + { + "epoch": 0.11, + "grad_norm": 0.856390248834586, + "learning_rate": 1.967064230052187e-05, + "loss": 0.3756, + "step": 1906 + }, + { + "epoch": 0.11, + "grad_norm": 0.3372824122208772, + "learning_rate": 1.9670168470408942e-05, + "loss": 0.2382, + "step": 1907 + }, + { + "epoch": 0.11, + "grad_norm": 1.0445851645817175, + "learning_rate": 1.9669694305418498e-05, + "loss": 0.3723, + "step": 1908 + }, + { + "epoch": 0.11, + "grad_norm": 0.5236158385845334, + "learning_rate": 1.9669219805566954e-05, + "loss": 0.3219, + "step": 1909 + }, + { + "epoch": 0.11, + "grad_norm": 0.489704534862564, + "learning_rate": 1.966874497087074e-05, + "loss": 0.3341, + "step": 1910 + }, + { + "epoch": 0.11, + "grad_norm": 0.8448033082087018, + "learning_rate": 1.9668269801346305e-05, + "loss": 0.4509, + "step": 1911 + }, + { + "epoch": 0.11, + "grad_norm": 0.440510759931443, + "learning_rate": 1.966779429701011e-05, + "loss": 0.3606, + "step": 1912 + }, + { + "epoch": 0.11, + "grad_norm": 0.49307606252366426, + "learning_rate": 1.9667318457878604e-05, + "loss": 0.3268, + "step": 1913 + }, + { + "epoch": 0.11, + "grad_norm": 0.4100642263944072, + "learning_rate": 1.966684228396828e-05, + "loss": 0.2066, + "step": 1914 + }, + { + "epoch": 0.11, + "grad_norm": 0.9405707203282181, + "learning_rate": 1.9666365775295622e-05, + "loss": 0.4985, + "step": 1915 + }, + { + "epoch": 0.11, + "grad_norm": 0.46413787240405263, + "learning_rate": 1.966588893187714e-05, + "loss": 0.2986, + "step": 1916 + }, + { + "epoch": 0.11, + "grad_norm": 0.532938541056413, + "learning_rate": 1.966541175372933e-05, + "loss": 0.3551, + "step": 1917 + }, + { + "epoch": 0.11, + "grad_norm": 0.8985917488823684, + "learning_rate": 1.966493424086873e-05, + "loss": 0.6217, + "step": 1918 + }, + { + "epoch": 0.11, + "grad_norm": 0.33445042184657847, + "learning_rate": 1.9664456393311876e-05, + "loss": 0.2323, + "step": 1919 + }, + { + "epoch": 0.11, + "grad_norm": 0.5402494374756727, + "learning_rate": 1.966397821107531e-05, + "loss": 0.2774, + "step": 1920 + }, + { + "epoch": 0.11, + "grad_norm": 0.45417449458805104, + "learning_rate": 1.9663499694175596e-05, + "loss": 0.2966, + "step": 1921 + }, + { + "epoch": 0.11, + "grad_norm": 0.41078506159188094, + "learning_rate": 1.96630208426293e-05, + "loss": 0.3234, + "step": 1922 + }, + { + "epoch": 0.11, + "grad_norm": 0.880978732867112, + "learning_rate": 1.966254165645301e-05, + "loss": 0.5256, + "step": 1923 + }, + { + "epoch": 0.11, + "grad_norm": 0.397876864335666, + "learning_rate": 1.9662062135663316e-05, + "loss": 0.2994, + "step": 1924 + }, + { + "epoch": 0.11, + "grad_norm": 0.36149216851821453, + "learning_rate": 1.9661582280276828e-05, + "loss": 0.2885, + "step": 1925 + }, + { + "epoch": 0.11, + "grad_norm": 0.32020921506739036, + "learning_rate": 1.9661102090310157e-05, + "loss": 0.1698, + "step": 1926 + }, + { + "epoch": 0.11, + "grad_norm": 1.0554723084064312, + "learning_rate": 1.9660621565779943e-05, + "loss": 0.4549, + "step": 1927 + }, + { + "epoch": 0.11, + "grad_norm": 0.39709968758603115, + "learning_rate": 1.966014070670281e-05, + "loss": 0.2861, + "step": 1928 + }, + { + "epoch": 0.11, + "grad_norm": 0.4627118390227328, + "learning_rate": 1.965965951309543e-05, + "loss": 0.37, + "step": 1929 + }, + { + "epoch": 0.11, + "grad_norm": 0.5652502961754505, + "learning_rate": 1.965917798497445e-05, + "loss": 0.3301, + "step": 1930 + }, + { + "epoch": 0.11, + "grad_norm": 0.5014863993079642, + "learning_rate": 1.9658696122356556e-05, + "loss": 0.3489, + "step": 1931 + }, + { + "epoch": 0.11, + "grad_norm": 0.3653922709051474, + "learning_rate": 1.965821392525843e-05, + "loss": 0.2062, + "step": 1932 + }, + { + "epoch": 0.11, + "grad_norm": 0.5667706777693249, + "learning_rate": 1.9657731393696768e-05, + "loss": 0.3548, + "step": 1933 + }, + { + "epoch": 0.11, + "grad_norm": 0.36616733693133574, + "learning_rate": 1.9657248527688285e-05, + "loss": 0.2127, + "step": 1934 + }, + { + "epoch": 0.11, + "grad_norm": 1.0432472752070607, + "learning_rate": 1.9656765327249697e-05, + "loss": 0.5552, + "step": 1935 + }, + { + "epoch": 0.11, + "grad_norm": 0.5719621038132259, + "learning_rate": 1.9656281792397745e-05, + "loss": 0.4127, + "step": 1936 + }, + { + "epoch": 0.11, + "grad_norm": 0.37457017917808055, + "learning_rate": 1.965579792314917e-05, + "loss": 0.2565, + "step": 1937 + }, + { + "epoch": 0.11, + "grad_norm": 0.30681311507200504, + "learning_rate": 1.9655313719520726e-05, + "loss": 0.2221, + "step": 1938 + }, + { + "epoch": 0.11, + "grad_norm": 0.8535758857629228, + "learning_rate": 1.9654829181529186e-05, + "loss": 0.5047, + "step": 1939 + }, + { + "epoch": 0.11, + "grad_norm": 0.5719271930360567, + "learning_rate": 1.965434430919132e-05, + "loss": 0.286, + "step": 1940 + }, + { + "epoch": 0.11, + "grad_norm": 0.5563675709499672, + "learning_rate": 1.9653859102523936e-05, + "loss": 0.364, + "step": 1941 + }, + { + "epoch": 0.11, + "grad_norm": 1.3137731119716958, + "learning_rate": 1.965337356154382e-05, + "loss": 0.8021, + "step": 1942 + }, + { + "epoch": 0.11, + "grad_norm": 0.38001594373431286, + "learning_rate": 1.9652887686267795e-05, + "loss": 0.2393, + "step": 1943 + }, + { + "epoch": 0.11, + "grad_norm": 0.2948874349792371, + "learning_rate": 1.965240147671268e-05, + "loss": 0.1511, + "step": 1944 + }, + { + "epoch": 0.11, + "grad_norm": 0.6219444278818353, + "learning_rate": 1.965191493289532e-05, + "loss": 0.4022, + "step": 1945 + }, + { + "epoch": 0.11, + "grad_norm": 0.48882053571073614, + "learning_rate": 1.9651428054832562e-05, + "loss": 0.3009, + "step": 1946 + }, + { + "epoch": 0.11, + "grad_norm": 1.0667319837664195, + "learning_rate": 1.9650940842541265e-05, + "loss": 0.4638, + "step": 1947 + }, + { + "epoch": 0.11, + "grad_norm": 0.4139384433063878, + "learning_rate": 1.9650453296038302e-05, + "loss": 0.3544, + "step": 1948 + }, + { + "epoch": 0.11, + "grad_norm": 0.5010220701794562, + "learning_rate": 1.9649965415340553e-05, + "loss": 0.3336, + "step": 1949 + }, + { + "epoch": 0.11, + "grad_norm": 0.28320481878320053, + "learning_rate": 1.964947720046492e-05, + "loss": 0.1546, + "step": 1950 + }, + { + "epoch": 0.11, + "grad_norm": 1.2928573922057744, + "learning_rate": 1.9648988651428308e-05, + "loss": 0.5374, + "step": 1951 + }, + { + "epoch": 0.11, + "grad_norm": 0.48585857871631216, + "learning_rate": 1.964849976824763e-05, + "loss": 0.2929, + "step": 1952 + }, + { + "epoch": 0.11, + "grad_norm": 0.4246659305786616, + "learning_rate": 1.964801055093982e-05, + "loss": 0.3102, + "step": 1953 + }, + { + "epoch": 0.11, + "grad_norm": 1.487500029193504, + "learning_rate": 1.964752099952182e-05, + "loss": 0.8572, + "step": 1954 + }, + { + "epoch": 0.11, + "grad_norm": 0.4105755914914621, + "learning_rate": 1.9647031114010585e-05, + "loss": 0.3072, + "step": 1955 + }, + { + "epoch": 0.11, + "grad_norm": 0.3978007522631092, + "learning_rate": 1.9646540894423074e-05, + "loss": 0.2497, + "step": 1956 + }, + { + "epoch": 0.11, + "grad_norm": 1.6591002883094217, + "learning_rate": 1.964605034077627e-05, + "loss": 0.5882, + "step": 1957 + }, + { + "epoch": 0.11, + "grad_norm": 0.34811743917000226, + "learning_rate": 1.9645559453087158e-05, + "loss": 0.235, + "step": 1958 + }, + { + "epoch": 0.11, + "grad_norm": 0.6295042608541986, + "learning_rate": 1.9645068231372733e-05, + "loss": 0.426, + "step": 1959 + }, + { + "epoch": 0.11, + "grad_norm": 0.3943625001062919, + "learning_rate": 1.9644576675650012e-05, + "loss": 0.2832, + "step": 1960 + }, + { + "epoch": 0.11, + "grad_norm": 0.4604908836391139, + "learning_rate": 1.9644084785936014e-05, + "loss": 0.3313, + "step": 1961 + }, + { + "epoch": 0.11, + "grad_norm": 1.0978664581234787, + "learning_rate": 1.9643592562247776e-05, + "loss": 0.5663, + "step": 1962 + }, + { + "epoch": 0.11, + "grad_norm": 0.5053998885488958, + "learning_rate": 1.964310000460234e-05, + "loss": 0.2447, + "step": 1963 + }, + { + "epoch": 0.11, + "grad_norm": 0.4433806067056563, + "learning_rate": 1.964260711301677e-05, + "loss": 0.2844, + "step": 1964 + }, + { + "epoch": 0.11, + "grad_norm": 0.5933816056127612, + "learning_rate": 1.9642113887508127e-05, + "loss": 0.4007, + "step": 1965 + }, + { + "epoch": 0.11, + "grad_norm": 0.9395519950185571, + "learning_rate": 1.9641620328093496e-05, + "loss": 0.5341, + "step": 1966 + }, + { + "epoch": 0.11, + "grad_norm": 0.6664142487597251, + "learning_rate": 1.964112643478997e-05, + "loss": 0.3412, + "step": 1967 + }, + { + "epoch": 0.11, + "grad_norm": 0.5458830028835255, + "learning_rate": 1.9640632207614647e-05, + "loss": 0.3502, + "step": 1968 + }, + { + "epoch": 0.11, + "grad_norm": 0.4650976263027919, + "learning_rate": 1.9640137646584646e-05, + "loss": 0.3399, + "step": 1969 + }, + { + "epoch": 0.11, + "grad_norm": 0.2908370498165278, + "learning_rate": 1.963964275171709e-05, + "loss": 0.1562, + "step": 1970 + }, + { + "epoch": 0.11, + "grad_norm": 0.7007220925956545, + "learning_rate": 1.9639147523029125e-05, + "loss": 0.4742, + "step": 1971 + }, + { + "epoch": 0.11, + "grad_norm": 0.5483497042322073, + "learning_rate": 1.963865196053789e-05, + "loss": 0.3736, + "step": 1972 + }, + { + "epoch": 0.11, + "grad_norm": 0.42343628698039426, + "learning_rate": 1.9638156064260555e-05, + "loss": 0.1913, + "step": 1973 + }, + { + "epoch": 0.11, + "grad_norm": 0.4601079222814512, + "learning_rate": 1.9637659834214294e-05, + "loss": 0.4321, + "step": 1974 + }, + { + "epoch": 0.11, + "grad_norm": 1.3014578591026609, + "learning_rate": 1.9637163270416283e-05, + "loss": 0.7311, + "step": 1975 + }, + { + "epoch": 0.11, + "grad_norm": 0.31662376400744924, + "learning_rate": 1.9636666372883722e-05, + "loss": 0.2132, + "step": 1976 + }, + { + "epoch": 0.11, + "grad_norm": 0.423523284773505, + "learning_rate": 1.963616914163382e-05, + "loss": 0.2908, + "step": 1977 + }, + { + "epoch": 0.11, + "grad_norm": 1.229874934088239, + "learning_rate": 1.9635671576683798e-05, + "loss": 0.7617, + "step": 1978 + }, + { + "epoch": 0.11, + "grad_norm": 0.4814629313067314, + "learning_rate": 1.9635173678050878e-05, + "loss": 0.3127, + "step": 1979 + }, + { + "epoch": 0.11, + "grad_norm": 0.5001839315639627, + "learning_rate": 1.963467544575231e-05, + "loss": 0.3492, + "step": 1980 + }, + { + "epoch": 0.11, + "grad_norm": 0.5230200200146033, + "learning_rate": 1.9634176879805347e-05, + "loss": 0.3975, + "step": 1981 + }, + { + "epoch": 0.11, + "grad_norm": 0.31796381358669257, + "learning_rate": 1.9633677980227254e-05, + "loss": 0.2032, + "step": 1982 + }, + { + "epoch": 0.11, + "grad_norm": 0.562655816237142, + "learning_rate": 1.96331787470353e-05, + "loss": 0.3209, + "step": 1983 + }, + { + "epoch": 0.11, + "grad_norm": 0.3877665388095057, + "learning_rate": 1.9632679180246787e-05, + "loss": 0.3566, + "step": 1984 + }, + { + "epoch": 0.11, + "grad_norm": 0.5555037398364667, + "learning_rate": 1.9632179279879006e-05, + "loss": 0.4005, + "step": 1985 + }, + { + "epoch": 0.11, + "grad_norm": 0.4124910862080048, + "learning_rate": 1.963167904594927e-05, + "loss": 0.3273, + "step": 1986 + }, + { + "epoch": 0.11, + "grad_norm": 0.6000884096195452, + "learning_rate": 1.9631178478474905e-05, + "loss": 0.3107, + "step": 1987 + }, + { + "epoch": 0.11, + "grad_norm": 0.4283814322598163, + "learning_rate": 1.9630677577473242e-05, + "loss": 0.3005, + "step": 1988 + }, + { + "epoch": 0.11, + "grad_norm": 0.4370673442250718, + "learning_rate": 1.963017634296163e-05, + "loss": 0.2721, + "step": 1989 + }, + { + "epoch": 0.11, + "grad_norm": 0.4521900634802621, + "learning_rate": 1.9629674774957425e-05, + "loss": 0.3517, + "step": 1990 + }, + { + "epoch": 0.11, + "grad_norm": 0.5563225794926104, + "learning_rate": 1.9629172873477995e-05, + "loss": 0.3998, + "step": 1991 + }, + { + "epoch": 0.11, + "grad_norm": 0.4409425742430659, + "learning_rate": 1.9628670638540722e-05, + "loss": 0.2862, + "step": 1992 + }, + { + "epoch": 0.11, + "grad_norm": 1.3474166819860802, + "learning_rate": 1.9628168070163e-05, + "loss": 0.6459, + "step": 1993 + }, + { + "epoch": 0.11, + "grad_norm": 0.6413772554148557, + "learning_rate": 1.9627665168362234e-05, + "loss": 0.4017, + "step": 1994 + }, + { + "epoch": 0.11, + "grad_norm": 0.4979080455824859, + "learning_rate": 1.9627161933155833e-05, + "loss": 0.3773, + "step": 1995 + }, + { + "epoch": 0.11, + "grad_norm": 0.47734635309643114, + "learning_rate": 1.962665836456123e-05, + "loss": 0.3279, + "step": 1996 + }, + { + "epoch": 0.11, + "grad_norm": 0.4702257531545852, + "learning_rate": 1.9626154462595863e-05, + "loss": 0.2722, + "step": 1997 + }, + { + "epoch": 0.11, + "grad_norm": 0.36757418615763193, + "learning_rate": 1.9625650227277182e-05, + "loss": 0.2572, + "step": 1998 + }, + { + "epoch": 0.11, + "grad_norm": 2.0698780573684648, + "learning_rate": 1.9625145658622644e-05, + "loss": 0.3931, + "step": 1999 + }, + { + "epoch": 0.11, + "grad_norm": 0.4586305864242774, + "learning_rate": 1.962464075664973e-05, + "loss": 0.2985, + "step": 2000 + }, + { + "epoch": 0.11, + "grad_norm": 0.4799034083760339, + "learning_rate": 1.9624135521375914e-05, + "loss": 0.345, + "step": 2001 + }, + { + "epoch": 0.12, + "grad_norm": 0.8087456132963655, + "learning_rate": 1.9623629952818705e-05, + "loss": 0.4573, + "step": 2002 + }, + { + "epoch": 0.12, + "grad_norm": 0.273169211369182, + "learning_rate": 1.9623124050995603e-05, + "loss": 0.1335, + "step": 2003 + }, + { + "epoch": 0.12, + "grad_norm": 0.44311381479955264, + "learning_rate": 1.9622617815924125e-05, + "loss": 0.3134, + "step": 2004 + }, + { + "epoch": 0.12, + "grad_norm": 4.874639231040764, + "learning_rate": 1.962211124762181e-05, + "loss": 0.5349, + "step": 2005 + }, + { + "epoch": 0.12, + "grad_norm": 0.910928440837772, + "learning_rate": 1.9621604346106197e-05, + "loss": 0.5488, + "step": 2006 + }, + { + "epoch": 0.12, + "grad_norm": 0.730448021870774, + "learning_rate": 1.9621097111394837e-05, + "loss": 0.334, + "step": 2007 + }, + { + "epoch": 0.12, + "grad_norm": 0.8847582506626019, + "learning_rate": 1.9620589543505297e-05, + "loss": 0.4115, + "step": 2008 + }, + { + "epoch": 0.12, + "grad_norm": 0.26085846379198124, + "learning_rate": 1.9620081642455155e-05, + "loss": 0.109, + "step": 2009 + }, + { + "epoch": 0.12, + "grad_norm": 0.37726397681575125, + "learning_rate": 1.9619573408262004e-05, + "loss": 0.2366, + "step": 2010 + }, + { + "epoch": 0.12, + "grad_norm": 8.216495069611042, + "learning_rate": 1.9619064840943432e-05, + "loss": 0.8289, + "step": 2011 + }, + { + "epoch": 0.12, + "grad_norm": 0.4878648634362295, + "learning_rate": 1.9618555940517062e-05, + "loss": 0.2654, + "step": 2012 + }, + { + "epoch": 0.12, + "grad_norm": 8.093933887287843, + "learning_rate": 1.9618046707000515e-05, + "loss": 0.4438, + "step": 2013 + }, + { + "epoch": 0.12, + "grad_norm": 7.040222786715266, + "learning_rate": 1.9617537140411423e-05, + "loss": 0.6686, + "step": 2014 + }, + { + "epoch": 0.12, + "grad_norm": 1.4288877331493517, + "learning_rate": 1.961702724076743e-05, + "loss": 0.2278, + "step": 2015 + }, + { + "epoch": 0.12, + "grad_norm": 0.609857521709017, + "learning_rate": 1.96165170080862e-05, + "loss": 0.3412, + "step": 2016 + }, + { + "epoch": 0.12, + "grad_norm": 2.3849639020833395, + "learning_rate": 1.9616006442385403e-05, + "loss": 0.5256, + "step": 2017 + }, + { + "epoch": 0.12, + "grad_norm": 1.0769342424347998, + "learning_rate": 1.9615495543682712e-05, + "loss": 0.3283, + "step": 2018 + }, + { + "epoch": 0.12, + "grad_norm": 1.679860697691885, + "learning_rate": 1.9614984311995825e-05, + "loss": 0.4145, + "step": 2019 + }, + { + "epoch": 0.12, + "grad_norm": 0.7572448145592617, + "learning_rate": 1.9614472747342445e-05, + "loss": 0.4164, + "step": 2020 + }, + { + "epoch": 0.12, + "grad_norm": 1.0369229362139345, + "learning_rate": 1.9613960849740284e-05, + "loss": 0.4782, + "step": 2021 + }, + { + "epoch": 0.12, + "grad_norm": 0.43789872014124714, + "learning_rate": 1.9613448619207077e-05, + "loss": 0.172, + "step": 2022 + }, + { + "epoch": 0.12, + "grad_norm": 0.8217791474880007, + "learning_rate": 1.9612936055760557e-05, + "loss": 0.4245, + "step": 2023 + }, + { + "epoch": 0.12, + "grad_norm": 0.7758649033699128, + "learning_rate": 1.9612423159418474e-05, + "loss": 0.4332, + "step": 2024 + }, + { + "epoch": 0.12, + "grad_norm": 0.5558536412840944, + "learning_rate": 1.9611909930198588e-05, + "loss": 0.305, + "step": 2025 + }, + { + "epoch": 0.12, + "grad_norm": 2.763245662074697, + "learning_rate": 1.9611396368118675e-05, + "loss": 0.68, + "step": 2026 + }, + { + "epoch": 0.12, + "grad_norm": 0.6710808942620035, + "learning_rate": 1.961088247319652e-05, + "loss": 0.4107, + "step": 2027 + }, + { + "epoch": 0.12, + "grad_norm": 0.36096294284561037, + "learning_rate": 1.961036824544992e-05, + "loss": 0.2201, + "step": 2028 + }, + { + "epoch": 0.12, + "grad_norm": 0.5810650256593232, + "learning_rate": 1.960985368489668e-05, + "loss": 0.2909, + "step": 2029 + }, + { + "epoch": 0.12, + "grad_norm": 1.5690149812729766, + "learning_rate": 1.9609338791554623e-05, + "loss": 0.5314, + "step": 2030 + }, + { + "epoch": 0.12, + "grad_norm": 0.6393362197055744, + "learning_rate": 1.960882356544157e-05, + "loss": 0.2609, + "step": 2031 + }, + { + "epoch": 0.12, + "grad_norm": 0.6976100461836577, + "learning_rate": 1.9608308006575373e-05, + "loss": 0.4226, + "step": 2032 + }, + { + "epoch": 0.12, + "grad_norm": 1.479240310964179, + "learning_rate": 1.9607792114973884e-05, + "loss": 0.5768, + "step": 2033 + }, + { + "epoch": 0.12, + "grad_norm": 0.3668310724900021, + "learning_rate": 1.9607275890654967e-05, + "loss": 0.2626, + "step": 2034 + }, + { + "epoch": 0.12, + "grad_norm": 2.857454535850804, + "learning_rate": 1.9606759333636498e-05, + "loss": 0.2436, + "step": 2035 + }, + { + "epoch": 0.12, + "grad_norm": 0.9569480593524808, + "learning_rate": 1.9606242443936368e-05, + "loss": 0.4232, + "step": 2036 + }, + { + "epoch": 0.12, + "grad_norm": 0.5653570617669198, + "learning_rate": 1.9605725221572475e-05, + "loss": 0.3075, + "step": 2037 + }, + { + "epoch": 0.12, + "grad_norm": 1.2376985969066299, + "learning_rate": 1.960520766656273e-05, + "loss": 0.594, + "step": 2038 + }, + { + "epoch": 0.12, + "grad_norm": 0.6424647493461252, + "learning_rate": 1.9604689778925056e-05, + "loss": 0.3995, + "step": 2039 + }, + { + "epoch": 0.12, + "grad_norm": 0.4235494564972714, + "learning_rate": 1.960417155867739e-05, + "loss": 0.3047, + "step": 2040 + }, + { + "epoch": 0.12, + "grad_norm": 0.5912471531591597, + "learning_rate": 1.960365300583767e-05, + "loss": 0.2393, + "step": 2041 + }, + { + "epoch": 0.12, + "grad_norm": 1.029688638479728, + "learning_rate": 1.960313412042387e-05, + "loss": 0.5656, + "step": 2042 + }, + { + "epoch": 0.12, + "grad_norm": 0.4732960711478903, + "learning_rate": 1.960261490245394e-05, + "loss": 0.3001, + "step": 2043 + }, + { + "epoch": 0.12, + "grad_norm": 0.5220022647134588, + "learning_rate": 1.9602095351945872e-05, + "loss": 0.3535, + "step": 2044 + }, + { + "epoch": 0.12, + "grad_norm": 1.5883266373730363, + "learning_rate": 1.9601575468917654e-05, + "loss": 0.4899, + "step": 2045 + }, + { + "epoch": 0.12, + "grad_norm": 0.3387809210650048, + "learning_rate": 1.9601055253387292e-05, + "loss": 0.2403, + "step": 2046 + }, + { + "epoch": 0.12, + "grad_norm": 0.5623556957444193, + "learning_rate": 1.9600534705372795e-05, + "loss": 0.3546, + "step": 2047 + }, + { + "epoch": 0.12, + "grad_norm": 0.5773352363021086, + "learning_rate": 1.9600013824892198e-05, + "loss": 0.3738, + "step": 2048 + }, + { + "epoch": 0.12, + "grad_norm": 0.4232267894991952, + "learning_rate": 1.9599492611963533e-05, + "loss": 0.29, + "step": 2049 + }, + { + "epoch": 0.12, + "grad_norm": 1.5190140255118185, + "learning_rate": 1.9598971066604854e-05, + "loss": 0.818, + "step": 2050 + }, + { + "epoch": 0.12, + "grad_norm": 0.6714832479028133, + "learning_rate": 1.9598449188834218e-05, + "loss": 0.314, + "step": 2051 + }, + { + "epoch": 0.12, + "grad_norm": 0.44430442128339687, + "learning_rate": 1.95979269786697e-05, + "loss": 0.2978, + "step": 2052 + }, + { + "epoch": 0.12, + "grad_norm": 0.9537546657131378, + "learning_rate": 1.959740443612938e-05, + "loss": 0.5501, + "step": 2053 + }, + { + "epoch": 0.12, + "grad_norm": 0.2242161983506829, + "learning_rate": 1.9596881561231363e-05, + "loss": 0.1592, + "step": 2054 + }, + { + "epoch": 0.12, + "grad_norm": 0.48220806957042844, + "learning_rate": 1.9596358353993747e-05, + "loss": 0.3105, + "step": 2055 + }, + { + "epoch": 0.12, + "grad_norm": 0.6976369230595237, + "learning_rate": 1.959583481443465e-05, + "loss": 0.4052, + "step": 2056 + }, + { + "epoch": 0.12, + "grad_norm": 1.8327429933334323, + "learning_rate": 1.9595310942572212e-05, + "loss": 0.768, + "step": 2057 + }, + { + "epoch": 0.12, + "grad_norm": 0.44394301554498045, + "learning_rate": 1.9594786738424566e-05, + "loss": 0.2232, + "step": 2058 + }, + { + "epoch": 0.12, + "grad_norm": 0.7342598439395362, + "learning_rate": 1.959426220200987e-05, + "loss": 0.4006, + "step": 2059 + }, + { + "epoch": 0.12, + "grad_norm": 0.427795716449992, + "learning_rate": 1.9593737333346286e-05, + "loss": 0.2239, + "step": 2060 + }, + { + "epoch": 0.12, + "grad_norm": 0.4343683489018406, + "learning_rate": 1.9593212132451992e-05, + "loss": 0.2375, + "step": 2061 + }, + { + "epoch": 0.12, + "grad_norm": 1.6866265621347991, + "learning_rate": 1.959268659934517e-05, + "loss": 0.7592, + "step": 2062 + }, + { + "epoch": 0.12, + "grad_norm": 0.46897540943464006, + "learning_rate": 1.9592160734044027e-05, + "loss": 0.3492, + "step": 2063 + }, + { + "epoch": 0.12, + "grad_norm": 0.46889413154603504, + "learning_rate": 1.9591634536566766e-05, + "loss": 0.271, + "step": 2064 + }, + { + "epoch": 0.12, + "grad_norm": 1.0019504066133773, + "learning_rate": 1.9591108006931618e-05, + "loss": 0.5227, + "step": 2065 + }, + { + "epoch": 0.12, + "grad_norm": 0.4826247658886829, + "learning_rate": 1.9590581145156812e-05, + "loss": 0.2342, + "step": 2066 + }, + { + "epoch": 0.12, + "grad_norm": 0.46222434250869515, + "learning_rate": 1.959005395126059e-05, + "loss": 0.2521, + "step": 2067 + }, + { + "epoch": 0.12, + "grad_norm": 0.5843926885025996, + "learning_rate": 1.9589526425261213e-05, + "loss": 0.4092, + "step": 2068 + }, + { + "epoch": 0.12, + "grad_norm": 0.8389100016932158, + "learning_rate": 1.958899856717695e-05, + "loss": 0.5575, + "step": 2069 + }, + { + "epoch": 0.12, + "grad_norm": 0.4238709325497757, + "learning_rate": 1.958847037702608e-05, + "loss": 0.334, + "step": 2070 + }, + { + "epoch": 0.12, + "grad_norm": 0.4507205040580907, + "learning_rate": 1.9587941854826892e-05, + "loss": 0.3119, + "step": 2071 + }, + { + "epoch": 0.12, + "grad_norm": 0.47985631218897673, + "learning_rate": 1.9587413000597687e-05, + "loss": 0.3043, + "step": 2072 + }, + { + "epoch": 0.12, + "grad_norm": 0.7661228750951127, + "learning_rate": 1.9586883814356785e-05, + "loss": 0.4541, + "step": 2073 + }, + { + "epoch": 0.12, + "grad_norm": 0.3672906869250385, + "learning_rate": 1.958635429612251e-05, + "loss": 0.2207, + "step": 2074 + }, + { + "epoch": 0.12, + "grad_norm": 0.4746221052006005, + "learning_rate": 1.9585824445913194e-05, + "loss": 0.3761, + "step": 2075 + }, + { + "epoch": 0.12, + "grad_norm": 0.4915279326174938, + "learning_rate": 1.9585294263747192e-05, + "loss": 0.3161, + "step": 2076 + }, + { + "epoch": 0.12, + "grad_norm": 0.49898984543053004, + "learning_rate": 1.9584763749642862e-05, + "loss": 0.3595, + "step": 2077 + }, + { + "epoch": 0.12, + "grad_norm": 0.5341824181711823, + "learning_rate": 1.9584232903618576e-05, + "loss": 0.3403, + "step": 2078 + }, + { + "epoch": 0.12, + "grad_norm": 0.38017236195959025, + "learning_rate": 1.9583701725692716e-05, + "loss": 0.3137, + "step": 2079 + }, + { + "epoch": 0.12, + "grad_norm": 0.32745929971134763, + "learning_rate": 1.9583170215883677e-05, + "loss": 0.2298, + "step": 2080 + }, + { + "epoch": 0.12, + "grad_norm": 0.8625293763295988, + "learning_rate": 1.9582638374209864e-05, + "loss": 0.5644, + "step": 2081 + }, + { + "epoch": 0.12, + "grad_norm": 0.4296796221626104, + "learning_rate": 1.9582106200689698e-05, + "loss": 0.3539, + "step": 2082 + }, + { + "epoch": 0.12, + "grad_norm": 0.43436138803461416, + "learning_rate": 1.9581573695341607e-05, + "loss": 0.3427, + "step": 2083 + }, + { + "epoch": 0.12, + "grad_norm": 0.9284517598696714, + "learning_rate": 1.9581040858184028e-05, + "loss": 0.3858, + "step": 2084 + }, + { + "epoch": 0.12, + "grad_norm": 0.38693079051285023, + "learning_rate": 1.958050768923542e-05, + "loss": 0.3074, + "step": 2085 + }, + { + "epoch": 0.12, + "grad_norm": 0.4582866470000151, + "learning_rate": 1.957997418851424e-05, + "loss": 0.2932, + "step": 2086 + }, + { + "epoch": 0.12, + "grad_norm": 0.7347478591787949, + "learning_rate": 1.9579440356038966e-05, + "loss": 0.3689, + "step": 2087 + }, + { + "epoch": 0.12, + "grad_norm": 0.3424729743541756, + "learning_rate": 1.9578906191828086e-05, + "loss": 0.2313, + "step": 2088 + }, + { + "epoch": 0.12, + "grad_norm": 0.6525889969280737, + "learning_rate": 1.9578371695900097e-05, + "loss": 0.4339, + "step": 2089 + }, + { + "epoch": 0.12, + "grad_norm": 0.49133362050474455, + "learning_rate": 1.9577836868273504e-05, + "loss": 0.3199, + "step": 2090 + }, + { + "epoch": 0.12, + "grad_norm": 0.46103788217869224, + "learning_rate": 1.9577301708966837e-05, + "loss": 0.3133, + "step": 2091 + }, + { + "epoch": 0.12, + "grad_norm": 0.3968124504129748, + "learning_rate": 1.9576766217998622e-05, + "loss": 0.2731, + "step": 2092 + }, + { + "epoch": 0.12, + "grad_norm": 0.5598574134391687, + "learning_rate": 1.9576230395387403e-05, + "loss": 0.3231, + "step": 2093 + }, + { + "epoch": 0.12, + "grad_norm": 0.507072225088975, + "learning_rate": 1.9575694241151737e-05, + "loss": 0.3042, + "step": 2094 + }, + { + "epoch": 0.12, + "grad_norm": 0.4229706741547905, + "learning_rate": 1.9575157755310193e-05, + "loss": 0.3297, + "step": 2095 + }, + { + "epoch": 0.12, + "grad_norm": 1.2787555528563692, + "learning_rate": 1.957462093788135e-05, + "loss": 0.677, + "step": 2096 + }, + { + "epoch": 0.12, + "grad_norm": 0.40537782087377044, + "learning_rate": 1.957408378888379e-05, + "loss": 0.245, + "step": 2097 + }, + { + "epoch": 0.12, + "grad_norm": 0.6392618964501624, + "learning_rate": 1.957354630833612e-05, + "loss": 0.4378, + "step": 2098 + }, + { + "epoch": 0.12, + "grad_norm": 0.44083595616310706, + "learning_rate": 1.957300849625696e-05, + "loss": 0.3578, + "step": 2099 + }, + { + "epoch": 0.12, + "grad_norm": 0.277591819404161, + "learning_rate": 1.9572470352664923e-05, + "loss": 0.1437, + "step": 2100 + }, + { + "epoch": 0.12, + "grad_norm": 0.6445522078777529, + "learning_rate": 1.957193187757865e-05, + "loss": 0.4184, + "step": 2101 + }, + { + "epoch": 0.12, + "grad_norm": 1.06003956945609, + "learning_rate": 1.9571393071016785e-05, + "loss": 0.7141, + "step": 2102 + }, + { + "epoch": 0.12, + "grad_norm": 0.3327901938604386, + "learning_rate": 1.9570853932997993e-05, + "loss": 0.2564, + "step": 2103 + }, + { + "epoch": 0.12, + "grad_norm": 0.5349142642984711, + "learning_rate": 1.957031446354094e-05, + "loss": 0.4241, + "step": 2104 + }, + { + "epoch": 0.12, + "grad_norm": 0.7450362254049, + "learning_rate": 1.9569774662664306e-05, + "loss": 0.5647, + "step": 2105 + }, + { + "epoch": 0.12, + "grad_norm": 0.2616609861296739, + "learning_rate": 1.9569234530386792e-05, + "loss": 0.1297, + "step": 2106 + }, + { + "epoch": 0.12, + "grad_norm": 0.4211284765636636, + "learning_rate": 1.9568694066727095e-05, + "loss": 0.311, + "step": 2107 + }, + { + "epoch": 0.12, + "grad_norm": 1.4618973142603389, + "learning_rate": 1.9568153271703932e-05, + "loss": 0.8132, + "step": 2108 + }, + { + "epoch": 0.12, + "grad_norm": 0.738279974487151, + "learning_rate": 1.9567612145336036e-05, + "loss": 0.4929, + "step": 2109 + }, + { + "epoch": 0.12, + "grad_norm": 0.44960921087855577, + "learning_rate": 1.9567070687642142e-05, + "loss": 0.317, + "step": 2110 + }, + { + "epoch": 0.12, + "grad_norm": 0.4821347287488108, + "learning_rate": 1.9566528898641e-05, + "loss": 0.3665, + "step": 2111 + }, + { + "epoch": 0.12, + "grad_norm": 0.32569543755682334, + "learning_rate": 1.9565986778351376e-05, + "loss": 0.1666, + "step": 2112 + }, + { + "epoch": 0.12, + "grad_norm": 0.48347428505913215, + "learning_rate": 1.9565444326792038e-05, + "loss": 0.2783, + "step": 2113 + }, + { + "epoch": 0.12, + "grad_norm": 1.5037634230328805, + "learning_rate": 1.9564901543981776e-05, + "loss": 0.7917, + "step": 2114 + }, + { + "epoch": 0.12, + "grad_norm": 0.3697719835295979, + "learning_rate": 1.9564358429939386e-05, + "loss": 0.3053, + "step": 2115 + }, + { + "epoch": 0.12, + "grad_norm": 0.5447348890954887, + "learning_rate": 1.9563814984683674e-05, + "loss": 0.3506, + "step": 2116 + }, + { + "epoch": 0.12, + "grad_norm": 1.304263404947526, + "learning_rate": 1.9563271208233462e-05, + "loss": 0.8144, + "step": 2117 + }, + { + "epoch": 0.12, + "grad_norm": 0.3159438724391187, + "learning_rate": 1.9562727100607577e-05, + "loss": 0.1621, + "step": 2118 + }, + { + "epoch": 0.12, + "grad_norm": 0.41587057997986904, + "learning_rate": 1.956218266182486e-05, + "loss": 0.2664, + "step": 2119 + }, + { + "epoch": 0.12, + "grad_norm": 1.0253769563866983, + "learning_rate": 1.9561637891904176e-05, + "loss": 0.568, + "step": 2120 + }, + { + "epoch": 0.12, + "grad_norm": 0.6434762446373379, + "learning_rate": 1.9561092790864376e-05, + "loss": 0.4318, + "step": 2121 + }, + { + "epoch": 0.12, + "grad_norm": 0.5164150783625712, + "learning_rate": 1.956054735872435e-05, + "loss": 0.3499, + "step": 2122 + }, + { + "epoch": 0.12, + "grad_norm": 0.437135421408849, + "learning_rate": 1.9560001595502978e-05, + "loss": 0.3035, + "step": 2123 + }, + { + "epoch": 0.12, + "grad_norm": 0.3042150377766891, + "learning_rate": 1.955945550121916e-05, + "loss": 0.1825, + "step": 2124 + }, + { + "epoch": 0.12, + "grad_norm": 0.506806034858422, + "learning_rate": 1.9558909075891812e-05, + "loss": 0.3272, + "step": 2125 + }, + { + "epoch": 0.12, + "grad_norm": 0.47732056135622875, + "learning_rate": 1.955836231953985e-05, + "loss": 0.3269, + "step": 2126 + }, + { + "epoch": 0.12, + "grad_norm": 0.968804033210661, + "learning_rate": 1.9557815232182216e-05, + "loss": 0.4565, + "step": 2127 + }, + { + "epoch": 0.12, + "grad_norm": 0.3925570608975105, + "learning_rate": 1.9557267813837848e-05, + "loss": 0.3436, + "step": 2128 + }, + { + "epoch": 0.12, + "grad_norm": 0.9226390952793319, + "learning_rate": 1.955672006452571e-05, + "loss": 0.4685, + "step": 2129 + }, + { + "epoch": 0.12, + "grad_norm": 0.4344185349778874, + "learning_rate": 1.9556171984264765e-05, + "loss": 0.2778, + "step": 2130 + }, + { + "epoch": 0.12, + "grad_norm": 0.3822100632725823, + "learning_rate": 1.9555623573073994e-05, + "loss": 0.2504, + "step": 2131 + }, + { + "epoch": 0.12, + "grad_norm": 1.166974059789117, + "learning_rate": 1.955507483097239e-05, + "loss": 0.548, + "step": 2132 + }, + { + "epoch": 0.12, + "grad_norm": 0.7183519526044224, + "learning_rate": 1.9554525757978958e-05, + "loss": 0.4007, + "step": 2133 + }, + { + "epoch": 0.12, + "grad_norm": 0.40625089003812276, + "learning_rate": 1.955397635411271e-05, + "loss": 0.3013, + "step": 2134 + }, + { + "epoch": 0.12, + "grad_norm": 0.6427276061652427, + "learning_rate": 1.955342661939267e-05, + "loss": 0.4044, + "step": 2135 + }, + { + "epoch": 0.12, + "grad_norm": 0.7481016203756692, + "learning_rate": 1.9552876553837878e-05, + "loss": 0.2465, + "step": 2136 + }, + { + "epoch": 0.12, + "grad_norm": 0.4078942359314576, + "learning_rate": 1.955232615746738e-05, + "loss": 0.2344, + "step": 2137 + }, + { + "epoch": 0.12, + "grad_norm": 0.9841112838622158, + "learning_rate": 1.9551775430300238e-05, + "loss": 0.4383, + "step": 2138 + }, + { + "epoch": 0.12, + "grad_norm": 0.5783608548387257, + "learning_rate": 1.9551224372355523e-05, + "loss": 0.2947, + "step": 2139 + }, + { + "epoch": 0.12, + "grad_norm": 0.513156088021875, + "learning_rate": 1.955067298365232e-05, + "loss": 0.3523, + "step": 2140 + }, + { + "epoch": 0.12, + "grad_norm": 1.4460222044906592, + "learning_rate": 1.9550121264209724e-05, + "loss": 0.8275, + "step": 2141 + }, + { + "epoch": 0.12, + "grad_norm": 0.47796263241095566, + "learning_rate": 1.9549569214046837e-05, + "loss": 0.261, + "step": 2142 + }, + { + "epoch": 0.12, + "grad_norm": 0.5068551991890565, + "learning_rate": 1.954901683318278e-05, + "loss": 0.2742, + "step": 2143 + }, + { + "epoch": 0.12, + "grad_norm": 0.7854309469545647, + "learning_rate": 1.9548464121636678e-05, + "loss": 0.39, + "step": 2144 + }, + { + "epoch": 0.12, + "grad_norm": 0.9562992698117687, + "learning_rate": 1.954791107942768e-05, + "loss": 0.4997, + "step": 2145 + }, + { + "epoch": 0.12, + "grad_norm": 0.49884170414180384, + "learning_rate": 1.9547357706574926e-05, + "loss": 0.2523, + "step": 2146 + }, + { + "epoch": 0.12, + "grad_norm": 0.43289131925381763, + "learning_rate": 1.9546804003097588e-05, + "loss": 0.3589, + "step": 2147 + }, + { + "epoch": 0.12, + "grad_norm": 1.1386748667131321, + "learning_rate": 1.9546249969014836e-05, + "loss": 0.673, + "step": 2148 + }, + { + "epoch": 0.12, + "grad_norm": 0.478383355031457, + "learning_rate": 1.954569560434586e-05, + "loss": 0.283, + "step": 2149 + }, + { + "epoch": 0.12, + "grad_norm": 0.5539994701491447, + "learning_rate": 1.9545140909109854e-05, + "loss": 0.3762, + "step": 2150 + }, + { + "epoch": 0.12, + "grad_norm": 0.35583342252654265, + "learning_rate": 1.954458588332603e-05, + "loss": 0.168, + "step": 2151 + }, + { + "epoch": 0.12, + "grad_norm": 0.47482346170901524, + "learning_rate": 1.9544030527013603e-05, + "loss": 0.236, + "step": 2152 + }, + { + "epoch": 0.12, + "grad_norm": 1.488695547220567, + "learning_rate": 1.9543474840191817e-05, + "loss": 0.7431, + "step": 2153 + }, + { + "epoch": 0.12, + "grad_norm": 0.4557534247583365, + "learning_rate": 1.9542918822879902e-05, + "loss": 0.3779, + "step": 2154 + }, + { + "epoch": 0.12, + "grad_norm": 0.33288832250828193, + "learning_rate": 1.954236247509712e-05, + "loss": 0.2592, + "step": 2155 + }, + { + "epoch": 0.12, + "grad_norm": 0.4796173843723215, + "learning_rate": 1.954180579686274e-05, + "loss": 0.3337, + "step": 2156 + }, + { + "epoch": 0.12, + "grad_norm": 0.4089067855760762, + "learning_rate": 1.954124878819603e-05, + "loss": 0.2759, + "step": 2157 + }, + { + "epoch": 0.12, + "grad_norm": 0.4690219866747868, + "learning_rate": 1.9540691449116286e-05, + "loss": 0.2872, + "step": 2158 + }, + { + "epoch": 0.12, + "grad_norm": 0.445945068109367, + "learning_rate": 1.9540133779642806e-05, + "loss": 0.3255, + "step": 2159 + }, + { + "epoch": 0.12, + "grad_norm": 0.620237870223999, + "learning_rate": 1.9539575779794906e-05, + "loss": 0.4513, + "step": 2160 + }, + { + "epoch": 0.12, + "grad_norm": 0.4622798158558348, + "learning_rate": 1.9539017449591905e-05, + "loss": 0.3245, + "step": 2161 + }, + { + "epoch": 0.12, + "grad_norm": 0.40301614161542826, + "learning_rate": 1.9538458789053143e-05, + "loss": 0.3361, + "step": 2162 + }, + { + "epoch": 0.12, + "grad_norm": 0.4533281560660749, + "learning_rate": 1.9537899798197963e-05, + "loss": 0.2963, + "step": 2163 + }, + { + "epoch": 0.12, + "grad_norm": 0.4209930222368525, + "learning_rate": 1.953734047704572e-05, + "loss": 0.281, + "step": 2164 + }, + { + "epoch": 0.12, + "grad_norm": 0.36565295658400504, + "learning_rate": 1.9536780825615788e-05, + "loss": 0.2151, + "step": 2165 + }, + { + "epoch": 0.12, + "grad_norm": 0.4422205941031057, + "learning_rate": 1.9536220843927544e-05, + "loss": 0.3669, + "step": 2166 + }, + { + "epoch": 0.12, + "grad_norm": 0.4385665259665185, + "learning_rate": 1.953566053200039e-05, + "loss": 0.3155, + "step": 2167 + }, + { + "epoch": 0.12, + "grad_norm": 0.6424769906790054, + "learning_rate": 1.953509988985371e-05, + "loss": 0.4169, + "step": 2168 + }, + { + "epoch": 0.12, + "grad_norm": 1.194460975030824, + "learning_rate": 1.953453891750694e-05, + "loss": 0.7403, + "step": 2169 + }, + { + "epoch": 0.12, + "grad_norm": 0.3717051558823366, + "learning_rate": 1.9533977614979493e-05, + "loss": 0.2938, + "step": 2170 + }, + { + "epoch": 0.12, + "grad_norm": 0.3201587539393018, + "learning_rate": 1.9533415982290813e-05, + "loss": 0.2192, + "step": 2171 + }, + { + "epoch": 0.12, + "grad_norm": 0.7368872184262916, + "learning_rate": 1.9532854019460346e-05, + "loss": 0.4229, + "step": 2172 + }, + { + "epoch": 0.12, + "grad_norm": 0.44181930693740384, + "learning_rate": 1.9532291726507557e-05, + "loss": 0.2848, + "step": 2173 + }, + { + "epoch": 0.12, + "grad_norm": 0.4664201908407062, + "learning_rate": 1.9531729103451912e-05, + "loss": 0.35, + "step": 2174 + }, + { + "epoch": 0.12, + "grad_norm": 1.454446688689338, + "learning_rate": 1.9531166150312902e-05, + "loss": 0.605, + "step": 2175 + }, + { + "epoch": 0.13, + "grad_norm": 0.46776768368543914, + "learning_rate": 1.9530602867110015e-05, + "loss": 0.2971, + "step": 2176 + }, + { + "epoch": 0.13, + "grad_norm": 0.3477231628548521, + "learning_rate": 1.953003925386276e-05, + "loss": 0.2378, + "step": 2177 + }, + { + "epoch": 0.13, + "grad_norm": 0.5094617271180392, + "learning_rate": 1.9529475310590656e-05, + "loss": 0.3366, + "step": 2178 + }, + { + "epoch": 0.13, + "grad_norm": 0.46931079454798147, + "learning_rate": 1.9528911037313233e-05, + "loss": 0.3177, + "step": 2179 + }, + { + "epoch": 0.13, + "grad_norm": 1.178746644251515, + "learning_rate": 1.952834643405003e-05, + "loss": 0.6243, + "step": 2180 + }, + { + "epoch": 0.13, + "grad_norm": 1.9884192839837473, + "learning_rate": 1.95277815008206e-05, + "loss": 0.5839, + "step": 2181 + }, + { + "epoch": 0.13, + "grad_norm": 0.4751766716695899, + "learning_rate": 1.9527216237644508e-05, + "loss": 0.2844, + "step": 2182 + }, + { + "epoch": 0.13, + "grad_norm": 0.587711364919638, + "learning_rate": 1.9526650644541326e-05, + "loss": 0.3622, + "step": 2183 + }, + { + "epoch": 0.13, + "grad_norm": 0.8459515983868039, + "learning_rate": 1.952608472153064e-05, + "loss": 0.3906, + "step": 2184 + }, + { + "epoch": 0.13, + "grad_norm": 0.4486221294280846, + "learning_rate": 1.952551846863205e-05, + "loss": 0.2203, + "step": 2185 + }, + { + "epoch": 0.13, + "grad_norm": 0.6553354179919787, + "learning_rate": 1.9524951885865165e-05, + "loss": 0.3764, + "step": 2186 + }, + { + "epoch": 0.13, + "grad_norm": 1.830584238414141, + "learning_rate": 1.952438497324961e-05, + "loss": 0.7863, + "step": 2187 + }, + { + "epoch": 0.13, + "grad_norm": 0.5252954020431466, + "learning_rate": 1.9523817730805008e-05, + "loss": 0.2538, + "step": 2188 + }, + { + "epoch": 0.13, + "grad_norm": 0.8584255441697742, + "learning_rate": 1.952325015855101e-05, + "loss": 0.5336, + "step": 2189 + }, + { + "epoch": 0.13, + "grad_norm": 0.3471580515229295, + "learning_rate": 1.9522682256507268e-05, + "loss": 0.2511, + "step": 2190 + }, + { + "epoch": 0.13, + "grad_norm": 0.4088705601951199, + "learning_rate": 1.952211402469345e-05, + "loss": 0.2464, + "step": 2191 + }, + { + "epoch": 0.13, + "grad_norm": 1.2705970535049596, + "learning_rate": 1.952154546312923e-05, + "loss": 0.6611, + "step": 2192 + }, + { + "epoch": 0.13, + "grad_norm": 0.6454425353434776, + "learning_rate": 1.9520976571834304e-05, + "loss": 0.4209, + "step": 2193 + }, + { + "epoch": 0.13, + "grad_norm": 0.44634869722763765, + "learning_rate": 1.9520407350828364e-05, + "loss": 0.2355, + "step": 2194 + }, + { + "epoch": 0.13, + "grad_norm": 0.7939220131831405, + "learning_rate": 1.9519837800131127e-05, + "loss": 0.4377, + "step": 2195 + }, + { + "epoch": 0.13, + "grad_norm": 0.3147220383048209, + "learning_rate": 1.9519267919762318e-05, + "loss": 0.1702, + "step": 2196 + }, + { + "epoch": 0.13, + "grad_norm": 0.703105533799731, + "learning_rate": 1.951869770974167e-05, + "loss": 0.3809, + "step": 2197 + }, + { + "epoch": 0.13, + "grad_norm": 0.47335849847144024, + "learning_rate": 1.951812717008893e-05, + "loss": 0.2881, + "step": 2198 + }, + { + "epoch": 0.13, + "grad_norm": 1.1606156210211465, + "learning_rate": 1.951755630082386e-05, + "loss": 0.7156, + "step": 2199 + }, + { + "epoch": 0.13, + "grad_norm": 0.45947436250339385, + "learning_rate": 1.9516985101966218e-05, + "loss": 0.3269, + "step": 2200 + }, + { + "epoch": 0.13, + "grad_norm": 0.43358484416349463, + "learning_rate": 1.9516413573535794e-05, + "loss": 0.3523, + "step": 2201 + }, + { + "epoch": 0.13, + "grad_norm": 0.5472429483088781, + "learning_rate": 1.9515841715552376e-05, + "loss": 0.3946, + "step": 2202 + }, + { + "epoch": 0.13, + "grad_norm": 0.34276882389222835, + "learning_rate": 1.951526952803577e-05, + "loss": 0.1918, + "step": 2203 + }, + { + "epoch": 0.13, + "grad_norm": 0.5604731992436495, + "learning_rate": 1.951469701100579e-05, + "loss": 0.3188, + "step": 2204 + }, + { + "epoch": 0.13, + "grad_norm": 0.5485943753095354, + "learning_rate": 1.951412416448226e-05, + "loss": 0.4226, + "step": 2205 + }, + { + "epoch": 0.13, + "grad_norm": 0.38949595158304373, + "learning_rate": 1.951355098848502e-05, + "loss": 0.3067, + "step": 2206 + }, + { + "epoch": 0.13, + "grad_norm": 0.5935212725749861, + "learning_rate": 1.9512977483033916e-05, + "loss": 0.4256, + "step": 2207 + }, + { + "epoch": 0.13, + "grad_norm": 0.3030094098212264, + "learning_rate": 1.9512403648148813e-05, + "loss": 0.1428, + "step": 2208 + }, + { + "epoch": 0.13, + "grad_norm": 0.3956227850830546, + "learning_rate": 1.951182948384958e-05, + "loss": 0.2861, + "step": 2209 + }, + { + "epoch": 0.13, + "grad_norm": 0.4697965880003851, + "learning_rate": 1.9511254990156103e-05, + "loss": 0.3211, + "step": 2210 + }, + { + "epoch": 0.13, + "grad_norm": 0.9172394638687885, + "learning_rate": 1.9510680167088275e-05, + "loss": 0.4232, + "step": 2211 + }, + { + "epoch": 0.13, + "grad_norm": 0.43535146422603604, + "learning_rate": 1.9510105014665998e-05, + "loss": 0.3297, + "step": 2212 + }, + { + "epoch": 0.13, + "grad_norm": 0.5773882269858571, + "learning_rate": 1.9509529532909196e-05, + "loss": 0.4388, + "step": 2213 + }, + { + "epoch": 0.13, + "grad_norm": 0.3137998718551292, + "learning_rate": 1.9508953721837795e-05, + "loss": 0.2522, + "step": 2214 + }, + { + "epoch": 0.13, + "grad_norm": 0.35939192870248576, + "learning_rate": 1.9508377581471735e-05, + "loss": 0.1858, + "step": 2215 + }, + { + "epoch": 0.13, + "grad_norm": 0.5116306737278765, + "learning_rate": 1.9507801111830967e-05, + "loss": 0.3576, + "step": 2216 + }, + { + "epoch": 0.13, + "grad_norm": 0.5731809416440271, + "learning_rate": 1.950722431293546e-05, + "loss": 0.3353, + "step": 2217 + }, + { + "epoch": 0.13, + "grad_norm": 0.47304259161573736, + "learning_rate": 1.950664718480518e-05, + "loss": 0.3236, + "step": 2218 + }, + { + "epoch": 0.13, + "grad_norm": 0.6276525740396246, + "learning_rate": 1.9506069727460116e-05, + "loss": 0.4444, + "step": 2219 + }, + { + "epoch": 0.13, + "grad_norm": 1.090832100391312, + "learning_rate": 1.9505491940920268e-05, + "loss": 0.6109, + "step": 2220 + }, + { + "epoch": 0.13, + "grad_norm": 0.2481407623113843, + "learning_rate": 1.9504913825205643e-05, + "loss": 0.1352, + "step": 2221 + }, + { + "epoch": 0.13, + "grad_norm": 0.4448099721555093, + "learning_rate": 1.950433538033626e-05, + "loss": 0.3269, + "step": 2222 + }, + { + "epoch": 0.13, + "grad_norm": 0.866009184797271, + "learning_rate": 1.950375660633215e-05, + "loss": 0.55, + "step": 2223 + }, + { + "epoch": 0.13, + "grad_norm": 0.4384851116001498, + "learning_rate": 1.950317750321336e-05, + "loss": 0.2842, + "step": 2224 + }, + { + "epoch": 0.13, + "grad_norm": 0.6313666828316659, + "learning_rate": 1.950259807099994e-05, + "loss": 0.3842, + "step": 2225 + }, + { + "epoch": 0.13, + "grad_norm": 0.4331107293530284, + "learning_rate": 1.950201830971196e-05, + "loss": 0.3443, + "step": 2226 + }, + { + "epoch": 0.13, + "grad_norm": 0.23533665688629704, + "learning_rate": 1.9501438219369492e-05, + "loss": 0.1392, + "step": 2227 + }, + { + "epoch": 0.13, + "grad_norm": 0.6139970636009742, + "learning_rate": 1.9500857799992628e-05, + "loss": 0.4041, + "step": 2228 + }, + { + "epoch": 0.13, + "grad_norm": 0.48155996698312703, + "learning_rate": 1.9500277051601465e-05, + "loss": 0.3663, + "step": 2229 + }, + { + "epoch": 0.13, + "grad_norm": 0.4152605894643503, + "learning_rate": 1.949969597421612e-05, + "loss": 0.2154, + "step": 2230 + }, + { + "epoch": 0.13, + "grad_norm": 0.6361750555191724, + "learning_rate": 1.9499114567856708e-05, + "loss": 0.4438, + "step": 2231 + }, + { + "epoch": 0.13, + "grad_norm": 1.2016891543780488, + "learning_rate": 1.9498532832543372e-05, + "loss": 0.6434, + "step": 2232 + }, + { + "epoch": 0.13, + "grad_norm": 0.39561891747852557, + "learning_rate": 1.9497950768296246e-05, + "loss": 0.2549, + "step": 2233 + }, + { + "epoch": 0.13, + "grad_norm": 0.31869872802169985, + "learning_rate": 1.9497368375135497e-05, + "loss": 0.2302, + "step": 2234 + }, + { + "epoch": 0.13, + "grad_norm": 0.8584671892353136, + "learning_rate": 1.949678565308129e-05, + "loss": 0.4976, + "step": 2235 + }, + { + "epoch": 0.13, + "grad_norm": 0.8802500200900603, + "learning_rate": 1.9496202602153805e-05, + "loss": 0.4712, + "step": 2236 + }, + { + "epoch": 0.13, + "grad_norm": 0.35855578518631354, + "learning_rate": 1.949561922237323e-05, + "loss": 0.2521, + "step": 2237 + }, + { + "epoch": 0.13, + "grad_norm": 0.5343010087481445, + "learning_rate": 1.9495035513759772e-05, + "loss": 0.386, + "step": 2238 + }, + { + "epoch": 0.13, + "grad_norm": 0.3763359873365377, + "learning_rate": 1.9494451476333637e-05, + "loss": 0.2417, + "step": 2239 + }, + { + "epoch": 0.13, + "grad_norm": 0.7266720426527591, + "learning_rate": 1.949386711011506e-05, + "loss": 0.3591, + "step": 2240 + }, + { + "epoch": 0.13, + "grad_norm": 0.3434275838027053, + "learning_rate": 1.9493282415124274e-05, + "loss": 0.279, + "step": 2241 + }, + { + "epoch": 0.13, + "grad_norm": 0.793014395205871, + "learning_rate": 1.9492697391381523e-05, + "loss": 0.3409, + "step": 2242 + }, + { + "epoch": 0.13, + "grad_norm": 0.4304945592429074, + "learning_rate": 1.949211203890707e-05, + "loss": 0.2789, + "step": 2243 + }, + { + "epoch": 0.13, + "grad_norm": 0.8892252947519116, + "learning_rate": 1.949152635772119e-05, + "loss": 0.617, + "step": 2244 + }, + { + "epoch": 0.13, + "grad_norm": 0.362436878968347, + "learning_rate": 1.9490940347844156e-05, + "loss": 0.3207, + "step": 2245 + }, + { + "epoch": 0.13, + "grad_norm": 0.44349164878713104, + "learning_rate": 1.9490354009296268e-05, + "loss": 0.3616, + "step": 2246 + }, + { + "epoch": 0.13, + "grad_norm": 0.3274107448446957, + "learning_rate": 1.9489767342097824e-05, + "loss": 0.2196, + "step": 2247 + }, + { + "epoch": 0.13, + "grad_norm": 0.8783237744052049, + "learning_rate": 1.948918034626915e-05, + "loss": 0.5678, + "step": 2248 + }, + { + "epoch": 0.13, + "grad_norm": 0.43709607755935653, + "learning_rate": 1.948859302183057e-05, + "loss": 0.329, + "step": 2249 + }, + { + "epoch": 0.13, + "grad_norm": 0.4099301199962942, + "learning_rate": 1.9488005368802415e-05, + "loss": 0.2837, + "step": 2250 + }, + { + "epoch": 0.13, + "grad_norm": 0.5420335066249002, + "learning_rate": 1.948741738720505e-05, + "loss": 0.4656, + "step": 2251 + }, + { + "epoch": 0.13, + "grad_norm": 0.3731959010848403, + "learning_rate": 1.9486829077058823e-05, + "loss": 0.3333, + "step": 2252 + }, + { + "epoch": 0.13, + "grad_norm": 0.2855765128398837, + "learning_rate": 1.9486240438384114e-05, + "loss": 0.2129, + "step": 2253 + }, + { + "epoch": 0.13, + "grad_norm": 0.8666950180014855, + "learning_rate": 1.9485651471201306e-05, + "loss": 0.4876, + "step": 2254 + }, + { + "epoch": 0.13, + "grad_norm": 0.46097536285247287, + "learning_rate": 1.94850621755308e-05, + "loss": 0.3323, + "step": 2255 + }, + { + "epoch": 0.13, + "grad_norm": 0.7361756550595048, + "learning_rate": 1.9484472551392993e-05, + "loss": 0.4123, + "step": 2256 + }, + { + "epoch": 0.13, + "grad_norm": 0.3658556105623814, + "learning_rate": 1.9483882598808315e-05, + "loss": 0.3458, + "step": 2257 + }, + { + "epoch": 0.13, + "grad_norm": 0.44969033758873034, + "learning_rate": 1.948329231779719e-05, + "loss": 0.3461, + "step": 2258 + }, + { + "epoch": 0.13, + "grad_norm": 0.5560341222882007, + "learning_rate": 1.9482701708380056e-05, + "loss": 0.2802, + "step": 2259 + }, + { + "epoch": 0.13, + "grad_norm": 0.4553291061872389, + "learning_rate": 1.948211077057737e-05, + "loss": 0.2485, + "step": 2260 + }, + { + "epoch": 0.13, + "grad_norm": 0.4033237221641209, + "learning_rate": 1.9481519504409596e-05, + "loss": 0.2866, + "step": 2261 + }, + { + "epoch": 0.13, + "grad_norm": 0.5367463676843832, + "learning_rate": 1.948092790989721e-05, + "loss": 0.3968, + "step": 2262 + }, + { + "epoch": 0.13, + "grad_norm": 0.5626236268127532, + "learning_rate": 1.94803359870607e-05, + "loss": 0.3514, + "step": 2263 + }, + { + "epoch": 0.13, + "grad_norm": 0.4338608687064876, + "learning_rate": 1.947974373592056e-05, + "loss": 0.3034, + "step": 2264 + }, + { + "epoch": 0.13, + "grad_norm": 0.4131007253237356, + "learning_rate": 1.9479151156497303e-05, + "loss": 0.3456, + "step": 2265 + }, + { + "epoch": 0.13, + "grad_norm": 0.7775496063551903, + "learning_rate": 1.9478558248811448e-05, + "loss": 0.3321, + "step": 2266 + }, + { + "epoch": 0.13, + "grad_norm": 0.35258832797577216, + "learning_rate": 1.947796501288353e-05, + "loss": 0.2487, + "step": 2267 + }, + { + "epoch": 0.13, + "grad_norm": 0.6701083819158502, + "learning_rate": 1.947737144873409e-05, + "loss": 0.3394, + "step": 2268 + }, + { + "epoch": 0.13, + "grad_norm": 0.43898287408668396, + "learning_rate": 1.9476777556383685e-05, + "loss": 0.3219, + "step": 2269 + }, + { + "epoch": 0.13, + "grad_norm": 0.3596482929485019, + "learning_rate": 1.947618333585288e-05, + "loss": 0.2842, + "step": 2270 + }, + { + "epoch": 0.13, + "grad_norm": 1.1019436057037184, + "learning_rate": 1.947558878716225e-05, + "loss": 0.7142, + "step": 2271 + }, + { + "epoch": 0.13, + "grad_norm": 0.5411126947410136, + "learning_rate": 1.9474993910332394e-05, + "loss": 0.3461, + "step": 2272 + }, + { + "epoch": 0.13, + "grad_norm": 0.40898556290402976, + "learning_rate": 1.9474398705383904e-05, + "loss": 0.2542, + "step": 2273 + }, + { + "epoch": 0.13, + "grad_norm": 0.6471286583545248, + "learning_rate": 1.9473803172337396e-05, + "loss": 0.3206, + "step": 2274 + }, + { + "epoch": 0.13, + "grad_norm": 0.8779107686075143, + "learning_rate": 1.947320731121349e-05, + "loss": 0.5398, + "step": 2275 + }, + { + "epoch": 0.13, + "grad_norm": 0.3532457572292047, + "learning_rate": 1.947261112203282e-05, + "loss": 0.2389, + "step": 2276 + }, + { + "epoch": 0.13, + "grad_norm": 0.4493598183529648, + "learning_rate": 1.947201460481604e-05, + "loss": 0.3389, + "step": 2277 + }, + { + "epoch": 0.13, + "grad_norm": 1.2898301269699424, + "learning_rate": 1.9471417759583796e-05, + "loss": 0.7665, + "step": 2278 + }, + { + "epoch": 0.13, + "grad_norm": 0.465301976628202, + "learning_rate": 1.9470820586356763e-05, + "loss": 0.2486, + "step": 2279 + }, + { + "epoch": 0.13, + "grad_norm": 0.8359470240166336, + "learning_rate": 1.9470223085155622e-05, + "loss": 0.3916, + "step": 2280 + }, + { + "epoch": 0.13, + "grad_norm": 0.36417049272575086, + "learning_rate": 1.946962525600106e-05, + "loss": 0.2749, + "step": 2281 + }, + { + "epoch": 0.13, + "grad_norm": 0.39093612328850025, + "learning_rate": 1.9469027098913787e-05, + "loss": 0.2384, + "step": 2282 + }, + { + "epoch": 0.13, + "grad_norm": 1.4785923875213445, + "learning_rate": 1.946842861391451e-05, + "loss": 0.876, + "step": 2283 + }, + { + "epoch": 0.13, + "grad_norm": 0.49453376260267884, + "learning_rate": 1.9467829801023957e-05, + "loss": 0.3659, + "step": 2284 + }, + { + "epoch": 0.13, + "grad_norm": 0.5000531898361844, + "learning_rate": 1.9467230660262864e-05, + "loss": 0.3059, + "step": 2285 + }, + { + "epoch": 0.13, + "grad_norm": 0.5643620845088193, + "learning_rate": 1.9466631191651984e-05, + "loss": 0.305, + "step": 2286 + }, + { + "epoch": 0.13, + "grad_norm": 0.4382366231139346, + "learning_rate": 1.9466031395212073e-05, + "loss": 0.3067, + "step": 2287 + }, + { + "epoch": 0.13, + "grad_norm": 0.40747576339356095, + "learning_rate": 1.9465431270963898e-05, + "loss": 0.2737, + "step": 2288 + }, + { + "epoch": 0.13, + "grad_norm": 0.4389132877484424, + "learning_rate": 1.9464830818928247e-05, + "loss": 0.2912, + "step": 2289 + }, + { + "epoch": 0.13, + "grad_norm": 0.8213346014038311, + "learning_rate": 1.946423003912591e-05, + "loss": 0.648, + "step": 2290 + }, + { + "epoch": 0.13, + "grad_norm": 0.3995884995275768, + "learning_rate": 1.94636289315777e-05, + "loss": 0.3271, + "step": 2291 + }, + { + "epoch": 0.13, + "grad_norm": 0.8771463518079066, + "learning_rate": 1.946302749630442e-05, + "loss": 0.4139, + "step": 2292 + }, + { + "epoch": 0.13, + "grad_norm": 0.3162617732642643, + "learning_rate": 1.9462425733326906e-05, + "loss": 0.2405, + "step": 2293 + }, + { + "epoch": 0.13, + "grad_norm": 0.40209065551847967, + "learning_rate": 1.9461823642666e-05, + "loss": 0.2874, + "step": 2294 + }, + { + "epoch": 0.13, + "grad_norm": 0.9720138998527125, + "learning_rate": 1.9461221224342544e-05, + "loss": 0.5947, + "step": 2295 + }, + { + "epoch": 0.13, + "grad_norm": 0.46874994093569994, + "learning_rate": 1.9460618478377406e-05, + "loss": 0.342, + "step": 2296 + }, + { + "epoch": 0.13, + "grad_norm": 0.4541039308298201, + "learning_rate": 1.9460015404791456e-05, + "loss": 0.3137, + "step": 2297 + }, + { + "epoch": 0.13, + "grad_norm": 0.9474289796905376, + "learning_rate": 1.9459412003605577e-05, + "loss": 0.595, + "step": 2298 + }, + { + "epoch": 0.13, + "grad_norm": 0.3041258639698369, + "learning_rate": 1.945880827484067e-05, + "loss": 0.1296, + "step": 2299 + }, + { + "epoch": 0.13, + "grad_norm": 0.44826921009046905, + "learning_rate": 1.9458204218517638e-05, + "loss": 0.2813, + "step": 2300 + }, + { + "epoch": 0.13, + "grad_norm": 0.4353294044673283, + "learning_rate": 1.94575998346574e-05, + "loss": 0.3258, + "step": 2301 + }, + { + "epoch": 0.13, + "grad_norm": 0.9102026658374495, + "learning_rate": 1.945699512328089e-05, + "loss": 0.4023, + "step": 2302 + }, + { + "epoch": 0.13, + "grad_norm": 0.4436564510925583, + "learning_rate": 1.9456390084409044e-05, + "loss": 0.3012, + "step": 2303 + }, + { + "epoch": 0.13, + "grad_norm": 0.8056583922169369, + "learning_rate": 1.9455784718062813e-05, + "loss": 0.5757, + "step": 2304 + }, + { + "epoch": 0.13, + "grad_norm": 0.2843505676702236, + "learning_rate": 1.9455179024263166e-05, + "loss": 0.1952, + "step": 2305 + }, + { + "epoch": 0.13, + "grad_norm": 0.42052209498560444, + "learning_rate": 1.9454573003031078e-05, + "loss": 0.2669, + "step": 2306 + }, + { + "epoch": 0.13, + "grad_norm": 0.9245941501277933, + "learning_rate": 1.945396665438753e-05, + "loss": 0.6041, + "step": 2307 + }, + { + "epoch": 0.13, + "grad_norm": 0.5978445007136682, + "learning_rate": 1.9453359978353524e-05, + "loss": 0.4349, + "step": 2308 + }, + { + "epoch": 0.13, + "grad_norm": 0.32742532663412943, + "learning_rate": 1.945275297495007e-05, + "loss": 0.2599, + "step": 2309 + }, + { + "epoch": 0.13, + "grad_norm": 0.9431148146992495, + "learning_rate": 1.9452145644198185e-05, + "loss": 0.6554, + "step": 2310 + }, + { + "epoch": 0.13, + "grad_norm": 0.3051067841482634, + "learning_rate": 1.9451537986118904e-05, + "loss": 0.1965, + "step": 2311 + }, + { + "epoch": 0.13, + "grad_norm": 0.4158374417954412, + "learning_rate": 1.945093000073327e-05, + "loss": 0.2121, + "step": 2312 + }, + { + "epoch": 0.13, + "grad_norm": 0.4787539021841934, + "learning_rate": 1.9450321688062336e-05, + "loss": 0.3388, + "step": 2313 + }, + { + "epoch": 0.13, + "grad_norm": 1.1867093489703773, + "learning_rate": 1.944971304812717e-05, + "loss": 0.5359, + "step": 2314 + }, + { + "epoch": 0.13, + "grad_norm": 0.31635180255683376, + "learning_rate": 1.9449104080948842e-05, + "loss": 0.2358, + "step": 2315 + }, + { + "epoch": 0.13, + "grad_norm": 1.9261529169569618, + "learning_rate": 1.9448494786548448e-05, + "loss": 0.8326, + "step": 2316 + }, + { + "epoch": 0.13, + "grad_norm": 0.34949286273506625, + "learning_rate": 1.944788516494709e-05, + "loss": 0.2497, + "step": 2317 + }, + { + "epoch": 0.13, + "grad_norm": 0.4379583841036261, + "learning_rate": 1.944727521616587e-05, + "loss": 0.2183, + "step": 2318 + }, + { + "epoch": 0.13, + "grad_norm": 0.8861251508067872, + "learning_rate": 1.9446664940225917e-05, + "loss": 0.4202, + "step": 2319 + }, + { + "epoch": 0.13, + "grad_norm": 0.5479306060921134, + "learning_rate": 1.9446054337148364e-05, + "loss": 0.3957, + "step": 2320 + }, + { + "epoch": 0.13, + "grad_norm": 0.8068260676833023, + "learning_rate": 1.9445443406954357e-05, + "loss": 0.4384, + "step": 2321 + }, + { + "epoch": 0.13, + "grad_norm": 0.4521357323513071, + "learning_rate": 1.9444832149665048e-05, + "loss": 0.285, + "step": 2322 + }, + { + "epoch": 0.13, + "grad_norm": 0.3373624792377293, + "learning_rate": 1.944422056530161e-05, + "loss": 0.1889, + "step": 2323 + }, + { + "epoch": 0.13, + "grad_norm": 0.5131559776901216, + "learning_rate": 1.944360865388522e-05, + "loss": 0.3261, + "step": 2324 + }, + { + "epoch": 0.13, + "grad_norm": 0.4789087105248987, + "learning_rate": 1.9442996415437066e-05, + "loss": 0.3175, + "step": 2325 + }, + { + "epoch": 0.13, + "grad_norm": 1.0279162829610848, + "learning_rate": 1.9442383849978354e-05, + "loss": 0.6445, + "step": 2326 + }, + { + "epoch": 0.13, + "grad_norm": 0.5037670907171229, + "learning_rate": 1.9441770957530295e-05, + "loss": 0.3263, + "step": 2327 + }, + { + "epoch": 0.13, + "grad_norm": 0.4267882635388313, + "learning_rate": 1.9441157738114114e-05, + "loss": 0.2697, + "step": 2328 + }, + { + "epoch": 0.13, + "grad_norm": 0.3900503798338192, + "learning_rate": 1.9440544191751046e-05, + "loss": 0.2927, + "step": 2329 + }, + { + "epoch": 0.13, + "grad_norm": 0.3245214667402623, + "learning_rate": 1.943993031846234e-05, + "loss": 0.2375, + "step": 2330 + }, + { + "epoch": 0.13, + "grad_norm": 0.7409088978080296, + "learning_rate": 1.9439316118269248e-05, + "loss": 0.3682, + "step": 2331 + }, + { + "epoch": 0.13, + "grad_norm": 0.5008460271734648, + "learning_rate": 1.943870159119305e-05, + "loss": 0.3801, + "step": 2332 + }, + { + "epoch": 0.13, + "grad_norm": 0.41324478033839196, + "learning_rate": 1.9438086737255022e-05, + "loss": 0.2974, + "step": 2333 + }, + { + "epoch": 0.13, + "grad_norm": 1.0615661938603904, + "learning_rate": 1.9437471556476454e-05, + "loss": 0.6605, + "step": 2334 + }, + { + "epoch": 0.13, + "grad_norm": 0.7578715878324432, + "learning_rate": 1.9436856048878653e-05, + "loss": 0.4057, + "step": 2335 + }, + { + "epoch": 0.13, + "grad_norm": 0.4578545746760707, + "learning_rate": 1.943624021448293e-05, + "loss": 0.2886, + "step": 2336 + }, + { + "epoch": 0.13, + "grad_norm": 0.46684580870173853, + "learning_rate": 1.9435624053310617e-05, + "loss": 0.335, + "step": 2337 + }, + { + "epoch": 0.13, + "grad_norm": 0.2697068340642194, + "learning_rate": 1.943500756538305e-05, + "loss": 0.1427, + "step": 2338 + }, + { + "epoch": 0.13, + "grad_norm": 0.620415357649466, + "learning_rate": 1.943439075072157e-05, + "loss": 0.2752, + "step": 2339 + }, + { + "epoch": 0.13, + "grad_norm": 0.4502906967970935, + "learning_rate": 1.9433773609347553e-05, + "loss": 0.3357, + "step": 2340 + }, + { + "epoch": 0.13, + "grad_norm": 0.5019698091806195, + "learning_rate": 1.9433156141282356e-05, + "loss": 0.3581, + "step": 2341 + }, + { + "epoch": 0.13, + "grad_norm": 0.6175844702796243, + "learning_rate": 1.943253834654737e-05, + "loss": 0.4041, + "step": 2342 + }, + { + "epoch": 0.13, + "grad_norm": 0.365708234558843, + "learning_rate": 1.9431920225163984e-05, + "loss": 0.2691, + "step": 2343 + }, + { + "epoch": 0.13, + "grad_norm": 0.45935395249114086, + "learning_rate": 1.9431301777153607e-05, + "loss": 0.2848, + "step": 2344 + }, + { + "epoch": 0.13, + "grad_norm": 0.44585991482447607, + "learning_rate": 1.9430683002537657e-05, + "loss": 0.327, + "step": 2345 + }, + { + "epoch": 0.13, + "grad_norm": 0.3728807303254334, + "learning_rate": 1.9430063901337562e-05, + "loss": 0.3105, + "step": 2346 + }, + { + "epoch": 0.13, + "grad_norm": 0.7637717215205987, + "learning_rate": 1.9429444473574753e-05, + "loss": 0.5145, + "step": 2347 + }, + { + "epoch": 0.13, + "grad_norm": 0.4234374308721751, + "learning_rate": 1.942882471927069e-05, + "loss": 0.2724, + "step": 2348 + }, + { + "epoch": 0.13, + "grad_norm": 0.569665903816151, + "learning_rate": 1.9428204638446834e-05, + "loss": 0.412, + "step": 2349 + }, + { + "epoch": 0.14, + "grad_norm": 0.4707793907074113, + "learning_rate": 1.9427584231124656e-05, + "loss": 0.3092, + "step": 2350 + }, + { + "epoch": 0.14, + "grad_norm": 0.2978071148204672, + "learning_rate": 1.942696349732564e-05, + "loss": 0.1615, + "step": 2351 + }, + { + "epoch": 0.14, + "grad_norm": 0.4934235765164325, + "learning_rate": 1.9426342437071287e-05, + "loss": 0.3496, + "step": 2352 + }, + { + "epoch": 0.14, + "grad_norm": 0.5296265448238974, + "learning_rate": 1.94257210503831e-05, + "loss": 0.4341, + "step": 2353 + }, + { + "epoch": 0.14, + "grad_norm": 0.4039824220929225, + "learning_rate": 1.9425099337282596e-05, + "loss": 0.2677, + "step": 2354 + }, + { + "epoch": 0.14, + "grad_norm": 0.46538885633625465, + "learning_rate": 1.942447729779131e-05, + "loss": 0.3583, + "step": 2355 + }, + { + "epoch": 0.14, + "grad_norm": 0.5022552896111364, + "learning_rate": 1.9423854931930778e-05, + "loss": 0.3459, + "step": 2356 + }, + { + "epoch": 0.14, + "grad_norm": 0.2656556336693924, + "learning_rate": 1.9423232239722557e-05, + "loss": 0.1004, + "step": 2357 + }, + { + "epoch": 0.14, + "grad_norm": 0.3751636957128666, + "learning_rate": 1.9422609221188208e-05, + "loss": 0.2678, + "step": 2358 + }, + { + "epoch": 0.14, + "grad_norm": 0.7635549619993852, + "learning_rate": 1.9421985876349307e-05, + "loss": 0.5, + "step": 2359 + }, + { + "epoch": 0.14, + "grad_norm": 0.4246343907147413, + "learning_rate": 1.942136220522744e-05, + "loss": 0.3152, + "step": 2360 + }, + { + "epoch": 0.14, + "grad_norm": 0.40637458685864913, + "learning_rate": 1.9420738207844202e-05, + "loss": 0.2499, + "step": 2361 + }, + { + "epoch": 0.14, + "grad_norm": 1.2699260617905255, + "learning_rate": 1.9420113884221207e-05, + "loss": 0.7759, + "step": 2362 + }, + { + "epoch": 0.14, + "grad_norm": 0.2895707578621004, + "learning_rate": 1.9419489234380077e-05, + "loss": 0.1907, + "step": 2363 + }, + { + "epoch": 0.14, + "grad_norm": 0.42114540707427844, + "learning_rate": 1.9418864258342433e-05, + "loss": 0.3008, + "step": 2364 + }, + { + "epoch": 0.14, + "grad_norm": 0.823988721160775, + "learning_rate": 1.941823895612993e-05, + "loss": 0.5296, + "step": 2365 + }, + { + "epoch": 0.14, + "grad_norm": 0.5848088756752329, + "learning_rate": 1.9417613327764214e-05, + "loss": 0.3968, + "step": 2366 + }, + { + "epoch": 0.14, + "grad_norm": 0.4707688843312788, + "learning_rate": 1.9416987373266957e-05, + "loss": 0.2714, + "step": 2367 + }, + { + "epoch": 0.14, + "grad_norm": 0.4300757765992352, + "learning_rate": 1.941636109265983e-05, + "loss": 0.3517, + "step": 2368 + }, + { + "epoch": 0.14, + "grad_norm": 0.3547634392299474, + "learning_rate": 1.941573448596452e-05, + "loss": 0.2276, + "step": 2369 + }, + { + "epoch": 0.14, + "grad_norm": 0.4239024769417483, + "learning_rate": 1.9415107553202736e-05, + "loss": 0.3535, + "step": 2370 + }, + { + "epoch": 0.14, + "grad_norm": 0.5859513694469434, + "learning_rate": 1.9414480294396178e-05, + "loss": 0.3019, + "step": 2371 + }, + { + "epoch": 0.14, + "grad_norm": 0.41543686750719744, + "learning_rate": 1.9413852709566574e-05, + "loss": 0.3323, + "step": 2372 + }, + { + "epoch": 0.14, + "grad_norm": 0.4297104608283397, + "learning_rate": 1.9413224798735655e-05, + "loss": 0.3159, + "step": 2373 + }, + { + "epoch": 0.14, + "grad_norm": 0.9676280782400583, + "learning_rate": 1.9412596561925164e-05, + "loss": 0.5264, + "step": 2374 + }, + { + "epoch": 0.14, + "grad_norm": 0.7162830264380677, + "learning_rate": 1.9411967999156866e-05, + "loss": 0.5078, + "step": 2375 + }, + { + "epoch": 0.14, + "grad_norm": 0.3800369126423308, + "learning_rate": 1.9411339110452512e-05, + "loss": 0.3029, + "step": 2376 + }, + { + "epoch": 0.14, + "grad_norm": 0.3623660175814287, + "learning_rate": 1.9410709895833895e-05, + "loss": 0.2252, + "step": 2377 + }, + { + "epoch": 0.14, + "grad_norm": 0.7698720451832731, + "learning_rate": 1.9410080355322797e-05, + "loss": 0.4136, + "step": 2378 + }, + { + "epoch": 0.14, + "grad_norm": 0.5038883234863918, + "learning_rate": 1.9409450488941018e-05, + "loss": 0.3308, + "step": 2379 + }, + { + "epoch": 0.14, + "grad_norm": 0.523722918460601, + "learning_rate": 1.9408820296710377e-05, + "loss": 0.3026, + "step": 2380 + }, + { + "epoch": 0.14, + "grad_norm": 0.6180731282065387, + "learning_rate": 1.9408189778652694e-05, + "loss": 0.473, + "step": 2381 + }, + { + "epoch": 0.14, + "grad_norm": 0.4264018524998352, + "learning_rate": 1.94075589347898e-05, + "loss": 0.3437, + "step": 2382 + }, + { + "epoch": 0.14, + "grad_norm": 0.34952776740085484, + "learning_rate": 1.940692776514355e-05, + "loss": 0.1839, + "step": 2383 + }, + { + "epoch": 0.14, + "grad_norm": 0.3725548545644662, + "learning_rate": 1.9406296269735792e-05, + "loss": 0.2575, + "step": 2384 + }, + { + "epoch": 0.14, + "grad_norm": 0.5378553612475551, + "learning_rate": 1.94056644485884e-05, + "loss": 0.3269, + "step": 2385 + }, + { + "epoch": 0.14, + "grad_norm": 1.312515301867269, + "learning_rate": 1.940503230172325e-05, + "loss": 0.8482, + "step": 2386 + }, + { + "epoch": 0.14, + "grad_norm": 0.6711137585190222, + "learning_rate": 1.940439982916224e-05, + "loss": 0.3149, + "step": 2387 + }, + { + "epoch": 0.14, + "grad_norm": 0.39804796254758124, + "learning_rate": 1.9403767030927265e-05, + "loss": 0.3114, + "step": 2388 + }, + { + "epoch": 0.14, + "grad_norm": 0.49038512112971977, + "learning_rate": 1.9403133907040245e-05, + "loss": 0.3422, + "step": 2389 + }, + { + "epoch": 0.14, + "grad_norm": 0.2498696865928933, + "learning_rate": 1.94025004575231e-05, + "loss": 0.1121, + "step": 2390 + }, + { + "epoch": 0.14, + "grad_norm": 0.4388817507166522, + "learning_rate": 1.940186668239777e-05, + "loss": 0.3111, + "step": 2391 + }, + { + "epoch": 0.14, + "grad_norm": 0.5706651361437006, + "learning_rate": 1.94012325816862e-05, + "loss": 0.3399, + "step": 2392 + }, + { + "epoch": 0.14, + "grad_norm": 1.1738840800470578, + "learning_rate": 1.9400598155410352e-05, + "loss": 0.4457, + "step": 2393 + }, + { + "epoch": 0.14, + "grad_norm": 0.4022071782421249, + "learning_rate": 1.939996340359219e-05, + "loss": 0.2773, + "step": 2394 + }, + { + "epoch": 0.14, + "grad_norm": 0.3049991474196785, + "learning_rate": 1.9399328326253702e-05, + "loss": 0.2383, + "step": 2395 + }, + { + "epoch": 0.14, + "grad_norm": 0.5410903011939309, + "learning_rate": 1.9398692923416877e-05, + "loss": 0.3912, + "step": 2396 + }, + { + "epoch": 0.14, + "grad_norm": 0.38073227562966827, + "learning_rate": 1.9398057195103725e-05, + "loss": 0.2435, + "step": 2397 + }, + { + "epoch": 0.14, + "grad_norm": 1.3803216316707985, + "learning_rate": 1.9397421141336252e-05, + "loss": 0.6396, + "step": 2398 + }, + { + "epoch": 0.14, + "grad_norm": 0.5643971893815023, + "learning_rate": 1.9396784762136488e-05, + "loss": 0.3818, + "step": 2399 + }, + { + "epoch": 0.14, + "grad_norm": 0.31425978785959, + "learning_rate": 1.9396148057526473e-05, + "loss": 0.2359, + "step": 2400 + }, + { + "epoch": 0.14, + "grad_norm": 1.311778884841662, + "learning_rate": 1.9395511027528257e-05, + "loss": 0.653, + "step": 2401 + }, + { + "epoch": 0.14, + "grad_norm": 0.3134910613406851, + "learning_rate": 1.9394873672163896e-05, + "loss": 0.1867, + "step": 2402 + }, + { + "epoch": 0.14, + "grad_norm": 0.45026996141169107, + "learning_rate": 1.9394235991455464e-05, + "loss": 0.2147, + "step": 2403 + }, + { + "epoch": 0.14, + "grad_norm": 0.6240035351919085, + "learning_rate": 1.9393597985425044e-05, + "loss": 0.3687, + "step": 2404 + }, + { + "epoch": 0.14, + "grad_norm": 1.0016231929095378, + "learning_rate": 1.939295965409473e-05, + "loss": 0.4781, + "step": 2405 + }, + { + "epoch": 0.14, + "grad_norm": 0.39450446781921794, + "learning_rate": 1.9392320997486624e-05, + "loss": 0.234, + "step": 2406 + }, + { + "epoch": 0.14, + "grad_norm": 0.6254131653942641, + "learning_rate": 1.939168201562285e-05, + "loss": 0.3663, + "step": 2407 + }, + { + "epoch": 0.14, + "grad_norm": 0.3195029509674968, + "learning_rate": 1.939104270852553e-05, + "loss": 0.2335, + "step": 2408 + }, + { + "epoch": 0.14, + "grad_norm": 0.46483875879314784, + "learning_rate": 1.9390403076216805e-05, + "loss": 0.3006, + "step": 2409 + }, + { + "epoch": 0.14, + "grad_norm": 1.1589225599424164, + "learning_rate": 1.9389763118718824e-05, + "loss": 0.4651, + "step": 2410 + }, + { + "epoch": 0.14, + "grad_norm": 0.6169719370131748, + "learning_rate": 1.938912283605375e-05, + "loss": 0.3751, + "step": 2411 + }, + { + "epoch": 0.14, + "grad_norm": 0.3554253129647085, + "learning_rate": 1.9388482228243758e-05, + "loss": 0.3024, + "step": 2412 + }, + { + "epoch": 0.14, + "grad_norm": 0.4486534251987144, + "learning_rate": 1.938784129531103e-05, + "loss": 0.2002, + "step": 2413 + }, + { + "epoch": 0.14, + "grad_norm": 0.47231607456050406, + "learning_rate": 1.938720003727776e-05, + "loss": 0.2697, + "step": 2414 + }, + { + "epoch": 0.14, + "grad_norm": 0.4328718171018966, + "learning_rate": 1.9386558454166158e-05, + "loss": 0.3028, + "step": 2415 + }, + { + "epoch": 0.14, + "grad_norm": 0.9168616970784886, + "learning_rate": 1.938591654599844e-05, + "loss": 0.324, + "step": 2416 + }, + { + "epoch": 0.14, + "grad_norm": 1.0797620646008737, + "learning_rate": 1.9385274312796834e-05, + "loss": 0.6565, + "step": 2417 + }, + { + "epoch": 0.14, + "grad_norm": 0.400563368589376, + "learning_rate": 1.9384631754583586e-05, + "loss": 0.2876, + "step": 2418 + }, + { + "epoch": 0.14, + "grad_norm": 0.7285786800582772, + "learning_rate": 1.938398887138094e-05, + "loss": 0.2817, + "step": 2419 + }, + { + "epoch": 0.14, + "grad_norm": 0.2902928252182725, + "learning_rate": 1.9383345663211162e-05, + "loss": 0.2576, + "step": 2420 + }, + { + "epoch": 0.14, + "grad_norm": 0.3972358838624784, + "learning_rate": 1.938270213009653e-05, + "loss": 0.2703, + "step": 2421 + }, + { + "epoch": 0.14, + "grad_norm": 0.8769939771723178, + "learning_rate": 1.9382058272059326e-05, + "loss": 0.5173, + "step": 2422 + }, + { + "epoch": 0.14, + "grad_norm": 0.5224940687326848, + "learning_rate": 1.9381414089121848e-05, + "loss": 0.3176, + "step": 2423 + }, + { + "epoch": 0.14, + "grad_norm": 0.4746503525476009, + "learning_rate": 1.9380769581306404e-05, + "loss": 0.2982, + "step": 2424 + }, + { + "epoch": 0.14, + "grad_norm": 0.5547203830668347, + "learning_rate": 1.9380124748635312e-05, + "loss": 0.4058, + "step": 2425 + }, + { + "epoch": 0.14, + "grad_norm": 0.8275991864380573, + "learning_rate": 1.9379479591130903e-05, + "loss": 0.197, + "step": 2426 + }, + { + "epoch": 0.14, + "grad_norm": 0.395529200272553, + "learning_rate": 1.937883410881552e-05, + "loss": 0.2868, + "step": 2427 + }, + { + "epoch": 0.14, + "grad_norm": 0.5053460012049817, + "learning_rate": 1.937818830171151e-05, + "loss": 0.3728, + "step": 2428 + }, + { + "epoch": 0.14, + "grad_norm": 1.0284565975396147, + "learning_rate": 1.937754216984125e-05, + "loss": 0.5913, + "step": 2429 + }, + { + "epoch": 0.14, + "grad_norm": 0.41865327079166226, + "learning_rate": 1.9376895713227106e-05, + "loss": 0.2927, + "step": 2430 + }, + { + "epoch": 0.14, + "grad_norm": 0.5272358314902269, + "learning_rate": 1.9376248931891463e-05, + "loss": 0.3642, + "step": 2431 + }, + { + "epoch": 0.14, + "grad_norm": 0.7547226733060595, + "learning_rate": 1.9375601825856724e-05, + "loss": 0.4149, + "step": 2432 + }, + { + "epoch": 0.14, + "grad_norm": 0.37814727594751524, + "learning_rate": 1.93749543951453e-05, + "loss": 0.2974, + "step": 2433 + }, + { + "epoch": 0.14, + "grad_norm": 0.4377703315425029, + "learning_rate": 1.9374306639779606e-05, + "loss": 0.2508, + "step": 2434 + }, + { + "epoch": 0.14, + "grad_norm": 0.46933076479616304, + "learning_rate": 1.9373658559782075e-05, + "loss": 0.3411, + "step": 2435 + }, + { + "epoch": 0.14, + "grad_norm": 0.38251871677416266, + "learning_rate": 1.9373010155175155e-05, + "loss": 0.2416, + "step": 2436 + }, + { + "epoch": 0.14, + "grad_norm": 1.4471766073972423, + "learning_rate": 1.9372361425981293e-05, + "loss": 0.7063, + "step": 2437 + }, + { + "epoch": 0.14, + "grad_norm": 0.7185286092475877, + "learning_rate": 1.937171237222296e-05, + "loss": 0.4922, + "step": 2438 + }, + { + "epoch": 0.14, + "grad_norm": 0.3228268085409524, + "learning_rate": 1.9371062993922627e-05, + "loss": 0.238, + "step": 2439 + }, + { + "epoch": 0.14, + "grad_norm": 0.3947789797377527, + "learning_rate": 1.937041329110279e-05, + "loss": 0.2659, + "step": 2440 + }, + { + "epoch": 0.14, + "grad_norm": 0.4828332809183167, + "learning_rate": 1.9369763263785942e-05, + "loss": 0.2961, + "step": 2441 + }, + { + "epoch": 0.14, + "grad_norm": 0.42032450024642715, + "learning_rate": 1.936911291199459e-05, + "loss": 0.1853, + "step": 2442 + }, + { + "epoch": 0.14, + "grad_norm": 0.42092663555747223, + "learning_rate": 1.9368462235751267e-05, + "loss": 0.3511, + "step": 2443 + }, + { + "epoch": 0.14, + "grad_norm": 0.8694030752192309, + "learning_rate": 1.93678112350785e-05, + "loss": 0.5347, + "step": 2444 + }, + { + "epoch": 0.14, + "grad_norm": 0.3919234530205258, + "learning_rate": 1.936715990999883e-05, + "loss": 0.1909, + "step": 2445 + }, + { + "epoch": 0.14, + "grad_norm": 0.43121382968926086, + "learning_rate": 1.9366508260534816e-05, + "loss": 0.2989, + "step": 2446 + }, + { + "epoch": 0.14, + "grad_norm": 0.41099307028572735, + "learning_rate": 1.936585628670902e-05, + "loss": 0.3394, + "step": 2447 + }, + { + "epoch": 0.14, + "grad_norm": 0.3230654577529209, + "learning_rate": 1.936520398854403e-05, + "loss": 0.2249, + "step": 2448 + }, + { + "epoch": 0.14, + "grad_norm": 0.5695656142368685, + "learning_rate": 1.9364551366062426e-05, + "loss": 0.3446, + "step": 2449 + }, + { + "epoch": 0.14, + "grad_norm": 0.7429372957735979, + "learning_rate": 1.9363898419286812e-05, + "loss": 0.5617, + "step": 2450 + }, + { + "epoch": 0.14, + "grad_norm": 0.5023685924135727, + "learning_rate": 1.9363245148239796e-05, + "loss": 0.3299, + "step": 2451 + }, + { + "epoch": 0.14, + "grad_norm": 0.45158885365379514, + "learning_rate": 1.9362591552944004e-05, + "loss": 0.2682, + "step": 2452 + }, + { + "epoch": 0.14, + "grad_norm": 0.4618495334104154, + "learning_rate": 1.9361937633422066e-05, + "loss": 0.3204, + "step": 2453 + }, + { + "epoch": 0.14, + "grad_norm": 0.28793626450866544, + "learning_rate": 1.9361283389696637e-05, + "loss": 0.2239, + "step": 2454 + }, + { + "epoch": 0.14, + "grad_norm": 0.4599655334941602, + "learning_rate": 1.9360628821790365e-05, + "loss": 0.3066, + "step": 2455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1546352730802452, + "learning_rate": 1.9359973929725915e-05, + "loss": 0.5382, + "step": 2456 + }, + { + "epoch": 0.14, + "grad_norm": 0.41592337368392673, + "learning_rate": 1.9359318713525974e-05, + "loss": 0.303, + "step": 2457 + }, + { + "epoch": 0.14, + "grad_norm": 1.5380319613336388, + "learning_rate": 1.9358663173213227e-05, + "loss": 0.8324, + "step": 2458 + }, + { + "epoch": 0.14, + "grad_norm": 0.2911517817560607, + "learning_rate": 1.9358007308810377e-05, + "loss": 0.23, + "step": 2459 + }, + { + "epoch": 0.14, + "grad_norm": 0.32365945054881184, + "learning_rate": 1.9357351120340137e-05, + "loss": 0.2067, + "step": 2460 + }, + { + "epoch": 0.14, + "grad_norm": 0.5107713270649263, + "learning_rate": 1.935669460782523e-05, + "loss": 0.3869, + "step": 2461 + }, + { + "epoch": 0.14, + "grad_norm": 0.5950877636646806, + "learning_rate": 1.935603777128839e-05, + "loss": 0.3651, + "step": 2462 + }, + { + "epoch": 0.14, + "grad_norm": 0.4329840077090914, + "learning_rate": 1.9355380610752364e-05, + "loss": 0.2897, + "step": 2463 + }, + { + "epoch": 0.14, + "grad_norm": 0.5263592276254527, + "learning_rate": 1.9354723126239913e-05, + "loss": 0.368, + "step": 2464 + }, + { + "epoch": 0.14, + "grad_norm": 0.8971518502199158, + "learning_rate": 1.93540653177738e-05, + "loss": 0.5344, + "step": 2465 + }, + { + "epoch": 0.14, + "grad_norm": 0.3520567138494093, + "learning_rate": 1.9353407185376805e-05, + "loss": 0.2685, + "step": 2466 + }, + { + "epoch": 0.14, + "grad_norm": 0.4554545967398719, + "learning_rate": 1.9352748729071727e-05, + "loss": 0.3259, + "step": 2467 + }, + { + "epoch": 0.14, + "grad_norm": 0.4117401158899818, + "learning_rate": 1.9352089948881358e-05, + "loss": 0.2224, + "step": 2468 + }, + { + "epoch": 0.14, + "grad_norm": 0.4564482028613299, + "learning_rate": 1.935143084482852e-05, + "loss": 0.2908, + "step": 2469 + }, + { + "epoch": 0.14, + "grad_norm": 0.6895418863474747, + "learning_rate": 1.935077141693603e-05, + "loss": 0.447, + "step": 2470 + }, + { + "epoch": 0.14, + "grad_norm": 0.5274251237370065, + "learning_rate": 1.935011166522673e-05, + "loss": 0.3838, + "step": 2471 + }, + { + "epoch": 0.14, + "grad_norm": 0.4138804782363925, + "learning_rate": 1.9349451589723465e-05, + "loss": 0.2777, + "step": 2472 + }, + { + "epoch": 0.14, + "grad_norm": 0.5436885501881874, + "learning_rate": 1.9348791190449092e-05, + "loss": 0.4258, + "step": 2473 + }, + { + "epoch": 0.14, + "grad_norm": 0.3883095750427234, + "learning_rate": 1.9348130467426486e-05, + "loss": 0.1774, + "step": 2474 + }, + { + "epoch": 0.14, + "grad_norm": 0.3857708543855049, + "learning_rate": 1.934746942067852e-05, + "loss": 0.2547, + "step": 2475 + }, + { + "epoch": 0.14, + "grad_norm": 0.5736300598247335, + "learning_rate": 1.9346808050228093e-05, + "loss": 0.4197, + "step": 2476 + }, + { + "epoch": 0.14, + "grad_norm": 0.7074214232035989, + "learning_rate": 1.9346146356098103e-05, + "loss": 0.5278, + "step": 2477 + }, + { + "epoch": 0.14, + "grad_norm": 0.442403841592901, + "learning_rate": 1.9345484338311467e-05, + "loss": 0.2977, + "step": 2478 + }, + { + "epoch": 0.14, + "grad_norm": 0.4645088493036005, + "learning_rate": 1.9344821996891106e-05, + "loss": 0.3155, + "step": 2479 + }, + { + "epoch": 0.14, + "grad_norm": 0.4166778230104967, + "learning_rate": 1.9344159331859965e-05, + "loss": 0.1801, + "step": 2480 + }, + { + "epoch": 0.14, + "grad_norm": 0.4899829826259577, + "learning_rate": 1.9343496343240994e-05, + "loss": 0.1517, + "step": 2481 + }, + { + "epoch": 0.14, + "grad_norm": 0.5196827332828534, + "learning_rate": 1.9342833031057138e-05, + "loss": 0.3627, + "step": 2482 + }, + { + "epoch": 0.14, + "grad_norm": 0.46032261971842325, + "learning_rate": 1.934216939533138e-05, + "loss": 0.367, + "step": 2483 + }, + { + "epoch": 0.14, + "grad_norm": 0.8366442088496777, + "learning_rate": 1.9341505436086695e-05, + "loss": 0.441, + "step": 2484 + }, + { + "epoch": 0.14, + "grad_norm": 0.48325853141372255, + "learning_rate": 1.934084115334608e-05, + "loss": 0.263, + "step": 2485 + }, + { + "epoch": 0.14, + "grad_norm": 0.38690498341366925, + "learning_rate": 1.9340176547132536e-05, + "loss": 0.2036, + "step": 2486 + }, + { + "epoch": 0.14, + "grad_norm": 0.41703692608099363, + "learning_rate": 1.9339511617469082e-05, + "loss": 0.3184, + "step": 2487 + }, + { + "epoch": 0.14, + "grad_norm": 0.47705445064380225, + "learning_rate": 1.9338846364378742e-05, + "loss": 0.2599, + "step": 2488 + }, + { + "epoch": 0.14, + "grad_norm": 1.186205154115369, + "learning_rate": 1.9338180787884554e-05, + "loss": 0.5164, + "step": 2489 + }, + { + "epoch": 0.14, + "grad_norm": 0.5865526165668318, + "learning_rate": 1.9337514888009566e-05, + "loss": 0.4188, + "step": 2490 + }, + { + "epoch": 0.14, + "grad_norm": 0.3801379438763642, + "learning_rate": 1.933684866477684e-05, + "loss": 0.264, + "step": 2491 + }, + { + "epoch": 0.14, + "grad_norm": 0.2829332968836524, + "learning_rate": 1.9336182118209442e-05, + "loss": 0.192, + "step": 2492 + }, + { + "epoch": 0.14, + "grad_norm": 0.5463688788581329, + "learning_rate": 1.9335515248330462e-05, + "loss": 0.3456, + "step": 2493 + }, + { + "epoch": 0.14, + "grad_norm": 0.719184449716863, + "learning_rate": 1.933484805516299e-05, + "loss": 0.3116, + "step": 2494 + }, + { + "epoch": 0.14, + "grad_norm": 0.8530292958738765, + "learning_rate": 1.9334180538730133e-05, + "loss": 0.3951, + "step": 2495 + }, + { + "epoch": 0.14, + "grad_norm": 1.5021582635497452, + "learning_rate": 1.9333512699055004e-05, + "loss": 0.7942, + "step": 2496 + }, + { + "epoch": 0.14, + "grad_norm": 0.37553192886357234, + "learning_rate": 1.933284453616073e-05, + "loss": 0.3011, + "step": 2497 + }, + { + "epoch": 0.14, + "grad_norm": 0.27112912159156816, + "learning_rate": 1.9332176050070454e-05, + "loss": 0.2007, + "step": 2498 + }, + { + "epoch": 0.14, + "grad_norm": 0.763728328487966, + "learning_rate": 1.9331507240807322e-05, + "loss": 0.4586, + "step": 2499 + }, + { + "epoch": 0.14, + "grad_norm": 0.6138837661580483, + "learning_rate": 1.9330838108394497e-05, + "loss": 0.327, + "step": 2500 + }, + { + "epoch": 0.14, + "grad_norm": 1.051207841064682, + "learning_rate": 1.933016865285515e-05, + "loss": 0.438, + "step": 2501 + }, + { + "epoch": 0.14, + "grad_norm": 0.5530078844473687, + "learning_rate": 1.932949887421246e-05, + "loss": 0.3198, + "step": 2502 + }, + { + "epoch": 0.14, + "grad_norm": 0.4045931786615891, + "learning_rate": 1.9328828772489623e-05, + "loss": 0.293, + "step": 2503 + }, + { + "epoch": 0.14, + "grad_norm": 0.3240936130174602, + "learning_rate": 1.932815834770985e-05, + "loss": 0.1162, + "step": 2504 + }, + { + "epoch": 0.14, + "grad_norm": 0.37395828600806214, + "learning_rate": 1.9327487599896355e-05, + "loss": 0.2758, + "step": 2505 + }, + { + "epoch": 0.14, + "grad_norm": 0.4175861001760937, + "learning_rate": 1.9326816529072364e-05, + "loss": 0.3197, + "step": 2506 + }, + { + "epoch": 0.14, + "grad_norm": 0.7256771592836744, + "learning_rate": 1.932614513526112e-05, + "loss": 0.3772, + "step": 2507 + }, + { + "epoch": 0.14, + "grad_norm": 0.6586431832270313, + "learning_rate": 1.932547341848587e-05, + "loss": 0.4013, + "step": 2508 + }, + { + "epoch": 0.14, + "grad_norm": 0.5348147230679424, + "learning_rate": 1.932480137876988e-05, + "loss": 0.3208, + "step": 2509 + }, + { + "epoch": 0.14, + "grad_norm": 0.39809772127850734, + "learning_rate": 1.9324129016136414e-05, + "loss": 0.265, + "step": 2510 + }, + { + "epoch": 0.14, + "grad_norm": 0.29378693207938744, + "learning_rate": 1.9323456330608762e-05, + "loss": 0.2083, + "step": 2511 + }, + { + "epoch": 0.14, + "grad_norm": 0.41811342753632974, + "learning_rate": 1.932278332221022e-05, + "loss": 0.3463, + "step": 2512 + }, + { + "epoch": 0.14, + "grad_norm": 0.7113376176543778, + "learning_rate": 1.932210999096409e-05, + "loss": 0.5525, + "step": 2513 + }, + { + "epoch": 0.14, + "grad_norm": 0.4728687902245757, + "learning_rate": 1.9321436336893693e-05, + "loss": 0.2821, + "step": 2514 + }, + { + "epoch": 0.14, + "grad_norm": 0.3785533771206013, + "learning_rate": 1.9320762360022354e-05, + "loss": 0.2939, + "step": 2515 + }, + { + "epoch": 0.14, + "grad_norm": 0.4423604779271077, + "learning_rate": 1.932008806037342e-05, + "loss": 0.2852, + "step": 2516 + }, + { + "epoch": 0.14, + "grad_norm": 0.34758961266810223, + "learning_rate": 1.9319413437970236e-05, + "loss": 0.2658, + "step": 2517 + }, + { + "epoch": 0.14, + "grad_norm": 0.40220948422232616, + "learning_rate": 1.9318738492836165e-05, + "loss": 0.2915, + "step": 2518 + }, + { + "epoch": 0.14, + "grad_norm": 0.515572698759552, + "learning_rate": 1.931806322499458e-05, + "loss": 0.4523, + "step": 2519 + }, + { + "epoch": 0.14, + "grad_norm": 0.5901676572616871, + "learning_rate": 1.9317387634468867e-05, + "loss": 0.2373, + "step": 2520 + }, + { + "epoch": 0.14, + "grad_norm": 0.42464998973281115, + "learning_rate": 1.9316711721282423e-05, + "loss": 0.3158, + "step": 2521 + }, + { + "epoch": 0.14, + "grad_norm": 0.5342049535459824, + "learning_rate": 1.931603548545865e-05, + "loss": 0.3712, + "step": 2522 + }, + { + "epoch": 0.14, + "grad_norm": 0.3737853946547916, + "learning_rate": 1.9315358927020965e-05, + "loss": 0.3404, + "step": 2523 + }, + { + "epoch": 0.15, + "grad_norm": 0.28710083849023404, + "learning_rate": 1.931468204599281e-05, + "loss": 0.2055, + "step": 2524 + }, + { + "epoch": 0.15, + "grad_norm": 1.221810801453039, + "learning_rate": 1.931400484239761e-05, + "loss": 0.8419, + "step": 2525 + }, + { + "epoch": 0.15, + "grad_norm": 0.4907234180429648, + "learning_rate": 1.931332731625882e-05, + "loss": 0.3614, + "step": 2526 + }, + { + "epoch": 0.15, + "grad_norm": 0.37691010597331226, + "learning_rate": 1.931264946759991e-05, + "loss": 0.2088, + "step": 2527 + }, + { + "epoch": 0.15, + "grad_norm": 0.7873124730767418, + "learning_rate": 1.931197129644435e-05, + "loss": 0.5543, + "step": 2528 + }, + { + "epoch": 0.15, + "grad_norm": 0.5526920490936016, + "learning_rate": 1.9311292802815626e-05, + "loss": 0.4587, + "step": 2529 + }, + { + "epoch": 0.15, + "grad_norm": 0.28601007260420575, + "learning_rate": 1.931061398673723e-05, + "loss": 0.1858, + "step": 2530 + }, + { + "epoch": 0.15, + "grad_norm": 0.4887404163838093, + "learning_rate": 1.930993484823267e-05, + "loss": 0.3604, + "step": 2531 + }, + { + "epoch": 0.15, + "grad_norm": 0.5338039650083546, + "learning_rate": 1.9309255387325468e-05, + "loss": 0.3452, + "step": 2532 + }, + { + "epoch": 0.15, + "grad_norm": 0.34880351685054517, + "learning_rate": 1.9308575604039156e-05, + "loss": 0.2113, + "step": 2533 + }, + { + "epoch": 0.15, + "grad_norm": 0.50223678765854, + "learning_rate": 1.9307895498397268e-05, + "loss": 0.3496, + "step": 2534 + }, + { + "epoch": 0.15, + "grad_norm": 1.0383332882780774, + "learning_rate": 1.9307215070423354e-05, + "loss": 0.5223, + "step": 2535 + }, + { + "epoch": 0.15, + "grad_norm": 0.40725782340222744, + "learning_rate": 1.930653432014099e-05, + "loss": 0.3043, + "step": 2536 + }, + { + "epoch": 0.15, + "grad_norm": 1.0892283717210414, + "learning_rate": 1.930585324757374e-05, + "loss": 0.5579, + "step": 2537 + }, + { + "epoch": 0.15, + "grad_norm": 0.3285256564934091, + "learning_rate": 1.9305171852745192e-05, + "loss": 0.2542, + "step": 2538 + }, + { + "epoch": 0.15, + "grad_norm": 0.3941296085844982, + "learning_rate": 1.9304490135678944e-05, + "loss": 0.2776, + "step": 2539 + }, + { + "epoch": 0.15, + "grad_norm": 0.8550640205170302, + "learning_rate": 1.9303808096398602e-05, + "loss": 0.4352, + "step": 2540 + }, + { + "epoch": 0.15, + "grad_norm": 0.7635534341325637, + "learning_rate": 1.9303125734927784e-05, + "loss": 0.5426, + "step": 2541 + }, + { + "epoch": 0.15, + "grad_norm": 0.3402636225143411, + "learning_rate": 1.9302443051290126e-05, + "loss": 0.2776, + "step": 2542 + }, + { + "epoch": 0.15, + "grad_norm": 0.6774987084654007, + "learning_rate": 1.930176004550926e-05, + "loss": 0.3408, + "step": 2543 + }, + { + "epoch": 0.15, + "grad_norm": 0.33787908931664956, + "learning_rate": 1.930107671760885e-05, + "loss": 0.2045, + "step": 2544 + }, + { + "epoch": 0.15, + "grad_norm": 0.47972956125767385, + "learning_rate": 1.9300393067612545e-05, + "loss": 0.2875, + "step": 2545 + }, + { + "epoch": 0.15, + "grad_norm": 0.4447447023685629, + "learning_rate": 1.9299709095544035e-05, + "loss": 0.3332, + "step": 2546 + }, + { + "epoch": 0.15, + "grad_norm": 0.9395633232005106, + "learning_rate": 1.9299024801426994e-05, + "loss": 0.4455, + "step": 2547 + }, + { + "epoch": 0.15, + "grad_norm": 0.4374552033580974, + "learning_rate": 1.9298340185285127e-05, + "loss": 0.2683, + "step": 2548 + }, + { + "epoch": 0.15, + "grad_norm": 0.6582182316770087, + "learning_rate": 1.929765524714214e-05, + "loss": 0.4531, + "step": 2549 + }, + { + "epoch": 0.15, + "grad_norm": 0.3174736595811298, + "learning_rate": 1.929696998702175e-05, + "loss": 0.1964, + "step": 2550 + }, + { + "epoch": 0.15, + "grad_norm": 0.40811572548117436, + "learning_rate": 1.929628440494769e-05, + "loss": 0.2882, + "step": 2551 + }, + { + "epoch": 0.15, + "grad_norm": 0.8862066628895517, + "learning_rate": 1.92955985009437e-05, + "loss": 0.4916, + "step": 2552 + }, + { + "epoch": 0.15, + "grad_norm": 1.0030933658275236, + "learning_rate": 1.9294912275033533e-05, + "loss": 0.3916, + "step": 2553 + }, + { + "epoch": 0.15, + "grad_norm": 0.4245887044072022, + "learning_rate": 1.9294225727240958e-05, + "loss": 0.2804, + "step": 2554 + }, + { + "epoch": 0.15, + "grad_norm": 0.5604093210603397, + "learning_rate": 1.9293538857589737e-05, + "loss": 0.4478, + "step": 2555 + }, + { + "epoch": 0.15, + "grad_norm": 0.22753638244231286, + "learning_rate": 1.9292851666103673e-05, + "loss": 0.1424, + "step": 2556 + }, + { + "epoch": 0.15, + "grad_norm": 0.393215750130867, + "learning_rate": 1.9292164152806553e-05, + "loss": 0.321, + "step": 2557 + }, + { + "epoch": 0.15, + "grad_norm": 0.42867114089154623, + "learning_rate": 1.9291476317722185e-05, + "loss": 0.3651, + "step": 2558 + }, + { + "epoch": 0.15, + "grad_norm": 1.0719463340093578, + "learning_rate": 1.9290788160874395e-05, + "loss": 0.5735, + "step": 2559 + }, + { + "epoch": 0.15, + "grad_norm": 0.42211388783214765, + "learning_rate": 1.9290099682287012e-05, + "loss": 0.2557, + "step": 2560 + }, + { + "epoch": 0.15, + "grad_norm": 1.346065145365312, + "learning_rate": 1.928941088198387e-05, + "loss": 0.877, + "step": 2561 + }, + { + "epoch": 0.15, + "grad_norm": 0.4098509316745687, + "learning_rate": 1.9288721759988834e-05, + "loss": 0.372, + "step": 2562 + }, + { + "epoch": 0.15, + "grad_norm": 0.42484258362884286, + "learning_rate": 1.9288032316325762e-05, + "loss": 0.2418, + "step": 2563 + }, + { + "epoch": 0.15, + "grad_norm": 0.35294191462631713, + "learning_rate": 1.928734255101853e-05, + "loss": 0.2576, + "step": 2564 + }, + { + "epoch": 0.15, + "grad_norm": 0.5300039653477053, + "learning_rate": 1.9286652464091024e-05, + "loss": 0.4365, + "step": 2565 + }, + { + "epoch": 0.15, + "grad_norm": 0.36024518354461466, + "learning_rate": 1.9285962055567144e-05, + "loss": 0.2375, + "step": 2566 + }, + { + "epoch": 0.15, + "grad_norm": 0.676422785771166, + "learning_rate": 1.9285271325470794e-05, + "loss": 0.4694, + "step": 2567 + }, + { + "epoch": 0.15, + "grad_norm": 0.8126513636201301, + "learning_rate": 1.92845802738259e-05, + "loss": 0.5042, + "step": 2568 + }, + { + "epoch": 0.15, + "grad_norm": 0.4114429506728939, + "learning_rate": 1.928388890065639e-05, + "loss": 0.2532, + "step": 2569 + }, + { + "epoch": 0.15, + "grad_norm": 0.3213342473563277, + "learning_rate": 1.928319720598621e-05, + "loss": 0.27, + "step": 2570 + }, + { + "epoch": 0.15, + "grad_norm": 0.40903951950701906, + "learning_rate": 1.9282505189839305e-05, + "loss": 0.2594, + "step": 2571 + }, + { + "epoch": 0.15, + "grad_norm": 0.44270353797659523, + "learning_rate": 1.9281812852239647e-05, + "loss": 0.3207, + "step": 2572 + }, + { + "epoch": 0.15, + "grad_norm": 0.43969504549066846, + "learning_rate": 1.928112019321121e-05, + "loss": 0.297, + "step": 2573 + }, + { + "epoch": 0.15, + "grad_norm": 0.6514459661898435, + "learning_rate": 1.9280427212777976e-05, + "loss": 0.4354, + "step": 2574 + }, + { + "epoch": 0.15, + "grad_norm": 0.5278709615714426, + "learning_rate": 1.927973391096395e-05, + "loss": 0.334, + "step": 2575 + }, + { + "epoch": 0.15, + "grad_norm": 0.3225729087069408, + "learning_rate": 1.9279040287793138e-05, + "loss": 0.1843, + "step": 2576 + }, + { + "epoch": 0.15, + "grad_norm": 0.38192438331171524, + "learning_rate": 1.9278346343289555e-05, + "loss": 0.285, + "step": 2577 + }, + { + "epoch": 0.15, + "grad_norm": 0.4238320027450154, + "learning_rate": 1.9277652077477242e-05, + "loss": 0.3222, + "step": 2578 + }, + { + "epoch": 0.15, + "grad_norm": 0.5692015716239939, + "learning_rate": 1.9276957490380236e-05, + "loss": 0.3179, + "step": 2579 + }, + { + "epoch": 0.15, + "grad_norm": 0.6989023862803406, + "learning_rate": 1.927626258202259e-05, + "loss": 0.4818, + "step": 2580 + }, + { + "epoch": 0.15, + "grad_norm": 0.47851777806709617, + "learning_rate": 1.927556735242837e-05, + "loss": 0.3246, + "step": 2581 + }, + { + "epoch": 0.15, + "grad_norm": 0.37801509190886534, + "learning_rate": 1.9274871801621652e-05, + "loss": 0.2653, + "step": 2582 + }, + { + "epoch": 0.15, + "grad_norm": 0.31902918198499913, + "learning_rate": 1.927417592962652e-05, + "loss": 0.2124, + "step": 2583 + }, + { + "epoch": 0.15, + "grad_norm": 0.5711538165034585, + "learning_rate": 1.9273479736467077e-05, + "loss": 0.3241, + "step": 2584 + }, + { + "epoch": 0.15, + "grad_norm": 0.48424088195332443, + "learning_rate": 1.9272783222167424e-05, + "loss": 0.3557, + "step": 2585 + }, + { + "epoch": 0.15, + "grad_norm": 0.5128003089466692, + "learning_rate": 1.9272086386751693e-05, + "loss": 0.3375, + "step": 2586 + }, + { + "epoch": 0.15, + "grad_norm": 0.5824582826960971, + "learning_rate": 1.927138923024401e-05, + "loss": 0.3402, + "step": 2587 + }, + { + "epoch": 0.15, + "grad_norm": 0.3477545206134362, + "learning_rate": 1.927069175266851e-05, + "loss": 0.2719, + "step": 2588 + }, + { + "epoch": 0.15, + "grad_norm": 0.35480404863309206, + "learning_rate": 1.926999395404936e-05, + "loss": 0.1912, + "step": 2589 + }, + { + "epoch": 0.15, + "grad_norm": 0.4088392855070457, + "learning_rate": 1.9269295834410715e-05, + "loss": 0.3091, + "step": 2590 + }, + { + "epoch": 0.15, + "grad_norm": 0.4128220609130996, + "learning_rate": 1.9268597393776753e-05, + "loss": 0.3596, + "step": 2591 + }, + { + "epoch": 0.15, + "grad_norm": 0.6523103375473858, + "learning_rate": 1.9267898632171663e-05, + "loss": 0.4379, + "step": 2592 + }, + { + "epoch": 0.15, + "grad_norm": 0.42876866445114575, + "learning_rate": 1.9267199549619643e-05, + "loss": 0.2967, + "step": 2593 + }, + { + "epoch": 0.15, + "grad_norm": 0.4201511676991754, + "learning_rate": 1.92665001461449e-05, + "loss": 0.3146, + "step": 2594 + }, + { + "epoch": 0.15, + "grad_norm": 0.23291446601377253, + "learning_rate": 1.9265800421771655e-05, + "loss": 0.1366, + "step": 2595 + }, + { + "epoch": 0.15, + "grad_norm": 0.43171545122840194, + "learning_rate": 1.926510037652414e-05, + "loss": 0.3457, + "step": 2596 + }, + { + "epoch": 0.15, + "grad_norm": 0.5129067503078448, + "learning_rate": 1.9264400010426598e-05, + "loss": 0.4012, + "step": 2597 + }, + { + "epoch": 0.15, + "grad_norm": 0.41831673800447344, + "learning_rate": 1.926369932350328e-05, + "loss": 0.3836, + "step": 2598 + }, + { + "epoch": 0.15, + "grad_norm": 0.30934776416133264, + "learning_rate": 1.9262998315778453e-05, + "loss": 0.1444, + "step": 2599 + }, + { + "epoch": 0.15, + "grad_norm": 0.46632659562332673, + "learning_rate": 1.9262296987276395e-05, + "loss": 0.348, + "step": 2600 + }, + { + "epoch": 0.15, + "grad_norm": 0.27229649864445243, + "learning_rate": 1.9261595338021388e-05, + "loss": 0.2331, + "step": 2601 + }, + { + "epoch": 0.15, + "grad_norm": 0.40189885147596294, + "learning_rate": 1.9260893368037735e-05, + "loss": 0.1689, + "step": 2602 + }, + { + "epoch": 0.15, + "grad_norm": 0.41329355608342555, + "learning_rate": 1.9260191077349742e-05, + "loss": 0.3672, + "step": 2603 + }, + { + "epoch": 0.15, + "grad_norm": 0.6931020159640395, + "learning_rate": 1.925948846598173e-05, + "loss": 0.5568, + "step": 2604 + }, + { + "epoch": 0.15, + "grad_norm": 0.3821818839566917, + "learning_rate": 1.925878553395803e-05, + "loss": 0.2077, + "step": 2605 + }, + { + "epoch": 0.15, + "grad_norm": 0.36378068853839796, + "learning_rate": 1.9258082281302988e-05, + "loss": 0.3239, + "step": 2606 + }, + { + "epoch": 0.15, + "grad_norm": 0.3489085591030622, + "learning_rate": 1.925737870804095e-05, + "loss": 0.1991, + "step": 2607 + }, + { + "epoch": 0.15, + "grad_norm": 0.4365373846944763, + "learning_rate": 1.925667481419629e-05, + "loss": 0.3204, + "step": 2608 + }, + { + "epoch": 0.15, + "grad_norm": 0.37061176255326356, + "learning_rate": 1.9255970599793376e-05, + "loss": 0.3358, + "step": 2609 + }, + { + "epoch": 0.15, + "grad_norm": 0.5448112276722097, + "learning_rate": 1.92552660648566e-05, + "loss": 0.4135, + "step": 2610 + }, + { + "epoch": 0.15, + "grad_norm": 0.7625349028762688, + "learning_rate": 1.925456120941036e-05, + "loss": 0.4468, + "step": 2611 + }, + { + "epoch": 0.15, + "grad_norm": 0.3891806726343858, + "learning_rate": 1.925385603347906e-05, + "loss": 0.2244, + "step": 2612 + }, + { + "epoch": 0.15, + "grad_norm": 0.3134500837415368, + "learning_rate": 1.9253150537087123e-05, + "loss": 0.2471, + "step": 2613 + }, + { + "epoch": 0.15, + "grad_norm": 0.6358650633518437, + "learning_rate": 1.9252444720258982e-05, + "loss": 0.3955, + "step": 2614 + }, + { + "epoch": 0.15, + "grad_norm": 0.4146219135777843, + "learning_rate": 1.925173858301908e-05, + "loss": 0.3018, + "step": 2615 + }, + { + "epoch": 0.15, + "grad_norm": 0.5250021533099688, + "learning_rate": 1.9251032125391867e-05, + "loss": 0.3568, + "step": 2616 + }, + { + "epoch": 0.15, + "grad_norm": 0.3748774238993723, + "learning_rate": 1.925032534740181e-05, + "loss": 0.3043, + "step": 2617 + }, + { + "epoch": 0.15, + "grad_norm": 0.4535619682866479, + "learning_rate": 1.9249618249073384e-05, + "loss": 0.2748, + "step": 2618 + }, + { + "epoch": 0.15, + "grad_norm": 0.7356712302689734, + "learning_rate": 1.9248910830431073e-05, + "loss": 0.5015, + "step": 2619 + }, + { + "epoch": 0.15, + "grad_norm": 0.4561318815496416, + "learning_rate": 1.924820309149938e-05, + "loss": 0.3636, + "step": 2620 + }, + { + "epoch": 0.15, + "grad_norm": 0.40170842706925475, + "learning_rate": 1.9247495032302812e-05, + "loss": 0.317, + "step": 2621 + }, + { + "epoch": 0.15, + "grad_norm": 0.3116553045717942, + "learning_rate": 1.924678665286589e-05, + "loss": 0.2193, + "step": 2622 + }, + { + "epoch": 0.15, + "grad_norm": 1.0170395298052468, + "learning_rate": 1.924607795321314e-05, + "loss": 0.4508, + "step": 2623 + }, + { + "epoch": 0.15, + "grad_norm": 0.4078089046880881, + "learning_rate": 1.924536893336911e-05, + "loss": 0.3254, + "step": 2624 + }, + { + "epoch": 0.15, + "grad_norm": 0.43203476595198353, + "learning_rate": 1.924465959335835e-05, + "loss": 0.3156, + "step": 2625 + }, + { + "epoch": 0.15, + "grad_norm": 0.8382291099953593, + "learning_rate": 1.924394993320543e-05, + "loss": 0.4348, + "step": 2626 + }, + { + "epoch": 0.15, + "grad_norm": 0.4875964972050568, + "learning_rate": 1.9243239952934918e-05, + "loss": 0.3129, + "step": 2627 + }, + { + "epoch": 0.15, + "grad_norm": 0.3377201613499536, + "learning_rate": 1.9242529652571405e-05, + "loss": 0.1133, + "step": 2628 + }, + { + "epoch": 0.15, + "grad_norm": 0.4149281039334817, + "learning_rate": 1.9241819032139487e-05, + "loss": 0.3227, + "step": 2629 + }, + { + "epoch": 0.15, + "grad_norm": 0.48560499229576615, + "learning_rate": 1.9241108091663774e-05, + "loss": 0.3227, + "step": 2630 + }, + { + "epoch": 0.15, + "grad_norm": 1.3978598633144965, + "learning_rate": 1.9240396831168883e-05, + "loss": 0.4226, + "step": 2631 + }, + { + "epoch": 0.15, + "grad_norm": 0.5892322349385228, + "learning_rate": 1.923968525067945e-05, + "loss": 0.3879, + "step": 2632 + }, + { + "epoch": 0.15, + "grad_norm": 0.5121453834736848, + "learning_rate": 1.923897335022011e-05, + "loss": 0.3623, + "step": 2633 + }, + { + "epoch": 0.15, + "grad_norm": 0.4052392757884597, + "learning_rate": 1.9238261129815526e-05, + "loss": 0.2655, + "step": 2634 + }, + { + "epoch": 0.15, + "grad_norm": 0.3905272412860965, + "learning_rate": 1.9237548589490355e-05, + "loss": 0.14, + "step": 2635 + }, + { + "epoch": 0.15, + "grad_norm": 0.7394376527945423, + "learning_rate": 1.923683572926927e-05, + "loss": 0.3439, + "step": 2636 + }, + { + "epoch": 0.15, + "grad_norm": 0.8275157121617628, + "learning_rate": 1.9236122549176963e-05, + "loss": 0.3755, + "step": 2637 + }, + { + "epoch": 0.15, + "grad_norm": 0.9844646238383457, + "learning_rate": 1.923540904923813e-05, + "loss": 0.3717, + "step": 2638 + }, + { + "epoch": 0.15, + "grad_norm": 0.3459704137199816, + "learning_rate": 1.9234695229477475e-05, + "loss": 0.3101, + "step": 2639 + }, + { + "epoch": 0.15, + "grad_norm": 0.31297296064647323, + "learning_rate": 1.9233981089919727e-05, + "loss": 0.1909, + "step": 2640 + }, + { + "epoch": 0.15, + "grad_norm": 0.6016717619270547, + "learning_rate": 1.9233266630589607e-05, + "loss": 0.2894, + "step": 2641 + }, + { + "epoch": 0.15, + "grad_norm": 0.5144862160701495, + "learning_rate": 1.923255185151186e-05, + "loss": 0.3252, + "step": 2642 + }, + { + "epoch": 0.15, + "grad_norm": 1.0734016508045525, + "learning_rate": 1.923183675271124e-05, + "loss": 0.4787, + "step": 2643 + }, + { + "epoch": 0.15, + "grad_norm": 0.6162045563854098, + "learning_rate": 1.9231121334212513e-05, + "loss": 0.2236, + "step": 2644 + }, + { + "epoch": 0.15, + "grad_norm": 0.45479225577934984, + "learning_rate": 1.9230405596040448e-05, + "loss": 0.2862, + "step": 2645 + }, + { + "epoch": 0.15, + "grad_norm": 1.5405861989390848, + "learning_rate": 1.922968953821984e-05, + "loss": 0.8193, + "step": 2646 + }, + { + "epoch": 0.15, + "grad_norm": 0.3822968065664251, + "learning_rate": 1.9228973160775474e-05, + "loss": 0.2538, + "step": 2647 + }, + { + "epoch": 0.15, + "grad_norm": 0.4312183020139809, + "learning_rate": 1.9228256463732165e-05, + "loss": 0.2563, + "step": 2648 + }, + { + "epoch": 0.15, + "grad_norm": 0.614927999518781, + "learning_rate": 1.9227539447114732e-05, + "loss": 0.3677, + "step": 2649 + }, + { + "epoch": 0.15, + "grad_norm": 1.1379887071076775, + "learning_rate": 1.9226822110948005e-05, + "loss": 0.6745, + "step": 2650 + }, + { + "epoch": 0.15, + "grad_norm": 0.37894151707122936, + "learning_rate": 1.9226104455256827e-05, + "loss": 0.2201, + "step": 2651 + }, + { + "epoch": 0.15, + "grad_norm": 0.4781041246729594, + "learning_rate": 1.9225386480066046e-05, + "loss": 0.3187, + "step": 2652 + }, + { + "epoch": 0.15, + "grad_norm": 0.4630776461667929, + "learning_rate": 1.9224668185400528e-05, + "loss": 0.3508, + "step": 2653 + }, + { + "epoch": 0.15, + "grad_norm": 0.38185126415206494, + "learning_rate": 1.922394957128515e-05, + "loss": 0.2651, + "step": 2654 + }, + { + "epoch": 0.15, + "grad_norm": 0.4365198237742083, + "learning_rate": 1.9223230637744792e-05, + "loss": 0.2976, + "step": 2655 + }, + { + "epoch": 0.15, + "grad_norm": 1.6072299775806103, + "learning_rate": 1.9222511384804355e-05, + "loss": 0.8611, + "step": 2656 + }, + { + "epoch": 0.15, + "grad_norm": 0.35629856336310844, + "learning_rate": 1.9221791812488746e-05, + "loss": 0.2504, + "step": 2657 + }, + { + "epoch": 0.15, + "grad_norm": 0.9970693080774518, + "learning_rate": 1.9221071920822882e-05, + "loss": 0.6513, + "step": 2658 + }, + { + "epoch": 0.15, + "grad_norm": 0.7995130307427668, + "learning_rate": 1.922035170983169e-05, + "loss": 0.5226, + "step": 2659 + }, + { + "epoch": 0.15, + "grad_norm": 0.3808491244278673, + "learning_rate": 1.9219631179540125e-05, + "loss": 0.3165, + "step": 2660 + }, + { + "epoch": 0.15, + "grad_norm": 0.3197238539222895, + "learning_rate": 1.921891032997312e-05, + "loss": 0.1838, + "step": 2661 + }, + { + "epoch": 0.15, + "grad_norm": 1.503646870246312, + "learning_rate": 1.921818916115565e-05, + "loss": 0.794, + "step": 2662 + }, + { + "epoch": 0.15, + "grad_norm": 0.39543891144707394, + "learning_rate": 1.9217467673112685e-05, + "loss": 0.2965, + "step": 2663 + }, + { + "epoch": 0.15, + "grad_norm": 0.9269117583869534, + "learning_rate": 1.921674586586921e-05, + "loss": 0.4454, + "step": 2664 + }, + { + "epoch": 0.15, + "grad_norm": 0.4249499526884838, + "learning_rate": 1.9216023739450222e-05, + "loss": 0.3679, + "step": 2665 + }, + { + "epoch": 0.15, + "grad_norm": 0.4217638497070136, + "learning_rate": 1.921530129388073e-05, + "loss": 0.2539, + "step": 2666 + }, + { + "epoch": 0.15, + "grad_norm": 0.277953569574096, + "learning_rate": 1.921457852918575e-05, + "loss": 0.1021, + "step": 2667 + }, + { + "epoch": 0.15, + "grad_norm": 0.44369267599406603, + "learning_rate": 1.921385544539031e-05, + "loss": 0.3468, + "step": 2668 + }, + { + "epoch": 0.15, + "grad_norm": 0.4842457706418174, + "learning_rate": 1.9213132042519453e-05, + "loss": 0.2911, + "step": 2669 + }, + { + "epoch": 0.15, + "grad_norm": 0.836503549357935, + "learning_rate": 1.921240832059823e-05, + "loss": 0.376, + "step": 2670 + }, + { + "epoch": 0.15, + "grad_norm": 0.8464103679581624, + "learning_rate": 1.9211684279651703e-05, + "loss": 0.5637, + "step": 2671 + }, + { + "epoch": 0.15, + "grad_norm": 0.4795410800772796, + "learning_rate": 1.9210959919704946e-05, + "loss": 0.2707, + "step": 2672 + }, + { + "epoch": 0.15, + "grad_norm": 0.3464321091289899, + "learning_rate": 1.9210235240783044e-05, + "loss": 0.2455, + "step": 2673 + }, + { + "epoch": 0.15, + "grad_norm": 1.184053359160218, + "learning_rate": 1.920951024291109e-05, + "loss": 0.4233, + "step": 2674 + }, + { + "epoch": 0.15, + "grad_norm": 0.4197906790697942, + "learning_rate": 1.9208784926114194e-05, + "loss": 0.3018, + "step": 2675 + }, + { + "epoch": 0.15, + "grad_norm": 0.9864362176828683, + "learning_rate": 1.9208059290417468e-05, + "loss": 0.5272, + "step": 2676 + }, + { + "epoch": 0.15, + "grad_norm": 0.42562445797911436, + "learning_rate": 1.9207333335846048e-05, + "loss": 0.311, + "step": 2677 + }, + { + "epoch": 0.15, + "grad_norm": 0.4449285311849742, + "learning_rate": 1.920660706242507e-05, + "loss": 0.2907, + "step": 2678 + }, + { + "epoch": 0.15, + "grad_norm": 0.4022883840813125, + "learning_rate": 1.9205880470179682e-05, + "loss": 0.2319, + "step": 2679 + }, + { + "epoch": 0.15, + "grad_norm": 0.45241222820165394, + "learning_rate": 1.9205153559135056e-05, + "loss": 0.2982, + "step": 2680 + }, + { + "epoch": 0.15, + "grad_norm": 0.3837648287989111, + "learning_rate": 1.9204426329316354e-05, + "loss": 0.28, + "step": 2681 + }, + { + "epoch": 0.15, + "grad_norm": 1.1774690784929287, + "learning_rate": 1.9203698780748765e-05, + "loss": 0.5684, + "step": 2682 + }, + { + "epoch": 0.15, + "grad_norm": 0.9706697136795795, + "learning_rate": 1.920297091345748e-05, + "loss": 0.4298, + "step": 2683 + }, + { + "epoch": 0.15, + "grad_norm": 0.44832358661289956, + "learning_rate": 1.9202242727467713e-05, + "loss": 0.2841, + "step": 2684 + }, + { + "epoch": 0.15, + "grad_norm": 0.40119273159907204, + "learning_rate": 1.9201514222804672e-05, + "loss": 0.2904, + "step": 2685 + }, + { + "epoch": 0.15, + "grad_norm": 0.5340909714198232, + "learning_rate": 1.9200785399493592e-05, + "loss": 0.3694, + "step": 2686 + }, + { + "epoch": 0.15, + "grad_norm": 0.6551279965817546, + "learning_rate": 1.9200056257559706e-05, + "loss": 0.2458, + "step": 2687 + }, + { + "epoch": 0.15, + "grad_norm": 1.2765480970724306, + "learning_rate": 1.9199326797028268e-05, + "loss": 0.4481, + "step": 2688 + }, + { + "epoch": 0.15, + "grad_norm": 0.4896591885875324, + "learning_rate": 1.9198597017924543e-05, + "loss": 0.3536, + "step": 2689 + }, + { + "epoch": 0.15, + "grad_norm": 0.3569245432977872, + "learning_rate": 1.9197866920273794e-05, + "loss": 0.2064, + "step": 2690 + }, + { + "epoch": 0.15, + "grad_norm": 0.33094841706483585, + "learning_rate": 1.919713650410131e-05, + "loss": 0.2783, + "step": 2691 + }, + { + "epoch": 0.15, + "grad_norm": 0.46907427883634445, + "learning_rate": 1.9196405769432385e-05, + "loss": 0.3683, + "step": 2692 + }, + { + "epoch": 0.15, + "grad_norm": 0.540396516824427, + "learning_rate": 1.9195674716292326e-05, + "loss": 0.2609, + "step": 2693 + }, + { + "epoch": 0.15, + "grad_norm": 0.7820904049432057, + "learning_rate": 1.9194943344706444e-05, + "loss": 0.423, + "step": 2694 + }, + { + "epoch": 0.15, + "grad_norm": 1.4341198750989717, + "learning_rate": 1.919421165470007e-05, + "loss": 0.7056, + "step": 2695 + }, + { + "epoch": 0.15, + "grad_norm": 0.41243600637594274, + "learning_rate": 1.919347964629854e-05, + "loss": 0.217, + "step": 2696 + }, + { + "epoch": 0.15, + "grad_norm": 0.29327859156718405, + "learning_rate": 1.919274731952721e-05, + "loss": 0.2332, + "step": 2697 + }, + { + "epoch": 0.16, + "grad_norm": 0.9189693213979191, + "learning_rate": 1.919201467441143e-05, + "loss": 0.629, + "step": 2698 + }, + { + "epoch": 0.16, + "grad_norm": 0.46368652568716096, + "learning_rate": 1.919128171097658e-05, + "loss": 0.3495, + "step": 2699 + }, + { + "epoch": 0.16, + "grad_norm": 0.594946940268688, + "learning_rate": 1.919054842924804e-05, + "loss": 0.341, + "step": 2700 + }, + { + "epoch": 0.16, + "grad_norm": 0.4819190307628047, + "learning_rate": 1.9189814829251204e-05, + "loss": 0.3548, + "step": 2701 + }, + { + "epoch": 0.16, + "grad_norm": 0.48033771181806284, + "learning_rate": 1.9189080911011474e-05, + "loss": 0.3313, + "step": 2702 + }, + { + "epoch": 0.16, + "grad_norm": 0.38210522382790274, + "learning_rate": 1.9188346674554267e-05, + "loss": 0.1974, + "step": 2703 + }, + { + "epoch": 0.16, + "grad_norm": 0.36886884191893365, + "learning_rate": 1.918761211990501e-05, + "loss": 0.2936, + "step": 2704 + }, + { + "epoch": 0.16, + "grad_norm": 0.47789844802984893, + "learning_rate": 1.918687724708914e-05, + "loss": 0.3213, + "step": 2705 + }, + { + "epoch": 0.16, + "grad_norm": 0.5360226368299564, + "learning_rate": 1.918614205613211e-05, + "loss": 0.3334, + "step": 2706 + }, + { + "epoch": 0.16, + "grad_norm": 0.878108287963956, + "learning_rate": 1.9185406547059367e-05, + "loss": 0.5888, + "step": 2707 + }, + { + "epoch": 0.16, + "grad_norm": 0.48964078753235296, + "learning_rate": 1.91846707198964e-05, + "loss": 0.3056, + "step": 2708 + }, + { + "epoch": 0.16, + "grad_norm": 0.4418702371752559, + "learning_rate": 1.9183934574668674e-05, + "loss": 0.3089, + "step": 2709 + }, + { + "epoch": 0.16, + "grad_norm": 0.4469509833893016, + "learning_rate": 1.918319811140169e-05, + "loss": 0.3066, + "step": 2710 + }, + { + "epoch": 0.16, + "grad_norm": 0.5447582634426448, + "learning_rate": 1.9182461330120952e-05, + "loss": 0.3778, + "step": 2711 + }, + { + "epoch": 0.16, + "grad_norm": 0.38486688340331743, + "learning_rate": 1.918172423085197e-05, + "loss": 0.3022, + "step": 2712 + }, + { + "epoch": 0.16, + "grad_norm": 0.3383846109182444, + "learning_rate": 1.9180986813620276e-05, + "loss": 0.2254, + "step": 2713 + }, + { + "epoch": 0.16, + "grad_norm": 0.5701648234097344, + "learning_rate": 1.9180249078451406e-05, + "loss": 0.3499, + "step": 2714 + }, + { + "epoch": 0.16, + "grad_norm": 0.4083474638888638, + "learning_rate": 1.9179511025370902e-05, + "loss": 0.3619, + "step": 2715 + }, + { + "epoch": 0.16, + "grad_norm": 0.41177540040311006, + "learning_rate": 1.9178772654404323e-05, + "loss": 0.3057, + "step": 2716 + }, + { + "epoch": 0.16, + "grad_norm": 0.3629090355785191, + "learning_rate": 1.9178033965577243e-05, + "loss": 0.2618, + "step": 2717 + }, + { + "epoch": 0.16, + "grad_norm": 0.5828011824693402, + "learning_rate": 1.9177294958915246e-05, + "loss": 0.4277, + "step": 2718 + }, + { + "epoch": 0.16, + "grad_norm": 0.44338515108899634, + "learning_rate": 1.9176555634443912e-05, + "loss": 0.1719, + "step": 2719 + }, + { + "epoch": 0.16, + "grad_norm": 0.3385222291686445, + "learning_rate": 1.9175815992188856e-05, + "loss": 0.2838, + "step": 2720 + }, + { + "epoch": 0.16, + "grad_norm": 0.5432606412549738, + "learning_rate": 1.9175076032175685e-05, + "loss": 0.3732, + "step": 2721 + }, + { + "epoch": 0.16, + "grad_norm": 0.5378990298129169, + "learning_rate": 1.9174335754430026e-05, + "loss": 0.4871, + "step": 2722 + }, + { + "epoch": 0.16, + "grad_norm": 0.544793784552123, + "learning_rate": 1.9173595158977515e-05, + "loss": 0.2266, + "step": 2723 + }, + { + "epoch": 0.16, + "grad_norm": 0.34300168100501205, + "learning_rate": 1.9172854245843796e-05, + "loss": 0.3132, + "step": 2724 + }, + { + "epoch": 0.16, + "grad_norm": 0.35438645648218475, + "learning_rate": 1.917211301505453e-05, + "loss": 0.2455, + "step": 2725 + }, + { + "epoch": 0.16, + "grad_norm": 0.37041691780405817, + "learning_rate": 1.9171371466635385e-05, + "loss": 0.1947, + "step": 2726 + }, + { + "epoch": 0.16, + "grad_norm": 0.41229526848301834, + "learning_rate": 1.9170629600612044e-05, + "loss": 0.3676, + "step": 2727 + }, + { + "epoch": 0.16, + "grad_norm": 0.3968209388434555, + "learning_rate": 1.916988741701019e-05, + "loss": 0.3847, + "step": 2728 + }, + { + "epoch": 0.16, + "grad_norm": 0.45513126229457, + "learning_rate": 1.9169144915855532e-05, + "loss": 0.1219, + "step": 2729 + }, + { + "epoch": 0.16, + "grad_norm": 0.4630940115074412, + "learning_rate": 1.9168402097173774e-05, + "loss": 0.3857, + "step": 2730 + }, + { + "epoch": 0.16, + "grad_norm": 0.31886087385947326, + "learning_rate": 1.916765896099065e-05, + "loss": 0.1737, + "step": 2731 + }, + { + "epoch": 0.16, + "grad_norm": 0.38274972925488704, + "learning_rate": 1.916691550733189e-05, + "loss": 0.2351, + "step": 2732 + }, + { + "epoch": 0.16, + "grad_norm": 0.3942346069361589, + "learning_rate": 1.9166171736223244e-05, + "loss": 0.3809, + "step": 2733 + }, + { + "epoch": 0.16, + "grad_norm": 0.7779362864741345, + "learning_rate": 1.9165427647690457e-05, + "loss": 0.5735, + "step": 2734 + }, + { + "epoch": 0.16, + "grad_norm": 0.7486116425794241, + "learning_rate": 1.916468324175931e-05, + "loss": 0.5144, + "step": 2735 + }, + { + "epoch": 0.16, + "grad_norm": 0.4035043380202906, + "learning_rate": 1.9163938518455577e-05, + "loss": 0.2557, + "step": 2736 + }, + { + "epoch": 0.16, + "grad_norm": 0.2841072985025231, + "learning_rate": 1.9163193477805042e-05, + "loss": 0.2075, + "step": 2737 + }, + { + "epoch": 0.16, + "grad_norm": 0.4792294242182115, + "learning_rate": 1.9162448119833515e-05, + "loss": 0.4132, + "step": 2738 + }, + { + "epoch": 0.16, + "grad_norm": 0.440562055294282, + "learning_rate": 1.9161702444566803e-05, + "loss": 0.3153, + "step": 2739 + }, + { + "epoch": 0.16, + "grad_norm": 0.43331696516541285, + "learning_rate": 1.9160956452030728e-05, + "loss": 0.3573, + "step": 2740 + }, + { + "epoch": 0.16, + "grad_norm": 0.7305226944821447, + "learning_rate": 1.9160210142251127e-05, + "loss": 0.4819, + "step": 2741 + }, + { + "epoch": 0.16, + "grad_norm": 0.39263699843117555, + "learning_rate": 1.9159463515253842e-05, + "loss": 0.2561, + "step": 2742 + }, + { + "epoch": 0.16, + "grad_norm": 0.28498177963286886, + "learning_rate": 1.9158716571064728e-05, + "loss": 0.2061, + "step": 2743 + }, + { + "epoch": 0.16, + "grad_norm": 0.509629190991603, + "learning_rate": 1.9157969309709656e-05, + "loss": 0.4165, + "step": 2744 + }, + { + "epoch": 0.16, + "grad_norm": 0.343301559511347, + "learning_rate": 1.9157221731214498e-05, + "loss": 0.2715, + "step": 2745 + }, + { + "epoch": 0.16, + "grad_norm": 0.9309225184678585, + "learning_rate": 1.9156473835605146e-05, + "loss": 0.6432, + "step": 2746 + }, + { + "epoch": 0.16, + "grad_norm": 0.8504928341821835, + "learning_rate": 1.9155725622907496e-05, + "loss": 0.5317, + "step": 2747 + }, + { + "epoch": 0.16, + "grad_norm": 0.3641216283918297, + "learning_rate": 1.9154977093147467e-05, + "loss": 0.2938, + "step": 2748 + }, + { + "epoch": 0.16, + "grad_norm": 0.36303716960804644, + "learning_rate": 1.915422824635097e-05, + "loss": 0.148, + "step": 2749 + }, + { + "epoch": 0.16, + "grad_norm": 0.6009779568927565, + "learning_rate": 1.9153479082543945e-05, + "loss": 0.4414, + "step": 2750 + }, + { + "epoch": 0.16, + "grad_norm": 0.3125761277927421, + "learning_rate": 1.9152729601752334e-05, + "loss": 0.2688, + "step": 2751 + }, + { + "epoch": 0.16, + "grad_norm": 0.4369004478121061, + "learning_rate": 1.9151979804002086e-05, + "loss": 0.2762, + "step": 2752 + }, + { + "epoch": 0.16, + "grad_norm": 0.8335443101192821, + "learning_rate": 1.9151229689319177e-05, + "loss": 0.5237, + "step": 2753 + }, + { + "epoch": 0.16, + "grad_norm": 0.41561726711084734, + "learning_rate": 1.9150479257729576e-05, + "loss": 0.3092, + "step": 2754 + }, + { + "epoch": 0.16, + "grad_norm": 0.9114026808149253, + "learning_rate": 1.9149728509259268e-05, + "loss": 0.3958, + "step": 2755 + }, + { + "epoch": 0.16, + "grad_norm": 0.37242842792677777, + "learning_rate": 1.9148977443934257e-05, + "loss": 0.313, + "step": 2756 + }, + { + "epoch": 0.16, + "grad_norm": 0.45442449929965356, + "learning_rate": 1.914822606178055e-05, + "loss": 0.328, + "step": 2757 + }, + { + "epoch": 0.16, + "grad_norm": 0.29522579252993075, + "learning_rate": 1.914747436282417e-05, + "loss": 0.1609, + "step": 2758 + }, + { + "epoch": 0.16, + "grad_norm": 0.5405467495766618, + "learning_rate": 1.9146722347091145e-05, + "loss": 0.385, + "step": 2759 + }, + { + "epoch": 0.16, + "grad_norm": 0.3952204533027063, + "learning_rate": 1.9145970014607517e-05, + "loss": 0.297, + "step": 2760 + }, + { + "epoch": 0.16, + "grad_norm": 1.4252943635395237, + "learning_rate": 1.914521736539934e-05, + "loss": 0.5132, + "step": 2761 + }, + { + "epoch": 0.16, + "grad_norm": 0.42261706849570274, + "learning_rate": 1.9144464399492682e-05, + "loss": 0.2417, + "step": 2762 + }, + { + "epoch": 0.16, + "grad_norm": 0.3112984251374848, + "learning_rate": 1.9143711116913614e-05, + "loss": 0.2201, + "step": 2763 + }, + { + "epoch": 0.16, + "grad_norm": 0.5503114825951091, + "learning_rate": 1.9142957517688226e-05, + "loss": 0.3508, + "step": 2764 + }, + { + "epoch": 0.16, + "grad_norm": 2.290525038080281, + "learning_rate": 1.9142203601842607e-05, + "loss": 0.626, + "step": 2765 + }, + { + "epoch": 0.16, + "grad_norm": 0.4340454110617708, + "learning_rate": 1.9141449369402873e-05, + "loss": 0.305, + "step": 2766 + }, + { + "epoch": 0.16, + "grad_norm": 1.0459985522737103, + "learning_rate": 1.914069482039514e-05, + "loss": 0.5296, + "step": 2767 + }, + { + "epoch": 0.16, + "grad_norm": 0.4969749645117052, + "learning_rate": 1.913993995484554e-05, + "loss": 0.2863, + "step": 2768 + }, + { + "epoch": 0.16, + "grad_norm": 0.40985606509878225, + "learning_rate": 1.913918477278021e-05, + "loss": 0.2643, + "step": 2769 + }, + { + "epoch": 0.16, + "grad_norm": 0.6034420793845431, + "learning_rate": 1.9138429274225306e-05, + "loss": 0.3003, + "step": 2770 + }, + { + "epoch": 0.16, + "grad_norm": 0.9476623773086797, + "learning_rate": 1.913767345920699e-05, + "loss": 0.2913, + "step": 2771 + }, + { + "epoch": 0.16, + "grad_norm": 0.7160270392478728, + "learning_rate": 1.9136917327751433e-05, + "loss": 0.3226, + "step": 2772 + }, + { + "epoch": 0.16, + "grad_norm": 1.8619252386421343, + "learning_rate": 1.913616087988482e-05, + "loss": 0.4995, + "step": 2773 + }, + { + "epoch": 0.16, + "grad_norm": 1.1457060818764169, + "learning_rate": 1.9135404115633354e-05, + "loss": 0.6252, + "step": 2774 + }, + { + "epoch": 0.16, + "grad_norm": 0.4366886822707938, + "learning_rate": 1.9134647035023233e-05, + "loss": 0.2177, + "step": 2775 + }, + { + "epoch": 0.16, + "grad_norm": 0.37860818327377915, + "learning_rate": 1.913388963808068e-05, + "loss": 0.2423, + "step": 2776 + }, + { + "epoch": 0.16, + "grad_norm": 0.9684838256494043, + "learning_rate": 1.9133131924831917e-05, + "loss": 0.4987, + "step": 2777 + }, + { + "epoch": 0.16, + "grad_norm": 0.8367814228740066, + "learning_rate": 1.9132373895303193e-05, + "loss": 0.2736, + "step": 2778 + }, + { + "epoch": 0.16, + "grad_norm": 1.2098389822749138, + "learning_rate": 1.9131615549520752e-05, + "loss": 0.4306, + "step": 2779 + }, + { + "epoch": 0.16, + "grad_norm": 0.8197725081398024, + "learning_rate": 1.913085688751086e-05, + "loss": 0.3871, + "step": 2780 + }, + { + "epoch": 0.16, + "grad_norm": 0.49208863169300127, + "learning_rate": 1.913009790929978e-05, + "loss": 0.2306, + "step": 2781 + }, + { + "epoch": 0.16, + "grad_norm": 0.3526381294259932, + "learning_rate": 1.9129338614913808e-05, + "loss": 0.2522, + "step": 2782 + }, + { + "epoch": 0.16, + "grad_norm": 0.4756058297463948, + "learning_rate": 1.912857900437923e-05, + "loss": 0.326, + "step": 2783 + }, + { + "epoch": 0.16, + "grad_norm": 0.45837754491678995, + "learning_rate": 1.9127819077722353e-05, + "loss": 0.3238, + "step": 2784 + }, + { + "epoch": 0.16, + "grad_norm": 1.4324033883404848, + "learning_rate": 1.9127058834969494e-05, + "loss": 0.4619, + "step": 2785 + }, + { + "epoch": 0.16, + "grad_norm": 1.0480337876957362, + "learning_rate": 1.9126298276146982e-05, + "loss": 0.5592, + "step": 2786 + }, + { + "epoch": 0.16, + "grad_norm": 0.41564243767044307, + "learning_rate": 1.912553740128115e-05, + "loss": 0.2959, + "step": 2787 + }, + { + "epoch": 0.16, + "grad_norm": 0.2892471983222418, + "learning_rate": 1.9124776210398354e-05, + "loss": 0.1853, + "step": 2788 + }, + { + "epoch": 0.16, + "grad_norm": 0.7875199409762493, + "learning_rate": 1.9124014703524946e-05, + "loss": 0.5366, + "step": 2789 + }, + { + "epoch": 0.16, + "grad_norm": 0.4263502403630036, + "learning_rate": 1.9123252880687303e-05, + "loss": 0.3377, + "step": 2790 + }, + { + "epoch": 0.16, + "grad_norm": 0.5112805395238533, + "learning_rate": 1.9122490741911806e-05, + "loss": 0.2663, + "step": 2791 + }, + { + "epoch": 0.16, + "grad_norm": 0.525359587134657, + "learning_rate": 1.9121728287224844e-05, + "loss": 0.4025, + "step": 2792 + }, + { + "epoch": 0.16, + "grad_norm": 0.4425050759597999, + "learning_rate": 1.9120965516652828e-05, + "loss": 0.2785, + "step": 2793 + }, + { + "epoch": 0.16, + "grad_norm": 0.2788256255576857, + "learning_rate": 1.912020243022217e-05, + "loss": 0.1893, + "step": 2794 + }, + { + "epoch": 0.16, + "grad_norm": 0.47566417935386807, + "learning_rate": 1.911943902795929e-05, + "loss": 0.3708, + "step": 2795 + }, + { + "epoch": 0.16, + "grad_norm": 0.47443133106229063, + "learning_rate": 1.9118675309890628e-05, + "loss": 0.3175, + "step": 2796 + }, + { + "epoch": 0.16, + "grad_norm": 0.5872484241415183, + "learning_rate": 1.911791127604263e-05, + "loss": 0.4337, + "step": 2797 + }, + { + "epoch": 0.16, + "grad_norm": 0.9705192208886395, + "learning_rate": 1.9117146926441757e-05, + "loss": 0.3478, + "step": 2798 + }, + { + "epoch": 0.16, + "grad_norm": 0.3903267981435643, + "learning_rate": 1.9116382261114484e-05, + "loss": 0.2852, + "step": 2799 + }, + { + "epoch": 0.16, + "grad_norm": 0.3265402016275641, + "learning_rate": 1.911561728008728e-05, + "loss": 0.2788, + "step": 2800 + }, + { + "epoch": 0.16, + "grad_norm": 0.7350955291555213, + "learning_rate": 1.9114851983386646e-05, + "loss": 0.4241, + "step": 2801 + }, + { + "epoch": 0.16, + "grad_norm": 0.4414743371523098, + "learning_rate": 1.9114086371039078e-05, + "loss": 0.3136, + "step": 2802 + }, + { + "epoch": 0.16, + "grad_norm": 0.4236622091499315, + "learning_rate": 1.911332044307109e-05, + "loss": 0.3239, + "step": 2803 + }, + { + "epoch": 0.16, + "grad_norm": 0.5945493542557609, + "learning_rate": 1.9112554199509207e-05, + "loss": 0.3062, + "step": 2804 + }, + { + "epoch": 0.16, + "grad_norm": 0.4080833519256838, + "learning_rate": 1.911178764037996e-05, + "loss": 0.2928, + "step": 2805 + }, + { + "epoch": 0.16, + "grad_norm": 0.803133176210868, + "learning_rate": 1.9111020765709905e-05, + "loss": 0.5323, + "step": 2806 + }, + { + "epoch": 0.16, + "grad_norm": 0.4639683487323455, + "learning_rate": 1.9110253575525593e-05, + "loss": 0.3419, + "step": 2807 + }, + { + "epoch": 0.16, + "grad_norm": 0.48221297823220266, + "learning_rate": 1.910948606985359e-05, + "loss": 0.3114, + "step": 2808 + }, + { + "epoch": 0.16, + "grad_norm": 0.4188949921186524, + "learning_rate": 1.9108718248720472e-05, + "loss": 0.2776, + "step": 2809 + }, + { + "epoch": 0.16, + "grad_norm": 0.4180698257150617, + "learning_rate": 1.9107950112152838e-05, + "loss": 0.2458, + "step": 2810 + }, + { + "epoch": 0.16, + "grad_norm": 0.33142474909538355, + "learning_rate": 1.910718166017728e-05, + "loss": 0.2246, + "step": 2811 + }, + { + "epoch": 0.16, + "grad_norm": 0.3976551718650196, + "learning_rate": 1.910641289282041e-05, + "loss": 0.3871, + "step": 2812 + }, + { + "epoch": 0.16, + "grad_norm": 0.754813347285477, + "learning_rate": 1.910564381010886e-05, + "loss": 0.5863, + "step": 2813 + }, + { + "epoch": 0.16, + "grad_norm": 0.4022736855798634, + "learning_rate": 1.9104874412069253e-05, + "loss": 0.2171, + "step": 2814 + }, + { + "epoch": 0.16, + "grad_norm": 0.32538048947625997, + "learning_rate": 1.9104104698728235e-05, + "loss": 0.2844, + "step": 2815 + }, + { + "epoch": 0.16, + "grad_norm": 0.4742319547411235, + "learning_rate": 1.9103334670112468e-05, + "loss": 0.3209, + "step": 2816 + }, + { + "epoch": 0.16, + "grad_norm": 0.3539327872330429, + "learning_rate": 1.9102564326248608e-05, + "loss": 0.1927, + "step": 2817 + }, + { + "epoch": 0.16, + "grad_norm": 0.5182586066974086, + "learning_rate": 1.910179366716334e-05, + "loss": 0.4189, + "step": 2818 + }, + { + "epoch": 0.16, + "grad_norm": 0.46291557914978526, + "learning_rate": 1.9101022692883348e-05, + "loss": 0.3427, + "step": 2819 + }, + { + "epoch": 0.16, + "grad_norm": 0.3223453767011489, + "learning_rate": 1.910025140343533e-05, + "loss": 0.2285, + "step": 2820 + }, + { + "epoch": 0.16, + "grad_norm": 0.4671981056244132, + "learning_rate": 1.9099479798845997e-05, + "loss": 0.2945, + "step": 2821 + }, + { + "epoch": 0.16, + "grad_norm": 0.48019689418661415, + "learning_rate": 1.9098707879142072e-05, + "loss": 0.3136, + "step": 2822 + }, + { + "epoch": 0.16, + "grad_norm": 0.4030849015440731, + "learning_rate": 1.9097935644350284e-05, + "loss": 0.2743, + "step": 2823 + }, + { + "epoch": 0.16, + "grad_norm": 0.5656401007116433, + "learning_rate": 1.9097163094497374e-05, + "loss": 0.349, + "step": 2824 + }, + { + "epoch": 0.16, + "grad_norm": 0.6376831416506765, + "learning_rate": 1.9096390229610095e-05, + "loss": 0.557, + "step": 2825 + }, + { + "epoch": 0.16, + "grad_norm": 0.4153453663473684, + "learning_rate": 1.9095617049715217e-05, + "loss": 0.3185, + "step": 2826 + }, + { + "epoch": 0.16, + "grad_norm": 0.34493890499970015, + "learning_rate": 1.9094843554839513e-05, + "loss": 0.2722, + "step": 2827 + }, + { + "epoch": 0.16, + "grad_norm": 0.3024656467389478, + "learning_rate": 1.9094069745009766e-05, + "loss": 0.2226, + "step": 2828 + }, + { + "epoch": 0.16, + "grad_norm": 0.7295724146360978, + "learning_rate": 1.9093295620252776e-05, + "loss": 0.4354, + "step": 2829 + }, + { + "epoch": 0.16, + "grad_norm": 0.3640806422459361, + "learning_rate": 1.9092521180595347e-05, + "loss": 0.2594, + "step": 2830 + }, + { + "epoch": 0.16, + "grad_norm": 0.3788662395591776, + "learning_rate": 1.9091746426064303e-05, + "loss": 0.3483, + "step": 2831 + }, + { + "epoch": 0.16, + "grad_norm": 0.5866543462727462, + "learning_rate": 1.9090971356686473e-05, + "loss": 0.3439, + "step": 2832 + }, + { + "epoch": 0.16, + "grad_norm": 0.2907543494961357, + "learning_rate": 1.909019597248869e-05, + "loss": 0.2154, + "step": 2833 + }, + { + "epoch": 0.16, + "grad_norm": 0.440487120983383, + "learning_rate": 1.9089420273497813e-05, + "loss": 0.2879, + "step": 2834 + }, + { + "epoch": 0.16, + "grad_norm": 0.3480069803830963, + "learning_rate": 1.9088644259740708e-05, + "loss": 0.3217, + "step": 2835 + }, + { + "epoch": 0.16, + "grad_norm": 0.5148676360714839, + "learning_rate": 1.9087867931244238e-05, + "loss": 0.364, + "step": 2836 + }, + { + "epoch": 0.16, + "grad_norm": 0.9445440875281329, + "learning_rate": 1.9087091288035293e-05, + "loss": 0.4219, + "step": 2837 + }, + { + "epoch": 0.16, + "grad_norm": 0.6342925351854132, + "learning_rate": 1.908631433014077e-05, + "loss": 0.4129, + "step": 2838 + }, + { + "epoch": 0.16, + "grad_norm": 0.32986836933112085, + "learning_rate": 1.9085537057587568e-05, + "loss": 0.3206, + "step": 2839 + }, + { + "epoch": 0.16, + "grad_norm": 0.5347354807326525, + "learning_rate": 1.9084759470402612e-05, + "loss": 0.247, + "step": 2840 + }, + { + "epoch": 0.16, + "grad_norm": 0.4500849622065172, + "learning_rate": 1.9083981568612828e-05, + "loss": 0.3415, + "step": 2841 + }, + { + "epoch": 0.16, + "grad_norm": 0.4047582580502225, + "learning_rate": 1.9083203352245148e-05, + "loss": 0.255, + "step": 2842 + }, + { + "epoch": 0.16, + "grad_norm": 0.4042265707783651, + "learning_rate": 1.9082424821326532e-05, + "loss": 0.3031, + "step": 2843 + }, + { + "epoch": 0.16, + "grad_norm": 0.4400449057120318, + "learning_rate": 1.9081645975883928e-05, + "loss": 0.325, + "step": 2844 + }, + { + "epoch": 0.16, + "grad_norm": 0.5280907990199353, + "learning_rate": 1.908086681594432e-05, + "loss": 0.3971, + "step": 2845 + }, + { + "epoch": 0.16, + "grad_norm": 0.4402381574442051, + "learning_rate": 1.908008734153468e-05, + "loss": 0.3617, + "step": 2846 + }, + { + "epoch": 0.16, + "grad_norm": 0.34365348876661916, + "learning_rate": 1.9079307552682013e-05, + "loss": 0.2332, + "step": 2847 + }, + { + "epoch": 0.16, + "grad_norm": 0.3630634485960283, + "learning_rate": 1.907852744941331e-05, + "loss": 0.2653, + "step": 2848 + }, + { + "epoch": 0.16, + "grad_norm": 0.9110965112820536, + "learning_rate": 1.9077747031755594e-05, + "loss": 0.6379, + "step": 2849 + }, + { + "epoch": 0.16, + "grad_norm": 0.4468249874232147, + "learning_rate": 1.9076966299735887e-05, + "loss": 0.1208, + "step": 2850 + }, + { + "epoch": 0.16, + "grad_norm": 0.3606662643476382, + "learning_rate": 1.9076185253381227e-05, + "loss": 0.2985, + "step": 2851 + }, + { + "epoch": 0.16, + "grad_norm": 0.6445782349707652, + "learning_rate": 1.9075403892718664e-05, + "loss": 0.533, + "step": 2852 + }, + { + "epoch": 0.16, + "grad_norm": 0.3657364361197772, + "learning_rate": 1.9074622217775253e-05, + "loss": 0.1565, + "step": 2853 + }, + { + "epoch": 0.16, + "grad_norm": 0.352593659165976, + "learning_rate": 1.9073840228578068e-05, + "loss": 0.2859, + "step": 2854 + }, + { + "epoch": 0.16, + "grad_norm": 0.4152158108112304, + "learning_rate": 1.9073057925154184e-05, + "loss": 0.3108, + "step": 2855 + }, + { + "epoch": 0.16, + "grad_norm": 0.5767800434899424, + "learning_rate": 1.9072275307530692e-05, + "loss": 0.2169, + "step": 2856 + }, + { + "epoch": 0.16, + "grad_norm": 0.406189229853736, + "learning_rate": 1.9071492375734698e-05, + "loss": 0.3657, + "step": 2857 + }, + { + "epoch": 0.16, + "grad_norm": 0.7434134856812202, + "learning_rate": 1.9070709129793313e-05, + "loss": 0.536, + "step": 2858 + }, + { + "epoch": 0.16, + "grad_norm": 0.30162267512857593, + "learning_rate": 1.906992556973366e-05, + "loss": 0.1912, + "step": 2859 + }, + { + "epoch": 0.16, + "grad_norm": 0.360668147936046, + "learning_rate": 1.906914169558288e-05, + "loss": 0.2589, + "step": 2860 + }, + { + "epoch": 0.16, + "grad_norm": 0.8302107796578109, + "learning_rate": 1.9068357507368108e-05, + "loss": 0.6726, + "step": 2861 + }, + { + "epoch": 0.16, + "grad_norm": 0.43014673007102716, + "learning_rate": 1.9067573005116506e-05, + "loss": 0.3113, + "step": 2862 + }, + { + "epoch": 0.16, + "grad_norm": 0.351210921130384, + "learning_rate": 1.9066788188855237e-05, + "loss": 0.2871, + "step": 2863 + }, + { + "epoch": 0.16, + "grad_norm": 0.8047851678299188, + "learning_rate": 1.906600305861149e-05, + "loss": 0.4959, + "step": 2864 + }, + { + "epoch": 0.16, + "grad_norm": 0.47323207583686905, + "learning_rate": 1.906521761441244e-05, + "loss": 0.2408, + "step": 2865 + }, + { + "epoch": 0.16, + "grad_norm": 0.3269727310027329, + "learning_rate": 1.90644318562853e-05, + "loss": 0.1871, + "step": 2866 + }, + { + "epoch": 0.16, + "grad_norm": 0.4722268465786524, + "learning_rate": 1.9063645784257274e-05, + "loss": 0.3434, + "step": 2867 + }, + { + "epoch": 0.16, + "grad_norm": 0.8141264243435448, + "learning_rate": 1.906285939835558e-05, + "loss": 0.5588, + "step": 2868 + }, + { + "epoch": 0.16, + "grad_norm": 0.3784784877528985, + "learning_rate": 1.9062072698607457e-05, + "loss": 0.2705, + "step": 2869 + }, + { + "epoch": 0.16, + "grad_norm": 0.6574080394539428, + "learning_rate": 1.9061285685040148e-05, + "loss": 0.3812, + "step": 2870 + }, + { + "epoch": 0.16, + "grad_norm": 0.4694759377072518, + "learning_rate": 1.9060498357680905e-05, + "loss": 0.3247, + "step": 2871 + }, + { + "epoch": 0.17, + "grad_norm": 0.3590925137545595, + "learning_rate": 1.905971071655699e-05, + "loss": 0.243, + "step": 2872 + }, + { + "epoch": 0.17, + "grad_norm": 0.43092525195270276, + "learning_rate": 1.9058922761695684e-05, + "loss": 0.1675, + "step": 2873 + }, + { + "epoch": 0.17, + "grad_norm": 0.509367055075115, + "learning_rate": 1.9058134493124275e-05, + "loss": 0.3637, + "step": 2874 + }, + { + "epoch": 0.17, + "grad_norm": 0.3456206104797771, + "learning_rate": 1.9057345910870054e-05, + "loss": 0.3076, + "step": 2875 + }, + { + "epoch": 0.17, + "grad_norm": 1.0697084543942041, + "learning_rate": 1.905655701496034e-05, + "loss": 0.3674, + "step": 2876 + }, + { + "epoch": 0.17, + "grad_norm": 0.40182912681545313, + "learning_rate": 1.9055767805422438e-05, + "loss": 0.2836, + "step": 2877 + }, + { + "epoch": 0.17, + "grad_norm": 0.473138542860798, + "learning_rate": 1.905497828228369e-05, + "loss": 0.3122, + "step": 2878 + }, + { + "epoch": 0.17, + "grad_norm": 0.3813765291907576, + "learning_rate": 1.9054188445571435e-05, + "loss": 0.2404, + "step": 2879 + }, + { + "epoch": 0.17, + "grad_norm": 0.7796531744569375, + "learning_rate": 1.905339829531302e-05, + "loss": 0.448, + "step": 2880 + }, + { + "epoch": 0.17, + "grad_norm": 0.42795114627019815, + "learning_rate": 1.9052607831535812e-05, + "loss": 0.2856, + "step": 2881 + }, + { + "epoch": 0.17, + "grad_norm": 0.4012186112644199, + "learning_rate": 1.9051817054267184e-05, + "loss": 0.3072, + "step": 2882 + }, + { + "epoch": 0.17, + "grad_norm": 0.748819417416899, + "learning_rate": 1.9051025963534526e-05, + "loss": 0.4127, + "step": 2883 + }, + { + "epoch": 0.17, + "grad_norm": 0.41468273696256386, + "learning_rate": 1.9050234559365223e-05, + "loss": 0.3054, + "step": 2884 + }, + { + "epoch": 0.17, + "grad_norm": 0.5002904393428474, + "learning_rate": 1.904944284178669e-05, + "loss": 0.3061, + "step": 2885 + }, + { + "epoch": 0.17, + "grad_norm": 0.4156710785103392, + "learning_rate": 1.9048650810826333e-05, + "loss": 0.3083, + "step": 2886 + }, + { + "epoch": 0.17, + "grad_norm": 0.40806478413793273, + "learning_rate": 1.9047858466511594e-05, + "loss": 0.3049, + "step": 2887 + }, + { + "epoch": 0.17, + "grad_norm": 0.4128113081838106, + "learning_rate": 1.9047065808869902e-05, + "loss": 0.2899, + "step": 2888 + }, + { + "epoch": 0.17, + "grad_norm": 0.3442654178253383, + "learning_rate": 1.9046272837928713e-05, + "loss": 0.1134, + "step": 2889 + }, + { + "epoch": 0.17, + "grad_norm": 0.3907830303341359, + "learning_rate": 1.9045479553715482e-05, + "loss": 0.2846, + "step": 2890 + }, + { + "epoch": 0.17, + "grad_norm": 0.5828855253121485, + "learning_rate": 1.9044685956257686e-05, + "loss": 0.4289, + "step": 2891 + }, + { + "epoch": 0.17, + "grad_norm": 0.514484579636209, + "learning_rate": 1.9043892045582804e-05, + "loss": 0.3811, + "step": 2892 + }, + { + "epoch": 0.17, + "grad_norm": 0.3266743076958912, + "learning_rate": 1.9043097821718327e-05, + "loss": 0.2815, + "step": 2893 + }, + { + "epoch": 0.17, + "grad_norm": 0.5032500743515141, + "learning_rate": 1.9042303284691762e-05, + "loss": 0.3833, + "step": 2894 + }, + { + "epoch": 0.17, + "grad_norm": 0.4345386460910853, + "learning_rate": 1.9041508434530622e-05, + "loss": 0.2607, + "step": 2895 + }, + { + "epoch": 0.17, + "grad_norm": 0.41878196971218673, + "learning_rate": 1.9040713271262438e-05, + "loss": 0.2993, + "step": 2896 + }, + { + "epoch": 0.17, + "grad_norm": 0.8099840083560745, + "learning_rate": 1.9039917794914736e-05, + "loss": 0.5032, + "step": 2897 + }, + { + "epoch": 0.17, + "grad_norm": 0.42788121734475587, + "learning_rate": 1.9039122005515074e-05, + "loss": 0.3714, + "step": 2898 + }, + { + "epoch": 0.17, + "grad_norm": 0.3654518107930367, + "learning_rate": 1.9038325903091003e-05, + "loss": 0.218, + "step": 2899 + }, + { + "epoch": 0.17, + "grad_norm": 0.37284915313610173, + "learning_rate": 1.90375294876701e-05, + "loss": 0.2112, + "step": 2900 + }, + { + "epoch": 0.17, + "grad_norm": 1.25242807424076, + "learning_rate": 1.9036732759279935e-05, + "loss": 0.6335, + "step": 2901 + }, + { + "epoch": 0.17, + "grad_norm": 0.3850326090104052, + "learning_rate": 1.9035935717948102e-05, + "loss": 0.2102, + "step": 2902 + }, + { + "epoch": 0.17, + "grad_norm": 0.6094638643634038, + "learning_rate": 1.9035138363702206e-05, + "loss": 0.392, + "step": 2903 + }, + { + "epoch": 0.17, + "grad_norm": 0.9291079961641789, + "learning_rate": 1.9034340696569858e-05, + "loss": 0.635, + "step": 2904 + }, + { + "epoch": 0.17, + "grad_norm": 0.6338307726280604, + "learning_rate": 1.9033542716578677e-05, + "loss": 0.248, + "step": 2905 + }, + { + "epoch": 0.17, + "grad_norm": 0.3460124838063629, + "learning_rate": 1.90327444237563e-05, + "loss": 0.2289, + "step": 2906 + }, + { + "epoch": 0.17, + "grad_norm": 1.568956704643555, + "learning_rate": 1.9031945818130373e-05, + "loss": 0.7279, + "step": 2907 + }, + { + "epoch": 0.17, + "grad_norm": 0.4521370738515677, + "learning_rate": 1.9031146899728555e-05, + "loss": 0.2614, + "step": 2908 + }, + { + "epoch": 0.17, + "grad_norm": 0.9225487734393911, + "learning_rate": 1.9030347668578506e-05, + "loss": 0.5389, + "step": 2909 + }, + { + "epoch": 0.17, + "grad_norm": 0.5007109970727628, + "learning_rate": 1.90295481247079e-05, + "loss": 0.3537, + "step": 2910 + }, + { + "epoch": 0.17, + "grad_norm": 0.3697140213680634, + "learning_rate": 1.902874826814444e-05, + "loss": 0.2837, + "step": 2911 + }, + { + "epoch": 0.17, + "grad_norm": 0.25441038379523395, + "learning_rate": 1.902794809891581e-05, + "loss": 0.1146, + "step": 2912 + }, + { + "epoch": 0.17, + "grad_norm": 0.9960926038338246, + "learning_rate": 1.9027147617049727e-05, + "loss": 0.5207, + "step": 2913 + }, + { + "epoch": 0.17, + "grad_norm": 0.49115998471316646, + "learning_rate": 1.9026346822573906e-05, + "loss": 0.2785, + "step": 2914 + }, + { + "epoch": 0.17, + "grad_norm": 0.6090023738796487, + "learning_rate": 1.902554571551609e-05, + "loss": 0.3221, + "step": 2915 + }, + { + "epoch": 0.17, + "grad_norm": 1.1039710139860757, + "learning_rate": 1.902474429590401e-05, + "loss": 0.735, + "step": 2916 + }, + { + "epoch": 0.17, + "grad_norm": 0.49261792202989546, + "learning_rate": 1.9023942563765422e-05, + "loss": 0.2849, + "step": 2917 + }, + { + "epoch": 0.17, + "grad_norm": 0.28069746124257844, + "learning_rate": 1.9023140519128093e-05, + "loss": 0.1992, + "step": 2918 + }, + { + "epoch": 0.17, + "grad_norm": 1.3144734644202736, + "learning_rate": 1.9022338162019794e-05, + "loss": 0.5199, + "step": 2919 + }, + { + "epoch": 0.17, + "grad_norm": 0.5300530829580868, + "learning_rate": 1.9021535492468313e-05, + "loss": 0.3645, + "step": 2920 + }, + { + "epoch": 0.17, + "grad_norm": 0.5599830716860857, + "learning_rate": 1.9020732510501445e-05, + "loss": 0.3213, + "step": 2921 + }, + { + "epoch": 0.17, + "grad_norm": 0.4318688829311013, + "learning_rate": 1.9019929216147002e-05, + "loss": 0.3321, + "step": 2922 + }, + { + "epoch": 0.17, + "grad_norm": 0.40200132881097284, + "learning_rate": 1.9019125609432793e-05, + "loss": 0.2933, + "step": 2923 + }, + { + "epoch": 0.17, + "grad_norm": 0.35823983826684375, + "learning_rate": 1.9018321690386656e-05, + "loss": 0.2411, + "step": 2924 + }, + { + "epoch": 0.17, + "grad_norm": 1.2091907994532067, + "learning_rate": 1.9017517459036426e-05, + "loss": 0.4403, + "step": 2925 + }, + { + "epoch": 0.17, + "grad_norm": 0.44885633440024797, + "learning_rate": 1.9016712915409953e-05, + "loss": 0.2987, + "step": 2926 + }, + { + "epoch": 0.17, + "grad_norm": 0.6097786376505031, + "learning_rate": 1.90159080595351e-05, + "loss": 0.3783, + "step": 2927 + }, + { + "epoch": 0.17, + "grad_norm": 1.4166969282231294, + "learning_rate": 1.901510289143974e-05, + "loss": 0.5974, + "step": 2928 + }, + { + "epoch": 0.17, + "grad_norm": 0.39087853057358346, + "learning_rate": 1.901429741115175e-05, + "loss": 0.2789, + "step": 2929 + }, + { + "epoch": 0.17, + "grad_norm": 0.4707325846249934, + "learning_rate": 1.901349161869903e-05, + "loss": 0.3342, + "step": 2930 + }, + { + "epoch": 0.17, + "grad_norm": 0.5044461165624786, + "learning_rate": 1.9012685514109487e-05, + "loss": 0.2705, + "step": 2931 + }, + { + "epoch": 0.17, + "grad_norm": 0.446221372568942, + "learning_rate": 1.9011879097411028e-05, + "loss": 0.2971, + "step": 2932 + }, + { + "epoch": 0.17, + "grad_norm": 0.7000113910533794, + "learning_rate": 1.9011072368631586e-05, + "loss": 0.4128, + "step": 2933 + }, + { + "epoch": 0.17, + "grad_norm": 0.6515818280851625, + "learning_rate": 1.9010265327799092e-05, + "loss": 0.3242, + "step": 2934 + }, + { + "epoch": 0.17, + "grad_norm": 0.4527641519593625, + "learning_rate": 1.90094579749415e-05, + "loss": 0.271, + "step": 2935 + }, + { + "epoch": 0.17, + "grad_norm": 0.34571287724653443, + "learning_rate": 1.9008650310086768e-05, + "loss": 0.3034, + "step": 2936 + }, + { + "epoch": 0.17, + "grad_norm": 0.3554877938716127, + "learning_rate": 1.900784233326286e-05, + "loss": 0.3289, + "step": 2937 + }, + { + "epoch": 0.17, + "grad_norm": 0.4412511425860835, + "learning_rate": 1.9007034044497757e-05, + "loss": 0.2159, + "step": 2938 + }, + { + "epoch": 0.17, + "grad_norm": 0.7452586967068775, + "learning_rate": 1.9006225443819456e-05, + "loss": 0.4174, + "step": 2939 + }, + { + "epoch": 0.17, + "grad_norm": 1.353204527219416, + "learning_rate": 1.900541653125595e-05, + "loss": 0.8481, + "step": 2940 + }, + { + "epoch": 0.17, + "grad_norm": 0.4632062818338658, + "learning_rate": 1.9004607306835263e-05, + "loss": 0.218, + "step": 2941 + }, + { + "epoch": 0.17, + "grad_norm": 0.5341120108017874, + "learning_rate": 1.900379777058541e-05, + "loss": 0.3449, + "step": 2942 + }, + { + "epoch": 0.17, + "grad_norm": 0.9366471498287889, + "learning_rate": 1.9002987922534427e-05, + "loss": 0.5459, + "step": 2943 + }, + { + "epoch": 0.17, + "grad_norm": 0.27741904876939394, + "learning_rate": 1.900217776271036e-05, + "loss": 0.1587, + "step": 2944 + }, + { + "epoch": 0.17, + "grad_norm": 1.3941750294590425, + "learning_rate": 1.9001367291141264e-05, + "loss": 0.6409, + "step": 2945 + }, + { + "epoch": 0.17, + "grad_norm": 0.48903414699985137, + "learning_rate": 1.9000556507855204e-05, + "loss": 0.353, + "step": 2946 + }, + { + "epoch": 0.17, + "grad_norm": 0.4693605982996768, + "learning_rate": 1.8999745412880264e-05, + "loss": 0.3303, + "step": 2947 + }, + { + "epoch": 0.17, + "grad_norm": 0.5832080658668408, + "learning_rate": 1.8998934006244522e-05, + "loss": 0.3351, + "step": 2948 + }, + { + "epoch": 0.17, + "grad_norm": 0.5745640754829965, + "learning_rate": 1.8998122287976085e-05, + "loss": 0.4208, + "step": 2949 + }, + { + "epoch": 0.17, + "grad_norm": 0.3367288088164626, + "learning_rate": 1.899731025810306e-05, + "loss": 0.2534, + "step": 2950 + }, + { + "epoch": 0.17, + "grad_norm": 0.38374292249387476, + "learning_rate": 1.8996497916653565e-05, + "loss": 0.1868, + "step": 2951 + }, + { + "epoch": 0.17, + "grad_norm": 1.4710186990718137, + "learning_rate": 1.899568526365574e-05, + "loss": 0.8454, + "step": 2952 + }, + { + "epoch": 0.17, + "grad_norm": 0.6019236351354754, + "learning_rate": 1.8994872299137715e-05, + "loss": 0.462, + "step": 2953 + }, + { + "epoch": 0.17, + "grad_norm": 0.3551118807958904, + "learning_rate": 1.8994059023127655e-05, + "loss": 0.2787, + "step": 2954 + }, + { + "epoch": 0.17, + "grad_norm": 0.4704132793170269, + "learning_rate": 1.899324543565371e-05, + "loss": 0.3093, + "step": 2955 + }, + { + "epoch": 0.17, + "grad_norm": 0.37749498137552984, + "learning_rate": 1.899243153674407e-05, + "loss": 0.1934, + "step": 2956 + }, + { + "epoch": 0.17, + "grad_norm": 0.4254951358246995, + "learning_rate": 1.8991617326426907e-05, + "loss": 0.2868, + "step": 2957 + }, + { + "epoch": 0.17, + "grad_norm": 0.6753680434296535, + "learning_rate": 1.8990802804730424e-05, + "loss": 0.4295, + "step": 2958 + }, + { + "epoch": 0.17, + "grad_norm": 0.7358847969565454, + "learning_rate": 1.8989987971682828e-05, + "loss": 0.3888, + "step": 2959 + }, + { + "epoch": 0.17, + "grad_norm": 0.40847603026118584, + "learning_rate": 1.8989172827312337e-05, + "loss": 0.3701, + "step": 2960 + }, + { + "epoch": 0.17, + "grad_norm": 0.5456246498843511, + "learning_rate": 1.8988357371647173e-05, + "loss": 0.2735, + "step": 2961 + }, + { + "epoch": 0.17, + "grad_norm": 0.31106052444760784, + "learning_rate": 1.8987541604715584e-05, + "loss": 0.2288, + "step": 2962 + }, + { + "epoch": 0.17, + "grad_norm": 0.36068213668776067, + "learning_rate": 1.898672552654581e-05, + "loss": 0.2461, + "step": 2963 + }, + { + "epoch": 0.17, + "grad_norm": 0.9711275231098498, + "learning_rate": 1.8985909137166122e-05, + "loss": 0.4776, + "step": 2964 + }, + { + "epoch": 0.17, + "grad_norm": 0.3911974616653507, + "learning_rate": 1.8985092436604783e-05, + "loss": 0.329, + "step": 2965 + }, + { + "epoch": 0.17, + "grad_norm": 0.4923858502324332, + "learning_rate": 1.8984275424890085e-05, + "loss": 0.3115, + "step": 2966 + }, + { + "epoch": 0.17, + "grad_norm": 0.5945187050292371, + "learning_rate": 1.8983458102050313e-05, + "loss": 0.4092, + "step": 2967 + }, + { + "epoch": 0.17, + "grad_norm": 0.24507124970481212, + "learning_rate": 1.8982640468113774e-05, + "loss": 0.197, + "step": 2968 + }, + { + "epoch": 0.17, + "grad_norm": 0.5955934559945066, + "learning_rate": 1.898182252310878e-05, + "loss": 0.4072, + "step": 2969 + }, + { + "epoch": 0.17, + "grad_norm": 0.429200057143055, + "learning_rate": 1.8981004267063658e-05, + "loss": 0.3242, + "step": 2970 + }, + { + "epoch": 0.17, + "grad_norm": 0.5983194080034205, + "learning_rate": 1.8980185700006744e-05, + "loss": 0.4249, + "step": 2971 + }, + { + "epoch": 0.17, + "grad_norm": 0.42932383596148743, + "learning_rate": 1.8979366821966386e-05, + "loss": 0.3293, + "step": 2972 + }, + { + "epoch": 0.17, + "grad_norm": 0.3954252758600323, + "learning_rate": 1.8978547632970943e-05, + "loss": 0.3225, + "step": 2973 + }, + { + "epoch": 0.17, + "grad_norm": 0.30943424037499556, + "learning_rate": 1.897772813304878e-05, + "loss": 0.0774, + "step": 2974 + }, + { + "epoch": 0.17, + "grad_norm": 0.2955268319884107, + "learning_rate": 1.8976908322228277e-05, + "loss": 0.2291, + "step": 2975 + }, + { + "epoch": 0.17, + "grad_norm": 0.8219325687989213, + "learning_rate": 1.897608820053783e-05, + "loss": 0.5307, + "step": 2976 + }, + { + "epoch": 0.17, + "grad_norm": 0.3811432509694952, + "learning_rate": 1.8975267768005828e-05, + "loss": 0.2864, + "step": 2977 + }, + { + "epoch": 0.17, + "grad_norm": 0.4285338176452864, + "learning_rate": 1.897444702466069e-05, + "loss": 0.3331, + "step": 2978 + }, + { + "epoch": 0.17, + "grad_norm": 0.9754019324573616, + "learning_rate": 1.897362597053084e-05, + "loss": 0.5513, + "step": 2979 + }, + { + "epoch": 0.17, + "grad_norm": 0.28062601130507825, + "learning_rate": 1.897280460564471e-05, + "loss": 0.1539, + "step": 2980 + }, + { + "epoch": 0.17, + "grad_norm": 0.5016667633376, + "learning_rate": 1.897198293003074e-05, + "loss": 0.3324, + "step": 2981 + }, + { + "epoch": 0.17, + "grad_norm": 0.5883370722958858, + "learning_rate": 1.8971160943717387e-05, + "loss": 0.3618, + "step": 2982 + }, + { + "epoch": 0.17, + "grad_norm": 0.7596055078716403, + "learning_rate": 1.8970338646733112e-05, + "loss": 0.3371, + "step": 2983 + }, + { + "epoch": 0.17, + "grad_norm": 0.34230411328604804, + "learning_rate": 1.8969516039106402e-05, + "loss": 0.2392, + "step": 2984 + }, + { + "epoch": 0.17, + "grad_norm": 0.42128346482721546, + "learning_rate": 1.8968693120865734e-05, + "loss": 0.3327, + "step": 2985 + }, + { + "epoch": 0.17, + "grad_norm": 0.6798764575206757, + "learning_rate": 1.896786989203961e-05, + "loss": 0.3974, + "step": 2986 + }, + { + "epoch": 0.17, + "grad_norm": 0.3784241264767709, + "learning_rate": 1.896704635265654e-05, + "loss": 0.2136, + "step": 2987 + }, + { + "epoch": 0.17, + "grad_norm": 1.276853121304933, + "learning_rate": 1.8966222502745034e-05, + "loss": 0.5163, + "step": 2988 + }, + { + "epoch": 0.17, + "grad_norm": 0.44923140900964686, + "learning_rate": 1.8965398342333632e-05, + "loss": 0.3497, + "step": 2989 + }, + { + "epoch": 0.17, + "grad_norm": 0.2639236438544276, + "learning_rate": 1.896457387145087e-05, + "loss": 0.1798, + "step": 2990 + }, + { + "epoch": 0.17, + "grad_norm": 1.1097809048080438, + "learning_rate": 1.8963749090125302e-05, + "loss": 0.6302, + "step": 2991 + }, + { + "epoch": 0.17, + "grad_norm": 1.234947244764597, + "learning_rate": 1.8962923998385487e-05, + "loss": 0.6959, + "step": 2992 + }, + { + "epoch": 0.17, + "grad_norm": 0.4410351618966857, + "learning_rate": 1.896209859626e-05, + "loss": 0.258, + "step": 2993 + }, + { + "epoch": 0.17, + "grad_norm": 0.9236199272405345, + "learning_rate": 1.8961272883777424e-05, + "loss": 0.4043, + "step": 2994 + }, + { + "epoch": 0.17, + "grad_norm": 0.8780417499501578, + "learning_rate": 1.8960446860966353e-05, + "loss": 0.5626, + "step": 2995 + }, + { + "epoch": 0.17, + "grad_norm": 0.2345376241037861, + "learning_rate": 1.895962052785539e-05, + "loss": 0.1591, + "step": 2996 + }, + { + "epoch": 0.17, + "grad_norm": 0.4550241241419985, + "learning_rate": 1.895879388447316e-05, + "loss": 0.347, + "step": 2997 + }, + { + "epoch": 0.17, + "grad_norm": 1.0166192508391756, + "learning_rate": 1.8957966930848278e-05, + "loss": 0.4882, + "step": 2998 + }, + { + "epoch": 0.17, + "grad_norm": 0.4244725281572897, + "learning_rate": 1.8957139667009388e-05, + "loss": 0.3237, + "step": 2999 + }, + { + "epoch": 0.17, + "grad_norm": 0.9454011346453592, + "learning_rate": 1.8956312092985135e-05, + "loss": 0.453, + "step": 3000 + }, + { + "epoch": 0.17, + "grad_norm": 0.4287534901474559, + "learning_rate": 1.895548420880418e-05, + "loss": 0.3222, + "step": 3001 + }, + { + "epoch": 0.17, + "grad_norm": 0.3976559492680062, + "learning_rate": 1.8954656014495193e-05, + "loss": 0.2943, + "step": 3002 + }, + { + "epoch": 0.17, + "grad_norm": 0.256110448009788, + "learning_rate": 1.8953827510086855e-05, + "loss": 0.128, + "step": 3003 + }, + { + "epoch": 0.17, + "grad_norm": 0.6654967054817726, + "learning_rate": 1.8952998695607848e-05, + "loss": 0.4478, + "step": 3004 + }, + { + "epoch": 0.17, + "grad_norm": 0.4407137936298601, + "learning_rate": 1.895216957108689e-05, + "loss": 0.2961, + "step": 3005 + }, + { + "epoch": 0.17, + "grad_norm": 0.4160984346255178, + "learning_rate": 1.8951340136552677e-05, + "loss": 0.3153, + "step": 3006 + }, + { + "epoch": 0.17, + "grad_norm": 1.078401739989723, + "learning_rate": 1.8950510392033945e-05, + "loss": 0.669, + "step": 3007 + }, + { + "epoch": 0.17, + "grad_norm": 0.33084119777448523, + "learning_rate": 1.8949680337559422e-05, + "loss": 0.2457, + "step": 3008 + }, + { + "epoch": 0.17, + "grad_norm": 0.3027583624026009, + "learning_rate": 1.8948849973157855e-05, + "loss": 0.2139, + "step": 3009 + }, + { + "epoch": 0.17, + "grad_norm": 0.9818615820212763, + "learning_rate": 1.8948019298858e-05, + "loss": 0.4861, + "step": 3010 + }, + { + "epoch": 0.17, + "grad_norm": 0.4073751640836383, + "learning_rate": 1.8947188314688614e-05, + "loss": 0.3296, + "step": 3011 + }, + { + "epoch": 0.17, + "grad_norm": 0.7477288026893142, + "learning_rate": 1.8946357020678484e-05, + "loss": 0.5517, + "step": 3012 + }, + { + "epoch": 0.17, + "grad_norm": 0.38402672958445916, + "learning_rate": 1.89455254168564e-05, + "loss": 0.2941, + "step": 3013 + }, + { + "epoch": 0.17, + "grad_norm": 0.378465393496418, + "learning_rate": 1.8944693503251154e-05, + "loss": 0.2712, + "step": 3014 + }, + { + "epoch": 0.17, + "grad_norm": 0.38536621633444945, + "learning_rate": 1.8943861279891555e-05, + "loss": 0.19, + "step": 3015 + }, + { + "epoch": 0.17, + "grad_norm": 1.112414636701315, + "learning_rate": 1.8943028746806423e-05, + "loss": 0.3941, + "step": 3016 + }, + { + "epoch": 0.17, + "grad_norm": 0.4202034683893572, + "learning_rate": 1.8942195904024593e-05, + "loss": 0.315, + "step": 3017 + }, + { + "epoch": 0.17, + "grad_norm": 1.5026074471147242, + "learning_rate": 1.89413627515749e-05, + "loss": 0.8241, + "step": 3018 + }, + { + "epoch": 0.17, + "grad_norm": 0.6070421657926213, + "learning_rate": 1.89405292894862e-05, + "loss": 0.3541, + "step": 3019 + }, + { + "epoch": 0.17, + "grad_norm": 0.41453425610717626, + "learning_rate": 1.8939695517787355e-05, + "loss": 0.3023, + "step": 3020 + }, + { + "epoch": 0.17, + "grad_norm": 0.29993170844835293, + "learning_rate": 1.893886143650724e-05, + "loss": 0.2491, + "step": 3021 + }, + { + "epoch": 0.17, + "grad_norm": 0.6600485195325879, + "learning_rate": 1.893802704567474e-05, + "loss": 0.3919, + "step": 3022 + }, + { + "epoch": 0.17, + "grad_norm": 0.4849563665909965, + "learning_rate": 1.8937192345318745e-05, + "loss": 0.2887, + "step": 3023 + }, + { + "epoch": 0.17, + "grad_norm": 1.121191174841304, + "learning_rate": 1.8936357335468164e-05, + "loss": 0.7787, + "step": 3024 + }, + { + "epoch": 0.17, + "grad_norm": 0.4336597655448138, + "learning_rate": 1.8935522016151914e-05, + "loss": 0.3233, + "step": 3025 + }, + { + "epoch": 0.17, + "grad_norm": 0.30783811468715805, + "learning_rate": 1.8934686387398916e-05, + "loss": 0.2047, + "step": 3026 + }, + { + "epoch": 0.17, + "grad_norm": 0.31539444070821426, + "learning_rate": 1.8933850449238118e-05, + "loss": 0.2582, + "step": 3027 + }, + { + "epoch": 0.17, + "grad_norm": 0.7055654305578103, + "learning_rate": 1.893301420169846e-05, + "loss": 0.4926, + "step": 3028 + }, + { + "epoch": 0.17, + "grad_norm": 0.3686311554656913, + "learning_rate": 1.893217764480891e-05, + "loss": 0.2407, + "step": 3029 + }, + { + "epoch": 0.17, + "grad_norm": 1.1909355193895523, + "learning_rate": 1.8931340778598427e-05, + "loss": 0.6177, + "step": 3030 + }, + { + "epoch": 0.17, + "grad_norm": 1.4663985038033136, + "learning_rate": 1.8930503603095996e-05, + "loss": 0.8403, + "step": 3031 + }, + { + "epoch": 0.17, + "grad_norm": 0.3190431419463848, + "learning_rate": 1.892966611833061e-05, + "loss": 0.1833, + "step": 3032 + }, + { + "epoch": 0.17, + "grad_norm": 0.4511305241542145, + "learning_rate": 1.892882832433127e-05, + "loss": 0.3602, + "step": 3033 + }, + { + "epoch": 0.17, + "grad_norm": 0.7068053529043494, + "learning_rate": 1.8927990221126992e-05, + "loss": 0.3597, + "step": 3034 + }, + { + "epoch": 0.17, + "grad_norm": 0.42401792713410896, + "learning_rate": 1.8927151808746794e-05, + "loss": 0.284, + "step": 3035 + }, + { + "epoch": 0.17, + "grad_norm": 1.0660654062947625, + "learning_rate": 1.8926313087219715e-05, + "loss": 0.4866, + "step": 3036 + }, + { + "epoch": 0.17, + "grad_norm": 0.4213915786757873, + "learning_rate": 1.8925474056574797e-05, + "loss": 0.3367, + "step": 3037 + }, + { + "epoch": 0.17, + "grad_norm": 0.5059010362430902, + "learning_rate": 1.8924634716841095e-05, + "loss": 0.3145, + "step": 3038 + }, + { + "epoch": 0.17, + "grad_norm": 0.6904293276334135, + "learning_rate": 1.8923795068047676e-05, + "loss": 0.3638, + "step": 3039 + }, + { + "epoch": 0.17, + "grad_norm": 0.32126069141038616, + "learning_rate": 1.892295511022362e-05, + "loss": 0.2206, + "step": 3040 + }, + { + "epoch": 0.17, + "grad_norm": 0.49072173398664753, + "learning_rate": 1.8922114843398008e-05, + "loss": 0.2853, + "step": 3041 + }, + { + "epoch": 0.17, + "grad_norm": 0.6178054617716159, + "learning_rate": 1.8921274267599948e-05, + "loss": 0.3293, + "step": 3042 + }, + { + "epoch": 0.17, + "grad_norm": 1.3910184013031428, + "learning_rate": 1.8920433382858543e-05, + "loss": 0.6896, + "step": 3043 + }, + { + "epoch": 0.17, + "grad_norm": 0.43153226746326395, + "learning_rate": 1.8919592189202907e-05, + "loss": 0.3513, + "step": 3044 + }, + { + "epoch": 0.17, + "grad_norm": 0.4102488838420799, + "learning_rate": 1.8918750686662182e-05, + "loss": 0.3131, + "step": 3045 + }, + { + "epoch": 0.18, + "grad_norm": 0.47947338773237685, + "learning_rate": 1.8917908875265507e-05, + "loss": 0.3053, + "step": 3046 + }, + { + "epoch": 0.18, + "grad_norm": 0.3460541287559192, + "learning_rate": 1.8917066755042028e-05, + "loss": 0.2311, + "step": 3047 + }, + { + "epoch": 0.18, + "grad_norm": 0.5716077686683415, + "learning_rate": 1.891622432602091e-05, + "loss": 0.4332, + "step": 3048 + }, + { + "epoch": 0.18, + "grad_norm": 0.5029071813624045, + "learning_rate": 1.8915381588231327e-05, + "loss": 0.2939, + "step": 3049 + }, + { + "epoch": 0.18, + "grad_norm": 0.44046657183956867, + "learning_rate": 1.8914538541702466e-05, + "loss": 0.2922, + "step": 3050 + }, + { + "epoch": 0.18, + "grad_norm": 0.7995531866883153, + "learning_rate": 1.8913695186463517e-05, + "loss": 0.4911, + "step": 3051 + }, + { + "epoch": 0.18, + "grad_norm": 0.36615694629870316, + "learning_rate": 1.8912851522543687e-05, + "loss": 0.2274, + "step": 3052 + }, + { + "epoch": 0.18, + "grad_norm": 0.32321900527474984, + "learning_rate": 1.891200754997219e-05, + "loss": 0.2136, + "step": 3053 + }, + { + "epoch": 0.18, + "grad_norm": 1.2596204295464923, + "learning_rate": 1.8911163268778257e-05, + "loss": 0.8114, + "step": 3054 + }, + { + "epoch": 0.18, + "grad_norm": 0.7144167478169268, + "learning_rate": 1.891031867899112e-05, + "loss": 0.4086, + "step": 3055 + }, + { + "epoch": 0.18, + "grad_norm": 0.6253671899765448, + "learning_rate": 1.8909473780640037e-05, + "loss": 0.3703, + "step": 3056 + }, + { + "epoch": 0.18, + "grad_norm": 0.45134821280279186, + "learning_rate": 1.8908628573754254e-05, + "loss": 0.3021, + "step": 3057 + }, + { + "epoch": 0.18, + "grad_norm": 0.22005853180138574, + "learning_rate": 1.890778305836305e-05, + "loss": 0.1269, + "step": 3058 + }, + { + "epoch": 0.18, + "grad_norm": 0.4524272668639035, + "learning_rate": 1.89069372344957e-05, + "loss": 0.264, + "step": 3059 + }, + { + "epoch": 0.18, + "grad_norm": 0.6209278718230915, + "learning_rate": 1.8906091102181495e-05, + "loss": 0.4388, + "step": 3060 + }, + { + "epoch": 0.18, + "grad_norm": 0.4937884114799369, + "learning_rate": 1.890524466144974e-05, + "loss": 0.3877, + "step": 3061 + }, + { + "epoch": 0.18, + "grad_norm": 0.41369254008640416, + "learning_rate": 1.8904397912329745e-05, + "loss": 0.2448, + "step": 3062 + }, + { + "epoch": 0.18, + "grad_norm": 0.5973610783320861, + "learning_rate": 1.8903550854850834e-05, + "loss": 0.3712, + "step": 3063 + }, + { + "epoch": 0.18, + "grad_norm": 0.656951303682737, + "learning_rate": 1.890270348904234e-05, + "loss": 0.4271, + "step": 3064 + }, + { + "epoch": 0.18, + "grad_norm": 0.2452544715442981, + "learning_rate": 1.8901855814933607e-05, + "loss": 0.1632, + "step": 3065 + }, + { + "epoch": 0.18, + "grad_norm": 0.5677559684846096, + "learning_rate": 1.890100783255399e-05, + "loss": 0.4578, + "step": 3066 + }, + { + "epoch": 0.18, + "grad_norm": 0.5891915481483365, + "learning_rate": 1.890015954193285e-05, + "loss": 0.4746, + "step": 3067 + }, + { + "epoch": 0.18, + "grad_norm": 0.35663869657398584, + "learning_rate": 1.8899310943099573e-05, + "loss": 0.2468, + "step": 3068 + }, + { + "epoch": 0.18, + "grad_norm": 0.5247041604668781, + "learning_rate": 1.8898462036083537e-05, + "loss": 0.3401, + "step": 3069 + }, + { + "epoch": 0.18, + "grad_norm": 1.0380567894925528, + "learning_rate": 1.8897612820914147e-05, + "loss": 0.5755, + "step": 3070 + }, + { + "epoch": 0.18, + "grad_norm": 0.2670561595530922, + "learning_rate": 1.8896763297620805e-05, + "loss": 0.1841, + "step": 3071 + }, + { + "epoch": 0.18, + "grad_norm": 0.9455444266024311, + "learning_rate": 1.8895913466232937e-05, + "loss": 0.6592, + "step": 3072 + }, + { + "epoch": 0.18, + "grad_norm": 0.41053313749568826, + "learning_rate": 1.8895063326779965e-05, + "loss": 0.3663, + "step": 3073 + }, + { + "epoch": 0.18, + "grad_norm": 0.36267065242506263, + "learning_rate": 1.8894212879291332e-05, + "loss": 0.2047, + "step": 3074 + }, + { + "epoch": 0.18, + "grad_norm": 0.4786448101222816, + "learning_rate": 1.8893362123796488e-05, + "loss": 0.2759, + "step": 3075 + }, + { + "epoch": 0.18, + "grad_norm": 0.4093384984549816, + "learning_rate": 1.88925110603249e-05, + "loss": 0.3397, + "step": 3076 + }, + { + "epoch": 0.18, + "grad_norm": 0.845745579057613, + "learning_rate": 1.8891659688906033e-05, + "loss": 0.4151, + "step": 3077 + }, + { + "epoch": 0.18, + "grad_norm": 0.47292178771932475, + "learning_rate": 1.8890808009569376e-05, + "loss": 0.3051, + "step": 3078 + }, + { + "epoch": 0.18, + "grad_norm": 0.49182209863150383, + "learning_rate": 1.8889956022344414e-05, + "loss": 0.3479, + "step": 3079 + }, + { + "epoch": 0.18, + "grad_norm": 0.46806648165181836, + "learning_rate": 1.8889103727260666e-05, + "loss": 0.2757, + "step": 3080 + }, + { + "epoch": 0.18, + "grad_norm": 0.3120414790673563, + "learning_rate": 1.888825112434763e-05, + "loss": 0.2244, + "step": 3081 + }, + { + "epoch": 0.18, + "grad_norm": 1.2679215891920346, + "learning_rate": 1.8887398213634848e-05, + "loss": 0.7125, + "step": 3082 + }, + { + "epoch": 0.18, + "grad_norm": 0.6403408615039261, + "learning_rate": 1.8886544995151844e-05, + "loss": 0.38, + "step": 3083 + }, + { + "epoch": 0.18, + "grad_norm": 0.33151253151833543, + "learning_rate": 1.8885691468928166e-05, + "loss": 0.2932, + "step": 3084 + }, + { + "epoch": 0.18, + "grad_norm": 0.7277961331311545, + "learning_rate": 1.8884837634993377e-05, + "loss": 0.5574, + "step": 3085 + }, + { + "epoch": 0.18, + "grad_norm": 0.340274101058813, + "learning_rate": 1.8883983493377045e-05, + "loss": 0.1544, + "step": 3086 + }, + { + "epoch": 0.18, + "grad_norm": 0.4429892097106967, + "learning_rate": 1.8883129044108744e-05, + "loss": 0.3517, + "step": 3087 + }, + { + "epoch": 0.18, + "grad_norm": 0.3928554684234481, + "learning_rate": 1.8882274287218067e-05, + "loss": 0.2741, + "step": 3088 + }, + { + "epoch": 0.18, + "grad_norm": 0.4743146801940653, + "learning_rate": 1.8881419222734615e-05, + "loss": 0.3134, + "step": 3089 + }, + { + "epoch": 0.18, + "grad_norm": 0.5239268770428587, + "learning_rate": 1.8880563850687995e-05, + "loss": 0.4167, + "step": 3090 + }, + { + "epoch": 0.18, + "grad_norm": 0.8884943070873123, + "learning_rate": 1.8879708171107828e-05, + "loss": 0.4566, + "step": 3091 + }, + { + "epoch": 0.18, + "grad_norm": 0.5371132744319287, + "learning_rate": 1.8878852184023754e-05, + "loss": 0.3002, + "step": 3092 + }, + { + "epoch": 0.18, + "grad_norm": 0.32469681817681595, + "learning_rate": 1.887799588946541e-05, + "loss": 0.2159, + "step": 3093 + }, + { + "epoch": 0.18, + "grad_norm": 1.2594353839536387, + "learning_rate": 1.8877139287462446e-05, + "loss": 0.3695, + "step": 3094 + }, + { + "epoch": 0.18, + "grad_norm": 0.6393985107547493, + "learning_rate": 1.8876282378044535e-05, + "loss": 0.4849, + "step": 3095 + }, + { + "epoch": 0.18, + "grad_norm": 0.3707643080188264, + "learning_rate": 1.8875425161241345e-05, + "loss": 0.3116, + "step": 3096 + }, + { + "epoch": 0.18, + "grad_norm": 0.5094016079937749, + "learning_rate": 1.887456763708256e-05, + "loss": 0.3134, + "step": 3097 + }, + { + "epoch": 0.18, + "grad_norm": 0.2347285327171995, + "learning_rate": 1.8873709805597884e-05, + "loss": 0.1598, + "step": 3098 + }, + { + "epoch": 0.18, + "grad_norm": 0.40752913954631803, + "learning_rate": 1.8872851666817017e-05, + "loss": 0.3338, + "step": 3099 + }, + { + "epoch": 0.18, + "grad_norm": 0.5350281103112107, + "learning_rate": 1.887199322076968e-05, + "loss": 0.3887, + "step": 3100 + }, + { + "epoch": 0.18, + "grad_norm": 0.6124219414870147, + "learning_rate": 1.8871134467485597e-05, + "loss": 0.2902, + "step": 3101 + }, + { + "epoch": 0.18, + "grad_norm": 0.5018307054344358, + "learning_rate": 1.8870275406994513e-05, + "loss": 0.3797, + "step": 3102 + }, + { + "epoch": 0.18, + "grad_norm": 1.1961520940052548, + "learning_rate": 1.886941603932617e-05, + "loss": 0.7538, + "step": 3103 + }, + { + "epoch": 0.18, + "grad_norm": 0.32363681224514806, + "learning_rate": 1.886855636451033e-05, + "loss": 0.2232, + "step": 3104 + }, + { + "epoch": 0.18, + "grad_norm": 0.4023577478846728, + "learning_rate": 1.8867696382576767e-05, + "loss": 0.2839, + "step": 3105 + }, + { + "epoch": 0.18, + "grad_norm": 0.437139177965491, + "learning_rate": 1.886683609355526e-05, + "loss": 0.3293, + "step": 3106 + }, + { + "epoch": 0.18, + "grad_norm": 0.6121045236750057, + "learning_rate": 1.8865975497475596e-05, + "loss": 0.333, + "step": 3107 + }, + { + "epoch": 0.18, + "grad_norm": 0.4427691977419533, + "learning_rate": 1.8865114594367585e-05, + "loss": 0.3435, + "step": 3108 + }, + { + "epoch": 0.18, + "grad_norm": 0.6110743595027336, + "learning_rate": 1.8864253384261036e-05, + "loss": 0.4338, + "step": 3109 + }, + { + "epoch": 0.18, + "grad_norm": 0.580897320895251, + "learning_rate": 1.8863391867185774e-05, + "loss": 0.3527, + "step": 3110 + }, + { + "epoch": 0.18, + "grad_norm": 0.29260609606365173, + "learning_rate": 1.8862530043171633e-05, + "loss": 0.1696, + "step": 3111 + }, + { + "epoch": 0.18, + "grad_norm": 0.4524691334794982, + "learning_rate": 1.8861667912248456e-05, + "loss": 0.3644, + "step": 3112 + }, + { + "epoch": 0.18, + "grad_norm": 0.8502053958943329, + "learning_rate": 1.8860805474446103e-05, + "loss": 0.5187, + "step": 3113 + }, + { + "epoch": 0.18, + "grad_norm": 0.34026675743767276, + "learning_rate": 1.8859942729794433e-05, + "loss": 0.2207, + "step": 3114 + }, + { + "epoch": 0.18, + "grad_norm": 0.5803398218349507, + "learning_rate": 1.885907967832333e-05, + "loss": 0.4215, + "step": 3115 + }, + { + "epoch": 0.18, + "grad_norm": 0.5413184538995245, + "learning_rate": 1.885821632006268e-05, + "loss": 0.385, + "step": 3116 + }, + { + "epoch": 0.18, + "grad_norm": 0.267538412442085, + "learning_rate": 1.8857352655042378e-05, + "loss": 0.1605, + "step": 3117 + }, + { + "epoch": 0.18, + "grad_norm": 0.9481877105032955, + "learning_rate": 1.885648868329234e-05, + "loss": 0.5323, + "step": 3118 + }, + { + "epoch": 0.18, + "grad_norm": 0.7094437605080949, + "learning_rate": 1.8855624404842472e-05, + "loss": 0.4595, + "step": 3119 + }, + { + "epoch": 0.18, + "grad_norm": 0.3795999302636725, + "learning_rate": 1.8854759819722713e-05, + "loss": 0.2596, + "step": 3120 + }, + { + "epoch": 0.18, + "grad_norm": 1.3263785752350479, + "learning_rate": 1.8853894927963004e-05, + "loss": 0.8709, + "step": 3121 + }, + { + "epoch": 0.18, + "grad_norm": 0.6358746810174966, + "learning_rate": 1.8853029729593296e-05, + "loss": 0.4608, + "step": 3122 + }, + { + "epoch": 0.18, + "grad_norm": 0.3726386152293299, + "learning_rate": 1.8852164224643546e-05, + "loss": 0.2343, + "step": 3123 + }, + { + "epoch": 0.18, + "grad_norm": 0.5836516260644941, + "learning_rate": 1.885129841314373e-05, + "loss": 0.3314, + "step": 3124 + }, + { + "epoch": 0.18, + "grad_norm": 0.49963353752080175, + "learning_rate": 1.8850432295123832e-05, + "loss": 0.2861, + "step": 3125 + }, + { + "epoch": 0.18, + "grad_norm": 0.4523725849030945, + "learning_rate": 1.8849565870613844e-05, + "loss": 0.3031, + "step": 3126 + }, + { + "epoch": 0.18, + "grad_norm": 0.47633458502229503, + "learning_rate": 1.8848699139643768e-05, + "loss": 0.3484, + "step": 3127 + }, + { + "epoch": 0.18, + "grad_norm": 0.4572869976239571, + "learning_rate": 1.8847832102243626e-05, + "loss": 0.3506, + "step": 3128 + }, + { + "epoch": 0.18, + "grad_norm": 0.4738886849479047, + "learning_rate": 1.8846964758443434e-05, + "loss": 0.3156, + "step": 3129 + }, + { + "epoch": 0.18, + "grad_norm": 0.3158973478786652, + "learning_rate": 1.8846097108273234e-05, + "loss": 0.2454, + "step": 3130 + }, + { + "epoch": 0.18, + "grad_norm": 0.6088936520152631, + "learning_rate": 1.8845229151763072e-05, + "loss": 0.3999, + "step": 3131 + }, + { + "epoch": 0.18, + "grad_norm": 0.341198313608363, + "learning_rate": 1.884436088894301e-05, + "loss": 0.2854, + "step": 3132 + }, + { + "epoch": 0.18, + "grad_norm": 0.8632109606054379, + "learning_rate": 1.8843492319843105e-05, + "loss": 0.5362, + "step": 3133 + }, + { + "epoch": 0.18, + "grad_norm": 0.8698131777727494, + "learning_rate": 1.884262344449344e-05, + "loss": 0.5899, + "step": 3134 + }, + { + "epoch": 0.18, + "grad_norm": 0.40345140665949575, + "learning_rate": 1.8841754262924106e-05, + "loss": 0.2877, + "step": 3135 + }, + { + "epoch": 0.18, + "grad_norm": 0.4323619276330218, + "learning_rate": 1.8840884775165204e-05, + "loss": 0.3597, + "step": 3136 + }, + { + "epoch": 0.18, + "grad_norm": 0.23494410269272176, + "learning_rate": 1.8840014981246843e-05, + "loss": 0.121, + "step": 3137 + }, + { + "epoch": 0.18, + "grad_norm": 0.39915844033516507, + "learning_rate": 1.8839144881199144e-05, + "loss": 0.2998, + "step": 3138 + }, + { + "epoch": 0.18, + "grad_norm": 1.3686412428770773, + "learning_rate": 1.8838274475052233e-05, + "loss": 0.8266, + "step": 3139 + }, + { + "epoch": 0.18, + "grad_norm": 0.3427600739038595, + "learning_rate": 1.883740376283626e-05, + "loss": 0.2858, + "step": 3140 + }, + { + "epoch": 0.18, + "grad_norm": 0.4077783868641141, + "learning_rate": 1.8836532744581377e-05, + "loss": 0.3271, + "step": 3141 + }, + { + "epoch": 0.18, + "grad_norm": 0.7926122519324589, + "learning_rate": 1.8835661420317745e-05, + "loss": 0.4978, + "step": 3142 + }, + { + "epoch": 0.18, + "grad_norm": 0.25284561606242717, + "learning_rate": 1.8834789790075536e-05, + "loss": 0.1756, + "step": 3143 + }, + { + "epoch": 0.18, + "grad_norm": 0.38142146209876004, + "learning_rate": 1.8833917853884935e-05, + "loss": 0.2853, + "step": 3144 + }, + { + "epoch": 0.18, + "grad_norm": 1.088916732216867, + "learning_rate": 1.8833045611776143e-05, + "loss": 0.801, + "step": 3145 + }, + { + "epoch": 0.18, + "grad_norm": 0.618587140267945, + "learning_rate": 1.8832173063779357e-05, + "loss": 0.3862, + "step": 3146 + }, + { + "epoch": 0.18, + "grad_norm": 0.4643314856293626, + "learning_rate": 1.8831300209924797e-05, + "loss": 0.3202, + "step": 3147 + }, + { + "epoch": 0.18, + "grad_norm": 0.40377489613397566, + "learning_rate": 1.8830427050242693e-05, + "loss": 0.3142, + "step": 3148 + }, + { + "epoch": 0.18, + "grad_norm": 0.2564840205627374, + "learning_rate": 1.8829553584763278e-05, + "loss": 0.1487, + "step": 3149 + }, + { + "epoch": 0.18, + "grad_norm": 0.3232933389811095, + "learning_rate": 1.8828679813516806e-05, + "loss": 0.2236, + "step": 3150 + }, + { + "epoch": 0.18, + "grad_norm": 0.5490085608060375, + "learning_rate": 1.8827805736533528e-05, + "loss": 0.4398, + "step": 3151 + }, + { + "epoch": 0.18, + "grad_norm": 0.5491388527656633, + "learning_rate": 1.8826931353843717e-05, + "loss": 0.4112, + "step": 3152 + }, + { + "epoch": 0.18, + "grad_norm": 0.39943577631082894, + "learning_rate": 1.8826056665477654e-05, + "loss": 0.2803, + "step": 3153 + }, + { + "epoch": 0.18, + "grad_norm": 0.773866438726851, + "learning_rate": 1.8825181671465628e-05, + "loss": 0.5776, + "step": 3154 + }, + { + "epoch": 0.18, + "grad_norm": 0.3216913090997973, + "learning_rate": 1.882430637183794e-05, + "loss": 0.2737, + "step": 3155 + }, + { + "epoch": 0.18, + "grad_norm": 0.290138086584041, + "learning_rate": 1.8823430766624905e-05, + "loss": 0.1886, + "step": 3156 + }, + { + "epoch": 0.18, + "grad_norm": 1.4431178972907859, + "learning_rate": 1.8822554855856838e-05, + "loss": 0.8918, + "step": 3157 + }, + { + "epoch": 0.18, + "grad_norm": 0.5797258534093347, + "learning_rate": 1.8821678639564075e-05, + "loss": 0.4781, + "step": 3158 + }, + { + "epoch": 0.18, + "grad_norm": 0.3837048306336846, + "learning_rate": 1.8820802117776963e-05, + "loss": 0.2224, + "step": 3159 + }, + { + "epoch": 0.18, + "grad_norm": 0.42205950077117843, + "learning_rate": 1.8819925290525854e-05, + "loss": 0.3351, + "step": 3160 + }, + { + "epoch": 0.18, + "grad_norm": 0.3076485977032022, + "learning_rate": 1.8819048157841105e-05, + "loss": 0.2032, + "step": 3161 + }, + { + "epoch": 0.18, + "grad_norm": 0.5041804621749775, + "learning_rate": 1.8818170719753104e-05, + "loss": 0.3487, + "step": 3162 + }, + { + "epoch": 0.18, + "grad_norm": 0.38071028970387893, + "learning_rate": 1.8817292976292227e-05, + "loss": 0.33, + "step": 3163 + }, + { + "epoch": 0.18, + "grad_norm": 0.4558400988040032, + "learning_rate": 1.8816414927488877e-05, + "loss": 0.3912, + "step": 3164 + }, + { + "epoch": 0.18, + "grad_norm": 0.45130705580419683, + "learning_rate": 1.8815536573373453e-05, + "loss": 0.2836, + "step": 3165 + }, + { + "epoch": 0.18, + "grad_norm": 0.8510761019378265, + "learning_rate": 1.8814657913976377e-05, + "loss": 0.302, + "step": 3166 + }, + { + "epoch": 0.18, + "grad_norm": 0.429963063044294, + "learning_rate": 1.881377894932808e-05, + "loss": 0.3323, + "step": 3167 + }, + { + "epoch": 0.18, + "grad_norm": 0.29400249907992115, + "learning_rate": 1.8812899679458993e-05, + "loss": 0.23, + "step": 3168 + }, + { + "epoch": 0.18, + "grad_norm": 0.464996279016933, + "learning_rate": 1.8812020104399572e-05, + "loss": 0.3656, + "step": 3169 + }, + { + "epoch": 0.18, + "grad_norm": 0.46758247447598833, + "learning_rate": 1.8811140224180273e-05, + "loss": 0.3112, + "step": 3170 + }, + { + "epoch": 0.18, + "grad_norm": 0.41433320208215574, + "learning_rate": 1.8810260038831564e-05, + "loss": 0.291, + "step": 3171 + }, + { + "epoch": 0.18, + "grad_norm": 0.452144073174074, + "learning_rate": 1.8809379548383932e-05, + "loss": 0.2709, + "step": 3172 + }, + { + "epoch": 0.18, + "grad_norm": 1.178725602164265, + "learning_rate": 1.8808498752867863e-05, + "loss": 0.7155, + "step": 3173 + }, + { + "epoch": 0.18, + "grad_norm": 0.34409535391847335, + "learning_rate": 1.880761765231386e-05, + "loss": 0.2906, + "step": 3174 + }, + { + "epoch": 0.18, + "grad_norm": 0.7192857758620904, + "learning_rate": 1.8806736246752443e-05, + "loss": 0.4855, + "step": 3175 + }, + { + "epoch": 0.18, + "grad_norm": 0.3361678418408267, + "learning_rate": 1.8805854536214122e-05, + "loss": 0.2721, + "step": 3176 + }, + { + "epoch": 0.18, + "grad_norm": 0.37055400507860653, + "learning_rate": 1.8804972520729443e-05, + "loss": 0.262, + "step": 3177 + }, + { + "epoch": 0.18, + "grad_norm": 0.9664731315219579, + "learning_rate": 1.8804090200328938e-05, + "loss": 0.5504, + "step": 3178 + }, + { + "epoch": 0.18, + "grad_norm": 0.42261767384952514, + "learning_rate": 1.880320757504317e-05, + "loss": 0.2837, + "step": 3179 + }, + { + "epoch": 0.18, + "grad_norm": 0.614889244744422, + "learning_rate": 1.8802324644902704e-05, + "loss": 0.3954, + "step": 3180 + }, + { + "epoch": 0.18, + "grad_norm": 0.5047054055116674, + "learning_rate": 1.880144140993811e-05, + "loss": 0.365, + "step": 3181 + }, + { + "epoch": 0.18, + "grad_norm": 0.2809330896067468, + "learning_rate": 1.880055787017998e-05, + "loss": 0.1917, + "step": 3182 + }, + { + "epoch": 0.18, + "grad_norm": 0.4424624416740233, + "learning_rate": 1.8799674025658913e-05, + "loss": 0.3117, + "step": 3183 + }, + { + "epoch": 0.18, + "grad_norm": 0.4259910851470908, + "learning_rate": 1.879878987640551e-05, + "loss": 0.3169, + "step": 3184 + }, + { + "epoch": 0.18, + "grad_norm": 1.5477900879594761, + "learning_rate": 1.879790542245039e-05, + "loss": 0.4208, + "step": 3185 + }, + { + "epoch": 0.18, + "grad_norm": 0.5038440134908405, + "learning_rate": 1.8797020663824187e-05, + "loss": 0.3235, + "step": 3186 + }, + { + "epoch": 0.18, + "grad_norm": 0.4738666383574769, + "learning_rate": 1.8796135600557534e-05, + "loss": 0.3355, + "step": 3187 + }, + { + "epoch": 0.18, + "grad_norm": 1.4251583580601876, + "learning_rate": 1.8795250232681085e-05, + "loss": 0.9205, + "step": 3188 + }, + { + "epoch": 0.18, + "grad_norm": 0.253214970145997, + "learning_rate": 1.8794364560225496e-05, + "loss": 0.1462, + "step": 3189 + }, + { + "epoch": 0.18, + "grad_norm": 0.6313789827965512, + "learning_rate": 1.8793478583221448e-05, + "loss": 0.3916, + "step": 3190 + }, + { + "epoch": 0.18, + "grad_norm": 0.4616837157366747, + "learning_rate": 1.879259230169961e-05, + "loss": 0.3263, + "step": 3191 + }, + { + "epoch": 0.18, + "grad_norm": 0.3453392906380078, + "learning_rate": 1.8791705715690675e-05, + "loss": 0.2672, + "step": 3192 + }, + { + "epoch": 0.18, + "grad_norm": 0.7657077369940369, + "learning_rate": 1.8790818825225355e-05, + "loss": 0.5951, + "step": 3193 + }, + { + "epoch": 0.18, + "grad_norm": 0.4873348104716959, + "learning_rate": 1.8789931630334353e-05, + "loss": 0.3264, + "step": 3194 + }, + { + "epoch": 0.18, + "grad_norm": 0.3471370066679422, + "learning_rate": 1.8789044131048397e-05, + "loss": 0.2468, + "step": 3195 + }, + { + "epoch": 0.18, + "grad_norm": 0.5258610482459171, + "learning_rate": 1.8788156327398225e-05, + "loss": 0.2788, + "step": 3196 + }, + { + "epoch": 0.18, + "grad_norm": 0.5999552621196282, + "learning_rate": 1.8787268219414572e-05, + "loss": 0.4748, + "step": 3197 + }, + { + "epoch": 0.18, + "grad_norm": 0.5376375761332872, + "learning_rate": 1.87863798071282e-05, + "loss": 0.3582, + "step": 3198 + }, + { + "epoch": 0.18, + "grad_norm": 0.42167059924717126, + "learning_rate": 1.8785491090569876e-05, + "loss": 0.2779, + "step": 3199 + }, + { + "epoch": 0.18, + "grad_norm": 1.048591073850127, + "learning_rate": 1.878460206977037e-05, + "loss": 0.7897, + "step": 3200 + }, + { + "epoch": 0.18, + "grad_norm": 0.3659218143881057, + "learning_rate": 1.8783712744760475e-05, + "loss": 0.1953, + "step": 3201 + }, + { + "epoch": 0.18, + "grad_norm": 0.4075025882352195, + "learning_rate": 1.878282311557098e-05, + "loss": 0.2572, + "step": 3202 + }, + { + "epoch": 0.18, + "grad_norm": 0.43005347413976514, + "learning_rate": 1.8781933182232702e-05, + "loss": 0.3816, + "step": 3203 + }, + { + "epoch": 0.18, + "grad_norm": 0.5294171894124521, + "learning_rate": 1.8781042944776457e-05, + "loss": 0.3735, + "step": 3204 + }, + { + "epoch": 0.18, + "grad_norm": 0.4682006514675936, + "learning_rate": 1.8780152403233073e-05, + "loss": 0.29, + "step": 3205 + }, + { + "epoch": 0.18, + "grad_norm": 1.4391156031332297, + "learning_rate": 1.8779261557633385e-05, + "loss": 0.696, + "step": 3206 + }, + { + "epoch": 0.18, + "grad_norm": 0.34352071114583893, + "learning_rate": 1.8778370408008247e-05, + "loss": 0.256, + "step": 3207 + }, + { + "epoch": 0.18, + "grad_norm": 0.31215051226110274, + "learning_rate": 1.877747895438852e-05, + "loss": 0.2139, + "step": 3208 + }, + { + "epoch": 0.18, + "grad_norm": 0.7384179652760221, + "learning_rate": 1.8776587196805077e-05, + "loss": 0.4823, + "step": 3209 + }, + { + "epoch": 0.18, + "grad_norm": 0.6360730312478228, + "learning_rate": 1.8775695135288794e-05, + "loss": 0.4391, + "step": 3210 + }, + { + "epoch": 0.18, + "grad_norm": 0.46584518363056193, + "learning_rate": 1.8774802769870564e-05, + "loss": 0.3308, + "step": 3211 + }, + { + "epoch": 0.18, + "grad_norm": 0.4706672925631658, + "learning_rate": 1.8773910100581294e-05, + "loss": 0.3272, + "step": 3212 + }, + { + "epoch": 0.18, + "grad_norm": 0.2877101741955395, + "learning_rate": 1.8773017127451893e-05, + "loss": 0.2086, + "step": 3213 + }, + { + "epoch": 0.18, + "grad_norm": 0.4410049272946392, + "learning_rate": 1.877212385051329e-05, + "loss": 0.2814, + "step": 3214 + }, + { + "epoch": 0.18, + "grad_norm": 0.4041381893357965, + "learning_rate": 1.8771230269796412e-05, + "loss": 0.3087, + "step": 3215 + }, + { + "epoch": 0.18, + "grad_norm": 0.7639969423662106, + "learning_rate": 1.877033638533221e-05, + "loss": 0.4319, + "step": 3216 + }, + { + "epoch": 0.18, + "grad_norm": 0.4278602034149037, + "learning_rate": 1.876944219715163e-05, + "loss": 0.3137, + "step": 3217 + }, + { + "epoch": 0.18, + "grad_norm": 0.48961249607176893, + "learning_rate": 1.876854770528565e-05, + "loss": 0.3156, + "step": 3218 + }, + { + "epoch": 0.18, + "grad_norm": 0.44431744410002555, + "learning_rate": 1.8767652909765236e-05, + "loss": 0.3015, + "step": 3219 + }, + { + "epoch": 0.19, + "grad_norm": 0.33106297024120673, + "learning_rate": 1.8766757810621383e-05, + "loss": 0.3407, + "step": 3220 + }, + { + "epoch": 0.19, + "grad_norm": 0.28688345276164595, + "learning_rate": 1.876586240788508e-05, + "loss": 0.1918, + "step": 3221 + }, + { + "epoch": 0.19, + "grad_norm": 0.5071273372638695, + "learning_rate": 1.876496670158734e-05, + "loss": 0.3566, + "step": 3222 + }, + { + "epoch": 0.19, + "grad_norm": 0.4110348765229278, + "learning_rate": 1.876407069175918e-05, + "loss": 0.316, + "step": 3223 + }, + { + "epoch": 0.19, + "grad_norm": 0.9657130710337607, + "learning_rate": 1.876317437843163e-05, + "loss": 0.6621, + "step": 3224 + }, + { + "epoch": 0.19, + "grad_norm": 0.4092046974694814, + "learning_rate": 1.8762277761635725e-05, + "loss": 0.2433, + "step": 3225 + }, + { + "epoch": 0.19, + "grad_norm": 0.39059696176709385, + "learning_rate": 1.876138084140252e-05, + "loss": 0.3564, + "step": 3226 + }, + { + "epoch": 0.19, + "grad_norm": 0.3427885145541495, + "learning_rate": 1.876048361776307e-05, + "loss": 0.2631, + "step": 3227 + }, + { + "epoch": 0.19, + "grad_norm": 0.2848821101035255, + "learning_rate": 1.8759586090748454e-05, + "loss": 0.1501, + "step": 3228 + }, + { + "epoch": 0.19, + "grad_norm": 0.4979226409712117, + "learning_rate": 1.8758688260389747e-05, + "loss": 0.3348, + "step": 3229 + }, + { + "epoch": 0.19, + "grad_norm": 1.7883755482032335, + "learning_rate": 1.8757790126718044e-05, + "loss": 0.849, + "step": 3230 + }, + { + "epoch": 0.19, + "grad_norm": 0.3038487339560472, + "learning_rate": 1.8756891689764444e-05, + "loss": 0.2654, + "step": 3231 + }, + { + "epoch": 0.19, + "grad_norm": 0.426868655627405, + "learning_rate": 1.8755992949560058e-05, + "loss": 0.3428, + "step": 3232 + }, + { + "epoch": 0.19, + "grad_norm": 0.34227594083645335, + "learning_rate": 1.8755093906136016e-05, + "loss": 0.2369, + "step": 3233 + }, + { + "epoch": 0.19, + "grad_norm": 0.4159884317750057, + "learning_rate": 1.8754194559523447e-05, + "loss": 0.2368, + "step": 3234 + }, + { + "epoch": 0.19, + "grad_norm": 0.4452450173245431, + "learning_rate": 1.87532949097535e-05, + "loss": 0.3102, + "step": 3235 + }, + { + "epoch": 0.19, + "grad_norm": 0.8178285752035969, + "learning_rate": 1.8752394956857322e-05, + "loss": 0.5632, + "step": 3236 + }, + { + "epoch": 0.19, + "grad_norm": 0.6512942111877557, + "learning_rate": 1.8751494700866088e-05, + "loss": 0.491, + "step": 3237 + }, + { + "epoch": 0.19, + "grad_norm": 0.36820713984463505, + "learning_rate": 1.8750594141810964e-05, + "loss": 0.205, + "step": 3238 + }, + { + "epoch": 0.19, + "grad_norm": 0.32327909485014955, + "learning_rate": 1.8749693279723146e-05, + "loss": 0.241, + "step": 3239 + }, + { + "epoch": 0.19, + "grad_norm": 1.3048291367110827, + "learning_rate": 1.874879211463382e-05, + "loss": 0.6865, + "step": 3240 + }, + { + "epoch": 0.19, + "grad_norm": 0.38934075541803975, + "learning_rate": 1.8747890646574204e-05, + "loss": 0.2433, + "step": 3241 + }, + { + "epoch": 0.19, + "grad_norm": 0.8793842949876021, + "learning_rate": 1.874698887557551e-05, + "loss": 0.5371, + "step": 3242 + }, + { + "epoch": 0.19, + "grad_norm": 0.4606832588802145, + "learning_rate": 1.8746086801668964e-05, + "loss": 0.3555, + "step": 3243 + }, + { + "epoch": 0.19, + "grad_norm": 0.34513483530045763, + "learning_rate": 1.8745184424885815e-05, + "loss": 0.2315, + "step": 3244 + }, + { + "epoch": 0.19, + "grad_norm": 0.6050070424752414, + "learning_rate": 1.87442817452573e-05, + "loss": 0.2856, + "step": 3245 + }, + { + "epoch": 0.19, + "grad_norm": 0.30067116685715717, + "learning_rate": 1.8743378762814685e-05, + "loss": 0.2406, + "step": 3246 + }, + { + "epoch": 0.19, + "grad_norm": 0.3879704127020492, + "learning_rate": 1.874247547758924e-05, + "loss": 0.2495, + "step": 3247 + }, + { + "epoch": 0.19, + "grad_norm": 1.0069201260432241, + "learning_rate": 1.8741571889612248e-05, + "loss": 0.5666, + "step": 3248 + }, + { + "epoch": 0.19, + "grad_norm": 0.6375161367998676, + "learning_rate": 1.8740667998914996e-05, + "loss": 0.4336, + "step": 3249 + }, + { + "epoch": 0.19, + "grad_norm": 0.42327093852418024, + "learning_rate": 1.8739763805528782e-05, + "loss": 0.306, + "step": 3250 + }, + { + "epoch": 0.19, + "grad_norm": 0.41315398403240317, + "learning_rate": 1.8738859309484926e-05, + "loss": 0.2775, + "step": 3251 + }, + { + "epoch": 0.19, + "grad_norm": 0.4775464807987565, + "learning_rate": 1.8737954510814752e-05, + "loss": 0.306, + "step": 3252 + }, + { + "epoch": 0.19, + "grad_norm": 0.3845598831291664, + "learning_rate": 1.8737049409549586e-05, + "loss": 0.2899, + "step": 3253 + }, + { + "epoch": 0.19, + "grad_norm": 0.7332887356293013, + "learning_rate": 1.8736144005720775e-05, + "loss": 0.3581, + "step": 3254 + }, + { + "epoch": 0.19, + "grad_norm": 0.5740626229995096, + "learning_rate": 1.8735238299359672e-05, + "loss": 0.3243, + "step": 3255 + }, + { + "epoch": 0.19, + "grad_norm": 0.37707225799385136, + "learning_rate": 1.8734332290497642e-05, + "loss": 0.285, + "step": 3256 + }, + { + "epoch": 0.19, + "grad_norm": 0.2926649412762437, + "learning_rate": 1.8733425979166063e-05, + "loss": 0.1384, + "step": 3257 + }, + { + "epoch": 0.19, + "grad_norm": 0.4234930784962659, + "learning_rate": 1.8732519365396314e-05, + "loss": 0.3293, + "step": 3258 + }, + { + "epoch": 0.19, + "grad_norm": 0.44694857769056656, + "learning_rate": 1.87316124492198e-05, + "loss": 0.3063, + "step": 3259 + }, + { + "epoch": 0.19, + "grad_norm": 0.881998655230814, + "learning_rate": 1.8730705230667916e-05, + "loss": 0.4143, + "step": 3260 + }, + { + "epoch": 0.19, + "grad_norm": 0.47126328441633636, + "learning_rate": 1.8729797709772088e-05, + "loss": 0.3326, + "step": 3261 + }, + { + "epoch": 0.19, + "grad_norm": 0.3424134567721488, + "learning_rate": 1.872888988656374e-05, + "loss": 0.309, + "step": 3262 + }, + { + "epoch": 0.19, + "grad_norm": 0.5270321114224565, + "learning_rate": 1.8727981761074315e-05, + "loss": 0.375, + "step": 3263 + }, + { + "epoch": 0.19, + "grad_norm": 0.6391706970436589, + "learning_rate": 1.872707333333525e-05, + "loss": 0.3429, + "step": 3264 + }, + { + "epoch": 0.19, + "grad_norm": 0.4914387054979702, + "learning_rate": 1.8726164603378016e-05, + "loss": 0.3219, + "step": 3265 + }, + { + "epoch": 0.19, + "grad_norm": 0.6902772708493562, + "learning_rate": 1.8725255571234075e-05, + "loss": 0.4276, + "step": 3266 + }, + { + "epoch": 0.19, + "grad_norm": 0.252273761014582, + "learning_rate": 1.872434623693491e-05, + "loss": 0.1688, + "step": 3267 + }, + { + "epoch": 0.19, + "grad_norm": 0.43328993906220703, + "learning_rate": 1.8723436600512007e-05, + "loss": 0.3001, + "step": 3268 + }, + { + "epoch": 0.19, + "grad_norm": 1.5775971207484492, + "learning_rate": 1.8722526661996872e-05, + "loss": 0.6708, + "step": 3269 + }, + { + "epoch": 0.19, + "grad_norm": 0.4552562263007585, + "learning_rate": 1.8721616421421017e-05, + "loss": 0.3101, + "step": 3270 + }, + { + "epoch": 0.19, + "grad_norm": 0.3964430436328616, + "learning_rate": 1.8720705878815953e-05, + "loss": 0.2978, + "step": 3271 + }, + { + "epoch": 0.19, + "grad_norm": 0.9240033860694061, + "learning_rate": 1.8719795034213226e-05, + "loss": 0.5123, + "step": 3272 + }, + { + "epoch": 0.19, + "grad_norm": 0.35059453995859596, + "learning_rate": 1.871888388764437e-05, + "loss": 0.1863, + "step": 3273 + }, + { + "epoch": 0.19, + "grad_norm": 0.32757580915834644, + "learning_rate": 1.8717972439140938e-05, + "loss": 0.23, + "step": 3274 + }, + { + "epoch": 0.19, + "grad_norm": 0.5944268003456217, + "learning_rate": 1.8717060688734495e-05, + "loss": 0.41, + "step": 3275 + }, + { + "epoch": 0.19, + "grad_norm": 1.1405676959172855, + "learning_rate": 1.871614863645662e-05, + "loss": 0.5975, + "step": 3276 + }, + { + "epoch": 0.19, + "grad_norm": 0.4561238387683592, + "learning_rate": 1.871523628233889e-05, + "loss": 0.267, + "step": 3277 + }, + { + "epoch": 0.19, + "grad_norm": 0.5709846215706098, + "learning_rate": 1.87143236264129e-05, + "loss": 0.4121, + "step": 3278 + }, + { + "epoch": 0.19, + "grad_norm": 0.32475528845468604, + "learning_rate": 1.871341066871026e-05, + "loss": 0.2167, + "step": 3279 + }, + { + "epoch": 0.19, + "grad_norm": 0.35223528951570476, + "learning_rate": 1.8712497409262582e-05, + "loss": 0.2124, + "step": 3280 + }, + { + "epoch": 0.19, + "grad_norm": 1.107902808950662, + "learning_rate": 1.8711583848101492e-05, + "loss": 0.6202, + "step": 3281 + }, + { + "epoch": 0.19, + "grad_norm": 0.5448763076692719, + "learning_rate": 1.871066998525863e-05, + "loss": 0.3702, + "step": 3282 + }, + { + "epoch": 0.19, + "grad_norm": 0.43760624896546474, + "learning_rate": 1.870975582076564e-05, + "loss": 0.2446, + "step": 3283 + }, + { + "epoch": 0.19, + "grad_norm": 0.49297898663282697, + "learning_rate": 1.8708841354654184e-05, + "loss": 0.4166, + "step": 3284 + }, + { + "epoch": 0.19, + "grad_norm": 0.3323663731490816, + "learning_rate": 1.870792658695592e-05, + "loss": 0.2341, + "step": 3285 + }, + { + "epoch": 0.19, + "grad_norm": 0.49578418381735967, + "learning_rate": 1.870701151770254e-05, + "loss": 0.2838, + "step": 3286 + }, + { + "epoch": 0.19, + "grad_norm": 0.4174917171345786, + "learning_rate": 1.870609614692572e-05, + "loss": 0.2761, + "step": 3287 + }, + { + "epoch": 0.19, + "grad_norm": 0.750839879220727, + "learning_rate": 1.8705180474657166e-05, + "loss": 0.4965, + "step": 3288 + }, + { + "epoch": 0.19, + "grad_norm": 0.5129470258434858, + "learning_rate": 1.8704264500928588e-05, + "loss": 0.2986, + "step": 3289 + }, + { + "epoch": 0.19, + "grad_norm": 0.42347571147632856, + "learning_rate": 1.87033482257717e-05, + "loss": 0.2931, + "step": 3290 + }, + { + "epoch": 0.19, + "grad_norm": 0.481272218463836, + "learning_rate": 1.8702431649218245e-05, + "loss": 0.2205, + "step": 3291 + }, + { + "epoch": 0.19, + "grad_norm": 0.30216937260105364, + "learning_rate": 1.870151477129995e-05, + "loss": 0.2286, + "step": 3292 + }, + { + "epoch": 0.19, + "grad_norm": 0.5601484432369667, + "learning_rate": 1.8700597592048576e-05, + "loss": 0.3279, + "step": 3293 + }, + { + "epoch": 0.19, + "grad_norm": 0.4983673447612351, + "learning_rate": 1.869968011149588e-05, + "loss": 0.3449, + "step": 3294 + }, + { + "epoch": 0.19, + "grad_norm": 0.48122935402031364, + "learning_rate": 1.8698762329673636e-05, + "loss": 0.3335, + "step": 3295 + }, + { + "epoch": 0.19, + "grad_norm": 0.5877751356413182, + "learning_rate": 1.869784424661363e-05, + "loss": 0.3133, + "step": 3296 + }, + { + "epoch": 0.19, + "grad_norm": 0.3422421447813925, + "learning_rate": 1.8696925862347647e-05, + "loss": 0.2158, + "step": 3297 + }, + { + "epoch": 0.19, + "grad_norm": 0.3768728050060615, + "learning_rate": 1.8696007176907494e-05, + "loss": 0.2928, + "step": 3298 + }, + { + "epoch": 0.19, + "grad_norm": 0.6163884429509207, + "learning_rate": 1.869508819032499e-05, + "loss": 0.3752, + "step": 3299 + }, + { + "epoch": 0.19, + "grad_norm": 0.8648706566024917, + "learning_rate": 1.8694168902631957e-05, + "loss": 0.4133, + "step": 3300 + }, + { + "epoch": 0.19, + "grad_norm": 0.4157955496291116, + "learning_rate": 1.8693249313860225e-05, + "loss": 0.3008, + "step": 3301 + }, + { + "epoch": 0.19, + "grad_norm": 0.3660210547536385, + "learning_rate": 1.8692329424041648e-05, + "loss": 0.3064, + "step": 3302 + }, + { + "epoch": 0.19, + "grad_norm": 0.267248482819533, + "learning_rate": 1.8691409233208072e-05, + "loss": 0.1172, + "step": 3303 + }, + { + "epoch": 0.19, + "grad_norm": 0.6430966014648096, + "learning_rate": 1.869048874139137e-05, + "loss": 0.3144, + "step": 3304 + }, + { + "epoch": 0.19, + "grad_norm": 0.666856224468795, + "learning_rate": 1.8689567948623417e-05, + "loss": 0.3695, + "step": 3305 + }, + { + "epoch": 0.19, + "grad_norm": 0.5142155690233231, + "learning_rate": 1.86886468549361e-05, + "loss": 0.3119, + "step": 3306 + }, + { + "epoch": 0.19, + "grad_norm": 0.4921167365321923, + "learning_rate": 1.8687725460361315e-05, + "loss": 0.3192, + "step": 3307 + }, + { + "epoch": 0.19, + "grad_norm": 0.6095335014762302, + "learning_rate": 1.868680376493097e-05, + "loss": 0.4577, + "step": 3308 + }, + { + "epoch": 0.19, + "grad_norm": 0.3542443554161993, + "learning_rate": 1.8685881768676983e-05, + "loss": 0.1199, + "step": 3309 + }, + { + "epoch": 0.19, + "grad_norm": 0.3551171004629968, + "learning_rate": 1.868495947163129e-05, + "loss": 0.2994, + "step": 3310 + }, + { + "epoch": 0.19, + "grad_norm": 0.4032175463163124, + "learning_rate": 1.8684036873825817e-05, + "loss": 0.3191, + "step": 3311 + }, + { + "epoch": 0.19, + "grad_norm": 0.9000679546075138, + "learning_rate": 1.8683113975292522e-05, + "loss": 0.5656, + "step": 3312 + }, + { + "epoch": 0.19, + "grad_norm": 0.4240286543723701, + "learning_rate": 1.8682190776063368e-05, + "loss": 0.2259, + "step": 3313 + }, + { + "epoch": 0.19, + "grad_norm": 0.4201800746491544, + "learning_rate": 1.8681267276170315e-05, + "loss": 0.3116, + "step": 3314 + }, + { + "epoch": 0.19, + "grad_norm": 0.8506988193417204, + "learning_rate": 1.8680343475645354e-05, + "loss": 0.5609, + "step": 3315 + }, + { + "epoch": 0.19, + "grad_norm": 0.3969631858789526, + "learning_rate": 1.8679419374520467e-05, + "loss": 0.2653, + "step": 3316 + }, + { + "epoch": 0.19, + "grad_norm": 0.6065235075604374, + "learning_rate": 1.8678494972827665e-05, + "loss": 0.419, + "step": 3317 + }, + { + "epoch": 0.19, + "grad_norm": 0.32114184153525965, + "learning_rate": 1.8677570270598956e-05, + "loss": 0.2346, + "step": 3318 + }, + { + "epoch": 0.19, + "grad_norm": 0.40387175409091025, + "learning_rate": 1.8676645267866356e-05, + "loss": 0.2204, + "step": 3319 + }, + { + "epoch": 0.19, + "grad_norm": 0.6801443170904025, + "learning_rate": 1.867571996466191e-05, + "loss": 0.4311, + "step": 3320 + }, + { + "epoch": 0.19, + "grad_norm": 0.5159272006258127, + "learning_rate": 1.867479436101765e-05, + "loss": 0.3767, + "step": 3321 + }, + { + "epoch": 0.19, + "grad_norm": 0.4089571870928136, + "learning_rate": 1.8673868456965635e-05, + "loss": 0.2527, + "step": 3322 + }, + { + "epoch": 0.19, + "grad_norm": 0.48397213976412556, + "learning_rate": 1.867294225253793e-05, + "loss": 0.3643, + "step": 3323 + }, + { + "epoch": 0.19, + "grad_norm": 0.3446079467709578, + "learning_rate": 1.8672015747766606e-05, + "loss": 0.205, + "step": 3324 + }, + { + "epoch": 0.19, + "grad_norm": 0.8503150555575159, + "learning_rate": 1.8671088942683752e-05, + "loss": 0.5678, + "step": 3325 + }, + { + "epoch": 0.19, + "grad_norm": 0.39654527040197246, + "learning_rate": 1.8670161837321457e-05, + "loss": 0.2591, + "step": 3326 + }, + { + "epoch": 0.19, + "grad_norm": 1.0220278362714632, + "learning_rate": 1.8669234431711833e-05, + "loss": 0.5651, + "step": 3327 + }, + { + "epoch": 0.19, + "grad_norm": 0.5649493509801116, + "learning_rate": 1.866830672588699e-05, + "loss": 0.3735, + "step": 3328 + }, + { + "epoch": 0.19, + "grad_norm": 0.3635380013575951, + "learning_rate": 1.866737871987906e-05, + "loss": 0.2953, + "step": 3329 + }, + { + "epoch": 0.19, + "grad_norm": 0.3433034425476781, + "learning_rate": 1.866645041372018e-05, + "loss": 0.2176, + "step": 3330 + }, + { + "epoch": 0.19, + "grad_norm": 0.6755984958812598, + "learning_rate": 1.8665521807442495e-05, + "loss": 0.4254, + "step": 3331 + }, + { + "epoch": 0.19, + "grad_norm": 0.41328940725588204, + "learning_rate": 1.866459290107816e-05, + "loss": 0.2699, + "step": 3332 + }, + { + "epoch": 0.19, + "grad_norm": 0.5305543815291648, + "learning_rate": 1.8663663694659348e-05, + "loss": 0.3819, + "step": 3333 + }, + { + "epoch": 0.19, + "grad_norm": 0.42991083738123465, + "learning_rate": 1.866273418821823e-05, + "loss": 0.3441, + "step": 3334 + }, + { + "epoch": 0.19, + "grad_norm": 0.6544073955837417, + "learning_rate": 1.8661804381787e-05, + "loss": 0.2438, + "step": 3335 + }, + { + "epoch": 0.19, + "grad_norm": 0.3474028790555719, + "learning_rate": 1.8660874275397864e-05, + "loss": 0.2269, + "step": 3336 + }, + { + "epoch": 0.19, + "grad_norm": 0.44660721208933224, + "learning_rate": 1.8659943869083016e-05, + "loss": 0.3194, + "step": 3337 + }, + { + "epoch": 0.19, + "grad_norm": 0.36479851194376395, + "learning_rate": 1.865901316287469e-05, + "loss": 0.3026, + "step": 3338 + }, + { + "epoch": 0.19, + "grad_norm": 0.7251793067592559, + "learning_rate": 1.8658082156805105e-05, + "loss": 0.4025, + "step": 3339 + }, + { + "epoch": 0.19, + "grad_norm": 1.1762657605614177, + "learning_rate": 1.8657150850906515e-05, + "loss": 0.4898, + "step": 3340 + }, + { + "epoch": 0.19, + "grad_norm": 0.44822935372518946, + "learning_rate": 1.8656219245211157e-05, + "loss": 0.3101, + "step": 3341 + }, + { + "epoch": 0.19, + "grad_norm": 0.3095470714000383, + "learning_rate": 1.86552873397513e-05, + "loss": 0.2329, + "step": 3342 + }, + { + "epoch": 0.19, + "grad_norm": 0.8687453149208805, + "learning_rate": 1.8654355134559216e-05, + "loss": 0.5129, + "step": 3343 + }, + { + "epoch": 0.19, + "grad_norm": 0.3021584131565959, + "learning_rate": 1.8653422629667183e-05, + "loss": 0.2725, + "step": 3344 + }, + { + "epoch": 0.19, + "grad_norm": 0.5873873067313393, + "learning_rate": 1.8652489825107497e-05, + "loss": 0.358, + "step": 3345 + }, + { + "epoch": 0.19, + "grad_norm": 0.46313571761420985, + "learning_rate": 1.865155672091246e-05, + "loss": 0.3272, + "step": 3346 + }, + { + "epoch": 0.19, + "grad_norm": 0.42763704537288455, + "learning_rate": 1.865062331711439e-05, + "loss": 0.2979, + "step": 3347 + }, + { + "epoch": 0.19, + "grad_norm": 0.8943885070194173, + "learning_rate": 1.8649689613745605e-05, + "loss": 0.4268, + "step": 3348 + }, + { + "epoch": 0.19, + "grad_norm": 0.4519340597314842, + "learning_rate": 1.864875561083844e-05, + "loss": 0.3254, + "step": 3349 + }, + { + "epoch": 0.19, + "grad_norm": 0.34699922308137227, + "learning_rate": 1.864782130842524e-05, + "loss": 0.2848, + "step": 3350 + }, + { + "epoch": 0.19, + "grad_norm": 0.6473095340940566, + "learning_rate": 1.8646886706538358e-05, + "loss": 0.471, + "step": 3351 + }, + { + "epoch": 0.19, + "grad_norm": 0.337720021022626, + "learning_rate": 1.8645951805210164e-05, + "loss": 0.1635, + "step": 3352 + }, + { + "epoch": 0.19, + "grad_norm": 0.515168665181641, + "learning_rate": 1.864501660447303e-05, + "loss": 0.363, + "step": 3353 + }, + { + "epoch": 0.19, + "grad_norm": 0.45899933383969893, + "learning_rate": 1.8644081104359343e-05, + "loss": 0.3239, + "step": 3354 + }, + { + "epoch": 0.19, + "grad_norm": 0.4341098182538872, + "learning_rate": 1.8643145304901497e-05, + "loss": 0.2779, + "step": 3355 + }, + { + "epoch": 0.19, + "grad_norm": 0.43247406577763553, + "learning_rate": 1.8642209206131902e-05, + "loss": 0.3562, + "step": 3356 + }, + { + "epoch": 0.19, + "grad_norm": 0.315059317047122, + "learning_rate": 1.8641272808082975e-05, + "loss": 0.3091, + "step": 3357 + }, + { + "epoch": 0.19, + "grad_norm": 0.24709989347560327, + "learning_rate": 1.864033611078714e-05, + "loss": 0.0748, + "step": 3358 + }, + { + "epoch": 0.19, + "grad_norm": 0.4234610157900327, + "learning_rate": 1.863939911427684e-05, + "loss": 0.3157, + "step": 3359 + }, + { + "epoch": 0.19, + "grad_norm": 1.0117601153578217, + "learning_rate": 1.8638461818584517e-05, + "loss": 0.672, + "step": 3360 + }, + { + "epoch": 0.19, + "grad_norm": 0.4147187717202964, + "learning_rate": 1.8637524223742636e-05, + "loss": 0.3426, + "step": 3361 + }, + { + "epoch": 0.19, + "grad_norm": 0.35245150890474214, + "learning_rate": 1.863658632978366e-05, + "loss": 0.2728, + "step": 3362 + }, + { + "epoch": 0.19, + "grad_norm": 0.4678697988608274, + "learning_rate": 1.8635648136740072e-05, + "loss": 0.3828, + "step": 3363 + }, + { + "epoch": 0.19, + "grad_norm": 0.4667194994023077, + "learning_rate": 1.863470964464436e-05, + "loss": 0.2872, + "step": 3364 + }, + { + "epoch": 0.19, + "grad_norm": 0.32564335685434603, + "learning_rate": 1.8633770853529025e-05, + "loss": 0.236, + "step": 3365 + }, + { + "epoch": 0.19, + "grad_norm": 0.5608166440455324, + "learning_rate": 1.8632831763426574e-05, + "loss": 0.3899, + "step": 3366 + }, + { + "epoch": 0.19, + "grad_norm": 0.6682512540559473, + "learning_rate": 1.863189237436953e-05, + "loss": 0.4906, + "step": 3367 + }, + { + "epoch": 0.19, + "grad_norm": 0.35298988092787115, + "learning_rate": 1.863095268639043e-05, + "loss": 0.2482, + "step": 3368 + }, + { + "epoch": 0.19, + "grad_norm": 0.42340797367221217, + "learning_rate": 1.8630012699521806e-05, + "loss": 0.3356, + "step": 3369 + }, + { + "epoch": 0.19, + "grad_norm": 0.2767430952636719, + "learning_rate": 1.8629072413796213e-05, + "loss": 0.1825, + "step": 3370 + }, + { + "epoch": 0.19, + "grad_norm": 0.3605012936700501, + "learning_rate": 1.862813182924621e-05, + "loss": 0.2341, + "step": 3371 + }, + { + "epoch": 0.19, + "grad_norm": 1.153440050494576, + "learning_rate": 1.8627190945904382e-05, + "loss": 0.5661, + "step": 3372 + }, + { + "epoch": 0.19, + "grad_norm": 0.48897754156527273, + "learning_rate": 1.8626249763803295e-05, + "loss": 0.3302, + "step": 3373 + }, + { + "epoch": 0.19, + "grad_norm": 0.46612356766072266, + "learning_rate": 1.8625308282975552e-05, + "loss": 0.3312, + "step": 3374 + }, + { + "epoch": 0.19, + "grad_norm": 0.3181013883121376, + "learning_rate": 1.8624366503453752e-05, + "loss": 0.2266, + "step": 3375 + }, + { + "epoch": 0.19, + "grad_norm": 0.4652146982144647, + "learning_rate": 1.8623424425270514e-05, + "loss": 0.2808, + "step": 3376 + }, + { + "epoch": 0.19, + "grad_norm": 0.39602293510398723, + "learning_rate": 1.8622482048458454e-05, + "loss": 0.3131, + "step": 3377 + }, + { + "epoch": 0.19, + "grad_norm": 0.44220445658577895, + "learning_rate": 1.8621539373050218e-05, + "loss": 0.3288, + "step": 3378 + }, + { + "epoch": 0.19, + "grad_norm": 0.7299981826510856, + "learning_rate": 1.862059639907844e-05, + "loss": 0.569, + "step": 3379 + }, + { + "epoch": 0.19, + "grad_norm": 0.39958092415064894, + "learning_rate": 1.861965312657578e-05, + "loss": 0.2999, + "step": 3380 + }, + { + "epoch": 0.19, + "grad_norm": 0.3869027931462021, + "learning_rate": 1.8618709555574903e-05, + "loss": 0.277, + "step": 3381 + }, + { + "epoch": 0.19, + "grad_norm": 0.3501211008611354, + "learning_rate": 1.8617765686108486e-05, + "loss": 0.2184, + "step": 3382 + }, + { + "epoch": 0.19, + "grad_norm": 0.3614144002216407, + "learning_rate": 1.8616821518209213e-05, + "loss": 0.2989, + "step": 3383 + }, + { + "epoch": 0.19, + "grad_norm": 0.8955308958623391, + "learning_rate": 1.8615877051909783e-05, + "loss": 0.4199, + "step": 3384 + }, + { + "epoch": 0.19, + "grad_norm": 0.420971422861682, + "learning_rate": 1.8614932287242897e-05, + "loss": 0.3649, + "step": 3385 + }, + { + "epoch": 0.19, + "grad_norm": 0.39696752016884945, + "learning_rate": 1.8613987224241283e-05, + "loss": 0.2856, + "step": 3386 + }, + { + "epoch": 0.19, + "grad_norm": 1.1534536864598737, + "learning_rate": 1.8613041862937656e-05, + "loss": 0.6503, + "step": 3387 + }, + { + "epoch": 0.19, + "grad_norm": 0.26372459169274415, + "learning_rate": 1.861209620336476e-05, + "loss": 0.0746, + "step": 3388 + }, + { + "epoch": 0.19, + "grad_norm": 0.5163757820951047, + "learning_rate": 1.8611150245555345e-05, + "loss": 0.3125, + "step": 3389 + }, + { + "epoch": 0.19, + "grad_norm": 0.5196639264351134, + "learning_rate": 1.861020398954217e-05, + "loss": 0.3546, + "step": 3390 + }, + { + "epoch": 0.19, + "grad_norm": 0.8214877192095381, + "learning_rate": 1.8609257435357995e-05, + "loss": 0.387, + "step": 3391 + }, + { + "epoch": 0.19, + "grad_norm": 0.45946932605048707, + "learning_rate": 1.8608310583035607e-05, + "loss": 0.3055, + "step": 3392 + }, + { + "epoch": 0.19, + "grad_norm": 0.452824130541817, + "learning_rate": 1.8607363432607793e-05, + "loss": 0.3342, + "step": 3393 + }, + { + "epoch": 0.2, + "grad_norm": 0.33934441895880785, + "learning_rate": 1.8606415984107357e-05, + "loss": 0.1221, + "step": 3394 + }, + { + "epoch": 0.2, + "grad_norm": 0.4336853637931559, + "learning_rate": 1.8605468237567103e-05, + "loss": 0.2819, + "step": 3395 + }, + { + "epoch": 0.2, + "grad_norm": 1.0742880039688079, + "learning_rate": 1.8604520193019855e-05, + "loss": 0.5185, + "step": 3396 + }, + { + "epoch": 0.2, + "grad_norm": 0.39427788076090214, + "learning_rate": 1.860357185049844e-05, + "loss": 0.2925, + "step": 3397 + }, + { + "epoch": 0.2, + "grad_norm": 0.39690461047371556, + "learning_rate": 1.8602623210035707e-05, + "loss": 0.307, + "step": 3398 + }, + { + "epoch": 0.2, + "grad_norm": 1.1140528804897003, + "learning_rate": 1.8601674271664497e-05, + "loss": 0.7088, + "step": 3399 + }, + { + "epoch": 0.2, + "grad_norm": 0.2886567836009131, + "learning_rate": 1.8600725035417678e-05, + "loss": 0.1846, + "step": 3400 + }, + { + "epoch": 0.2, + "grad_norm": 0.3227616596528429, + "learning_rate": 1.8599775501328125e-05, + "loss": 0.2425, + "step": 3401 + }, + { + "epoch": 0.2, + "grad_norm": 0.848213881735556, + "learning_rate": 1.8598825669428713e-05, + "loss": 0.473, + "step": 3402 + }, + { + "epoch": 0.2, + "grad_norm": 0.837716610668294, + "learning_rate": 1.8597875539752337e-05, + "loss": 0.562, + "step": 3403 + }, + { + "epoch": 0.2, + "grad_norm": 0.33372430563348676, + "learning_rate": 1.85969251123319e-05, + "loss": 0.2439, + "step": 3404 + }, + { + "epoch": 0.2, + "grad_norm": 0.49703820168497725, + "learning_rate": 1.859597438720032e-05, + "loss": 0.3647, + "step": 3405 + }, + { + "epoch": 0.2, + "grad_norm": 0.4523814047602288, + "learning_rate": 1.8595023364390515e-05, + "loss": 0.3094, + "step": 3406 + }, + { + "epoch": 0.2, + "grad_norm": 0.3932121628983667, + "learning_rate": 1.8594072043935418e-05, + "loss": 0.241, + "step": 3407 + }, + { + "epoch": 0.2, + "grad_norm": 0.530507341323851, + "learning_rate": 1.8593120425867977e-05, + "loss": 0.3333, + "step": 3408 + }, + { + "epoch": 0.2, + "grad_norm": 0.45189749977195065, + "learning_rate": 1.859216851022115e-05, + "loss": 0.3763, + "step": 3409 + }, + { + "epoch": 0.2, + "grad_norm": 0.31896053252137, + "learning_rate": 1.859121629702789e-05, + "loss": 0.2049, + "step": 3410 + }, + { + "epoch": 0.2, + "grad_norm": 1.4510347587273766, + "learning_rate": 1.8590263786321182e-05, + "loss": 0.7991, + "step": 3411 + }, + { + "epoch": 0.2, + "grad_norm": 0.5061445451316303, + "learning_rate": 1.858931097813401e-05, + "loss": 0.4186, + "step": 3412 + }, + { + "epoch": 0.2, + "grad_norm": 0.35254918453174994, + "learning_rate": 1.8588357872499364e-05, + "loss": 0.2737, + "step": 3413 + }, + { + "epoch": 0.2, + "grad_norm": 0.30901119829012413, + "learning_rate": 1.8587404469450256e-05, + "loss": 0.1988, + "step": 3414 + }, + { + "epoch": 0.2, + "grad_norm": 1.1748732561056525, + "learning_rate": 1.85864507690197e-05, + "loss": 0.7539, + "step": 3415 + }, + { + "epoch": 0.2, + "grad_norm": 0.4472144634212105, + "learning_rate": 1.8585496771240726e-05, + "loss": 0.3451, + "step": 3416 + }, + { + "epoch": 0.2, + "grad_norm": 0.3288085382989217, + "learning_rate": 1.8584542476146364e-05, + "loss": 0.2777, + "step": 3417 + }, + { + "epoch": 0.2, + "grad_norm": 0.701787075717411, + "learning_rate": 1.8583587883769668e-05, + "loss": 0.5152, + "step": 3418 + }, + { + "epoch": 0.2, + "grad_norm": 0.4124720953337286, + "learning_rate": 1.8582632994143693e-05, + "loss": 0.3517, + "step": 3419 + }, + { + "epoch": 0.2, + "grad_norm": 0.26448985422784416, + "learning_rate": 1.8581677807301507e-05, + "loss": 0.1267, + "step": 3420 + }, + { + "epoch": 0.2, + "grad_norm": 0.4416407925509442, + "learning_rate": 1.8580722323276186e-05, + "loss": 0.35, + "step": 3421 + }, + { + "epoch": 0.2, + "grad_norm": 0.36003377851432783, + "learning_rate": 1.857976654210082e-05, + "loss": 0.2706, + "step": 3422 + }, + { + "epoch": 0.2, + "grad_norm": 1.0498458699375062, + "learning_rate": 1.857881046380851e-05, + "loss": 0.3784, + "step": 3423 + }, + { + "epoch": 0.2, + "grad_norm": 0.3688550683980165, + "learning_rate": 1.8577854088432355e-05, + "loss": 0.3505, + "step": 3424 + }, + { + "epoch": 0.2, + "grad_norm": 0.44942001956730154, + "learning_rate": 1.8576897416005487e-05, + "loss": 0.3227, + "step": 3425 + }, + { + "epoch": 0.2, + "grad_norm": 0.6148292590435895, + "learning_rate": 1.857594044656103e-05, + "loss": 0.3691, + "step": 3426 + }, + { + "epoch": 0.2, + "grad_norm": 0.26088148061349903, + "learning_rate": 1.8574983180132128e-05, + "loss": 0.139, + "step": 3427 + }, + { + "epoch": 0.2, + "grad_norm": 0.6143931862134248, + "learning_rate": 1.8574025616751923e-05, + "loss": 0.2772, + "step": 3428 + }, + { + "epoch": 0.2, + "grad_norm": 0.40106184091905833, + "learning_rate": 1.8573067756453578e-05, + "loss": 0.312, + "step": 3429 + }, + { + "epoch": 0.2, + "grad_norm": 0.6215169188275277, + "learning_rate": 1.8572109599270266e-05, + "loss": 0.4106, + "step": 3430 + }, + { + "epoch": 0.2, + "grad_norm": 0.4565998521916527, + "learning_rate": 1.857115114523517e-05, + "loss": 0.3022, + "step": 3431 + }, + { + "epoch": 0.2, + "grad_norm": 0.3754630031717685, + "learning_rate": 1.857019239438148e-05, + "loss": 0.2459, + "step": 3432 + }, + { + "epoch": 0.2, + "grad_norm": 0.3584316941904439, + "learning_rate": 1.8569233346742392e-05, + "loss": 0.2165, + "step": 3433 + }, + { + "epoch": 0.2, + "grad_norm": 0.4805254322830583, + "learning_rate": 1.856827400235112e-05, + "loss": 0.3607, + "step": 3434 + }, + { + "epoch": 0.2, + "grad_norm": 0.5382648282007101, + "learning_rate": 1.8567314361240893e-05, + "loss": 0.3951, + "step": 3435 + }, + { + "epoch": 0.2, + "grad_norm": 0.4865226514408291, + "learning_rate": 1.8566354423444933e-05, + "loss": 0.3781, + "step": 3436 + }, + { + "epoch": 0.2, + "grad_norm": 0.358374646552267, + "learning_rate": 1.856539418899649e-05, + "loss": 0.2488, + "step": 3437 + }, + { + "epoch": 0.2, + "grad_norm": 0.5303949361998881, + "learning_rate": 1.8564433657928815e-05, + "loss": 0.3902, + "step": 3438 + }, + { + "epoch": 0.2, + "grad_norm": 0.4547723658493619, + "learning_rate": 1.8563472830275172e-05, + "loss": 0.3408, + "step": 3439 + }, + { + "epoch": 0.2, + "grad_norm": 0.270919077914169, + "learning_rate": 1.856251170606883e-05, + "loss": 0.1925, + "step": 3440 + }, + { + "epoch": 0.2, + "grad_norm": 0.38320462625120033, + "learning_rate": 1.8561550285343077e-05, + "loss": 0.3415, + "step": 3441 + }, + { + "epoch": 0.2, + "grad_norm": 0.7710149460976287, + "learning_rate": 1.8560588568131205e-05, + "loss": 0.5157, + "step": 3442 + }, + { + "epoch": 0.2, + "grad_norm": 0.45201487807738494, + "learning_rate": 1.8559626554466523e-05, + "loss": 0.1742, + "step": 3443 + }, + { + "epoch": 0.2, + "grad_norm": 0.6001960036849652, + "learning_rate": 1.8558664244382338e-05, + "loss": 0.3938, + "step": 3444 + }, + { + "epoch": 0.2, + "grad_norm": 0.4399877397096689, + "learning_rate": 1.8557701637911978e-05, + "loss": 0.3467, + "step": 3445 + }, + { + "epoch": 0.2, + "grad_norm": 0.6694405301341718, + "learning_rate": 1.855673873508878e-05, + "loss": 0.2563, + "step": 3446 + }, + { + "epoch": 0.2, + "grad_norm": 0.5429342439640811, + "learning_rate": 1.855577553594609e-05, + "loss": 0.3684, + "step": 3447 + }, + { + "epoch": 0.2, + "grad_norm": 0.3009787386986737, + "learning_rate": 1.8554812040517255e-05, + "loss": 0.2322, + "step": 3448 + }, + { + "epoch": 0.2, + "grad_norm": 0.782021777816558, + "learning_rate": 1.855384824883565e-05, + "loss": 0.4354, + "step": 3449 + }, + { + "epoch": 0.2, + "grad_norm": 0.3773608366203985, + "learning_rate": 1.8552884160934647e-05, + "loss": 0.2431, + "step": 3450 + }, + { + "epoch": 0.2, + "grad_norm": 1.2925400215861826, + "learning_rate": 1.8551919776847634e-05, + "loss": 0.837, + "step": 3451 + }, + { + "epoch": 0.2, + "grad_norm": 0.43774430556639365, + "learning_rate": 1.8550955096608007e-05, + "loss": 0.3146, + "step": 3452 + }, + { + "epoch": 0.2, + "grad_norm": 0.4075921947601884, + "learning_rate": 1.8549990120249174e-05, + "loss": 0.3066, + "step": 3453 + }, + { + "epoch": 0.2, + "grad_norm": 0.3309237068426338, + "learning_rate": 1.8549024847804547e-05, + "loss": 0.1845, + "step": 3454 + }, + { + "epoch": 0.2, + "grad_norm": 0.5641842287581473, + "learning_rate": 1.854805927930756e-05, + "loss": 0.3114, + "step": 3455 + }, + { + "epoch": 0.2, + "grad_norm": 0.4169525329222009, + "learning_rate": 1.854709341479165e-05, + "loss": 0.2763, + "step": 3456 + }, + { + "epoch": 0.2, + "grad_norm": 0.49289840010942465, + "learning_rate": 1.8546127254290257e-05, + "loss": 0.3636, + "step": 3457 + }, + { + "epoch": 0.2, + "grad_norm": 0.5541013067086684, + "learning_rate": 1.8545160797836847e-05, + "loss": 0.4148, + "step": 3458 + }, + { + "epoch": 0.2, + "grad_norm": 0.4141408240338715, + "learning_rate": 1.8544194045464888e-05, + "loss": 0.2903, + "step": 3459 + }, + { + "epoch": 0.2, + "grad_norm": 0.28105441189183067, + "learning_rate": 1.8543226997207854e-05, + "loss": 0.2312, + "step": 3460 + }, + { + "epoch": 0.2, + "grad_norm": 0.6534352410994891, + "learning_rate": 1.8542259653099236e-05, + "loss": 0.2852, + "step": 3461 + }, + { + "epoch": 0.2, + "grad_norm": 0.4909083697020356, + "learning_rate": 1.8541292013172538e-05, + "loss": 0.3315, + "step": 3462 + }, + { + "epoch": 0.2, + "grad_norm": 0.924048730821875, + "learning_rate": 1.854032407746126e-05, + "loss": 0.4172, + "step": 3463 + }, + { + "epoch": 0.2, + "grad_norm": 0.36777627302291016, + "learning_rate": 1.853935584599893e-05, + "loss": 0.2993, + "step": 3464 + }, + { + "epoch": 0.2, + "grad_norm": 0.4041852347714912, + "learning_rate": 1.8538387318819074e-05, + "loss": 0.3167, + "step": 3465 + }, + { + "epoch": 0.2, + "grad_norm": 0.2281237706488959, + "learning_rate": 1.853741849595523e-05, + "loss": 0.0881, + "step": 3466 + }, + { + "epoch": 0.2, + "grad_norm": 0.8221586813021399, + "learning_rate": 1.853644937744095e-05, + "loss": 0.486, + "step": 3467 + }, + { + "epoch": 0.2, + "grad_norm": 0.3859475003282859, + "learning_rate": 1.8535479963309796e-05, + "loss": 0.313, + "step": 3468 + }, + { + "epoch": 0.2, + "grad_norm": 0.6530897529329045, + "learning_rate": 1.853451025359534e-05, + "loss": 0.3369, + "step": 3469 + }, + { + "epoch": 0.2, + "grad_norm": 1.042977278524951, + "learning_rate": 1.8533540248331162e-05, + "loss": 0.4573, + "step": 3470 + }, + { + "epoch": 0.2, + "grad_norm": 0.41146027977536204, + "learning_rate": 1.8532569947550846e-05, + "loss": 0.3255, + "step": 3471 + }, + { + "epoch": 0.2, + "grad_norm": 0.35765902247522974, + "learning_rate": 1.8531599351288007e-05, + "loss": 0.2252, + "step": 3472 + }, + { + "epoch": 0.2, + "grad_norm": 0.3855611438390921, + "learning_rate": 1.8530628459576243e-05, + "loss": 0.2276, + "step": 3473 + }, + { + "epoch": 0.2, + "grad_norm": 0.5204397928422537, + "learning_rate": 1.8529657272449186e-05, + "loss": 0.3064, + "step": 3474 + }, + { + "epoch": 0.2, + "grad_norm": 1.810437510701928, + "learning_rate": 1.8528685789940463e-05, + "loss": 0.5076, + "step": 3475 + }, + { + "epoch": 0.2, + "grad_norm": 0.40869087933670045, + "learning_rate": 1.8527714012083718e-05, + "loss": 0.2685, + "step": 3476 + }, + { + "epoch": 0.2, + "grad_norm": 0.43618505181314465, + "learning_rate": 1.8526741938912605e-05, + "loss": 0.3277, + "step": 3477 + }, + { + "epoch": 0.2, + "grad_norm": 0.42187847427639297, + "learning_rate": 1.8525769570460783e-05, + "loss": 0.2454, + "step": 3478 + }, + { + "epoch": 0.2, + "grad_norm": 0.3165643524146173, + "learning_rate": 1.8524796906761928e-05, + "loss": 0.1485, + "step": 3479 + }, + { + "epoch": 0.2, + "grad_norm": 0.46732317429125, + "learning_rate": 1.8523823947849722e-05, + "loss": 0.2923, + "step": 3480 + }, + { + "epoch": 0.2, + "grad_norm": 1.1275700154976764, + "learning_rate": 1.8522850693757865e-05, + "loss": 0.4158, + "step": 3481 + }, + { + "epoch": 0.2, + "grad_norm": 0.7907367906006263, + "learning_rate": 1.8521877144520047e-05, + "loss": 0.2819, + "step": 3482 + }, + { + "epoch": 0.2, + "grad_norm": 0.44558259936120503, + "learning_rate": 1.8520903300169993e-05, + "loss": 0.3128, + "step": 3483 + }, + { + "epoch": 0.2, + "grad_norm": 0.31241687566734866, + "learning_rate": 1.8519929160741427e-05, + "loss": 0.2827, + "step": 3484 + }, + { + "epoch": 0.2, + "grad_norm": 0.31618469870226495, + "learning_rate": 1.8518954726268076e-05, + "loss": 0.1135, + "step": 3485 + }, + { + "epoch": 0.2, + "grad_norm": 0.3964958438859382, + "learning_rate": 1.851797999678369e-05, + "loss": 0.3282, + "step": 3486 + }, + { + "epoch": 0.2, + "grad_norm": 0.8788401444161869, + "learning_rate": 1.8517004972322022e-05, + "loss": 0.5349, + "step": 3487 + }, + { + "epoch": 0.2, + "grad_norm": 0.5842037117974624, + "learning_rate": 1.851602965291684e-05, + "loss": 0.3523, + "step": 3488 + }, + { + "epoch": 0.2, + "grad_norm": 0.4250093402378405, + "learning_rate": 1.851505403860192e-05, + "loss": 0.2307, + "step": 3489 + }, + { + "epoch": 0.2, + "grad_norm": 1.1648122285456606, + "learning_rate": 1.8514078129411045e-05, + "loss": 0.5946, + "step": 3490 + }, + { + "epoch": 0.2, + "grad_norm": 0.33870419487266124, + "learning_rate": 1.8513101925378006e-05, + "loss": 0.2547, + "step": 3491 + }, + { + "epoch": 0.2, + "grad_norm": 0.3289610462955457, + "learning_rate": 1.8512125426536617e-05, + "loss": 0.2312, + "step": 3492 + }, + { + "epoch": 0.2, + "grad_norm": 0.7856786137489506, + "learning_rate": 1.8511148632920685e-05, + "loss": 0.4921, + "step": 3493 + }, + { + "epoch": 0.2, + "grad_norm": 0.9243725044925123, + "learning_rate": 1.851017154456405e-05, + "loss": 0.5751, + "step": 3494 + }, + { + "epoch": 0.2, + "grad_norm": 0.38206441570861877, + "learning_rate": 1.8509194161500536e-05, + "loss": 0.24, + "step": 3495 + }, + { + "epoch": 0.2, + "grad_norm": 0.4583245897267825, + "learning_rate": 1.8508216483763993e-05, + "loss": 0.3524, + "step": 3496 + }, + { + "epoch": 0.2, + "grad_norm": 0.2942176231528043, + "learning_rate": 1.850723851138828e-05, + "loss": 0.2291, + "step": 3497 + }, + { + "epoch": 0.2, + "grad_norm": 0.3675286195522216, + "learning_rate": 1.850626024440726e-05, + "loss": 0.2646, + "step": 3498 + }, + { + "epoch": 0.2, + "grad_norm": 0.4881052772922558, + "learning_rate": 1.850528168285482e-05, + "loss": 0.4122, + "step": 3499 + }, + { + "epoch": 0.2, + "grad_norm": 0.5091923599798945, + "learning_rate": 1.8504302826764835e-05, + "loss": 0.3891, + "step": 3500 + }, + { + "epoch": 0.2, + "grad_norm": 0.3633056005786531, + "learning_rate": 1.8503323676171212e-05, + "loss": 0.2949, + "step": 3501 + }, + { + "epoch": 0.2, + "grad_norm": 0.5991201857527949, + "learning_rate": 1.8502344231107855e-05, + "loss": 0.3974, + "step": 3502 + }, + { + "epoch": 0.2, + "grad_norm": 0.40415730403043115, + "learning_rate": 1.8501364491608683e-05, + "loss": 0.3875, + "step": 3503 + }, + { + "epoch": 0.2, + "grad_norm": 0.34569731298708684, + "learning_rate": 1.8500384457707625e-05, + "loss": 0.285, + "step": 3504 + }, + { + "epoch": 0.2, + "grad_norm": 0.2568026864658739, + "learning_rate": 1.8499404129438617e-05, + "loss": 0.1563, + "step": 3505 + }, + { + "epoch": 0.2, + "grad_norm": 1.1707986100418522, + "learning_rate": 1.8498423506835613e-05, + "loss": 0.7772, + "step": 3506 + }, + { + "epoch": 0.2, + "grad_norm": 0.38602275849489903, + "learning_rate": 1.8497442589932568e-05, + "loss": 0.2805, + "step": 3507 + }, + { + "epoch": 0.2, + "grad_norm": 0.4079578462025017, + "learning_rate": 1.8496461378763445e-05, + "loss": 0.2396, + "step": 3508 + }, + { + "epoch": 0.2, + "grad_norm": 0.5885481273942008, + "learning_rate": 1.8495479873362237e-05, + "loss": 0.4813, + "step": 3509 + }, + { + "epoch": 0.2, + "grad_norm": 0.28505728074008135, + "learning_rate": 1.8494498073762924e-05, + "loss": 0.2308, + "step": 3510 + }, + { + "epoch": 0.2, + "grad_norm": 0.3259515702055285, + "learning_rate": 1.8493515979999508e-05, + "loss": 0.2147, + "step": 3511 + }, + { + "epoch": 0.2, + "grad_norm": 0.5081829044842984, + "learning_rate": 1.8492533592105998e-05, + "loss": 0.3786, + "step": 3512 + }, + { + "epoch": 0.2, + "grad_norm": 0.40211123448914077, + "learning_rate": 1.8491550910116415e-05, + "loss": 0.2937, + "step": 3513 + }, + { + "epoch": 0.2, + "grad_norm": 0.5797803004312481, + "learning_rate": 1.8490567934064788e-05, + "loss": 0.3893, + "step": 3514 + }, + { + "epoch": 0.2, + "grad_norm": 0.35474516854979343, + "learning_rate": 1.848958466398516e-05, + "loss": 0.311, + "step": 3515 + }, + { + "epoch": 0.2, + "grad_norm": 0.3743499556350666, + "learning_rate": 1.8488601099911582e-05, + "loss": 0.2879, + "step": 3516 + }, + { + "epoch": 0.2, + "grad_norm": 0.34838342234863295, + "learning_rate": 1.8487617241878114e-05, + "loss": 0.2501, + "step": 3517 + }, + { + "epoch": 0.2, + "grad_norm": 0.9330455844434704, + "learning_rate": 1.8486633089918823e-05, + "loss": 0.5301, + "step": 3518 + }, + { + "epoch": 0.2, + "grad_norm": 0.3876398473412494, + "learning_rate": 1.848564864406779e-05, + "loss": 0.2736, + "step": 3519 + }, + { + "epoch": 0.2, + "grad_norm": 0.38224800897615363, + "learning_rate": 1.8484663904359112e-05, + "loss": 0.3232, + "step": 3520 + }, + { + "epoch": 0.2, + "grad_norm": 0.7342180011930438, + "learning_rate": 1.848367887082689e-05, + "loss": 0.4027, + "step": 3521 + }, + { + "epoch": 0.2, + "grad_norm": 0.37253887582059153, + "learning_rate": 1.848269354350523e-05, + "loss": 0.2922, + "step": 3522 + }, + { + "epoch": 0.2, + "grad_norm": 0.4486634748559754, + "learning_rate": 1.848170792242826e-05, + "loss": 0.3188, + "step": 3523 + }, + { + "epoch": 0.2, + "grad_norm": 0.41124068257992336, + "learning_rate": 1.848072200763011e-05, + "loss": 0.3629, + "step": 3524 + }, + { + "epoch": 0.2, + "grad_norm": 0.252569055610881, + "learning_rate": 1.8479735799144917e-05, + "loss": 0.1877, + "step": 3525 + }, + { + "epoch": 0.2, + "grad_norm": 0.6926210684337959, + "learning_rate": 1.847874929700684e-05, + "loss": 0.5067, + "step": 3526 + }, + { + "epoch": 0.2, + "grad_norm": 0.5730030339821481, + "learning_rate": 1.8477762501250046e-05, + "loss": 0.3562, + "step": 3527 + }, + { + "epoch": 0.2, + "grad_norm": 0.36594434314465024, + "learning_rate": 1.847677541190869e-05, + "loss": 0.2255, + "step": 3528 + }, + { + "epoch": 0.2, + "grad_norm": 0.5439482028879972, + "learning_rate": 1.8475788029016974e-05, + "loss": 0.3023, + "step": 3529 + }, + { + "epoch": 0.2, + "grad_norm": 0.5147312308129324, + "learning_rate": 1.847480035260908e-05, + "loss": 0.3325, + "step": 3530 + }, + { + "epoch": 0.2, + "grad_norm": 0.396855562434019, + "learning_rate": 1.8473812382719215e-05, + "loss": 0.2331, + "step": 3531 + }, + { + "epoch": 0.2, + "grad_norm": 0.3650937303546053, + "learning_rate": 1.8472824119381592e-05, + "loss": 0.3324, + "step": 3532 + }, + { + "epoch": 0.2, + "grad_norm": 0.8069519781469283, + "learning_rate": 1.8471835562630435e-05, + "loss": 0.5369, + "step": 3533 + }, + { + "epoch": 0.2, + "grad_norm": 0.3224016264255935, + "learning_rate": 1.8470846712499977e-05, + "loss": 0.2151, + "step": 3534 + }, + { + "epoch": 0.2, + "grad_norm": 0.49887651442034286, + "learning_rate": 1.846985756902446e-05, + "loss": 0.4021, + "step": 3535 + }, + { + "epoch": 0.2, + "grad_norm": 0.372999385227418, + "learning_rate": 1.8468868132238138e-05, + "loss": 0.3151, + "step": 3536 + }, + { + "epoch": 0.2, + "grad_norm": 0.4155513944432335, + "learning_rate": 1.8467878402175278e-05, + "loss": 0.3341, + "step": 3537 + }, + { + "epoch": 0.2, + "grad_norm": 0.38115414788893204, + "learning_rate": 1.8466888378870155e-05, + "loss": 0.2816, + "step": 3538 + }, + { + "epoch": 0.2, + "grad_norm": 0.3971013745780681, + "learning_rate": 1.8465898062357048e-05, + "loss": 0.3233, + "step": 3539 + }, + { + "epoch": 0.2, + "grad_norm": 0.405786263334606, + "learning_rate": 1.846490745267026e-05, + "loss": 0.3123, + "step": 3540 + }, + { + "epoch": 0.2, + "grad_norm": 0.524142882921503, + "learning_rate": 1.8463916549844084e-05, + "loss": 0.2972, + "step": 3541 + }, + { + "epoch": 0.2, + "grad_norm": 0.43131354308913267, + "learning_rate": 1.846292535391285e-05, + "loss": 0.3471, + "step": 3542 + }, + { + "epoch": 0.2, + "grad_norm": 0.3522374006752674, + "learning_rate": 1.846193386491087e-05, + "loss": 0.316, + "step": 3543 + }, + { + "epoch": 0.2, + "grad_norm": 0.4367897212599001, + "learning_rate": 1.846094208287248e-05, + "loss": 0.3275, + "step": 3544 + }, + { + "epoch": 0.2, + "grad_norm": 0.4318912036953102, + "learning_rate": 1.845995000783204e-05, + "loss": 0.2809, + "step": 3545 + }, + { + "epoch": 0.2, + "grad_norm": 0.37576132450278826, + "learning_rate": 1.8458957639823887e-05, + "loss": 0.271, + "step": 3546 + }, + { + "epoch": 0.2, + "grad_norm": 0.44073497833225533, + "learning_rate": 1.8457964978882397e-05, + "loss": 0.2809, + "step": 3547 + }, + { + "epoch": 0.2, + "grad_norm": 0.4402566585814481, + "learning_rate": 1.8456972025041943e-05, + "loss": 0.411, + "step": 3548 + }, + { + "epoch": 0.2, + "grad_norm": 0.5817384512608036, + "learning_rate": 1.845597877833691e-05, + "loss": 0.3939, + "step": 3549 + }, + { + "epoch": 0.2, + "grad_norm": 0.347563565667777, + "learning_rate": 1.84549852388017e-05, + "loss": 0.2921, + "step": 3550 + }, + { + "epoch": 0.2, + "grad_norm": 0.3335895526495127, + "learning_rate": 1.845399140647071e-05, + "loss": 0.2274, + "step": 3551 + }, + { + "epoch": 0.2, + "grad_norm": 0.7074274468152657, + "learning_rate": 1.8452997281378364e-05, + "loss": 0.3816, + "step": 3552 + }, + { + "epoch": 0.2, + "grad_norm": 0.4984396584595269, + "learning_rate": 1.8452002863559086e-05, + "loss": 0.3568, + "step": 3553 + }, + { + "epoch": 0.2, + "grad_norm": 0.6284443435298703, + "learning_rate": 1.845100815304731e-05, + "loss": 0.4117, + "step": 3554 + }, + { + "epoch": 0.2, + "grad_norm": 0.3490991839784079, + "learning_rate": 1.845001314987749e-05, + "loss": 0.28, + "step": 3555 + }, + { + "epoch": 0.2, + "grad_norm": 0.4250866389209264, + "learning_rate": 1.8449017854084072e-05, + "loss": 0.3298, + "step": 3556 + }, + { + "epoch": 0.2, + "grad_norm": 0.3043618837957513, + "learning_rate": 1.844802226570153e-05, + "loss": 0.1234, + "step": 3557 + }, + { + "epoch": 0.2, + "grad_norm": 0.4379559170905638, + "learning_rate": 1.8447026384764343e-05, + "loss": 0.3207, + "step": 3558 + }, + { + "epoch": 0.2, + "grad_norm": 0.3965722955626275, + "learning_rate": 1.8446030211306993e-05, + "loss": 0.3391, + "step": 3559 + }, + { + "epoch": 0.2, + "grad_norm": 1.024955890124149, + "learning_rate": 1.844503374536398e-05, + "loss": 0.427, + "step": 3560 + }, + { + "epoch": 0.2, + "grad_norm": 0.41957172094011896, + "learning_rate": 1.8444036986969814e-05, + "loss": 0.336, + "step": 3561 + }, + { + "epoch": 0.2, + "grad_norm": 0.5542877434696505, + "learning_rate": 1.8443039936159007e-05, + "loss": 0.3461, + "step": 3562 + }, + { + "epoch": 0.2, + "grad_norm": 0.2729193047980405, + "learning_rate": 1.8442042592966095e-05, + "loss": 0.2327, + "step": 3563 + }, + { + "epoch": 0.2, + "grad_norm": 0.3549744193912546, + "learning_rate": 1.8441044957425608e-05, + "loss": 0.1783, + "step": 3564 + }, + { + "epoch": 0.2, + "grad_norm": 0.5966312774507957, + "learning_rate": 1.8440047029572094e-05, + "loss": 0.41, + "step": 3565 + }, + { + "epoch": 0.2, + "grad_norm": 1.0779161799481367, + "learning_rate": 1.843904880944012e-05, + "loss": 0.5309, + "step": 3566 + }, + { + "epoch": 0.2, + "grad_norm": 0.3425507945907625, + "learning_rate": 1.843805029706425e-05, + "loss": 0.2269, + "step": 3567 + }, + { + "epoch": 0.2, + "grad_norm": 0.43024803622830704, + "learning_rate": 1.8437051492479053e-05, + "loss": 0.3461, + "step": 3568 + }, + { + "epoch": 0.21, + "grad_norm": 0.32182440028828346, + "learning_rate": 1.843605239571913e-05, + "loss": 0.1903, + "step": 3569 + }, + { + "epoch": 0.21, + "grad_norm": 0.43595402051191434, + "learning_rate": 1.8435053006819073e-05, + "loss": 0.1819, + "step": 3570 + }, + { + "epoch": 0.21, + "grad_norm": 0.4289935360872228, + "learning_rate": 1.8434053325813495e-05, + "loss": 0.3026, + "step": 3571 + }, + { + "epoch": 0.21, + "grad_norm": 1.259600352356555, + "learning_rate": 1.8433053352737014e-05, + "loss": 0.5352, + "step": 3572 + }, + { + "epoch": 0.21, + "grad_norm": 0.46454768580118233, + "learning_rate": 1.8432053087624258e-05, + "loss": 0.2123, + "step": 3573 + }, + { + "epoch": 0.21, + "grad_norm": 0.3683549795511345, + "learning_rate": 1.8431052530509866e-05, + "loss": 0.2625, + "step": 3574 + }, + { + "epoch": 0.21, + "grad_norm": 0.3434082020790198, + "learning_rate": 1.8430051681428486e-05, + "loss": 0.2504, + "step": 3575 + }, + { + "epoch": 0.21, + "grad_norm": 0.6314261221401793, + "learning_rate": 1.842905054041478e-05, + "loss": 0.3683, + "step": 3576 + }, + { + "epoch": 0.21, + "grad_norm": 0.7178093309482125, + "learning_rate": 1.8428049107503417e-05, + "loss": 0.3267, + "step": 3577 + }, + { + "epoch": 0.21, + "grad_norm": 1.1845693433728908, + "learning_rate": 1.8427047382729073e-05, + "loss": 0.606, + "step": 3578 + }, + { + "epoch": 0.21, + "grad_norm": 0.3857144192125777, + "learning_rate": 1.842604536612644e-05, + "loss": 0.3092, + "step": 3579 + }, + { + "epoch": 0.21, + "grad_norm": 0.422890021608481, + "learning_rate": 1.842504305773022e-05, + "loss": 0.2626, + "step": 3580 + }, + { + "epoch": 0.21, + "grad_norm": 0.319692052216907, + "learning_rate": 1.8424040457575124e-05, + "loss": 0.1854, + "step": 3581 + }, + { + "epoch": 0.21, + "grad_norm": 0.609801137363293, + "learning_rate": 1.8423037565695864e-05, + "loss": 0.3804, + "step": 3582 + }, + { + "epoch": 0.21, + "grad_norm": 0.3952230768238028, + "learning_rate": 1.842203438212718e-05, + "loss": 0.2583, + "step": 3583 + }, + { + "epoch": 0.21, + "grad_norm": 0.9995316109812836, + "learning_rate": 1.8421030906903805e-05, + "loss": 0.5573, + "step": 3584 + }, + { + "epoch": 0.21, + "grad_norm": 0.8080682129475155, + "learning_rate": 1.8420027140060493e-05, + "loss": 0.464, + "step": 3585 + }, + { + "epoch": 0.21, + "grad_norm": 0.38502970294226097, + "learning_rate": 1.8419023081632e-05, + "loss": 0.2237, + "step": 3586 + }, + { + "epoch": 0.21, + "grad_norm": 0.3003903655050074, + "learning_rate": 1.8418018731653106e-05, + "loss": 0.2376, + "step": 3587 + }, + { + "epoch": 0.21, + "grad_norm": 0.8598542283516147, + "learning_rate": 1.841701409015858e-05, + "loss": 0.4417, + "step": 3588 + }, + { + "epoch": 0.21, + "grad_norm": 0.4722555501097505, + "learning_rate": 1.841600915718322e-05, + "loss": 0.3618, + "step": 3589 + }, + { + "epoch": 0.21, + "grad_norm": 0.8780361475667137, + "learning_rate": 1.8415003932761823e-05, + "loss": 0.5438, + "step": 3590 + }, + { + "epoch": 0.21, + "grad_norm": 0.36609700316137517, + "learning_rate": 1.8413998416929205e-05, + "loss": 0.3317, + "step": 3591 + }, + { + "epoch": 0.21, + "grad_norm": 0.4232364971031735, + "learning_rate": 1.8412992609720183e-05, + "loss": 0.3091, + "step": 3592 + }, + { + "epoch": 0.21, + "grad_norm": 0.2582320073159369, + "learning_rate": 1.8411986511169585e-05, + "loss": 0.0721, + "step": 3593 + }, + { + "epoch": 0.21, + "grad_norm": 0.615145234861627, + "learning_rate": 1.8410980121312258e-05, + "loss": 0.41, + "step": 3594 + }, + { + "epoch": 0.21, + "grad_norm": 0.3981913852643038, + "learning_rate": 1.8409973440183054e-05, + "loss": 0.3132, + "step": 3595 + }, + { + "epoch": 0.21, + "grad_norm": 1.2713381305155262, + "learning_rate": 1.840896646781683e-05, + "loss": 0.5107, + "step": 3596 + }, + { + "epoch": 0.21, + "grad_norm": 0.5843553386069017, + "learning_rate": 1.8407959204248455e-05, + "loss": 0.3627, + "step": 3597 + }, + { + "epoch": 0.21, + "grad_norm": 0.46489119063537054, + "learning_rate": 1.8406951649512817e-05, + "loss": 0.3248, + "step": 3598 + }, + { + "epoch": 0.21, + "grad_norm": 0.47103184457362807, + "learning_rate": 1.8405943803644803e-05, + "loss": 0.3158, + "step": 3599 + }, + { + "epoch": 0.21, + "grad_norm": 0.7760715327257453, + "learning_rate": 1.840493566667932e-05, + "loss": 0.3387, + "step": 3600 + }, + { + "epoch": 0.21, + "grad_norm": 0.3663667134586227, + "learning_rate": 1.8403927238651274e-05, + "loss": 0.2674, + "step": 3601 + }, + { + "epoch": 0.21, + "grad_norm": 0.5303169306539498, + "learning_rate": 1.8402918519595592e-05, + "loss": 0.3238, + "step": 3602 + }, + { + "epoch": 0.21, + "grad_norm": 0.47049291683057637, + "learning_rate": 1.8401909509547196e-05, + "loss": 0.2613, + "step": 3603 + }, + { + "epoch": 0.21, + "grad_norm": 0.4636995547912467, + "learning_rate": 1.8400900208541045e-05, + "loss": 0.3415, + "step": 3604 + }, + { + "epoch": 0.21, + "grad_norm": 0.610816875773527, + "learning_rate": 1.8399890616612073e-05, + "loss": 0.4726, + "step": 3605 + }, + { + "epoch": 0.21, + "grad_norm": 0.3932357164783332, + "learning_rate": 1.8398880733795253e-05, + "loss": 0.2688, + "step": 3606 + }, + { + "epoch": 0.21, + "grad_norm": 0.33115183414057575, + "learning_rate": 1.8397870560125554e-05, + "loss": 0.2524, + "step": 3607 + }, + { + "epoch": 0.21, + "grad_norm": 0.5466588959281407, + "learning_rate": 1.839686009563796e-05, + "loss": 0.368, + "step": 3608 + }, + { + "epoch": 0.21, + "grad_norm": 0.3873845994650159, + "learning_rate": 1.839584934036746e-05, + "loss": 0.0684, + "step": 3609 + }, + { + "epoch": 0.21, + "grad_norm": 0.5054505521879232, + "learning_rate": 1.8394838294349058e-05, + "loss": 0.3073, + "step": 3610 + }, + { + "epoch": 0.21, + "grad_norm": 0.595052581816832, + "learning_rate": 1.839382695761777e-05, + "loss": 0.3465, + "step": 3611 + }, + { + "epoch": 0.21, + "grad_norm": 1.1129720778207792, + "learning_rate": 1.839281533020861e-05, + "loss": 0.4943, + "step": 3612 + }, + { + "epoch": 0.21, + "grad_norm": 0.33785668181825734, + "learning_rate": 1.839180341215662e-05, + "loss": 0.1835, + "step": 3613 + }, + { + "epoch": 0.21, + "grad_norm": 1.3938903417041624, + "learning_rate": 1.8390791203496842e-05, + "loss": 0.8968, + "step": 3614 + }, + { + "epoch": 0.21, + "grad_norm": 0.3179001508479327, + "learning_rate": 1.838977870426432e-05, + "loss": 0.2777, + "step": 3615 + }, + { + "epoch": 0.21, + "grad_norm": 0.38387065537886916, + "learning_rate": 1.8388765914494124e-05, + "loss": 0.1989, + "step": 3616 + }, + { + "epoch": 0.21, + "grad_norm": 0.9955530352807715, + "learning_rate": 1.8387752834221326e-05, + "loss": 0.4931, + "step": 3617 + }, + { + "epoch": 0.21, + "grad_norm": 0.45334877775759963, + "learning_rate": 1.8386739463481004e-05, + "loss": 0.2833, + "step": 3618 + }, + { + "epoch": 0.21, + "grad_norm": 0.4419384557222577, + "learning_rate": 1.838572580230826e-05, + "loss": 0.2239, + "step": 3619 + }, + { + "epoch": 0.21, + "grad_norm": 1.2042963328857403, + "learning_rate": 1.838471185073819e-05, + "loss": 0.7506, + "step": 3620 + }, + { + "epoch": 0.21, + "grad_norm": 0.28333197613712585, + "learning_rate": 1.8383697608805907e-05, + "loss": 0.2277, + "step": 3621 + }, + { + "epoch": 0.21, + "grad_norm": 0.35663776056728297, + "learning_rate": 1.838268307654654e-05, + "loss": 0.2561, + "step": 3622 + }, + { + "epoch": 0.21, + "grad_norm": 0.38137141453365087, + "learning_rate": 1.8381668253995216e-05, + "loss": 0.3414, + "step": 3623 + }, + { + "epoch": 0.21, + "grad_norm": 1.3007390949271715, + "learning_rate": 1.8380653141187084e-05, + "loss": 0.5905, + "step": 3624 + }, + { + "epoch": 0.21, + "grad_norm": 0.3586717879521889, + "learning_rate": 1.837963773815729e-05, + "loss": 0.2828, + "step": 3625 + }, + { + "epoch": 0.21, + "grad_norm": 0.4624648099073319, + "learning_rate": 1.8378622044941007e-05, + "loss": 0.2992, + "step": 3626 + }, + { + "epoch": 0.21, + "grad_norm": 0.28370620018557996, + "learning_rate": 1.8377606061573398e-05, + "loss": 0.1702, + "step": 3627 + }, + { + "epoch": 0.21, + "grad_norm": 0.4113606179544061, + "learning_rate": 1.8376589788089655e-05, + "loss": 0.2951, + "step": 3628 + }, + { + "epoch": 0.21, + "grad_norm": 0.771704015761994, + "learning_rate": 1.837557322452496e-05, + "loss": 0.4089, + "step": 3629 + }, + { + "epoch": 0.21, + "grad_norm": 0.4695330219200323, + "learning_rate": 1.8374556370914533e-05, + "loss": 0.3328, + "step": 3630 + }, + { + "epoch": 0.21, + "grad_norm": 0.3891106792486114, + "learning_rate": 1.8373539227293576e-05, + "loss": 0.2798, + "step": 3631 + }, + { + "epoch": 0.21, + "grad_norm": 0.9231594968954966, + "learning_rate": 1.8372521793697317e-05, + "loss": 0.5186, + "step": 3632 + }, + { + "epoch": 0.21, + "grad_norm": 0.5334306794574006, + "learning_rate": 1.8371504070160985e-05, + "loss": 0.3294, + "step": 3633 + }, + { + "epoch": 0.21, + "grad_norm": 0.4208129570730506, + "learning_rate": 1.8370486056719828e-05, + "loss": 0.2981, + "step": 3634 + }, + { + "epoch": 0.21, + "grad_norm": 0.3440651749124253, + "learning_rate": 1.8369467753409102e-05, + "loss": 0.2406, + "step": 3635 + }, + { + "epoch": 0.21, + "grad_norm": 1.0039452445703967, + "learning_rate": 1.8368449160264064e-05, + "loss": 0.5453, + "step": 3636 + }, + { + "epoch": 0.21, + "grad_norm": 0.44073239184364055, + "learning_rate": 1.836743027731999e-05, + "loss": 0.3189, + "step": 3637 + }, + { + "epoch": 0.21, + "grad_norm": 0.40024632916236896, + "learning_rate": 1.8366411104612168e-05, + "loss": 0.3527, + "step": 3638 + }, + { + "epoch": 0.21, + "grad_norm": 0.4302314709355559, + "learning_rate": 1.8365391642175892e-05, + "loss": 0.3616, + "step": 3639 + }, + { + "epoch": 0.21, + "grad_norm": 0.3902773433049301, + "learning_rate": 1.836437189004646e-05, + "loss": 0.2706, + "step": 3640 + }, + { + "epoch": 0.21, + "grad_norm": 0.3034132980546468, + "learning_rate": 1.836335184825919e-05, + "loss": 0.2261, + "step": 3641 + }, + { + "epoch": 0.21, + "grad_norm": 0.4224483560026623, + "learning_rate": 1.8362331516849405e-05, + "loss": 0.2859, + "step": 3642 + }, + { + "epoch": 0.21, + "grad_norm": 0.3997927453552849, + "learning_rate": 1.8361310895852437e-05, + "loss": 0.2909, + "step": 3643 + }, + { + "epoch": 0.21, + "grad_norm": 0.8073075120292302, + "learning_rate": 1.8360289985303637e-05, + "loss": 0.5745, + "step": 3644 + }, + { + "epoch": 0.21, + "grad_norm": 0.5995548870672561, + "learning_rate": 1.8359268785238348e-05, + "loss": 0.4058, + "step": 3645 + }, + { + "epoch": 0.21, + "grad_norm": 0.4135190586223736, + "learning_rate": 1.8358247295691946e-05, + "loss": 0.2841, + "step": 3646 + }, + { + "epoch": 0.21, + "grad_norm": 0.2933792066466321, + "learning_rate": 1.8357225516699797e-05, + "loss": 0.2371, + "step": 3647 + }, + { + "epoch": 0.21, + "grad_norm": 0.8306758061657067, + "learning_rate": 1.835620344829729e-05, + "loss": 0.3256, + "step": 3648 + }, + { + "epoch": 0.21, + "grad_norm": 0.40272226138516737, + "learning_rate": 1.8355181090519814e-05, + "loss": 0.2961, + "step": 3649 + }, + { + "epoch": 0.21, + "grad_norm": 0.7342811633805951, + "learning_rate": 1.8354158443402777e-05, + "loss": 0.4417, + "step": 3650 + }, + { + "epoch": 0.21, + "grad_norm": 0.5508644077439324, + "learning_rate": 1.835313550698159e-05, + "loss": 0.3978, + "step": 3651 + }, + { + "epoch": 0.21, + "grad_norm": 0.3936970414946991, + "learning_rate": 1.8352112281291683e-05, + "loss": 0.2251, + "step": 3652 + }, + { + "epoch": 0.21, + "grad_norm": 0.30868325393237483, + "learning_rate": 1.8351088766368487e-05, + "loss": 0.1808, + "step": 3653 + }, + { + "epoch": 0.21, + "grad_norm": 0.4717999561118713, + "learning_rate": 1.8350064962247443e-05, + "loss": 0.3688, + "step": 3654 + }, + { + "epoch": 0.21, + "grad_norm": 0.3690124510455886, + "learning_rate": 1.8349040868964012e-05, + "loss": 0.1872, + "step": 3655 + }, + { + "epoch": 0.21, + "grad_norm": 0.516836148527607, + "learning_rate": 1.8348016486553653e-05, + "loss": 0.4212, + "step": 3656 + }, + { + "epoch": 0.21, + "grad_norm": 0.4174942641382965, + "learning_rate": 1.8346991815051844e-05, + "loss": 0.4136, + "step": 3657 + }, + { + "epoch": 0.21, + "grad_norm": 0.34246783846879647, + "learning_rate": 1.8345966854494065e-05, + "loss": 0.2218, + "step": 3658 + }, + { + "epoch": 0.21, + "grad_norm": 0.29464001657216854, + "learning_rate": 1.8344941604915813e-05, + "loss": 0.2341, + "step": 3659 + }, + { + "epoch": 0.21, + "grad_norm": 1.0963469580919971, + "learning_rate": 1.8343916066352593e-05, + "loss": 0.5888, + "step": 3660 + }, + { + "epoch": 0.21, + "grad_norm": 0.39006202105175725, + "learning_rate": 1.834289023883992e-05, + "loss": 0.2244, + "step": 3661 + }, + { + "epoch": 0.21, + "grad_norm": 0.45351816412520496, + "learning_rate": 1.8341864122413313e-05, + "loss": 0.3389, + "step": 3662 + }, + { + "epoch": 0.21, + "grad_norm": 1.013846492560218, + "learning_rate": 1.8340837717108312e-05, + "loss": 0.6265, + "step": 3663 + }, + { + "epoch": 0.21, + "grad_norm": 0.44933729211335915, + "learning_rate": 1.8339811022960458e-05, + "loss": 0.3057, + "step": 3664 + }, + { + "epoch": 0.21, + "grad_norm": 0.31780800408881466, + "learning_rate": 1.833878404000531e-05, + "loss": 0.1626, + "step": 3665 + }, + { + "epoch": 0.21, + "grad_norm": 0.3529354991105561, + "learning_rate": 1.8337756768278425e-05, + "loss": 0.2648, + "step": 3666 + }, + { + "epoch": 0.21, + "grad_norm": 0.4312121030833976, + "learning_rate": 1.8336729207815386e-05, + "loss": 0.2812, + "step": 3667 + }, + { + "epoch": 0.21, + "grad_norm": 0.7354723969211455, + "learning_rate": 1.833570135865177e-05, + "loss": 0.3566, + "step": 3668 + }, + { + "epoch": 0.21, + "grad_norm": 1.0099131930682428, + "learning_rate": 1.8334673220823175e-05, + "loss": 0.6111, + "step": 3669 + }, + { + "epoch": 0.21, + "grad_norm": 0.31425998270147343, + "learning_rate": 1.8333644794365205e-05, + "loss": 0.27, + "step": 3670 + }, + { + "epoch": 0.21, + "grad_norm": 0.3286391464636836, + "learning_rate": 1.8332616079313473e-05, + "loss": 0.2264, + "step": 3671 + }, + { + "epoch": 0.21, + "grad_norm": 0.47239060853182213, + "learning_rate": 1.8331587075703607e-05, + "loss": 0.302, + "step": 3672 + }, + { + "epoch": 0.21, + "grad_norm": 0.5911532765330798, + "learning_rate": 1.833055778357124e-05, + "loss": 0.3735, + "step": 3673 + }, + { + "epoch": 0.21, + "grad_norm": 0.3816260176156264, + "learning_rate": 1.8329528202952013e-05, + "loss": 0.2915, + "step": 3674 + }, + { + "epoch": 0.21, + "grad_norm": 1.2602046875475506, + "learning_rate": 1.832849833388158e-05, + "loss": 0.8317, + "step": 3675 + }, + { + "epoch": 0.21, + "grad_norm": 0.5802384065602598, + "learning_rate": 1.8327468176395614e-05, + "loss": 0.378, + "step": 3676 + }, + { + "epoch": 0.21, + "grad_norm": 0.3233690864917701, + "learning_rate": 1.8326437730529778e-05, + "loss": 0.2625, + "step": 3677 + }, + { + "epoch": 0.21, + "grad_norm": 0.30650130488543587, + "learning_rate": 1.8325406996319762e-05, + "loss": 0.2177, + "step": 3678 + }, + { + "epoch": 0.21, + "grad_norm": 0.4361961196440014, + "learning_rate": 1.8324375973801262e-05, + "loss": 0.3167, + "step": 3679 + }, + { + "epoch": 0.21, + "grad_norm": 0.540582742403159, + "learning_rate": 1.8323344663009976e-05, + "loss": 0.4255, + "step": 3680 + }, + { + "epoch": 0.21, + "grad_norm": 1.26530181020778, + "learning_rate": 1.8322313063981628e-05, + "loss": 0.5116, + "step": 3681 + }, + { + "epoch": 0.21, + "grad_norm": 0.3019632406735647, + "learning_rate": 1.8321281176751932e-05, + "loss": 0.2601, + "step": 3682 + }, + { + "epoch": 0.21, + "grad_norm": 0.5169685558732896, + "learning_rate": 1.8320249001356627e-05, + "loss": 0.3703, + "step": 3683 + }, + { + "epoch": 0.21, + "grad_norm": 0.40742847831985723, + "learning_rate": 1.831921653783146e-05, + "loss": 0.1924, + "step": 3684 + }, + { + "epoch": 0.21, + "grad_norm": 0.5064412702318805, + "learning_rate": 1.8318183786212177e-05, + "loss": 0.3476, + "step": 3685 + }, + { + "epoch": 0.21, + "grad_norm": 0.39189664074458824, + "learning_rate": 1.8317150746534553e-05, + "loss": 0.3303, + "step": 3686 + }, + { + "epoch": 0.21, + "grad_norm": 1.386780392774275, + "learning_rate": 1.8316117418834352e-05, + "loss": 0.8824, + "step": 3687 + }, + { + "epoch": 0.21, + "grad_norm": 0.3476158374729174, + "learning_rate": 1.831508380314736e-05, + "loss": 0.1713, + "step": 3688 + }, + { + "epoch": 0.21, + "grad_norm": 0.5407610308721493, + "learning_rate": 1.831404989950938e-05, + "loss": 0.4049, + "step": 3689 + }, + { + "epoch": 0.21, + "grad_norm": 0.34332848216555034, + "learning_rate": 1.8313015707956205e-05, + "loss": 0.2995, + "step": 3690 + }, + { + "epoch": 0.21, + "grad_norm": 0.4909483803893434, + "learning_rate": 1.831198122852366e-05, + "loss": 0.2905, + "step": 3691 + }, + { + "epoch": 0.21, + "grad_norm": 0.34866275708306377, + "learning_rate": 1.8310946461247553e-05, + "loss": 0.2568, + "step": 3692 + }, + { + "epoch": 0.21, + "grad_norm": 0.5144534607864611, + "learning_rate": 1.8309911406163736e-05, + "loss": 0.3944, + "step": 3693 + }, + { + "epoch": 0.21, + "grad_norm": 0.4451881757530133, + "learning_rate": 1.830887606330804e-05, + "loss": 0.2252, + "step": 3694 + }, + { + "epoch": 0.21, + "grad_norm": 0.46182446045835396, + "learning_rate": 1.8307840432716323e-05, + "loss": 0.3298, + "step": 3695 + }, + { + "epoch": 0.21, + "grad_norm": 0.7256437850629539, + "learning_rate": 1.830680451442445e-05, + "loss": 0.4658, + "step": 3696 + }, + { + "epoch": 0.21, + "grad_norm": 0.31246873377998513, + "learning_rate": 1.8305768308468294e-05, + "loss": 0.1741, + "step": 3697 + }, + { + "epoch": 0.21, + "grad_norm": 0.3792276788301899, + "learning_rate": 1.830473181488374e-05, + "loss": 0.2723, + "step": 3698 + }, + { + "epoch": 0.21, + "grad_norm": 1.4475052756468612, + "learning_rate": 1.8303695033706675e-05, + "loss": 0.8954, + "step": 3699 + }, + { + "epoch": 0.21, + "grad_norm": 0.4832360785231713, + "learning_rate": 1.8302657964973014e-05, + "loss": 0.3362, + "step": 3700 + }, + { + "epoch": 0.21, + "grad_norm": 0.38449186292934234, + "learning_rate": 1.830162060871866e-05, + "loss": 0.2838, + "step": 3701 + }, + { + "epoch": 0.21, + "grad_norm": 0.4419538620503951, + "learning_rate": 1.8300582964979544e-05, + "loss": 0.3564, + "step": 3702 + }, + { + "epoch": 0.21, + "grad_norm": 0.36778425880138244, + "learning_rate": 1.8299545033791596e-05, + "loss": 0.2228, + "step": 3703 + }, + { + "epoch": 0.21, + "grad_norm": 0.30802703243687346, + "learning_rate": 1.829850681519076e-05, + "loss": 0.1985, + "step": 3704 + }, + { + "epoch": 0.21, + "grad_norm": 0.6655387140633007, + "learning_rate": 1.8297468309212994e-05, + "loss": 0.4198, + "step": 3705 + }, + { + "epoch": 0.21, + "grad_norm": 0.45041933695514075, + "learning_rate": 1.8296429515894255e-05, + "loss": 0.3048, + "step": 3706 + }, + { + "epoch": 0.21, + "grad_norm": 0.41420386663343106, + "learning_rate": 1.8295390435270516e-05, + "loss": 0.2809, + "step": 3707 + }, + { + "epoch": 0.21, + "grad_norm": 0.6850026949422853, + "learning_rate": 1.8294351067377762e-05, + "loss": 0.4826, + "step": 3708 + }, + { + "epoch": 0.21, + "grad_norm": 0.3475258356965287, + "learning_rate": 1.829331141225199e-05, + "loss": 0.2416, + "step": 3709 + }, + { + "epoch": 0.21, + "grad_norm": 0.39414644413001365, + "learning_rate": 1.8292271469929202e-05, + "loss": 0.2489, + "step": 3710 + }, + { + "epoch": 0.21, + "grad_norm": 0.46620981413281065, + "learning_rate": 1.829123124044541e-05, + "loss": 0.3112, + "step": 3711 + }, + { + "epoch": 0.21, + "grad_norm": 0.8393024501634222, + "learning_rate": 1.8290190723836632e-05, + "loss": 0.4927, + "step": 3712 + }, + { + "epoch": 0.21, + "grad_norm": 0.5442988866000577, + "learning_rate": 1.828914992013891e-05, + "loss": 0.3368, + "step": 3713 + }, + { + "epoch": 0.21, + "grad_norm": 0.5338386638537052, + "learning_rate": 1.828810882938828e-05, + "loss": 0.2955, + "step": 3714 + }, + { + "epoch": 0.21, + "grad_norm": 0.7721046804040257, + "learning_rate": 1.8287067451620796e-05, + "loss": 0.4498, + "step": 3715 + }, + { + "epoch": 0.21, + "grad_norm": 0.39359198704409026, + "learning_rate": 1.8286025786872526e-05, + "loss": 0.3215, + "step": 3716 + }, + { + "epoch": 0.21, + "grad_norm": 0.309872202825924, + "learning_rate": 1.8284983835179536e-05, + "loss": 0.2202, + "step": 3717 + }, + { + "epoch": 0.21, + "grad_norm": 0.3891445865569338, + "learning_rate": 1.8283941596577917e-05, + "loss": 0.2658, + "step": 3718 + }, + { + "epoch": 0.21, + "grad_norm": 0.5257520788875862, + "learning_rate": 1.8282899071103755e-05, + "loss": 0.3476, + "step": 3719 + }, + { + "epoch": 0.21, + "grad_norm": 0.9016773666981592, + "learning_rate": 1.8281856258793155e-05, + "loss": 0.364, + "step": 3720 + }, + { + "epoch": 0.21, + "grad_norm": 0.40787992390494676, + "learning_rate": 1.8280813159682226e-05, + "loss": 0.3125, + "step": 3721 + }, + { + "epoch": 0.21, + "grad_norm": 0.39215624227848944, + "learning_rate": 1.8279769773807094e-05, + "loss": 0.295, + "step": 3722 + }, + { + "epoch": 0.21, + "grad_norm": 0.19942256276475304, + "learning_rate": 1.8278726101203892e-05, + "loss": 0.1365, + "step": 3723 + }, + { + "epoch": 0.21, + "grad_norm": 1.0897560592528546, + "learning_rate": 1.8277682141908763e-05, + "loss": 0.4017, + "step": 3724 + }, + { + "epoch": 0.21, + "grad_norm": 0.45932270538036235, + "learning_rate": 1.8276637895957853e-05, + "loss": 0.331, + "step": 3725 + }, + { + "epoch": 0.21, + "grad_norm": 0.38873202435458054, + "learning_rate": 1.827559336338733e-05, + "loss": 0.3411, + "step": 3726 + }, + { + "epoch": 0.21, + "grad_norm": 0.7523879935802436, + "learning_rate": 1.8274548544233367e-05, + "loss": 0.2851, + "step": 3727 + }, + { + "epoch": 0.21, + "grad_norm": 0.3509208008137019, + "learning_rate": 1.827350343853214e-05, + "loss": 0.2867, + "step": 3728 + }, + { + "epoch": 0.21, + "grad_norm": 0.4536301444246971, + "learning_rate": 1.8272458046319848e-05, + "loss": 0.3437, + "step": 3729 + }, + { + "epoch": 0.21, + "grad_norm": 1.733104281686962, + "learning_rate": 1.8271412367632688e-05, + "loss": 0.4302, + "step": 3730 + }, + { + "epoch": 0.21, + "grad_norm": 0.294900982042248, + "learning_rate": 1.8270366402506872e-05, + "loss": 0.223, + "step": 3731 + }, + { + "epoch": 0.21, + "grad_norm": 0.45382345157960424, + "learning_rate": 1.8269320150978625e-05, + "loss": 0.312, + "step": 3732 + }, + { + "epoch": 0.21, + "grad_norm": 0.4332203720271532, + "learning_rate": 1.8268273613084177e-05, + "loss": 0.2502, + "step": 3733 + }, + { + "epoch": 0.21, + "grad_norm": 0.4650150296518048, + "learning_rate": 1.826722678885977e-05, + "loss": 0.3157, + "step": 3734 + }, + { + "epoch": 0.21, + "grad_norm": 1.571029287761417, + "learning_rate": 1.8266179678341654e-05, + "loss": 0.7587, + "step": 3735 + }, + { + "epoch": 0.21, + "grad_norm": 1.048302589229643, + "learning_rate": 1.826513228156609e-05, + "loss": 0.3022, + "step": 3736 + }, + { + "epoch": 0.21, + "grad_norm": 0.3862146200874309, + "learning_rate": 1.826408459856935e-05, + "loss": 0.2914, + "step": 3737 + }, + { + "epoch": 0.21, + "grad_norm": 0.5222607559671937, + "learning_rate": 1.826303662938772e-05, + "loss": 0.2394, + "step": 3738 + }, + { + "epoch": 0.21, + "grad_norm": 1.3267918828905096, + "learning_rate": 1.826198837405748e-05, + "loss": 0.709, + "step": 3739 + }, + { + "epoch": 0.21, + "grad_norm": 0.39077333658956925, + "learning_rate": 1.8260939832614942e-05, + "loss": 0.2071, + "step": 3740 + }, + { + "epoch": 0.21, + "grad_norm": 0.44790985884301965, + "learning_rate": 1.8259891005096414e-05, + "loss": 0.3655, + "step": 3741 + }, + { + "epoch": 0.21, + "grad_norm": 0.667719602409911, + "learning_rate": 1.8258841891538214e-05, + "loss": 0.4282, + "step": 3742 + }, + { + "epoch": 0.22, + "grad_norm": 0.516883514093155, + "learning_rate": 1.8257792491976676e-05, + "loss": 0.2252, + "step": 3743 + }, + { + "epoch": 0.22, + "grad_norm": 0.42556191862763, + "learning_rate": 1.825674280644814e-05, + "loss": 0.192, + "step": 3744 + }, + { + "epoch": 0.22, + "grad_norm": 0.42746714546213527, + "learning_rate": 1.8255692834988952e-05, + "loss": 0.3717, + "step": 3745 + }, + { + "epoch": 0.22, + "grad_norm": 0.3455175925005218, + "learning_rate": 1.8254642577635478e-05, + "loss": 0.2232, + "step": 3746 + }, + { + "epoch": 0.22, + "grad_norm": 0.5540250899869666, + "learning_rate": 1.8253592034424085e-05, + "loss": 0.5053, + "step": 3747 + }, + { + "epoch": 0.22, + "grad_norm": 1.0421298320869656, + "learning_rate": 1.8252541205391155e-05, + "loss": 0.6174, + "step": 3748 + }, + { + "epoch": 0.22, + "grad_norm": 0.44154945291697445, + "learning_rate": 1.825149009057308e-05, + "loss": 0.2326, + "step": 3749 + }, + { + "epoch": 0.22, + "grad_norm": 0.3709396033941532, + "learning_rate": 1.8250438690006257e-05, + "loss": 0.2199, + "step": 3750 + }, + { + "epoch": 0.22, + "grad_norm": 1.3866050733970163, + "learning_rate": 1.8249387003727097e-05, + "loss": 0.7559, + "step": 3751 + }, + { + "epoch": 0.22, + "grad_norm": 0.384792848713734, + "learning_rate": 1.824833503177202e-05, + "loss": 0.3101, + "step": 3752 + }, + { + "epoch": 0.22, + "grad_norm": 0.3965591271182984, + "learning_rate": 1.8247282774177456e-05, + "loss": 0.3009, + "step": 3753 + }, + { + "epoch": 0.22, + "grad_norm": 0.7462894283297303, + "learning_rate": 1.824623023097984e-05, + "loss": 0.4997, + "step": 3754 + }, + { + "epoch": 0.22, + "grad_norm": 0.3943149843852122, + "learning_rate": 1.824517740221563e-05, + "loss": 0.2825, + "step": 3755 + }, + { + "epoch": 0.22, + "grad_norm": 0.2177870980472017, + "learning_rate": 1.824412428792128e-05, + "loss": 0.1226, + "step": 3756 + }, + { + "epoch": 0.22, + "grad_norm": 0.499144612415864, + "learning_rate": 1.8243070888133262e-05, + "loss": 0.361, + "step": 3757 + }, + { + "epoch": 0.22, + "grad_norm": 0.5001785109282243, + "learning_rate": 1.8242017202888053e-05, + "loss": 0.3159, + "step": 3758 + }, + { + "epoch": 0.22, + "grad_norm": 0.7388789713211021, + "learning_rate": 1.824096323222214e-05, + "loss": 0.3556, + "step": 3759 + }, + { + "epoch": 0.22, + "grad_norm": 0.534150495177783, + "learning_rate": 1.8239908976172027e-05, + "loss": 0.4201, + "step": 3760 + }, + { + "epoch": 0.22, + "grad_norm": 0.4950814070548895, + "learning_rate": 1.823885443477422e-05, + "loss": 0.2938, + "step": 3761 + }, + { + "epoch": 0.22, + "grad_norm": 0.28446685490046, + "learning_rate": 1.8237799608065238e-05, + "loss": 0.2129, + "step": 3762 + }, + { + "epoch": 0.22, + "grad_norm": 0.6083754544167314, + "learning_rate": 1.823674449608161e-05, + "loss": 0.405, + "step": 3763 + }, + { + "epoch": 0.22, + "grad_norm": 0.3692492992721875, + "learning_rate": 1.8235689098859874e-05, + "loss": 0.2551, + "step": 3764 + }, + { + "epoch": 0.22, + "grad_norm": 0.39185364731901257, + "learning_rate": 1.823463341643658e-05, + "loss": 0.3614, + "step": 3765 + }, + { + "epoch": 0.22, + "grad_norm": 0.8683819485532979, + "learning_rate": 1.8233577448848283e-05, + "loss": 0.4439, + "step": 3766 + }, + { + "epoch": 0.22, + "grad_norm": 0.3771140986653881, + "learning_rate": 1.8232521196131552e-05, + "loss": 0.3055, + "step": 3767 + }, + { + "epoch": 0.22, + "grad_norm": 0.35751520286149197, + "learning_rate": 1.823146465832297e-05, + "loss": 0.2238, + "step": 3768 + }, + { + "epoch": 0.22, + "grad_norm": 0.4560483343446017, + "learning_rate": 1.823040783545912e-05, + "loss": 0.309, + "step": 3769 + }, + { + "epoch": 0.22, + "grad_norm": 0.35578254331441966, + "learning_rate": 1.8229350727576597e-05, + "loss": 0.2929, + "step": 3770 + }, + { + "epoch": 0.22, + "grad_norm": 0.7910099204556784, + "learning_rate": 1.8228293334712015e-05, + "loss": 0.5764, + "step": 3771 + }, + { + "epoch": 0.22, + "grad_norm": 0.47524677083124306, + "learning_rate": 1.822723565690199e-05, + "loss": 0.3022, + "step": 3772 + }, + { + "epoch": 0.22, + "grad_norm": 0.3292122348325114, + "learning_rate": 1.8226177694183144e-05, + "loss": 0.2735, + "step": 3773 + }, + { + "epoch": 0.22, + "grad_norm": 0.5184964716025334, + "learning_rate": 1.8225119446592122e-05, + "loss": 0.2828, + "step": 3774 + }, + { + "epoch": 0.22, + "grad_norm": 0.3960302570284166, + "learning_rate": 1.8224060914165564e-05, + "loss": 0.3376, + "step": 3775 + }, + { + "epoch": 0.22, + "grad_norm": 0.3777909633108905, + "learning_rate": 1.8223002096940133e-05, + "loss": 0.2277, + "step": 3776 + }, + { + "epoch": 0.22, + "grad_norm": 0.3777145561989262, + "learning_rate": 1.822194299495249e-05, + "loss": 0.334, + "step": 3777 + }, + { + "epoch": 0.22, + "grad_norm": 0.9906118795976717, + "learning_rate": 1.8220883608239317e-05, + "loss": 0.6105, + "step": 3778 + }, + { + "epoch": 0.22, + "grad_norm": 0.3680252029129673, + "learning_rate": 1.82198239368373e-05, + "loss": 0.2007, + "step": 3779 + }, + { + "epoch": 0.22, + "grad_norm": 0.29715685119899815, + "learning_rate": 1.8218763980783127e-05, + "loss": 0.2653, + "step": 3780 + }, + { + "epoch": 0.22, + "grad_norm": 0.3725279137918228, + "learning_rate": 1.8217703740113518e-05, + "loss": 0.3547, + "step": 3781 + }, + { + "epoch": 0.22, + "grad_norm": 0.36171919213090536, + "learning_rate": 1.8216643214865176e-05, + "loss": 0.1849, + "step": 3782 + }, + { + "epoch": 0.22, + "grad_norm": 0.5560972723081516, + "learning_rate": 1.8215582405074838e-05, + "loss": 0.4323, + "step": 3783 + }, + { + "epoch": 0.22, + "grad_norm": 1.6986156985423673, + "learning_rate": 1.821452131077923e-05, + "loss": 0.7669, + "step": 3784 + }, + { + "epoch": 0.22, + "grad_norm": 0.2939380110603323, + "learning_rate": 1.8213459932015104e-05, + "loss": 0.2367, + "step": 3785 + }, + { + "epoch": 0.22, + "grad_norm": 0.5918622124773595, + "learning_rate": 1.8212398268819214e-05, + "loss": 0.4855, + "step": 3786 + }, + { + "epoch": 0.22, + "grad_norm": 0.4214369828365935, + "learning_rate": 1.8211336321228326e-05, + "loss": 0.3198, + "step": 3787 + }, + { + "epoch": 0.22, + "grad_norm": 0.3423353082604522, + "learning_rate": 1.8210274089279214e-05, + "loss": 0.2273, + "step": 3788 + }, + { + "epoch": 0.22, + "grad_norm": 0.48273982176060215, + "learning_rate": 1.8209211573008663e-05, + "loss": 0.2885, + "step": 3789 + }, + { + "epoch": 0.22, + "grad_norm": 1.081969336795844, + "learning_rate": 1.8208148772453466e-05, + "loss": 0.6395, + "step": 3790 + }, + { + "epoch": 0.22, + "grad_norm": 0.4540363009130433, + "learning_rate": 1.8207085687650433e-05, + "loss": 0.3198, + "step": 3791 + }, + { + "epoch": 0.22, + "grad_norm": 0.6965030051365356, + "learning_rate": 1.8206022318636375e-05, + "loss": 0.3283, + "step": 3792 + }, + { + "epoch": 0.22, + "grad_norm": 0.36981946890013234, + "learning_rate": 1.8204958665448116e-05, + "loss": 0.3409, + "step": 3793 + }, + { + "epoch": 0.22, + "grad_norm": 0.30072224494974714, + "learning_rate": 1.8203894728122492e-05, + "loss": 0.2236, + "step": 3794 + }, + { + "epoch": 0.22, + "grad_norm": 0.489290813598683, + "learning_rate": 1.8202830506696346e-05, + "loss": 0.3333, + "step": 3795 + }, + { + "epoch": 0.22, + "grad_norm": 0.3308604611705892, + "learning_rate": 1.8201766001206533e-05, + "loss": 0.2731, + "step": 3796 + }, + { + "epoch": 0.22, + "grad_norm": 0.5543044671117653, + "learning_rate": 1.8200701211689915e-05, + "loss": 0.3272, + "step": 3797 + }, + { + "epoch": 0.22, + "grad_norm": 0.49671661520822036, + "learning_rate": 1.819963613818337e-05, + "loss": 0.3211, + "step": 3798 + }, + { + "epoch": 0.22, + "grad_norm": 0.39700198461844927, + "learning_rate": 1.8198570780723773e-05, + "loss": 0.2987, + "step": 3799 + }, + { + "epoch": 0.22, + "grad_norm": 0.6450937703071342, + "learning_rate": 1.8197505139348023e-05, + "loss": 0.3396, + "step": 3800 + }, + { + "epoch": 0.22, + "grad_norm": 0.34925975121207203, + "learning_rate": 1.8196439214093023e-05, + "loss": 0.2853, + "step": 3801 + }, + { + "epoch": 0.22, + "grad_norm": 0.3075989600836906, + "learning_rate": 1.819537300499569e-05, + "loss": 0.1727, + "step": 3802 + }, + { + "epoch": 0.22, + "grad_norm": 0.40942585125745, + "learning_rate": 1.8194306512092938e-05, + "loss": 0.3184, + "step": 3803 + }, + { + "epoch": 0.22, + "grad_norm": 0.3781360749319683, + "learning_rate": 1.8193239735421703e-05, + "loss": 0.3548, + "step": 3804 + }, + { + "epoch": 0.22, + "grad_norm": 0.5636306621999936, + "learning_rate": 1.819217267501893e-05, + "loss": 0.2625, + "step": 3805 + }, + { + "epoch": 0.22, + "grad_norm": 0.44361177405533797, + "learning_rate": 1.819110533092157e-05, + "loss": 0.3215, + "step": 3806 + }, + { + "epoch": 0.22, + "grad_norm": 0.6164138406659124, + "learning_rate": 1.8190037703166585e-05, + "loss": 0.4649, + "step": 3807 + }, + { + "epoch": 0.22, + "grad_norm": 0.25182141423743754, + "learning_rate": 1.8188969791790946e-05, + "loss": 0.1842, + "step": 3808 + }, + { + "epoch": 0.22, + "grad_norm": 0.4578954704514088, + "learning_rate": 1.8187901596831638e-05, + "loss": 0.3233, + "step": 3809 + }, + { + "epoch": 0.22, + "grad_norm": 0.5217483050301579, + "learning_rate": 1.8186833118325645e-05, + "loss": 0.4206, + "step": 3810 + }, + { + "epoch": 0.22, + "grad_norm": 0.5431293205707014, + "learning_rate": 1.8185764356309975e-05, + "loss": 0.4136, + "step": 3811 + }, + { + "epoch": 0.22, + "grad_norm": 0.45279199338795334, + "learning_rate": 1.8184695310821635e-05, + "loss": 0.2989, + "step": 3812 + }, + { + "epoch": 0.22, + "grad_norm": 0.5020020584142632, + "learning_rate": 1.8183625981897653e-05, + "loss": 0.3023, + "step": 3813 + }, + { + "epoch": 0.22, + "grad_norm": 0.34007283638326713, + "learning_rate": 1.818255636957505e-05, + "loss": 0.2279, + "step": 3814 + }, + { + "epoch": 0.22, + "grad_norm": 0.5940995910555453, + "learning_rate": 1.818148647389088e-05, + "loss": 0.2751, + "step": 3815 + }, + { + "epoch": 0.22, + "grad_norm": 0.3728257688155502, + "learning_rate": 1.8180416294882178e-05, + "loss": 0.3428, + "step": 3816 + }, + { + "epoch": 0.22, + "grad_norm": 0.540902906588058, + "learning_rate": 1.817934583258601e-05, + "loss": 0.413, + "step": 3817 + }, + { + "epoch": 0.22, + "grad_norm": 0.32082684762622404, + "learning_rate": 1.8178275087039452e-05, + "loss": 0.1668, + "step": 3818 + }, + { + "epoch": 0.22, + "grad_norm": 0.4394101437520785, + "learning_rate": 1.8177204058279577e-05, + "loss": 0.3368, + "step": 3819 + }, + { + "epoch": 0.22, + "grad_norm": 0.41746611985218063, + "learning_rate": 1.817613274634348e-05, + "loss": 0.2987, + "step": 3820 + }, + { + "epoch": 0.22, + "grad_norm": 0.5320382789127321, + "learning_rate": 1.8175061151268255e-05, + "loss": 0.3093, + "step": 3821 + }, + { + "epoch": 0.22, + "grad_norm": 0.2989541511419468, + "learning_rate": 1.8173989273091014e-05, + "loss": 0.2661, + "step": 3822 + }, + { + "epoch": 0.22, + "grad_norm": 1.724527721383952, + "learning_rate": 1.8172917111848878e-05, + "loss": 0.8488, + "step": 3823 + }, + { + "epoch": 0.22, + "grad_norm": 0.3618052569029262, + "learning_rate": 1.817184466757897e-05, + "loss": 0.2378, + "step": 3824 + }, + { + "epoch": 0.22, + "grad_norm": 0.4104423447288719, + "learning_rate": 1.8170771940318437e-05, + "loss": 0.3514, + "step": 3825 + }, + { + "epoch": 0.22, + "grad_norm": 0.600402119776531, + "learning_rate": 1.816969893010442e-05, + "loss": 0.496, + "step": 3826 + }, + { + "epoch": 0.22, + "grad_norm": 0.3670048381276231, + "learning_rate": 1.8168625636974085e-05, + "loss": 0.2748, + "step": 3827 + }, + { + "epoch": 0.22, + "grad_norm": 0.3009020200838833, + "learning_rate": 1.816755206096459e-05, + "loss": 0.1996, + "step": 3828 + }, + { + "epoch": 0.22, + "grad_norm": 0.504312784568991, + "learning_rate": 1.816647820211312e-05, + "loss": 0.38, + "step": 3829 + }, + { + "epoch": 0.22, + "grad_norm": 0.6567393314755094, + "learning_rate": 1.8165404060456863e-05, + "loss": 0.3679, + "step": 3830 + }, + { + "epoch": 0.22, + "grad_norm": 0.3984924468921483, + "learning_rate": 1.8164329636033012e-05, + "loss": 0.252, + "step": 3831 + }, + { + "epoch": 0.22, + "grad_norm": 0.4239600262528248, + "learning_rate": 1.8163254928878777e-05, + "loss": 0.3663, + "step": 3832 + }, + { + "epoch": 0.22, + "grad_norm": 0.4164722216338981, + "learning_rate": 1.8162179939031377e-05, + "loss": 0.299, + "step": 3833 + }, + { + "epoch": 0.22, + "grad_norm": 0.35787438325680954, + "learning_rate": 1.8161104666528033e-05, + "loss": 0.1849, + "step": 3834 + }, + { + "epoch": 0.22, + "grad_norm": 0.982908517879726, + "learning_rate": 1.8160029111405986e-05, + "loss": 0.6916, + "step": 3835 + }, + { + "epoch": 0.22, + "grad_norm": 0.46631107207101113, + "learning_rate": 1.8158953273702486e-05, + "loss": 0.3176, + "step": 3836 + }, + { + "epoch": 0.22, + "grad_norm": 0.3537925294977268, + "learning_rate": 1.8157877153454785e-05, + "loss": 0.273, + "step": 3837 + }, + { + "epoch": 0.22, + "grad_norm": 0.6615628600913611, + "learning_rate": 1.8156800750700143e-05, + "loss": 0.4955, + "step": 3838 + }, + { + "epoch": 0.22, + "grad_norm": 0.41214597949670184, + "learning_rate": 1.8155724065475845e-05, + "loss": 0.2243, + "step": 3839 + }, + { + "epoch": 0.22, + "grad_norm": 0.3755937652787498, + "learning_rate": 1.8154647097819172e-05, + "loss": 0.2564, + "step": 3840 + }, + { + "epoch": 0.22, + "grad_norm": 0.48876484681974797, + "learning_rate": 1.8153569847767423e-05, + "loss": 0.2991, + "step": 3841 + }, + { + "epoch": 0.22, + "grad_norm": 0.7458949562080135, + "learning_rate": 1.8152492315357902e-05, + "loss": 0.5104, + "step": 3842 + }, + { + "epoch": 0.22, + "grad_norm": 0.3735505680224295, + "learning_rate": 1.815141450062792e-05, + "loss": 0.282, + "step": 3843 + }, + { + "epoch": 0.22, + "grad_norm": 0.41540487618060473, + "learning_rate": 1.8150336403614804e-05, + "loss": 0.3188, + "step": 3844 + }, + { + "epoch": 0.22, + "grad_norm": 0.5712946769237918, + "learning_rate": 1.814925802435589e-05, + "loss": 0.3764, + "step": 3845 + }, + { + "epoch": 0.22, + "grad_norm": 0.3572232718406907, + "learning_rate": 1.814817936288852e-05, + "loss": 0.2627, + "step": 3846 + }, + { + "epoch": 0.22, + "grad_norm": 0.3055017102946289, + "learning_rate": 1.814710041925005e-05, + "loss": 0.1336, + "step": 3847 + }, + { + "epoch": 0.22, + "grad_norm": 0.3538207884429109, + "learning_rate": 1.8146021193477846e-05, + "loss": 0.3007, + "step": 3848 + }, + { + "epoch": 0.22, + "grad_norm": 0.4314766190553378, + "learning_rate": 1.8144941685609273e-05, + "loss": 0.3486, + "step": 3849 + }, + { + "epoch": 0.22, + "grad_norm": 0.6441772312208831, + "learning_rate": 1.8143861895681723e-05, + "loss": 0.4833, + "step": 3850 + }, + { + "epoch": 0.22, + "grad_norm": 0.30729483425364906, + "learning_rate": 1.8142781823732582e-05, + "loss": 0.1791, + "step": 3851 + }, + { + "epoch": 0.22, + "grad_norm": 0.3808537248820102, + "learning_rate": 1.814170146979926e-05, + "loss": 0.2803, + "step": 3852 + }, + { + "epoch": 0.22, + "grad_norm": 0.37558207046729103, + "learning_rate": 1.8140620833919165e-05, + "loss": 0.2885, + "step": 3853 + }, + { + "epoch": 0.22, + "grad_norm": 0.6154231825728195, + "learning_rate": 1.813953991612972e-05, + "loss": 0.4008, + "step": 3854 + }, + { + "epoch": 0.22, + "grad_norm": 0.35044717507297185, + "learning_rate": 1.813845871646836e-05, + "loss": 0.2794, + "step": 3855 + }, + { + "epoch": 0.22, + "grad_norm": 0.412504774677019, + "learning_rate": 1.813737723497252e-05, + "loss": 0.3406, + "step": 3856 + }, + { + "epoch": 0.22, + "grad_norm": 0.5714736784254149, + "learning_rate": 1.8136295471679662e-05, + "loss": 0.2334, + "step": 3857 + }, + { + "epoch": 0.22, + "grad_norm": 0.3209375714349231, + "learning_rate": 1.8135213426627237e-05, + "loss": 0.2582, + "step": 3858 + }, + { + "epoch": 0.22, + "grad_norm": 0.6265957239815656, + "learning_rate": 1.8134131099852724e-05, + "loss": 0.2952, + "step": 3859 + }, + { + "epoch": 0.22, + "grad_norm": 0.4414280891315469, + "learning_rate": 1.81330484913936e-05, + "loss": 0.284, + "step": 3860 + }, + { + "epoch": 0.22, + "grad_norm": 0.41773899450768254, + "learning_rate": 1.8131965601287357e-05, + "loss": 0.3228, + "step": 3861 + }, + { + "epoch": 0.22, + "grad_norm": 0.8234390961731449, + "learning_rate": 1.8130882429571496e-05, + "loss": 0.5967, + "step": 3862 + }, + { + "epoch": 0.22, + "grad_norm": 0.5027296598506996, + "learning_rate": 1.8129798976283522e-05, + "loss": 0.4022, + "step": 3863 + }, + { + "epoch": 0.22, + "grad_norm": 0.4194025271798038, + "learning_rate": 1.812871524146096e-05, + "loss": 0.2134, + "step": 3864 + }, + { + "epoch": 0.22, + "grad_norm": 0.3110021600253895, + "learning_rate": 1.812763122514134e-05, + "loss": 0.2123, + "step": 3865 + }, + { + "epoch": 0.22, + "grad_norm": 0.7499695380435911, + "learning_rate": 1.8126546927362204e-05, + "loss": 0.4432, + "step": 3866 + }, + { + "epoch": 0.22, + "grad_norm": 0.35946689971793444, + "learning_rate": 1.8125462348161093e-05, + "loss": 0.2403, + "step": 3867 + }, + { + "epoch": 0.22, + "grad_norm": 0.4073612075479078, + "learning_rate": 1.812437748757557e-05, + "loss": 0.3583, + "step": 3868 + }, + { + "epoch": 0.22, + "grad_norm": 1.1775040168059026, + "learning_rate": 1.8123292345643203e-05, + "loss": 0.7311, + "step": 3869 + }, + { + "epoch": 0.22, + "grad_norm": 0.3359065562504141, + "learning_rate": 1.8122206922401573e-05, + "loss": 0.2049, + "step": 3870 + }, + { + "epoch": 0.22, + "grad_norm": 0.3035022075371208, + "learning_rate": 1.8121121217888268e-05, + "loss": 0.2616, + "step": 3871 + }, + { + "epoch": 0.22, + "grad_norm": 0.4404755357892392, + "learning_rate": 1.8120035232140884e-05, + "loss": 0.3895, + "step": 3872 + }, + { + "epoch": 0.22, + "grad_norm": 0.3304185548692813, + "learning_rate": 1.8118948965197027e-05, + "loss": 0.2378, + "step": 3873 + }, + { + "epoch": 0.22, + "grad_norm": 1.1038158285453556, + "learning_rate": 1.8117862417094318e-05, + "loss": 0.8049, + "step": 3874 + }, + { + "epoch": 0.22, + "grad_norm": 0.47637000688014897, + "learning_rate": 1.811677558787038e-05, + "loss": 0.4025, + "step": 3875 + }, + { + "epoch": 0.22, + "grad_norm": 0.3109465314784538, + "learning_rate": 1.8115688477562855e-05, + "loss": 0.27, + "step": 3876 + }, + { + "epoch": 0.22, + "grad_norm": 0.2238118146029211, + "learning_rate": 1.8114601086209387e-05, + "loss": 0.137, + "step": 3877 + }, + { + "epoch": 0.22, + "grad_norm": 0.6389128213112062, + "learning_rate": 1.8113513413847634e-05, + "loss": 0.4794, + "step": 3878 + }, + { + "epoch": 0.22, + "grad_norm": 0.9375943093100196, + "learning_rate": 1.811242546051526e-05, + "loss": 0.2699, + "step": 3879 + }, + { + "epoch": 0.22, + "grad_norm": 0.4320199364520138, + "learning_rate": 1.811133722624994e-05, + "loss": 0.2924, + "step": 3880 + }, + { + "epoch": 0.22, + "grad_norm": 1.0738585193981018, + "learning_rate": 1.811024871108936e-05, + "loss": 0.5673, + "step": 3881 + }, + { + "epoch": 0.22, + "grad_norm": 0.4571929492755686, + "learning_rate": 1.8109159915071215e-05, + "loss": 0.3033, + "step": 3882 + }, + { + "epoch": 0.22, + "grad_norm": 0.40435453226191476, + "learning_rate": 1.810807083823321e-05, + "loss": 0.3142, + "step": 3883 + }, + { + "epoch": 0.22, + "grad_norm": 0.3231601205057292, + "learning_rate": 1.8106981480613063e-05, + "loss": 0.2424, + "step": 3884 + }, + { + "epoch": 0.22, + "grad_norm": 0.47123062190681675, + "learning_rate": 1.8105891842248496e-05, + "loss": 0.3023, + "step": 3885 + }, + { + "epoch": 0.22, + "grad_norm": 0.5011503701732182, + "learning_rate": 1.810480192317724e-05, + "loss": 0.3053, + "step": 3886 + }, + { + "epoch": 0.22, + "grad_norm": 0.5743534405718205, + "learning_rate": 1.8103711723437048e-05, + "loss": 0.4042, + "step": 3887 + }, + { + "epoch": 0.22, + "grad_norm": 0.3408418367503883, + "learning_rate": 1.8102621243065665e-05, + "loss": 0.2877, + "step": 3888 + }, + { + "epoch": 0.22, + "grad_norm": 0.6381779781359501, + "learning_rate": 1.8101530482100855e-05, + "loss": 0.4795, + "step": 3889 + }, + { + "epoch": 0.22, + "grad_norm": 0.2908090921032573, + "learning_rate": 1.8100439440580393e-05, + "loss": 0.1469, + "step": 3890 + }, + { + "epoch": 0.22, + "grad_norm": 0.4231337495748093, + "learning_rate": 1.809934811854206e-05, + "loss": 0.3074, + "step": 3891 + }, + { + "epoch": 0.22, + "grad_norm": 0.3200805794748924, + "learning_rate": 1.8098256516023654e-05, + "loss": 0.2839, + "step": 3892 + }, + { + "epoch": 0.22, + "grad_norm": 1.011602052709592, + "learning_rate": 1.809716463306297e-05, + "loss": 0.2713, + "step": 3893 + }, + { + "epoch": 0.22, + "grad_norm": 0.3933605424410423, + "learning_rate": 1.8096072469697822e-05, + "loss": 0.324, + "step": 3894 + }, + { + "epoch": 0.22, + "grad_norm": 0.7690843943555367, + "learning_rate": 1.8094980025966036e-05, + "loss": 0.4747, + "step": 3895 + }, + { + "epoch": 0.22, + "grad_norm": 0.3765206416318077, + "learning_rate": 1.809388730190544e-05, + "loss": 0.2389, + "step": 3896 + }, + { + "epoch": 0.22, + "grad_norm": 0.3835611886266919, + "learning_rate": 1.8092794297553873e-05, + "loss": 0.28, + "step": 3897 + }, + { + "epoch": 0.22, + "grad_norm": 0.4180873590762182, + "learning_rate": 1.8091701012949187e-05, + "loss": 0.2725, + "step": 3898 + }, + { + "epoch": 0.22, + "grad_norm": 0.44449937407460755, + "learning_rate": 1.8090607448129244e-05, + "loss": 0.3152, + "step": 3899 + }, + { + "epoch": 0.22, + "grad_norm": 0.43700527552620777, + "learning_rate": 1.8089513603131913e-05, + "loss": 0.301, + "step": 3900 + }, + { + "epoch": 0.22, + "grad_norm": 0.6587341669479327, + "learning_rate": 1.808841947799507e-05, + "loss": 0.4034, + "step": 3901 + }, + { + "epoch": 0.22, + "grad_norm": 1.2622215351655142, + "learning_rate": 1.808732507275661e-05, + "loss": 0.7012, + "step": 3902 + }, + { + "epoch": 0.22, + "grad_norm": 0.35778822861958404, + "learning_rate": 1.8086230387454434e-05, + "loss": 0.2038, + "step": 3903 + }, + { + "epoch": 0.22, + "grad_norm": 0.41095801057868664, + "learning_rate": 1.8085135422126448e-05, + "loss": 0.2571, + "step": 3904 + }, + { + "epoch": 0.22, + "grad_norm": 0.5132522096870761, + "learning_rate": 1.8084040176810567e-05, + "loss": 0.349, + "step": 3905 + }, + { + "epoch": 0.22, + "grad_norm": 0.4841833694973949, + "learning_rate": 1.808294465154472e-05, + "loss": 0.2723, + "step": 3906 + }, + { + "epoch": 0.22, + "grad_norm": 0.4857845170840667, + "learning_rate": 1.8081848846366852e-05, + "loss": 0.3875, + "step": 3907 + }, + { + "epoch": 0.22, + "grad_norm": 0.5247363261757332, + "learning_rate": 1.8080752761314904e-05, + "loss": 0.3969, + "step": 3908 + }, + { + "epoch": 0.22, + "grad_norm": 0.34761712484213264, + "learning_rate": 1.8079656396426834e-05, + "loss": 0.2346, + "step": 3909 + }, + { + "epoch": 0.22, + "grad_norm": 0.2940198797850371, + "learning_rate": 1.807855975174061e-05, + "loss": 0.1858, + "step": 3910 + }, + { + "epoch": 0.22, + "grad_norm": 0.40981413692161606, + "learning_rate": 1.8077462827294214e-05, + "loss": 0.3515, + "step": 3911 + }, + { + "epoch": 0.22, + "grad_norm": 0.3987787782958305, + "learning_rate": 1.8076365623125625e-05, + "loss": 0.2753, + "step": 3912 + }, + { + "epoch": 0.22, + "grad_norm": 0.8541449017563671, + "learning_rate": 1.8075268139272842e-05, + "loss": 0.5343, + "step": 3913 + }, + { + "epoch": 0.22, + "grad_norm": 1.122921649883041, + "learning_rate": 1.8074170375773867e-05, + "loss": 0.7047, + "step": 3914 + }, + { + "epoch": 0.22, + "grad_norm": 0.337497593773467, + "learning_rate": 1.8073072332666723e-05, + "loss": 0.2768, + "step": 3915 + }, + { + "epoch": 0.22, + "grad_norm": 0.4214408990448417, + "learning_rate": 1.807197400998943e-05, + "loss": 0.257, + "step": 3916 + }, + { + "epoch": 0.23, + "grad_norm": 0.6049790373030124, + "learning_rate": 1.8070875407780026e-05, + "loss": 0.355, + "step": 3917 + }, + { + "epoch": 0.23, + "grad_norm": 0.30708101652710945, + "learning_rate": 1.806977652607655e-05, + "loss": 0.1988, + "step": 3918 + }, + { + "epoch": 0.23, + "grad_norm": 0.4084061861810741, + "learning_rate": 1.8068677364917063e-05, + "loss": 0.283, + "step": 3919 + }, + { + "epoch": 0.23, + "grad_norm": 0.626383104086622, + "learning_rate": 1.806757792433962e-05, + "loss": 0.4173, + "step": 3920 + }, + { + "epoch": 0.23, + "grad_norm": 0.6969864507969111, + "learning_rate": 1.806647820438231e-05, + "loss": 0.4274, + "step": 3921 + }, + { + "epoch": 0.23, + "grad_norm": 0.3530384228236583, + "learning_rate": 1.8065378205083202e-05, + "loss": 0.316, + "step": 3922 + }, + { + "epoch": 0.23, + "grad_norm": 0.4005606977157132, + "learning_rate": 1.8064277926480392e-05, + "loss": 0.3583, + "step": 3923 + }, + { + "epoch": 0.23, + "grad_norm": 0.31165153005202517, + "learning_rate": 1.8063177368611988e-05, + "loss": 0.214, + "step": 3924 + }, + { + "epoch": 0.23, + "grad_norm": 0.33150213382715915, + "learning_rate": 1.8062076531516094e-05, + "loss": 0.2387, + "step": 3925 + }, + { + "epoch": 0.23, + "grad_norm": 0.8408216515743814, + "learning_rate": 1.806097541523084e-05, + "loss": 0.4223, + "step": 3926 + }, + { + "epoch": 0.23, + "grad_norm": 0.43962821527072266, + "learning_rate": 1.8059874019794352e-05, + "loss": 0.3006, + "step": 3927 + }, + { + "epoch": 0.23, + "grad_norm": 0.4049589074514208, + "learning_rate": 1.8058772345244775e-05, + "loss": 0.3666, + "step": 3928 + }, + { + "epoch": 0.23, + "grad_norm": 0.6436948491887067, + "learning_rate": 1.8057670391620258e-05, + "loss": 0.3893, + "step": 3929 + }, + { + "epoch": 0.23, + "grad_norm": 0.26980658996097057, + "learning_rate": 1.8056568158958958e-05, + "loss": 0.194, + "step": 3930 + }, + { + "epoch": 0.23, + "grad_norm": 0.5857657539673209, + "learning_rate": 1.8055465647299052e-05, + "loss": 0.4342, + "step": 3931 + }, + { + "epoch": 0.23, + "grad_norm": 0.3769739875049705, + "learning_rate": 1.805436285667872e-05, + "loss": 0.2869, + "step": 3932 + }, + { + "epoch": 0.23, + "grad_norm": 0.44323285132468504, + "learning_rate": 1.8053259787136144e-05, + "loss": 0.3503, + "step": 3933 + }, + { + "epoch": 0.23, + "grad_norm": 0.5264863084482408, + "learning_rate": 1.8052156438709527e-05, + "loss": 0.3917, + "step": 3934 + }, + { + "epoch": 0.23, + "grad_norm": 0.3824530834229984, + "learning_rate": 1.805105281143708e-05, + "loss": 0.3065, + "step": 3935 + }, + { + "epoch": 0.23, + "grad_norm": 0.39071495256736205, + "learning_rate": 1.8049948905357023e-05, + "loss": 0.1898, + "step": 3936 + }, + { + "epoch": 0.23, + "grad_norm": 0.3091528912278944, + "learning_rate": 1.804884472050758e-05, + "loss": 0.2312, + "step": 3937 + }, + { + "epoch": 0.23, + "grad_norm": 0.9481919338379736, + "learning_rate": 1.8047740256926993e-05, + "loss": 0.6748, + "step": 3938 + }, + { + "epoch": 0.23, + "grad_norm": 0.35422805559962023, + "learning_rate": 1.8046635514653505e-05, + "loss": 0.2324, + "step": 3939 + }, + { + "epoch": 0.23, + "grad_norm": 0.412781947395747, + "learning_rate": 1.8045530493725375e-05, + "loss": 0.3477, + "step": 3940 + }, + { + "epoch": 0.23, + "grad_norm": 0.7610604686105258, + "learning_rate": 1.8044425194180868e-05, + "loss": 0.5374, + "step": 3941 + }, + { + "epoch": 0.23, + "grad_norm": 0.23494787925537458, + "learning_rate": 1.8043319616058266e-05, + "loss": 0.0743, + "step": 3942 + }, + { + "epoch": 0.23, + "grad_norm": 0.32367312358901884, + "learning_rate": 1.804221375939585e-05, + "loss": 0.2955, + "step": 3943 + }, + { + "epoch": 0.23, + "grad_norm": 0.9361586658175016, + "learning_rate": 1.8041107624231916e-05, + "loss": 0.6321, + "step": 3944 + }, + { + "epoch": 0.23, + "grad_norm": 0.4736112307448519, + "learning_rate": 1.804000121060477e-05, + "loss": 0.3214, + "step": 3945 + }, + { + "epoch": 0.23, + "grad_norm": 0.4152969262120841, + "learning_rate": 1.803889451855273e-05, + "loss": 0.346, + "step": 3946 + }, + { + "epoch": 0.23, + "grad_norm": 0.425406663947165, + "learning_rate": 1.8037787548114122e-05, + "loss": 0.3407, + "step": 3947 + }, + { + "epoch": 0.23, + "grad_norm": 0.3436781987502028, + "learning_rate": 1.8036680299327273e-05, + "loss": 0.1941, + "step": 3948 + }, + { + "epoch": 0.23, + "grad_norm": 0.2759805308240463, + "learning_rate": 1.8035572772230526e-05, + "loss": 0.1915, + "step": 3949 + }, + { + "epoch": 0.23, + "grad_norm": 0.7343999781835354, + "learning_rate": 1.8034464966862247e-05, + "loss": 0.5131, + "step": 3950 + }, + { + "epoch": 0.23, + "grad_norm": 0.3709341545619263, + "learning_rate": 1.8033356883260786e-05, + "loss": 0.3157, + "step": 3951 + }, + { + "epoch": 0.23, + "grad_norm": 0.3961664624622881, + "learning_rate": 1.8032248521464526e-05, + "loss": 0.301, + "step": 3952 + }, + { + "epoch": 0.23, + "grad_norm": 0.9898774268867264, + "learning_rate": 1.8031139881511844e-05, + "loss": 0.6261, + "step": 3953 + }, + { + "epoch": 0.23, + "grad_norm": 0.39230522301924536, + "learning_rate": 1.8030030963441133e-05, + "loss": 0.2662, + "step": 3954 + }, + { + "epoch": 0.23, + "grad_norm": 0.28249473186971336, + "learning_rate": 1.8028921767290796e-05, + "loss": 0.2525, + "step": 3955 + }, + { + "epoch": 0.23, + "grad_norm": 0.5699095328371475, + "learning_rate": 1.802781229309924e-05, + "loss": 0.3755, + "step": 3956 + }, + { + "epoch": 0.23, + "grad_norm": 0.7814378890464383, + "learning_rate": 1.8026702540904893e-05, + "loss": 0.4934, + "step": 3957 + }, + { + "epoch": 0.23, + "grad_norm": 0.31906069339089543, + "learning_rate": 1.802559251074618e-05, + "loss": 0.2279, + "step": 3958 + }, + { + "epoch": 0.23, + "grad_norm": 0.4300544687060639, + "learning_rate": 1.8024482202661544e-05, + "loss": 0.3191, + "step": 3959 + }, + { + "epoch": 0.23, + "grad_norm": 0.7882393206382694, + "learning_rate": 1.8023371616689437e-05, + "loss": 0.4986, + "step": 3960 + }, + { + "epoch": 0.23, + "grad_norm": 0.2288574775693125, + "learning_rate": 1.8022260752868314e-05, + "loss": 0.1694, + "step": 3961 + }, + { + "epoch": 0.23, + "grad_norm": 1.129884549557266, + "learning_rate": 1.8021149611236644e-05, + "loss": 0.5047, + "step": 3962 + }, + { + "epoch": 0.23, + "grad_norm": 0.3673895981661131, + "learning_rate": 1.8020038191832912e-05, + "loss": 0.3243, + "step": 3963 + }, + { + "epoch": 0.23, + "grad_norm": 0.3960014025969493, + "learning_rate": 1.80189264946956e-05, + "loss": 0.3041, + "step": 3964 + }, + { + "epoch": 0.23, + "grad_norm": 0.7265493417525126, + "learning_rate": 1.8017814519863206e-05, + "loss": 0.2853, + "step": 3965 + }, + { + "epoch": 0.23, + "grad_norm": 0.4288534438453486, + "learning_rate": 1.8016702267374243e-05, + "loss": 0.3599, + "step": 3966 + }, + { + "epoch": 0.23, + "grad_norm": 0.3331264382746232, + "learning_rate": 1.8015589737267226e-05, + "loss": 0.2876, + "step": 3967 + }, + { + "epoch": 0.23, + "grad_norm": 0.43022925309376164, + "learning_rate": 1.801447692958068e-05, + "loss": 0.2281, + "step": 3968 + }, + { + "epoch": 0.23, + "grad_norm": 0.5957577630302175, + "learning_rate": 1.801336384435314e-05, + "loss": 0.3927, + "step": 3969 + }, + { + "epoch": 0.23, + "grad_norm": 0.4640169255196903, + "learning_rate": 1.8012250481623158e-05, + "loss": 0.3406, + "step": 3970 + }, + { + "epoch": 0.23, + "grad_norm": 0.39945224996755907, + "learning_rate": 1.8011136841429285e-05, + "loss": 0.2894, + "step": 3971 + }, + { + "epoch": 0.23, + "grad_norm": 0.8005686154322458, + "learning_rate": 1.801002292381009e-05, + "loss": 0.4909, + "step": 3972 + }, + { + "epoch": 0.23, + "grad_norm": 0.33287331011769583, + "learning_rate": 1.800890872880414e-05, + "loss": 0.2381, + "step": 3973 + }, + { + "epoch": 0.23, + "grad_norm": 0.36297847553286217, + "learning_rate": 1.8007794256450027e-05, + "loss": 0.3012, + "step": 3974 + }, + { + "epoch": 0.23, + "grad_norm": 0.5849863340592387, + "learning_rate": 1.800667950678635e-05, + "loss": 0.4239, + "step": 3975 + }, + { + "epoch": 0.23, + "grad_norm": 0.378342194413626, + "learning_rate": 1.8005564479851697e-05, + "loss": 0.2682, + "step": 3976 + }, + { + "epoch": 0.23, + "grad_norm": 0.7952623423604379, + "learning_rate": 1.8004449175684697e-05, + "loss": 0.6006, + "step": 3977 + }, + { + "epoch": 0.23, + "grad_norm": 0.4097070482739936, + "learning_rate": 1.8003333594323962e-05, + "loss": 0.2736, + "step": 3978 + }, + { + "epoch": 0.23, + "grad_norm": 0.3728346283618812, + "learning_rate": 1.800221773580813e-05, + "loss": 0.2866, + "step": 3979 + }, + { + "epoch": 0.23, + "grad_norm": 0.5766114897185256, + "learning_rate": 1.8001101600175843e-05, + "loss": 0.4659, + "step": 3980 + }, + { + "epoch": 0.23, + "grad_norm": 0.26613403625490034, + "learning_rate": 1.799998518746575e-05, + "loss": 0.1474, + "step": 3981 + }, + { + "epoch": 0.23, + "grad_norm": 0.31031290429059133, + "learning_rate": 1.7998868497716516e-05, + "loss": 0.2421, + "step": 3982 + }, + { + "epoch": 0.23, + "grad_norm": 0.4392924725322208, + "learning_rate": 1.7997751530966806e-05, + "loss": 0.3419, + "step": 3983 + }, + { + "epoch": 0.23, + "grad_norm": 0.5462229687452343, + "learning_rate": 1.799663428725531e-05, + "loss": 0.2927, + "step": 3984 + }, + { + "epoch": 0.23, + "grad_norm": 0.39202509019051524, + "learning_rate": 1.7995516766620706e-05, + "loss": 0.3135, + "step": 3985 + }, + { + "epoch": 0.23, + "grad_norm": 0.7239484639028205, + "learning_rate": 1.7994398969101704e-05, + "loss": 0.5246, + "step": 3986 + }, + { + "epoch": 0.23, + "grad_norm": 0.3566107377592294, + "learning_rate": 1.799328089473701e-05, + "loss": 0.2878, + "step": 3987 + }, + { + "epoch": 0.23, + "grad_norm": 0.4206515109068827, + "learning_rate": 1.799216254356534e-05, + "loss": 0.3109, + "step": 3988 + }, + { + "epoch": 0.23, + "grad_norm": 0.337123993090019, + "learning_rate": 1.7991043915625427e-05, + "loss": 0.2135, + "step": 3989 + }, + { + "epoch": 0.23, + "grad_norm": 0.34315829448210505, + "learning_rate": 1.798992501095601e-05, + "loss": 0.3353, + "step": 3990 + }, + { + "epoch": 0.23, + "grad_norm": 0.36382392161356036, + "learning_rate": 1.7988805829595825e-05, + "loss": 0.2506, + "step": 3991 + }, + { + "epoch": 0.23, + "grad_norm": 0.6713822788963748, + "learning_rate": 1.7987686371583643e-05, + "loss": 0.501, + "step": 3992 + }, + { + "epoch": 0.23, + "grad_norm": 1.326313126631407, + "learning_rate": 1.7986566636958228e-05, + "loss": 0.8042, + "step": 3993 + }, + { + "epoch": 0.23, + "grad_norm": 0.27371657393729876, + "learning_rate": 1.798544662575835e-05, + "loss": 0.188, + "step": 3994 + }, + { + "epoch": 0.23, + "grad_norm": 0.31879060982136537, + "learning_rate": 1.7984326338022797e-05, + "loss": 0.2921, + "step": 3995 + }, + { + "epoch": 0.23, + "grad_norm": 0.6619585978080218, + "learning_rate": 1.798320577379037e-05, + "loss": 0.4928, + "step": 3996 + }, + { + "epoch": 0.23, + "grad_norm": 0.353273027323354, + "learning_rate": 1.7982084933099868e-05, + "loss": 0.2369, + "step": 3997 + }, + { + "epoch": 0.23, + "grad_norm": 1.1077945691153839, + "learning_rate": 1.798096381599011e-05, + "loss": 0.6902, + "step": 3998 + }, + { + "epoch": 0.23, + "grad_norm": 0.3950848276115181, + "learning_rate": 1.7979842422499917e-05, + "loss": 0.3657, + "step": 3999 + }, + { + "epoch": 0.23, + "grad_norm": 0.298536345873338, + "learning_rate": 1.7978720752668123e-05, + "loss": 0.2263, + "step": 4000 + }, + { + "epoch": 0.23, + "grad_norm": 0.3103745986561787, + "learning_rate": 1.7977598806533575e-05, + "loss": 0.1444, + "step": 4001 + }, + { + "epoch": 0.23, + "grad_norm": 0.46143426008863875, + "learning_rate": 1.797647658413512e-05, + "loss": 0.3617, + "step": 4002 + }, + { + "epoch": 0.23, + "grad_norm": 0.35964308521923766, + "learning_rate": 1.7975354085511627e-05, + "loss": 0.3244, + "step": 4003 + }, + { + "epoch": 0.23, + "grad_norm": 1.0071718247568406, + "learning_rate": 1.7974231310701964e-05, + "loss": 0.0483, + "step": 4004 + }, + { + "epoch": 0.23, + "grad_norm": 0.8956530240852543, + "learning_rate": 1.7973108259745012e-05, + "loss": 0.7035, + "step": 4005 + }, + { + "epoch": 0.23, + "grad_norm": 0.4631104114709448, + "learning_rate": 1.7971984932679663e-05, + "loss": 0.2886, + "step": 4006 + }, + { + "epoch": 0.23, + "grad_norm": 0.25131716487061634, + "learning_rate": 1.7970861329544823e-05, + "loss": 0.2199, + "step": 4007 + }, + { + "epoch": 0.23, + "grad_norm": 0.6088567020101945, + "learning_rate": 1.7969737450379395e-05, + "loss": 0.4963, + "step": 4008 + }, + { + "epoch": 0.23, + "grad_norm": 0.6053261675860214, + "learning_rate": 1.7968613295222304e-05, + "loss": 0.2756, + "step": 4009 + }, + { + "epoch": 0.23, + "grad_norm": 0.3335128304816215, + "learning_rate": 1.7967488864112473e-05, + "loss": 0.2663, + "step": 4010 + }, + { + "epoch": 0.23, + "grad_norm": 1.4790286393120566, + "learning_rate": 1.7966364157088853e-05, + "loss": 0.8433, + "step": 4011 + }, + { + "epoch": 0.23, + "grad_norm": 0.3681446596417048, + "learning_rate": 1.7965239174190376e-05, + "loss": 0.2871, + "step": 4012 + }, + { + "epoch": 0.23, + "grad_norm": 0.6855409947925746, + "learning_rate": 1.7964113915456013e-05, + "loss": 0.5071, + "step": 4013 + }, + { + "epoch": 0.23, + "grad_norm": 0.3131770028920056, + "learning_rate": 1.7962988380924727e-05, + "loss": 0.2426, + "step": 4014 + }, + { + "epoch": 0.23, + "grad_norm": 0.39953225148360916, + "learning_rate": 1.7961862570635496e-05, + "loss": 0.2161, + "step": 4015 + }, + { + "epoch": 0.23, + "grad_norm": 1.0905675166530646, + "learning_rate": 1.7960736484627306e-05, + "loss": 0.5986, + "step": 4016 + }, + { + "epoch": 0.23, + "grad_norm": 0.8487839708425405, + "learning_rate": 1.7959610122939155e-05, + "loss": 0.521, + "step": 4017 + }, + { + "epoch": 0.23, + "grad_norm": 0.35305893350636597, + "learning_rate": 1.7958483485610048e-05, + "loss": 0.275, + "step": 4018 + }, + { + "epoch": 0.23, + "grad_norm": 0.5051109731237906, + "learning_rate": 1.7957356572678998e-05, + "loss": 0.389, + "step": 4019 + }, + { + "epoch": 0.23, + "grad_norm": 0.37882249996544415, + "learning_rate": 1.7956229384185036e-05, + "loss": 0.217, + "step": 4020 + }, + { + "epoch": 0.23, + "grad_norm": 0.43823490781549507, + "learning_rate": 1.7955101920167188e-05, + "loss": 0.2772, + "step": 4021 + }, + { + "epoch": 0.23, + "grad_norm": 0.5312447099067517, + "learning_rate": 1.7953974180664504e-05, + "loss": 0.3589, + "step": 4022 + }, + { + "epoch": 0.23, + "grad_norm": 0.5289342208062883, + "learning_rate": 1.7952846165716038e-05, + "loss": 0.3424, + "step": 4023 + }, + { + "epoch": 0.23, + "grad_norm": 0.44434750769308423, + "learning_rate": 1.795171787536085e-05, + "loss": 0.2691, + "step": 4024 + }, + { + "epoch": 0.23, + "grad_norm": 0.5755076448856171, + "learning_rate": 1.7950589309638014e-05, + "loss": 0.4052, + "step": 4025 + }, + { + "epoch": 0.23, + "grad_norm": 0.30817817030080696, + "learning_rate": 1.794946046858661e-05, + "loss": 0.2699, + "step": 4026 + }, + { + "epoch": 0.23, + "grad_norm": 0.3306458946911386, + "learning_rate": 1.7948331352245736e-05, + "loss": 0.1414, + "step": 4027 + }, + { + "epoch": 0.23, + "grad_norm": 0.509175609376385, + "learning_rate": 1.7947201960654488e-05, + "loss": 0.3813, + "step": 4028 + }, + { + "epoch": 0.23, + "grad_norm": 0.8703972842173666, + "learning_rate": 1.7946072293851976e-05, + "loss": 0.6002, + "step": 4029 + }, + { + "epoch": 0.23, + "grad_norm": 0.343542528326212, + "learning_rate": 1.794494235187732e-05, + "loss": 0.2338, + "step": 4030 + }, + { + "epoch": 0.23, + "grad_norm": 0.4128671296922721, + "learning_rate": 1.7943812134769656e-05, + "loss": 0.4134, + "step": 4031 + }, + { + "epoch": 0.23, + "grad_norm": 1.041845445196055, + "learning_rate": 1.794268164256812e-05, + "loss": 0.628, + "step": 4032 + }, + { + "epoch": 0.23, + "grad_norm": 0.23769051134991365, + "learning_rate": 1.7941550875311858e-05, + "loss": 0.1505, + "step": 4033 + }, + { + "epoch": 0.23, + "grad_norm": 0.45355094491900144, + "learning_rate": 1.794041983304003e-05, + "loss": 0.3421, + "step": 4034 + }, + { + "epoch": 0.23, + "grad_norm": 1.1868507744408294, + "learning_rate": 1.7939288515791806e-05, + "loss": 0.7895, + "step": 4035 + }, + { + "epoch": 0.23, + "grad_norm": 0.395618032596114, + "learning_rate": 1.7938156923606362e-05, + "loss": 0.2637, + "step": 4036 + }, + { + "epoch": 0.23, + "grad_norm": 0.7611232233618405, + "learning_rate": 1.7937025056522884e-05, + "loss": 0.5185, + "step": 4037 + }, + { + "epoch": 0.23, + "grad_norm": 0.4522028536625917, + "learning_rate": 1.7935892914580572e-05, + "loss": 0.3422, + "step": 4038 + }, + { + "epoch": 0.23, + "grad_norm": 0.32600638350104655, + "learning_rate": 1.7934760497818628e-05, + "loss": 0.1793, + "step": 4039 + }, + { + "epoch": 0.23, + "grad_norm": 0.5263969279202462, + "learning_rate": 1.7933627806276267e-05, + "loss": 0.2869, + "step": 4040 + }, + { + "epoch": 0.23, + "grad_norm": 0.5985454643455146, + "learning_rate": 1.7932494839992723e-05, + "loss": 0.4168, + "step": 4041 + }, + { + "epoch": 0.23, + "grad_norm": 0.40800954663605743, + "learning_rate": 1.793136159900722e-05, + "loss": 0.286, + "step": 4042 + }, + { + "epoch": 0.23, + "grad_norm": 0.4796678193727095, + "learning_rate": 1.7930228083359002e-05, + "loss": 0.3264, + "step": 4043 + }, + { + "epoch": 0.23, + "grad_norm": 0.4774420141970719, + "learning_rate": 1.792909429308733e-05, + "loss": 0.3023, + "step": 4044 + }, + { + "epoch": 0.23, + "grad_norm": 0.33552706676028116, + "learning_rate": 1.792796022823146e-05, + "loss": 0.1879, + "step": 4045 + }, + { + "epoch": 0.23, + "grad_norm": 0.34466688409543883, + "learning_rate": 1.7926825888830673e-05, + "loss": 0.2726, + "step": 4046 + }, + { + "epoch": 0.23, + "grad_norm": 0.6822063282086891, + "learning_rate": 1.7925691274924247e-05, + "loss": 0.5477, + "step": 4047 + }, + { + "epoch": 0.23, + "grad_norm": 0.4588359590822482, + "learning_rate": 1.7924556386551472e-05, + "loss": 0.4077, + "step": 4048 + }, + { + "epoch": 0.23, + "grad_norm": 0.39472767234202216, + "learning_rate": 1.792342122375165e-05, + "loss": 0.289, + "step": 4049 + }, + { + "epoch": 0.23, + "grad_norm": 0.4249909103766952, + "learning_rate": 1.792228578656409e-05, + "loss": 0.3337, + "step": 4050 + }, + { + "epoch": 0.23, + "grad_norm": 0.40114167914557236, + "learning_rate": 1.7921150075028112e-05, + "loss": 0.2137, + "step": 4051 + }, + { + "epoch": 0.23, + "grad_norm": 0.3296340901766964, + "learning_rate": 1.792001408918305e-05, + "loss": 0.2957, + "step": 4052 + }, + { + "epoch": 0.23, + "grad_norm": 0.8150910975190974, + "learning_rate": 1.791887782906824e-05, + "loss": 0.4098, + "step": 4053 + }, + { + "epoch": 0.23, + "grad_norm": 0.3098451298308945, + "learning_rate": 1.7917741294723035e-05, + "loss": 0.2909, + "step": 4054 + }, + { + "epoch": 0.23, + "grad_norm": 0.6086232025078487, + "learning_rate": 1.7916604486186786e-05, + "loss": 0.4586, + "step": 4055 + }, + { + "epoch": 0.23, + "grad_norm": 0.7056291363756321, + "learning_rate": 1.7915467403498864e-05, + "loss": 0.3649, + "step": 4056 + }, + { + "epoch": 0.23, + "grad_norm": 0.232611976797289, + "learning_rate": 1.791433004669865e-05, + "loss": 0.1749, + "step": 4057 + }, + { + "epoch": 0.23, + "grad_norm": 0.3738788987291126, + "learning_rate": 1.791319241582552e-05, + "loss": 0.3219, + "step": 4058 + }, + { + "epoch": 0.23, + "grad_norm": 1.0108750504208646, + "learning_rate": 1.791205451091888e-05, + "loss": 0.4338, + "step": 4059 + }, + { + "epoch": 0.23, + "grad_norm": 0.8253458918236195, + "learning_rate": 1.7910916332018137e-05, + "loss": 0.4663, + "step": 4060 + }, + { + "epoch": 0.23, + "grad_norm": 0.4535609620382996, + "learning_rate": 1.7909777879162695e-05, + "loss": 0.2864, + "step": 4061 + }, + { + "epoch": 0.23, + "grad_norm": 0.5264572392672097, + "learning_rate": 1.7908639152391988e-05, + "loss": 0.3096, + "step": 4062 + }, + { + "epoch": 0.23, + "grad_norm": 0.6313130758765294, + "learning_rate": 1.790750015174545e-05, + "loss": 0.3589, + "step": 4063 + }, + { + "epoch": 0.23, + "grad_norm": 0.4027259179702243, + "learning_rate": 1.7906360877262515e-05, + "loss": 0.3104, + "step": 4064 + }, + { + "epoch": 0.23, + "grad_norm": 0.8796907969909685, + "learning_rate": 1.7905221328982647e-05, + "loss": 0.5063, + "step": 4065 + }, + { + "epoch": 0.23, + "grad_norm": 0.3771326162949683, + "learning_rate": 1.7904081506945304e-05, + "loss": 0.25, + "step": 4066 + }, + { + "epoch": 0.23, + "grad_norm": 0.37190974661172604, + "learning_rate": 1.790294141118996e-05, + "loss": 0.2481, + "step": 4067 + }, + { + "epoch": 0.23, + "grad_norm": 1.5489614559790572, + "learning_rate": 1.790180104175609e-05, + "loss": 0.7663, + "step": 4068 + }, + { + "epoch": 0.23, + "grad_norm": 0.5644846432613142, + "learning_rate": 1.7900660398683192e-05, + "loss": 0.2647, + "step": 4069 + }, + { + "epoch": 0.23, + "grad_norm": 0.5649799604812463, + "learning_rate": 1.789951948201077e-05, + "loss": 0.3422, + "step": 4070 + }, + { + "epoch": 0.23, + "grad_norm": 0.5752127497152826, + "learning_rate": 1.789837829177832e-05, + "loss": 0.3297, + "step": 4071 + }, + { + "epoch": 0.23, + "grad_norm": 0.2515960625460765, + "learning_rate": 1.7897236828025373e-05, + "loss": 0.13, + "step": 4072 + }, + { + "epoch": 0.23, + "grad_norm": 0.454993611244226, + "learning_rate": 1.7896095090791452e-05, + "loss": 0.3405, + "step": 4073 + }, + { + "epoch": 0.23, + "grad_norm": 0.46244441347114756, + "learning_rate": 1.7894953080116102e-05, + "loss": 0.3233, + "step": 4074 + }, + { + "epoch": 0.23, + "grad_norm": 0.46040709971171495, + "learning_rate": 1.7893810796038862e-05, + "loss": 0.2484, + "step": 4075 + }, + { + "epoch": 0.23, + "grad_norm": 0.7576076875799639, + "learning_rate": 1.7892668238599293e-05, + "loss": 0.4109, + "step": 4076 + }, + { + "epoch": 0.23, + "grad_norm": 0.6283333218980784, + "learning_rate": 1.7891525407836967e-05, + "loss": 0.4061, + "step": 4077 + }, + { + "epoch": 0.23, + "grad_norm": 0.49411712691782517, + "learning_rate": 1.789038230379145e-05, + "loss": 0.3519, + "step": 4078 + }, + { + "epoch": 0.23, + "grad_norm": 0.25958003257867746, + "learning_rate": 1.7889238926502336e-05, + "loss": 0.1599, + "step": 4079 + }, + { + "epoch": 0.23, + "grad_norm": 0.6904502163139321, + "learning_rate": 1.788809527600922e-05, + "loss": 0.5345, + "step": 4080 + }, + { + "epoch": 0.23, + "grad_norm": 0.42273087648124946, + "learning_rate": 1.78869513523517e-05, + "loss": 0.3553, + "step": 4081 + }, + { + "epoch": 0.23, + "grad_norm": 0.41321642610780296, + "learning_rate": 1.7885807155569395e-05, + "loss": 0.2611, + "step": 4082 + }, + { + "epoch": 0.23, + "grad_norm": 1.29593229800086, + "learning_rate": 1.7884662685701927e-05, + "loss": 0.797, + "step": 4083 + }, + { + "epoch": 0.23, + "grad_norm": 0.4185943883142081, + "learning_rate": 1.788351794278893e-05, + "loss": 0.2627, + "step": 4084 + }, + { + "epoch": 0.23, + "grad_norm": 0.30586527586360296, + "learning_rate": 1.7882372926870045e-05, + "loss": 0.2179, + "step": 4085 + }, + { + "epoch": 0.23, + "grad_norm": 0.5378611831339616, + "learning_rate": 1.7881227637984922e-05, + "loss": 0.391, + "step": 4086 + }, + { + "epoch": 0.23, + "grad_norm": 0.6274700701055269, + "learning_rate": 1.788008207617323e-05, + "loss": 0.4243, + "step": 4087 + }, + { + "epoch": 0.23, + "grad_norm": 0.3917913457312723, + "learning_rate": 1.787893624147463e-05, + "loss": 0.3136, + "step": 4088 + }, + { + "epoch": 0.23, + "grad_norm": 0.4596357049274881, + "learning_rate": 1.7877790133928807e-05, + "loss": 0.2903, + "step": 4089 + }, + { + "epoch": 0.23, + "grad_norm": 0.4524210604969984, + "learning_rate": 1.7876643753575457e-05, + "loss": 0.3614, + "step": 4090 + }, + { + "epoch": 0.24, + "grad_norm": 0.382573208203039, + "learning_rate": 1.7875497100454266e-05, + "loss": 0.3108, + "step": 4091 + }, + { + "epoch": 0.24, + "grad_norm": 0.3418759654466254, + "learning_rate": 1.787435017460495e-05, + "loss": 0.1814, + "step": 4092 + }, + { + "epoch": 0.24, + "grad_norm": 0.48679353512366635, + "learning_rate": 1.7873202976067225e-05, + "loss": 0.3678, + "step": 4093 + }, + { + "epoch": 0.24, + "grad_norm": 0.3769792537925658, + "learning_rate": 1.787205550488082e-05, + "loss": 0.3096, + "step": 4094 + }, + { + "epoch": 0.24, + "grad_norm": 0.744428708424359, + "learning_rate": 1.7870907761085474e-05, + "loss": 0.4643, + "step": 4095 + }, + { + "epoch": 0.24, + "grad_norm": 0.6447660292616412, + "learning_rate": 1.786975974472093e-05, + "loss": 0.4391, + "step": 4096 + }, + { + "epoch": 0.24, + "grad_norm": 0.3339067669230319, + "learning_rate": 1.7868611455826942e-05, + "loss": 0.2451, + "step": 4097 + }, + { + "epoch": 0.24, + "grad_norm": 0.3580989985318797, + "learning_rate": 1.7867462894443283e-05, + "loss": 0.2714, + "step": 4098 + }, + { + "epoch": 0.24, + "grad_norm": 0.9081337673125298, + "learning_rate": 1.7866314060609714e-05, + "loss": 0.6394, + "step": 4099 + }, + { + "epoch": 0.24, + "grad_norm": 0.4002506801084894, + "learning_rate": 1.7865164954366033e-05, + "loss": 0.2894, + "step": 4100 + }, + { + "epoch": 0.24, + "grad_norm": 0.4251875388623584, + "learning_rate": 1.7864015575752026e-05, + "loss": 0.353, + "step": 4101 + }, + { + "epoch": 0.24, + "grad_norm": 1.063061765139635, + "learning_rate": 1.78628659248075e-05, + "loss": 0.4411, + "step": 4102 + }, + { + "epoch": 0.24, + "grad_norm": 0.29534226202391967, + "learning_rate": 1.7861716001572262e-05, + "loss": 0.2349, + "step": 4103 + }, + { + "epoch": 0.24, + "grad_norm": 0.6987250149273035, + "learning_rate": 1.7860565806086142e-05, + "loss": 0.4517, + "step": 4104 + }, + { + "epoch": 0.24, + "grad_norm": 0.31222378317392696, + "learning_rate": 1.7859415338388963e-05, + "loss": 0.228, + "step": 4105 + }, + { + "epoch": 0.24, + "grad_norm": 0.4311047635232939, + "learning_rate": 1.7858264598520568e-05, + "loss": 0.3151, + "step": 4106 + }, + { + "epoch": 0.24, + "grad_norm": 1.4992911334787213, + "learning_rate": 1.7857113586520806e-05, + "loss": 0.7821, + "step": 4107 + }, + { + "epoch": 0.24, + "grad_norm": 0.5395303710203192, + "learning_rate": 1.7855962302429542e-05, + "loss": 0.2653, + "step": 4108 + }, + { + "epoch": 0.24, + "grad_norm": 0.40037400934122036, + "learning_rate": 1.785481074628664e-05, + "loss": 0.3089, + "step": 4109 + }, + { + "epoch": 0.24, + "grad_norm": 0.4222237834089705, + "learning_rate": 1.785365891813198e-05, + "loss": 0.3479, + "step": 4110 + }, + { + "epoch": 0.24, + "grad_norm": 0.26589948850227196, + "learning_rate": 1.785250681800545e-05, + "loss": 0.1399, + "step": 4111 + }, + { + "epoch": 0.24, + "grad_norm": 0.41721052652204943, + "learning_rate": 1.7851354445946944e-05, + "loss": 0.3075, + "step": 4112 + }, + { + "epoch": 0.24, + "grad_norm": 0.4215018631260107, + "learning_rate": 1.785020180199637e-05, + "loss": 0.3456, + "step": 4113 + }, + { + "epoch": 0.24, + "grad_norm": 1.3790817111716487, + "learning_rate": 1.7849048886193648e-05, + "loss": 0.8667, + "step": 4114 + }, + { + "epoch": 0.24, + "grad_norm": 0.354931172130831, + "learning_rate": 1.7847895698578702e-05, + "loss": 0.2496, + "step": 4115 + }, + { + "epoch": 0.24, + "grad_norm": 0.423749864426112, + "learning_rate": 1.7846742239191464e-05, + "loss": 0.3579, + "step": 4116 + }, + { + "epoch": 0.24, + "grad_norm": 0.3605983029128115, + "learning_rate": 1.784558850807188e-05, + "loss": 0.2686, + "step": 4117 + }, + { + "epoch": 0.24, + "grad_norm": 0.3667287214717069, + "learning_rate": 1.7844434505259904e-05, + "loss": 0.2296, + "step": 4118 + }, + { + "epoch": 0.24, + "grad_norm": 0.9766307941377927, + "learning_rate": 1.7843280230795496e-05, + "loss": 0.6913, + "step": 4119 + }, + { + "epoch": 0.24, + "grad_norm": 0.7464374748135114, + "learning_rate": 1.784212568471863e-05, + "loss": 0.5132, + "step": 4120 + }, + { + "epoch": 0.24, + "grad_norm": 0.34975514825247495, + "learning_rate": 1.7840970867069293e-05, + "loss": 0.2485, + "step": 4121 + }, + { + "epoch": 0.24, + "grad_norm": 0.7385557641533902, + "learning_rate": 1.7839815777887472e-05, + "loss": 0.511, + "step": 4122 + }, + { + "epoch": 0.24, + "grad_norm": 0.31425441408801236, + "learning_rate": 1.7838660417213166e-05, + "loss": 0.179, + "step": 4123 + }, + { + "epoch": 0.24, + "grad_norm": 0.3292178319221814, + "learning_rate": 1.7837504785086386e-05, + "loss": 0.2168, + "step": 4124 + }, + { + "epoch": 0.24, + "grad_norm": 0.36915751123007257, + "learning_rate": 1.7836348881547153e-05, + "loss": 0.3212, + "step": 4125 + }, + { + "epoch": 0.24, + "grad_norm": 0.9386073864728668, + "learning_rate": 1.7835192706635494e-05, + "loss": 0.5632, + "step": 4126 + }, + { + "epoch": 0.24, + "grad_norm": 0.38900805335201366, + "learning_rate": 1.783403626039145e-05, + "loss": 0.2911, + "step": 4127 + }, + { + "epoch": 0.24, + "grad_norm": 0.7257634073735822, + "learning_rate": 1.7832879542855067e-05, + "loss": 0.4026, + "step": 4128 + }, + { + "epoch": 0.24, + "grad_norm": 0.260931509801551, + "learning_rate": 1.7831722554066403e-05, + "loss": 0.2349, + "step": 4129 + }, + { + "epoch": 0.24, + "grad_norm": 0.41780863397346774, + "learning_rate": 1.7830565294065522e-05, + "loss": 0.2964, + "step": 4130 + }, + { + "epoch": 0.24, + "grad_norm": 0.5120862661410127, + "learning_rate": 1.7829407762892504e-05, + "loss": 0.2983, + "step": 4131 + }, + { + "epoch": 0.24, + "grad_norm": 0.9249334484384528, + "learning_rate": 1.7828249960587428e-05, + "loss": 0.4811, + "step": 4132 + }, + { + "epoch": 0.24, + "grad_norm": 0.36244641485753243, + "learning_rate": 1.7827091887190396e-05, + "loss": 0.3057, + "step": 4133 + }, + { + "epoch": 0.24, + "grad_norm": 0.36789794513151913, + "learning_rate": 1.7825933542741506e-05, + "loss": 0.2761, + "step": 4134 + }, + { + "epoch": 0.24, + "grad_norm": 0.2799037207296845, + "learning_rate": 1.7824774927280877e-05, + "loss": 0.1626, + "step": 4135 + }, + { + "epoch": 0.24, + "grad_norm": 0.3573932183986537, + "learning_rate": 1.7823616040848625e-05, + "loss": 0.3016, + "step": 4136 + }, + { + "epoch": 0.24, + "grad_norm": 0.4498035506977487, + "learning_rate": 1.782245688348489e-05, + "loss": 0.3051, + "step": 4137 + }, + { + "epoch": 0.24, + "grad_norm": 1.1538000821845913, + "learning_rate": 1.7821297455229807e-05, + "loss": 0.4959, + "step": 4138 + }, + { + "epoch": 0.24, + "grad_norm": 0.3316262304163246, + "learning_rate": 1.7820137756123527e-05, + "loss": 0.2842, + "step": 4139 + }, + { + "epoch": 0.24, + "grad_norm": 1.53751881341363, + "learning_rate": 1.7818977786206217e-05, + "loss": 0.8152, + "step": 4140 + }, + { + "epoch": 0.24, + "grad_norm": 0.32222675266258977, + "learning_rate": 1.7817817545518045e-05, + "loss": 0.2308, + "step": 4141 + }, + { + "epoch": 0.24, + "grad_norm": 0.3286628540441941, + "learning_rate": 1.7816657034099182e-05, + "loss": 0.2449, + "step": 4142 + }, + { + "epoch": 0.24, + "grad_norm": 1.4467305285964456, + "learning_rate": 1.781549625198982e-05, + "loss": 0.8061, + "step": 4143 + }, + { + "epoch": 0.24, + "grad_norm": 0.870823301196831, + "learning_rate": 1.7814335199230164e-05, + "loss": 0.3788, + "step": 4144 + }, + { + "epoch": 0.24, + "grad_norm": 0.336748819198341, + "learning_rate": 1.7813173875860416e-05, + "loss": 0.2872, + "step": 4145 + }, + { + "epoch": 0.24, + "grad_norm": 0.6111350426253821, + "learning_rate": 1.781201228192079e-05, + "loss": 0.4169, + "step": 4146 + }, + { + "epoch": 0.24, + "grad_norm": 0.21527619658183722, + "learning_rate": 1.7810850417451517e-05, + "loss": 0.1193, + "step": 4147 + }, + { + "epoch": 0.24, + "grad_norm": 0.4550362260441464, + "learning_rate": 1.780968828249283e-05, + "loss": 0.2929, + "step": 4148 + }, + { + "epoch": 0.24, + "grad_norm": 0.49114266037537563, + "learning_rate": 1.780852587708497e-05, + "loss": 0.3229, + "step": 4149 + }, + { + "epoch": 0.24, + "grad_norm": 1.1197232619356512, + "learning_rate": 1.78073632012682e-05, + "loss": 0.4345, + "step": 4150 + }, + { + "epoch": 0.24, + "grad_norm": 0.4730494255334301, + "learning_rate": 1.780620025508277e-05, + "loss": 0.3158, + "step": 4151 + }, + { + "epoch": 0.24, + "grad_norm": 0.5063609235250967, + "learning_rate": 1.7805037038568972e-05, + "loss": 0.4172, + "step": 4152 + }, + { + "epoch": 0.24, + "grad_norm": 0.41048674198046375, + "learning_rate": 1.780387355176707e-05, + "loss": 0.2985, + "step": 4153 + }, + { + "epoch": 0.24, + "grad_norm": 0.21889835566971555, + "learning_rate": 1.7802709794717363e-05, + "loss": 0.1212, + "step": 4154 + }, + { + "epoch": 0.24, + "grad_norm": 1.0661163910225424, + "learning_rate": 1.780154576746015e-05, + "loss": 0.4338, + "step": 4155 + }, + { + "epoch": 0.24, + "grad_norm": 1.3205369104367537, + "learning_rate": 1.7800381470035745e-05, + "loss": 0.7398, + "step": 4156 + }, + { + "epoch": 0.24, + "grad_norm": 0.36004560177244327, + "learning_rate": 1.7799216902484465e-05, + "loss": 0.2448, + "step": 4157 + }, + { + "epoch": 0.24, + "grad_norm": 1.496192252384227, + "learning_rate": 1.7798052064846637e-05, + "loss": 0.805, + "step": 4158 + }, + { + "epoch": 0.24, + "grad_norm": 0.8488803818598749, + "learning_rate": 1.7796886957162603e-05, + "loss": 0.5812, + "step": 4159 + }, + { + "epoch": 0.24, + "grad_norm": 0.2631792065123535, + "learning_rate": 1.7795721579472712e-05, + "loss": 0.2027, + "step": 4160 + }, + { + "epoch": 0.24, + "grad_norm": 0.39486478303427447, + "learning_rate": 1.7794555931817314e-05, + "loss": 0.3416, + "step": 4161 + }, + { + "epoch": 0.24, + "grad_norm": 0.7292174481348954, + "learning_rate": 1.779339001423678e-05, + "loss": 0.5763, + "step": 4162 + }, + { + "epoch": 0.24, + "grad_norm": 0.277092082065919, + "learning_rate": 1.7792223826771484e-05, + "loss": 0.1583, + "step": 4163 + }, + { + "epoch": 0.24, + "grad_norm": 0.648421763994709, + "learning_rate": 1.779105736946181e-05, + "loss": 0.4098, + "step": 4164 + }, + { + "epoch": 0.24, + "grad_norm": 0.4148689107380746, + "learning_rate": 1.778989064234816e-05, + "loss": 0.3432, + "step": 4165 + }, + { + "epoch": 0.24, + "grad_norm": 0.5232083805274033, + "learning_rate": 1.7788723645470928e-05, + "loss": 0.3272, + "step": 4166 + }, + { + "epoch": 0.24, + "grad_norm": 0.2818994881344073, + "learning_rate": 1.7787556378870534e-05, + "loss": 0.2582, + "step": 4167 + }, + { + "epoch": 0.24, + "grad_norm": 0.457924102148931, + "learning_rate": 1.7786388842587397e-05, + "loss": 0.377, + "step": 4168 + }, + { + "epoch": 0.24, + "grad_norm": 0.5522352742313344, + "learning_rate": 1.7785221036661945e-05, + "loss": 0.3301, + "step": 4169 + }, + { + "epoch": 0.24, + "grad_norm": 0.32880274033610146, + "learning_rate": 1.778405296113463e-05, + "loss": 0.2008, + "step": 4170 + }, + { + "epoch": 0.24, + "grad_norm": 0.7913590840489569, + "learning_rate": 1.7782884616045892e-05, + "loss": 0.5754, + "step": 4171 + }, + { + "epoch": 0.24, + "grad_norm": 0.5348610835612185, + "learning_rate": 1.7781716001436192e-05, + "loss": 0.3692, + "step": 4172 + }, + { + "epoch": 0.24, + "grad_norm": 0.3764624857271168, + "learning_rate": 1.7780547117346005e-05, + "loss": 0.2682, + "step": 4173 + }, + { + "epoch": 0.24, + "grad_norm": 1.2520872298678765, + "learning_rate": 1.7779377963815804e-05, + "loss": 0.7315, + "step": 4174 + }, + { + "epoch": 0.24, + "grad_norm": 0.3392114778810658, + "learning_rate": 1.7778208540886082e-05, + "loss": 0.1731, + "step": 4175 + }, + { + "epoch": 0.24, + "grad_norm": 0.5068710149305249, + "learning_rate": 1.777703884859733e-05, + "loss": 0.3636, + "step": 4176 + }, + { + "epoch": 0.24, + "grad_norm": 0.393948244622996, + "learning_rate": 1.7775868886990056e-05, + "loss": 0.3052, + "step": 4177 + }, + { + "epoch": 0.24, + "grad_norm": 0.4345252768576852, + "learning_rate": 1.7774698656104778e-05, + "loss": 0.343, + "step": 4178 + }, + { + "epoch": 0.24, + "grad_norm": 0.5250465194272104, + "learning_rate": 1.777352815598202e-05, + "loss": 0.3784, + "step": 4179 + }, + { + "epoch": 0.24, + "grad_norm": 0.34170118548309036, + "learning_rate": 1.7772357386662316e-05, + "loss": 0.2921, + "step": 4180 + }, + { + "epoch": 0.24, + "grad_norm": 0.3091716890840008, + "learning_rate": 1.777118634818621e-05, + "loss": 0.2455, + "step": 4181 + }, + { + "epoch": 0.24, + "grad_norm": 0.32875149163461864, + "learning_rate": 1.7770015040594256e-05, + "loss": 0.2709, + "step": 4182 + }, + { + "epoch": 0.24, + "grad_norm": 0.5657387770353896, + "learning_rate": 1.7768843463927012e-05, + "loss": 0.4015, + "step": 4183 + }, + { + "epoch": 0.24, + "grad_norm": 0.4578594861031997, + "learning_rate": 1.776767161822506e-05, + "loss": 0.3508, + "step": 4184 + }, + { + "epoch": 0.24, + "grad_norm": 0.3722925933401782, + "learning_rate": 1.7766499503528965e-05, + "loss": 0.293, + "step": 4185 + }, + { + "epoch": 0.24, + "grad_norm": 0.8542895200701599, + "learning_rate": 1.776532711987933e-05, + "loss": 0.561, + "step": 4186 + }, + { + "epoch": 0.24, + "grad_norm": 0.3903147749744142, + "learning_rate": 1.7764154467316753e-05, + "loss": 0.206, + "step": 4187 + }, + { + "epoch": 0.24, + "grad_norm": 0.3327376950139663, + "learning_rate": 1.776298154588184e-05, + "loss": 0.2483, + "step": 4188 + }, + { + "epoch": 0.24, + "grad_norm": 0.6250386359397111, + "learning_rate": 1.7761808355615207e-05, + "loss": 0.4006, + "step": 4189 + }, + { + "epoch": 0.24, + "grad_norm": 0.49450962581882085, + "learning_rate": 1.7760634896557483e-05, + "loss": 0.2286, + "step": 4190 + }, + { + "epoch": 0.24, + "grad_norm": 0.42356405355596793, + "learning_rate": 1.775946116874931e-05, + "loss": 0.3589, + "step": 4191 + }, + { + "epoch": 0.24, + "grad_norm": 0.5552779771392584, + "learning_rate": 1.7758287172231333e-05, + "loss": 0.4024, + "step": 4192 + }, + { + "epoch": 0.24, + "grad_norm": 0.37558968188355335, + "learning_rate": 1.77571129070442e-05, + "loss": 0.2241, + "step": 4193 + }, + { + "epoch": 0.24, + "grad_norm": 0.25518944264314564, + "learning_rate": 1.775593837322858e-05, + "loss": 0.2089, + "step": 4194 + }, + { + "epoch": 0.24, + "grad_norm": 1.00903247293503, + "learning_rate": 1.775476357082515e-05, + "loss": 0.4944, + "step": 4195 + }, + { + "epoch": 0.24, + "grad_norm": 0.2975944794423369, + "learning_rate": 1.7753588499874592e-05, + "loss": 0.2273, + "step": 4196 + }, + { + "epoch": 0.24, + "grad_norm": 0.44397822897049244, + "learning_rate": 1.7752413160417597e-05, + "loss": 0.357, + "step": 4197 + }, + { + "epoch": 0.24, + "grad_norm": 1.1675312656602383, + "learning_rate": 1.7751237552494867e-05, + "loss": 0.862, + "step": 4198 + }, + { + "epoch": 0.24, + "grad_norm": 0.2532797489976462, + "learning_rate": 1.7750061676147114e-05, + "loss": 0.108, + "step": 4199 + }, + { + "epoch": 0.24, + "grad_norm": 0.4934846294571999, + "learning_rate": 1.774888553141506e-05, + "loss": 0.3664, + "step": 4200 + }, + { + "epoch": 0.24, + "grad_norm": 0.3683129833102259, + "learning_rate": 1.7747709118339428e-05, + "loss": 0.2941, + "step": 4201 + }, + { + "epoch": 0.24, + "grad_norm": 1.1727462583908863, + "learning_rate": 1.7746532436960965e-05, + "loss": 0.653, + "step": 4202 + }, + { + "epoch": 0.24, + "grad_norm": 0.3065183783693479, + "learning_rate": 1.7745355487320418e-05, + "loss": 0.2344, + "step": 4203 + }, + { + "epoch": 0.24, + "grad_norm": 0.41090849776968874, + "learning_rate": 1.7744178269458547e-05, + "loss": 0.3612, + "step": 4204 + }, + { + "epoch": 0.24, + "grad_norm": 0.6804604082844093, + "learning_rate": 1.774300078341611e-05, + "loss": 0.5032, + "step": 4205 + }, + { + "epoch": 0.24, + "grad_norm": 0.3564856518489956, + "learning_rate": 1.7741823029233892e-05, + "loss": 0.2573, + "step": 4206 + }, + { + "epoch": 0.24, + "grad_norm": 0.3123261819071861, + "learning_rate": 1.7740645006952674e-05, + "loss": 0.2185, + "step": 4207 + }, + { + "epoch": 0.24, + "grad_norm": 0.36540726586784045, + "learning_rate": 1.773946671661325e-05, + "loss": 0.2873, + "step": 4208 + }, + { + "epoch": 0.24, + "grad_norm": 0.3851981339848704, + "learning_rate": 1.773828815825643e-05, + "loss": 0.2368, + "step": 4209 + }, + { + "epoch": 0.24, + "grad_norm": 1.403401900227148, + "learning_rate": 1.773710933192302e-05, + "loss": 0.8898, + "step": 4210 + }, + { + "epoch": 0.24, + "grad_norm": 0.8506443084440741, + "learning_rate": 1.7735930237653853e-05, + "loss": 0.4175, + "step": 4211 + }, + { + "epoch": 0.24, + "grad_norm": 0.3401308629226619, + "learning_rate": 1.773475087548975e-05, + "loss": 0.2571, + "step": 4212 + }, + { + "epoch": 0.24, + "grad_norm": 0.3457458099433843, + "learning_rate": 1.7733571245471557e-05, + "loss": 0.2495, + "step": 4213 + }, + { + "epoch": 0.24, + "grad_norm": 0.3963265595086062, + "learning_rate": 1.7732391347640125e-05, + "loss": 0.2667, + "step": 4214 + }, + { + "epoch": 0.24, + "grad_norm": 0.41455830175953723, + "learning_rate": 1.7731211182036312e-05, + "loss": 0.3137, + "step": 4215 + }, + { + "epoch": 0.24, + "grad_norm": 0.40500875371408623, + "learning_rate": 1.773003074870099e-05, + "loss": 0.299, + "step": 4216 + }, + { + "epoch": 0.24, + "grad_norm": 0.5403576844166071, + "learning_rate": 1.7728850047675035e-05, + "loss": 0.35, + "step": 4217 + }, + { + "epoch": 0.24, + "grad_norm": 0.4087263684779261, + "learning_rate": 1.7727669078999336e-05, + "loss": 0.3484, + "step": 4218 + }, + { + "epoch": 0.24, + "grad_norm": 0.2268995265041368, + "learning_rate": 1.772648784271479e-05, + "loss": 0.1586, + "step": 4219 + }, + { + "epoch": 0.24, + "grad_norm": 0.38627405176100876, + "learning_rate": 1.7725306338862298e-05, + "loss": 0.3399, + "step": 4220 + }, + { + "epoch": 0.24, + "grad_norm": 0.34508423192036813, + "learning_rate": 1.7724124567482782e-05, + "loss": 0.2773, + "step": 4221 + }, + { + "epoch": 0.24, + "grad_norm": 0.5635387172753441, + "learning_rate": 1.7722942528617163e-05, + "loss": 0.443, + "step": 4222 + }, + { + "epoch": 0.24, + "grad_norm": 0.6278867989446979, + "learning_rate": 1.772176022230638e-05, + "loss": 0.4841, + "step": 4223 + }, + { + "epoch": 0.24, + "grad_norm": 0.3090845892934344, + "learning_rate": 1.7720577648591368e-05, + "loss": 0.2715, + "step": 4224 + }, + { + "epoch": 0.24, + "grad_norm": 0.43203694883147464, + "learning_rate": 1.771939480751309e-05, + "loss": 0.2774, + "step": 4225 + }, + { + "epoch": 0.24, + "grad_norm": 0.28886362206526994, + "learning_rate": 1.7718211699112496e-05, + "loss": 0.1862, + "step": 4226 + }, + { + "epoch": 0.24, + "grad_norm": 0.35675105970492776, + "learning_rate": 1.7717028323430562e-05, + "loss": 0.2986, + "step": 4227 + }, + { + "epoch": 0.24, + "grad_norm": 0.4192480731244511, + "learning_rate": 1.7715844680508273e-05, + "loss": 0.3561, + "step": 4228 + }, + { + "epoch": 0.24, + "grad_norm": 0.954576314104161, + "learning_rate": 1.7714660770386615e-05, + "loss": 0.3737, + "step": 4229 + }, + { + "epoch": 0.24, + "grad_norm": 0.37465111339648155, + "learning_rate": 1.771347659310658e-05, + "loss": 0.3111, + "step": 4230 + }, + { + "epoch": 0.24, + "grad_norm": 1.154325732369229, + "learning_rate": 1.7712292148709188e-05, + "loss": 0.667, + "step": 4231 + }, + { + "epoch": 0.24, + "grad_norm": 0.23526929972174873, + "learning_rate": 1.7711107437235453e-05, + "loss": 0.1821, + "step": 4232 + }, + { + "epoch": 0.24, + "grad_norm": 0.4190335124027483, + "learning_rate": 1.7709922458726395e-05, + "loss": 0.3299, + "step": 4233 + }, + { + "epoch": 0.24, + "grad_norm": 0.8046301903471953, + "learning_rate": 1.770873721322305e-05, + "loss": 0.5668, + "step": 4234 + }, + { + "epoch": 0.24, + "grad_norm": 0.3958648304902932, + "learning_rate": 1.7707551700766474e-05, + "loss": 0.2981, + "step": 4235 + }, + { + "epoch": 0.24, + "grad_norm": 0.4148428047558691, + "learning_rate": 1.770636592139771e-05, + "loss": 0.3152, + "step": 4236 + }, + { + "epoch": 0.24, + "grad_norm": 0.4855183491267279, + "learning_rate": 1.7705179875157826e-05, + "loss": 0.3586, + "step": 4237 + }, + { + "epoch": 0.24, + "grad_norm": 0.24397917134812547, + "learning_rate": 1.7703993562087895e-05, + "loss": 0.1169, + "step": 4238 + }, + { + "epoch": 0.24, + "grad_norm": 0.3776877212817177, + "learning_rate": 1.7702806982229e-05, + "loss": 0.2973, + "step": 4239 + }, + { + "epoch": 0.24, + "grad_norm": 0.5154970926323544, + "learning_rate": 1.7701620135622228e-05, + "loss": 0.3764, + "step": 4240 + }, + { + "epoch": 0.24, + "grad_norm": 1.042901070896944, + "learning_rate": 1.7700433022308684e-05, + "loss": 0.635, + "step": 4241 + }, + { + "epoch": 0.24, + "grad_norm": 0.3494430310945817, + "learning_rate": 1.7699245642329473e-05, + "loss": 0.2064, + "step": 4242 + }, + { + "epoch": 0.24, + "grad_norm": 1.1972989287454718, + "learning_rate": 1.7698057995725717e-05, + "loss": 0.6586, + "step": 4243 + }, + { + "epoch": 0.24, + "grad_norm": 0.3358791750280596, + "learning_rate": 1.7696870082538544e-05, + "loss": 0.2379, + "step": 4244 + }, + { + "epoch": 0.24, + "grad_norm": 0.4175936922148545, + "learning_rate": 1.769568190280909e-05, + "loss": 0.2467, + "step": 4245 + }, + { + "epoch": 0.24, + "grad_norm": 1.2074876580005733, + "learning_rate": 1.7694493456578503e-05, + "loss": 0.4767, + "step": 4246 + }, + { + "epoch": 0.24, + "grad_norm": 0.4768857304128634, + "learning_rate": 1.769330474388794e-05, + "loss": 0.3806, + "step": 4247 + }, + { + "epoch": 0.24, + "grad_norm": 0.3347435221937712, + "learning_rate": 1.7692115764778564e-05, + "loss": 0.2254, + "step": 4248 + }, + { + "epoch": 0.24, + "grad_norm": 1.2119371591283326, + "learning_rate": 1.7690926519291548e-05, + "loss": 0.6777, + "step": 4249 + }, + { + "epoch": 0.24, + "grad_norm": 0.5732297049816935, + "learning_rate": 1.7689737007468082e-05, + "loss": 0.3127, + "step": 4250 + }, + { + "epoch": 0.24, + "grad_norm": 0.4110880892865378, + "learning_rate": 1.768854722934935e-05, + "loss": 0.2711, + "step": 4251 + }, + { + "epoch": 0.24, + "grad_norm": 0.5346347752222697, + "learning_rate": 1.7687357184976558e-05, + "loss": 0.304, + "step": 4252 + }, + { + "epoch": 0.24, + "grad_norm": 0.4209249612683654, + "learning_rate": 1.7686166874390916e-05, + "loss": 0.3113, + "step": 4253 + }, + { + "epoch": 0.24, + "grad_norm": 0.4029524455386135, + "learning_rate": 1.768497629763365e-05, + "loss": 0.284, + "step": 4254 + }, + { + "epoch": 0.24, + "grad_norm": 0.39416868752265927, + "learning_rate": 1.7683785454745983e-05, + "loss": 0.267, + "step": 4255 + }, + { + "epoch": 0.24, + "grad_norm": 0.4629564240237894, + "learning_rate": 1.768259434576916e-05, + "loss": 0.3873, + "step": 4256 + }, + { + "epoch": 0.24, + "grad_norm": 0.39823641313634994, + "learning_rate": 1.768140297074442e-05, + "loss": 0.3366, + "step": 4257 + }, + { + "epoch": 0.24, + "grad_norm": 0.44095663559634946, + "learning_rate": 1.7680211329713027e-05, + "loss": 0.3567, + "step": 4258 + }, + { + "epoch": 0.24, + "grad_norm": 0.3111575942167663, + "learning_rate": 1.7679019422716244e-05, + "loss": 0.2723, + "step": 4259 + }, + { + "epoch": 0.24, + "grad_norm": 0.32667092720747753, + "learning_rate": 1.767782724979535e-05, + "loss": 0.2803, + "step": 4260 + }, + { + "epoch": 0.24, + "grad_norm": 0.3594877209168396, + "learning_rate": 1.767663481099163e-05, + "loss": 0.15, + "step": 4261 + }, + { + "epoch": 0.24, + "grad_norm": 0.5838500192099852, + "learning_rate": 1.7675442106346377e-05, + "loss": 0.4736, + "step": 4262 + }, + { + "epoch": 0.24, + "grad_norm": 0.34204796905833723, + "learning_rate": 1.7674249135900892e-05, + "loss": 0.3, + "step": 4263 + }, + { + "epoch": 0.24, + "grad_norm": 0.4358076489958484, + "learning_rate": 1.767305589969649e-05, + "loss": 0.3492, + "step": 4264 + }, + { + "epoch": 0.25, + "grad_norm": 0.21427413315468624, + "learning_rate": 1.7671862397774494e-05, + "loss": 0.1329, + "step": 4265 + }, + { + "epoch": 0.25, + "grad_norm": 0.35988453024304295, + "learning_rate": 1.767066863017623e-05, + "loss": 0.2818, + "step": 4266 + }, + { + "epoch": 0.25, + "grad_norm": 0.9682883111015376, + "learning_rate": 1.766947459694304e-05, + "loss": 0.5893, + "step": 4267 + }, + { + "epoch": 0.25, + "grad_norm": 0.3709235509398682, + "learning_rate": 1.766828029811628e-05, + "loss": 0.3169, + "step": 4268 + }, + { + "epoch": 0.25, + "grad_norm": 0.5406561017554826, + "learning_rate": 1.7667085733737298e-05, + "loss": 0.3701, + "step": 4269 + }, + { + "epoch": 0.25, + "grad_norm": 0.4233765597253248, + "learning_rate": 1.7665890903847468e-05, + "loss": 0.3483, + "step": 4270 + }, + { + "epoch": 0.25, + "grad_norm": 0.2603354062606371, + "learning_rate": 1.7664695808488164e-05, + "loss": 0.2296, + "step": 4271 + }, + { + "epoch": 0.25, + "grad_norm": 0.4590253911487976, + "learning_rate": 1.766350044770078e-05, + "loss": 0.2895, + "step": 4272 + }, + { + "epoch": 0.25, + "grad_norm": 0.3741916057341281, + "learning_rate": 1.76623048215267e-05, + "loss": 0.3108, + "step": 4273 + }, + { + "epoch": 0.25, + "grad_norm": 0.7421170186002759, + "learning_rate": 1.7661108930007334e-05, + "loss": 0.3889, + "step": 4274 + }, + { + "epoch": 0.25, + "grad_norm": 0.45537247750720905, + "learning_rate": 1.7659912773184095e-05, + "loss": 0.2839, + "step": 4275 + }, + { + "epoch": 0.25, + "grad_norm": 0.3989938266714445, + "learning_rate": 1.7658716351098407e-05, + "loss": 0.3258, + "step": 4276 + }, + { + "epoch": 0.25, + "grad_norm": 1.318451272676308, + "learning_rate": 1.76575196637917e-05, + "loss": 0.7469, + "step": 4277 + }, + { + "epoch": 0.25, + "grad_norm": 0.22641040750112476, + "learning_rate": 1.7656322711305417e-05, + "loss": 0.1216, + "step": 4278 + }, + { + "epoch": 0.25, + "grad_norm": 0.3801159653441589, + "learning_rate": 1.7655125493681012e-05, + "loss": 0.3474, + "step": 4279 + }, + { + "epoch": 0.25, + "grad_norm": 0.5049163226978106, + "learning_rate": 1.7653928010959936e-05, + "loss": 0.3951, + "step": 4280 + }, + { + "epoch": 0.25, + "grad_norm": 0.3293722513867542, + "learning_rate": 1.765273026318366e-05, + "loss": 0.2225, + "step": 4281 + }, + { + "epoch": 0.25, + "grad_norm": 1.349210674604087, + "learning_rate": 1.7651532250393666e-05, + "loss": 0.7544, + "step": 4282 + }, + { + "epoch": 0.25, + "grad_norm": 0.3886610404525146, + "learning_rate": 1.7650333972631443e-05, + "loss": 0.3501, + "step": 4283 + }, + { + "epoch": 0.25, + "grad_norm": 0.24857630478813894, + "learning_rate": 1.7649135429938477e-05, + "loss": 0.1807, + "step": 4284 + }, + { + "epoch": 0.25, + "grad_norm": 0.4471098422892603, + "learning_rate": 1.764793662235628e-05, + "loss": 0.3165, + "step": 4285 + }, + { + "epoch": 0.25, + "grad_norm": 0.6148974597430013, + "learning_rate": 1.7646737549926376e-05, + "loss": 0.4766, + "step": 4286 + }, + { + "epoch": 0.25, + "grad_norm": 0.37646323777327495, + "learning_rate": 1.764553821269027e-05, + "loss": 0.216, + "step": 4287 + }, + { + "epoch": 0.25, + "grad_norm": 0.39924535706099146, + "learning_rate": 1.764433861068951e-05, + "loss": 0.3246, + "step": 4288 + }, + { + "epoch": 0.25, + "grad_norm": 1.0115253786277327, + "learning_rate": 1.764313874396563e-05, + "loss": 0.6713, + "step": 4289 + }, + { + "epoch": 0.25, + "grad_norm": 0.37546513680467763, + "learning_rate": 1.7641938612560182e-05, + "loss": 0.2153, + "step": 4290 + }, + { + "epoch": 0.25, + "grad_norm": 0.31499422778096564, + "learning_rate": 1.7640738216514733e-05, + "loss": 0.293, + "step": 4291 + }, + { + "epoch": 0.25, + "grad_norm": 0.341004193409986, + "learning_rate": 1.7639537555870844e-05, + "loss": 0.2931, + "step": 4292 + }, + { + "epoch": 0.25, + "grad_norm": 0.5975754148604239, + "learning_rate": 1.7638336630670102e-05, + "loss": 0.3508, + "step": 4293 + }, + { + "epoch": 0.25, + "grad_norm": 0.4147059353112628, + "learning_rate": 1.763713544095409e-05, + "loss": 0.299, + "step": 4294 + }, + { + "epoch": 0.25, + "grad_norm": 0.3928079898041313, + "learning_rate": 1.7635933986764403e-05, + "loss": 0.3508, + "step": 4295 + }, + { + "epoch": 0.25, + "grad_norm": 0.559386737857805, + "learning_rate": 1.7634732268142652e-05, + "loss": 0.3621, + "step": 4296 + }, + { + "epoch": 0.25, + "grad_norm": 0.2935297955911469, + "learning_rate": 1.7633530285130452e-05, + "loss": 0.2541, + "step": 4297 + }, + { + "epoch": 0.25, + "grad_norm": 0.4767343833347403, + "learning_rate": 1.7632328037769423e-05, + "loss": 0.2457, + "step": 4298 + }, + { + "epoch": 0.25, + "grad_norm": 0.3746108706148988, + "learning_rate": 1.7631125526101206e-05, + "loss": 0.3049, + "step": 4299 + }, + { + "epoch": 0.25, + "grad_norm": 0.4020633558516471, + "learning_rate": 1.7629922750167437e-05, + "loss": 0.2688, + "step": 4300 + }, + { + "epoch": 0.25, + "grad_norm": 0.8783609175595337, + "learning_rate": 1.7628719710009777e-05, + "loss": 0.6493, + "step": 4301 + }, + { + "epoch": 0.25, + "grad_norm": 0.3907321374100405, + "learning_rate": 1.7627516405669876e-05, + "loss": 0.3237, + "step": 4302 + }, + { + "epoch": 0.25, + "grad_norm": 0.49895888305836794, + "learning_rate": 1.7626312837189412e-05, + "loss": 0.4026, + "step": 4303 + }, + { + "epoch": 0.25, + "grad_norm": 0.25989485619288133, + "learning_rate": 1.7625109004610065e-05, + "loss": 0.1937, + "step": 4304 + }, + { + "epoch": 0.25, + "grad_norm": 0.7321965943129197, + "learning_rate": 1.7623904907973515e-05, + "loss": 0.395, + "step": 4305 + }, + { + "epoch": 0.25, + "grad_norm": 0.43686726649567725, + "learning_rate": 1.762270054732147e-05, + "loss": 0.3508, + "step": 4306 + }, + { + "epoch": 0.25, + "grad_norm": 0.37833068558361926, + "learning_rate": 1.7621495922695633e-05, + "loss": 0.307, + "step": 4307 + }, + { + "epoch": 0.25, + "grad_norm": 0.6682080526735688, + "learning_rate": 1.7620291034137718e-05, + "loss": 0.4271, + "step": 4308 + }, + { + "epoch": 0.25, + "grad_norm": 0.3964426570352359, + "learning_rate": 1.7619085881689454e-05, + "loss": 0.3013, + "step": 4309 + }, + { + "epoch": 0.25, + "grad_norm": 0.23222205472935048, + "learning_rate": 1.761788046539257e-05, + "loss": 0.1052, + "step": 4310 + }, + { + "epoch": 0.25, + "grad_norm": 0.3912585076504096, + "learning_rate": 1.7616674785288815e-05, + "loss": 0.301, + "step": 4311 + }, + { + "epoch": 0.25, + "grad_norm": 0.47490678407484405, + "learning_rate": 1.761546884141994e-05, + "loss": 0.3829, + "step": 4312 + }, + { + "epoch": 0.25, + "grad_norm": 0.7637011922535032, + "learning_rate": 1.761426263382771e-05, + "loss": 0.3716, + "step": 4313 + }, + { + "epoch": 0.25, + "grad_norm": 0.506704534728562, + "learning_rate": 1.761305616255389e-05, + "loss": 0.3931, + "step": 4314 + }, + { + "epoch": 0.25, + "grad_norm": 0.36459501700346764, + "learning_rate": 1.761184942764026e-05, + "loss": 0.3025, + "step": 4315 + }, + { + "epoch": 0.25, + "grad_norm": 0.38492087075628456, + "learning_rate": 1.761064242912861e-05, + "loss": 0.2247, + "step": 4316 + }, + { + "epoch": 0.25, + "grad_norm": 0.2590479572690357, + "learning_rate": 1.7609435167060745e-05, + "loss": 0.1378, + "step": 4317 + }, + { + "epoch": 0.25, + "grad_norm": 0.505463936850372, + "learning_rate": 1.7608227641478467e-05, + "loss": 0.3552, + "step": 4318 + }, + { + "epoch": 0.25, + "grad_norm": 0.4637726016282319, + "learning_rate": 1.760701985242359e-05, + "loss": 0.3487, + "step": 4319 + }, + { + "epoch": 0.25, + "grad_norm": 0.9181606038737803, + "learning_rate": 1.7605811799937946e-05, + "loss": 0.3159, + "step": 4320 + }, + { + "epoch": 0.25, + "grad_norm": 0.4582402750612981, + "learning_rate": 1.7604603484063363e-05, + "loss": 0.325, + "step": 4321 + }, + { + "epoch": 0.25, + "grad_norm": 0.3085161695929028, + "learning_rate": 1.760339490484169e-05, + "loss": 0.2247, + "step": 4322 + }, + { + "epoch": 0.25, + "grad_norm": 0.3913027721852565, + "learning_rate": 1.760218606231478e-05, + "loss": 0.2511, + "step": 4323 + }, + { + "epoch": 0.25, + "grad_norm": 0.5046921571413169, + "learning_rate": 1.7600976956524493e-05, + "loss": 0.3392, + "step": 4324 + }, + { + "epoch": 0.25, + "grad_norm": 1.4373319914356162, + "learning_rate": 1.7599767587512698e-05, + "loss": 0.4993, + "step": 4325 + }, + { + "epoch": 0.25, + "grad_norm": 0.5257980257453084, + "learning_rate": 1.7598557955321282e-05, + "loss": 0.2709, + "step": 4326 + }, + { + "epoch": 0.25, + "grad_norm": 0.32672331423731715, + "learning_rate": 1.7597348059992128e-05, + "loss": 0.3039, + "step": 4327 + }, + { + "epoch": 0.25, + "grad_norm": 0.2592941345566562, + "learning_rate": 1.7596137901567138e-05, + "loss": 0.1403, + "step": 4328 + }, + { + "epoch": 0.25, + "grad_norm": 1.4062233302214164, + "learning_rate": 1.759492748008822e-05, + "loss": 0.8632, + "step": 4329 + }, + { + "epoch": 0.25, + "grad_norm": 0.6436373842109813, + "learning_rate": 1.759371679559729e-05, + "loss": 0.2666, + "step": 4330 + }, + { + "epoch": 0.25, + "grad_norm": 0.5413600815168292, + "learning_rate": 1.759250584813627e-05, + "loss": 0.3359, + "step": 4331 + }, + { + "epoch": 0.25, + "grad_norm": 0.6795446930850362, + "learning_rate": 1.7591294637747104e-05, + "loss": 0.4545, + "step": 4332 + }, + { + "epoch": 0.25, + "grad_norm": 0.34797795924646585, + "learning_rate": 1.7590083164471728e-05, + "loss": 0.2473, + "step": 4333 + }, + { + "epoch": 0.25, + "grad_norm": 0.4751834716100019, + "learning_rate": 1.75888714283521e-05, + "loss": 0.2814, + "step": 4334 + }, + { + "epoch": 0.25, + "grad_norm": 0.4792771622907987, + "learning_rate": 1.758765942943018e-05, + "loss": 0.3511, + "step": 4335 + }, + { + "epoch": 0.25, + "grad_norm": 0.4409034335516681, + "learning_rate": 1.7586447167747943e-05, + "loss": 0.2339, + "step": 4336 + }, + { + "epoch": 0.25, + "grad_norm": 1.0216680780688296, + "learning_rate": 1.7585234643347363e-05, + "loss": 0.4759, + "step": 4337 + }, + { + "epoch": 0.25, + "grad_norm": 0.3495006405637813, + "learning_rate": 1.7584021856270435e-05, + "loss": 0.2516, + "step": 4338 + }, + { + "epoch": 0.25, + "grad_norm": 0.399030800710068, + "learning_rate": 1.7582808806559154e-05, + "loss": 0.3012, + "step": 4339 + }, + { + "epoch": 0.25, + "grad_norm": 0.33002710223861065, + "learning_rate": 1.7581595494255533e-05, + "loss": 0.2138, + "step": 4340 + }, + { + "epoch": 0.25, + "grad_norm": 0.6027956584907409, + "learning_rate": 1.7580381919401586e-05, + "loss": 0.4314, + "step": 4341 + }, + { + "epoch": 0.25, + "grad_norm": 0.3981500370131176, + "learning_rate": 1.757916808203934e-05, + "loss": 0.3312, + "step": 4342 + }, + { + "epoch": 0.25, + "grad_norm": 0.36799863739960603, + "learning_rate": 1.757795398221083e-05, + "loss": 0.3211, + "step": 4343 + }, + { + "epoch": 0.25, + "grad_norm": 0.44567616701273444, + "learning_rate": 1.7576739619958096e-05, + "loss": 0.2847, + "step": 4344 + }, + { + "epoch": 0.25, + "grad_norm": 0.37317146489865405, + "learning_rate": 1.7575524995323195e-05, + "loss": 0.2739, + "step": 4345 + }, + { + "epoch": 0.25, + "grad_norm": 0.38858376631551034, + "learning_rate": 1.757431010834819e-05, + "loss": 0.2789, + "step": 4346 + }, + { + "epoch": 0.25, + "grad_norm": 0.7434345776855413, + "learning_rate": 1.7573094959075148e-05, + "loss": 0.5022, + "step": 4347 + }, + { + "epoch": 0.25, + "grad_norm": 0.3514566700372044, + "learning_rate": 1.757187954754616e-05, + "loss": 0.2869, + "step": 4348 + }, + { + "epoch": 0.25, + "grad_norm": 0.7291519504367259, + "learning_rate": 1.7570663873803305e-05, + "loss": 0.4462, + "step": 4349 + }, + { + "epoch": 0.25, + "grad_norm": 0.2717965523574744, + "learning_rate": 1.7569447937888686e-05, + "loss": 0.2532, + "step": 4350 + }, + { + "epoch": 0.25, + "grad_norm": 0.35466411632539085, + "learning_rate": 1.756823173984441e-05, + "loss": 0.29, + "step": 4351 + }, + { + "epoch": 0.25, + "grad_norm": 1.4631381574190476, + "learning_rate": 1.7567015279712598e-05, + "loss": 0.7348, + "step": 4352 + }, + { + "epoch": 0.25, + "grad_norm": 0.8172752249272174, + "learning_rate": 1.756579855753537e-05, + "loss": 0.3552, + "step": 4353 + }, + { + "epoch": 0.25, + "grad_norm": 0.3753765285233513, + "learning_rate": 1.756458157335486e-05, + "loss": 0.3274, + "step": 4354 + }, + { + "epoch": 0.25, + "grad_norm": 0.39089010889022724, + "learning_rate": 1.756336432721322e-05, + "loss": 0.3154, + "step": 4355 + }, + { + "epoch": 0.25, + "grad_norm": 0.23367638912187802, + "learning_rate": 1.7562146819152595e-05, + "loss": 0.1237, + "step": 4356 + }, + { + "epoch": 0.25, + "grad_norm": 0.4181181779260803, + "learning_rate": 1.7560929049215155e-05, + "loss": 0.3227, + "step": 4357 + }, + { + "epoch": 0.25, + "grad_norm": 0.8631313679687664, + "learning_rate": 1.7559711017443062e-05, + "loss": 0.4784, + "step": 4358 + }, + { + "epoch": 0.25, + "grad_norm": 0.41323073775920643, + "learning_rate": 1.7558492723878507e-05, + "loss": 0.3094, + "step": 4359 + }, + { + "epoch": 0.25, + "grad_norm": 0.3782931761755461, + "learning_rate": 1.755727416856367e-05, + "loss": 0.3099, + "step": 4360 + }, + { + "epoch": 0.25, + "grad_norm": 1.2000243266515298, + "learning_rate": 1.7556055351540757e-05, + "loss": 0.8228, + "step": 4361 + }, + { + "epoch": 0.25, + "grad_norm": 0.2778527231030494, + "learning_rate": 1.7554836272851967e-05, + "loss": 0.1797, + "step": 4362 + }, + { + "epoch": 0.25, + "grad_norm": 0.34868729040736435, + "learning_rate": 1.7553616932539522e-05, + "loss": 0.2861, + "step": 4363 + }, + { + "epoch": 0.25, + "grad_norm": 1.0895145164950082, + "learning_rate": 1.7552397330645654e-05, + "loss": 0.5217, + "step": 4364 + }, + { + "epoch": 0.25, + "grad_norm": 0.6910131523731271, + "learning_rate": 1.7551177467212585e-05, + "loss": 0.508, + "step": 4365 + }, + { + "epoch": 0.25, + "grad_norm": 0.3367083352550415, + "learning_rate": 1.7549957342282567e-05, + "loss": 0.2234, + "step": 4366 + }, + { + "epoch": 0.25, + "grad_norm": 0.4083921331597858, + "learning_rate": 1.7548736955897852e-05, + "loss": 0.3233, + "step": 4367 + }, + { + "epoch": 0.25, + "grad_norm": 0.310682312374531, + "learning_rate": 1.75475163081007e-05, + "loss": 0.1883, + "step": 4368 + }, + { + "epoch": 0.25, + "grad_norm": 0.35150586064186157, + "learning_rate": 1.7546295398933383e-05, + "loss": 0.2042, + "step": 4369 + }, + { + "epoch": 0.25, + "grad_norm": 1.1970685231744054, + "learning_rate": 1.754507422843818e-05, + "loss": 0.5545, + "step": 4370 + }, + { + "epoch": 0.25, + "grad_norm": 0.390038478824965, + "learning_rate": 1.7543852796657382e-05, + "loss": 0.3434, + "step": 4371 + }, + { + "epoch": 0.25, + "grad_norm": 0.3332006509863933, + "learning_rate": 1.7542631103633284e-05, + "loss": 0.1989, + "step": 4372 + }, + { + "epoch": 0.25, + "grad_norm": 1.2245310547498793, + "learning_rate": 1.7541409149408198e-05, + "loss": 0.8196, + "step": 4373 + }, + { + "epoch": 0.25, + "grad_norm": 0.2594242254872551, + "learning_rate": 1.7540186934024434e-05, + "loss": 0.2391, + "step": 4374 + }, + { + "epoch": 0.25, + "grad_norm": 0.3663500648240596, + "learning_rate": 1.7538964457524326e-05, + "loss": 0.1984, + "step": 4375 + }, + { + "epoch": 0.25, + "grad_norm": 1.3797280623086665, + "learning_rate": 1.7537741719950197e-05, + "loss": 0.4802, + "step": 4376 + }, + { + "epoch": 0.25, + "grad_norm": 0.9288280071064241, + "learning_rate": 1.75365187213444e-05, + "loss": 0.5739, + "step": 4377 + }, + { + "epoch": 0.25, + "grad_norm": 0.4592788301354964, + "learning_rate": 1.7535295461749285e-05, + "loss": 0.3199, + "step": 4378 + }, + { + "epoch": 0.25, + "grad_norm": 0.371263615108485, + "learning_rate": 1.753407194120721e-05, + "loss": 0.2718, + "step": 4379 + }, + { + "epoch": 0.25, + "grad_norm": 0.4478970966584819, + "learning_rate": 1.753284815976055e-05, + "loss": 0.2777, + "step": 4380 + }, + { + "epoch": 0.25, + "grad_norm": 0.43708047320580123, + "learning_rate": 1.7531624117451678e-05, + "loss": 0.263, + "step": 4381 + }, + { + "epoch": 0.25, + "grad_norm": 0.7647331876742203, + "learning_rate": 1.753039981432299e-05, + "loss": 0.3161, + "step": 4382 + }, + { + "epoch": 0.25, + "grad_norm": 1.2246597599600573, + "learning_rate": 1.7529175250416878e-05, + "loss": 0.5649, + "step": 4383 + }, + { + "epoch": 0.25, + "grad_norm": 0.4019441310451681, + "learning_rate": 1.7527950425775753e-05, + "loss": 0.2567, + "step": 4384 + }, + { + "epoch": 0.25, + "grad_norm": 0.932022146410723, + "learning_rate": 1.7526725340442028e-05, + "loss": 0.5759, + "step": 4385 + }, + { + "epoch": 0.25, + "grad_norm": 0.31841220554720956, + "learning_rate": 1.7525499994458124e-05, + "loss": 0.2613, + "step": 4386 + }, + { + "epoch": 0.25, + "grad_norm": 0.37423694836917065, + "learning_rate": 1.7524274387866483e-05, + "loss": 0.2822, + "step": 4387 + }, + { + "epoch": 0.25, + "grad_norm": 0.5350548955670364, + "learning_rate": 1.7523048520709543e-05, + "loss": 0.2867, + "step": 4388 + }, + { + "epoch": 0.25, + "grad_norm": 1.8947754710817244, + "learning_rate": 1.7521822393029758e-05, + "loss": 0.5852, + "step": 4389 + }, + { + "epoch": 0.25, + "grad_norm": 0.3998881128220424, + "learning_rate": 1.7520596004869584e-05, + "loss": 0.2702, + "step": 4390 + }, + { + "epoch": 0.25, + "grad_norm": 0.5361072708652229, + "learning_rate": 1.7519369356271492e-05, + "loss": 0.3904, + "step": 4391 + }, + { + "epoch": 0.25, + "grad_norm": 0.8123518823570961, + "learning_rate": 1.751814244727797e-05, + "loss": 0.3895, + "step": 4392 + }, + { + "epoch": 0.25, + "grad_norm": 0.6059125687840098, + "learning_rate": 1.751691527793149e-05, + "loss": 0.339, + "step": 4393 + }, + { + "epoch": 0.25, + "grad_norm": 0.3313177646320267, + "learning_rate": 1.7515687848274562e-05, + "loss": 0.2728, + "step": 4394 + }, + { + "epoch": 0.25, + "grad_norm": 0.35552239325197443, + "learning_rate": 1.7514460158349686e-05, + "loss": 0.2089, + "step": 4395 + }, + { + "epoch": 0.25, + "grad_norm": 0.4505421213963791, + "learning_rate": 1.7513232208199378e-05, + "loss": 0.3099, + "step": 4396 + }, + { + "epoch": 0.25, + "grad_norm": 0.6644199425595185, + "learning_rate": 1.751200399786616e-05, + "loss": 0.4381, + "step": 4397 + }, + { + "epoch": 0.25, + "grad_norm": 0.4423052272917962, + "learning_rate": 1.7510775527392566e-05, + "loss": 0.2984, + "step": 4398 + }, + { + "epoch": 0.25, + "grad_norm": 0.6068964287449121, + "learning_rate": 1.7509546796821144e-05, + "loss": 0.3403, + "step": 4399 + }, + { + "epoch": 0.25, + "grad_norm": 0.44069931145334174, + "learning_rate": 1.7508317806194436e-05, + "loss": 0.316, + "step": 4400 + }, + { + "epoch": 0.25, + "grad_norm": 0.30685445519676646, + "learning_rate": 1.7507088555555003e-05, + "loss": 0.1484, + "step": 4401 + }, + { + "epoch": 0.25, + "grad_norm": 0.319991819668123, + "learning_rate": 1.750585904494542e-05, + "loss": 0.2779, + "step": 4402 + }, + { + "epoch": 0.25, + "grad_norm": 0.5975686561638618, + "learning_rate": 1.7504629274408257e-05, + "loss": 0.4789, + "step": 4403 + }, + { + "epoch": 0.25, + "grad_norm": 0.8063943453403252, + "learning_rate": 1.750339924398611e-05, + "loss": 0.4771, + "step": 4404 + }, + { + "epoch": 0.25, + "grad_norm": 0.3169912413687381, + "learning_rate": 1.7502168953721564e-05, + "loss": 0.2506, + "step": 4405 + }, + { + "epoch": 0.25, + "grad_norm": 0.5907726571115687, + "learning_rate": 1.7500938403657235e-05, + "loss": 0.4054, + "step": 4406 + }, + { + "epoch": 0.25, + "grad_norm": 0.28948065757597125, + "learning_rate": 1.7499707593835728e-05, + "loss": 0.1955, + "step": 4407 + }, + { + "epoch": 0.25, + "grad_norm": 0.346293298851469, + "learning_rate": 1.749847652429967e-05, + "loss": 0.2249, + "step": 4408 + }, + { + "epoch": 0.25, + "grad_norm": 1.1109798245103102, + "learning_rate": 1.7497245195091694e-05, + "loss": 0.6656, + "step": 4409 + }, + { + "epoch": 0.25, + "grad_norm": 0.49764755117641957, + "learning_rate": 1.749601360625444e-05, + "loss": 0.3518, + "step": 4410 + }, + { + "epoch": 0.25, + "grad_norm": 0.6586021135670197, + "learning_rate": 1.7494781757830554e-05, + "loss": 0.2586, + "step": 4411 + }, + { + "epoch": 0.25, + "grad_norm": 0.415911984220887, + "learning_rate": 1.74935496498627e-05, + "loss": 0.3387, + "step": 4412 + }, + { + "epoch": 0.25, + "grad_norm": 0.3528941062652166, + "learning_rate": 1.7492317282393543e-05, + "loss": 0.1973, + "step": 4413 + }, + { + "epoch": 0.25, + "grad_norm": 0.5379242200023283, + "learning_rate": 1.7491084655465762e-05, + "loss": 0.3619, + "step": 4414 + }, + { + "epoch": 0.25, + "grad_norm": 0.4547219643409678, + "learning_rate": 1.748985176912204e-05, + "loss": 0.2714, + "step": 4415 + }, + { + "epoch": 0.25, + "grad_norm": 0.9862381216448781, + "learning_rate": 1.7488618623405075e-05, + "loss": 0.5093, + "step": 4416 + }, + { + "epoch": 0.25, + "grad_norm": 0.6014511226082759, + "learning_rate": 1.748738521835757e-05, + "loss": 0.3081, + "step": 4417 + }, + { + "epoch": 0.25, + "grad_norm": 0.42678893832728415, + "learning_rate": 1.7486151554022233e-05, + "loss": 0.266, + "step": 4418 + }, + { + "epoch": 0.25, + "grad_norm": 0.29200160850606793, + "learning_rate": 1.748491763044179e-05, + "loss": 0.1875, + "step": 4419 + }, + { + "epoch": 0.25, + "grad_norm": 0.609705572125106, + "learning_rate": 1.7483683447658976e-05, + "loss": 0.3825, + "step": 4420 + }, + { + "epoch": 0.25, + "grad_norm": 0.6734514732700916, + "learning_rate": 1.748244900571652e-05, + "loss": 0.3149, + "step": 4421 + }, + { + "epoch": 0.25, + "grad_norm": 0.5674551833098238, + "learning_rate": 1.748121430465718e-05, + "loss": 0.3457, + "step": 4422 + }, + { + "epoch": 0.25, + "grad_norm": 0.5692437298539499, + "learning_rate": 1.747997934452371e-05, + "loss": 0.3981, + "step": 4423 + }, + { + "epoch": 0.25, + "grad_norm": 0.40695746244632547, + "learning_rate": 1.7478744125358877e-05, + "loss": 0.2523, + "step": 4424 + }, + { + "epoch": 0.25, + "grad_norm": 0.26111116306772175, + "learning_rate": 1.7477508647205456e-05, + "loss": 0.2255, + "step": 4425 + }, + { + "epoch": 0.25, + "grad_norm": 0.3739424274051894, + "learning_rate": 1.7476272910106233e-05, + "loss": 0.2915, + "step": 4426 + }, + { + "epoch": 0.25, + "grad_norm": 0.5242746749994995, + "learning_rate": 1.7475036914104e-05, + "loss": 0.4205, + "step": 4427 + }, + { + "epoch": 0.25, + "grad_norm": 0.5970732747562592, + "learning_rate": 1.747380065924156e-05, + "loss": 0.409, + "step": 4428 + }, + { + "epoch": 0.25, + "grad_norm": 0.7671707463296635, + "learning_rate": 1.7472564145561725e-05, + "loss": 0.4088, + "step": 4429 + }, + { + "epoch": 0.25, + "grad_norm": 0.3501468041889073, + "learning_rate": 1.7471327373107317e-05, + "loss": 0.2827, + "step": 4430 + }, + { + "epoch": 0.25, + "grad_norm": 0.3703347723536773, + "learning_rate": 1.747009034192116e-05, + "loss": 0.1713, + "step": 4431 + }, + { + "epoch": 0.25, + "grad_norm": 0.6832880354899038, + "learning_rate": 1.7468853052046095e-05, + "loss": 0.4288, + "step": 4432 + }, + { + "epoch": 0.25, + "grad_norm": 0.385508552267273, + "learning_rate": 1.7467615503524973e-05, + "loss": 0.3287, + "step": 4433 + }, + { + "epoch": 0.25, + "grad_norm": 0.2565484503974381, + "learning_rate": 1.7466377696400646e-05, + "loss": 0.2168, + "step": 4434 + }, + { + "epoch": 0.25, + "grad_norm": 0.6083624161878634, + "learning_rate": 1.746513963071598e-05, + "loss": 0.374, + "step": 4435 + }, + { + "epoch": 0.25, + "grad_norm": 0.4290866249248875, + "learning_rate": 1.746390130651385e-05, + "loss": 0.3343, + "step": 4436 + }, + { + "epoch": 0.25, + "grad_norm": 0.4210646856691826, + "learning_rate": 1.746266272383714e-05, + "loss": 0.1923, + "step": 4437 + }, + { + "epoch": 0.25, + "grad_norm": 0.31382665898340373, + "learning_rate": 1.746142388272874e-05, + "loss": 0.2997, + "step": 4438 + }, + { + "epoch": 0.26, + "grad_norm": 0.40780261576434734, + "learning_rate": 1.746018478323155e-05, + "loss": 0.3683, + "step": 4439 + }, + { + "epoch": 0.26, + "grad_norm": 0.42413178269441665, + "learning_rate": 1.7458945425388484e-05, + "loss": 0.2996, + "step": 4440 + }, + { + "epoch": 0.26, + "grad_norm": 0.3334174550196039, + "learning_rate": 1.7457705809242455e-05, + "loss": 0.1964, + "step": 4441 + }, + { + "epoch": 0.26, + "grad_norm": 0.3571659636822857, + "learning_rate": 1.74564659348364e-05, + "loss": 0.3006, + "step": 4442 + }, + { + "epoch": 0.26, + "grad_norm": 0.8945360836656251, + "learning_rate": 1.7455225802213246e-05, + "loss": 0.5754, + "step": 4443 + }, + { + "epoch": 0.26, + "grad_norm": 0.5110983783626839, + "learning_rate": 1.7453985411415945e-05, + "loss": 0.3305, + "step": 4444 + }, + { + "epoch": 0.26, + "grad_norm": 0.40930073398439676, + "learning_rate": 1.745274476248745e-05, + "loss": 0.3161, + "step": 4445 + }, + { + "epoch": 0.26, + "grad_norm": 0.2968822772265499, + "learning_rate": 1.7451503855470722e-05, + "loss": 0.2828, + "step": 4446 + }, + { + "epoch": 0.26, + "grad_norm": 0.27604539059250005, + "learning_rate": 1.745026269040874e-05, + "loss": 0.1201, + "step": 4447 + }, + { + "epoch": 0.26, + "grad_norm": 0.4091950250734807, + "learning_rate": 1.744902126734448e-05, + "loss": 0.305, + "step": 4448 + }, + { + "epoch": 0.26, + "grad_norm": 0.4688130768266721, + "learning_rate": 1.744777958632093e-05, + "loss": 0.3995, + "step": 4449 + }, + { + "epoch": 0.26, + "grad_norm": 0.4199210326535034, + "learning_rate": 1.74465376473811e-05, + "loss": 0.2857, + "step": 4450 + }, + { + "epoch": 0.26, + "grad_norm": 0.36946539820255947, + "learning_rate": 1.7445295450567985e-05, + "loss": 0.3241, + "step": 4451 + }, + { + "epoch": 0.26, + "grad_norm": 0.30417635766535905, + "learning_rate": 1.7444052995924612e-05, + "loss": 0.2137, + "step": 4452 + }, + { + "epoch": 0.26, + "grad_norm": 0.388316362774309, + "learning_rate": 1.7442810283494002e-05, + "loss": 0.3331, + "step": 4453 + }, + { + "epoch": 0.26, + "grad_norm": 0.33429814252358525, + "learning_rate": 1.7441567313319194e-05, + "loss": 0.2181, + "step": 4454 + }, + { + "epoch": 0.26, + "grad_norm": 0.7626510299733518, + "learning_rate": 1.7440324085443227e-05, + "loss": 0.4604, + "step": 4455 + }, + { + "epoch": 0.26, + "grad_norm": 0.5831532138070068, + "learning_rate": 1.7439080599909163e-05, + "loss": 0.4221, + "step": 4456 + }, + { + "epoch": 0.26, + "grad_norm": 0.29553524419263844, + "learning_rate": 1.743783685676005e-05, + "loss": 0.2498, + "step": 4457 + }, + { + "epoch": 0.26, + "grad_norm": 0.49518512762179856, + "learning_rate": 1.743659285603897e-05, + "loss": 0.3983, + "step": 4458 + }, + { + "epoch": 0.26, + "grad_norm": 0.25390198843159745, + "learning_rate": 1.7435348597789e-05, + "loss": 0.1813, + "step": 4459 + }, + { + "epoch": 0.26, + "grad_norm": 0.36903051665820324, + "learning_rate": 1.7434104082053227e-05, + "loss": 0.2264, + "step": 4460 + }, + { + "epoch": 0.26, + "grad_norm": 0.848917048411256, + "learning_rate": 1.743285930887475e-05, + "loss": 0.4192, + "step": 4461 + }, + { + "epoch": 0.26, + "grad_norm": 0.5239600697847725, + "learning_rate": 1.7431614278296672e-05, + "loss": 0.3618, + "step": 4462 + }, + { + "epoch": 0.26, + "grad_norm": 0.39962990775437407, + "learning_rate": 1.7430368990362114e-05, + "loss": 0.2368, + "step": 4463 + }, + { + "epoch": 0.26, + "grad_norm": 0.41735657916516267, + "learning_rate": 1.7429123445114196e-05, + "loss": 0.2857, + "step": 4464 + }, + { + "epoch": 0.26, + "grad_norm": 0.32698671191140255, + "learning_rate": 1.7427877642596053e-05, + "loss": 0.2875, + "step": 4465 + }, + { + "epoch": 0.26, + "grad_norm": 0.3624786033802996, + "learning_rate": 1.7426631582850827e-05, + "loss": 0.2762, + "step": 4466 + }, + { + "epoch": 0.26, + "grad_norm": 0.5845800939907507, + "learning_rate": 1.742538526592167e-05, + "loss": 0.4098, + "step": 4467 + }, + { + "epoch": 0.26, + "grad_norm": 0.9870741959193776, + "learning_rate": 1.742413869185174e-05, + "loss": 0.6744, + "step": 4468 + }, + { + "epoch": 0.26, + "grad_norm": 0.42257891785206114, + "learning_rate": 1.7422891860684202e-05, + "loss": 0.2697, + "step": 4469 + }, + { + "epoch": 0.26, + "grad_norm": 0.32188754467568104, + "learning_rate": 1.7421644772462247e-05, + "loss": 0.2059, + "step": 4470 + }, + { + "epoch": 0.26, + "grad_norm": 0.4304055386861345, + "learning_rate": 1.7420397427229045e-05, + "loss": 0.2905, + "step": 4471 + }, + { + "epoch": 0.26, + "grad_norm": 0.33978838217773394, + "learning_rate": 1.7419149825027802e-05, + "loss": 0.3002, + "step": 4472 + }, + { + "epoch": 0.26, + "grad_norm": 0.3788118884308564, + "learning_rate": 1.7417901965901717e-05, + "loss": 0.3153, + "step": 4473 + }, + { + "epoch": 0.26, + "grad_norm": 0.521900293581829, + "learning_rate": 1.7416653849894008e-05, + "loss": 0.3964, + "step": 4474 + }, + { + "epoch": 0.26, + "grad_norm": 0.3665116506708937, + "learning_rate": 1.7415405477047895e-05, + "loss": 0.2777, + "step": 4475 + }, + { + "epoch": 0.26, + "grad_norm": 0.9883665529741489, + "learning_rate": 1.741415684740661e-05, + "loss": 0.4042, + "step": 4476 + }, + { + "epoch": 0.26, + "grad_norm": 0.24629002200117167, + "learning_rate": 1.741290796101339e-05, + "loss": 0.2254, + "step": 4477 + }, + { + "epoch": 0.26, + "grad_norm": 0.3537787580716675, + "learning_rate": 1.7411658817911487e-05, + "loss": 0.3092, + "step": 4478 + }, + { + "epoch": 0.26, + "grad_norm": 0.6388605821733183, + "learning_rate": 1.741040941814416e-05, + "loss": 0.4883, + "step": 4479 + }, + { + "epoch": 0.26, + "grad_norm": 0.6188260255813043, + "learning_rate": 1.740915976175467e-05, + "loss": 0.3133, + "step": 4480 + }, + { + "epoch": 0.26, + "grad_norm": 0.4176908201497853, + "learning_rate": 1.74079098487863e-05, + "loss": 0.3111, + "step": 4481 + }, + { + "epoch": 0.26, + "grad_norm": 0.45129474320870905, + "learning_rate": 1.7406659679282326e-05, + "loss": 0.3424, + "step": 4482 + }, + { + "epoch": 0.26, + "grad_norm": 0.4174295494281297, + "learning_rate": 1.740540925328605e-05, + "loss": 0.2623, + "step": 4483 + }, + { + "epoch": 0.26, + "grad_norm": 0.3767565914818703, + "learning_rate": 1.7404158570840765e-05, + "loss": 0.282, + "step": 4484 + }, + { + "epoch": 0.26, + "grad_norm": 0.2783113500296502, + "learning_rate": 1.7402907631989793e-05, + "loss": 0.2917, + "step": 4485 + }, + { + "epoch": 0.26, + "grad_norm": 0.9873955581325996, + "learning_rate": 1.7401656436776445e-05, + "loss": 0.4259, + "step": 4486 + }, + { + "epoch": 0.26, + "grad_norm": 0.3845423744262796, + "learning_rate": 1.740040498524405e-05, + "loss": 0.2788, + "step": 4487 + }, + { + "epoch": 0.26, + "grad_norm": 0.7272016165043925, + "learning_rate": 1.7399153277435954e-05, + "loss": 0.5691, + "step": 4488 + }, + { + "epoch": 0.26, + "grad_norm": 0.40153871463464896, + "learning_rate": 1.7397901313395497e-05, + "loss": 0.2958, + "step": 4489 + }, + { + "epoch": 0.26, + "grad_norm": 0.39987319741953187, + "learning_rate": 1.7396649093166034e-05, + "loss": 0.3051, + "step": 4490 + }, + { + "epoch": 0.26, + "grad_norm": 0.35067447545845665, + "learning_rate": 1.7395396616790932e-05, + "loss": 0.1451, + "step": 4491 + }, + { + "epoch": 0.26, + "grad_norm": 0.9550529008649429, + "learning_rate": 1.7394143884313562e-05, + "loss": 0.7368, + "step": 4492 + }, + { + "epoch": 0.26, + "grad_norm": 0.30621419928245114, + "learning_rate": 1.7392890895777305e-05, + "loss": 0.2344, + "step": 4493 + }, + { + "epoch": 0.26, + "grad_norm": 1.4312493233871788, + "learning_rate": 1.7391637651225556e-05, + "loss": 0.7849, + "step": 4494 + }, + { + "epoch": 0.26, + "grad_norm": 0.6076999334668871, + "learning_rate": 1.7390384150701715e-05, + "loss": 0.4489, + "step": 4495 + }, + { + "epoch": 0.26, + "grad_norm": 0.38957330875145113, + "learning_rate": 1.738913039424919e-05, + "loss": 0.1873, + "step": 4496 + }, + { + "epoch": 0.26, + "grad_norm": 0.34020086053360765, + "learning_rate": 1.7387876381911395e-05, + "loss": 0.2616, + "step": 4497 + }, + { + "epoch": 0.26, + "grad_norm": 0.43665721845984257, + "learning_rate": 1.7386622113731758e-05, + "loss": 0.3332, + "step": 4498 + }, + { + "epoch": 0.26, + "grad_norm": 0.35317580614898686, + "learning_rate": 1.7385367589753714e-05, + "loss": 0.2111, + "step": 4499 + }, + { + "epoch": 0.26, + "grad_norm": 0.8179148641200064, + "learning_rate": 1.738411281002071e-05, + "loss": 0.4621, + "step": 4500 + }, + { + "epoch": 0.26, + "grad_norm": 0.37262485183921623, + "learning_rate": 1.7382857774576195e-05, + "loss": 0.3425, + "step": 4501 + }, + { + "epoch": 0.26, + "grad_norm": 0.4434924061456271, + "learning_rate": 1.7381602483463637e-05, + "loss": 0.2924, + "step": 4502 + }, + { + "epoch": 0.26, + "grad_norm": 0.23825968934381375, + "learning_rate": 1.73803469367265e-05, + "loss": 0.1482, + "step": 4503 + }, + { + "epoch": 0.26, + "grad_norm": 1.3993363887214298, + "learning_rate": 1.7379091134408265e-05, + "loss": 0.9296, + "step": 4504 + }, + { + "epoch": 0.26, + "grad_norm": 0.3323388490174505, + "learning_rate": 1.737783507655242e-05, + "loss": 0.2951, + "step": 4505 + }, + { + "epoch": 0.26, + "grad_norm": 0.596493427646773, + "learning_rate": 1.7376578763202465e-05, + "loss": 0.3465, + "step": 4506 + }, + { + "epoch": 0.26, + "grad_norm": 0.85841984621207, + "learning_rate": 1.7375322194401905e-05, + "loss": 0.5163, + "step": 4507 + }, + { + "epoch": 0.26, + "grad_norm": 0.3979185219008631, + "learning_rate": 1.7374065370194253e-05, + "loss": 0.3023, + "step": 4508 + }, + { + "epoch": 0.26, + "grad_norm": 0.3792909090305572, + "learning_rate": 1.7372808290623034e-05, + "loss": 0.2717, + "step": 4509 + }, + { + "epoch": 0.26, + "grad_norm": 0.3288322471843398, + "learning_rate": 1.7371550955731786e-05, + "loss": 0.2334, + "step": 4510 + }, + { + "epoch": 0.26, + "grad_norm": 0.3603072373462248, + "learning_rate": 1.737029336556404e-05, + "loss": 0.2721, + "step": 4511 + }, + { + "epoch": 0.26, + "grad_norm": 1.0258413937489468, + "learning_rate": 1.7369035520163355e-05, + "loss": 0.4195, + "step": 4512 + }, + { + "epoch": 0.26, + "grad_norm": 0.38788937506634363, + "learning_rate": 1.7367777419573285e-05, + "loss": 0.3458, + "step": 4513 + }, + { + "epoch": 0.26, + "grad_norm": 0.3663495306117589, + "learning_rate": 1.73665190638374e-05, + "loss": 0.2827, + "step": 4514 + }, + { + "epoch": 0.26, + "grad_norm": 0.35904037745450235, + "learning_rate": 1.7365260452999277e-05, + "loss": 0.239, + "step": 4515 + }, + { + "epoch": 0.26, + "grad_norm": 0.368230106992383, + "learning_rate": 1.73640015871025e-05, + "loss": 0.2299, + "step": 4516 + }, + { + "epoch": 0.26, + "grad_norm": 0.4124820648794057, + "learning_rate": 1.7362742466190668e-05, + "loss": 0.2929, + "step": 4517 + }, + { + "epoch": 0.26, + "grad_norm": 0.6313760443488636, + "learning_rate": 1.736148309030738e-05, + "loss": 0.3801, + "step": 4518 + }, + { + "epoch": 0.26, + "grad_norm": 0.8649835515367721, + "learning_rate": 1.7360223459496244e-05, + "loss": 0.3975, + "step": 4519 + }, + { + "epoch": 0.26, + "grad_norm": 0.3995323752107863, + "learning_rate": 1.735896357380089e-05, + "loss": 0.2723, + "step": 4520 + }, + { + "epoch": 0.26, + "grad_norm": 0.34544221499812, + "learning_rate": 1.7357703433264945e-05, + "loss": 0.3229, + "step": 4521 + }, + { + "epoch": 0.26, + "grad_norm": 0.2902986551221321, + "learning_rate": 1.7356443037932046e-05, + "loss": 0.1643, + "step": 4522 + }, + { + "epoch": 0.26, + "grad_norm": 0.4838021066394585, + "learning_rate": 1.7355182387845843e-05, + "loss": 0.3209, + "step": 4523 + }, + { + "epoch": 0.26, + "grad_norm": 0.5004591776306092, + "learning_rate": 1.7353921483049985e-05, + "loss": 0.3296, + "step": 4524 + }, + { + "epoch": 0.26, + "grad_norm": 0.4258313917846841, + "learning_rate": 1.7352660323588146e-05, + "loss": 0.2727, + "step": 4525 + }, + { + "epoch": 0.26, + "grad_norm": 0.4033155476729041, + "learning_rate": 1.7351398909503995e-05, + "loss": 0.3081, + "step": 4526 + }, + { + "epoch": 0.26, + "grad_norm": 0.5300104630284196, + "learning_rate": 1.7350137240841218e-05, + "loss": 0.4131, + "step": 4527 + }, + { + "epoch": 0.26, + "grad_norm": 0.6412206824998394, + "learning_rate": 1.73488753176435e-05, + "loss": 0.5222, + "step": 4528 + }, + { + "epoch": 0.26, + "grad_norm": 0.3077706381580906, + "learning_rate": 1.7347613139954548e-05, + "loss": 0.2402, + "step": 4529 + }, + { + "epoch": 0.26, + "grad_norm": 0.6688065829605588, + "learning_rate": 1.734635070781807e-05, + "loss": 0.538, + "step": 4530 + }, + { + "epoch": 0.26, + "grad_norm": 0.35909638887144074, + "learning_rate": 1.734508802127778e-05, + "loss": 0.2608, + "step": 4531 + }, + { + "epoch": 0.26, + "grad_norm": 0.41579088397011704, + "learning_rate": 1.734382508037741e-05, + "loss": 0.1842, + "step": 4532 + }, + { + "epoch": 0.26, + "grad_norm": 0.4211404512665711, + "learning_rate": 1.7342561885160694e-05, + "loss": 0.3147, + "step": 4533 + }, + { + "epoch": 0.26, + "grad_norm": 0.8256111760795807, + "learning_rate": 1.7341298435671373e-05, + "loss": 0.5434, + "step": 4534 + }, + { + "epoch": 0.26, + "grad_norm": 0.5292156928559316, + "learning_rate": 1.7340034731953204e-05, + "loss": 0.2707, + "step": 4535 + }, + { + "epoch": 0.26, + "grad_norm": 0.3626707079391625, + "learning_rate": 1.7338770774049948e-05, + "loss": 0.2969, + "step": 4536 + }, + { + "epoch": 0.26, + "grad_norm": 0.2987530938150695, + "learning_rate": 1.733750656200538e-05, + "loss": 0.2727, + "step": 4537 + }, + { + "epoch": 0.26, + "grad_norm": 0.41984643592449417, + "learning_rate": 1.733624209586327e-05, + "loss": 0.0769, + "step": 4538 + }, + { + "epoch": 0.26, + "grad_norm": 0.4498582260094257, + "learning_rate": 1.733497737566741e-05, + "loss": 0.3627, + "step": 4539 + }, + { + "epoch": 0.26, + "grad_norm": 0.7896624131435914, + "learning_rate": 1.7333712401461602e-05, + "loss": 0.5321, + "step": 4540 + }, + { + "epoch": 0.26, + "grad_norm": 0.33277012471811757, + "learning_rate": 1.7332447173289648e-05, + "loss": 0.3177, + "step": 4541 + }, + { + "epoch": 0.26, + "grad_norm": 0.4780594140246693, + "learning_rate": 1.7331181691195364e-05, + "loss": 0.2849, + "step": 4542 + }, + { + "epoch": 0.26, + "grad_norm": 0.35576731933438077, + "learning_rate": 1.7329915955222578e-05, + "loss": 0.2063, + "step": 4543 + }, + { + "epoch": 0.26, + "grad_norm": 0.5907181564894486, + "learning_rate": 1.7328649965415108e-05, + "loss": 0.358, + "step": 4544 + }, + { + "epoch": 0.26, + "grad_norm": 0.31900993570485137, + "learning_rate": 1.732738372181681e-05, + "loss": 0.2571, + "step": 4545 + }, + { + "epoch": 0.26, + "grad_norm": 0.8407790010179014, + "learning_rate": 1.7326117224471534e-05, + "loss": 0.4832, + "step": 4546 + }, + { + "epoch": 0.26, + "grad_norm": 0.6475543922825446, + "learning_rate": 1.7324850473423124e-05, + "loss": 0.4714, + "step": 4547 + }, + { + "epoch": 0.26, + "grad_norm": 0.4103670098764781, + "learning_rate": 1.7323583468715464e-05, + "loss": 0.2612, + "step": 4548 + }, + { + "epoch": 0.26, + "grad_norm": 0.25746760920249057, + "learning_rate": 1.732231621039242e-05, + "loss": 0.2225, + "step": 4549 + }, + { + "epoch": 0.26, + "grad_norm": 0.45385734178505405, + "learning_rate": 1.732104869849788e-05, + "loss": 0.3283, + "step": 4550 + }, + { + "epoch": 0.26, + "grad_norm": 0.5715738690411651, + "learning_rate": 1.731978093307574e-05, + "loss": 0.3138, + "step": 4551 + }, + { + "epoch": 0.26, + "grad_norm": 0.47645140136250524, + "learning_rate": 1.7318512914169903e-05, + "loss": 0.3329, + "step": 4552 + }, + { + "epoch": 0.26, + "grad_norm": 0.8370401653202683, + "learning_rate": 1.7317244641824275e-05, + "loss": 0.5012, + "step": 4553 + }, + { + "epoch": 0.26, + "grad_norm": 0.40217735496250034, + "learning_rate": 1.731597611608278e-05, + "loss": 0.319, + "step": 4554 + }, + { + "epoch": 0.26, + "grad_norm": 0.22407629826832084, + "learning_rate": 1.731470733698935e-05, + "loss": 0.137, + "step": 4555 + }, + { + "epoch": 0.26, + "grad_norm": 0.43215683904816754, + "learning_rate": 1.7313438304587918e-05, + "loss": 0.3071, + "step": 4556 + }, + { + "epoch": 0.26, + "grad_norm": 0.318869649065026, + "learning_rate": 1.731216901892243e-05, + "loss": 0.2901, + "step": 4557 + }, + { + "epoch": 0.26, + "grad_norm": 0.8355303688845711, + "learning_rate": 1.7310899480036845e-05, + "loss": 0.3845, + "step": 4558 + }, + { + "epoch": 0.26, + "grad_norm": 0.8346211453513296, + "learning_rate": 1.7309629687975126e-05, + "loss": 0.5678, + "step": 4559 + }, + { + "epoch": 0.26, + "grad_norm": 0.3490837116866384, + "learning_rate": 1.730835964278124e-05, + "loss": 0.2892, + "step": 4560 + }, + { + "epoch": 0.26, + "grad_norm": 0.32214562875779884, + "learning_rate": 1.7307089344499178e-05, + "loss": 0.2011, + "step": 4561 + }, + { + "epoch": 0.26, + "grad_norm": 0.4094468270950029, + "learning_rate": 1.730581879317293e-05, + "loss": 0.2334, + "step": 4562 + }, + { + "epoch": 0.26, + "grad_norm": 0.6224042650550641, + "learning_rate": 1.730454798884649e-05, + "loss": 0.3563, + "step": 4563 + }, + { + "epoch": 0.26, + "grad_norm": 0.379128591299934, + "learning_rate": 1.7303276931563862e-05, + "loss": 0.2706, + "step": 4564 + }, + { + "epoch": 0.26, + "grad_norm": 0.6789909142350193, + "learning_rate": 1.730200562136907e-05, + "loss": 0.4433, + "step": 4565 + }, + { + "epoch": 0.26, + "grad_norm": 0.4124711177122947, + "learning_rate": 1.7300734058306138e-05, + "loss": 0.335, + "step": 4566 + }, + { + "epoch": 0.26, + "grad_norm": 0.2858262013205675, + "learning_rate": 1.72994622424191e-05, + "loss": 0.2076, + "step": 4567 + }, + { + "epoch": 0.26, + "grad_norm": 0.30957498941420897, + "learning_rate": 1.7298190173751996e-05, + "loss": 0.2559, + "step": 4568 + }, + { + "epoch": 0.26, + "grad_norm": 0.45178314832303773, + "learning_rate": 1.7296917852348882e-05, + "loss": 0.342, + "step": 4569 + }, + { + "epoch": 0.26, + "grad_norm": 0.6733044811452652, + "learning_rate": 1.7295645278253817e-05, + "loss": 0.5276, + "step": 4570 + }, + { + "epoch": 0.26, + "grad_norm": 0.49577571076452776, + "learning_rate": 1.729437245151087e-05, + "loss": 0.2828, + "step": 4571 + }, + { + "epoch": 0.26, + "grad_norm": 0.3413429533842971, + "learning_rate": 1.7293099372164118e-05, + "loss": 0.2894, + "step": 4572 + }, + { + "epoch": 0.26, + "grad_norm": 0.3407676552622252, + "learning_rate": 1.729182604025765e-05, + "loss": 0.2752, + "step": 4573 + }, + { + "epoch": 0.26, + "grad_norm": 0.561291828414926, + "learning_rate": 1.729055245583556e-05, + "loss": 0.3268, + "step": 4574 + }, + { + "epoch": 0.26, + "grad_norm": 0.29530320708055313, + "learning_rate": 1.728927861894195e-05, + "loss": 0.2669, + "step": 4575 + }, + { + "epoch": 0.26, + "grad_norm": 0.3881094912526957, + "learning_rate": 1.7288004529620935e-05, + "loss": 0.3326, + "step": 4576 + }, + { + "epoch": 0.26, + "grad_norm": 1.1701078373770246, + "learning_rate": 1.7286730187916635e-05, + "loss": 0.8335, + "step": 4577 + }, + { + "epoch": 0.26, + "grad_norm": 0.31692692063010275, + "learning_rate": 1.7285455593873183e-05, + "loss": 0.2234, + "step": 4578 + }, + { + "epoch": 0.26, + "grad_norm": 0.7275364835658681, + "learning_rate": 1.728418074753472e-05, + "loss": 0.5326, + "step": 4579 + }, + { + "epoch": 0.26, + "grad_norm": 0.3451992181340388, + "learning_rate": 1.7282905648945386e-05, + "loss": 0.342, + "step": 4580 + }, + { + "epoch": 0.26, + "grad_norm": 0.2693554960138304, + "learning_rate": 1.7281630298149346e-05, + "loss": 0.2137, + "step": 4581 + }, + { + "epoch": 0.26, + "grad_norm": 0.4373966659922116, + "learning_rate": 1.728035469519076e-05, + "loss": 0.2423, + "step": 4582 + }, + { + "epoch": 0.26, + "grad_norm": 0.8356130794756954, + "learning_rate": 1.7279078840113805e-05, + "loss": 0.4754, + "step": 4583 + }, + { + "epoch": 0.26, + "grad_norm": 0.3556758936236917, + "learning_rate": 1.7277802732962662e-05, + "loss": 0.2365, + "step": 4584 + }, + { + "epoch": 0.26, + "grad_norm": 0.3887816940254315, + "learning_rate": 1.7276526373781525e-05, + "loss": 0.3398, + "step": 4585 + }, + { + "epoch": 0.26, + "grad_norm": 0.6248091341949249, + "learning_rate": 1.7275249762614592e-05, + "loss": 0.4088, + "step": 4586 + }, + { + "epoch": 0.26, + "grad_norm": 0.2780554084812607, + "learning_rate": 1.727397289950607e-05, + "loss": 0.1789, + "step": 4587 + }, + { + "epoch": 0.26, + "grad_norm": 0.35914471939866316, + "learning_rate": 1.7272695784500185e-05, + "loss": 0.2958, + "step": 4588 + }, + { + "epoch": 0.26, + "grad_norm": 1.1927746460401998, + "learning_rate": 1.7271418417641153e-05, + "loss": 0.7851, + "step": 4589 + }, + { + "epoch": 0.26, + "grad_norm": 0.3336997350745605, + "learning_rate": 1.7270140798973215e-05, + "loss": 0.259, + "step": 4590 + }, + { + "epoch": 0.26, + "grad_norm": 0.7934746268294893, + "learning_rate": 1.7268862928540616e-05, + "loss": 0.3916, + "step": 4591 + }, + { + "epoch": 0.26, + "grad_norm": 0.440992452385509, + "learning_rate": 1.7267584806387604e-05, + "loss": 0.3606, + "step": 4592 + }, + { + "epoch": 0.26, + "grad_norm": 0.3643981449840152, + "learning_rate": 1.726630643255844e-05, + "loss": 0.2713, + "step": 4593 + }, + { + "epoch": 0.26, + "grad_norm": 0.23831222638239447, + "learning_rate": 1.7265027807097402e-05, + "loss": 0.1099, + "step": 4594 + }, + { + "epoch": 0.26, + "grad_norm": 0.9794922046894605, + "learning_rate": 1.726374893004876e-05, + "loss": 0.641, + "step": 4595 + }, + { + "epoch": 0.26, + "grad_norm": 0.3302361301129314, + "learning_rate": 1.7262469801456806e-05, + "loss": 0.2753, + "step": 4596 + }, + { + "epoch": 0.26, + "grad_norm": 0.6913170287228856, + "learning_rate": 1.7261190421365836e-05, + "loss": 0.3405, + "step": 4597 + }, + { + "epoch": 0.26, + "grad_norm": 0.7638555754563725, + "learning_rate": 1.7259910789820152e-05, + "loss": 0.5113, + "step": 4598 + }, + { + "epoch": 0.26, + "grad_norm": 0.30601965328502595, + "learning_rate": 1.7258630906864068e-05, + "loss": 0.203, + "step": 4599 + }, + { + "epoch": 0.26, + "grad_norm": 0.35651461621894115, + "learning_rate": 1.7257350772541914e-05, + "loss": 0.2329, + "step": 4600 + }, + { + "epoch": 0.26, + "grad_norm": 0.8782920891777145, + "learning_rate": 1.725607038689801e-05, + "loss": 0.6243, + "step": 4601 + }, + { + "epoch": 0.26, + "grad_norm": 0.42622194472487185, + "learning_rate": 1.7254789749976703e-05, + "loss": 0.2775, + "step": 4602 + }, + { + "epoch": 0.26, + "grad_norm": 1.1987544832356745, + "learning_rate": 1.7253508861822338e-05, + "loss": 0.4799, + "step": 4603 + }, + { + "epoch": 0.26, + "grad_norm": 0.37401071978627676, + "learning_rate": 1.725222772247927e-05, + "loss": 0.2316, + "step": 4604 + }, + { + "epoch": 0.26, + "grad_norm": 0.4714683560992018, + "learning_rate": 1.725094633199187e-05, + "loss": 0.279, + "step": 4605 + }, + { + "epoch": 0.26, + "grad_norm": 0.3252260992389096, + "learning_rate": 1.7249664690404514e-05, + "loss": 0.1989, + "step": 4606 + }, + { + "epoch": 0.26, + "grad_norm": 1.2681138371986997, + "learning_rate": 1.7248382797761576e-05, + "loss": 0.4433, + "step": 4607 + }, + { + "epoch": 0.26, + "grad_norm": 0.38382751971823414, + "learning_rate": 1.7247100654107458e-05, + "loss": 0.2707, + "step": 4608 + }, + { + "epoch": 0.26, + "grad_norm": 0.5398894230168891, + "learning_rate": 1.724581825948655e-05, + "loss": 0.3814, + "step": 4609 + }, + { + "epoch": 0.26, + "grad_norm": 0.7840013858936152, + "learning_rate": 1.7244535613943273e-05, + "loss": 0.3339, + "step": 4610 + }, + { + "epoch": 0.26, + "grad_norm": 0.4177543238883827, + "learning_rate": 1.7243252717522037e-05, + "loss": 0.2759, + "step": 4611 + }, + { + "epoch": 0.26, + "grad_norm": 0.2814456169503759, + "learning_rate": 1.724196957026727e-05, + "loss": 0.2205, + "step": 4612 + }, + { + "epoch": 0.27, + "grad_norm": 1.286573728865289, + "learning_rate": 1.724068617222341e-05, + "loss": 0.4222, + "step": 4613 + }, + { + "epoch": 0.27, + "grad_norm": 0.3835480722641368, + "learning_rate": 1.7239402523434898e-05, + "loss": 0.2947, + "step": 4614 + }, + { + "epoch": 0.27, + "grad_norm": 0.9076811236266914, + "learning_rate": 1.723811862394619e-05, + "loss": 0.5795, + "step": 4615 + }, + { + "epoch": 0.27, + "grad_norm": 0.4121511463726725, + "learning_rate": 1.7236834473801744e-05, + "loss": 0.344, + "step": 4616 + }, + { + "epoch": 0.27, + "grad_norm": 0.33727558721149664, + "learning_rate": 1.723555007304603e-05, + "loss": 0.2264, + "step": 4617 + }, + { + "epoch": 0.27, + "grad_norm": 0.49247949403688007, + "learning_rate": 1.7234265421723528e-05, + "loss": 0.2619, + "step": 4618 + }, + { + "epoch": 0.27, + "grad_norm": 0.7509746626101724, + "learning_rate": 1.7232980519878727e-05, + "loss": 0.4011, + "step": 4619 + }, + { + "epoch": 0.27, + "grad_norm": 0.3204159675138603, + "learning_rate": 1.723169536755612e-05, + "loss": 0.2003, + "step": 4620 + }, + { + "epoch": 0.27, + "grad_norm": 0.41050620070576016, + "learning_rate": 1.7230409964800215e-05, + "loss": 0.3201, + "step": 4621 + }, + { + "epoch": 0.27, + "grad_norm": 1.120953694972054, + "learning_rate": 1.7229124311655524e-05, + "loss": 0.5876, + "step": 4622 + }, + { + "epoch": 0.27, + "grad_norm": 0.34506829258410193, + "learning_rate": 1.722783840816657e-05, + "loss": 0.2215, + "step": 4623 + }, + { + "epoch": 0.27, + "grad_norm": 0.39872366387719055, + "learning_rate": 1.7226552254377883e-05, + "loss": 0.3462, + "step": 4624 + }, + { + "epoch": 0.27, + "grad_norm": 0.5939456678866099, + "learning_rate": 1.7225265850333997e-05, + "loss": 0.4869, + "step": 4625 + }, + { + "epoch": 0.27, + "grad_norm": 0.3497056047001658, + "learning_rate": 1.7223979196079466e-05, + "loss": 0.2402, + "step": 4626 + }, + { + "epoch": 0.27, + "grad_norm": 0.31503886763827527, + "learning_rate": 1.7222692291658853e-05, + "loss": 0.2371, + "step": 4627 + }, + { + "epoch": 0.27, + "grad_norm": 0.5476162155781016, + "learning_rate": 1.7221405137116712e-05, + "loss": 0.3956, + "step": 4628 + }, + { + "epoch": 0.27, + "grad_norm": 0.4289308831084935, + "learning_rate": 1.722011773249762e-05, + "loss": 0.2724, + "step": 4629 + }, + { + "epoch": 0.27, + "grad_norm": 0.45957358140657234, + "learning_rate": 1.7218830077846164e-05, + "loss": 0.329, + "step": 4630 + }, + { + "epoch": 0.27, + "grad_norm": 0.4495152254009832, + "learning_rate": 1.7217542173206932e-05, + "loss": 0.3848, + "step": 4631 + }, + { + "epoch": 0.27, + "grad_norm": 0.3251634536660676, + "learning_rate": 1.7216254018624524e-05, + "loss": 0.2706, + "step": 4632 + }, + { + "epoch": 0.27, + "grad_norm": 0.36095585230014704, + "learning_rate": 1.7214965614143554e-05, + "loss": 0.1692, + "step": 4633 + }, + { + "epoch": 0.27, + "grad_norm": 0.42598816677891027, + "learning_rate": 1.721367695980863e-05, + "loss": 0.2535, + "step": 4634 + }, + { + "epoch": 0.27, + "grad_norm": 0.37269976982163927, + "learning_rate": 1.7212388055664385e-05, + "loss": 0.2948, + "step": 4635 + }, + { + "epoch": 0.27, + "grad_norm": 0.3768645284351786, + "learning_rate": 1.7211098901755453e-05, + "loss": 0.2551, + "step": 4636 + }, + { + "epoch": 0.27, + "grad_norm": 0.7238591197734653, + "learning_rate": 1.7209809498126473e-05, + "loss": 0.4616, + "step": 4637 + }, + { + "epoch": 0.27, + "grad_norm": 0.46787631269308716, + "learning_rate": 1.72085198448221e-05, + "loss": 0.327, + "step": 4638 + }, + { + "epoch": 0.27, + "grad_norm": 0.28015550313024745, + "learning_rate": 1.7207229941887e-05, + "loss": 0.2313, + "step": 4639 + }, + { + "epoch": 0.27, + "grad_norm": 0.32694743935340276, + "learning_rate": 1.7205939789365834e-05, + "loss": 0.2899, + "step": 4640 + }, + { + "epoch": 0.27, + "grad_norm": 0.57415873988465, + "learning_rate": 1.720464938730328e-05, + "loss": 0.3279, + "step": 4641 + }, + { + "epoch": 0.27, + "grad_norm": 0.4253211319708633, + "learning_rate": 1.720335873574403e-05, + "loss": 0.3365, + "step": 4642 + }, + { + "epoch": 0.27, + "grad_norm": 0.4051995236013618, + "learning_rate": 1.7202067834732778e-05, + "loss": 0.3115, + "step": 4643 + }, + { + "epoch": 0.27, + "grad_norm": 0.4540323810623343, + "learning_rate": 1.7200776684314226e-05, + "loss": 0.3126, + "step": 4644 + }, + { + "epoch": 0.27, + "grad_norm": 0.3539122334412321, + "learning_rate": 1.7199485284533088e-05, + "loss": 0.2621, + "step": 4645 + }, + { + "epoch": 0.27, + "grad_norm": 0.26353112650494664, + "learning_rate": 1.7198193635434083e-05, + "loss": 0.0729, + "step": 4646 + }, + { + "epoch": 0.27, + "grad_norm": 0.40396435081770415, + "learning_rate": 1.719690173706194e-05, + "loss": 0.3095, + "step": 4647 + }, + { + "epoch": 0.27, + "grad_norm": 0.33135791809707316, + "learning_rate": 1.71956095894614e-05, + "loss": 0.3202, + "step": 4648 + }, + { + "epoch": 0.27, + "grad_norm": 0.8182839067779433, + "learning_rate": 1.719431719267721e-05, + "loss": 0.4072, + "step": 4649 + }, + { + "epoch": 0.27, + "grad_norm": 0.4000599760224488, + "learning_rate": 1.7193024546754125e-05, + "loss": 0.3128, + "step": 4650 + }, + { + "epoch": 0.27, + "grad_norm": 0.5156932368052041, + "learning_rate": 1.719173165173691e-05, + "loss": 0.406, + "step": 4651 + }, + { + "epoch": 0.27, + "grad_norm": 0.2688587493026527, + "learning_rate": 1.7190438507670337e-05, + "loss": 0.1869, + "step": 4652 + }, + { + "epoch": 0.27, + "grad_norm": 0.36163058971972184, + "learning_rate": 1.7189145114599188e-05, + "loss": 0.3013, + "step": 4653 + }, + { + "epoch": 0.27, + "grad_norm": 0.787712368896139, + "learning_rate": 1.718785147256825e-05, + "loss": 0.4917, + "step": 4654 + }, + { + "epoch": 0.27, + "grad_norm": 0.3618209143003872, + "learning_rate": 1.7186557581622327e-05, + "loss": 0.3338, + "step": 4655 + }, + { + "epoch": 0.27, + "grad_norm": 0.32715965622544163, + "learning_rate": 1.7185263441806227e-05, + "loss": 0.2166, + "step": 4656 + }, + { + "epoch": 0.27, + "grad_norm": 0.5405807570579455, + "learning_rate": 1.7183969053164757e-05, + "loss": 0.3873, + "step": 4657 + }, + { + "epoch": 0.27, + "grad_norm": 0.33734823448871526, + "learning_rate": 1.718267441574275e-05, + "loss": 0.2136, + "step": 4658 + }, + { + "epoch": 0.27, + "grad_norm": 0.41124631225255276, + "learning_rate": 1.718137952958504e-05, + "loss": 0.2576, + "step": 4659 + }, + { + "epoch": 0.27, + "grad_norm": 0.372867877130881, + "learning_rate": 1.7180084394736464e-05, + "loss": 0.3317, + "step": 4660 + }, + { + "epoch": 0.27, + "grad_norm": 1.2657038079052518, + "learning_rate": 1.717878901124187e-05, + "loss": 0.8484, + "step": 4661 + }, + { + "epoch": 0.27, + "grad_norm": 0.30325582500593035, + "learning_rate": 1.7177493379146123e-05, + "loss": 0.1239, + "step": 4662 + }, + { + "epoch": 0.27, + "grad_norm": 0.32165604472772885, + "learning_rate": 1.717619749849409e-05, + "loss": 0.296, + "step": 4663 + }, + { + "epoch": 0.27, + "grad_norm": 0.30992102992511755, + "learning_rate": 1.7174901369330648e-05, + "loss": 0.2242, + "step": 4664 + }, + { + "epoch": 0.27, + "grad_norm": 0.48324249109187434, + "learning_rate": 1.7173604991700678e-05, + "loss": 0.363, + "step": 4665 + }, + { + "epoch": 0.27, + "grad_norm": 0.403222549687474, + "learning_rate": 1.7172308365649077e-05, + "loss": 0.2954, + "step": 4666 + }, + { + "epoch": 0.27, + "grad_norm": 0.45450122410769483, + "learning_rate": 1.7171011491220744e-05, + "loss": 0.3464, + "step": 4667 + }, + { + "epoch": 0.27, + "grad_norm": 0.6262524047569294, + "learning_rate": 1.7169714368460593e-05, + "loss": 0.3987, + "step": 4668 + }, + { + "epoch": 0.27, + "grad_norm": 0.40588895977274775, + "learning_rate": 1.716841699741354e-05, + "loss": 0.241, + "step": 4669 + }, + { + "epoch": 0.27, + "grad_norm": 0.7130249951084894, + "learning_rate": 1.7167119378124516e-05, + "loss": 0.4351, + "step": 4670 + }, + { + "epoch": 0.27, + "grad_norm": 0.3442235779286332, + "learning_rate": 1.7165821510638456e-05, + "loss": 0.2933, + "step": 4671 + }, + { + "epoch": 0.27, + "grad_norm": 0.23626658265469405, + "learning_rate": 1.7164523395000304e-05, + "loss": 0.1589, + "step": 4672 + }, + { + "epoch": 0.27, + "grad_norm": 0.8478012547554327, + "learning_rate": 1.7163225031255018e-05, + "loss": 0.6467, + "step": 4673 + }, + { + "epoch": 0.27, + "grad_norm": 0.5128732762763075, + "learning_rate": 1.7161926419447555e-05, + "loss": 0.3303, + "step": 4674 + }, + { + "epoch": 0.27, + "grad_norm": 0.3949461464032854, + "learning_rate": 1.7160627559622888e-05, + "loss": 0.2547, + "step": 4675 + }, + { + "epoch": 0.27, + "grad_norm": 0.5768859794534354, + "learning_rate": 1.7159328451825995e-05, + "loss": 0.3725, + "step": 4676 + }, + { + "epoch": 0.27, + "grad_norm": 0.4045747712340561, + "learning_rate": 1.7158029096101868e-05, + "loss": 0.2743, + "step": 4677 + }, + { + "epoch": 0.27, + "grad_norm": 0.31346151516334575, + "learning_rate": 1.71567294924955e-05, + "loss": 0.2506, + "step": 4678 + }, + { + "epoch": 0.27, + "grad_norm": 0.38541238161229885, + "learning_rate": 1.71554296410519e-05, + "loss": 0.281, + "step": 4679 + }, + { + "epoch": 0.27, + "grad_norm": 0.8739729047054263, + "learning_rate": 1.7154129541816078e-05, + "loss": 0.509, + "step": 4680 + }, + { + "epoch": 0.27, + "grad_norm": 0.38755430202848945, + "learning_rate": 1.7152829194833054e-05, + "loss": 0.2878, + "step": 4681 + }, + { + "epoch": 0.27, + "grad_norm": 0.7597051840219803, + "learning_rate": 1.7151528600147868e-05, + "loss": 0.3923, + "step": 4682 + }, + { + "epoch": 0.27, + "grad_norm": 0.3355241879573539, + "learning_rate": 1.7150227757805552e-05, + "loss": 0.3151, + "step": 4683 + }, + { + "epoch": 0.27, + "grad_norm": 0.25231565932966504, + "learning_rate": 1.7148926667851156e-05, + "loss": 0.1974, + "step": 4684 + }, + { + "epoch": 0.27, + "grad_norm": 0.9743567662925537, + "learning_rate": 1.7147625330329734e-05, + "loss": 0.5156, + "step": 4685 + }, + { + "epoch": 0.27, + "grad_norm": 0.5801427580892613, + "learning_rate": 1.714632374528636e-05, + "loss": 0.4265, + "step": 4686 + }, + { + "epoch": 0.27, + "grad_norm": 0.38310225366593875, + "learning_rate": 1.7145021912766096e-05, + "loss": 0.3218, + "step": 4687 + }, + { + "epoch": 0.27, + "grad_norm": 0.5825425991209423, + "learning_rate": 1.7143719832814034e-05, + "loss": 0.3381, + "step": 4688 + }, + { + "epoch": 0.27, + "grad_norm": 0.3340408898626788, + "learning_rate": 1.714241750547526e-05, + "loss": 0.239, + "step": 4689 + }, + { + "epoch": 0.27, + "grad_norm": 0.3168336149753657, + "learning_rate": 1.7141114930794876e-05, + "loss": 0.2373, + "step": 4690 + }, + { + "epoch": 0.27, + "grad_norm": 0.40724845843177176, + "learning_rate": 1.7139812108817988e-05, + "loss": 0.3455, + "step": 4691 + }, + { + "epoch": 0.27, + "grad_norm": 0.5094991595317109, + "learning_rate": 1.7138509039589713e-05, + "loss": 0.2222, + "step": 4692 + }, + { + "epoch": 0.27, + "grad_norm": 0.4787797053735826, + "learning_rate": 1.7137205723155178e-05, + "loss": 0.2925, + "step": 4693 + }, + { + "epoch": 0.27, + "grad_norm": 0.7871752853141931, + "learning_rate": 1.7135902159559518e-05, + "loss": 0.4657, + "step": 4694 + }, + { + "epoch": 0.27, + "grad_norm": 0.3929226671181522, + "learning_rate": 1.713459834884787e-05, + "loss": 0.2591, + "step": 4695 + }, + { + "epoch": 0.27, + "grad_norm": 0.31143988193266453, + "learning_rate": 1.7133294291065387e-05, + "loss": 0.2568, + "step": 4696 + }, + { + "epoch": 0.27, + "grad_norm": 0.5147095929397764, + "learning_rate": 1.7131989986257233e-05, + "loss": 0.3835, + "step": 4697 + }, + { + "epoch": 0.27, + "grad_norm": 0.8803237290957555, + "learning_rate": 1.713068543446857e-05, + "loss": 0.337, + "step": 4698 + }, + { + "epoch": 0.27, + "grad_norm": 0.32761454354627956, + "learning_rate": 1.7129380635744578e-05, + "loss": 0.2859, + "step": 4699 + }, + { + "epoch": 0.27, + "grad_norm": 0.6789050208924245, + "learning_rate": 1.712807559013044e-05, + "loss": 0.5107, + "step": 4700 + }, + { + "epoch": 0.27, + "grad_norm": 0.580704421074495, + "learning_rate": 1.7126770297671353e-05, + "loss": 0.1709, + "step": 4701 + }, + { + "epoch": 0.27, + "grad_norm": 0.3050190429982249, + "learning_rate": 1.7125464758412517e-05, + "loss": 0.2517, + "step": 4702 + }, + { + "epoch": 0.27, + "grad_norm": 0.4061107982630693, + "learning_rate": 1.7124158972399142e-05, + "loss": 0.3583, + "step": 4703 + }, + { + "epoch": 0.27, + "grad_norm": 1.1006064066544996, + "learning_rate": 1.7122852939676448e-05, + "loss": 0.4624, + "step": 4704 + }, + { + "epoch": 0.27, + "grad_norm": 0.41879601303777364, + "learning_rate": 1.712154666028966e-05, + "loss": 0.2789, + "step": 4705 + }, + { + "epoch": 0.27, + "grad_norm": 0.4464716584986142, + "learning_rate": 1.712024013428402e-05, + "loss": 0.2636, + "step": 4706 + }, + { + "epoch": 0.27, + "grad_norm": 0.3530264257968865, + "learning_rate": 1.7118933361704773e-05, + "loss": 0.297, + "step": 4707 + }, + { + "epoch": 0.27, + "grad_norm": 0.4282023284180023, + "learning_rate": 1.7117626342597168e-05, + "loss": 0.2344, + "step": 4708 + }, + { + "epoch": 0.27, + "grad_norm": 0.4580585344409273, + "learning_rate": 1.711631907700647e-05, + "loss": 0.3537, + "step": 4709 + }, + { + "epoch": 0.27, + "grad_norm": 0.7468136572361314, + "learning_rate": 1.711501156497794e-05, + "loss": 0.3764, + "step": 4710 + }, + { + "epoch": 0.27, + "grad_norm": 0.37408046619978336, + "learning_rate": 1.7113703806556875e-05, + "loss": 0.2427, + "step": 4711 + }, + { + "epoch": 0.27, + "grad_norm": 0.49604510274681346, + "learning_rate": 1.711239580178855e-05, + "loss": 0.3652, + "step": 4712 + }, + { + "epoch": 0.27, + "grad_norm": 1.1377881037810844, + "learning_rate": 1.7111087550718265e-05, + "loss": 0.6517, + "step": 4713 + }, + { + "epoch": 0.27, + "grad_norm": 0.3778912354875147, + "learning_rate": 1.7109779053391322e-05, + "loss": 0.2191, + "step": 4714 + }, + { + "epoch": 0.27, + "grad_norm": 0.3796078306581816, + "learning_rate": 1.710847030985304e-05, + "loss": 0.3385, + "step": 4715 + }, + { + "epoch": 0.27, + "grad_norm": 0.6783617967841638, + "learning_rate": 1.710716132014873e-05, + "loss": 0.4694, + "step": 4716 + }, + { + "epoch": 0.27, + "grad_norm": 0.3984794551445479, + "learning_rate": 1.7105852084323736e-05, + "loss": 0.2969, + "step": 4717 + }, + { + "epoch": 0.27, + "grad_norm": 0.28023283128766, + "learning_rate": 1.7104542602423385e-05, + "loss": 0.0739, + "step": 4718 + }, + { + "epoch": 0.27, + "grad_norm": 0.365961154130174, + "learning_rate": 1.710323287449303e-05, + "loss": 0.3137, + "step": 4719 + }, + { + "epoch": 0.27, + "grad_norm": 0.3846054233012314, + "learning_rate": 1.710192290057803e-05, + "loss": 0.2914, + "step": 4720 + }, + { + "epoch": 0.27, + "grad_norm": 0.7279304764082829, + "learning_rate": 1.7100612680723744e-05, + "loss": 0.4087, + "step": 4721 + }, + { + "epoch": 0.27, + "grad_norm": 0.35474730728888954, + "learning_rate": 1.7099302214975545e-05, + "loss": 0.3523, + "step": 4722 + }, + { + "epoch": 0.27, + "grad_norm": 0.35652743317062197, + "learning_rate": 1.7097991503378812e-05, + "loss": 0.2756, + "step": 4723 + }, + { + "epoch": 0.27, + "grad_norm": 0.17111219321848872, + "learning_rate": 1.7096680545978946e-05, + "loss": 0.0714, + "step": 4724 + }, + { + "epoch": 0.27, + "grad_norm": 0.7528476749497117, + "learning_rate": 1.709536934282133e-05, + "loss": 0.4266, + "step": 4725 + }, + { + "epoch": 0.27, + "grad_norm": 0.4257309461122103, + "learning_rate": 1.7094057893951385e-05, + "loss": 0.306, + "step": 4726 + }, + { + "epoch": 0.27, + "grad_norm": 0.40249958959659726, + "learning_rate": 1.709274619941452e-05, + "loss": 0.3084, + "step": 4727 + }, + { + "epoch": 0.27, + "grad_norm": 0.6821814944692812, + "learning_rate": 1.7091434259256155e-05, + "loss": 0.4543, + "step": 4728 + }, + { + "epoch": 0.27, + "grad_norm": 0.4068026712533653, + "learning_rate": 1.7090122073521726e-05, + "loss": 0.2961, + "step": 4729 + }, + { + "epoch": 0.27, + "grad_norm": 0.30780879761690466, + "learning_rate": 1.7088809642256677e-05, + "loss": 0.2363, + "step": 4730 + }, + { + "epoch": 0.27, + "grad_norm": 0.4450334908612994, + "learning_rate": 1.7087496965506457e-05, + "loss": 0.2771, + "step": 4731 + }, + { + "epoch": 0.27, + "grad_norm": 0.3757471843854514, + "learning_rate": 1.7086184043316518e-05, + "loss": 0.2816, + "step": 4732 + }, + { + "epoch": 0.27, + "grad_norm": 0.718898235374059, + "learning_rate": 1.7084870875732332e-05, + "loss": 0.4816, + "step": 4733 + }, + { + "epoch": 0.27, + "grad_norm": 0.35620390057471335, + "learning_rate": 1.708355746279937e-05, + "loss": 0.2784, + "step": 4734 + }, + { + "epoch": 0.27, + "grad_norm": 0.3257456059274286, + "learning_rate": 1.7082243804563123e-05, + "loss": 0.2776, + "step": 4735 + }, + { + "epoch": 0.27, + "grad_norm": 0.2670452463661605, + "learning_rate": 1.7080929901069076e-05, + "loss": 0.163, + "step": 4736 + }, + { + "epoch": 0.27, + "grad_norm": 0.6344362497021302, + "learning_rate": 1.7079615752362727e-05, + "loss": 0.3661, + "step": 4737 + }, + { + "epoch": 0.27, + "grad_norm": 0.35318412416104933, + "learning_rate": 1.707830135848959e-05, + "loss": 0.2906, + "step": 4738 + }, + { + "epoch": 0.27, + "grad_norm": 0.415746007004765, + "learning_rate": 1.7076986719495184e-05, + "loss": 0.3621, + "step": 4739 + }, + { + "epoch": 0.27, + "grad_norm": 0.9097433398595413, + "learning_rate": 1.7075671835425032e-05, + "loss": 0.6311, + "step": 4740 + }, + { + "epoch": 0.27, + "grad_norm": 0.30654017642837456, + "learning_rate": 1.7074356706324668e-05, + "loss": 0.22, + "step": 4741 + }, + { + "epoch": 0.27, + "grad_norm": 0.2861658906239352, + "learning_rate": 1.7073041332239634e-05, + "loss": 0.1787, + "step": 4742 + }, + { + "epoch": 0.27, + "grad_norm": 0.38805970077000235, + "learning_rate": 1.7071725713215483e-05, + "loss": 0.3573, + "step": 4743 + }, + { + "epoch": 0.27, + "grad_norm": 0.35965465826234444, + "learning_rate": 1.7070409849297774e-05, + "loss": 0.216, + "step": 4744 + }, + { + "epoch": 0.27, + "grad_norm": 0.7013740357821047, + "learning_rate": 1.7069093740532083e-05, + "loss": 0.4085, + "step": 4745 + }, + { + "epoch": 0.27, + "grad_norm": 0.3788364821404023, + "learning_rate": 1.706777738696397e-05, + "loss": 0.3364, + "step": 4746 + }, + { + "epoch": 0.27, + "grad_norm": 0.33608417673074076, + "learning_rate": 1.7066460788639035e-05, + "loss": 0.2014, + "step": 4747 + }, + { + "epoch": 0.27, + "grad_norm": 0.2977740305003241, + "learning_rate": 1.7065143945602867e-05, + "loss": 0.1881, + "step": 4748 + }, + { + "epoch": 0.27, + "grad_norm": 0.6092323032794157, + "learning_rate": 1.7063826857901066e-05, + "loss": 0.4919, + "step": 4749 + }, + { + "epoch": 0.27, + "grad_norm": 0.35754157093924055, + "learning_rate": 1.7062509525579244e-05, + "loss": 0.2416, + "step": 4750 + }, + { + "epoch": 0.27, + "grad_norm": 0.3915972606090248, + "learning_rate": 1.7061191948683024e-05, + "loss": 0.3557, + "step": 4751 + }, + { + "epoch": 0.27, + "grad_norm": 1.2231230334847767, + "learning_rate": 1.7059874127258028e-05, + "loss": 0.7646, + "step": 4752 + }, + { + "epoch": 0.27, + "grad_norm": 0.3921839884714695, + "learning_rate": 1.7058556061349894e-05, + "loss": 0.3111, + "step": 4753 + }, + { + "epoch": 0.27, + "grad_norm": 0.2249213774799527, + "learning_rate": 1.705723775100427e-05, + "loss": 0.1823, + "step": 4754 + }, + { + "epoch": 0.27, + "grad_norm": 0.46286782233072077, + "learning_rate": 1.7055919196266806e-05, + "loss": 0.3736, + "step": 4755 + }, + { + "epoch": 0.27, + "grad_norm": 0.441673470781393, + "learning_rate": 1.7054600397183162e-05, + "loss": 0.3233, + "step": 4756 + }, + { + "epoch": 0.27, + "grad_norm": 0.5357999621963078, + "learning_rate": 1.705328135379901e-05, + "loss": 0.2646, + "step": 4757 + }, + { + "epoch": 0.27, + "grad_norm": 0.4294620287968328, + "learning_rate": 1.7051962066160027e-05, + "loss": 0.3657, + "step": 4758 + }, + { + "epoch": 0.27, + "grad_norm": 0.36051927885640783, + "learning_rate": 1.7050642534311904e-05, + "loss": 0.2862, + "step": 4759 + }, + { + "epoch": 0.27, + "grad_norm": 0.3010743262062142, + "learning_rate": 1.704932275830033e-05, + "loss": 0.1729, + "step": 4760 + }, + { + "epoch": 0.27, + "grad_norm": 0.4334278974070254, + "learning_rate": 1.704800273817101e-05, + "loss": 0.3478, + "step": 4761 + }, + { + "epoch": 0.27, + "grad_norm": 0.4264062843363303, + "learning_rate": 1.7046682473969664e-05, + "loss": 0.2949, + "step": 4762 + }, + { + "epoch": 0.27, + "grad_norm": 0.4010993186213262, + "learning_rate": 1.7045361965742004e-05, + "loss": 0.2649, + "step": 4763 + }, + { + "epoch": 0.27, + "grad_norm": 1.2493476218292963, + "learning_rate": 1.704404121353376e-05, + "loss": 0.7307, + "step": 4764 + }, + { + "epoch": 0.27, + "grad_norm": 0.5374611777732092, + "learning_rate": 1.7042720217390677e-05, + "loss": 0.3441, + "step": 4765 + }, + { + "epoch": 0.27, + "grad_norm": 0.35668982665157084, + "learning_rate": 1.7041398977358494e-05, + "loss": 0.2937, + "step": 4766 + }, + { + "epoch": 0.27, + "grad_norm": 0.5150162023118233, + "learning_rate": 1.7040077493482964e-05, + "loss": 0.3482, + "step": 4767 + }, + { + "epoch": 0.27, + "grad_norm": 0.3524313571951868, + "learning_rate": 1.7038755765809857e-05, + "loss": 0.2136, + "step": 4768 + }, + { + "epoch": 0.27, + "grad_norm": 0.3518307329267539, + "learning_rate": 1.7037433794384938e-05, + "loss": 0.2492, + "step": 4769 + }, + { + "epoch": 0.27, + "grad_norm": 0.37417425107718427, + "learning_rate": 1.7036111579253992e-05, + "loss": 0.2956, + "step": 4770 + }, + { + "epoch": 0.27, + "grad_norm": 0.3943110391556108, + "learning_rate": 1.70347891204628e-05, + "loss": 0.3086, + "step": 4771 + }, + { + "epoch": 0.27, + "grad_norm": 0.4517027039596723, + "learning_rate": 1.7033466418057166e-05, + "loss": 0.3716, + "step": 4772 + }, + { + "epoch": 0.27, + "grad_norm": 0.44044704690746816, + "learning_rate": 1.7032143472082893e-05, + "loss": 0.2414, + "step": 4773 + }, + { + "epoch": 0.27, + "grad_norm": 0.4506278934272334, + "learning_rate": 1.7030820282585795e-05, + "loss": 0.256, + "step": 4774 + }, + { + "epoch": 0.27, + "grad_norm": 0.39868821005548993, + "learning_rate": 1.7029496849611687e-05, + "loss": 0.2626, + "step": 4775 + }, + { + "epoch": 0.27, + "grad_norm": 1.3150474383898678, + "learning_rate": 1.702817317320641e-05, + "loss": 0.5298, + "step": 4776 + }, + { + "epoch": 0.27, + "grad_norm": 0.3613155978762499, + "learning_rate": 1.70268492534158e-05, + "loss": 0.2521, + "step": 4777 + }, + { + "epoch": 0.27, + "grad_norm": 0.460323545686999, + "learning_rate": 1.70255250902857e-05, + "loss": 0.3628, + "step": 4778 + }, + { + "epoch": 0.27, + "grad_norm": 0.5444376532180347, + "learning_rate": 1.702420068386197e-05, + "loss": 0.3987, + "step": 4779 + }, + { + "epoch": 0.27, + "grad_norm": 0.26635586243885684, + "learning_rate": 1.7022876034190468e-05, + "loss": 0.0774, + "step": 4780 + }, + { + "epoch": 0.27, + "grad_norm": 0.3994512255967334, + "learning_rate": 1.7021551141317075e-05, + "loss": 0.2613, + "step": 4781 + }, + { + "epoch": 0.27, + "grad_norm": 0.5373762784077355, + "learning_rate": 1.7020226005287665e-05, + "loss": 0.3608, + "step": 4782 + }, + { + "epoch": 0.27, + "grad_norm": 0.5289576791227809, + "learning_rate": 1.701890062614813e-05, + "loss": 0.19, + "step": 4783 + }, + { + "epoch": 0.27, + "grad_norm": 0.40705610275602827, + "learning_rate": 1.7017575003944374e-05, + "loss": 0.3555, + "step": 4784 + }, + { + "epoch": 0.27, + "grad_norm": 0.8059748782716618, + "learning_rate": 1.7016249138722295e-05, + "loss": 0.5116, + "step": 4785 + }, + { + "epoch": 0.27, + "grad_norm": 0.3090383888410909, + "learning_rate": 1.7014923030527808e-05, + "loss": 0.1888, + "step": 4786 + }, + { + "epoch": 0.28, + "grad_norm": 0.3782301763328678, + "learning_rate": 1.701359667940684e-05, + "loss": 0.284, + "step": 4787 + }, + { + "epoch": 0.28, + "grad_norm": 1.3901516550893385, + "learning_rate": 1.7012270085405317e-05, + "loss": 0.8495, + "step": 4788 + }, + { + "epoch": 0.28, + "grad_norm": 0.4533944763836812, + "learning_rate": 1.7010943248569185e-05, + "loss": 0.2694, + "step": 4789 + }, + { + "epoch": 0.28, + "grad_norm": 0.4358691412921533, + "learning_rate": 1.700961616894439e-05, + "loss": 0.3246, + "step": 4790 + }, + { + "epoch": 0.28, + "grad_norm": 1.2521917339608342, + "learning_rate": 1.7008288846576886e-05, + "loss": 0.5989, + "step": 4791 + }, + { + "epoch": 0.28, + "grad_norm": 0.3808308096968665, + "learning_rate": 1.7006961281512645e-05, + "loss": 0.251, + "step": 4792 + }, + { + "epoch": 0.28, + "grad_norm": 0.46592574712611845, + "learning_rate": 1.7005633473797632e-05, + "loss": 0.2673, + "step": 4793 + }, + { + "epoch": 0.28, + "grad_norm": 0.3932673619699627, + "learning_rate": 1.7004305423477835e-05, + "loss": 0.3008, + "step": 4794 + }, + { + "epoch": 0.28, + "grad_norm": 0.6720359832254608, + "learning_rate": 1.700297713059924e-05, + "loss": 0.374, + "step": 4795 + }, + { + "epoch": 0.28, + "grad_norm": 0.38595536528371815, + "learning_rate": 1.700164859520785e-05, + "loss": 0.2642, + "step": 4796 + }, + { + "epoch": 0.28, + "grad_norm": 0.525812046298573, + "learning_rate": 1.7000319817349673e-05, + "loss": 0.3478, + "step": 4797 + }, + { + "epoch": 0.28, + "grad_norm": 0.39726507148120443, + "learning_rate": 1.699899079707072e-05, + "loss": 0.3034, + "step": 4798 + }, + { + "epoch": 0.28, + "grad_norm": 0.38302101200138, + "learning_rate": 1.6997661534417015e-05, + "loss": 0.2645, + "step": 4799 + }, + { + "epoch": 0.28, + "grad_norm": 0.45339021595960577, + "learning_rate": 1.699633202943459e-05, + "loss": 0.3409, + "step": 4800 + }, + { + "epoch": 0.28, + "grad_norm": 0.6208504710150762, + "learning_rate": 1.699500228216949e-05, + "loss": 0.4191, + "step": 4801 + }, + { + "epoch": 0.28, + "grad_norm": 0.3185182893708112, + "learning_rate": 1.6993672292667766e-05, + "loss": 0.2207, + "step": 4802 + }, + { + "epoch": 0.28, + "grad_norm": 1.197840212134981, + "learning_rate": 1.6992342060975467e-05, + "loss": 0.6374, + "step": 4803 + }, + { + "epoch": 0.28, + "grad_norm": 0.7354342772061968, + "learning_rate": 1.6991011587138665e-05, + "loss": 0.4787, + "step": 4804 + }, + { + "epoch": 0.28, + "grad_norm": 0.4419399786931295, + "learning_rate": 1.698968087120343e-05, + "loss": 0.3406, + "step": 4805 + }, + { + "epoch": 0.28, + "grad_norm": 0.5150312372588404, + "learning_rate": 1.6988349913215848e-05, + "loss": 0.2817, + "step": 4806 + }, + { + "epoch": 0.28, + "grad_norm": 0.6398701761513857, + "learning_rate": 1.698701871322201e-05, + "loss": 0.3152, + "step": 4807 + }, + { + "epoch": 0.28, + "grad_norm": 0.2889587602038658, + "learning_rate": 1.698568727126801e-05, + "loss": 0.2323, + "step": 4808 + }, + { + "epoch": 0.28, + "grad_norm": 0.46663731677113146, + "learning_rate": 1.6984355587399964e-05, + "loss": 0.284, + "step": 4809 + }, + { + "epoch": 0.28, + "grad_norm": 0.44856779416870474, + "learning_rate": 1.6983023661663987e-05, + "loss": 0.3398, + "step": 4810 + }, + { + "epoch": 0.28, + "grad_norm": 0.5213901240015865, + "learning_rate": 1.6981691494106196e-05, + "loss": 0.3272, + "step": 4811 + }, + { + "epoch": 0.28, + "grad_norm": 1.055086036269047, + "learning_rate": 1.698035908477273e-05, + "loss": 0.4127, + "step": 4812 + }, + { + "epoch": 0.28, + "grad_norm": 0.4546873145751855, + "learning_rate": 1.697902643370973e-05, + "loss": 0.3177, + "step": 4813 + }, + { + "epoch": 0.28, + "grad_norm": 0.4027812289256814, + "learning_rate": 1.6977693540963347e-05, + "loss": 0.3157, + "step": 4814 + }, + { + "epoch": 0.28, + "grad_norm": 0.2686362672041927, + "learning_rate": 1.6976360406579734e-05, + "loss": 0.1518, + "step": 4815 + }, + { + "epoch": 0.28, + "grad_norm": 0.7597687415584575, + "learning_rate": 1.697502703060506e-05, + "loss": 0.4421, + "step": 4816 + }, + { + "epoch": 0.28, + "grad_norm": 0.44251032893440007, + "learning_rate": 1.69736934130855e-05, + "loss": 0.2877, + "step": 4817 + }, + { + "epoch": 0.28, + "grad_norm": 0.5865777905310624, + "learning_rate": 1.6972359554067237e-05, + "loss": 0.3693, + "step": 4818 + }, + { + "epoch": 0.28, + "grad_norm": 0.8136374248949291, + "learning_rate": 1.6971025453596463e-05, + "loss": 0.3066, + "step": 4819 + }, + { + "epoch": 0.28, + "grad_norm": 0.2770444473086494, + "learning_rate": 1.6969691111719377e-05, + "loss": 0.2053, + "step": 4820 + }, + { + "epoch": 0.28, + "grad_norm": 0.5373847607276323, + "learning_rate": 1.6968356528482187e-05, + "loss": 0.399, + "step": 4821 + }, + { + "epoch": 0.28, + "grad_norm": 0.4172327112811106, + "learning_rate": 1.696702170393111e-05, + "loss": 0.225, + "step": 4822 + }, + { + "epoch": 0.28, + "grad_norm": 0.4781253488053807, + "learning_rate": 1.6965686638112373e-05, + "loss": 0.308, + "step": 4823 + }, + { + "epoch": 0.28, + "grad_norm": 1.2234511064967395, + "learning_rate": 1.6964351331072205e-05, + "loss": 0.4584, + "step": 4824 + }, + { + "epoch": 0.28, + "grad_norm": 0.4180514861148252, + "learning_rate": 1.696301578285685e-05, + "loss": 0.2807, + "step": 4825 + }, + { + "epoch": 0.28, + "grad_norm": 0.2841258483758155, + "learning_rate": 1.696167999351256e-05, + "loss": 0.2448, + "step": 4826 + }, + { + "epoch": 0.28, + "grad_norm": 0.459516630611072, + "learning_rate": 1.6960343963085587e-05, + "loss": 0.2917, + "step": 4827 + }, + { + "epoch": 0.28, + "grad_norm": 1.0542058454902612, + "learning_rate": 1.6959007691622206e-05, + "loss": 0.5435, + "step": 4828 + }, + { + "epoch": 0.28, + "grad_norm": 0.3935580375131141, + "learning_rate": 1.6957671179168687e-05, + "loss": 0.2066, + "step": 4829 + }, + { + "epoch": 0.28, + "grad_norm": 0.5260916371309337, + "learning_rate": 1.695633442577131e-05, + "loss": 0.349, + "step": 4830 + }, + { + "epoch": 0.28, + "grad_norm": 0.6822341191380212, + "learning_rate": 1.6954997431476376e-05, + "loss": 0.4439, + "step": 4831 + }, + { + "epoch": 0.28, + "grad_norm": 0.34931542652822906, + "learning_rate": 1.695366019633018e-05, + "loss": 0.2223, + "step": 4832 + }, + { + "epoch": 0.28, + "grad_norm": 0.28818157760600444, + "learning_rate": 1.695232272037903e-05, + "loss": 0.2245, + "step": 4833 + }, + { + "epoch": 0.28, + "grad_norm": 0.44886495313474906, + "learning_rate": 1.695098500366924e-05, + "loss": 0.3747, + "step": 4834 + }, + { + "epoch": 0.28, + "grad_norm": 0.376489095537162, + "learning_rate": 1.694964704624714e-05, + "loss": 0.257, + "step": 4835 + }, + { + "epoch": 0.28, + "grad_norm": 0.6917611206680756, + "learning_rate": 1.6948308848159064e-05, + "loss": 0.5613, + "step": 4836 + }, + { + "epoch": 0.28, + "grad_norm": 0.41105937337798043, + "learning_rate": 1.694697040945135e-05, + "loss": 0.3446, + "step": 4837 + }, + { + "epoch": 0.28, + "grad_norm": 0.3334884198947588, + "learning_rate": 1.694563173017035e-05, + "loss": 0.225, + "step": 4838 + }, + { + "epoch": 0.28, + "grad_norm": 0.3046497158528754, + "learning_rate": 1.694429281036242e-05, + "loss": 0.1881, + "step": 4839 + }, + { + "epoch": 0.28, + "grad_norm": 0.6787872188789502, + "learning_rate": 1.6942953650073926e-05, + "loss": 0.4572, + "step": 4840 + }, + { + "epoch": 0.28, + "grad_norm": 0.3605281610226177, + "learning_rate": 1.6941614249351252e-05, + "loss": 0.3004, + "step": 4841 + }, + { + "epoch": 0.28, + "grad_norm": 0.4345068829504192, + "learning_rate": 1.6940274608240773e-05, + "loss": 0.2796, + "step": 4842 + }, + { + "epoch": 0.28, + "grad_norm": 1.0070853290732158, + "learning_rate": 1.693893472678888e-05, + "loss": 0.6626, + "step": 4843 + }, + { + "epoch": 0.28, + "grad_norm": 0.3629286253161658, + "learning_rate": 1.693759460504198e-05, + "loss": 0.2631, + "step": 4844 + }, + { + "epoch": 0.28, + "grad_norm": 0.2677448034312148, + "learning_rate": 1.6936254243046472e-05, + "loss": 0.152, + "step": 4845 + }, + { + "epoch": 0.28, + "grad_norm": 0.6535965345558263, + "learning_rate": 1.6934913640848782e-05, + "loss": 0.3904, + "step": 4846 + }, + { + "epoch": 0.28, + "grad_norm": 0.3450106487612924, + "learning_rate": 1.6933572798495327e-05, + "loss": 0.2696, + "step": 4847 + }, + { + "epoch": 0.28, + "grad_norm": 0.8901525449936295, + "learning_rate": 1.6932231716032548e-05, + "loss": 0.463, + "step": 4848 + }, + { + "epoch": 0.28, + "grad_norm": 0.35316268141975193, + "learning_rate": 1.6930890393506882e-05, + "loss": 0.3056, + "step": 4849 + }, + { + "epoch": 0.28, + "grad_norm": 0.37333332234721345, + "learning_rate": 1.692954883096478e-05, + "loss": 0.314, + "step": 4850 + }, + { + "epoch": 0.28, + "grad_norm": 0.26962638573625075, + "learning_rate": 1.6928207028452698e-05, + "loss": 0.1461, + "step": 4851 + }, + { + "epoch": 0.28, + "grad_norm": 0.7124643097673313, + "learning_rate": 1.6926864986017105e-05, + "loss": 0.4647, + "step": 4852 + }, + { + "epoch": 0.28, + "grad_norm": 0.3938996265220362, + "learning_rate": 1.6925522703704475e-05, + "loss": 0.3013, + "step": 4853 + }, + { + "epoch": 0.28, + "grad_norm": 0.4226078356649217, + "learning_rate": 1.6924180181561297e-05, + "loss": 0.3757, + "step": 4854 + }, + { + "epoch": 0.28, + "grad_norm": 0.993544391815923, + "learning_rate": 1.6922837419634052e-05, + "loss": 0.4262, + "step": 4855 + }, + { + "epoch": 0.28, + "grad_norm": 0.37675799254882103, + "learning_rate": 1.6921494417969245e-05, + "loss": 0.2918, + "step": 4856 + }, + { + "epoch": 0.28, + "grad_norm": 0.3003610120862018, + "learning_rate": 1.6920151176613383e-05, + "loss": 0.2703, + "step": 4857 + }, + { + "epoch": 0.28, + "grad_norm": 0.5360807413545341, + "learning_rate": 1.6918807695612984e-05, + "loss": 0.2786, + "step": 4858 + }, + { + "epoch": 0.28, + "grad_norm": 0.37804769698458474, + "learning_rate": 1.6917463975014575e-05, + "loss": 0.2754, + "step": 4859 + }, + { + "epoch": 0.28, + "grad_norm": 1.1356673660352166, + "learning_rate": 1.691612001486468e-05, + "loss": 0.6455, + "step": 4860 + }, + { + "epoch": 0.28, + "grad_norm": 0.3493864639681745, + "learning_rate": 1.6914775815209853e-05, + "loss": 0.2859, + "step": 4861 + }, + { + "epoch": 0.28, + "grad_norm": 0.37809936495877855, + "learning_rate": 1.6913431376096633e-05, + "loss": 0.2852, + "step": 4862 + }, + { + "epoch": 0.28, + "grad_norm": 0.7970041802176591, + "learning_rate": 1.6912086697571584e-05, + "loss": 0.5109, + "step": 4863 + }, + { + "epoch": 0.28, + "grad_norm": 0.2716805871139541, + "learning_rate": 1.6910741779681264e-05, + "loss": 0.2215, + "step": 4864 + }, + { + "epoch": 0.28, + "grad_norm": 0.4244076834143325, + "learning_rate": 1.690939662247226e-05, + "loss": 0.2942, + "step": 4865 + }, + { + "epoch": 0.28, + "grad_norm": 0.5023134654075716, + "learning_rate": 1.6908051225991146e-05, + "loss": 0.3721, + "step": 4866 + }, + { + "epoch": 0.28, + "grad_norm": 1.2878202571798292, + "learning_rate": 1.6906705590284517e-05, + "loss": 0.8234, + "step": 4867 + }, + { + "epoch": 0.28, + "grad_norm": 0.37248189574940166, + "learning_rate": 1.690535971539897e-05, + "loss": 0.1709, + "step": 4868 + }, + { + "epoch": 0.28, + "grad_norm": 0.4067382861706183, + "learning_rate": 1.690401360138111e-05, + "loss": 0.3374, + "step": 4869 + }, + { + "epoch": 0.28, + "grad_norm": 0.316105557965605, + "learning_rate": 1.6902667248277557e-05, + "loss": 0.2383, + "step": 4870 + }, + { + "epoch": 0.28, + "grad_norm": 0.38753037141289065, + "learning_rate": 1.6901320656134935e-05, + "loss": 0.2109, + "step": 4871 + }, + { + "epoch": 0.28, + "grad_norm": 0.9724087834349608, + "learning_rate": 1.6899973824999872e-05, + "loss": 0.7267, + "step": 4872 + }, + { + "epoch": 0.28, + "grad_norm": 0.3812345170205305, + "learning_rate": 1.6898626754919018e-05, + "loss": 0.3212, + "step": 4873 + }, + { + "epoch": 0.28, + "grad_norm": 0.3219187996146285, + "learning_rate": 1.6897279445939012e-05, + "loss": 0.1968, + "step": 4874 + }, + { + "epoch": 0.28, + "grad_norm": 0.6624262246155297, + "learning_rate": 1.6895931898106517e-05, + "loss": 0.4522, + "step": 4875 + }, + { + "epoch": 0.28, + "grad_norm": 0.47937324578610235, + "learning_rate": 1.6894584111468196e-05, + "loss": 0.2651, + "step": 4876 + }, + { + "epoch": 0.28, + "grad_norm": 0.2998688492477607, + "learning_rate": 1.6893236086070722e-05, + "loss": 0.2302, + "step": 4877 + }, + { + "epoch": 0.28, + "grad_norm": 0.4740529224435299, + "learning_rate": 1.6891887821960783e-05, + "loss": 0.299, + "step": 4878 + }, + { + "epoch": 0.28, + "grad_norm": 0.7311170213555515, + "learning_rate": 1.689053931918506e-05, + "loss": 0.5443, + "step": 4879 + }, + { + "epoch": 0.28, + "grad_norm": 0.41467295139387256, + "learning_rate": 1.688919057779026e-05, + "loss": 0.3303, + "step": 4880 + }, + { + "epoch": 0.28, + "grad_norm": 0.3324366508784158, + "learning_rate": 1.6887841597823088e-05, + "loss": 0.2817, + "step": 4881 + }, + { + "epoch": 0.28, + "grad_norm": 0.4272022615661138, + "learning_rate": 1.6886492379330254e-05, + "loss": 0.2808, + "step": 4882 + }, + { + "epoch": 0.28, + "grad_norm": 0.29255807880365126, + "learning_rate": 1.6885142922358486e-05, + "loss": 0.2237, + "step": 4883 + }, + { + "epoch": 0.28, + "grad_norm": 0.8529213592258498, + "learning_rate": 1.6883793226954516e-05, + "loss": 0.5751, + "step": 4884 + }, + { + "epoch": 0.28, + "grad_norm": 0.5232124412222744, + "learning_rate": 1.6882443293165083e-05, + "loss": 0.3541, + "step": 4885 + }, + { + "epoch": 0.28, + "grad_norm": 0.513732509312485, + "learning_rate": 1.6881093121036933e-05, + "loss": 0.3429, + "step": 4886 + }, + { + "epoch": 0.28, + "grad_norm": 0.43596299045855724, + "learning_rate": 1.6879742710616826e-05, + "loss": 0.3057, + "step": 4887 + }, + { + "epoch": 0.28, + "grad_norm": 0.33911296956631076, + "learning_rate": 1.6878392061951525e-05, + "loss": 0.2385, + "step": 4888 + }, + { + "epoch": 0.28, + "grad_norm": 0.40006287993866735, + "learning_rate": 1.6877041175087802e-05, + "loss": 0.2909, + "step": 4889 + }, + { + "epoch": 0.28, + "grad_norm": 0.43421895746675027, + "learning_rate": 1.6875690050072435e-05, + "loss": 0.3087, + "step": 4890 + }, + { + "epoch": 0.28, + "grad_norm": 0.4299156983242811, + "learning_rate": 1.687433868695222e-05, + "loss": 0.3169, + "step": 4891 + }, + { + "epoch": 0.28, + "grad_norm": 0.38431969838964786, + "learning_rate": 1.687298708577395e-05, + "loss": 0.3265, + "step": 4892 + }, + { + "epoch": 0.28, + "grad_norm": 0.4932553865657155, + "learning_rate": 1.687163524658444e-05, + "loss": 0.3143, + "step": 4893 + }, + { + "epoch": 0.28, + "grad_norm": 0.6671294183699192, + "learning_rate": 1.687028316943049e-05, + "loss": 0.3774, + "step": 4894 + }, + { + "epoch": 0.28, + "grad_norm": 0.5398718612910458, + "learning_rate": 1.686893085435893e-05, + "loss": 0.2871, + "step": 4895 + }, + { + "epoch": 0.28, + "grad_norm": 0.5618030739271445, + "learning_rate": 1.686757830141659e-05, + "loss": 0.435, + "step": 4896 + }, + { + "epoch": 0.28, + "grad_norm": 0.4098895772036004, + "learning_rate": 1.6866225510650312e-05, + "loss": 0.3044, + "step": 4897 + }, + { + "epoch": 0.28, + "grad_norm": 0.3133046926958272, + "learning_rate": 1.686487248210694e-05, + "loss": 0.2384, + "step": 4898 + }, + { + "epoch": 0.28, + "grad_norm": 0.38071433829603696, + "learning_rate": 1.6863519215833327e-05, + "loss": 0.2787, + "step": 4899 + }, + { + "epoch": 0.28, + "grad_norm": 0.4427576937958323, + "learning_rate": 1.686216571187634e-05, + "loss": 0.2594, + "step": 4900 + }, + { + "epoch": 0.28, + "grad_norm": 0.37213118554343044, + "learning_rate": 1.6860811970282844e-05, + "loss": 0.2937, + "step": 4901 + }, + { + "epoch": 0.28, + "grad_norm": 0.5833133836609231, + "learning_rate": 1.685945799109973e-05, + "loss": 0.3999, + "step": 4902 + }, + { + "epoch": 0.28, + "grad_norm": 0.6780494801302973, + "learning_rate": 1.6858103774373877e-05, + "loss": 0.5117, + "step": 4903 + }, + { + "epoch": 0.28, + "grad_norm": 0.24541000634477722, + "learning_rate": 1.685674932015219e-05, + "loss": 0.0744, + "step": 4904 + }, + { + "epoch": 0.28, + "grad_norm": 0.3375440873846795, + "learning_rate": 1.6855394628481565e-05, + "loss": 0.3181, + "step": 4905 + }, + { + "epoch": 0.28, + "grad_norm": 0.8924703502539542, + "learning_rate": 1.6854039699408923e-05, + "loss": 0.5844, + "step": 4906 + }, + { + "epoch": 0.28, + "grad_norm": 0.4013162255792, + "learning_rate": 1.6852684532981176e-05, + "loss": 0.1698, + "step": 4907 + }, + { + "epoch": 0.28, + "grad_norm": 0.442763641896375, + "learning_rate": 1.685132912924526e-05, + "loss": 0.328, + "step": 4908 + }, + { + "epoch": 0.28, + "grad_norm": 0.48415946618162076, + "learning_rate": 1.684997348824811e-05, + "loss": 0.3534, + "step": 4909 + }, + { + "epoch": 0.28, + "grad_norm": 0.27868722935079016, + "learning_rate": 1.6848617610036676e-05, + "loss": 0.0971, + "step": 4910 + }, + { + "epoch": 0.28, + "grad_norm": 0.32980783132441815, + "learning_rate": 1.6847261494657904e-05, + "loss": 0.2478, + "step": 4911 + }, + { + "epoch": 0.28, + "grad_norm": 1.1178058794694627, + "learning_rate": 1.6845905142158764e-05, + "loss": 0.6226, + "step": 4912 + }, + { + "epoch": 0.28, + "grad_norm": 0.2863922274104645, + "learning_rate": 1.6844548552586225e-05, + "loss": 0.2348, + "step": 4913 + }, + { + "epoch": 0.28, + "grad_norm": 0.49945881324068725, + "learning_rate": 1.684319172598726e-05, + "loss": 0.3969, + "step": 4914 + }, + { + "epoch": 0.28, + "grad_norm": 0.7076134237344921, + "learning_rate": 1.684183466240886e-05, + "loss": 0.5169, + "step": 4915 + }, + { + "epoch": 0.28, + "grad_norm": 0.3167088144334443, + "learning_rate": 1.684047736189802e-05, + "loss": 0.2335, + "step": 4916 + }, + { + "epoch": 0.28, + "grad_norm": 0.3104910368581827, + "learning_rate": 1.6839119824501747e-05, + "loss": 0.2116, + "step": 4917 + }, + { + "epoch": 0.28, + "grad_norm": 0.9284231793627837, + "learning_rate": 1.6837762050267044e-05, + "loss": 0.567, + "step": 4918 + }, + { + "epoch": 0.28, + "grad_norm": 0.596611993917205, + "learning_rate": 1.683640403924094e-05, + "loss": 0.3658, + "step": 4919 + }, + { + "epoch": 0.28, + "grad_norm": 0.3944841548350714, + "learning_rate": 1.6835045791470453e-05, + "loss": 0.2971, + "step": 4920 + }, + { + "epoch": 0.28, + "grad_norm": 0.3928615059353819, + "learning_rate": 1.683368730700263e-05, + "loss": 0.3528, + "step": 4921 + }, + { + "epoch": 0.28, + "grad_norm": 0.8844377675062224, + "learning_rate": 1.6832328585884505e-05, + "loss": 0.4563, + "step": 4922 + }, + { + "epoch": 0.28, + "grad_norm": 0.24744733076868877, + "learning_rate": 1.6830969628163134e-05, + "loss": 0.1611, + "step": 4923 + }, + { + "epoch": 0.28, + "grad_norm": 0.37799077238329093, + "learning_rate": 1.6829610433885583e-05, + "loss": 0.308, + "step": 4924 + }, + { + "epoch": 0.28, + "grad_norm": 0.7391018917542396, + "learning_rate": 1.6828251003098913e-05, + "loss": 0.448, + "step": 4925 + }, + { + "epoch": 0.28, + "grad_norm": 0.4267263829375562, + "learning_rate": 1.6826891335850205e-05, + "loss": 0.2828, + "step": 4926 + }, + { + "epoch": 0.28, + "grad_norm": 0.9901949114209159, + "learning_rate": 1.6825531432186545e-05, + "loss": 0.6693, + "step": 4927 + }, + { + "epoch": 0.28, + "grad_norm": 0.4338305088608627, + "learning_rate": 1.682417129215502e-05, + "loss": 0.3244, + "step": 4928 + }, + { + "epoch": 0.28, + "grad_norm": 0.3122226230150543, + "learning_rate": 1.682281091580274e-05, + "loss": 0.2397, + "step": 4929 + }, + { + "epoch": 0.28, + "grad_norm": 0.2536689856422566, + "learning_rate": 1.682145030317681e-05, + "loss": 0.1118, + "step": 4930 + }, + { + "epoch": 0.28, + "grad_norm": 1.2644511808907366, + "learning_rate": 1.6820089454324355e-05, + "loss": 0.4325, + "step": 4931 + }, + { + "epoch": 0.28, + "grad_norm": 0.3830284678075912, + "learning_rate": 1.681872836929249e-05, + "loss": 0.297, + "step": 4932 + }, + { + "epoch": 0.28, + "grad_norm": 0.44403371374329265, + "learning_rate": 1.6817367048128357e-05, + "loss": 0.2896, + "step": 4933 + }, + { + "epoch": 0.28, + "grad_norm": 0.6483549831795611, + "learning_rate": 1.6816005490879096e-05, + "loss": 0.3426, + "step": 4934 + }, + { + "epoch": 0.28, + "grad_norm": 0.32440225504314685, + "learning_rate": 1.6814643697591857e-05, + "loss": 0.2452, + "step": 4935 + }, + { + "epoch": 0.28, + "grad_norm": 0.39596665611969095, + "learning_rate": 1.68132816683138e-05, + "loss": 0.2387, + "step": 4936 + }, + { + "epoch": 0.28, + "grad_norm": 0.9595030167316971, + "learning_rate": 1.6811919403092093e-05, + "loss": 0.3894, + "step": 4937 + }, + { + "epoch": 0.28, + "grad_norm": 0.4468030719392794, + "learning_rate": 1.6810556901973907e-05, + "loss": 0.3355, + "step": 4938 + }, + { + "epoch": 0.28, + "grad_norm": 0.8717707766987532, + "learning_rate": 1.680919416500643e-05, + "loss": 0.5147, + "step": 4939 + }, + { + "epoch": 0.28, + "grad_norm": 0.38028104901258003, + "learning_rate": 1.6807831192236855e-05, + "loss": 0.3398, + "step": 4940 + }, + { + "epoch": 0.28, + "grad_norm": 0.3862054255947419, + "learning_rate": 1.6806467983712375e-05, + "loss": 0.2686, + "step": 4941 + }, + { + "epoch": 0.28, + "grad_norm": 0.40855243616372805, + "learning_rate": 1.68051045394802e-05, + "loss": 0.2499, + "step": 4942 + }, + { + "epoch": 0.28, + "grad_norm": 0.7503261455917112, + "learning_rate": 1.680374085958755e-05, + "loss": 0.3199, + "step": 4943 + }, + { + "epoch": 0.28, + "grad_norm": 0.47228792715288803, + "learning_rate": 1.680237694408165e-05, + "loss": 0.3086, + "step": 4944 + }, + { + "epoch": 0.28, + "grad_norm": 0.5675653321231017, + "learning_rate": 1.6801012793009725e-05, + "loss": 0.4409, + "step": 4945 + }, + { + "epoch": 0.28, + "grad_norm": 0.657462471357182, + "learning_rate": 1.6799648406419018e-05, + "loss": 0.2353, + "step": 4946 + }, + { + "epoch": 0.28, + "grad_norm": 0.2388900030606124, + "learning_rate": 1.679828378435678e-05, + "loss": 0.1982, + "step": 4947 + }, + { + "epoch": 0.28, + "grad_norm": 0.4841571640687985, + "learning_rate": 1.6796918926870266e-05, + "loss": 0.3666, + "step": 4948 + }, + { + "epoch": 0.28, + "grad_norm": 0.5190146104941877, + "learning_rate": 1.6795553834006746e-05, + "loss": 0.2917, + "step": 4949 + }, + { + "epoch": 0.28, + "grad_norm": 0.44852090196045463, + "learning_rate": 1.6794188505813486e-05, + "loss": 0.3222, + "step": 4950 + }, + { + "epoch": 0.28, + "grad_norm": 0.985136552799657, + "learning_rate": 1.679282294233777e-05, + "loss": 0.622, + "step": 4951 + }, + { + "epoch": 0.28, + "grad_norm": 0.32555809213851533, + "learning_rate": 1.6791457143626887e-05, + "loss": 0.2532, + "step": 4952 + }, + { + "epoch": 0.28, + "grad_norm": 0.3561321537682771, + "learning_rate": 1.679009110972814e-05, + "loss": 0.2804, + "step": 4953 + }, + { + "epoch": 0.28, + "grad_norm": 0.6560341934071064, + "learning_rate": 1.6788724840688823e-05, + "loss": 0.4473, + "step": 4954 + }, + { + "epoch": 0.28, + "grad_norm": 0.31007613366907594, + "learning_rate": 1.678735833655626e-05, + "loss": 0.1873, + "step": 4955 + }, + { + "epoch": 0.28, + "grad_norm": 0.326241133662786, + "learning_rate": 1.6785991597377772e-05, + "loss": 0.243, + "step": 4956 + }, + { + "epoch": 0.28, + "grad_norm": 0.5177649120868529, + "learning_rate": 1.6784624623200684e-05, + "loss": 0.4381, + "step": 4957 + }, + { + "epoch": 0.28, + "grad_norm": 1.240446473986299, + "learning_rate": 1.6783257414072336e-05, + "loss": 0.6064, + "step": 4958 + }, + { + "epoch": 0.28, + "grad_norm": 0.33109360065083926, + "learning_rate": 1.678188997004008e-05, + "loss": 0.2334, + "step": 4959 + }, + { + "epoch": 0.28, + "grad_norm": 0.39985346698702146, + "learning_rate": 1.678052229115126e-05, + "loss": 0.3504, + "step": 4960 + }, + { + "epoch": 0.29, + "grad_norm": 0.5592780579901474, + "learning_rate": 1.6779154377453248e-05, + "loss": 0.1822, + "step": 4961 + }, + { + "epoch": 0.29, + "grad_norm": 0.3496638175036046, + "learning_rate": 1.677778622899341e-05, + "loss": 0.2214, + "step": 4962 + }, + { + "epoch": 0.29, + "grad_norm": 1.0124778899205107, + "learning_rate": 1.677641784581913e-05, + "loss": 0.6795, + "step": 4963 + }, + { + "epoch": 0.29, + "grad_norm": 0.41286864504725357, + "learning_rate": 1.6775049227977788e-05, + "loss": 0.3245, + "step": 4964 + }, + { + "epoch": 0.29, + "grad_norm": 0.37057078830092144, + "learning_rate": 1.6773680375516784e-05, + "loss": 0.2586, + "step": 4965 + }, + { + "epoch": 0.29, + "grad_norm": 0.6333735841850971, + "learning_rate": 1.6772311288483517e-05, + "loss": 0.4853, + "step": 4966 + }, + { + "epoch": 0.29, + "grad_norm": 0.4697963107259023, + "learning_rate": 1.6770941966925404e-05, + "loss": 0.2266, + "step": 4967 + }, + { + "epoch": 0.29, + "grad_norm": 0.33424175052387356, + "learning_rate": 1.676957241088986e-05, + "loss": 0.2609, + "step": 4968 + }, + { + "epoch": 0.29, + "grad_norm": 0.485965282810177, + "learning_rate": 1.6768202620424317e-05, + "loss": 0.2656, + "step": 4969 + }, + { + "epoch": 0.29, + "grad_norm": 0.6859894051180101, + "learning_rate": 1.6766832595576205e-05, + "loss": 0.4445, + "step": 4970 + }, + { + "epoch": 0.29, + "grad_norm": 0.41946713395899093, + "learning_rate": 1.6765462336392975e-05, + "loss": 0.2915, + "step": 4971 + }, + { + "epoch": 0.29, + "grad_norm": 0.29292304755912535, + "learning_rate": 1.6764091842922076e-05, + "loss": 0.2806, + "step": 4972 + }, + { + "epoch": 0.29, + "grad_norm": 0.41222303207216854, + "learning_rate": 1.6762721115210964e-05, + "loss": 0.3055, + "step": 4973 + }, + { + "epoch": 0.29, + "grad_norm": 0.420655109877888, + "learning_rate": 1.676135015330711e-05, + "loss": 0.308, + "step": 4974 + }, + { + "epoch": 0.29, + "grad_norm": 0.29319502490689703, + "learning_rate": 1.675997895725799e-05, + "loss": 0.1811, + "step": 4975 + }, + { + "epoch": 0.29, + "grad_norm": 0.4219624190782678, + "learning_rate": 1.675860752711109e-05, + "loss": 0.33, + "step": 4976 + }, + { + "epoch": 0.29, + "grad_norm": 0.3887698623250924, + "learning_rate": 1.675723586291391e-05, + "loss": 0.2888, + "step": 4977 + }, + { + "epoch": 0.29, + "grad_norm": 0.6488987519940098, + "learning_rate": 1.6755863964713934e-05, + "loss": 0.3712, + "step": 4978 + }, + { + "epoch": 0.29, + "grad_norm": 0.42002949206495394, + "learning_rate": 1.675449183255868e-05, + "loss": 0.3128, + "step": 4979 + }, + { + "epoch": 0.29, + "grad_norm": 0.33873593445127587, + "learning_rate": 1.6753119466495667e-05, + "loss": 0.2737, + "step": 4980 + }, + { + "epoch": 0.29, + "grad_norm": 0.334940371790023, + "learning_rate": 1.6751746866572415e-05, + "loss": 0.2823, + "step": 4981 + }, + { + "epoch": 0.29, + "grad_norm": 0.9608719470015646, + "learning_rate": 1.6750374032836462e-05, + "loss": 0.3832, + "step": 4982 + }, + { + "epoch": 0.29, + "grad_norm": 0.33594948667433394, + "learning_rate": 1.6749000965335344e-05, + "loss": 0.2858, + "step": 4983 + }, + { + "epoch": 0.29, + "grad_norm": 0.3677272022726247, + "learning_rate": 1.6747627664116615e-05, + "loss": 0.3131, + "step": 4984 + }, + { + "epoch": 0.29, + "grad_norm": 0.8046638072008366, + "learning_rate": 1.674625412922783e-05, + "loss": 0.4111, + "step": 4985 + }, + { + "epoch": 0.29, + "grad_norm": 0.32686647655558276, + "learning_rate": 1.6744880360716555e-05, + "loss": 0.2732, + "step": 4986 + }, + { + "epoch": 0.29, + "grad_norm": 0.41757090613796255, + "learning_rate": 1.6743506358630358e-05, + "loss": 0.2837, + "step": 4987 + }, + { + "epoch": 0.29, + "grad_norm": 0.35935865239680986, + "learning_rate": 1.674213212301683e-05, + "loss": 0.268, + "step": 4988 + }, + { + "epoch": 0.29, + "grad_norm": 0.4316263565610041, + "learning_rate": 1.6740757653923557e-05, + "loss": 0.2988, + "step": 4989 + }, + { + "epoch": 0.29, + "grad_norm": 0.5887593298772523, + "learning_rate": 1.673938295139814e-05, + "loss": 0.3779, + "step": 4990 + }, + { + "epoch": 0.29, + "grad_norm": 0.5006344489216323, + "learning_rate": 1.6738008015488174e-05, + "loss": 0.3939, + "step": 4991 + }, + { + "epoch": 0.29, + "grad_norm": 0.32402544400838923, + "learning_rate": 1.6736632846241282e-05, + "loss": 0.2105, + "step": 4992 + }, + { + "epoch": 0.29, + "grad_norm": 0.3172635891197995, + "learning_rate": 1.6735257443705085e-05, + "loss": 0.2832, + "step": 4993 + }, + { + "epoch": 0.29, + "grad_norm": 0.3766031153954571, + "learning_rate": 1.6733881807927212e-05, + "loss": 0.3244, + "step": 4994 + }, + { + "epoch": 0.29, + "grad_norm": 0.33773156689449546, + "learning_rate": 1.67325059389553e-05, + "loss": 0.1948, + "step": 4995 + }, + { + "epoch": 0.29, + "grad_norm": 0.4008554402803368, + "learning_rate": 1.6731129836837e-05, + "loss": 0.3248, + "step": 4996 + }, + { + "epoch": 0.29, + "grad_norm": 1.4279087807348623, + "learning_rate": 1.672975350161996e-05, + "loss": 0.7799, + "step": 4997 + }, + { + "epoch": 0.29, + "grad_norm": 0.32856664750277276, + "learning_rate": 1.672837693335185e-05, + "loss": 0.2259, + "step": 4998 + }, + { + "epoch": 0.29, + "grad_norm": 0.3994700065203357, + "learning_rate": 1.6727000132080332e-05, + "loss": 0.3668, + "step": 4999 + }, + { + "epoch": 0.29, + "grad_norm": 0.4195276798735785, + "learning_rate": 1.6725623097853092e-05, + "loss": 0.3758, + "step": 5000 + }, + { + "epoch": 0.29, + "grad_norm": 0.19978298158962446, + "learning_rate": 1.672424583071781e-05, + "loss": 0.1467, + "step": 5001 + }, + { + "epoch": 0.29, + "grad_norm": 1.198150816066066, + "learning_rate": 1.672286833072219e-05, + "loss": 0.6234, + "step": 5002 + }, + { + "epoch": 0.29, + "grad_norm": 0.45414769727547355, + "learning_rate": 1.6721490597913922e-05, + "loss": 0.3633, + "step": 5003 + }, + { + "epoch": 0.29, + "grad_norm": 0.38110366694566206, + "learning_rate": 1.672011263234073e-05, + "loss": 0.2942, + "step": 5004 + }, + { + "epoch": 0.29, + "grad_norm": 0.43945156180002193, + "learning_rate": 1.671873443405032e-05, + "loss": 0.3225, + "step": 5005 + }, + { + "epoch": 0.29, + "grad_norm": 0.44775285754769784, + "learning_rate": 1.671735600309043e-05, + "loss": 0.3277, + "step": 5006 + }, + { + "epoch": 0.29, + "grad_norm": 0.31152176309096835, + "learning_rate": 1.6715977339508787e-05, + "loss": 0.2134, + "step": 5007 + }, + { + "epoch": 0.29, + "grad_norm": 0.335916270130865, + "learning_rate": 1.6714598443353137e-05, + "loss": 0.2523, + "step": 5008 + }, + { + "epoch": 0.29, + "grad_norm": 1.2214895870371527, + "learning_rate": 1.6713219314671235e-05, + "loss": 0.7403, + "step": 5009 + }, + { + "epoch": 0.29, + "grad_norm": 0.5000389590232583, + "learning_rate": 1.6711839953510835e-05, + "loss": 0.3313, + "step": 5010 + }, + { + "epoch": 0.29, + "grad_norm": 0.371776258828232, + "learning_rate": 1.6710460359919708e-05, + "loss": 0.2937, + "step": 5011 + }, + { + "epoch": 0.29, + "grad_norm": 0.3811309362080807, + "learning_rate": 1.6709080533945624e-05, + "loss": 0.3059, + "step": 5012 + }, + { + "epoch": 0.29, + "grad_norm": 0.3476381176969048, + "learning_rate": 1.6707700475636372e-05, + "loss": 0.1721, + "step": 5013 + }, + { + "epoch": 0.29, + "grad_norm": 0.41573768957075496, + "learning_rate": 1.6706320185039742e-05, + "loss": 0.2663, + "step": 5014 + }, + { + "epoch": 0.29, + "grad_norm": 0.5511130497075707, + "learning_rate": 1.670493966220353e-05, + "loss": 0.3951, + "step": 5015 + }, + { + "epoch": 0.29, + "grad_norm": 0.34083421472885944, + "learning_rate": 1.6703558907175548e-05, + "loss": 0.3184, + "step": 5016 + }, + { + "epoch": 0.29, + "grad_norm": 0.47507592384082353, + "learning_rate": 1.6702177920003607e-05, + "loss": 0.4127, + "step": 5017 + }, + { + "epoch": 0.29, + "grad_norm": 0.8809556008590094, + "learning_rate": 1.6700796700735534e-05, + "loss": 0.3604, + "step": 5018 + }, + { + "epoch": 0.29, + "grad_norm": 0.3058900980432858, + "learning_rate": 1.669941524941916e-05, + "loss": 0.1977, + "step": 5019 + }, + { + "epoch": 0.29, + "grad_norm": 0.4092102952384807, + "learning_rate": 1.6698033566102322e-05, + "loss": 0.2895, + "step": 5020 + }, + { + "epoch": 0.29, + "grad_norm": 0.7002227774841857, + "learning_rate": 1.6696651650832874e-05, + "loss": 0.3808, + "step": 5021 + }, + { + "epoch": 0.29, + "grad_norm": 0.39936961207401184, + "learning_rate": 1.6695269503658663e-05, + "loss": 0.339, + "step": 5022 + }, + { + "epoch": 0.29, + "grad_norm": 0.5260095728189574, + "learning_rate": 1.6693887124627556e-05, + "loss": 0.3984, + "step": 5023 + }, + { + "epoch": 0.29, + "grad_norm": 0.41866801996607933, + "learning_rate": 1.6692504513787432e-05, + "loss": 0.2768, + "step": 5024 + }, + { + "epoch": 0.29, + "grad_norm": 0.2600843617318522, + "learning_rate": 1.669112167118616e-05, + "loss": 0.1399, + "step": 5025 + }, + { + "epoch": 0.29, + "grad_norm": 0.44756851536480885, + "learning_rate": 1.668973859687163e-05, + "loss": 0.3325, + "step": 5026 + }, + { + "epoch": 0.29, + "grad_norm": 0.42835429389794066, + "learning_rate": 1.6688355290891746e-05, + "loss": 0.308, + "step": 5027 + }, + { + "epoch": 0.29, + "grad_norm": 0.5913982247983119, + "learning_rate": 1.66869717532944e-05, + "loss": 0.3365, + "step": 5028 + }, + { + "epoch": 0.29, + "grad_norm": 0.40328321757099445, + "learning_rate": 1.6685587984127513e-05, + "loss": 0.3689, + "step": 5029 + }, + { + "epoch": 0.29, + "grad_norm": 1.0759732739079926, + "learning_rate": 1.6684203983439e-05, + "loss": 0.606, + "step": 5030 + }, + { + "epoch": 0.29, + "grad_norm": 0.2783514820703303, + "learning_rate": 1.668281975127679e-05, + "loss": 0.1537, + "step": 5031 + }, + { + "epoch": 0.29, + "grad_norm": 0.33181140217007127, + "learning_rate": 1.6681435287688823e-05, + "loss": 0.2664, + "step": 5032 + }, + { + "epoch": 0.29, + "grad_norm": 0.8302507762797569, + "learning_rate": 1.6680050592723038e-05, + "loss": 0.5131, + "step": 5033 + }, + { + "epoch": 0.29, + "grad_norm": 0.6098011206817671, + "learning_rate": 1.6678665666427387e-05, + "loss": 0.3049, + "step": 5034 + }, + { + "epoch": 0.29, + "grad_norm": 0.35164039268183567, + "learning_rate": 1.6677280508849828e-05, + "loss": 0.3084, + "step": 5035 + }, + { + "epoch": 0.29, + "grad_norm": 0.951993401806805, + "learning_rate": 1.667589512003834e-05, + "loss": 0.6228, + "step": 5036 + }, + { + "epoch": 0.29, + "grad_norm": 0.34895682350666707, + "learning_rate": 1.6674509500040885e-05, + "loss": 0.2188, + "step": 5037 + }, + { + "epoch": 0.29, + "grad_norm": 0.3204560482309398, + "learning_rate": 1.6673123648905454e-05, + "loss": 0.2321, + "step": 5038 + }, + { + "epoch": 0.29, + "grad_norm": 0.4577995928154056, + "learning_rate": 1.667173756668004e-05, + "loss": 0.337, + "step": 5039 + }, + { + "epoch": 0.29, + "grad_norm": 0.3938689527075941, + "learning_rate": 1.667035125341264e-05, + "loss": 0.2516, + "step": 5040 + }, + { + "epoch": 0.29, + "grad_norm": 0.572570413679517, + "learning_rate": 1.6668964709151265e-05, + "loss": 0.4199, + "step": 5041 + }, + { + "epoch": 0.29, + "grad_norm": 1.4299646855354586, + "learning_rate": 1.6667577933943925e-05, + "loss": 0.7393, + "step": 5042 + }, + { + "epoch": 0.29, + "grad_norm": 0.33121075066613553, + "learning_rate": 1.666619092783865e-05, + "loss": 0.3047, + "step": 5043 + }, + { + "epoch": 0.29, + "grad_norm": 0.24035738339207335, + "learning_rate": 1.6664803690883465e-05, + "loss": 0.1529, + "step": 5044 + }, + { + "epoch": 0.29, + "grad_norm": 0.5652684265469679, + "learning_rate": 1.666341622312642e-05, + "loss": 0.4583, + "step": 5045 + }, + { + "epoch": 0.29, + "grad_norm": 0.8551662769154373, + "learning_rate": 1.666202852461556e-05, + "loss": 0.5423, + "step": 5046 + }, + { + "epoch": 0.29, + "grad_norm": 0.3637742889707305, + "learning_rate": 1.6660640595398934e-05, + "loss": 0.2563, + "step": 5047 + }, + { + "epoch": 0.29, + "grad_norm": 0.49084959630933184, + "learning_rate": 1.6659252435524613e-05, + "loss": 0.3588, + "step": 5048 + }, + { + "epoch": 0.29, + "grad_norm": 0.9077022712537721, + "learning_rate": 1.6657864045040665e-05, + "loss": 0.4459, + "step": 5049 + }, + { + "epoch": 0.29, + "grad_norm": 0.44606011433230913, + "learning_rate": 1.665647542399517e-05, + "loss": 0.2837, + "step": 5050 + }, + { + "epoch": 0.29, + "grad_norm": 0.40943914248299257, + "learning_rate": 1.665508657243622e-05, + "loss": 0.3512, + "step": 5051 + }, + { + "epoch": 0.29, + "grad_norm": 0.30297531819075785, + "learning_rate": 1.6653697490411905e-05, + "loss": 0.187, + "step": 5052 + }, + { + "epoch": 0.29, + "grad_norm": 0.3876810435050498, + "learning_rate": 1.6652308177970335e-05, + "loss": 0.2396, + "step": 5053 + }, + { + "epoch": 0.29, + "grad_norm": 1.1355168231759345, + "learning_rate": 1.665091863515962e-05, + "loss": 0.6173, + "step": 5054 + }, + { + "epoch": 0.29, + "grad_norm": 0.3430108180818135, + "learning_rate": 1.6649528862027877e-05, + "loss": 0.3208, + "step": 5055 + }, + { + "epoch": 0.29, + "grad_norm": 0.3775947002380297, + "learning_rate": 1.6648138858623236e-05, + "loss": 0.3046, + "step": 5056 + }, + { + "epoch": 0.29, + "grad_norm": 0.3376050104963141, + "learning_rate": 1.664674862499383e-05, + "loss": 0.1998, + "step": 5057 + }, + { + "epoch": 0.29, + "grad_norm": 0.5965276651128877, + "learning_rate": 1.664535816118781e-05, + "loss": 0.4102, + "step": 5058 + }, + { + "epoch": 0.29, + "grad_norm": 0.4126818965088, + "learning_rate": 1.6643967467253316e-05, + "loss": 0.3416, + "step": 5059 + }, + { + "epoch": 0.29, + "grad_norm": 0.35265629901301665, + "learning_rate": 1.664257654323852e-05, + "loss": 0.282, + "step": 5060 + }, + { + "epoch": 0.29, + "grad_norm": 0.46529035889049486, + "learning_rate": 1.664118538919158e-05, + "loss": 0.3418, + "step": 5061 + }, + { + "epoch": 0.29, + "grad_norm": 0.386042563376412, + "learning_rate": 1.6639794005160677e-05, + "loss": 0.3206, + "step": 5062 + }, + { + "epoch": 0.29, + "grad_norm": 0.347541240252572, + "learning_rate": 1.6638402391193993e-05, + "loss": 0.234, + "step": 5063 + }, + { + "epoch": 0.29, + "grad_norm": 1.3657479293256798, + "learning_rate": 1.663701054733972e-05, + "loss": 0.8305, + "step": 5064 + }, + { + "epoch": 0.29, + "grad_norm": 0.35412391052650544, + "learning_rate": 1.6635618473646058e-05, + "loss": 0.2778, + "step": 5065 + }, + { + "epoch": 0.29, + "grad_norm": 0.32667171769057474, + "learning_rate": 1.6634226170161213e-05, + "loss": 0.2391, + "step": 5066 + }, + { + "epoch": 0.29, + "grad_norm": 0.43478785542869364, + "learning_rate": 1.66328336369334e-05, + "loss": 0.3431, + "step": 5067 + }, + { + "epoch": 0.29, + "grad_norm": 0.33734429475445615, + "learning_rate": 1.6631440874010842e-05, + "loss": 0.3147, + "step": 5068 + }, + { + "epoch": 0.29, + "grad_norm": 0.8787246640463405, + "learning_rate": 1.6630047881441774e-05, + "loss": 0.6012, + "step": 5069 + }, + { + "epoch": 0.29, + "grad_norm": 0.49363055525613264, + "learning_rate": 1.6628654659274433e-05, + "loss": 0.3192, + "step": 5070 + }, + { + "epoch": 0.29, + "grad_norm": 0.30082111207502216, + "learning_rate": 1.6627261207557068e-05, + "loss": 0.2643, + "step": 5071 + }, + { + "epoch": 0.29, + "grad_norm": 0.26296980966438765, + "learning_rate": 1.6625867526337928e-05, + "loss": 0.1485, + "step": 5072 + }, + { + "epoch": 0.29, + "grad_norm": 0.6053081587769884, + "learning_rate": 1.662447361566528e-05, + "loss": 0.3424, + "step": 5073 + }, + { + "epoch": 0.29, + "grad_norm": 0.41307754279842807, + "learning_rate": 1.6623079475587403e-05, + "loss": 0.3124, + "step": 5074 + }, + { + "epoch": 0.29, + "grad_norm": 0.40972224319212935, + "learning_rate": 1.6621685106152564e-05, + "loss": 0.3288, + "step": 5075 + }, + { + "epoch": 0.29, + "grad_norm": 0.49432112174734544, + "learning_rate": 1.6620290507409053e-05, + "loss": 0.3193, + "step": 5076 + }, + { + "epoch": 0.29, + "grad_norm": 0.380041557575946, + "learning_rate": 1.6618895679405165e-05, + "loss": 0.283, + "step": 5077 + }, + { + "epoch": 0.29, + "grad_norm": 0.35160063555106125, + "learning_rate": 1.6617500622189208e-05, + "loss": 0.246, + "step": 5078 + }, + { + "epoch": 0.29, + "grad_norm": 0.3811836486280641, + "learning_rate": 1.6616105335809487e-05, + "loss": 0.3454, + "step": 5079 + }, + { + "epoch": 0.29, + "grad_norm": 0.33824258296520193, + "learning_rate": 1.6614709820314323e-05, + "loss": 0.2238, + "step": 5080 + }, + { + "epoch": 0.29, + "grad_norm": 0.8228245625272075, + "learning_rate": 1.6613314075752044e-05, + "loss": 0.5268, + "step": 5081 + }, + { + "epoch": 0.29, + "grad_norm": 1.1414700434465859, + "learning_rate": 1.661191810217098e-05, + "loss": 0.7977, + "step": 5082 + }, + { + "epoch": 0.29, + "grad_norm": 0.294198134495597, + "learning_rate": 1.661052189961948e-05, + "loss": 0.2376, + "step": 5083 + }, + { + "epoch": 0.29, + "grad_norm": 0.46694318435537846, + "learning_rate": 1.660912546814589e-05, + "loss": 0.3321, + "step": 5084 + }, + { + "epoch": 0.29, + "grad_norm": 0.40253889624722783, + "learning_rate": 1.6607728807798568e-05, + "loss": 0.2855, + "step": 5085 + }, + { + "epoch": 0.29, + "grad_norm": 0.3838630594646114, + "learning_rate": 1.660633191862588e-05, + "loss": 0.2163, + "step": 5086 + }, + { + "epoch": 0.29, + "grad_norm": 0.4452069568209437, + "learning_rate": 1.6604934800676207e-05, + "loss": 0.3252, + "step": 5087 + }, + { + "epoch": 0.29, + "grad_norm": 1.205226316107024, + "learning_rate": 1.660353745399792e-05, + "loss": 0.7247, + "step": 5088 + }, + { + "epoch": 0.29, + "grad_norm": 0.32386036990488415, + "learning_rate": 1.6602139878639417e-05, + "loss": 0.2436, + "step": 5089 + }, + { + "epoch": 0.29, + "grad_norm": 0.7146860073301247, + "learning_rate": 1.6600742074649095e-05, + "loss": 0.4543, + "step": 5090 + }, + { + "epoch": 0.29, + "grad_norm": 0.2521102701136926, + "learning_rate": 1.659934404207536e-05, + "loss": 0.2253, + "step": 5091 + }, + { + "epoch": 0.29, + "grad_norm": 0.3929148102982469, + "learning_rate": 1.6597945780966626e-05, + "loss": 0.2923, + "step": 5092 + }, + { + "epoch": 0.29, + "grad_norm": 0.7744747631107772, + "learning_rate": 1.659654729137131e-05, + "loss": 0.3298, + "step": 5093 + }, + { + "epoch": 0.29, + "grad_norm": 0.385373842207475, + "learning_rate": 1.6595148573337843e-05, + "loss": 0.3322, + "step": 5094 + }, + { + "epoch": 0.29, + "grad_norm": 0.44188848071352976, + "learning_rate": 1.6593749626914665e-05, + "loss": 0.3031, + "step": 5095 + }, + { + "epoch": 0.29, + "grad_norm": 0.596587404876109, + "learning_rate": 1.6592350452150223e-05, + "loss": 0.34, + "step": 5096 + }, + { + "epoch": 0.29, + "grad_norm": 0.2930106307014018, + "learning_rate": 1.6590951049092966e-05, + "loss": 0.1554, + "step": 5097 + }, + { + "epoch": 0.29, + "grad_norm": 0.45593236462612596, + "learning_rate": 1.658955141779136e-05, + "loss": 0.3053, + "step": 5098 + }, + { + "epoch": 0.29, + "grad_norm": 0.5457068512880814, + "learning_rate": 1.6588151558293874e-05, + "loss": 0.2644, + "step": 5099 + }, + { + "epoch": 0.29, + "grad_norm": 1.2365592496960516, + "learning_rate": 1.658675147064898e-05, + "loss": 0.8412, + "step": 5100 + }, + { + "epoch": 0.29, + "grad_norm": 0.4162610077770308, + "learning_rate": 1.6585351154905163e-05, + "loss": 0.2924, + "step": 5101 + }, + { + "epoch": 0.29, + "grad_norm": 0.4220963462240051, + "learning_rate": 1.6583950611110923e-05, + "loss": 0.3159, + "step": 5102 + }, + { + "epoch": 0.29, + "grad_norm": 0.44314877537383723, + "learning_rate": 1.6582549839314756e-05, + "loss": 0.2598, + "step": 5103 + }, + { + "epoch": 0.29, + "grad_norm": 0.2803168276990925, + "learning_rate": 1.658114883956517e-05, + "loss": 0.2255, + "step": 5104 + }, + { + "epoch": 0.29, + "grad_norm": 0.9679407388981311, + "learning_rate": 1.6579747611910684e-05, + "loss": 0.557, + "step": 5105 + }, + { + "epoch": 0.29, + "grad_norm": 0.7293098670982834, + "learning_rate": 1.657834615639982e-05, + "loss": 0.2901, + "step": 5106 + }, + { + "epoch": 0.29, + "grad_norm": 0.3672744421536344, + "learning_rate": 1.6576944473081112e-05, + "loss": 0.2756, + "step": 5107 + }, + { + "epoch": 0.29, + "grad_norm": 0.8130875641546872, + "learning_rate": 1.6575542562003097e-05, + "loss": 0.5067, + "step": 5108 + }, + { + "epoch": 0.29, + "grad_norm": 0.2011438979448081, + "learning_rate": 1.6574140423214327e-05, + "loss": 0.1124, + "step": 5109 + }, + { + "epoch": 0.29, + "grad_norm": 0.38553771059667385, + "learning_rate": 1.657273805676336e-05, + "loss": 0.2765, + "step": 5110 + }, + { + "epoch": 0.29, + "grad_norm": 0.4494145707717537, + "learning_rate": 1.6571335462698755e-05, + "loss": 0.3405, + "step": 5111 + }, + { + "epoch": 0.29, + "grad_norm": 1.1128740224578688, + "learning_rate": 1.6569932641069083e-05, + "loss": 0.4035, + "step": 5112 + }, + { + "epoch": 0.29, + "grad_norm": 0.40172625854201377, + "learning_rate": 1.656852959192293e-05, + "loss": 0.325, + "step": 5113 + }, + { + "epoch": 0.29, + "grad_norm": 0.4042048518785761, + "learning_rate": 1.656712631530888e-05, + "loss": 0.3685, + "step": 5114 + }, + { + "epoch": 0.29, + "grad_norm": 0.25272424394886184, + "learning_rate": 1.6565722811275526e-05, + "loss": 0.1761, + "step": 5115 + }, + { + "epoch": 0.29, + "grad_norm": 0.3961321724738942, + "learning_rate": 1.6564319079871472e-05, + "loss": 0.2789, + "step": 5116 + }, + { + "epoch": 0.29, + "grad_norm": 0.5680165426058105, + "learning_rate": 1.656291512114533e-05, + "loss": 0.3786, + "step": 5117 + }, + { + "epoch": 0.29, + "grad_norm": 0.5206323825700394, + "learning_rate": 1.656151093514572e-05, + "loss": 0.3638, + "step": 5118 + }, + { + "epoch": 0.29, + "grad_norm": 0.3543282139228944, + "learning_rate": 1.6560106521921272e-05, + "loss": 0.248, + "step": 5119 + }, + { + "epoch": 0.29, + "grad_norm": 0.9717910872721999, + "learning_rate": 1.6558701881520616e-05, + "loss": 0.6354, + "step": 5120 + }, + { + "epoch": 0.29, + "grad_norm": 0.4011467519529105, + "learning_rate": 1.6557297013992395e-05, + "loss": 0.2849, + "step": 5121 + }, + { + "epoch": 0.29, + "grad_norm": 0.2840900794225836, + "learning_rate": 1.6555891919385262e-05, + "loss": 0.1773, + "step": 5122 + }, + { + "epoch": 0.29, + "grad_norm": 0.4765438374827972, + "learning_rate": 1.655448659774787e-05, + "loss": 0.3306, + "step": 5123 + }, + { + "epoch": 0.29, + "grad_norm": 1.0038008012167339, + "learning_rate": 1.6553081049128894e-05, + "loss": 0.4335, + "step": 5124 + }, + { + "epoch": 0.29, + "grad_norm": 0.3737795700734047, + "learning_rate": 1.6551675273577e-05, + "loss": 0.2289, + "step": 5125 + }, + { + "epoch": 0.29, + "grad_norm": 1.0662857043698486, + "learning_rate": 1.6550269271140872e-05, + "loss": 0.6902, + "step": 5126 + }, + { + "epoch": 0.29, + "grad_norm": 0.3832966114148078, + "learning_rate": 1.6548863041869203e-05, + "loss": 0.3521, + "step": 5127 + }, + { + "epoch": 0.29, + "grad_norm": 0.2462215752704896, + "learning_rate": 1.6547456585810687e-05, + "loss": 0.1567, + "step": 5128 + }, + { + "epoch": 0.29, + "grad_norm": 0.9104250748758096, + "learning_rate": 1.6546049903014034e-05, + "loss": 0.5161, + "step": 5129 + }, + { + "epoch": 0.29, + "grad_norm": 0.39445391596511015, + "learning_rate": 1.6544642993527952e-05, + "loss": 0.3519, + "step": 5130 + }, + { + "epoch": 0.29, + "grad_norm": 0.4248670156930363, + "learning_rate": 1.6543235857401163e-05, + "loss": 0.3151, + "step": 5131 + }, + { + "epoch": 0.29, + "grad_norm": 0.5492970992711453, + "learning_rate": 1.6541828494682398e-05, + "loss": 0.3257, + "step": 5132 + }, + { + "epoch": 0.29, + "grad_norm": 0.4673011597855598, + "learning_rate": 1.6540420905420395e-05, + "loss": 0.2742, + "step": 5133 + }, + { + "epoch": 0.29, + "grad_norm": 0.39281271146739444, + "learning_rate": 1.6539013089663897e-05, + "loss": 0.3005, + "step": 5134 + }, + { + "epoch": 0.3, + "grad_norm": 0.4087028084684856, + "learning_rate": 1.6537605047461654e-05, + "loss": 0.2509, + "step": 5135 + }, + { + "epoch": 0.3, + "grad_norm": 0.8586628812462002, + "learning_rate": 1.6536196778862433e-05, + "loss": 0.5085, + "step": 5136 + }, + { + "epoch": 0.3, + "grad_norm": 0.5709002766581852, + "learning_rate": 1.6534788283915e-05, + "loss": 0.3931, + "step": 5137 + }, + { + "epoch": 0.3, + "grad_norm": 0.3001544493179158, + "learning_rate": 1.6533379562668126e-05, + "loss": 0.261, + "step": 5138 + }, + { + "epoch": 0.3, + "grad_norm": 1.2209468608505851, + "learning_rate": 1.6531970615170602e-05, + "loss": 0.7915, + "step": 5139 + }, + { + "epoch": 0.3, + "grad_norm": 0.29863282667681906, + "learning_rate": 1.6530561441471215e-05, + "loss": 0.2372, + "step": 5140 + }, + { + "epoch": 0.3, + "grad_norm": 0.6536704998804858, + "learning_rate": 1.6529152041618767e-05, + "loss": 0.3358, + "step": 5141 + }, + { + "epoch": 0.3, + "grad_norm": 0.2971516691014197, + "learning_rate": 1.652774241566206e-05, + "loss": 0.2563, + "step": 5142 + }, + { + "epoch": 0.3, + "grad_norm": 0.4049693944771069, + "learning_rate": 1.652633256364992e-05, + "loss": 0.3354, + "step": 5143 + }, + { + "epoch": 0.3, + "grad_norm": 0.569984221282444, + "learning_rate": 1.652492248563116e-05, + "loss": 0.452, + "step": 5144 + }, + { + "epoch": 0.3, + "grad_norm": 0.6023169787669885, + "learning_rate": 1.6523512181654616e-05, + "loss": 0.379, + "step": 5145 + }, + { + "epoch": 0.3, + "grad_norm": 0.3194650614374405, + "learning_rate": 1.6522101651769124e-05, + "loss": 0.2582, + "step": 5146 + }, + { + "epoch": 0.3, + "grad_norm": 0.6151729058432179, + "learning_rate": 1.6520690896023536e-05, + "loss": 0.3939, + "step": 5147 + }, + { + "epoch": 0.3, + "grad_norm": 0.2655700172516779, + "learning_rate": 1.6519279914466703e-05, + "loss": 0.1816, + "step": 5148 + }, + { + "epoch": 0.3, + "grad_norm": 0.8313908290203407, + "learning_rate": 1.6517868707147484e-05, + "loss": 0.461, + "step": 5149 + }, + { + "epoch": 0.3, + "grad_norm": 0.40243904423512905, + "learning_rate": 1.651645727411475e-05, + "loss": 0.3019, + "step": 5150 + }, + { + "epoch": 0.3, + "grad_norm": 0.48595523304443944, + "learning_rate": 1.6515045615417385e-05, + "loss": 0.2973, + "step": 5151 + }, + { + "epoch": 0.3, + "grad_norm": 0.6055444378343406, + "learning_rate": 1.6513633731104268e-05, + "loss": 0.3708, + "step": 5152 + }, + { + "epoch": 0.3, + "grad_norm": 0.4178836687928149, + "learning_rate": 1.6512221621224296e-05, + "loss": 0.336, + "step": 5153 + }, + { + "epoch": 0.3, + "grad_norm": 0.2420107469251138, + "learning_rate": 1.651080928582637e-05, + "loss": 0.2302, + "step": 5154 + }, + { + "epoch": 0.3, + "grad_norm": 0.2784278043315081, + "learning_rate": 1.6509396724959396e-05, + "loss": 0.1629, + "step": 5155 + }, + { + "epoch": 0.3, + "grad_norm": 0.4202735086060693, + "learning_rate": 1.6507983938672295e-05, + "loss": 0.3557, + "step": 5156 + }, + { + "epoch": 0.3, + "grad_norm": 0.7828694849798539, + "learning_rate": 1.650657092701399e-05, + "loss": 0.5474, + "step": 5157 + }, + { + "epoch": 0.3, + "grad_norm": 0.30049067721571665, + "learning_rate": 1.6505157690033417e-05, + "loss": 0.2597, + "step": 5158 + }, + { + "epoch": 0.3, + "grad_norm": 0.3824140483311469, + "learning_rate": 1.650374422777951e-05, + "loss": 0.3437, + "step": 5159 + }, + { + "epoch": 0.3, + "grad_norm": 0.42131426536676336, + "learning_rate": 1.6502330540301217e-05, + "loss": 0.266, + "step": 5160 + }, + { + "epoch": 0.3, + "grad_norm": 0.337804821268556, + "learning_rate": 1.6500916627647498e-05, + "loss": 0.224, + "step": 5161 + }, + { + "epoch": 0.3, + "grad_norm": 0.30612728307066595, + "learning_rate": 1.6499502489867318e-05, + "loss": 0.2456, + "step": 5162 + }, + { + "epoch": 0.3, + "grad_norm": 0.5060885778707491, + "learning_rate": 1.6498088127009647e-05, + "loss": 0.4094, + "step": 5163 + }, + { + "epoch": 0.3, + "grad_norm": 0.34297861726048556, + "learning_rate": 1.649667353912346e-05, + "loss": 0.2678, + "step": 5164 + }, + { + "epoch": 0.3, + "grad_norm": 0.5259607161237364, + "learning_rate": 1.649525872625775e-05, + "loss": 0.3798, + "step": 5165 + }, + { + "epoch": 0.3, + "grad_norm": 0.3577087368747908, + "learning_rate": 1.649384368846151e-05, + "loss": 0.3183, + "step": 5166 + }, + { + "epoch": 0.3, + "grad_norm": 0.8135960641640962, + "learning_rate": 1.649242842578374e-05, + "loss": 0.504, + "step": 5167 + }, + { + "epoch": 0.3, + "grad_norm": 0.22855591149860482, + "learning_rate": 1.6491012938273457e-05, + "loss": 0.1531, + "step": 5168 + }, + { + "epoch": 0.3, + "grad_norm": 0.3865190212108506, + "learning_rate": 1.6489597225979673e-05, + "loss": 0.3684, + "step": 5169 + }, + { + "epoch": 0.3, + "grad_norm": 0.45872722550496825, + "learning_rate": 1.6488181288951416e-05, + "loss": 0.4088, + "step": 5170 + }, + { + "epoch": 0.3, + "grad_norm": 0.2849367931072605, + "learning_rate": 1.648676512723772e-05, + "loss": 0.2268, + "step": 5171 + }, + { + "epoch": 0.3, + "grad_norm": 1.246603368276133, + "learning_rate": 1.6485348740887624e-05, + "loss": 0.7727, + "step": 5172 + }, + { + "epoch": 0.3, + "grad_norm": 0.3887406473005966, + "learning_rate": 1.6483932129950183e-05, + "loss": 0.2651, + "step": 5173 + }, + { + "epoch": 0.3, + "grad_norm": 0.24155302974837958, + "learning_rate": 1.648251529447445e-05, + "loss": 0.2227, + "step": 5174 + }, + { + "epoch": 0.3, + "grad_norm": 0.5747540021102897, + "learning_rate": 1.6481098234509493e-05, + "loss": 0.4604, + "step": 5175 + }, + { + "epoch": 0.3, + "grad_norm": 0.6065999774862946, + "learning_rate": 1.647968095010438e-05, + "loss": 0.4313, + "step": 5176 + }, + { + "epoch": 0.3, + "grad_norm": 0.3560299438233281, + "learning_rate": 1.6478263441308197e-05, + "loss": 0.2519, + "step": 5177 + }, + { + "epoch": 0.3, + "grad_norm": 0.4235898813948726, + "learning_rate": 1.6476845708170025e-05, + "loss": 0.3201, + "step": 5178 + }, + { + "epoch": 0.3, + "grad_norm": 0.7045373601265343, + "learning_rate": 1.647542775073897e-05, + "loss": 0.45, + "step": 5179 + }, + { + "epoch": 0.3, + "grad_norm": 0.41738227489163404, + "learning_rate": 1.647400956906413e-05, + "loss": 0.3245, + "step": 5180 + }, + { + "epoch": 0.3, + "grad_norm": 0.3797306952733853, + "learning_rate": 1.6472591163194613e-05, + "loss": 0.2625, + "step": 5181 + }, + { + "epoch": 0.3, + "grad_norm": 0.2729805774158222, + "learning_rate": 1.6471172533179545e-05, + "loss": 0.2373, + "step": 5182 + }, + { + "epoch": 0.3, + "grad_norm": 0.42957414092760055, + "learning_rate": 1.646975367906805e-05, + "loss": 0.3196, + "step": 5183 + }, + { + "epoch": 0.3, + "grad_norm": 1.040323100353576, + "learning_rate": 1.6468334600909265e-05, + "loss": 0.4741, + "step": 5184 + }, + { + "epoch": 0.3, + "grad_norm": 0.4346814146354387, + "learning_rate": 1.646691529875233e-05, + "loss": 0.3474, + "step": 5185 + }, + { + "epoch": 0.3, + "grad_norm": 0.36473624376665237, + "learning_rate": 1.6465495772646395e-05, + "loss": 0.3155, + "step": 5186 + }, + { + "epoch": 0.3, + "grad_norm": 0.3374164626344427, + "learning_rate": 1.646407602264062e-05, + "loss": 0.2212, + "step": 5187 + }, + { + "epoch": 0.3, + "grad_norm": 0.9076271617776354, + "learning_rate": 1.646265604878417e-05, + "loss": 0.5641, + "step": 5188 + }, + { + "epoch": 0.3, + "grad_norm": 0.34881563109518027, + "learning_rate": 1.6461235851126217e-05, + "loss": 0.2696, + "step": 5189 + }, + { + "epoch": 0.3, + "grad_norm": 0.4391047335953409, + "learning_rate": 1.6459815429715947e-05, + "loss": 0.2938, + "step": 5190 + }, + { + "epoch": 0.3, + "grad_norm": 0.6558038337089468, + "learning_rate": 1.6458394784602548e-05, + "loss": 0.4387, + "step": 5191 + }, + { + "epoch": 0.3, + "grad_norm": 0.3632607082060281, + "learning_rate": 1.6456973915835216e-05, + "loss": 0.3324, + "step": 5192 + }, + { + "epoch": 0.3, + "grad_norm": 0.8758092746992635, + "learning_rate": 1.645555282346315e-05, + "loss": 0.5511, + "step": 5193 + }, + { + "epoch": 0.3, + "grad_norm": 0.25760201740560185, + "learning_rate": 1.645413150753557e-05, + "loss": 0.1809, + "step": 5194 + }, + { + "epoch": 0.3, + "grad_norm": 0.37353304142667226, + "learning_rate": 1.645270996810169e-05, + "loss": 0.2963, + "step": 5195 + }, + { + "epoch": 0.3, + "grad_norm": 1.087573636596399, + "learning_rate": 1.645128820521075e-05, + "loss": 0.7903, + "step": 5196 + }, + { + "epoch": 0.3, + "grad_norm": 0.33864893891719766, + "learning_rate": 1.644986621891197e-05, + "loss": 0.281, + "step": 5197 + }, + { + "epoch": 0.3, + "grad_norm": 0.39698780680277895, + "learning_rate": 1.64484440092546e-05, + "loss": 0.2983, + "step": 5198 + }, + { + "epoch": 0.3, + "grad_norm": 0.8776485906722473, + "learning_rate": 1.6447021576287893e-05, + "loss": 0.5644, + "step": 5199 + }, + { + "epoch": 0.3, + "grad_norm": 0.18605182562974185, + "learning_rate": 1.6445598920061104e-05, + "loss": 0.1143, + "step": 5200 + }, + { + "epoch": 0.3, + "grad_norm": 0.3799839090356079, + "learning_rate": 1.6444176040623506e-05, + "loss": 0.2902, + "step": 5201 + }, + { + "epoch": 0.3, + "grad_norm": 0.3898545043893843, + "learning_rate": 1.6442752938024367e-05, + "loss": 0.3586, + "step": 5202 + }, + { + "epoch": 0.3, + "grad_norm": 0.7637000427048503, + "learning_rate": 1.644132961231297e-05, + "loss": 0.3395, + "step": 5203 + }, + { + "epoch": 0.3, + "grad_norm": 0.41850232975506924, + "learning_rate": 1.6439906063538602e-05, + "loss": 0.32, + "step": 5204 + }, + { + "epoch": 0.3, + "grad_norm": 0.5203465807048746, + "learning_rate": 1.6438482291750567e-05, + "loss": 0.4057, + "step": 5205 + }, + { + "epoch": 0.3, + "grad_norm": 0.2428540144139758, + "learning_rate": 1.6437058296998168e-05, + "loss": 0.1995, + "step": 5206 + }, + { + "epoch": 0.3, + "grad_norm": 0.3433948371999074, + "learning_rate": 1.643563407933072e-05, + "loss": 0.2046, + "step": 5207 + }, + { + "epoch": 0.3, + "grad_norm": 0.7815039800380226, + "learning_rate": 1.6434209638797535e-05, + "loss": 0.5257, + "step": 5208 + }, + { + "epoch": 0.3, + "grad_norm": 0.41070430570121524, + "learning_rate": 1.643278497544795e-05, + "loss": 0.3437, + "step": 5209 + }, + { + "epoch": 0.3, + "grad_norm": 0.3379383058006753, + "learning_rate": 1.6431360089331297e-05, + "loss": 0.2553, + "step": 5210 + }, + { + "epoch": 0.3, + "grad_norm": 0.943370279969184, + "learning_rate": 1.642993498049692e-05, + "loss": 0.6497, + "step": 5211 + }, + { + "epoch": 0.3, + "grad_norm": 0.2744195839372174, + "learning_rate": 1.6428509648994172e-05, + "loss": 0.1808, + "step": 5212 + }, + { + "epoch": 0.3, + "grad_norm": 0.2973940784991242, + "learning_rate": 1.6427084094872413e-05, + "loss": 0.2217, + "step": 5213 + }, + { + "epoch": 0.3, + "grad_norm": 0.5296153676122131, + "learning_rate": 1.6425658318181007e-05, + "loss": 0.4247, + "step": 5214 + }, + { + "epoch": 0.3, + "grad_norm": 0.7738880370457544, + "learning_rate": 1.6424232318969327e-05, + "loss": 0.4883, + "step": 5215 + }, + { + "epoch": 0.3, + "grad_norm": 0.3653662339834992, + "learning_rate": 1.642280609728676e-05, + "loss": 0.2146, + "step": 5216 + }, + { + "epoch": 0.3, + "grad_norm": 0.532355787670923, + "learning_rate": 1.6421379653182695e-05, + "loss": 0.3656, + "step": 5217 + }, + { + "epoch": 0.3, + "grad_norm": 0.2788563700674229, + "learning_rate": 1.6419952986706523e-05, + "loss": 0.2249, + "step": 5218 + }, + { + "epoch": 0.3, + "grad_norm": 0.42198391043135125, + "learning_rate": 1.641852609790766e-05, + "loss": 0.3224, + "step": 5219 + }, + { + "epoch": 0.3, + "grad_norm": 0.8746102140737043, + "learning_rate": 1.641709898683552e-05, + "loss": 0.3473, + "step": 5220 + }, + { + "epoch": 0.3, + "grad_norm": 0.4154761127237117, + "learning_rate": 1.641567165353951e-05, + "loss": 0.3684, + "step": 5221 + }, + { + "epoch": 0.3, + "grad_norm": 0.41962129121867997, + "learning_rate": 1.6414244098069068e-05, + "loss": 0.2641, + "step": 5222 + }, + { + "epoch": 0.3, + "grad_norm": 0.46239702845240743, + "learning_rate": 1.641281632047363e-05, + "loss": 0.2797, + "step": 5223 + }, + { + "epoch": 0.3, + "grad_norm": 0.34177914074807497, + "learning_rate": 1.6411388320802637e-05, + "loss": 0.2041, + "step": 5224 + }, + { + "epoch": 0.3, + "grad_norm": 0.3992637612794155, + "learning_rate": 1.6409960099105543e-05, + "loss": 0.2882, + "step": 5225 + }, + { + "epoch": 0.3, + "grad_norm": 0.7002086710277914, + "learning_rate": 1.6408531655431806e-05, + "loss": 0.3284, + "step": 5226 + }, + { + "epoch": 0.3, + "grad_norm": 1.3006242151336662, + "learning_rate": 1.6407102989830894e-05, + "loss": 0.7902, + "step": 5227 + }, + { + "epoch": 0.3, + "grad_norm": 0.4334420186583708, + "learning_rate": 1.640567410235228e-05, + "loss": 0.2615, + "step": 5228 + }, + { + "epoch": 0.3, + "grad_norm": 0.6111767264288636, + "learning_rate": 1.6404244993045447e-05, + "loss": 0.2422, + "step": 5229 + }, + { + "epoch": 0.3, + "grad_norm": 0.30863707616793146, + "learning_rate": 1.6402815661959886e-05, + "loss": 0.227, + "step": 5230 + }, + { + "epoch": 0.3, + "grad_norm": 0.38996995551842206, + "learning_rate": 1.6401386109145098e-05, + "loss": 0.291, + "step": 5231 + }, + { + "epoch": 0.3, + "grad_norm": 0.9864271562835062, + "learning_rate": 1.639995633465058e-05, + "loss": 0.4252, + "step": 5232 + }, + { + "epoch": 0.3, + "grad_norm": 0.5006342644476464, + "learning_rate": 1.6398526338525852e-05, + "loss": 0.3058, + "step": 5233 + }, + { + "epoch": 0.3, + "grad_norm": 0.3515919149907774, + "learning_rate": 1.639709612082043e-05, + "loss": 0.2666, + "step": 5234 + }, + { + "epoch": 0.3, + "grad_norm": 0.9812781991271189, + "learning_rate": 1.6395665681583842e-05, + "loss": 0.5665, + "step": 5235 + }, + { + "epoch": 0.3, + "grad_norm": 0.5594165984678707, + "learning_rate": 1.639423502086563e-05, + "loss": 0.3173, + "step": 5236 + }, + { + "epoch": 0.3, + "grad_norm": 0.4214927341840395, + "learning_rate": 1.6392804138715334e-05, + "loss": 0.28, + "step": 5237 + }, + { + "epoch": 0.3, + "grad_norm": 0.368125224933628, + "learning_rate": 1.6391373035182506e-05, + "loss": 0.296, + "step": 5238 + }, + { + "epoch": 0.3, + "grad_norm": 0.29769042288686703, + "learning_rate": 1.6389941710316703e-05, + "loss": 0.183, + "step": 5239 + }, + { + "epoch": 0.3, + "grad_norm": 0.42317068403499525, + "learning_rate": 1.6388510164167492e-05, + "loss": 0.2838, + "step": 5240 + }, + { + "epoch": 0.3, + "grad_norm": 0.38688120862100955, + "learning_rate": 1.6387078396784447e-05, + "loss": 0.317, + "step": 5241 + }, + { + "epoch": 0.3, + "grad_norm": 0.9506995811934351, + "learning_rate": 1.6385646408217158e-05, + "loss": 0.492, + "step": 5242 + }, + { + "epoch": 0.3, + "grad_norm": 0.3817544458955061, + "learning_rate": 1.638421419851521e-05, + "loss": 0.261, + "step": 5243 + }, + { + "epoch": 0.3, + "grad_norm": 0.3240206473375033, + "learning_rate": 1.638278176772819e-05, + "loss": 0.2811, + "step": 5244 + }, + { + "epoch": 0.3, + "grad_norm": 0.3847891589500509, + "learning_rate": 1.6381349115905718e-05, + "loss": 0.3365, + "step": 5245 + }, + { + "epoch": 0.3, + "grad_norm": 0.3515846322925854, + "learning_rate": 1.6379916243097398e-05, + "loss": 0.2199, + "step": 5246 + }, + { + "epoch": 0.3, + "grad_norm": 0.4147502570911691, + "learning_rate": 1.6378483149352857e-05, + "loss": 0.2787, + "step": 5247 + }, + { + "epoch": 0.3, + "grad_norm": 0.7085441353078938, + "learning_rate": 1.6377049834721713e-05, + "loss": 0.4585, + "step": 5248 + }, + { + "epoch": 0.3, + "grad_norm": 0.3256721748852568, + "learning_rate": 1.637561629925361e-05, + "loss": 0.2384, + "step": 5249 + }, + { + "epoch": 0.3, + "grad_norm": 0.850566415646217, + "learning_rate": 1.637418254299819e-05, + "loss": 0.5317, + "step": 5250 + }, + { + "epoch": 0.3, + "grad_norm": 0.4374892514913494, + "learning_rate": 1.63727485660051e-05, + "loss": 0.3344, + "step": 5251 + }, + { + "epoch": 0.3, + "grad_norm": 0.24873000470380066, + "learning_rate": 1.6371314368324002e-05, + "loss": 0.136, + "step": 5252 + }, + { + "epoch": 0.3, + "grad_norm": 0.3694156067809314, + "learning_rate": 1.6369879950004564e-05, + "loss": 0.3051, + "step": 5253 + }, + { + "epoch": 0.3, + "grad_norm": 0.8041414855390386, + "learning_rate": 1.6368445311096452e-05, + "loss": 0.4433, + "step": 5254 + }, + { + "epoch": 0.3, + "grad_norm": 0.5618816209689073, + "learning_rate": 1.6367010451649357e-05, + "loss": 0.3546, + "step": 5255 + }, + { + "epoch": 0.3, + "grad_norm": 0.39348874710374027, + "learning_rate": 1.636557537171296e-05, + "loss": 0.2843, + "step": 5256 + }, + { + "epoch": 0.3, + "grad_norm": 0.3829165507517021, + "learning_rate": 1.6364140071336967e-05, + "loss": 0.3449, + "step": 5257 + }, + { + "epoch": 0.3, + "grad_norm": 0.3896253368807964, + "learning_rate": 1.6362704550571073e-05, + "loss": 0.2402, + "step": 5258 + }, + { + "epoch": 0.3, + "grad_norm": 0.2660992170071291, + "learning_rate": 1.6361268809464998e-05, + "loss": 0.2279, + "step": 5259 + }, + { + "epoch": 0.3, + "grad_norm": 0.7432002175469005, + "learning_rate": 1.6359832848068455e-05, + "loss": 0.4667, + "step": 5260 + }, + { + "epoch": 0.3, + "grad_norm": 0.33013913089948044, + "learning_rate": 1.6358396666431176e-05, + "loss": 0.269, + "step": 5261 + }, + { + "epoch": 0.3, + "grad_norm": 0.39504619567062776, + "learning_rate": 1.635696026460289e-05, + "loss": 0.2618, + "step": 5262 + }, + { + "epoch": 0.3, + "grad_norm": 1.2763020879980969, + "learning_rate": 1.6355523642633346e-05, + "loss": 0.832, + "step": 5263 + }, + { + "epoch": 0.3, + "grad_norm": 0.23495958054537083, + "learning_rate": 1.6354086800572287e-05, + "loss": 0.1672, + "step": 5264 + }, + { + "epoch": 0.3, + "grad_norm": 0.28905131340359264, + "learning_rate": 1.635264973846948e-05, + "loss": 0.2705, + "step": 5265 + }, + { + "epoch": 0.3, + "grad_norm": 0.7167395323464834, + "learning_rate": 1.6351212456374684e-05, + "loss": 0.4445, + "step": 5266 + }, + { + "epoch": 0.3, + "grad_norm": 0.4054608197750515, + "learning_rate": 1.6349774954337676e-05, + "loss": 0.3239, + "step": 5267 + }, + { + "epoch": 0.3, + "grad_norm": 0.5343281535109453, + "learning_rate": 1.6348337232408235e-05, + "loss": 0.3897, + "step": 5268 + }, + { + "epoch": 0.3, + "grad_norm": 0.3590528787248728, + "learning_rate": 1.6346899290636145e-05, + "loss": 0.288, + "step": 5269 + }, + { + "epoch": 0.3, + "grad_norm": 0.3249108619545788, + "learning_rate": 1.6345461129071207e-05, + "loss": 0.2352, + "step": 5270 + }, + { + "epoch": 0.3, + "grad_norm": 0.5694464956960685, + "learning_rate": 1.6344022747763225e-05, + "loss": 0.3867, + "step": 5271 + }, + { + "epoch": 0.3, + "grad_norm": 0.3305233715465694, + "learning_rate": 1.6342584146762005e-05, + "loss": 0.2424, + "step": 5272 + }, + { + "epoch": 0.3, + "grad_norm": 0.5162607918437077, + "learning_rate": 1.634114532611737e-05, + "loss": 0.3408, + "step": 5273 + }, + { + "epoch": 0.3, + "grad_norm": 0.40391491962918796, + "learning_rate": 1.6339706285879144e-05, + "loss": 0.3143, + "step": 5274 + }, + { + "epoch": 0.3, + "grad_norm": 0.9206819584778668, + "learning_rate": 1.6338267026097162e-05, + "loss": 0.5015, + "step": 5275 + }, + { + "epoch": 0.3, + "grad_norm": 0.4979518942637146, + "learning_rate": 1.633682754682127e-05, + "loss": 0.3239, + "step": 5276 + }, + { + "epoch": 0.3, + "grad_norm": 0.3491657958787403, + "learning_rate": 1.6335387848101307e-05, + "loss": 0.3165, + "step": 5277 + }, + { + "epoch": 0.3, + "grad_norm": 0.28912270831094544, + "learning_rate": 1.6333947929987137e-05, + "loss": 0.177, + "step": 5278 + }, + { + "epoch": 0.3, + "grad_norm": 0.5085041190375486, + "learning_rate": 1.6332507792528626e-05, + "loss": 0.3688, + "step": 5279 + }, + { + "epoch": 0.3, + "grad_norm": 0.5023828501731868, + "learning_rate": 1.633106743577564e-05, + "loss": 0.3515, + "step": 5280 + }, + { + "epoch": 0.3, + "grad_norm": 0.4313172948281927, + "learning_rate": 1.6329626859778057e-05, + "loss": 0.3235, + "step": 5281 + }, + { + "epoch": 0.3, + "grad_norm": 0.4878188145202696, + "learning_rate": 1.632818606458577e-05, + "loss": 0.2731, + "step": 5282 + }, + { + "epoch": 0.3, + "grad_norm": 0.39398917324063887, + "learning_rate": 1.6326745050248675e-05, + "loss": 0.3514, + "step": 5283 + }, + { + "epoch": 0.3, + "grad_norm": 0.3186584290148543, + "learning_rate": 1.632530381681667e-05, + "loss": 0.1892, + "step": 5284 + }, + { + "epoch": 0.3, + "grad_norm": 0.2990432265997818, + "learning_rate": 1.6323862364339663e-05, + "loss": 0.2251, + "step": 5285 + }, + { + "epoch": 0.3, + "grad_norm": 0.5905438488593204, + "learning_rate": 1.6322420692867577e-05, + "loss": 0.3983, + "step": 5286 + }, + { + "epoch": 0.3, + "grad_norm": 0.7228437691044046, + "learning_rate": 1.632097880245033e-05, + "loss": 0.585, + "step": 5287 + }, + { + "epoch": 0.3, + "grad_norm": 0.3268212611120316, + "learning_rate": 1.6319536693137862e-05, + "loss": 0.2669, + "step": 5288 + }, + { + "epoch": 0.3, + "grad_norm": 0.40593698432909653, + "learning_rate": 1.631809436498011e-05, + "loss": 0.3245, + "step": 5289 + }, + { + "epoch": 0.3, + "grad_norm": 0.2983396151975038, + "learning_rate": 1.6316651818027024e-05, + "loss": 0.1933, + "step": 5290 + }, + { + "epoch": 0.3, + "grad_norm": 0.47509655229133435, + "learning_rate": 1.6315209052328554e-05, + "loss": 0.1699, + "step": 5291 + }, + { + "epoch": 0.3, + "grad_norm": 0.46728137226170324, + "learning_rate": 1.6313766067934668e-05, + "loss": 0.3261, + "step": 5292 + }, + { + "epoch": 0.3, + "grad_norm": 0.3831760523182167, + "learning_rate": 1.6312322864895334e-05, + "loss": 0.3321, + "step": 5293 + }, + { + "epoch": 0.3, + "grad_norm": 0.70358765871688, + "learning_rate": 1.631087944326053e-05, + "loss": 0.4898, + "step": 5294 + }, + { + "epoch": 0.3, + "grad_norm": 0.33376431508160115, + "learning_rate": 1.6309435803080244e-05, + "loss": 0.2051, + "step": 5295 + }, + { + "epoch": 0.3, + "grad_norm": 0.2822733219097903, + "learning_rate": 1.6307991944404466e-05, + "loss": 0.2165, + "step": 5296 + }, + { + "epoch": 0.3, + "grad_norm": 0.4344437630599208, + "learning_rate": 1.6306547867283197e-05, + "loss": 0.3403, + "step": 5297 + }, + { + "epoch": 0.3, + "grad_norm": 0.3531370933554759, + "learning_rate": 1.630510357176645e-05, + "loss": 0.2542, + "step": 5298 + }, + { + "epoch": 0.3, + "grad_norm": 0.8157810100988, + "learning_rate": 1.6303659057904232e-05, + "loss": 0.4894, + "step": 5299 + }, + { + "epoch": 0.3, + "grad_norm": 0.33867142000505207, + "learning_rate": 1.6302214325746577e-05, + "loss": 0.336, + "step": 5300 + }, + { + "epoch": 0.3, + "grad_norm": 0.3516250686888519, + "learning_rate": 1.6300769375343508e-05, + "loss": 0.2335, + "step": 5301 + }, + { + "epoch": 0.3, + "grad_norm": 0.4404752724035265, + "learning_rate": 1.6299324206745066e-05, + "loss": 0.2584, + "step": 5302 + }, + { + "epoch": 0.3, + "grad_norm": 0.3540326459594793, + "learning_rate": 1.6297878820001302e-05, + "loss": 0.2618, + "step": 5303 + }, + { + "epoch": 0.3, + "grad_norm": 0.3477074044921296, + "learning_rate": 1.6296433215162258e-05, + "loss": 0.2255, + "step": 5304 + }, + { + "epoch": 0.3, + "grad_norm": 0.444796893845932, + "learning_rate": 1.629498739227801e-05, + "loss": 0.3526, + "step": 5305 + }, + { + "epoch": 0.3, + "grad_norm": 0.5521271885013249, + "learning_rate": 1.6293541351398616e-05, + "loss": 0.3678, + "step": 5306 + }, + { + "epoch": 0.3, + "grad_norm": 0.43164109413325197, + "learning_rate": 1.6292095092574154e-05, + "loss": 0.3293, + "step": 5307 + }, + { + "epoch": 0.3, + "grad_norm": 0.37843319285317356, + "learning_rate": 1.6290648615854712e-05, + "loss": 0.249, + "step": 5308 + }, + { + "epoch": 0.31, + "grad_norm": 0.25288135816628304, + "learning_rate": 1.6289201921290377e-05, + "loss": 0.1774, + "step": 5309 + }, + { + "epoch": 0.31, + "grad_norm": 0.335350277217877, + "learning_rate": 1.6287755008931255e-05, + "loss": 0.2844, + "step": 5310 + }, + { + "epoch": 0.31, + "grad_norm": 0.7224538732310122, + "learning_rate": 1.6286307878827443e-05, + "loss": 0.4002, + "step": 5311 + }, + { + "epoch": 0.31, + "grad_norm": 0.4249497381815193, + "learning_rate": 1.6284860531029062e-05, + "loss": 0.342, + "step": 5312 + }, + { + "epoch": 0.31, + "grad_norm": 0.37589361249803893, + "learning_rate": 1.6283412965586227e-05, + "loss": 0.2856, + "step": 5313 + }, + { + "epoch": 0.31, + "grad_norm": 0.32777900712701974, + "learning_rate": 1.6281965182549077e-05, + "loss": 0.1539, + "step": 5314 + }, + { + "epoch": 0.31, + "grad_norm": 0.46467288740435403, + "learning_rate": 1.6280517181967733e-05, + "loss": 0.3337, + "step": 5315 + }, + { + "epoch": 0.31, + "grad_norm": 0.302574483899151, + "learning_rate": 1.6279068963892358e-05, + "loss": 0.2726, + "step": 5316 + }, + { + "epoch": 0.31, + "grad_norm": 0.5464579445728996, + "learning_rate": 1.6277620528373094e-05, + "loss": 0.3922, + "step": 5317 + }, + { + "epoch": 0.31, + "grad_norm": 0.6586890435544829, + "learning_rate": 1.6276171875460097e-05, + "loss": 0.3371, + "step": 5318 + }, + { + "epoch": 0.31, + "grad_norm": 0.4057018760179316, + "learning_rate": 1.6274723005203542e-05, + "loss": 0.2928, + "step": 5319 + }, + { + "epoch": 0.31, + "grad_norm": 0.5354467630028948, + "learning_rate": 1.6273273917653596e-05, + "loss": 0.3619, + "step": 5320 + }, + { + "epoch": 0.31, + "grad_norm": 0.24525178033362732, + "learning_rate": 1.6271824612860445e-05, + "loss": 0.1796, + "step": 5321 + }, + { + "epoch": 0.31, + "grad_norm": 0.3806740733460614, + "learning_rate": 1.6270375090874276e-05, + "loss": 0.2955, + "step": 5322 + }, + { + "epoch": 0.31, + "grad_norm": 1.016911246302538, + "learning_rate": 1.626892535174529e-05, + "loss": 0.4905, + "step": 5323 + }, + { + "epoch": 0.31, + "grad_norm": 0.39313652227601653, + "learning_rate": 1.6267475395523686e-05, + "loss": 0.265, + "step": 5324 + }, + { + "epoch": 0.31, + "grad_norm": 0.42444131470135105, + "learning_rate": 1.626602522225968e-05, + "loss": 0.31, + "step": 5325 + }, + { + "epoch": 0.31, + "grad_norm": 1.1439901110836537, + "learning_rate": 1.6264574832003492e-05, + "loss": 0.704, + "step": 5326 + }, + { + "epoch": 0.31, + "grad_norm": 0.3857249796155631, + "learning_rate": 1.6263124224805345e-05, + "loss": 0.1889, + "step": 5327 + }, + { + "epoch": 0.31, + "grad_norm": 0.3412589948287796, + "learning_rate": 1.6261673400715475e-05, + "loss": 0.2426, + "step": 5328 + }, + { + "epoch": 0.31, + "grad_norm": 0.5851866759927743, + "learning_rate": 1.6260222359784123e-05, + "loss": 0.4064, + "step": 5329 + }, + { + "epoch": 0.31, + "grad_norm": 1.3336728934135882, + "learning_rate": 1.6258771102061543e-05, + "loss": 0.8571, + "step": 5330 + }, + { + "epoch": 0.31, + "grad_norm": 0.3092169640199647, + "learning_rate": 1.6257319627597986e-05, + "loss": 0.2259, + "step": 5331 + }, + { + "epoch": 0.31, + "grad_norm": 0.4063809902015279, + "learning_rate": 1.6255867936443724e-05, + "loss": 0.3177, + "step": 5332 + }, + { + "epoch": 0.31, + "grad_norm": 0.7712406027659143, + "learning_rate": 1.625441602864902e-05, + "loss": 0.485, + "step": 5333 + }, + { + "epoch": 0.31, + "grad_norm": 0.3238377010953509, + "learning_rate": 1.625296390426416e-05, + "loss": 0.2591, + "step": 5334 + }, + { + "epoch": 0.31, + "grad_norm": 0.3062860851213545, + "learning_rate": 1.6251511563339426e-05, + "loss": 0.1903, + "step": 5335 + }, + { + "epoch": 0.31, + "grad_norm": 0.42276890338671497, + "learning_rate": 1.6250059005925117e-05, + "loss": 0.3612, + "step": 5336 + }, + { + "epoch": 0.31, + "grad_norm": 0.3259165375372027, + "learning_rate": 1.6248606232071536e-05, + "loss": 0.2178, + "step": 5337 + }, + { + "epoch": 0.31, + "grad_norm": 1.165783767802092, + "learning_rate": 1.6247153241828985e-05, + "loss": 0.6111, + "step": 5338 + }, + { + "epoch": 0.31, + "grad_norm": 0.71001471508266, + "learning_rate": 1.624570003524779e-05, + "loss": 0.3921, + "step": 5339 + }, + { + "epoch": 0.31, + "grad_norm": 0.3474997200766379, + "learning_rate": 1.624424661237827e-05, + "loss": 0.2082, + "step": 5340 + }, + { + "epoch": 0.31, + "grad_norm": 0.3418873789831343, + "learning_rate": 1.6242792973270758e-05, + "loss": 0.2739, + "step": 5341 + }, + { + "epoch": 0.31, + "grad_norm": 0.4231357230178703, + "learning_rate": 1.6241339117975596e-05, + "loss": 0.2998, + "step": 5342 + }, + { + "epoch": 0.31, + "grad_norm": 0.4355566229176757, + "learning_rate": 1.6239885046543125e-05, + "loss": 0.2823, + "step": 5343 + }, + { + "epoch": 0.31, + "grad_norm": 0.35899156253158837, + "learning_rate": 1.6238430759023706e-05, + "loss": 0.27, + "step": 5344 + }, + { + "epoch": 0.31, + "grad_norm": 0.8135718113775179, + "learning_rate": 1.6236976255467697e-05, + "loss": 0.4483, + "step": 5345 + }, + { + "epoch": 0.31, + "grad_norm": 0.4186232071036985, + "learning_rate": 1.623552153592547e-05, + "loss": 0.3089, + "step": 5346 + }, + { + "epoch": 0.31, + "grad_norm": 0.26769754516544547, + "learning_rate": 1.6234066600447397e-05, + "loss": 0.1907, + "step": 5347 + }, + { + "epoch": 0.31, + "grad_norm": 0.37312629158031374, + "learning_rate": 1.6232611449083866e-05, + "loss": 0.3283, + "step": 5348 + }, + { + "epoch": 0.31, + "grad_norm": 0.3136542119516781, + "learning_rate": 1.623115608188527e-05, + "loss": 0.2588, + "step": 5349 + }, + { + "epoch": 0.31, + "grad_norm": 1.0894221985880963, + "learning_rate": 1.6229700498902008e-05, + "loss": 0.3676, + "step": 5350 + }, + { + "epoch": 0.31, + "grad_norm": 0.7124718935045236, + "learning_rate": 1.6228244700184484e-05, + "loss": 0.5309, + "step": 5351 + }, + { + "epoch": 0.31, + "grad_norm": 0.3407741653174107, + "learning_rate": 1.622678868578311e-05, + "loss": 0.2742, + "step": 5352 + }, + { + "epoch": 0.31, + "grad_norm": 0.47774823506024416, + "learning_rate": 1.622533245574832e-05, + "loss": 0.2726, + "step": 5353 + }, + { + "epoch": 0.31, + "grad_norm": 0.33453155351886066, + "learning_rate": 1.622387601013053e-05, + "loss": 0.2116, + "step": 5354 + }, + { + "epoch": 0.31, + "grad_norm": 0.3884129554118678, + "learning_rate": 1.622241934898018e-05, + "loss": 0.2891, + "step": 5355 + }, + { + "epoch": 0.31, + "grad_norm": 0.5698439071171101, + "learning_rate": 1.622096247234772e-05, + "loss": 0.3661, + "step": 5356 + }, + { + "epoch": 0.31, + "grad_norm": 1.6227867700244734, + "learning_rate": 1.6219505380283593e-05, + "loss": 0.4115, + "step": 5357 + }, + { + "epoch": 0.31, + "grad_norm": 0.35548790471839764, + "learning_rate": 1.6218048072838265e-05, + "loss": 0.2962, + "step": 5358 + }, + { + "epoch": 0.31, + "grad_norm": 1.3251708651923235, + "learning_rate": 1.62165905500622e-05, + "loss": 0.7052, + "step": 5359 + }, + { + "epoch": 0.31, + "grad_norm": 0.29768315161623826, + "learning_rate": 1.621513281200587e-05, + "loss": 0.2041, + "step": 5360 + }, + { + "epoch": 0.31, + "grad_norm": 0.432601921603546, + "learning_rate": 1.6213674858719758e-05, + "loss": 0.3035, + "step": 5361 + }, + { + "epoch": 0.31, + "grad_norm": 1.1288404236236713, + "learning_rate": 1.6212216690254353e-05, + "loss": 0.4222, + "step": 5362 + }, + { + "epoch": 0.31, + "grad_norm": 0.8101163644596073, + "learning_rate": 1.621075830666015e-05, + "loss": 0.3231, + "step": 5363 + }, + { + "epoch": 0.31, + "grad_norm": 0.3462029633898307, + "learning_rate": 1.6209299707987656e-05, + "loss": 0.2848, + "step": 5364 + }, + { + "epoch": 0.31, + "grad_norm": 0.5494931342333284, + "learning_rate": 1.6207840894287377e-05, + "loss": 0.42, + "step": 5365 + }, + { + "epoch": 0.31, + "grad_norm": 0.31147969450902874, + "learning_rate": 1.6206381865609836e-05, + "loss": 0.1297, + "step": 5366 + }, + { + "epoch": 0.31, + "grad_norm": 0.42917601362153945, + "learning_rate": 1.620492262200556e-05, + "loss": 0.3263, + "step": 5367 + }, + { + "epoch": 0.31, + "grad_norm": 0.43424402968714376, + "learning_rate": 1.620346316352508e-05, + "loss": 0.3371, + "step": 5368 + }, + { + "epoch": 0.31, + "grad_norm": 0.43098949036670525, + "learning_rate": 1.6202003490218932e-05, + "loss": 0.2333, + "step": 5369 + }, + { + "epoch": 0.31, + "grad_norm": 0.4145635482591804, + "learning_rate": 1.6200543602137676e-05, + "loss": 0.2221, + "step": 5370 + }, + { + "epoch": 0.31, + "grad_norm": 1.537768224298209, + "learning_rate": 1.619908349933186e-05, + "loss": 0.851, + "step": 5371 + }, + { + "epoch": 0.31, + "grad_norm": 0.513208048312179, + "learning_rate": 1.619762318185205e-05, + "loss": 0.3555, + "step": 5372 + }, + { + "epoch": 0.31, + "grad_norm": 0.32926945386236683, + "learning_rate": 1.6196162649748815e-05, + "loss": 0.255, + "step": 5373 + }, + { + "epoch": 0.31, + "grad_norm": 0.6119467962093083, + "learning_rate": 1.6194701903072734e-05, + "loss": 0.4787, + "step": 5374 + }, + { + "epoch": 0.31, + "grad_norm": 0.2881545608075555, + "learning_rate": 1.619324094187439e-05, + "loss": 0.2352, + "step": 5375 + }, + { + "epoch": 0.31, + "grad_norm": 0.3695005519980028, + "learning_rate": 1.619177976620438e-05, + "loss": 0.2304, + "step": 5376 + }, + { + "epoch": 0.31, + "grad_norm": 0.6251592955848971, + "learning_rate": 1.6190318376113307e-05, + "loss": 0.4482, + "step": 5377 + }, + { + "epoch": 0.31, + "grad_norm": 0.7395901583447987, + "learning_rate": 1.618885677165177e-05, + "loss": 0.4733, + "step": 5378 + }, + { + "epoch": 0.31, + "grad_norm": 0.39642035979972523, + "learning_rate": 1.6187394952870392e-05, + "loss": 0.2308, + "step": 5379 + }, + { + "epoch": 0.31, + "grad_norm": 0.3336838668804227, + "learning_rate": 1.618593291981979e-05, + "loss": 0.305, + "step": 5380 + }, + { + "epoch": 0.31, + "grad_norm": 0.28950738150393335, + "learning_rate": 1.61844706725506e-05, + "loss": 0.1669, + "step": 5381 + }, + { + "epoch": 0.31, + "grad_norm": 0.5750376618606018, + "learning_rate": 1.6183008211113454e-05, + "loss": 0.3297, + "step": 5382 + }, + { + "epoch": 0.31, + "grad_norm": 0.3405128455992142, + "learning_rate": 1.6181545535559e-05, + "loss": 0.277, + "step": 5383 + }, + { + "epoch": 0.31, + "grad_norm": 0.5570942141284251, + "learning_rate": 1.6180082645937888e-05, + "loss": 0.3617, + "step": 5384 + }, + { + "epoch": 0.31, + "grad_norm": 0.6443057721616354, + "learning_rate": 1.6178619542300783e-05, + "loss": 0.3859, + "step": 5385 + }, + { + "epoch": 0.31, + "grad_norm": 0.3067421355198817, + "learning_rate": 1.617715622469835e-05, + "loss": 0.2076, + "step": 5386 + }, + { + "epoch": 0.31, + "grad_norm": 0.341538422997227, + "learning_rate": 1.617569269318126e-05, + "loss": 0.2534, + "step": 5387 + }, + { + "epoch": 0.31, + "grad_norm": 0.3996699166427557, + "learning_rate": 1.61742289478002e-05, + "loss": 0.2901, + "step": 5388 + }, + { + "epoch": 0.31, + "grad_norm": 0.35294345317967296, + "learning_rate": 1.6172764988605855e-05, + "loss": 0.2503, + "step": 5389 + }, + { + "epoch": 0.31, + "grad_norm": 0.679404700705096, + "learning_rate": 1.6171300815648922e-05, + "loss": 0.4566, + "step": 5390 + }, + { + "epoch": 0.31, + "grad_norm": 0.4050109468817518, + "learning_rate": 1.6169836428980108e-05, + "loss": 0.3135, + "step": 5391 + }, + { + "epoch": 0.31, + "grad_norm": 0.33445459512149867, + "learning_rate": 1.6168371828650123e-05, + "loss": 0.2532, + "step": 5392 + }, + { + "epoch": 0.31, + "grad_norm": 0.29912038151863696, + "learning_rate": 1.616690701470969e-05, + "loss": 0.2024, + "step": 5393 + }, + { + "epoch": 0.31, + "grad_norm": 0.4029994690890542, + "learning_rate": 1.6165441987209532e-05, + "loss": 0.2963, + "step": 5394 + }, + { + "epoch": 0.31, + "grad_norm": 0.5210104635097429, + "learning_rate": 1.6163976746200384e-05, + "loss": 0.3754, + "step": 5395 + }, + { + "epoch": 0.31, + "grad_norm": 0.41946498662119747, + "learning_rate": 1.6162511291732984e-05, + "loss": 0.3246, + "step": 5396 + }, + { + "epoch": 0.31, + "grad_norm": 0.8088520261509311, + "learning_rate": 1.616104562385808e-05, + "loss": 0.4104, + "step": 5397 + }, + { + "epoch": 0.31, + "grad_norm": 0.41364569032392223, + "learning_rate": 1.615957974262644e-05, + "loss": 0.3169, + "step": 5398 + }, + { + "epoch": 0.31, + "grad_norm": 0.25882567121199573, + "learning_rate": 1.615811364808881e-05, + "loss": 0.1889, + "step": 5399 + }, + { + "epoch": 0.31, + "grad_norm": 0.5591145105552626, + "learning_rate": 1.6156647340295973e-05, + "loss": 0.3205, + "step": 5400 + }, + { + "epoch": 0.31, + "grad_norm": 0.39999487861467214, + "learning_rate": 1.6155180819298703e-05, + "loss": 0.3648, + "step": 5401 + }, + { + "epoch": 0.31, + "grad_norm": 0.669508277334371, + "learning_rate": 1.6153714085147783e-05, + "loss": 0.3816, + "step": 5402 + }, + { + "epoch": 0.31, + "grad_norm": 0.397360661949208, + "learning_rate": 1.6152247137894012e-05, + "loss": 0.2861, + "step": 5403 + }, + { + "epoch": 0.31, + "grad_norm": 0.3426434244744426, + "learning_rate": 1.6150779977588186e-05, + "loss": 0.2873, + "step": 5404 + }, + { + "epoch": 0.31, + "grad_norm": 0.40844847649718674, + "learning_rate": 1.6149312604281115e-05, + "loss": 0.2428, + "step": 5405 + }, + { + "epoch": 0.31, + "grad_norm": 0.332255987493901, + "learning_rate": 1.6147845018023612e-05, + "loss": 0.1821, + "step": 5406 + }, + { + "epoch": 0.31, + "grad_norm": 0.35299100766743446, + "learning_rate": 1.61463772188665e-05, + "loss": 0.3115, + "step": 5407 + }, + { + "epoch": 0.31, + "grad_norm": 0.46449691352497247, + "learning_rate": 1.6144909206860607e-05, + "loss": 0.3955, + "step": 5408 + }, + { + "epoch": 0.31, + "grad_norm": 0.3484300824139732, + "learning_rate": 1.6143440982056777e-05, + "loss": 0.2251, + "step": 5409 + }, + { + "epoch": 0.31, + "grad_norm": 0.6109240447729549, + "learning_rate": 1.614197254450585e-05, + "loss": 0.378, + "step": 5410 + }, + { + "epoch": 0.31, + "grad_norm": 0.29480715347424497, + "learning_rate": 1.6140503894258674e-05, + "loss": 0.2724, + "step": 5411 + }, + { + "epoch": 0.31, + "grad_norm": 0.3103918092644329, + "learning_rate": 1.6139035031366116e-05, + "loss": 0.169, + "step": 5412 + }, + { + "epoch": 0.31, + "grad_norm": 0.3729779854233855, + "learning_rate": 1.6137565955879036e-05, + "loss": 0.3555, + "step": 5413 + }, + { + "epoch": 0.31, + "grad_norm": 0.7389313540620621, + "learning_rate": 1.6136096667848313e-05, + "loss": 0.5823, + "step": 5414 + }, + { + "epoch": 0.31, + "grad_norm": 0.3544691682936791, + "learning_rate": 1.6134627167324827e-05, + "loss": 0.2543, + "step": 5415 + }, + { + "epoch": 0.31, + "grad_norm": 0.3960851188225142, + "learning_rate": 1.613315745435946e-05, + "loss": 0.2833, + "step": 5416 + }, + { + "epoch": 0.31, + "grad_norm": 0.2765012263636344, + "learning_rate": 1.613168752900312e-05, + "loss": 0.1917, + "step": 5417 + }, + { + "epoch": 0.31, + "grad_norm": 0.6375209628011956, + "learning_rate": 1.61302173913067e-05, + "loss": 0.4634, + "step": 5418 + }, + { + "epoch": 0.31, + "grad_norm": 0.2715428312700595, + "learning_rate": 1.612874704132112e-05, + "loss": 0.2351, + "step": 5419 + }, + { + "epoch": 0.31, + "grad_norm": 0.7184165179152175, + "learning_rate": 1.612727647909729e-05, + "loss": 0.5226, + "step": 5420 + }, + { + "epoch": 0.31, + "grad_norm": 0.7367803499422515, + "learning_rate": 1.612580570468614e-05, + "loss": 0.481, + "step": 5421 + }, + { + "epoch": 0.31, + "grad_norm": 0.40185485568900076, + "learning_rate": 1.6124334718138602e-05, + "loss": 0.2303, + "step": 5422 + }, + { + "epoch": 0.31, + "grad_norm": 0.4198601147325941, + "learning_rate": 1.6122863519505618e-05, + "loss": 0.3464, + "step": 5423 + }, + { + "epoch": 0.31, + "grad_norm": 0.4956845797294979, + "learning_rate": 1.6121392108838132e-05, + "loss": 0.3011, + "step": 5424 + }, + { + "epoch": 0.31, + "grad_norm": 0.26916066161630525, + "learning_rate": 1.6119920486187102e-05, + "loss": 0.2054, + "step": 5425 + }, + { + "epoch": 0.31, + "grad_norm": 1.1799061771549846, + "learning_rate": 1.611844865160349e-05, + "loss": 0.9037, + "step": 5426 + }, + { + "epoch": 0.31, + "grad_norm": 0.34547055929760423, + "learning_rate": 1.611697660513826e-05, + "loss": 0.3116, + "step": 5427 + }, + { + "epoch": 0.31, + "grad_norm": 0.3723557442890717, + "learning_rate": 1.6115504346842393e-05, + "loss": 0.215, + "step": 5428 + }, + { + "epoch": 0.31, + "grad_norm": 0.7354679497674211, + "learning_rate": 1.6114031876766877e-05, + "loss": 0.4834, + "step": 5429 + }, + { + "epoch": 0.31, + "grad_norm": 0.5397253363814973, + "learning_rate": 1.61125591949627e-05, + "loss": 0.385, + "step": 5430 + }, + { + "epoch": 0.31, + "grad_norm": 0.31820906837570323, + "learning_rate": 1.611108630148086e-05, + "loss": 0.2479, + "step": 5431 + }, + { + "epoch": 0.31, + "grad_norm": 0.43775281349809764, + "learning_rate": 1.610961319637236e-05, + "loss": 0.3013, + "step": 5432 + }, + { + "epoch": 0.31, + "grad_norm": 0.48523621853194515, + "learning_rate": 1.610813987968822e-05, + "loss": 0.3288, + "step": 5433 + }, + { + "epoch": 0.31, + "grad_norm": 0.3286365653300218, + "learning_rate": 1.6106666351479462e-05, + "loss": 0.2691, + "step": 5434 + }, + { + "epoch": 0.31, + "grad_norm": 0.4073669030943272, + "learning_rate": 1.610519261179711e-05, + "loss": 0.2907, + "step": 5435 + }, + { + "epoch": 0.31, + "grad_norm": 0.86365452708418, + "learning_rate": 1.61037186606922e-05, + "loss": 0.5174, + "step": 5436 + }, + { + "epoch": 0.31, + "grad_norm": 0.34139315066637166, + "learning_rate": 1.610224449821577e-05, + "loss": 0.2648, + "step": 5437 + }, + { + "epoch": 0.31, + "grad_norm": 0.781900670758003, + "learning_rate": 1.6100770124418882e-05, + "loss": 0.5627, + "step": 5438 + }, + { + "epoch": 0.31, + "grad_norm": 0.2584851055369354, + "learning_rate": 1.6099295539352583e-05, + "loss": 0.2251, + "step": 5439 + }, + { + "epoch": 0.31, + "grad_norm": 0.32668801819835613, + "learning_rate": 1.6097820743067945e-05, + "loss": 0.2867, + "step": 5440 + }, + { + "epoch": 0.31, + "grad_norm": 0.8179062477321604, + "learning_rate": 1.6096345735616036e-05, + "loss": 0.3766, + "step": 5441 + }, + { + "epoch": 0.31, + "grad_norm": 0.5923473411548568, + "learning_rate": 1.6094870517047937e-05, + "loss": 0.4198, + "step": 5442 + }, + { + "epoch": 0.31, + "grad_norm": 0.31872599364875503, + "learning_rate": 1.609339508741473e-05, + "loss": 0.2893, + "step": 5443 + }, + { + "epoch": 0.31, + "grad_norm": 1.379638334873006, + "learning_rate": 1.6091919446767517e-05, + "loss": 0.8334, + "step": 5444 + }, + { + "epoch": 0.31, + "grad_norm": 0.2200588702556372, + "learning_rate": 1.6090443595157396e-05, + "loss": 0.0865, + "step": 5445 + }, + { + "epoch": 0.31, + "grad_norm": 0.3663693385434264, + "learning_rate": 1.608896753263547e-05, + "loss": 0.3005, + "step": 5446 + }, + { + "epoch": 0.31, + "grad_norm": 0.4523250755868021, + "learning_rate": 1.6087491259252865e-05, + "loss": 0.3534, + "step": 5447 + }, + { + "epoch": 0.31, + "grad_norm": 0.6353127882984126, + "learning_rate": 1.60860147750607e-05, + "loss": 0.316, + "step": 5448 + }, + { + "epoch": 0.31, + "grad_norm": 0.4157340148684122, + "learning_rate": 1.60845380801101e-05, + "loss": 0.316, + "step": 5449 + }, + { + "epoch": 0.31, + "grad_norm": 1.2008936520366085, + "learning_rate": 1.6083061174452214e-05, + "loss": 0.8211, + "step": 5450 + }, + { + "epoch": 0.31, + "grad_norm": 0.22059778641398187, + "learning_rate": 1.6081584058138178e-05, + "loss": 0.1828, + "step": 5451 + }, + { + "epoch": 0.31, + "grad_norm": 0.38232047263168106, + "learning_rate": 1.6080106731219147e-05, + "loss": 0.2847, + "step": 5452 + }, + { + "epoch": 0.31, + "grad_norm": 0.7296256845720059, + "learning_rate": 1.6078629193746283e-05, + "loss": 0.4682, + "step": 5453 + }, + { + "epoch": 0.31, + "grad_norm": 0.49677717222315976, + "learning_rate": 1.607715144577075e-05, + "loss": 0.2671, + "step": 5454 + }, + { + "epoch": 0.31, + "grad_norm": 0.3413141139616929, + "learning_rate": 1.6075673487343725e-05, + "loss": 0.2721, + "step": 5455 + }, + { + "epoch": 0.31, + "grad_norm": 1.2043097896012789, + "learning_rate": 1.6074195318516385e-05, + "loss": 0.8261, + "step": 5456 + }, + { + "epoch": 0.31, + "grad_norm": 0.3009263002823461, + "learning_rate": 1.6072716939339924e-05, + "loss": 0.227, + "step": 5457 + }, + { + "epoch": 0.31, + "grad_norm": 0.40159765325055435, + "learning_rate": 1.607123834986554e-05, + "loss": 0.2515, + "step": 5458 + }, + { + "epoch": 0.31, + "grad_norm": 0.4338965338706003, + "learning_rate": 1.606975955014443e-05, + "loss": 0.3357, + "step": 5459 + }, + { + "epoch": 0.31, + "grad_norm": 1.1490956748629666, + "learning_rate": 1.6068280540227807e-05, + "loss": 0.6926, + "step": 5460 + }, + { + "epoch": 0.31, + "grad_norm": 0.36110726958810213, + "learning_rate": 1.6066801320166897e-05, + "loss": 0.2308, + "step": 5461 + }, + { + "epoch": 0.31, + "grad_norm": 0.998317680190995, + "learning_rate": 1.606532189001291e-05, + "loss": 0.6979, + "step": 5462 + }, + { + "epoch": 0.31, + "grad_norm": 0.32793332659947005, + "learning_rate": 1.606384224981709e-05, + "loss": 0.3374, + "step": 5463 + }, + { + "epoch": 0.31, + "grad_norm": 0.3159672370415879, + "learning_rate": 1.6062362399630673e-05, + "loss": 0.2052, + "step": 5464 + }, + { + "epoch": 0.31, + "grad_norm": 0.28198983815444506, + "learning_rate": 1.6060882339504905e-05, + "loss": 0.2491, + "step": 5465 + }, + { + "epoch": 0.31, + "grad_norm": 0.3914189720919406, + "learning_rate": 1.6059402069491047e-05, + "loss": 0.3358, + "step": 5466 + }, + { + "epoch": 0.31, + "grad_norm": 0.31173129053212056, + "learning_rate": 1.605792158964035e-05, + "loss": 0.2001, + "step": 5467 + }, + { + "epoch": 0.31, + "grad_norm": 1.0947564593865875, + "learning_rate": 1.6056440900004094e-05, + "loss": 0.7421, + "step": 5468 + }, + { + "epoch": 0.31, + "grad_norm": 0.5952027400139973, + "learning_rate": 1.6054960000633545e-05, + "loss": 0.4776, + "step": 5469 + }, + { + "epoch": 0.31, + "grad_norm": 0.420934632151069, + "learning_rate": 1.6053478891579993e-05, + "loss": 0.3136, + "step": 5470 + }, + { + "epoch": 0.31, + "grad_norm": 0.24668825200323996, + "learning_rate": 1.605199757289473e-05, + "loss": 0.1935, + "step": 5471 + }, + { + "epoch": 0.31, + "grad_norm": 1.1178765441595595, + "learning_rate": 1.605051604462905e-05, + "loss": 0.646, + "step": 5472 + }, + { + "epoch": 0.31, + "grad_norm": 0.38129148872885377, + "learning_rate": 1.6049034306834258e-05, + "loss": 0.275, + "step": 5473 + }, + { + "epoch": 0.31, + "grad_norm": 0.4828124601149025, + "learning_rate": 1.6047552359561672e-05, + "loss": 0.3161, + "step": 5474 + }, + { + "epoch": 0.31, + "grad_norm": 0.6728617355740534, + "learning_rate": 1.6046070202862605e-05, + "loss": 0.378, + "step": 5475 + }, + { + "epoch": 0.31, + "grad_norm": 0.37208712058090737, + "learning_rate": 1.6044587836788385e-05, + "loss": 0.2971, + "step": 5476 + }, + { + "epoch": 0.31, + "grad_norm": 0.2293316968318131, + "learning_rate": 1.6043105261390352e-05, + "loss": 0.0965, + "step": 5477 + }, + { + "epoch": 0.31, + "grad_norm": 0.4546947048442612, + "learning_rate": 1.6041622476719842e-05, + "loss": 0.3595, + "step": 5478 + }, + { + "epoch": 0.31, + "grad_norm": 0.3668420835351272, + "learning_rate": 1.6040139482828207e-05, + "loss": 0.2734, + "step": 5479 + }, + { + "epoch": 0.31, + "grad_norm": 1.312312858344449, + "learning_rate": 1.60386562797668e-05, + "loss": 0.7963, + "step": 5480 + }, + { + "epoch": 0.31, + "grad_norm": 0.6925249119934285, + "learning_rate": 1.6037172867586984e-05, + "loss": 0.3647, + "step": 5481 + }, + { + "epoch": 0.31, + "grad_norm": 0.4077235005317406, + "learning_rate": 1.6035689246340132e-05, + "loss": 0.2688, + "step": 5482 + }, + { + "epoch": 0.32, + "grad_norm": 0.34113054517286356, + "learning_rate": 1.603420541607762e-05, + "loss": 0.291, + "step": 5483 + }, + { + "epoch": 0.32, + "grad_norm": 0.23609225173467213, + "learning_rate": 1.6032721376850837e-05, + "loss": 0.1449, + "step": 5484 + }, + { + "epoch": 0.32, + "grad_norm": 0.4239941906638344, + "learning_rate": 1.603123712871117e-05, + "loss": 0.2859, + "step": 5485 + }, + { + "epoch": 0.32, + "grad_norm": 0.42335183714015223, + "learning_rate": 1.602975267171002e-05, + "loss": 0.3501, + "step": 5486 + }, + { + "epoch": 0.32, + "grad_norm": 0.5602162541545916, + "learning_rate": 1.6028268005898798e-05, + "loss": 0.3304, + "step": 5487 + }, + { + "epoch": 0.32, + "grad_norm": 0.4283245085092912, + "learning_rate": 1.6026783131328915e-05, + "loss": 0.292, + "step": 5488 + }, + { + "epoch": 0.32, + "grad_norm": 0.27342802197048033, + "learning_rate": 1.6025298048051784e-05, + "loss": 0.229, + "step": 5489 + }, + { + "epoch": 0.32, + "grad_norm": 0.34821181236786064, + "learning_rate": 1.6023812756118845e-05, + "loss": 0.2565, + "step": 5490 + }, + { + "epoch": 0.32, + "grad_norm": 0.4254165247173863, + "learning_rate": 1.602232725558153e-05, + "loss": 0.3036, + "step": 5491 + }, + { + "epoch": 0.32, + "grad_norm": 0.4819874115221115, + "learning_rate": 1.6020841546491278e-05, + "loss": 0.3834, + "step": 5492 + }, + { + "epoch": 0.32, + "grad_norm": 0.8487213879795744, + "learning_rate": 1.6019355628899545e-05, + "loss": 0.5686, + "step": 5493 + }, + { + "epoch": 0.32, + "grad_norm": 0.33225687121251357, + "learning_rate": 1.6017869502857785e-05, + "loss": 0.2232, + "step": 5494 + }, + { + "epoch": 0.32, + "grad_norm": 0.3973590452255858, + "learning_rate": 1.6016383168417463e-05, + "loss": 0.3365, + "step": 5495 + }, + { + "epoch": 0.32, + "grad_norm": 0.29339420160733515, + "learning_rate": 1.601489662563005e-05, + "loss": 0.1986, + "step": 5496 + }, + { + "epoch": 0.32, + "grad_norm": 0.3666754590084301, + "learning_rate": 1.6013409874547026e-05, + "loss": 0.2422, + "step": 5497 + }, + { + "epoch": 0.32, + "grad_norm": 0.8964912741710412, + "learning_rate": 1.6011922915219877e-05, + "loss": 0.4598, + "step": 5498 + }, + { + "epoch": 0.32, + "grad_norm": 0.38686744503564313, + "learning_rate": 1.6010435747700097e-05, + "loss": 0.3271, + "step": 5499 + }, + { + "epoch": 0.32, + "grad_norm": 0.36402267171787295, + "learning_rate": 1.600894837203918e-05, + "loss": 0.1334, + "step": 5500 + }, + { + "epoch": 0.32, + "grad_norm": 0.39810639722148555, + "learning_rate": 1.6007460788288644e-05, + "loss": 0.3626, + "step": 5501 + }, + { + "epoch": 0.32, + "grad_norm": 0.26937219839700005, + "learning_rate": 1.60059729965e-05, + "loss": 0.237, + "step": 5502 + }, + { + "epoch": 0.32, + "grad_norm": 0.3572405700250666, + "learning_rate": 1.600448499672477e-05, + "loss": 0.2023, + "step": 5503 + }, + { + "epoch": 0.32, + "grad_norm": 0.7519951161732132, + "learning_rate": 1.600299678901448e-05, + "loss": 0.4123, + "step": 5504 + }, + { + "epoch": 0.32, + "grad_norm": 0.8481384242186841, + "learning_rate": 1.6001508373420666e-05, + "loss": 0.609, + "step": 5505 + }, + { + "epoch": 0.32, + "grad_norm": 0.38602966781075637, + "learning_rate": 1.6000019749994882e-05, + "loss": 0.3056, + "step": 5506 + }, + { + "epoch": 0.32, + "grad_norm": 0.3067806475149186, + "learning_rate": 1.5998530918788667e-05, + "loss": 0.254, + "step": 5507 + }, + { + "epoch": 0.32, + "grad_norm": 0.28563113629324893, + "learning_rate": 1.5997041879853585e-05, + "loss": 0.1853, + "step": 5508 + }, + { + "epoch": 0.32, + "grad_norm": 0.560627051750689, + "learning_rate": 1.5995552633241206e-05, + "loss": 0.3737, + "step": 5509 + }, + { + "epoch": 0.32, + "grad_norm": 0.31902956156250195, + "learning_rate": 1.599406317900309e-05, + "loss": 0.2693, + "step": 5510 + }, + { + "epoch": 0.32, + "grad_norm": 1.0324425132045223, + "learning_rate": 1.5992573517190826e-05, + "loss": 0.7321, + "step": 5511 + }, + { + "epoch": 0.32, + "grad_norm": 0.5816127897195071, + "learning_rate": 1.5991083647856003e-05, + "loss": 0.3399, + "step": 5512 + }, + { + "epoch": 0.32, + "grad_norm": 0.3638464303346846, + "learning_rate": 1.5989593571050207e-05, + "loss": 0.2275, + "step": 5513 + }, + { + "epoch": 0.32, + "grad_norm": 0.2997715451537564, + "learning_rate": 1.5988103286825043e-05, + "loss": 0.2977, + "step": 5514 + }, + { + "epoch": 0.32, + "grad_norm": 0.3882699857377122, + "learning_rate": 1.5986612795232122e-05, + "loss": 0.3314, + "step": 5515 + }, + { + "epoch": 0.32, + "grad_norm": 0.4607329976592203, + "learning_rate": 1.5985122096323053e-05, + "loss": 0.3253, + "step": 5516 + }, + { + "epoch": 0.32, + "grad_norm": 0.42979471017625737, + "learning_rate": 1.5983631190149466e-05, + "loss": 0.3066, + "step": 5517 + }, + { + "epoch": 0.32, + "grad_norm": 0.3052137957577575, + "learning_rate": 1.5982140076762988e-05, + "loss": 0.2657, + "step": 5518 + }, + { + "epoch": 0.32, + "grad_norm": 0.4134905162494446, + "learning_rate": 1.5980648756215256e-05, + "loss": 0.3369, + "step": 5519 + }, + { + "epoch": 0.32, + "grad_norm": 0.5637451037693646, + "learning_rate": 1.597915722855792e-05, + "loss": 0.3639, + "step": 5520 + }, + { + "epoch": 0.32, + "grad_norm": 0.37421259673932755, + "learning_rate": 1.597766549384262e-05, + "loss": 0.2469, + "step": 5521 + }, + { + "epoch": 0.32, + "grad_norm": 0.3058915736850958, + "learning_rate": 1.5976173552121023e-05, + "loss": 0.2922, + "step": 5522 + }, + { + "epoch": 0.32, + "grad_norm": 0.2841295684463912, + "learning_rate": 1.597468140344479e-05, + "loss": 0.1977, + "step": 5523 + }, + { + "epoch": 0.32, + "grad_norm": 0.6162024062628615, + "learning_rate": 1.59731890478656e-05, + "loss": 0.2815, + "step": 5524 + }, + { + "epoch": 0.32, + "grad_norm": 0.4094893668831604, + "learning_rate": 1.5971696485435128e-05, + "loss": 0.3378, + "step": 5525 + }, + { + "epoch": 0.32, + "grad_norm": 0.40588421773351036, + "learning_rate": 1.5970203716205066e-05, + "loss": 0.2895, + "step": 5526 + }, + { + "epoch": 0.32, + "grad_norm": 0.6938594055918771, + "learning_rate": 1.5968710740227106e-05, + "loss": 0.3855, + "step": 5527 + }, + { + "epoch": 0.32, + "grad_norm": 0.4116993177994943, + "learning_rate": 1.5967217557552944e-05, + "loss": 0.329, + "step": 5528 + }, + { + "epoch": 0.32, + "grad_norm": 0.22779602155734383, + "learning_rate": 1.5965724168234304e-05, + "loss": 0.086, + "step": 5529 + }, + { + "epoch": 0.32, + "grad_norm": 0.3500048899822258, + "learning_rate": 1.5964230572322884e-05, + "loss": 0.2697, + "step": 5530 + }, + { + "epoch": 0.32, + "grad_norm": 0.4157625173818804, + "learning_rate": 1.596273676987042e-05, + "loss": 0.3202, + "step": 5531 + }, + { + "epoch": 0.32, + "grad_norm": 0.8107650750102577, + "learning_rate": 1.596124276092864e-05, + "loss": 0.4456, + "step": 5532 + }, + { + "epoch": 0.32, + "grad_norm": 0.35678553696656035, + "learning_rate": 1.595974854554928e-05, + "loss": 0.2202, + "step": 5533 + }, + { + "epoch": 0.32, + "grad_norm": 0.40454325538013913, + "learning_rate": 1.5958254123784077e-05, + "loss": 0.3206, + "step": 5534 + }, + { + "epoch": 0.32, + "grad_norm": 0.2564749702657718, + "learning_rate": 1.5956759495684796e-05, + "loss": 0.2145, + "step": 5535 + }, + { + "epoch": 0.32, + "grad_norm": 0.37351136197316315, + "learning_rate": 1.595526466130319e-05, + "loss": 0.1787, + "step": 5536 + }, + { + "epoch": 0.32, + "grad_norm": 0.5263171270492955, + "learning_rate": 1.595376962069102e-05, + "loss": 0.3519, + "step": 5537 + }, + { + "epoch": 0.32, + "grad_norm": 0.42044904764267466, + "learning_rate": 1.5952274373900067e-05, + "loss": 0.3532, + "step": 5538 + }, + { + "epoch": 0.32, + "grad_norm": 0.45297528673077875, + "learning_rate": 1.5950778920982108e-05, + "loss": 0.2163, + "step": 5539 + }, + { + "epoch": 0.32, + "grad_norm": 0.3791943745784225, + "learning_rate": 1.5949283261988934e-05, + "loss": 0.3221, + "step": 5540 + }, + { + "epoch": 0.32, + "grad_norm": 0.2635293913337198, + "learning_rate": 1.5947787396972332e-05, + "loss": 0.2126, + "step": 5541 + }, + { + "epoch": 0.32, + "grad_norm": 0.39755428740194615, + "learning_rate": 1.5946291325984108e-05, + "loss": 0.2607, + "step": 5542 + }, + { + "epoch": 0.32, + "grad_norm": 0.3970521445893526, + "learning_rate": 1.5944795049076072e-05, + "loss": 0.3215, + "step": 5543 + }, + { + "epoch": 0.32, + "grad_norm": 0.6915226148979662, + "learning_rate": 1.594329856630004e-05, + "loss": 0.5286, + "step": 5544 + }, + { + "epoch": 0.32, + "grad_norm": 1.0272727002837614, + "learning_rate": 1.5941801877707835e-05, + "loss": 0.5231, + "step": 5545 + }, + { + "epoch": 0.32, + "grad_norm": 0.2711694648544462, + "learning_rate": 1.594030498335129e-05, + "loss": 0.2551, + "step": 5546 + }, + { + "epoch": 0.32, + "grad_norm": 1.0522809594213387, + "learning_rate": 1.5938807883282234e-05, + "loss": 0.6938, + "step": 5547 + }, + { + "epoch": 0.32, + "grad_norm": 0.2780677764777073, + "learning_rate": 1.5937310577552517e-05, + "loss": 0.2152, + "step": 5548 + }, + { + "epoch": 0.32, + "grad_norm": 0.3764746606296057, + "learning_rate": 1.5935813066213988e-05, + "loss": 0.254, + "step": 5549 + }, + { + "epoch": 0.32, + "grad_norm": 0.39611418844585977, + "learning_rate": 1.593431534931851e-05, + "loss": 0.345, + "step": 5550 + }, + { + "epoch": 0.32, + "grad_norm": 0.6565438887638069, + "learning_rate": 1.5932817426917945e-05, + "loss": 0.4974, + "step": 5551 + }, + { + "epoch": 0.32, + "grad_norm": 0.3633914159676232, + "learning_rate": 1.593131929906417e-05, + "loss": 0.2314, + "step": 5552 + }, + { + "epoch": 0.32, + "grad_norm": 0.31526102547006846, + "learning_rate": 1.5929820965809063e-05, + "loss": 0.2752, + "step": 5553 + }, + { + "epoch": 0.32, + "grad_norm": 0.3736946671960914, + "learning_rate": 1.592832242720451e-05, + "loss": 0.3219, + "step": 5554 + }, + { + "epoch": 0.32, + "grad_norm": 0.3615427063133629, + "learning_rate": 1.5926823683302404e-05, + "loss": 0.2956, + "step": 5555 + }, + { + "epoch": 0.32, + "grad_norm": 0.40873885213362576, + "learning_rate": 1.5925324734154654e-05, + "loss": 0.2416, + "step": 5556 + }, + { + "epoch": 0.32, + "grad_norm": 0.5019426658331753, + "learning_rate": 1.5923825579813158e-05, + "loss": 0.4355, + "step": 5557 + }, + { + "epoch": 0.32, + "grad_norm": 0.3081784728369049, + "learning_rate": 1.592232622032984e-05, + "loss": 0.2679, + "step": 5558 + }, + { + "epoch": 0.32, + "grad_norm": 0.581079170192961, + "learning_rate": 1.5920826655756617e-05, + "loss": 0.4, + "step": 5559 + }, + { + "epoch": 0.32, + "grad_norm": 0.5650872586904916, + "learning_rate": 1.591932688614542e-05, + "loss": 0.4542, + "step": 5560 + }, + { + "epoch": 0.32, + "grad_norm": 0.27258563527685675, + "learning_rate": 1.5917826911548194e-05, + "loss": 0.2233, + "step": 5561 + }, + { + "epoch": 0.32, + "grad_norm": 0.2959203113731466, + "learning_rate": 1.591632673201687e-05, + "loss": 0.2158, + "step": 5562 + }, + { + "epoch": 0.32, + "grad_norm": 1.2341214992158127, + "learning_rate": 1.5914826347603412e-05, + "loss": 0.7517, + "step": 5563 + }, + { + "epoch": 0.32, + "grad_norm": 0.33028417983241837, + "learning_rate": 1.591332575835977e-05, + "loss": 0.2764, + "step": 5564 + }, + { + "epoch": 0.32, + "grad_norm": 0.46893872523644764, + "learning_rate": 1.591182496433791e-05, + "loss": 0.3126, + "step": 5565 + }, + { + "epoch": 0.32, + "grad_norm": 0.5536360678259141, + "learning_rate": 1.5910323965589803e-05, + "loss": 0.3726, + "step": 5566 + }, + { + "epoch": 0.32, + "grad_norm": 0.3735817111568126, + "learning_rate": 1.5908822762167435e-05, + "loss": 0.3044, + "step": 5567 + }, + { + "epoch": 0.32, + "grad_norm": 0.35011633102273504, + "learning_rate": 1.5907321354122788e-05, + "loss": 0.2051, + "step": 5568 + }, + { + "epoch": 0.32, + "grad_norm": 0.3548591682635962, + "learning_rate": 1.5905819741507856e-05, + "loss": 0.2592, + "step": 5569 + }, + { + "epoch": 0.32, + "grad_norm": 0.3581690176525001, + "learning_rate": 1.590431792437464e-05, + "loss": 0.3012, + "step": 5570 + }, + { + "epoch": 0.32, + "grad_norm": 0.7579819660734486, + "learning_rate": 1.590281590277515e-05, + "loss": 0.5101, + "step": 5571 + }, + { + "epoch": 0.32, + "grad_norm": 0.4774255317314944, + "learning_rate": 1.5901313676761397e-05, + "loss": 0.3312, + "step": 5572 + }, + { + "epoch": 0.32, + "grad_norm": 0.3935810128176364, + "learning_rate": 1.5899811246385404e-05, + "loss": 0.2783, + "step": 5573 + }, + { + "epoch": 0.32, + "grad_norm": 0.2434832696741011, + "learning_rate": 1.5898308611699204e-05, + "loss": 0.2066, + "step": 5574 + }, + { + "epoch": 0.32, + "grad_norm": 0.838098712521921, + "learning_rate": 1.589680577275483e-05, + "loss": 0.4197, + "step": 5575 + }, + { + "epoch": 0.32, + "grad_norm": 0.38018436720857546, + "learning_rate": 1.5895302729604323e-05, + "loss": 0.274, + "step": 5576 + }, + { + "epoch": 0.32, + "grad_norm": 0.5879533204990789, + "learning_rate": 1.589379948229974e-05, + "loss": 0.4019, + "step": 5577 + }, + { + "epoch": 0.32, + "grad_norm": 0.469626009070046, + "learning_rate": 1.5892296030893134e-05, + "loss": 0.2904, + "step": 5578 + }, + { + "epoch": 0.32, + "grad_norm": 0.3569116943337263, + "learning_rate": 1.5890792375436568e-05, + "loss": 0.2826, + "step": 5579 + }, + { + "epoch": 0.32, + "grad_norm": 0.30781010648834556, + "learning_rate": 1.5889288515982118e-05, + "loss": 0.214, + "step": 5580 + }, + { + "epoch": 0.32, + "grad_norm": 0.4452871745168325, + "learning_rate": 1.5887784452581857e-05, + "loss": 0.3477, + "step": 5581 + }, + { + "epoch": 0.32, + "grad_norm": 0.3423890676840111, + "learning_rate": 1.5886280185287874e-05, + "loss": 0.2258, + "step": 5582 + }, + { + "epoch": 0.32, + "grad_norm": 0.9397225493977781, + "learning_rate": 1.588477571415226e-05, + "loss": 0.4489, + "step": 5583 + }, + { + "epoch": 0.32, + "grad_norm": 0.8992191750862868, + "learning_rate": 1.588327103922712e-05, + "loss": 0.5669, + "step": 5584 + }, + { + "epoch": 0.32, + "grad_norm": 0.34519578977057513, + "learning_rate": 1.5881766160564553e-05, + "loss": 0.2008, + "step": 5585 + }, + { + "epoch": 0.32, + "grad_norm": 0.2613946128658462, + "learning_rate": 1.588026107821668e-05, + "loss": 0.2106, + "step": 5586 + }, + { + "epoch": 0.32, + "grad_norm": 1.1043781598317808, + "learning_rate": 1.5878755792235616e-05, + "loss": 0.5675, + "step": 5587 + }, + { + "epoch": 0.32, + "grad_norm": 0.3429172657775994, + "learning_rate": 1.5877250302673493e-05, + "loss": 0.2068, + "step": 5588 + }, + { + "epoch": 0.32, + "grad_norm": 0.6522423001767897, + "learning_rate": 1.5875744609582444e-05, + "loss": 0.3912, + "step": 5589 + }, + { + "epoch": 0.32, + "grad_norm": 0.4731252524113648, + "learning_rate": 1.587423871301461e-05, + "loss": 0.3802, + "step": 5590 + }, + { + "epoch": 0.32, + "grad_norm": 0.33128379858329554, + "learning_rate": 1.5872732613022147e-05, + "loss": 0.2088, + "step": 5591 + }, + { + "epoch": 0.32, + "grad_norm": 0.2749688226198136, + "learning_rate": 1.5871226309657203e-05, + "loss": 0.1898, + "step": 5592 + }, + { + "epoch": 0.32, + "grad_norm": 0.449249676522092, + "learning_rate": 1.5869719802971947e-05, + "loss": 0.3547, + "step": 5593 + }, + { + "epoch": 0.32, + "grad_norm": 0.545477058578212, + "learning_rate": 1.5868213093018543e-05, + "loss": 0.3069, + "step": 5594 + }, + { + "epoch": 0.32, + "grad_norm": 0.7375781226958218, + "learning_rate": 1.5866706179849172e-05, + "loss": 0.3483, + "step": 5595 + }, + { + "epoch": 0.32, + "grad_norm": 1.0617075868345798, + "learning_rate": 1.586519906351602e-05, + "loss": 0.6041, + "step": 5596 + }, + { + "epoch": 0.32, + "grad_norm": 0.3294184144488115, + "learning_rate": 1.586369174407128e-05, + "loss": 0.2779, + "step": 5597 + }, + { + "epoch": 0.32, + "grad_norm": 0.2594062832915638, + "learning_rate": 1.5862184221567144e-05, + "loss": 0.1825, + "step": 5598 + }, + { + "epoch": 0.32, + "grad_norm": 0.9733876205705159, + "learning_rate": 1.586067649605582e-05, + "loss": 0.5383, + "step": 5599 + }, + { + "epoch": 0.32, + "grad_norm": 0.4270131446780353, + "learning_rate": 1.585916856758952e-05, + "loss": 0.2909, + "step": 5600 + }, + { + "epoch": 0.32, + "grad_norm": 0.5168643875206522, + "learning_rate": 1.5857660436220466e-05, + "loss": 0.315, + "step": 5601 + }, + { + "epoch": 0.32, + "grad_norm": 0.5136780415745466, + "learning_rate": 1.5856152102000878e-05, + "loss": 0.3597, + "step": 5602 + }, + { + "epoch": 0.32, + "grad_norm": 0.3472687358366042, + "learning_rate": 1.5854643564983e-05, + "loss": 0.2772, + "step": 5603 + }, + { + "epoch": 0.32, + "grad_norm": 0.21984666591055935, + "learning_rate": 1.5853134825219066e-05, + "loss": 0.1203, + "step": 5604 + }, + { + "epoch": 0.32, + "grad_norm": 0.5955025245870108, + "learning_rate": 1.5851625882761326e-05, + "loss": 0.3752, + "step": 5605 + }, + { + "epoch": 0.32, + "grad_norm": 0.3950271910878111, + "learning_rate": 1.585011673766203e-05, + "loss": 0.2838, + "step": 5606 + }, + { + "epoch": 0.32, + "grad_norm": 0.6397240293769488, + "learning_rate": 1.5848607389973446e-05, + "loss": 0.4141, + "step": 5607 + }, + { + "epoch": 0.32, + "grad_norm": 0.4432763685637882, + "learning_rate": 1.584709783974784e-05, + "loss": 0.2897, + "step": 5608 + }, + { + "epoch": 0.32, + "grad_norm": 0.4240833841777455, + "learning_rate": 1.5845588087037484e-05, + "loss": 0.2904, + "step": 5609 + }, + { + "epoch": 0.32, + "grad_norm": 0.43798524592763793, + "learning_rate": 1.5844078131894668e-05, + "loss": 0.3254, + "step": 5610 + }, + { + "epoch": 0.32, + "grad_norm": 0.44403383059135365, + "learning_rate": 1.5842567974371672e-05, + "loss": 0.2548, + "step": 5611 + }, + { + "epoch": 0.32, + "grad_norm": 0.37943493120580635, + "learning_rate": 1.5841057614520803e-05, + "loss": 0.3146, + "step": 5612 + }, + { + "epoch": 0.32, + "grad_norm": 0.3330746925906659, + "learning_rate": 1.5839547052394356e-05, + "loss": 0.3125, + "step": 5613 + }, + { + "epoch": 0.32, + "grad_norm": 0.36957538435382176, + "learning_rate": 1.583803628804465e-05, + "loss": 0.1807, + "step": 5614 + }, + { + "epoch": 0.32, + "grad_norm": 0.3799639377524021, + "learning_rate": 1.5836525321523998e-05, + "loss": 0.2976, + "step": 5615 + }, + { + "epoch": 0.32, + "grad_norm": 0.5407995328754999, + "learning_rate": 1.5835014152884722e-05, + "loss": 0.371, + "step": 5616 + }, + { + "epoch": 0.32, + "grad_norm": 0.4602781375274827, + "learning_rate": 1.5833502782179157e-05, + "loss": 0.3056, + "step": 5617 + }, + { + "epoch": 0.32, + "grad_norm": 0.2703581818132286, + "learning_rate": 1.5831991209459646e-05, + "loss": 0.2294, + "step": 5618 + }, + { + "epoch": 0.32, + "grad_norm": 1.3941351735792076, + "learning_rate": 1.583047943477853e-05, + "loss": 0.8033, + "step": 5619 + }, + { + "epoch": 0.32, + "grad_norm": 0.4269407961412312, + "learning_rate": 1.5828967458188155e-05, + "loss": 0.2612, + "step": 5620 + }, + { + "epoch": 0.32, + "grad_norm": 0.32749877211677897, + "learning_rate": 1.5827455279740892e-05, + "loss": 0.2406, + "step": 5621 + }, + { + "epoch": 0.32, + "grad_norm": 0.7597682432080118, + "learning_rate": 1.5825942899489103e-05, + "loss": 0.4945, + "step": 5622 + }, + { + "epoch": 0.32, + "grad_norm": 0.6648356899162475, + "learning_rate": 1.5824430317485163e-05, + "loss": 0.4611, + "step": 5623 + }, + { + "epoch": 0.32, + "grad_norm": 0.3576858149221899, + "learning_rate": 1.582291753378145e-05, + "loss": 0.1644, + "step": 5624 + }, + { + "epoch": 0.32, + "grad_norm": 0.30960874021564383, + "learning_rate": 1.5821404548430352e-05, + "loss": 0.2993, + "step": 5625 + }, + { + "epoch": 0.32, + "grad_norm": 0.2692899639231665, + "learning_rate": 1.5819891361484266e-05, + "loss": 0.2074, + "step": 5626 + }, + { + "epoch": 0.32, + "grad_norm": 0.362044309838756, + "learning_rate": 1.5818377972995594e-05, + "loss": 0.2007, + "step": 5627 + }, + { + "epoch": 0.32, + "grad_norm": 0.5932912412651169, + "learning_rate": 1.581686438301674e-05, + "loss": 0.3933, + "step": 5628 + }, + { + "epoch": 0.32, + "grad_norm": 0.37069755753292, + "learning_rate": 1.5815350591600124e-05, + "loss": 0.3382, + "step": 5629 + }, + { + "epoch": 0.32, + "grad_norm": 0.30830834981343785, + "learning_rate": 1.5813836598798168e-05, + "loss": 0.1911, + "step": 5630 + }, + { + "epoch": 0.32, + "grad_norm": 0.36059298843687776, + "learning_rate": 1.5812322404663304e-05, + "loss": 0.2967, + "step": 5631 + }, + { + "epoch": 0.32, + "grad_norm": 0.385845649979882, + "learning_rate": 1.581080800924796e-05, + "loss": 0.1999, + "step": 5632 + }, + { + "epoch": 0.32, + "grad_norm": 0.33126056841829, + "learning_rate": 1.5809293412604584e-05, + "loss": 0.299, + "step": 5633 + }, + { + "epoch": 0.32, + "grad_norm": 0.386058427132325, + "learning_rate": 1.580777861478563e-05, + "loss": 0.3071, + "step": 5634 + }, + { + "epoch": 0.32, + "grad_norm": 0.7101238664226756, + "learning_rate": 1.580626361584355e-05, + "loss": 0.5124, + "step": 5635 + }, + { + "epoch": 0.32, + "grad_norm": 0.589818500555523, + "learning_rate": 1.5804748415830814e-05, + "loss": 0.4467, + "step": 5636 + }, + { + "epoch": 0.32, + "grad_norm": 0.2932950338270886, + "learning_rate": 1.5803233014799887e-05, + "loss": 0.2563, + "step": 5637 + }, + { + "epoch": 0.32, + "grad_norm": 0.2636118456350962, + "learning_rate": 1.5801717412803246e-05, + "loss": 0.2042, + "step": 5638 + }, + { + "epoch": 0.32, + "grad_norm": 0.3969761989987564, + "learning_rate": 1.580020160989339e-05, + "loss": 0.3166, + "step": 5639 + }, + { + "epoch": 0.32, + "grad_norm": 0.5113430169982263, + "learning_rate": 1.5798685606122795e-05, + "loss": 0.3068, + "step": 5640 + }, + { + "epoch": 0.32, + "grad_norm": 0.40055815261009947, + "learning_rate": 1.579716940154397e-05, + "loss": 0.3314, + "step": 5641 + }, + { + "epoch": 0.32, + "grad_norm": 0.7927953903429585, + "learning_rate": 1.5795652996209416e-05, + "loss": 0.5219, + "step": 5642 + }, + { + "epoch": 0.32, + "grad_norm": 0.37086620699465855, + "learning_rate": 1.579413639017165e-05, + "loss": 0.3161, + "step": 5643 + }, + { + "epoch": 0.32, + "grad_norm": 0.24236166483538968, + "learning_rate": 1.5792619583483183e-05, + "loss": 0.1854, + "step": 5644 + }, + { + "epoch": 0.32, + "grad_norm": 0.6014945831118405, + "learning_rate": 1.5791102576196555e-05, + "loss": 0.4099, + "step": 5645 + }, + { + "epoch": 0.32, + "grad_norm": 0.3698987883509682, + "learning_rate": 1.5789585368364296e-05, + "loss": 0.3163, + "step": 5646 + }, + { + "epoch": 0.32, + "grad_norm": 0.7593189596779604, + "learning_rate": 1.5788067960038942e-05, + "loss": 0.3718, + "step": 5647 + }, + { + "epoch": 0.32, + "grad_norm": 0.5227360365319026, + "learning_rate": 1.5786550351273043e-05, + "loss": 0.3476, + "step": 5648 + }, + { + "epoch": 0.32, + "grad_norm": 0.34840131645436817, + "learning_rate": 1.5785032542119155e-05, + "loss": 0.3044, + "step": 5649 + }, + { + "epoch": 0.32, + "grad_norm": 0.3537908681190076, + "learning_rate": 1.578351453262984e-05, + "loss": 0.1915, + "step": 5650 + }, + { + "epoch": 0.32, + "grad_norm": 0.5534665540651362, + "learning_rate": 1.578199632285766e-05, + "loss": 0.3911, + "step": 5651 + }, + { + "epoch": 0.32, + "grad_norm": 0.3296603796034221, + "learning_rate": 1.57804779128552e-05, + "loss": 0.2536, + "step": 5652 + }, + { + "epoch": 0.32, + "grad_norm": 0.36719045075931117, + "learning_rate": 1.577895930267504e-05, + "loss": 0.255, + "step": 5653 + }, + { + "epoch": 0.32, + "grad_norm": 0.5549103218709931, + "learning_rate": 1.5777440492369764e-05, + "loss": 0.4179, + "step": 5654 + }, + { + "epoch": 0.32, + "grad_norm": 0.4099773567180406, + "learning_rate": 1.5775921481991976e-05, + "loss": 0.3489, + "step": 5655 + }, + { + "epoch": 0.32, + "grad_norm": 0.9487980274852443, + "learning_rate": 1.5774402271594272e-05, + "loss": 0.4496, + "step": 5656 + }, + { + "epoch": 0.33, + "grad_norm": 0.30146272872090246, + "learning_rate": 1.577288286122927e-05, + "loss": 0.2396, + "step": 5657 + }, + { + "epoch": 0.33, + "grad_norm": 0.32480569594008457, + "learning_rate": 1.5771363250949582e-05, + "loss": 0.246, + "step": 5658 + }, + { + "epoch": 0.33, + "grad_norm": 0.3711344965945374, + "learning_rate": 1.5769843440807828e-05, + "loss": 0.2793, + "step": 5659 + }, + { + "epoch": 0.33, + "grad_norm": 0.3992119007745314, + "learning_rate": 1.5768323430856647e-05, + "loss": 0.2608, + "step": 5660 + }, + { + "epoch": 0.33, + "grad_norm": 0.335697923487834, + "learning_rate": 1.5766803221148676e-05, + "loss": 0.2743, + "step": 5661 + }, + { + "epoch": 0.33, + "grad_norm": 1.1159080031165294, + "learning_rate": 1.576528281173655e-05, + "loss": 0.4803, + "step": 5662 + }, + { + "epoch": 0.33, + "grad_norm": 0.24838037661375026, + "learning_rate": 1.5763762202672933e-05, + "loss": 0.1415, + "step": 5663 + }, + { + "epoch": 0.33, + "grad_norm": 0.30420554106490244, + "learning_rate": 1.576224139401048e-05, + "loss": 0.2233, + "step": 5664 + }, + { + "epoch": 0.33, + "grad_norm": 0.422273654707413, + "learning_rate": 1.5760720385801855e-05, + "loss": 0.3285, + "step": 5665 + }, + { + "epoch": 0.33, + "grad_norm": 0.6169292780868955, + "learning_rate": 1.575919917809973e-05, + "loss": 0.2722, + "step": 5666 + }, + { + "epoch": 0.33, + "grad_norm": 0.4049489150331608, + "learning_rate": 1.5757677770956785e-05, + "loss": 0.2992, + "step": 5667 + }, + { + "epoch": 0.33, + "grad_norm": 0.8471162417110218, + "learning_rate": 1.5756156164425703e-05, + "loss": 0.5039, + "step": 5668 + }, + { + "epoch": 0.33, + "grad_norm": 0.4114900323796081, + "learning_rate": 1.5754634358559187e-05, + "loss": 0.2948, + "step": 5669 + }, + { + "epoch": 0.33, + "grad_norm": 0.24786075406450575, + "learning_rate": 1.5753112353409928e-05, + "loss": 0.1681, + "step": 5670 + }, + { + "epoch": 0.33, + "grad_norm": 1.1452188003836672, + "learning_rate": 1.5751590149030632e-05, + "loss": 0.7431, + "step": 5671 + }, + { + "epoch": 0.33, + "grad_norm": 0.4562773003590891, + "learning_rate": 1.575006774547402e-05, + "loss": 0.3109, + "step": 5672 + }, + { + "epoch": 0.33, + "grad_norm": 0.434282672437878, + "learning_rate": 1.5748545142792807e-05, + "loss": 0.2306, + "step": 5673 + }, + { + "epoch": 0.33, + "grad_norm": 1.0737371125385688, + "learning_rate": 1.5747022341039727e-05, + "loss": 0.4466, + "step": 5674 + }, + { + "epoch": 0.33, + "grad_norm": 0.719146996338594, + "learning_rate": 1.5745499340267508e-05, + "loss": 0.4678, + "step": 5675 + }, + { + "epoch": 0.33, + "grad_norm": 0.314472948884982, + "learning_rate": 1.5743976140528893e-05, + "loss": 0.1734, + "step": 5676 + }, + { + "epoch": 0.33, + "grad_norm": 0.40527113405825066, + "learning_rate": 1.5742452741876632e-05, + "loss": 0.2722, + "step": 5677 + }, + { + "epoch": 0.33, + "grad_norm": 0.7245521666416738, + "learning_rate": 1.574092914436348e-05, + "loss": 0.4212, + "step": 5678 + }, + { + "epoch": 0.33, + "grad_norm": 0.4596505412358331, + "learning_rate": 1.5739405348042197e-05, + "loss": 0.2702, + "step": 5679 + }, + { + "epoch": 0.33, + "grad_norm": 0.42284574176881146, + "learning_rate": 1.5737881352965556e-05, + "loss": 0.34, + "step": 5680 + }, + { + "epoch": 0.33, + "grad_norm": 1.7840092327565245, + "learning_rate": 1.573635715918633e-05, + "loss": 0.7355, + "step": 5681 + }, + { + "epoch": 0.33, + "grad_norm": 0.3697786735178063, + "learning_rate": 1.5734832766757302e-05, + "loss": 0.2946, + "step": 5682 + }, + { + "epoch": 0.33, + "grad_norm": 0.26905521183914954, + "learning_rate": 1.573330817573126e-05, + "loss": 0.1296, + "step": 5683 + }, + { + "epoch": 0.33, + "grad_norm": 0.3968443457830917, + "learning_rate": 1.5731783386161007e-05, + "loss": 0.3475, + "step": 5684 + }, + { + "epoch": 0.33, + "grad_norm": 0.3421390360738499, + "learning_rate": 1.5730258398099335e-05, + "loss": 0.2744, + "step": 5685 + }, + { + "epoch": 0.33, + "grad_norm": 0.7376461566219767, + "learning_rate": 1.5728733211599067e-05, + "loss": 0.3775, + "step": 5686 + }, + { + "epoch": 0.33, + "grad_norm": 1.2203592124029954, + "learning_rate": 1.572720782671301e-05, + "loss": 0.6442, + "step": 5687 + }, + { + "epoch": 0.33, + "grad_norm": 0.3257697053227165, + "learning_rate": 1.5725682243493995e-05, + "loss": 0.2935, + "step": 5688 + }, + { + "epoch": 0.33, + "grad_norm": 0.24273051299883028, + "learning_rate": 1.572415646199485e-05, + "loss": 0.1582, + "step": 5689 + }, + { + "epoch": 0.33, + "grad_norm": 0.7445388041307233, + "learning_rate": 1.5722630482268413e-05, + "loss": 0.4049, + "step": 5690 + }, + { + "epoch": 0.33, + "grad_norm": 0.3916307018462784, + "learning_rate": 1.5721104304367526e-05, + "loss": 0.318, + "step": 5691 + }, + { + "epoch": 0.33, + "grad_norm": 0.3538495633272354, + "learning_rate": 1.5719577928345045e-05, + "loss": 0.2772, + "step": 5692 + }, + { + "epoch": 0.33, + "grad_norm": 0.7364894330489864, + "learning_rate": 1.5718051354253828e-05, + "loss": 0.4585, + "step": 5693 + }, + { + "epoch": 0.33, + "grad_norm": 0.4082152277198964, + "learning_rate": 1.5716524582146734e-05, + "loss": 0.304, + "step": 5694 + }, + { + "epoch": 0.33, + "grad_norm": 0.31484704632365135, + "learning_rate": 1.5714997612076643e-05, + "loss": 0.232, + "step": 5695 + }, + { + "epoch": 0.33, + "grad_norm": 0.39016338699647013, + "learning_rate": 1.571347044409643e-05, + "loss": 0.3046, + "step": 5696 + }, + { + "epoch": 0.33, + "grad_norm": 0.3233481171244161, + "learning_rate": 1.571194307825898e-05, + "loss": 0.2492, + "step": 5697 + }, + { + "epoch": 0.33, + "grad_norm": 1.092219467523616, + "learning_rate": 1.5710415514617187e-05, + "loss": 0.7269, + "step": 5698 + }, + { + "epoch": 0.33, + "grad_norm": 1.0398976979337753, + "learning_rate": 1.5708887753223953e-05, + "loss": 0.2765, + "step": 5699 + }, + { + "epoch": 0.33, + "grad_norm": 0.29904995978400906, + "learning_rate": 1.5707359794132178e-05, + "loss": 0.2535, + "step": 5700 + }, + { + "epoch": 0.33, + "grad_norm": 0.36950212436020996, + "learning_rate": 1.5705831637394783e-05, + "loss": 0.3008, + "step": 5701 + }, + { + "epoch": 0.33, + "grad_norm": 0.5786942167807272, + "learning_rate": 1.570430328306468e-05, + "loss": 0.3812, + "step": 5702 + }, + { + "epoch": 0.33, + "grad_norm": 0.39500817970292046, + "learning_rate": 1.5702774731194802e-05, + "loss": 0.3126, + "step": 5703 + }, + { + "epoch": 0.33, + "grad_norm": 0.3453869948145563, + "learning_rate": 1.570124598183808e-05, + "loss": 0.2725, + "step": 5704 + }, + { + "epoch": 0.33, + "grad_norm": 0.4214946705124231, + "learning_rate": 1.569971703504745e-05, + "loss": 0.2872, + "step": 5705 + }, + { + "epoch": 0.33, + "grad_norm": 0.3257990453502148, + "learning_rate": 1.5698187890875867e-05, + "loss": 0.2885, + "step": 5706 + }, + { + "epoch": 0.33, + "grad_norm": 0.6989314635364635, + "learning_rate": 1.5696658549376286e-05, + "loss": 0.5029, + "step": 5707 + }, + { + "epoch": 0.33, + "grad_norm": 0.34822630078350825, + "learning_rate": 1.569512901060166e-05, + "loss": 0.3442, + "step": 5708 + }, + { + "epoch": 0.33, + "grad_norm": 0.34505897851320166, + "learning_rate": 1.569359927460496e-05, + "loss": 0.1921, + "step": 5709 + }, + { + "epoch": 0.33, + "grad_norm": 0.2695284101966276, + "learning_rate": 1.5692069341439164e-05, + "loss": 0.2228, + "step": 5710 + }, + { + "epoch": 0.33, + "grad_norm": 0.5436723132485777, + "learning_rate": 1.5690539211157255e-05, + "loss": 0.3901, + "step": 5711 + }, + { + "epoch": 0.33, + "grad_norm": 0.33269868149267295, + "learning_rate": 1.5689008883812212e-05, + "loss": 0.2106, + "step": 5712 + }, + { + "epoch": 0.33, + "grad_norm": 0.5032767085391571, + "learning_rate": 1.568747835945704e-05, + "loss": 0.3967, + "step": 5713 + }, + { + "epoch": 0.33, + "grad_norm": 0.7674497920759249, + "learning_rate": 1.5685947638144736e-05, + "loss": 0.6305, + "step": 5714 + }, + { + "epoch": 0.33, + "grad_norm": 0.33746113836252484, + "learning_rate": 1.5684416719928314e-05, + "loss": 0.2059, + "step": 5715 + }, + { + "epoch": 0.33, + "grad_norm": 0.27730558315725434, + "learning_rate": 1.568288560486078e-05, + "loss": 0.2342, + "step": 5716 + }, + { + "epoch": 0.33, + "grad_norm": 0.8937613239506347, + "learning_rate": 1.5681354292995164e-05, + "loss": 0.5943, + "step": 5717 + }, + { + "epoch": 0.33, + "grad_norm": 0.32282729775940155, + "learning_rate": 1.5679822784384492e-05, + "loss": 0.213, + "step": 5718 + }, + { + "epoch": 0.33, + "grad_norm": 1.0217359415141611, + "learning_rate": 1.56782910790818e-05, + "loss": 0.4414, + "step": 5719 + }, + { + "epoch": 0.33, + "grad_norm": 0.37906399149967873, + "learning_rate": 1.5676759177140132e-05, + "loss": 0.3148, + "step": 5720 + }, + { + "epoch": 0.33, + "grad_norm": 0.32548021526366716, + "learning_rate": 1.567522707861254e-05, + "loss": 0.2696, + "step": 5721 + }, + { + "epoch": 0.33, + "grad_norm": 0.27672622569811445, + "learning_rate": 1.567369478355208e-05, + "loss": 0.1349, + "step": 5722 + }, + { + "epoch": 0.33, + "grad_norm": 0.34813866318867676, + "learning_rate": 1.567216229201181e-05, + "loss": 0.2762, + "step": 5723 + }, + { + "epoch": 0.33, + "grad_norm": 0.31379746680097137, + "learning_rate": 1.5670629604044804e-05, + "loss": 0.2616, + "step": 5724 + }, + { + "epoch": 0.33, + "grad_norm": 0.8743307806116454, + "learning_rate": 1.566909671970414e-05, + "loss": 0.3819, + "step": 5725 + }, + { + "epoch": 0.33, + "grad_norm": 0.9740313278256991, + "learning_rate": 1.5667563639042904e-05, + "loss": 0.6429, + "step": 5726 + }, + { + "epoch": 0.33, + "grad_norm": 0.3849691561398392, + "learning_rate": 1.5666030362114175e-05, + "loss": 0.2787, + "step": 5727 + }, + { + "epoch": 0.33, + "grad_norm": 0.29480543187124647, + "learning_rate": 1.566449688897106e-05, + "loss": 0.2065, + "step": 5728 + }, + { + "epoch": 0.33, + "grad_norm": 0.5564381113672089, + "learning_rate": 1.5662963219666666e-05, + "loss": 0.3209, + "step": 5729 + }, + { + "epoch": 0.33, + "grad_norm": 0.6185522440055002, + "learning_rate": 1.5661429354254096e-05, + "loss": 0.3824, + "step": 5730 + }, + { + "epoch": 0.33, + "grad_norm": 0.40049158281522773, + "learning_rate": 1.5659895292786474e-05, + "loss": 0.3183, + "step": 5731 + }, + { + "epoch": 0.33, + "grad_norm": 0.4038008265047527, + "learning_rate": 1.565836103531692e-05, + "loss": 0.3016, + "step": 5732 + }, + { + "epoch": 0.33, + "grad_norm": 0.3987251335173393, + "learning_rate": 1.5656826581898563e-05, + "loss": 0.2744, + "step": 5733 + }, + { + "epoch": 0.33, + "grad_norm": 0.34573611183952435, + "learning_rate": 1.565529193258455e-05, + "loss": 0.2572, + "step": 5734 + }, + { + "epoch": 0.33, + "grad_norm": 0.392359831338795, + "learning_rate": 1.5653757087428015e-05, + "loss": 0.2155, + "step": 5735 + }, + { + "epoch": 0.33, + "grad_norm": 0.29979714182243516, + "learning_rate": 1.5652222046482118e-05, + "loss": 0.288, + "step": 5736 + }, + { + "epoch": 0.33, + "grad_norm": 0.5553731791527236, + "learning_rate": 1.5650686809800016e-05, + "loss": 0.388, + "step": 5737 + }, + { + "epoch": 0.33, + "grad_norm": 0.935348590466938, + "learning_rate": 1.564915137743487e-05, + "loss": 0.5044, + "step": 5738 + }, + { + "epoch": 0.33, + "grad_norm": 0.42901141158263, + "learning_rate": 1.5647615749439858e-05, + "loss": 0.302, + "step": 5739 + }, + { + "epoch": 0.33, + "grad_norm": 0.3289608155747018, + "learning_rate": 1.5646079925868152e-05, + "loss": 0.2922, + "step": 5740 + }, + { + "epoch": 0.33, + "grad_norm": 0.5271177700015318, + "learning_rate": 1.564454390677294e-05, + "loss": 0.2541, + "step": 5741 + }, + { + "epoch": 0.33, + "grad_norm": 0.3476763157178258, + "learning_rate": 1.5643007692207422e-05, + "loss": 0.3031, + "step": 5742 + }, + { + "epoch": 0.33, + "grad_norm": 0.4244417596608532, + "learning_rate": 1.5641471282224788e-05, + "loss": 0.2396, + "step": 5743 + }, + { + "epoch": 0.33, + "grad_norm": 0.4413347170413946, + "learning_rate": 1.563993467687824e-05, + "loss": 0.3494, + "step": 5744 + }, + { + "epoch": 0.33, + "grad_norm": 0.38877102993582696, + "learning_rate": 1.5638397876221002e-05, + "loss": 0.2112, + "step": 5745 + }, + { + "epoch": 0.33, + "grad_norm": 0.4975149211929953, + "learning_rate": 1.563686088030629e-05, + "loss": 0.4052, + "step": 5746 + }, + { + "epoch": 0.33, + "grad_norm": 0.5640690106411652, + "learning_rate": 1.5635323689187323e-05, + "loss": 0.3971, + "step": 5747 + }, + { + "epoch": 0.33, + "grad_norm": 0.2221851855752157, + "learning_rate": 1.5633786302917343e-05, + "loss": 0.1517, + "step": 5748 + }, + { + "epoch": 0.33, + "grad_norm": 0.2994501028669219, + "learning_rate": 1.5632248721549584e-05, + "loss": 0.2548, + "step": 5749 + }, + { + "epoch": 0.33, + "grad_norm": 0.8917217167943163, + "learning_rate": 1.5630710945137293e-05, + "loss": 0.6478, + "step": 5750 + }, + { + "epoch": 0.33, + "grad_norm": 0.31361359549888296, + "learning_rate": 1.5629172973733724e-05, + "loss": 0.1688, + "step": 5751 + }, + { + "epoch": 0.33, + "grad_norm": 0.35912544773740485, + "learning_rate": 1.562763480739214e-05, + "loss": 0.2893, + "step": 5752 + }, + { + "epoch": 0.33, + "grad_norm": 0.711575311545721, + "learning_rate": 1.56260964461658e-05, + "loss": 0.4265, + "step": 5753 + }, + { + "epoch": 0.33, + "grad_norm": 0.2564745105098827, + "learning_rate": 1.5624557890107983e-05, + "loss": 0.1733, + "step": 5754 + }, + { + "epoch": 0.33, + "grad_norm": 0.3070550130073271, + "learning_rate": 1.5623019139271967e-05, + "loss": 0.2762, + "step": 5755 + }, + { + "epoch": 0.33, + "grad_norm": 0.5205475343496394, + "learning_rate": 1.5621480193711046e-05, + "loss": 0.427, + "step": 5756 + }, + { + "epoch": 0.33, + "grad_norm": 0.6040346459958775, + "learning_rate": 1.56199410534785e-05, + "loss": 0.3935, + "step": 5757 + }, + { + "epoch": 0.33, + "grad_norm": 0.5393463384567777, + "learning_rate": 1.5618401718627644e-05, + "loss": 0.2879, + "step": 5758 + }, + { + "epoch": 0.33, + "grad_norm": 0.4889068289885966, + "learning_rate": 1.5616862189211774e-05, + "loss": 0.3763, + "step": 5759 + }, + { + "epoch": 0.33, + "grad_norm": 0.27513668919227896, + "learning_rate": 1.561532246528421e-05, + "loss": 0.2058, + "step": 5760 + }, + { + "epoch": 0.33, + "grad_norm": 0.2966718812667392, + "learning_rate": 1.5613782546898268e-05, + "loss": 0.2096, + "step": 5761 + }, + { + "epoch": 0.33, + "grad_norm": 0.9302097875569516, + "learning_rate": 1.561224243410728e-05, + "loss": 0.6853, + "step": 5762 + }, + { + "epoch": 0.33, + "grad_norm": 0.3034217660889585, + "learning_rate": 1.561070212696458e-05, + "loss": 0.2851, + "step": 5763 + }, + { + "epoch": 0.33, + "grad_norm": 0.3943008222386303, + "learning_rate": 1.56091616255235e-05, + "loss": 0.2914, + "step": 5764 + }, + { + "epoch": 0.33, + "grad_norm": 0.7197890030209596, + "learning_rate": 1.5607620929837398e-05, + "loss": 0.4985, + "step": 5765 + }, + { + "epoch": 0.33, + "grad_norm": 0.3846549791318257, + "learning_rate": 1.5606080039959624e-05, + "loss": 0.2175, + "step": 5766 + }, + { + "epoch": 0.33, + "grad_norm": 0.2644764393345778, + "learning_rate": 1.5604538955943542e-05, + "loss": 0.2057, + "step": 5767 + }, + { + "epoch": 0.33, + "grad_norm": 0.5406969227154103, + "learning_rate": 1.5602997677842515e-05, + "loss": 0.4251, + "step": 5768 + }, + { + "epoch": 0.33, + "grad_norm": 0.6047624630317593, + "learning_rate": 1.560145620570992e-05, + "loss": 0.4523, + "step": 5769 + }, + { + "epoch": 0.33, + "grad_norm": 0.355995131291561, + "learning_rate": 1.5599914539599135e-05, + "loss": 0.314, + "step": 5770 + }, + { + "epoch": 0.33, + "grad_norm": 0.4253729585784995, + "learning_rate": 1.559837267956355e-05, + "loss": 0.2929, + "step": 5771 + }, + { + "epoch": 0.33, + "grad_norm": 0.42743246173578286, + "learning_rate": 1.559683062565656e-05, + "loss": 0.3047, + "step": 5772 + }, + { + "epoch": 0.33, + "grad_norm": 0.2515830454917624, + "learning_rate": 1.559528837793157e-05, + "loss": 0.2118, + "step": 5773 + }, + { + "epoch": 0.33, + "grad_norm": 0.7515736027063699, + "learning_rate": 1.559374593644198e-05, + "loss": 0.5555, + "step": 5774 + }, + { + "epoch": 0.33, + "grad_norm": 0.4013581312855905, + "learning_rate": 1.559220330124121e-05, + "loss": 0.3164, + "step": 5775 + }, + { + "epoch": 0.33, + "grad_norm": 0.32182922729328917, + "learning_rate": 1.5590660472382682e-05, + "loss": 0.3081, + "step": 5776 + }, + { + "epoch": 0.33, + "grad_norm": 0.853128767784892, + "learning_rate": 1.558911744991982e-05, + "loss": 0.2577, + "step": 5777 + }, + { + "epoch": 0.33, + "grad_norm": 0.3250498927602875, + "learning_rate": 1.5587574233906063e-05, + "loss": 0.2265, + "step": 5778 + }, + { + "epoch": 0.33, + "grad_norm": 0.30334383260105524, + "learning_rate": 1.5586030824394848e-05, + "loss": 0.2567, + "step": 5779 + }, + { + "epoch": 0.33, + "grad_norm": 0.36184984465163766, + "learning_rate": 1.5584487221439628e-05, + "loss": 0.3016, + "step": 5780 + }, + { + "epoch": 0.33, + "grad_norm": 0.46282632685077807, + "learning_rate": 1.5582943425093856e-05, + "loss": 0.3936, + "step": 5781 + }, + { + "epoch": 0.33, + "grad_norm": 0.40893305897006493, + "learning_rate": 1.558139943541099e-05, + "loss": 0.3362, + "step": 5782 + }, + { + "epoch": 0.33, + "grad_norm": 0.3743712544882061, + "learning_rate": 1.5579855252444506e-05, + "loss": 0.3249, + "step": 5783 + }, + { + "epoch": 0.33, + "grad_norm": 0.46996244742050114, + "learning_rate": 1.557831087624787e-05, + "loss": 0.1574, + "step": 5784 + }, + { + "epoch": 0.33, + "grad_norm": 0.2683925413029673, + "learning_rate": 1.5576766306874572e-05, + "loss": 0.2282, + "step": 5785 + }, + { + "epoch": 0.33, + "grad_norm": 0.6926340821108188, + "learning_rate": 1.5575221544378094e-05, + "loss": 0.4933, + "step": 5786 + }, + { + "epoch": 0.33, + "grad_norm": 0.31226964320871636, + "learning_rate": 1.5573676588811935e-05, + "loss": 0.2602, + "step": 5787 + }, + { + "epoch": 0.33, + "grad_norm": 0.39932555162861216, + "learning_rate": 1.5572131440229593e-05, + "loss": 0.3255, + "step": 5788 + }, + { + "epoch": 0.33, + "grad_norm": 0.3892684727048228, + "learning_rate": 1.557058609868458e-05, + "loss": 0.2558, + "step": 5789 + }, + { + "epoch": 0.33, + "grad_norm": 0.2385948593781187, + "learning_rate": 1.5569040564230414e-05, + "loss": 0.1092, + "step": 5790 + }, + { + "epoch": 0.33, + "grad_norm": 0.27404269814150173, + "learning_rate": 1.556749483692061e-05, + "loss": 0.2733, + "step": 5791 + }, + { + "epoch": 0.33, + "grad_norm": 0.8492486232920454, + "learning_rate": 1.5565948916808697e-05, + "loss": 0.5681, + "step": 5792 + }, + { + "epoch": 0.33, + "grad_norm": 0.6782578354702179, + "learning_rate": 1.5564402803948215e-05, + "loss": 0.2971, + "step": 5793 + }, + { + "epoch": 0.33, + "grad_norm": 0.3093663841738642, + "learning_rate": 1.55628564983927e-05, + "loss": 0.2356, + "step": 5794 + }, + { + "epoch": 0.33, + "grad_norm": 0.3650624325489127, + "learning_rate": 1.556131000019571e-05, + "loss": 0.3337, + "step": 5795 + }, + { + "epoch": 0.33, + "grad_norm": 0.760194714593758, + "learning_rate": 1.5559763309410787e-05, + "loss": 0.4862, + "step": 5796 + }, + { + "epoch": 0.33, + "grad_norm": 0.3294635430927283, + "learning_rate": 1.5558216426091505e-05, + "loss": 0.2005, + "step": 5797 + }, + { + "epoch": 0.33, + "grad_norm": 1.077418090885813, + "learning_rate": 1.5556669350291422e-05, + "loss": 0.4722, + "step": 5798 + }, + { + "epoch": 0.33, + "grad_norm": 0.3504419610826631, + "learning_rate": 1.5555122082064123e-05, + "loss": 0.323, + "step": 5799 + }, + { + "epoch": 0.33, + "grad_norm": 0.3475869790586581, + "learning_rate": 1.5553574621463183e-05, + "loss": 0.2275, + "step": 5800 + }, + { + "epoch": 0.33, + "grad_norm": 0.3020170894774215, + "learning_rate": 1.5552026968542192e-05, + "loss": 0.224, + "step": 5801 + }, + { + "epoch": 0.33, + "grad_norm": 1.697556497115101, + "learning_rate": 1.555047912335475e-05, + "loss": 0.6149, + "step": 5802 + }, + { + "epoch": 0.33, + "grad_norm": 0.29416917374336826, + "learning_rate": 1.5548931085954448e-05, + "loss": 0.2219, + "step": 5803 + }, + { + "epoch": 0.33, + "grad_norm": 0.6232842819271314, + "learning_rate": 1.5547382856394905e-05, + "loss": 0.3939, + "step": 5804 + }, + { + "epoch": 0.33, + "grad_norm": 1.1892346356469627, + "learning_rate": 1.5545834434729732e-05, + "loss": 0.8569, + "step": 5805 + }, + { + "epoch": 0.33, + "grad_norm": 0.3967731998369141, + "learning_rate": 1.554428582101255e-05, + "loss": 0.2805, + "step": 5806 + }, + { + "epoch": 0.33, + "grad_norm": 0.34087023673325834, + "learning_rate": 1.554273701529699e-05, + "loss": 0.1824, + "step": 5807 + }, + { + "epoch": 0.33, + "grad_norm": 1.374281261563989, + "learning_rate": 1.5541188017636683e-05, + "loss": 0.678, + "step": 5808 + }, + { + "epoch": 0.33, + "grad_norm": 0.3489684607123253, + "learning_rate": 1.5539638828085278e-05, + "loss": 0.2842, + "step": 5809 + }, + { + "epoch": 0.33, + "grad_norm": 0.6789014020469017, + "learning_rate": 1.5538089446696414e-05, + "loss": 0.3743, + "step": 5810 + }, + { + "epoch": 0.33, + "grad_norm": 0.4519718080022967, + "learning_rate": 1.553653987352375e-05, + "loss": 0.3394, + "step": 5811 + }, + { + "epoch": 0.33, + "grad_norm": 0.4552790837774947, + "learning_rate": 1.553499010862095e-05, + "loss": 0.299, + "step": 5812 + }, + { + "epoch": 0.33, + "grad_norm": 0.4965276597889446, + "learning_rate": 1.553344015204168e-05, + "loss": 0.1166, + "step": 5813 + }, + { + "epoch": 0.33, + "grad_norm": 0.5101673833679304, + "learning_rate": 1.553189000383962e-05, + "loss": 0.394, + "step": 5814 + }, + { + "epoch": 0.33, + "grad_norm": 0.42668233072512435, + "learning_rate": 1.553033966406844e-05, + "loss": 0.2904, + "step": 5815 + }, + { + "epoch": 0.33, + "grad_norm": 0.3709815294186113, + "learning_rate": 1.552878913278184e-05, + "loss": 0.3063, + "step": 5816 + }, + { + "epoch": 0.33, + "grad_norm": 0.7836478889317833, + "learning_rate": 1.5527238410033508e-05, + "loss": 0.4855, + "step": 5817 + }, + { + "epoch": 0.33, + "grad_norm": 0.43253924637788094, + "learning_rate": 1.552568749587715e-05, + "loss": 0.2718, + "step": 5818 + }, + { + "epoch": 0.33, + "grad_norm": 0.287688911198083, + "learning_rate": 1.5524136390366468e-05, + "loss": 0.2535, + "step": 5819 + }, + { + "epoch": 0.33, + "grad_norm": 0.723004816360338, + "learning_rate": 1.5522585093555184e-05, + "loss": 0.3463, + "step": 5820 + }, + { + "epoch": 0.33, + "grad_norm": 0.3808008491185971, + "learning_rate": 1.5521033605497013e-05, + "loss": 0.2936, + "step": 5821 + }, + { + "epoch": 0.33, + "grad_norm": 0.8200187852102342, + "learning_rate": 1.5519481926245687e-05, + "loss": 0.5662, + "step": 5822 + }, + { + "epoch": 0.33, + "grad_norm": 0.36108551149142204, + "learning_rate": 1.551793005585494e-05, + "loss": 0.2783, + "step": 5823 + }, + { + "epoch": 0.33, + "grad_norm": 0.3836885248758884, + "learning_rate": 1.5516377994378513e-05, + "loss": 0.288, + "step": 5824 + }, + { + "epoch": 0.33, + "grad_norm": 0.32215157055186644, + "learning_rate": 1.551482574187015e-05, + "loss": 0.234, + "step": 5825 + }, + { + "epoch": 0.33, + "grad_norm": 0.4798535959282858, + "learning_rate": 1.5513273298383607e-05, + "loss": 0.3197, + "step": 5826 + }, + { + "epoch": 0.33, + "grad_norm": 0.32604749167703956, + "learning_rate": 1.551172066397265e-05, + "loss": 0.2737, + "step": 5827 + }, + { + "epoch": 0.33, + "grad_norm": 0.9180829156266932, + "learning_rate": 1.5510167838691047e-05, + "loss": 0.5507, + "step": 5828 + }, + { + "epoch": 0.33, + "grad_norm": 1.2475558469396781, + "learning_rate": 1.550861482259256e-05, + "loss": 0.5742, + "step": 5829 + }, + { + "epoch": 0.33, + "grad_norm": 0.37812167267697283, + "learning_rate": 1.5507061615730986e-05, + "loss": 0.2834, + "step": 5830 + }, + { + "epoch": 0.34, + "grad_norm": 0.31942624860729857, + "learning_rate": 1.5505508218160103e-05, + "loss": 0.2459, + "step": 5831 + }, + { + "epoch": 0.34, + "grad_norm": 0.5164865863370692, + "learning_rate": 1.5503954629933707e-05, + "loss": 0.3329, + "step": 5832 + }, + { + "epoch": 0.34, + "grad_norm": 0.4604448154652938, + "learning_rate": 1.5502400851105603e-05, + "loss": 0.1969, + "step": 5833 + }, + { + "epoch": 0.34, + "grad_norm": 0.57149449313321, + "learning_rate": 1.5500846881729587e-05, + "loss": 0.4065, + "step": 5834 + }, + { + "epoch": 0.34, + "grad_norm": 0.4424655995824884, + "learning_rate": 1.5499292721859483e-05, + "loss": 0.3512, + "step": 5835 + }, + { + "epoch": 0.34, + "grad_norm": 0.3409734791754797, + "learning_rate": 1.5497738371549108e-05, + "loss": 0.2024, + "step": 5836 + }, + { + "epoch": 0.34, + "grad_norm": 0.44336289462689166, + "learning_rate": 1.549618383085229e-05, + "loss": 0.3014, + "step": 5837 + }, + { + "epoch": 0.34, + "grad_norm": 0.3706424088281798, + "learning_rate": 1.549462909982286e-05, + "loss": 0.2969, + "step": 5838 + }, + { + "epoch": 0.34, + "grad_norm": 0.3273551180286123, + "learning_rate": 1.5493074178514665e-05, + "loss": 0.2201, + "step": 5839 + }, + { + "epoch": 0.34, + "grad_norm": 1.4602400150554773, + "learning_rate": 1.5491519066981547e-05, + "loss": 0.7239, + "step": 5840 + }, + { + "epoch": 0.34, + "grad_norm": 1.2934303316229814, + "learning_rate": 1.5489963765277356e-05, + "loss": 0.8344, + "step": 5841 + }, + { + "epoch": 0.34, + "grad_norm": 0.37170829852311305, + "learning_rate": 1.548840827345596e-05, + "loss": 0.2075, + "step": 5842 + }, + { + "epoch": 0.34, + "grad_norm": 0.3690170784055627, + "learning_rate": 1.5486852591571217e-05, + "loss": 0.3312, + "step": 5843 + }, + { + "epoch": 0.34, + "grad_norm": 0.42868695087431713, + "learning_rate": 1.5485296719677005e-05, + "loss": 0.2912, + "step": 5844 + }, + { + "epoch": 0.34, + "grad_norm": 0.32516213319606574, + "learning_rate": 1.5483740657827205e-05, + "loss": 0.2423, + "step": 5845 + }, + { + "epoch": 0.34, + "grad_norm": 0.8624921342883979, + "learning_rate": 1.5482184406075705e-05, + "loss": 0.3653, + "step": 5846 + }, + { + "epoch": 0.34, + "grad_norm": 0.43311550871258736, + "learning_rate": 1.5480627964476392e-05, + "loss": 0.3782, + "step": 5847 + }, + { + "epoch": 0.34, + "grad_norm": 0.3466225539727474, + "learning_rate": 1.547907133308317e-05, + "loss": 0.2827, + "step": 5848 + }, + { + "epoch": 0.34, + "grad_norm": 0.6526142864376929, + "learning_rate": 1.547751451194994e-05, + "loss": 0.3642, + "step": 5849 + }, + { + "epoch": 0.34, + "grad_norm": 0.3194788764576168, + "learning_rate": 1.5475957501130622e-05, + "loss": 0.3314, + "step": 5850 + }, + { + "epoch": 0.34, + "grad_norm": 0.33247814679611337, + "learning_rate": 1.5474400300679128e-05, + "loss": 0.2097, + "step": 5851 + }, + { + "epoch": 0.34, + "grad_norm": 0.34086337965321845, + "learning_rate": 1.5472842910649387e-05, + "loss": 0.2012, + "step": 5852 + }, + { + "epoch": 0.34, + "grad_norm": 0.8793044262444761, + "learning_rate": 1.5471285331095334e-05, + "loss": 0.6676, + "step": 5853 + }, + { + "epoch": 0.34, + "grad_norm": 0.5278240555166073, + "learning_rate": 1.5469727562070904e-05, + "loss": 0.3511, + "step": 5854 + }, + { + "epoch": 0.34, + "grad_norm": 0.3723179875730455, + "learning_rate": 1.5468169603630045e-05, + "loss": 0.2741, + "step": 5855 + }, + { + "epoch": 0.34, + "grad_norm": 0.4558277232143511, + "learning_rate": 1.5466611455826703e-05, + "loss": 0.2971, + "step": 5856 + }, + { + "epoch": 0.34, + "grad_norm": 0.3657369354981838, + "learning_rate": 1.5465053118714846e-05, + "loss": 0.2267, + "step": 5857 + }, + { + "epoch": 0.34, + "grad_norm": 0.40254068196433923, + "learning_rate": 1.5463494592348435e-05, + "loss": 0.3381, + "step": 5858 + }, + { + "epoch": 0.34, + "grad_norm": 0.4389789142234693, + "learning_rate": 1.5461935876781436e-05, + "loss": 0.3111, + "step": 5859 + }, + { + "epoch": 0.34, + "grad_norm": 0.42792874304101514, + "learning_rate": 1.5460376972067837e-05, + "loss": 0.3259, + "step": 5860 + }, + { + "epoch": 0.34, + "grad_norm": 0.5828730662973869, + "learning_rate": 1.5458817878261617e-05, + "loss": 0.3779, + "step": 5861 + }, + { + "epoch": 0.34, + "grad_norm": 0.35919316933562373, + "learning_rate": 1.5457258595416766e-05, + "loss": 0.2507, + "step": 5862 + }, + { + "epoch": 0.34, + "grad_norm": 0.33609490374293677, + "learning_rate": 1.5455699123587286e-05, + "loss": 0.225, + "step": 5863 + }, + { + "epoch": 0.34, + "grad_norm": 0.34888688858881917, + "learning_rate": 1.5454139462827183e-05, + "loss": 0.2434, + "step": 5864 + }, + { + "epoch": 0.34, + "grad_norm": 0.743990986427087, + "learning_rate": 1.5452579613190462e-05, + "loss": 0.4317, + "step": 5865 + }, + { + "epoch": 0.34, + "grad_norm": 0.3496844176665509, + "learning_rate": 1.5451019574731147e-05, + "loss": 0.3032, + "step": 5866 + }, + { + "epoch": 0.34, + "grad_norm": 0.39178507409648256, + "learning_rate": 1.5449459347503255e-05, + "loss": 0.3358, + "step": 5867 + }, + { + "epoch": 0.34, + "grad_norm": 0.30121681347934715, + "learning_rate": 1.5447898931560824e-05, + "loss": 0.2011, + "step": 5868 + }, + { + "epoch": 0.34, + "grad_norm": 0.2932285924621865, + "learning_rate": 1.544633832695788e-05, + "loss": 0.205, + "step": 5869 + }, + { + "epoch": 0.34, + "grad_norm": 0.3773063886258503, + "learning_rate": 1.544477753374848e-05, + "loss": 0.3348, + "step": 5870 + }, + { + "epoch": 0.34, + "grad_norm": 0.4860936216259758, + "learning_rate": 1.5443216551986667e-05, + "loss": 0.4027, + "step": 5871 + }, + { + "epoch": 0.34, + "grad_norm": 0.5365140122310895, + "learning_rate": 1.5441655381726496e-05, + "loss": 0.2675, + "step": 5872 + }, + { + "epoch": 0.34, + "grad_norm": 0.5126169170705192, + "learning_rate": 1.5440094023022035e-05, + "loss": 0.347, + "step": 5873 + }, + { + "epoch": 0.34, + "grad_norm": 0.3880198754870328, + "learning_rate": 1.5438532475927354e-05, + "loss": 0.3277, + "step": 5874 + }, + { + "epoch": 0.34, + "grad_norm": 0.3398774666350326, + "learning_rate": 1.5436970740496527e-05, + "loss": 0.1709, + "step": 5875 + }, + { + "epoch": 0.34, + "grad_norm": 0.2934353428437412, + "learning_rate": 1.5435408816783635e-05, + "loss": 0.2122, + "step": 5876 + }, + { + "epoch": 0.34, + "grad_norm": 1.1948470060235745, + "learning_rate": 1.543384670484277e-05, + "loss": 0.5015, + "step": 5877 + }, + { + "epoch": 0.34, + "grad_norm": 0.37606014000496846, + "learning_rate": 1.5432284404728027e-05, + "loss": 0.256, + "step": 5878 + }, + { + "epoch": 0.34, + "grad_norm": 0.4058687646461349, + "learning_rate": 1.5430721916493507e-05, + "loss": 0.3533, + "step": 5879 + }, + { + "epoch": 0.34, + "grad_norm": 0.8713355870009596, + "learning_rate": 1.542915924019332e-05, + "loss": 0.6042, + "step": 5880 + }, + { + "epoch": 0.34, + "grad_norm": 0.2912589294582795, + "learning_rate": 1.5427596375881587e-05, + "loss": 0.1711, + "step": 5881 + }, + { + "epoch": 0.34, + "grad_norm": 0.6110141021008527, + "learning_rate": 1.5426033323612425e-05, + "loss": 0.3838, + "step": 5882 + }, + { + "epoch": 0.34, + "grad_norm": 0.8743221394886532, + "learning_rate": 1.5424470083439958e-05, + "loss": 0.3455, + "step": 5883 + }, + { + "epoch": 0.34, + "grad_norm": 0.6854045258290827, + "learning_rate": 1.5422906655418327e-05, + "loss": 0.4075, + "step": 5884 + }, + { + "epoch": 0.34, + "grad_norm": 0.28645214108364625, + "learning_rate": 1.5421343039601672e-05, + "loss": 0.185, + "step": 5885 + }, + { + "epoch": 0.34, + "grad_norm": 0.39523662011429733, + "learning_rate": 1.5419779236044142e-05, + "loss": 0.3153, + "step": 5886 + }, + { + "epoch": 0.34, + "grad_norm": 0.6709944581548378, + "learning_rate": 1.541821524479989e-05, + "loss": 0.4063, + "step": 5887 + }, + { + "epoch": 0.34, + "grad_norm": 0.3576383977318532, + "learning_rate": 1.541665106592307e-05, + "loss": 0.2235, + "step": 5888 + }, + { + "epoch": 0.34, + "grad_norm": 1.0212539106154426, + "learning_rate": 1.5415086699467864e-05, + "loss": 0.4622, + "step": 5889 + }, + { + "epoch": 0.34, + "grad_norm": 0.39618828216583873, + "learning_rate": 1.5413522145488437e-05, + "loss": 0.3115, + "step": 5890 + }, + { + "epoch": 0.34, + "grad_norm": 0.27461183094712965, + "learning_rate": 1.541195740403897e-05, + "loss": 0.1857, + "step": 5891 + }, + { + "epoch": 0.34, + "grad_norm": 0.9085111519204856, + "learning_rate": 1.541039247517365e-05, + "loss": 0.5692, + "step": 5892 + }, + { + "epoch": 0.34, + "grad_norm": 0.8533930219299408, + "learning_rate": 1.5408827358946675e-05, + "loss": 0.5139, + "step": 5893 + }, + { + "epoch": 0.34, + "grad_norm": 0.4079774306429693, + "learning_rate": 1.5407262055412238e-05, + "loss": 0.2896, + "step": 5894 + }, + { + "epoch": 0.34, + "grad_norm": 0.7506257968661608, + "learning_rate": 1.540569656462455e-05, + "loss": 0.3278, + "step": 5895 + }, + { + "epoch": 0.34, + "grad_norm": 0.6944121449841651, + "learning_rate": 1.5404130886637822e-05, + "loss": 0.4847, + "step": 5896 + }, + { + "epoch": 0.34, + "grad_norm": 0.289549678372288, + "learning_rate": 1.5402565021506273e-05, + "loss": 0.2154, + "step": 5897 + }, + { + "epoch": 0.34, + "grad_norm": 0.3543644317502926, + "learning_rate": 1.540099896928413e-05, + "loss": 0.237, + "step": 5898 + }, + { + "epoch": 0.34, + "grad_norm": 0.5313370994029551, + "learning_rate": 1.5399432730025626e-05, + "loss": 0.3474, + "step": 5899 + }, + { + "epoch": 0.34, + "grad_norm": 0.3858996506274038, + "learning_rate": 1.5397866303784996e-05, + "loss": 0.3342, + "step": 5900 + }, + { + "epoch": 0.34, + "grad_norm": 0.8419789834381768, + "learning_rate": 1.539629969061649e-05, + "loss": 0.428, + "step": 5901 + }, + { + "epoch": 0.34, + "grad_norm": 0.32299250679010993, + "learning_rate": 1.539473289057436e-05, + "loss": 0.3036, + "step": 5902 + }, + { + "epoch": 0.34, + "grad_norm": 0.27648283554967706, + "learning_rate": 1.5393165903712856e-05, + "loss": 0.2395, + "step": 5903 + }, + { + "epoch": 0.34, + "grad_norm": 0.2843375577344682, + "learning_rate": 1.5391598730086254e-05, + "loss": 0.1435, + "step": 5904 + }, + { + "epoch": 0.34, + "grad_norm": 0.5555925119853822, + "learning_rate": 1.539003136974882e-05, + "loss": 0.3774, + "step": 5905 + }, + { + "epoch": 0.34, + "grad_norm": 0.3144089559292263, + "learning_rate": 1.5388463822754827e-05, + "loss": 0.2653, + "step": 5906 + }, + { + "epoch": 0.34, + "grad_norm": 0.4289801488875986, + "learning_rate": 1.538689608915857e-05, + "loss": 0.3952, + "step": 5907 + }, + { + "epoch": 0.34, + "grad_norm": 0.4080185198275796, + "learning_rate": 1.5385328169014325e-05, + "loss": 0.2654, + "step": 5908 + }, + { + "epoch": 0.34, + "grad_norm": 0.23617679924423607, + "learning_rate": 1.53837600623764e-05, + "loss": 0.1897, + "step": 5909 + }, + { + "epoch": 0.34, + "grad_norm": 0.3640319175940855, + "learning_rate": 1.5382191769299096e-05, + "loss": 0.3338, + "step": 5910 + }, + { + "epoch": 0.34, + "grad_norm": 0.5949894824883931, + "learning_rate": 1.5380623289836724e-05, + "loss": 0.2869, + "step": 5911 + }, + { + "epoch": 0.34, + "grad_norm": 0.33508321042850364, + "learning_rate": 1.5379054624043596e-05, + "loss": 0.3238, + "step": 5912 + }, + { + "epoch": 0.34, + "grad_norm": 1.042458284916835, + "learning_rate": 1.537748577197404e-05, + "loss": 0.7016, + "step": 5913 + }, + { + "epoch": 0.34, + "grad_norm": 0.28987327834740845, + "learning_rate": 1.537591673368238e-05, + "loss": 0.2563, + "step": 5914 + }, + { + "epoch": 0.34, + "grad_norm": 0.25989501170956886, + "learning_rate": 1.5374347509222962e-05, + "loss": 0.2252, + "step": 5915 + }, + { + "epoch": 0.34, + "grad_norm": 0.4262012511260465, + "learning_rate": 1.5372778098650115e-05, + "loss": 0.2876, + "step": 5916 + }, + { + "epoch": 0.34, + "grad_norm": 0.6347305763733735, + "learning_rate": 1.5371208502018194e-05, + "loss": 0.3217, + "step": 5917 + }, + { + "epoch": 0.34, + "grad_norm": 0.312424850972814, + "learning_rate": 1.5369638719381555e-05, + "loss": 0.2914, + "step": 5918 + }, + { + "epoch": 0.34, + "grad_norm": 1.4014464340135386, + "learning_rate": 1.5368068750794557e-05, + "loss": 0.7689, + "step": 5919 + }, + { + "epoch": 0.34, + "grad_norm": 0.6480541705283047, + "learning_rate": 1.5366498596311568e-05, + "loss": 0.4657, + "step": 5920 + }, + { + "epoch": 0.34, + "grad_norm": 0.2557476620889882, + "learning_rate": 1.5364928255986966e-05, + "loss": 0.1722, + "step": 5921 + }, + { + "epoch": 0.34, + "grad_norm": 0.3457499150185082, + "learning_rate": 1.5363357729875126e-05, + "loss": 0.2854, + "step": 5922 + }, + { + "epoch": 0.34, + "grad_norm": 0.7202941326070144, + "learning_rate": 1.536178701803044e-05, + "loss": 0.392, + "step": 5923 + }, + { + "epoch": 0.34, + "grad_norm": 0.34428868696575277, + "learning_rate": 1.5360216120507302e-05, + "loss": 0.2324, + "step": 5924 + }, + { + "epoch": 0.34, + "grad_norm": 0.5147196984487438, + "learning_rate": 1.5358645037360108e-05, + "loss": 0.4216, + "step": 5925 + }, + { + "epoch": 0.34, + "grad_norm": 0.537087396199377, + "learning_rate": 1.535707376864327e-05, + "loss": 0.377, + "step": 5926 + }, + { + "epoch": 0.34, + "grad_norm": 0.34006389687556765, + "learning_rate": 1.5355502314411194e-05, + "loss": 0.1906, + "step": 5927 + }, + { + "epoch": 0.34, + "grad_norm": 0.3373315399765145, + "learning_rate": 1.5353930674718305e-05, + "loss": 0.2455, + "step": 5928 + }, + { + "epoch": 0.34, + "grad_norm": 0.46153518536615695, + "learning_rate": 1.5352358849619024e-05, + "loss": 0.3767, + "step": 5929 + }, + { + "epoch": 0.34, + "grad_norm": 0.3368768244126122, + "learning_rate": 1.535078683916779e-05, + "loss": 0.2229, + "step": 5930 + }, + { + "epoch": 0.34, + "grad_norm": 1.1003227430828704, + "learning_rate": 1.5349214643419034e-05, + "loss": 0.6771, + "step": 5931 + }, + { + "epoch": 0.34, + "grad_norm": 1.4907212686940303, + "learning_rate": 1.5347642262427206e-05, + "loss": 0.8189, + "step": 5932 + }, + { + "epoch": 0.34, + "grad_norm": 0.3184937333207399, + "learning_rate": 1.5346069696246758e-05, + "loss": 0.2215, + "step": 5933 + }, + { + "epoch": 0.34, + "grad_norm": 0.41845407671672097, + "learning_rate": 1.534449694493215e-05, + "loss": 0.2991, + "step": 5934 + }, + { + "epoch": 0.34, + "grad_norm": 0.4774275005408674, + "learning_rate": 1.534292400853784e-05, + "loss": 0.3517, + "step": 5935 + }, + { + "epoch": 0.34, + "grad_norm": 0.379325144992996, + "learning_rate": 1.53413508871183e-05, + "loss": 0.2905, + "step": 5936 + }, + { + "epoch": 0.34, + "grad_norm": 0.563366841637593, + "learning_rate": 1.5339777580728003e-05, + "loss": 0.3085, + "step": 5937 + }, + { + "epoch": 0.34, + "grad_norm": 0.463202999933601, + "learning_rate": 1.5338204089421447e-05, + "loss": 0.3625, + "step": 5938 + }, + { + "epoch": 0.34, + "grad_norm": 0.3695255382677873, + "learning_rate": 1.5336630413253108e-05, + "loss": 0.2843, + "step": 5939 + }, + { + "epoch": 0.34, + "grad_norm": 0.6537639861062193, + "learning_rate": 1.533505655227749e-05, + "loss": 0.3645, + "step": 5940 + }, + { + "epoch": 0.34, + "grad_norm": 0.255986281866402, + "learning_rate": 1.533348250654909e-05, + "loss": 0.2218, + "step": 5941 + }, + { + "epoch": 0.34, + "grad_norm": 0.40879555334744677, + "learning_rate": 1.5331908276122424e-05, + "loss": 0.3137, + "step": 5942 + }, + { + "epoch": 0.34, + "grad_norm": 0.9387421903416497, + "learning_rate": 1.5330333861051998e-05, + "loss": 0.5082, + "step": 5943 + }, + { + "epoch": 0.34, + "grad_norm": 1.137603306784779, + "learning_rate": 1.5328759261392344e-05, + "loss": 0.5119, + "step": 5944 + }, + { + "epoch": 0.34, + "grad_norm": 0.3617661594873792, + "learning_rate": 1.5327184477197984e-05, + "loss": 0.301, + "step": 5945 + }, + { + "epoch": 0.34, + "grad_norm": 0.443054484749188, + "learning_rate": 1.5325609508523456e-05, + "loss": 0.3465, + "step": 5946 + }, + { + "epoch": 0.34, + "grad_norm": 0.2663146536046802, + "learning_rate": 1.53240343554233e-05, + "loss": 0.1144, + "step": 5947 + }, + { + "epoch": 0.34, + "grad_norm": 0.38139187527879836, + "learning_rate": 1.532245901795206e-05, + "loss": 0.2786, + "step": 5948 + }, + { + "epoch": 0.34, + "grad_norm": 0.5305598314897465, + "learning_rate": 1.5320883496164295e-05, + "loss": 0.3807, + "step": 5949 + }, + { + "epoch": 0.34, + "grad_norm": 0.4413082084005958, + "learning_rate": 1.5319307790114563e-05, + "loss": 0.3002, + "step": 5950 + }, + { + "epoch": 0.34, + "grad_norm": 0.3583027015117161, + "learning_rate": 1.5317731899857434e-05, + "loss": 0.3078, + "step": 5951 + }, + { + "epoch": 0.34, + "grad_norm": 0.7092834562172043, + "learning_rate": 1.5316155825447476e-05, + "loss": 0.4817, + "step": 5952 + }, + { + "epoch": 0.34, + "grad_norm": 0.2955321017741597, + "learning_rate": 1.5314579566939274e-05, + "loss": 0.1988, + "step": 5953 + }, + { + "epoch": 0.34, + "grad_norm": 0.37715745593890787, + "learning_rate": 1.5313003124387404e-05, + "loss": 0.2891, + "step": 5954 + }, + { + "epoch": 0.34, + "grad_norm": 1.3060160054166214, + "learning_rate": 1.5311426497846466e-05, + "loss": 0.7819, + "step": 5955 + }, + { + "epoch": 0.34, + "grad_norm": 0.7017722643430202, + "learning_rate": 1.530984968737106e-05, + "loss": 0.3755, + "step": 5956 + }, + { + "epoch": 0.34, + "grad_norm": 0.44304273216374057, + "learning_rate": 1.5308272693015785e-05, + "loss": 0.2946, + "step": 5957 + }, + { + "epoch": 0.34, + "grad_norm": 0.3809476824053261, + "learning_rate": 1.530669551483525e-05, + "loss": 0.2984, + "step": 5958 + }, + { + "epoch": 0.34, + "grad_norm": 0.32731689524967394, + "learning_rate": 1.5305118152884086e-05, + "loss": 0.2025, + "step": 5959 + }, + { + "epoch": 0.34, + "grad_norm": 0.32662163125723054, + "learning_rate": 1.5303540607216906e-05, + "loss": 0.2074, + "step": 5960 + }, + { + "epoch": 0.34, + "grad_norm": 0.6125753143165558, + "learning_rate": 1.5301962877888338e-05, + "loss": 0.4477, + "step": 5961 + }, + { + "epoch": 0.34, + "grad_norm": 0.7210324834024456, + "learning_rate": 1.5300384964953028e-05, + "loss": 0.3894, + "step": 5962 + }, + { + "epoch": 0.34, + "grad_norm": 0.3512008862927995, + "learning_rate": 1.5298806868465615e-05, + "loss": 0.2395, + "step": 5963 + }, + { + "epoch": 0.34, + "grad_norm": 1.203964316197915, + "learning_rate": 1.5297228588480744e-05, + "loss": 0.6873, + "step": 5964 + }, + { + "epoch": 0.34, + "grad_norm": 0.4470807698277849, + "learning_rate": 1.5295650125053078e-05, + "loss": 0.344, + "step": 5965 + }, + { + "epoch": 0.34, + "grad_norm": 0.27900170832730514, + "learning_rate": 1.529407147823728e-05, + "loss": 0.1535, + "step": 5966 + }, + { + "epoch": 0.34, + "grad_norm": 1.2564959101088011, + "learning_rate": 1.529249264808801e-05, + "loss": 0.9462, + "step": 5967 + }, + { + "epoch": 0.34, + "grad_norm": 0.8273397489953469, + "learning_rate": 1.5290913634659946e-05, + "loss": 0.4594, + "step": 5968 + }, + { + "epoch": 0.34, + "grad_norm": 0.38156537104710225, + "learning_rate": 1.528933443800777e-05, + "loss": 0.2717, + "step": 5969 + }, + { + "epoch": 0.34, + "grad_norm": 0.40525374010366605, + "learning_rate": 1.5287755058186173e-05, + "loss": 0.2764, + "step": 5970 + }, + { + "epoch": 0.34, + "grad_norm": 0.44559989652126264, + "learning_rate": 1.5286175495249845e-05, + "loss": 0.2679, + "step": 5971 + }, + { + "epoch": 0.34, + "grad_norm": 0.3800803496165543, + "learning_rate": 1.5284595749253486e-05, + "loss": 0.2887, + "step": 5972 + }, + { + "epoch": 0.34, + "grad_norm": 0.5854777998684502, + "learning_rate": 1.5283015820251802e-05, + "loss": 0.3128, + "step": 5973 + }, + { + "epoch": 0.34, + "grad_norm": 0.6490823546446826, + "learning_rate": 1.528143570829951e-05, + "loss": 0.3814, + "step": 5974 + }, + { + "epoch": 0.34, + "grad_norm": 0.37018842276377245, + "learning_rate": 1.5279855413451323e-05, + "loss": 0.223, + "step": 5975 + }, + { + "epoch": 0.34, + "grad_norm": 0.4312382176580631, + "learning_rate": 1.527827493576197e-05, + "loss": 0.284, + "step": 5976 + }, + { + "epoch": 0.34, + "grad_norm": 0.40708070399001484, + "learning_rate": 1.5276694275286188e-05, + "loss": 0.3473, + "step": 5977 + }, + { + "epoch": 0.34, + "grad_norm": 0.5786187802356816, + "learning_rate": 1.5275113432078707e-05, + "loss": 0.3675, + "step": 5978 + }, + { + "epoch": 0.34, + "grad_norm": 0.5832872436341768, + "learning_rate": 1.5273532406194273e-05, + "loss": 0.3081, + "step": 5979 + }, + { + "epoch": 0.34, + "grad_norm": 0.48552801852112154, + "learning_rate": 1.5271951197687642e-05, + "loss": 0.3343, + "step": 5980 + }, + { + "epoch": 0.34, + "grad_norm": 0.3366837647895135, + "learning_rate": 1.5270369806613566e-05, + "loss": 0.2745, + "step": 5981 + }, + { + "epoch": 0.34, + "grad_norm": 0.30439148010092015, + "learning_rate": 1.5268788233026813e-05, + "loss": 0.2576, + "step": 5982 + }, + { + "epoch": 0.34, + "grad_norm": 0.8410593472550371, + "learning_rate": 1.5267206476982143e-05, + "loss": 0.2501, + "step": 5983 + }, + { + "epoch": 0.34, + "grad_norm": 0.41983616658716516, + "learning_rate": 1.5265624538534346e-05, + "loss": 0.335, + "step": 5984 + }, + { + "epoch": 0.34, + "grad_norm": 0.29918174520908947, + "learning_rate": 1.5264042417738198e-05, + "loss": 0.3013, + "step": 5985 + }, + { + "epoch": 0.34, + "grad_norm": 0.9234263832577031, + "learning_rate": 1.5262460114648487e-05, + "loss": 0.4014, + "step": 5986 + }, + { + "epoch": 0.34, + "grad_norm": 0.23127103984282238, + "learning_rate": 1.5260877629320003e-05, + "loss": 0.1819, + "step": 5987 + }, + { + "epoch": 0.34, + "grad_norm": 0.6068087085849305, + "learning_rate": 1.5259294961807557e-05, + "loss": 0.3792, + "step": 5988 + }, + { + "epoch": 0.34, + "grad_norm": 0.4333690325500259, + "learning_rate": 1.5257712112165952e-05, + "loss": 0.2803, + "step": 5989 + }, + { + "epoch": 0.34, + "grad_norm": 0.3537720037573216, + "learning_rate": 1.5256129080450004e-05, + "loss": 0.2856, + "step": 5990 + }, + { + "epoch": 0.34, + "grad_norm": 1.1232073792582922, + "learning_rate": 1.5254545866714531e-05, + "loss": 0.5189, + "step": 5991 + }, + { + "epoch": 0.34, + "grad_norm": 0.4744517484846774, + "learning_rate": 1.5252962471014358e-05, + "loss": 0.3488, + "step": 5992 + }, + { + "epoch": 0.34, + "grad_norm": 0.3529505990489683, + "learning_rate": 1.5251378893404324e-05, + "loss": 0.2746, + "step": 5993 + }, + { + "epoch": 0.34, + "grad_norm": 0.2696964638067003, + "learning_rate": 1.5249795133939262e-05, + "loss": 0.2123, + "step": 5994 + }, + { + "epoch": 0.34, + "grad_norm": 0.98685682080623, + "learning_rate": 1.5248211192674023e-05, + "loss": 0.5288, + "step": 5995 + }, + { + "epoch": 0.34, + "grad_norm": 0.6945777224570654, + "learning_rate": 1.5246627069663453e-05, + "loss": 0.2634, + "step": 5996 + }, + { + "epoch": 0.34, + "grad_norm": 0.3390663328402181, + "learning_rate": 1.5245042764962416e-05, + "loss": 0.3044, + "step": 5997 + }, + { + "epoch": 0.34, + "grad_norm": 1.1668116852346306, + "learning_rate": 1.5243458278625771e-05, + "loss": 0.8487, + "step": 5998 + }, + { + "epoch": 0.34, + "grad_norm": 0.20443862509215088, + "learning_rate": 1.5241873610708395e-05, + "loss": 0.0946, + "step": 5999 + }, + { + "epoch": 0.34, + "grad_norm": 0.4356866107324079, + "learning_rate": 1.5240288761265158e-05, + "loss": 0.3402, + "step": 6000 + }, + { + "epoch": 0.34, + "grad_norm": 0.36544328154024314, + "learning_rate": 1.523870373035095e-05, + "loss": 0.3262, + "step": 6001 + }, + { + "epoch": 0.34, + "grad_norm": 0.684350221463673, + "learning_rate": 1.5237118518020656e-05, + "loss": 0.2627, + "step": 6002 + }, + { + "epoch": 0.34, + "grad_norm": 0.7652539514029776, + "learning_rate": 1.5235533124329172e-05, + "loss": 0.4287, + "step": 6003 + }, + { + "epoch": 0.34, + "grad_norm": 1.4259890730487947, + "learning_rate": 1.5233947549331399e-05, + "loss": 0.8299, + "step": 6004 + }, + { + "epoch": 0.35, + "grad_norm": 0.2552789675583203, + "learning_rate": 1.5232361793082251e-05, + "loss": 0.2188, + "step": 6005 + }, + { + "epoch": 0.35, + "grad_norm": 0.26345923306917046, + "learning_rate": 1.5230775855636635e-05, + "loss": 0.1973, + "step": 6006 + }, + { + "epoch": 0.35, + "grad_norm": 0.7307326078246826, + "learning_rate": 1.522918973704948e-05, + "loss": 0.4981, + "step": 6007 + }, + { + "epoch": 0.35, + "grad_norm": 0.622572484624821, + "learning_rate": 1.5227603437375704e-05, + "loss": 0.3457, + "step": 6008 + }, + { + "epoch": 0.35, + "grad_norm": 0.3376871552073923, + "learning_rate": 1.5226016956670251e-05, + "loss": 0.2555, + "step": 6009 + }, + { + "epoch": 0.35, + "grad_norm": 1.1334926262165705, + "learning_rate": 1.5224430294988055e-05, + "loss": 0.7165, + "step": 6010 + }, + { + "epoch": 0.35, + "grad_norm": 0.5732166382836338, + "learning_rate": 1.522284345238406e-05, + "loss": 0.3788, + "step": 6011 + }, + { + "epoch": 0.35, + "grad_norm": 0.2403970752189537, + "learning_rate": 1.5221256428913225e-05, + "loss": 0.1541, + "step": 6012 + }, + { + "epoch": 0.35, + "grad_norm": 0.46260415675738403, + "learning_rate": 1.52196692246305e-05, + "loss": 0.3579, + "step": 6013 + }, + { + "epoch": 0.35, + "grad_norm": 0.7041434257174486, + "learning_rate": 1.5218081839590855e-05, + "loss": 0.4429, + "step": 6014 + }, + { + "epoch": 0.35, + "grad_norm": 0.33938295637228305, + "learning_rate": 1.5216494273849261e-05, + "loss": 0.243, + "step": 6015 + }, + { + "epoch": 0.35, + "grad_norm": 0.5118168354209909, + "learning_rate": 1.5214906527460695e-05, + "loss": 0.3826, + "step": 6016 + }, + { + "epoch": 0.35, + "grad_norm": 0.4851060809319056, + "learning_rate": 1.5213318600480138e-05, + "loss": 0.3629, + "step": 6017 + }, + { + "epoch": 0.35, + "grad_norm": 0.22921600058430264, + "learning_rate": 1.5211730492962587e-05, + "loss": 0.161, + "step": 6018 + }, + { + "epoch": 0.35, + "grad_norm": 0.675451182016171, + "learning_rate": 1.521014220496303e-05, + "loss": 0.4479, + "step": 6019 + }, + { + "epoch": 0.35, + "grad_norm": 0.5422555876026078, + "learning_rate": 1.5208553736536473e-05, + "loss": 0.3674, + "step": 6020 + }, + { + "epoch": 0.35, + "grad_norm": 0.34458266162583756, + "learning_rate": 1.5206965087737922e-05, + "loss": 0.2923, + "step": 6021 + }, + { + "epoch": 0.35, + "grad_norm": 0.9441003102508054, + "learning_rate": 1.5205376258622397e-05, + "loss": 0.4712, + "step": 6022 + }, + { + "epoch": 0.35, + "grad_norm": 0.5721243311796721, + "learning_rate": 1.5203787249244914e-05, + "loss": 0.3698, + "step": 6023 + }, + { + "epoch": 0.35, + "grad_norm": 0.32675837351870496, + "learning_rate": 1.5202198059660504e-05, + "loss": 0.2448, + "step": 6024 + }, + { + "epoch": 0.35, + "grad_norm": 0.43602889305513376, + "learning_rate": 1.5200608689924197e-05, + "loss": 0.3166, + "step": 6025 + }, + { + "epoch": 0.35, + "grad_norm": 0.3492478650448725, + "learning_rate": 1.5199019140091037e-05, + "loss": 0.1646, + "step": 6026 + }, + { + "epoch": 0.35, + "grad_norm": 0.3940936399332753, + "learning_rate": 1.5197429410216065e-05, + "loss": 0.3279, + "step": 6027 + }, + { + "epoch": 0.35, + "grad_norm": 0.3892155238830433, + "learning_rate": 1.5195839500354337e-05, + "loss": 0.2918, + "step": 6028 + }, + { + "epoch": 0.35, + "grad_norm": 0.6679840842709365, + "learning_rate": 1.5194249410560913e-05, + "loss": 0.4226, + "step": 6029 + }, + { + "epoch": 0.35, + "grad_norm": 0.3529717737800829, + "learning_rate": 1.5192659140890851e-05, + "loss": 0.3048, + "step": 6030 + }, + { + "epoch": 0.35, + "grad_norm": 0.3234022745735802, + "learning_rate": 1.5191068691399229e-05, + "loss": 0.2089, + "step": 6031 + }, + { + "epoch": 0.35, + "grad_norm": 0.4406803384243917, + "learning_rate": 1.518947806214112e-05, + "loss": 0.356, + "step": 6032 + }, + { + "epoch": 0.35, + "grad_norm": 0.3316857147599567, + "learning_rate": 1.5187887253171609e-05, + "loss": 0.3087, + "step": 6033 + }, + { + "epoch": 0.35, + "grad_norm": 0.9447485585303956, + "learning_rate": 1.5186296264545787e-05, + "loss": 0.6407, + "step": 6034 + }, + { + "epoch": 0.35, + "grad_norm": 0.507392674710051, + "learning_rate": 1.5184705096318748e-05, + "loss": 0.2305, + "step": 6035 + }, + { + "epoch": 0.35, + "grad_norm": 0.31821764679036, + "learning_rate": 1.5183113748545595e-05, + "loss": 0.2953, + "step": 6036 + }, + { + "epoch": 0.35, + "grad_norm": 0.37936593474211566, + "learning_rate": 1.5181522221281435e-05, + "loss": 0.335, + "step": 6037 + }, + { + "epoch": 0.35, + "grad_norm": 0.1908639292571459, + "learning_rate": 1.5179930514581383e-05, + "loss": 0.1056, + "step": 6038 + }, + { + "epoch": 0.35, + "grad_norm": 0.39556981816109443, + "learning_rate": 1.517833862850056e-05, + "loss": 0.3185, + "step": 6039 + }, + { + "epoch": 0.35, + "grad_norm": 1.0851322992534065, + "learning_rate": 1.5176746563094092e-05, + "loss": 0.6819, + "step": 6040 + }, + { + "epoch": 0.35, + "grad_norm": 0.32365313141441393, + "learning_rate": 1.5175154318417116e-05, + "loss": 0.2474, + "step": 6041 + }, + { + "epoch": 0.35, + "grad_norm": 0.44971760340828426, + "learning_rate": 1.5173561894524765e-05, + "loss": 0.3509, + "step": 6042 + }, + { + "epoch": 0.35, + "grad_norm": 0.6905509263632318, + "learning_rate": 1.517196929147219e-05, + "loss": 0.4647, + "step": 6043 + }, + { + "epoch": 0.35, + "grad_norm": 0.24909171253007986, + "learning_rate": 1.5170376509314539e-05, + "loss": 0.1757, + "step": 6044 + }, + { + "epoch": 0.35, + "grad_norm": 0.39274007428317864, + "learning_rate": 1.5168783548106976e-05, + "loss": 0.3238, + "step": 6045 + }, + { + "epoch": 0.35, + "grad_norm": 0.9155198249464075, + "learning_rate": 1.5167190407904656e-05, + "loss": 0.6702, + "step": 6046 + }, + { + "epoch": 0.35, + "grad_norm": 0.573228632743676, + "learning_rate": 1.5165597088762757e-05, + "loss": 0.3907, + "step": 6047 + }, + { + "epoch": 0.35, + "grad_norm": 0.37380539990802764, + "learning_rate": 1.5164003590736452e-05, + "loss": 0.2131, + "step": 6048 + }, + { + "epoch": 0.35, + "grad_norm": 0.43565239129066136, + "learning_rate": 1.5162409913880927e-05, + "loss": 0.3494, + "step": 6049 + }, + { + "epoch": 0.35, + "grad_norm": 0.31162452510656125, + "learning_rate": 1.5160816058251367e-05, + "loss": 0.1867, + "step": 6050 + }, + { + "epoch": 0.35, + "grad_norm": 0.33539240652800983, + "learning_rate": 1.5159222023902969e-05, + "loss": 0.2238, + "step": 6051 + }, + { + "epoch": 0.35, + "grad_norm": 0.3631277574565755, + "learning_rate": 1.5157627810890937e-05, + "loss": 0.3246, + "step": 6052 + }, + { + "epoch": 0.35, + "grad_norm": 0.744527008947136, + "learning_rate": 1.5156033419270472e-05, + "loss": 0.4741, + "step": 6053 + }, + { + "epoch": 0.35, + "grad_norm": 0.3080561599256677, + "learning_rate": 1.5154438849096791e-05, + "loss": 0.2386, + "step": 6054 + }, + { + "epoch": 0.35, + "grad_norm": 0.9356964028488047, + "learning_rate": 1.5152844100425114e-05, + "loss": 0.5416, + "step": 6055 + }, + { + "epoch": 0.35, + "grad_norm": 0.24395016905721084, + "learning_rate": 1.5151249173310672e-05, + "loss": 0.2228, + "step": 6056 + }, + { + "epoch": 0.35, + "grad_norm": 0.36970463621043104, + "learning_rate": 1.5149654067808688e-05, + "loss": 0.2883, + "step": 6057 + }, + { + "epoch": 0.35, + "grad_norm": 0.9048703602569318, + "learning_rate": 1.5148058783974407e-05, + "loss": 0.4467, + "step": 6058 + }, + { + "epoch": 0.35, + "grad_norm": 0.695872288096352, + "learning_rate": 1.5146463321863069e-05, + "loss": 0.4602, + "step": 6059 + }, + { + "epoch": 0.35, + "grad_norm": 0.37486593573199756, + "learning_rate": 1.514486768152993e-05, + "loss": 0.2843, + "step": 6060 + }, + { + "epoch": 0.35, + "grad_norm": 0.3653532580952253, + "learning_rate": 1.5143271863030244e-05, + "loss": 0.2839, + "step": 6061 + }, + { + "epoch": 0.35, + "grad_norm": 0.2517009406199723, + "learning_rate": 1.5141675866419276e-05, + "loss": 0.1826, + "step": 6062 + }, + { + "epoch": 0.35, + "grad_norm": 0.3704254478483168, + "learning_rate": 1.5140079691752293e-05, + "loss": 0.2775, + "step": 6063 + }, + { + "epoch": 0.35, + "grad_norm": 0.46425654567960384, + "learning_rate": 1.5138483339084571e-05, + "loss": 0.3136, + "step": 6064 + }, + { + "epoch": 0.35, + "grad_norm": 0.959500141667516, + "learning_rate": 1.5136886808471389e-05, + "loss": 0.5221, + "step": 6065 + }, + { + "epoch": 0.35, + "grad_norm": 0.3435644123187082, + "learning_rate": 1.5135290099968043e-05, + "loss": 0.2827, + "step": 6066 + }, + { + "epoch": 0.35, + "grad_norm": 1.0712411877235641, + "learning_rate": 1.5133693213629818e-05, + "loss": 0.3403, + "step": 6067 + }, + { + "epoch": 0.35, + "grad_norm": 0.2858791739075835, + "learning_rate": 1.513209614951202e-05, + "loss": 0.2423, + "step": 6068 + }, + { + "epoch": 0.35, + "grad_norm": 0.4780296493408544, + "learning_rate": 1.5130498907669952e-05, + "loss": 0.2785, + "step": 6069 + }, + { + "epoch": 0.35, + "grad_norm": 0.8319222954221501, + "learning_rate": 1.512890148815893e-05, + "loss": 0.479, + "step": 6070 + }, + { + "epoch": 0.35, + "grad_norm": 0.32882328520167026, + "learning_rate": 1.5127303891034264e-05, + "loss": 0.183, + "step": 6071 + }, + { + "epoch": 0.35, + "grad_norm": 0.33276901027696554, + "learning_rate": 1.5125706116351291e-05, + "loss": 0.279, + "step": 6072 + }, + { + "epoch": 0.35, + "grad_norm": 0.5268756881469756, + "learning_rate": 1.5124108164165333e-05, + "loss": 0.3849, + "step": 6073 + }, + { + "epoch": 0.35, + "grad_norm": 1.2489127782935394, + "learning_rate": 1.512251003453173e-05, + "loss": 0.4358, + "step": 6074 + }, + { + "epoch": 0.35, + "grad_norm": 0.34742346899867804, + "learning_rate": 1.5120911727505822e-05, + "loss": 0.2864, + "step": 6075 + }, + { + "epoch": 0.35, + "grad_norm": 0.537616793518178, + "learning_rate": 1.5119313243142964e-05, + "loss": 0.3793, + "step": 6076 + }, + { + "epoch": 0.35, + "grad_norm": 0.4013084106067741, + "learning_rate": 1.5117714581498509e-05, + "loss": 0.1941, + "step": 6077 + }, + { + "epoch": 0.35, + "grad_norm": 0.38191970049336904, + "learning_rate": 1.5116115742627815e-05, + "loss": 0.2683, + "step": 6078 + }, + { + "epoch": 0.35, + "grad_norm": 0.8920627144260248, + "learning_rate": 1.5114516726586254e-05, + "loss": 0.5596, + "step": 6079 + }, + { + "epoch": 0.35, + "grad_norm": 0.4618446581767463, + "learning_rate": 1.51129175334292e-05, + "loss": 0.2894, + "step": 6080 + }, + { + "epoch": 0.35, + "grad_norm": 0.43028981687553913, + "learning_rate": 1.5111318163212032e-05, + "loss": 0.2758, + "step": 6081 + }, + { + "epoch": 0.35, + "grad_norm": 0.6246636674476441, + "learning_rate": 1.5109718615990135e-05, + "loss": 0.397, + "step": 6082 + }, + { + "epoch": 0.35, + "grad_norm": 0.2595488255827491, + "learning_rate": 1.51081188918189e-05, + "loss": 0.2166, + "step": 6083 + }, + { + "epoch": 0.35, + "grad_norm": 0.3366273152459292, + "learning_rate": 1.5106518990753731e-05, + "loss": 0.2066, + "step": 6084 + }, + { + "epoch": 0.35, + "grad_norm": 0.5509473415305383, + "learning_rate": 1.5104918912850029e-05, + "loss": 0.3952, + "step": 6085 + }, + { + "epoch": 0.35, + "grad_norm": 1.168537237871508, + "learning_rate": 1.5103318658163202e-05, + "loss": 0.6159, + "step": 6086 + }, + { + "epoch": 0.35, + "grad_norm": 0.44113838145483497, + "learning_rate": 1.5101718226748673e-05, + "loss": 0.2563, + "step": 6087 + }, + { + "epoch": 0.35, + "grad_norm": 0.42233647100665583, + "learning_rate": 1.5100117618661856e-05, + "loss": 0.3352, + "step": 6088 + }, + { + "epoch": 0.35, + "grad_norm": 1.153551365592648, + "learning_rate": 1.5098516833958187e-05, + "loss": 0.7705, + "step": 6089 + }, + { + "epoch": 0.35, + "grad_norm": 0.21605941110243826, + "learning_rate": 1.50969158726931e-05, + "loss": 0.1453, + "step": 6090 + }, + { + "epoch": 0.35, + "grad_norm": 1.1214770160197514, + "learning_rate": 1.5095314734922037e-05, + "loss": 0.6139, + "step": 6091 + }, + { + "epoch": 0.35, + "grad_norm": 0.4166865421164272, + "learning_rate": 1.509371342070044e-05, + "loss": 0.323, + "step": 6092 + }, + { + "epoch": 0.35, + "grad_norm": 0.3283893206173644, + "learning_rate": 1.509211193008377e-05, + "loss": 0.2456, + "step": 6093 + }, + { + "epoch": 0.35, + "grad_norm": 1.0555578416538822, + "learning_rate": 1.509051026312748e-05, + "loss": 0.6061, + "step": 6094 + }, + { + "epoch": 0.35, + "grad_norm": 0.29921174955999685, + "learning_rate": 1.508890841988704e-05, + "loss": 0.2598, + "step": 6095 + }, + { + "epoch": 0.35, + "grad_norm": 0.3149749288748337, + "learning_rate": 1.5087306400417921e-05, + "loss": 0.2747, + "step": 6096 + }, + { + "epoch": 0.35, + "grad_norm": 0.31212173256557457, + "learning_rate": 1.5085704204775598e-05, + "loss": 0.1728, + "step": 6097 + }, + { + "epoch": 0.35, + "grad_norm": 0.5652954029006605, + "learning_rate": 1.508410183301556e-05, + "loss": 0.4581, + "step": 6098 + }, + { + "epoch": 0.35, + "grad_norm": 0.6856421881804132, + "learning_rate": 1.508249928519329e-05, + "loss": 0.3919, + "step": 6099 + }, + { + "epoch": 0.35, + "grad_norm": 0.30656764100593287, + "learning_rate": 1.5080896561364293e-05, + "loss": 0.2557, + "step": 6100 + }, + { + "epoch": 0.35, + "grad_norm": 0.4687395321208568, + "learning_rate": 1.5079293661584063e-05, + "loss": 0.3314, + "step": 6101 + }, + { + "epoch": 0.35, + "grad_norm": 0.3349070250202825, + "learning_rate": 1.5077690585908113e-05, + "loss": 0.2066, + "step": 6102 + }, + { + "epoch": 0.35, + "grad_norm": 0.3640182696226341, + "learning_rate": 1.5076087334391957e-05, + "loss": 0.2836, + "step": 6103 + }, + { + "epoch": 0.35, + "grad_norm": 0.34746788590963384, + "learning_rate": 1.5074483907091115e-05, + "loss": 0.3276, + "step": 6104 + }, + { + "epoch": 0.35, + "grad_norm": 0.3712053354906611, + "learning_rate": 1.5072880304061112e-05, + "loss": 0.3489, + "step": 6105 + }, + { + "epoch": 0.35, + "grad_norm": 0.4718096602897961, + "learning_rate": 1.5071276525357486e-05, + "loss": 0.3113, + "step": 6106 + }, + { + "epoch": 0.35, + "grad_norm": 0.5120999080003616, + "learning_rate": 1.5069672571035766e-05, + "loss": 0.3747, + "step": 6107 + }, + { + "epoch": 0.35, + "grad_norm": 0.28354797155566364, + "learning_rate": 1.506806844115151e-05, + "loss": 0.2414, + "step": 6108 + }, + { + "epoch": 0.35, + "grad_norm": 0.37166878535602804, + "learning_rate": 1.5066464135760254e-05, + "loss": 0.2915, + "step": 6109 + }, + { + "epoch": 0.35, + "grad_norm": 0.5029551308263018, + "learning_rate": 1.506485965491757e-05, + "loss": 0.3747, + "step": 6110 + }, + { + "epoch": 0.35, + "grad_norm": 0.4002329105834612, + "learning_rate": 1.5063254998679009e-05, + "loss": 0.2825, + "step": 6111 + }, + { + "epoch": 0.35, + "grad_norm": 0.3814977933139438, + "learning_rate": 1.5061650167100146e-05, + "loss": 0.2968, + "step": 6112 + }, + { + "epoch": 0.35, + "grad_norm": 0.829273859116276, + "learning_rate": 1.5060045160236556e-05, + "loss": 0.452, + "step": 6113 + }, + { + "epoch": 0.35, + "grad_norm": 0.27937980686236547, + "learning_rate": 1.505843997814382e-05, + "loss": 0.2292, + "step": 6114 + }, + { + "epoch": 0.35, + "grad_norm": 0.47676045935760386, + "learning_rate": 1.5056834620877525e-05, + "loss": 0.312, + "step": 6115 + }, + { + "epoch": 0.35, + "grad_norm": 0.3451101796926662, + "learning_rate": 1.5055229088493264e-05, + "loss": 0.2801, + "step": 6116 + }, + { + "epoch": 0.35, + "grad_norm": 0.579516426649944, + "learning_rate": 1.5053623381046639e-05, + "loss": 0.394, + "step": 6117 + }, + { + "epoch": 0.35, + "grad_norm": 0.44553554966049524, + "learning_rate": 1.505201749859325e-05, + "loss": 0.3536, + "step": 6118 + }, + { + "epoch": 0.35, + "grad_norm": 0.3590680813627751, + "learning_rate": 1.5050411441188714e-05, + "loss": 0.2677, + "step": 6119 + }, + { + "epoch": 0.35, + "grad_norm": 0.5367161392325959, + "learning_rate": 1.5048805208888651e-05, + "loss": 0.3061, + "step": 6120 + }, + { + "epoch": 0.35, + "grad_norm": 0.4162731031659161, + "learning_rate": 1.5047198801748677e-05, + "loss": 0.3569, + "step": 6121 + }, + { + "epoch": 0.35, + "grad_norm": 0.2970681347926114, + "learning_rate": 1.5045592219824423e-05, + "loss": 0.2119, + "step": 6122 + }, + { + "epoch": 0.35, + "grad_norm": 0.35046893311451455, + "learning_rate": 1.5043985463171532e-05, + "loss": 0.2086, + "step": 6123 + }, + { + "epoch": 0.35, + "grad_norm": 0.313277427937686, + "learning_rate": 1.5042378531845638e-05, + "loss": 0.3118, + "step": 6124 + }, + { + "epoch": 0.35, + "grad_norm": 1.2384222887646135, + "learning_rate": 1.5040771425902393e-05, + "loss": 0.744, + "step": 6125 + }, + { + "epoch": 0.35, + "grad_norm": 0.3171223595122585, + "learning_rate": 1.503916414539745e-05, + "loss": 0.2281, + "step": 6126 + }, + { + "epoch": 0.35, + "grad_norm": 0.413448931832896, + "learning_rate": 1.5037556690386472e-05, + "loss": 0.389, + "step": 6127 + }, + { + "epoch": 0.35, + "grad_norm": 0.23777876075584098, + "learning_rate": 1.5035949060925118e-05, + "loss": 0.2254, + "step": 6128 + }, + { + "epoch": 0.35, + "grad_norm": 0.3183163904203893, + "learning_rate": 1.5034341257069072e-05, + "loss": 0.2173, + "step": 6129 + }, + { + "epoch": 0.35, + "grad_norm": 0.5650146581955648, + "learning_rate": 1.5032733278873996e-05, + "loss": 0.4094, + "step": 6130 + }, + { + "epoch": 0.35, + "grad_norm": 0.513930176388067, + "learning_rate": 1.5031125126395589e-05, + "loss": 0.4267, + "step": 6131 + }, + { + "epoch": 0.35, + "grad_norm": 0.3492885607799953, + "learning_rate": 1.5029516799689533e-05, + "loss": 0.3188, + "step": 6132 + }, + { + "epoch": 0.35, + "grad_norm": 0.3884188797338796, + "learning_rate": 1.5027908298811527e-05, + "loss": 0.2838, + "step": 6133 + }, + { + "epoch": 0.35, + "grad_norm": 0.28370779822743003, + "learning_rate": 1.5026299623817273e-05, + "loss": 0.2077, + "step": 6134 + }, + { + "epoch": 0.35, + "grad_norm": 0.4015496523870909, + "learning_rate": 1.5024690774762478e-05, + "loss": 0.3009, + "step": 6135 + }, + { + "epoch": 0.35, + "grad_norm": 0.3578652014760878, + "learning_rate": 1.5023081751702857e-05, + "loss": 0.2493, + "step": 6136 + }, + { + "epoch": 0.35, + "grad_norm": 0.7011799483411165, + "learning_rate": 1.5021472554694134e-05, + "loss": 0.46, + "step": 6137 + }, + { + "epoch": 0.35, + "grad_norm": 0.5617022132112179, + "learning_rate": 1.501986318379203e-05, + "loss": 0.4276, + "step": 6138 + }, + { + "epoch": 0.35, + "grad_norm": 0.3212717857556724, + "learning_rate": 1.501825363905228e-05, + "loss": 0.2388, + "step": 6139 + }, + { + "epoch": 0.35, + "grad_norm": 0.25259321483356195, + "learning_rate": 1.5016643920530625e-05, + "loss": 0.2136, + "step": 6140 + }, + { + "epoch": 0.35, + "grad_norm": 0.7643284203192063, + "learning_rate": 1.5015034028282802e-05, + "loss": 0.4888, + "step": 6141 + }, + { + "epoch": 0.35, + "grad_norm": 0.3205694041270008, + "learning_rate": 1.5013423962364571e-05, + "loss": 0.2469, + "step": 6142 + }, + { + "epoch": 0.35, + "grad_norm": 0.47535488032774514, + "learning_rate": 1.501181372283168e-05, + "loss": 0.3845, + "step": 6143 + }, + { + "epoch": 0.35, + "grad_norm": 0.5102639689359593, + "learning_rate": 1.5010203309739897e-05, + "loss": 0.3412, + "step": 6144 + }, + { + "epoch": 0.35, + "grad_norm": 0.34843918812053276, + "learning_rate": 1.5008592723144987e-05, + "loss": 0.3098, + "step": 6145 + }, + { + "epoch": 0.35, + "grad_norm": 0.23040537290013943, + "learning_rate": 1.500698196310273e-05, + "loss": 0.0713, + "step": 6146 + }, + { + "epoch": 0.35, + "grad_norm": 0.4026091520264639, + "learning_rate": 1.5005371029668899e-05, + "loss": 0.2918, + "step": 6147 + }, + { + "epoch": 0.35, + "grad_norm": 0.3332787714379259, + "learning_rate": 1.5003759922899286e-05, + "loss": 0.2908, + "step": 6148 + }, + { + "epoch": 0.35, + "grad_norm": 0.8116793651226156, + "learning_rate": 1.5002148642849683e-05, + "loss": 0.3831, + "step": 6149 + }, + { + "epoch": 0.35, + "grad_norm": 0.4649050578936953, + "learning_rate": 1.5000537189575885e-05, + "loss": 0.3492, + "step": 6150 + }, + { + "epoch": 0.35, + "grad_norm": 0.398384632333775, + "learning_rate": 1.4998925563133702e-05, + "loss": 0.3362, + "step": 6151 + }, + { + "epoch": 0.35, + "grad_norm": 0.2859884604619067, + "learning_rate": 1.499731376357894e-05, + "loss": 0.1982, + "step": 6152 + }, + { + "epoch": 0.35, + "grad_norm": 1.2499071609751304, + "learning_rate": 1.499570179096742e-05, + "loss": 0.8175, + "step": 6153 + }, + { + "epoch": 0.35, + "grad_norm": 0.35604390551676335, + "learning_rate": 1.499408964535496e-05, + "loss": 0.2641, + "step": 6154 + }, + { + "epoch": 0.35, + "grad_norm": 0.6096935274842166, + "learning_rate": 1.499247732679739e-05, + "loss": 0.3215, + "step": 6155 + }, + { + "epoch": 0.35, + "grad_norm": 0.4061485551230446, + "learning_rate": 1.4990864835350544e-05, + "loss": 0.1853, + "step": 6156 + }, + { + "epoch": 0.35, + "grad_norm": 0.3753388063151038, + "learning_rate": 1.4989252171070265e-05, + "loss": 0.2943, + "step": 6157 + }, + { + "epoch": 0.35, + "grad_norm": 0.43330743666285093, + "learning_rate": 1.4987639334012398e-05, + "loss": 0.2916, + "step": 6158 + }, + { + "epoch": 0.35, + "grad_norm": 0.3607792270908736, + "learning_rate": 1.4986026324232796e-05, + "loss": 0.2567, + "step": 6159 + }, + { + "epoch": 0.35, + "grad_norm": 0.41735006093173743, + "learning_rate": 1.4984413141787312e-05, + "loss": 0.3247, + "step": 6160 + }, + { + "epoch": 0.35, + "grad_norm": 0.6882683097516364, + "learning_rate": 1.498279978673182e-05, + "loss": 0.4989, + "step": 6161 + }, + { + "epoch": 0.35, + "grad_norm": 0.33450054220257863, + "learning_rate": 1.4981186259122185e-05, + "loss": 0.176, + "step": 6162 + }, + { + "epoch": 0.35, + "grad_norm": 0.3138479440645983, + "learning_rate": 1.4979572559014284e-05, + "loss": 0.2784, + "step": 6163 + }, + { + "epoch": 0.35, + "grad_norm": 0.9177725613411829, + "learning_rate": 1.4977958686463998e-05, + "loss": 0.6058, + "step": 6164 + }, + { + "epoch": 0.35, + "grad_norm": 0.43862792944574786, + "learning_rate": 1.497634464152722e-05, + "loss": 0.2266, + "step": 6165 + }, + { + "epoch": 0.35, + "grad_norm": 0.4316088985564484, + "learning_rate": 1.4974730424259836e-05, + "loss": 0.3325, + "step": 6166 + }, + { + "epoch": 0.35, + "grad_norm": 0.4754951018407942, + "learning_rate": 1.4973116034717754e-05, + "loss": 0.3411, + "step": 6167 + }, + { + "epoch": 0.35, + "grad_norm": 0.25015470689457353, + "learning_rate": 1.4971501472956875e-05, + "loss": 0.1052, + "step": 6168 + }, + { + "epoch": 0.35, + "grad_norm": 0.46799978705707523, + "learning_rate": 1.4969886739033116e-05, + "loss": 0.3381, + "step": 6169 + }, + { + "epoch": 0.35, + "grad_norm": 1.5456881995311484, + "learning_rate": 1.4968271833002393e-05, + "loss": 0.8595, + "step": 6170 + }, + { + "epoch": 0.35, + "grad_norm": 0.35870092364375566, + "learning_rate": 1.4966656754920635e-05, + "loss": 0.3482, + "step": 6171 + }, + { + "epoch": 0.35, + "grad_norm": 0.32167903780767804, + "learning_rate": 1.496504150484376e-05, + "loss": 0.2257, + "step": 6172 + }, + { + "epoch": 0.35, + "grad_norm": 0.4229197497644949, + "learning_rate": 1.4963426082827714e-05, + "loss": 0.3368, + "step": 6173 + }, + { + "epoch": 0.35, + "grad_norm": 0.47531000042653976, + "learning_rate": 1.4961810488928434e-05, + "loss": 0.2762, + "step": 6174 + }, + { + "epoch": 0.35, + "grad_norm": 0.3133918190580812, + "learning_rate": 1.4960194723201873e-05, + "loss": 0.2267, + "step": 6175 + }, + { + "epoch": 0.35, + "grad_norm": 1.041910669070392, + "learning_rate": 1.4958578785703982e-05, + "loss": 0.6656, + "step": 6176 + }, + { + "epoch": 0.35, + "grad_norm": 0.827017268485664, + "learning_rate": 1.4956962676490719e-05, + "loss": 0.5432, + "step": 6177 + }, + { + "epoch": 0.35, + "grad_norm": 0.3650626346399905, + "learning_rate": 1.495534639561805e-05, + "loss": 0.2469, + "step": 6178 + }, + { + "epoch": 0.36, + "grad_norm": 0.3928206475372948, + "learning_rate": 1.4953729943141952e-05, + "loss": 0.3242, + "step": 6179 + }, + { + "epoch": 0.36, + "grad_norm": 0.284766364444701, + "learning_rate": 1.49521133191184e-05, + "loss": 0.1743, + "step": 6180 + }, + { + "epoch": 0.36, + "grad_norm": 0.3653187691143623, + "learning_rate": 1.4950496523603373e-05, + "loss": 0.2148, + "step": 6181 + }, + { + "epoch": 0.36, + "grad_norm": 0.9229835999005866, + "learning_rate": 1.4948879556652866e-05, + "loss": 0.5495, + "step": 6182 + }, + { + "epoch": 0.36, + "grad_norm": 0.4713117458300445, + "learning_rate": 1.4947262418322872e-05, + "loss": 0.3238, + "step": 6183 + }, + { + "epoch": 0.36, + "grad_norm": 0.4509127635088528, + "learning_rate": 1.4945645108669395e-05, + "loss": 0.2793, + "step": 6184 + }, + { + "epoch": 0.36, + "grad_norm": 0.4986348814970563, + "learning_rate": 1.4944027627748438e-05, + "loss": 0.3296, + "step": 6185 + }, + { + "epoch": 0.36, + "grad_norm": 0.2946512585424218, + "learning_rate": 1.4942409975616019e-05, + "loss": 0.1613, + "step": 6186 + }, + { + "epoch": 0.36, + "grad_norm": 0.3968498037859521, + "learning_rate": 1.4940792152328156e-05, + "loss": 0.2815, + "step": 6187 + }, + { + "epoch": 0.36, + "grad_norm": 0.3634508359888088, + "learning_rate": 1.4939174157940872e-05, + "loss": 0.2695, + "step": 6188 + }, + { + "epoch": 0.36, + "grad_norm": 0.6800060796414303, + "learning_rate": 1.4937555992510198e-05, + "loss": 0.4762, + "step": 6189 + }, + { + "epoch": 0.36, + "grad_norm": 0.34781887939904654, + "learning_rate": 1.4935937656092175e-05, + "loss": 0.2857, + "step": 6190 + }, + { + "epoch": 0.36, + "grad_norm": 0.3356382407547884, + "learning_rate": 1.493431914874284e-05, + "loss": 0.2678, + "step": 6191 + }, + { + "epoch": 0.36, + "grad_norm": 0.2680360999255305, + "learning_rate": 1.4932700470518247e-05, + "loss": 0.1794, + "step": 6192 + }, + { + "epoch": 0.36, + "grad_norm": 0.36281121646648407, + "learning_rate": 1.4931081621474448e-05, + "loss": 0.2694, + "step": 6193 + }, + { + "epoch": 0.36, + "grad_norm": 0.7202442627048605, + "learning_rate": 1.4929462601667504e-05, + "loss": 0.3639, + "step": 6194 + }, + { + "epoch": 0.36, + "grad_norm": 0.38416910639035173, + "learning_rate": 1.4927843411153481e-05, + "loss": 0.3403, + "step": 6195 + }, + { + "epoch": 0.36, + "grad_norm": 0.33023100678471856, + "learning_rate": 1.4926224049988456e-05, + "loss": 0.2986, + "step": 6196 + }, + { + "epoch": 0.36, + "grad_norm": 1.2815573161505327, + "learning_rate": 1.4924604518228503e-05, + "loss": 0.8653, + "step": 6197 + }, + { + "epoch": 0.36, + "grad_norm": 0.23076577952264316, + "learning_rate": 1.4922984815929707e-05, + "loss": 0.1842, + "step": 6198 + }, + { + "epoch": 0.36, + "grad_norm": 0.2980217129945579, + "learning_rate": 1.4921364943148158e-05, + "loss": 0.2783, + "step": 6199 + }, + { + "epoch": 0.36, + "grad_norm": 0.8018563432059227, + "learning_rate": 1.4919744899939952e-05, + "loss": 0.4694, + "step": 6200 + }, + { + "epoch": 0.36, + "grad_norm": 0.7658024271159686, + "learning_rate": 1.4918124686361193e-05, + "loss": 0.3918, + "step": 6201 + }, + { + "epoch": 0.36, + "grad_norm": 0.40099316107463984, + "learning_rate": 1.4916504302467987e-05, + "loss": 0.2832, + "step": 6202 + }, + { + "epoch": 0.36, + "grad_norm": 0.3396109439104663, + "learning_rate": 1.4914883748316448e-05, + "loss": 0.3053, + "step": 6203 + }, + { + "epoch": 0.36, + "grad_norm": 0.17946159090846295, + "learning_rate": 1.4913263023962698e-05, + "loss": 0.0944, + "step": 6204 + }, + { + "epoch": 0.36, + "grad_norm": 0.42650868983363494, + "learning_rate": 1.491164212946286e-05, + "loss": 0.2872, + "step": 6205 + }, + { + "epoch": 0.36, + "grad_norm": 0.7748315569899414, + "learning_rate": 1.4910021064873066e-05, + "loss": 0.4013, + "step": 6206 + }, + { + "epoch": 0.36, + "grad_norm": 0.33464339880757243, + "learning_rate": 1.4908399830249454e-05, + "loss": 0.2786, + "step": 6207 + }, + { + "epoch": 0.36, + "grad_norm": 0.32860205040007584, + "learning_rate": 1.4906778425648165e-05, + "loss": 0.2614, + "step": 6208 + }, + { + "epoch": 0.36, + "grad_norm": 1.1586441525453701, + "learning_rate": 1.4905156851125354e-05, + "loss": 0.7866, + "step": 6209 + }, + { + "epoch": 0.36, + "grad_norm": 0.46748475889837865, + "learning_rate": 1.4903535106737166e-05, + "loss": 0.2579, + "step": 6210 + }, + { + "epoch": 0.36, + "grad_norm": 0.26132463339166095, + "learning_rate": 1.4901913192539773e-05, + "loss": 0.2383, + "step": 6211 + }, + { + "epoch": 0.36, + "grad_norm": 0.46282006638803863, + "learning_rate": 1.4900291108589335e-05, + "loss": 0.3461, + "step": 6212 + }, + { + "epoch": 0.36, + "grad_norm": 1.4299229249539138, + "learning_rate": 1.4898668854942029e-05, + "loss": 0.8093, + "step": 6213 + }, + { + "epoch": 0.36, + "grad_norm": 0.3654395694243854, + "learning_rate": 1.4897046431654028e-05, + "loss": 0.2173, + "step": 6214 + }, + { + "epoch": 0.36, + "grad_norm": 0.40808297868803545, + "learning_rate": 1.4895423838781523e-05, + "loss": 0.3156, + "step": 6215 + }, + { + "epoch": 0.36, + "grad_norm": 0.8572774804511525, + "learning_rate": 1.4893801076380697e-05, + "loss": 0.6124, + "step": 6216 + }, + { + "epoch": 0.36, + "grad_norm": 0.3163807117672398, + "learning_rate": 1.4892178144507754e-05, + "loss": 0.2505, + "step": 6217 + }, + { + "epoch": 0.36, + "grad_norm": 0.39502399231456026, + "learning_rate": 1.4890555043218888e-05, + "loss": 0.3094, + "step": 6218 + }, + { + "epoch": 0.36, + "grad_norm": 0.305393097707522, + "learning_rate": 1.4888931772570314e-05, + "loss": 0.26, + "step": 6219 + }, + { + "epoch": 0.36, + "grad_norm": 0.4573475138697167, + "learning_rate": 1.4887308332618245e-05, + "loss": 0.2775, + "step": 6220 + }, + { + "epoch": 0.36, + "grad_norm": 0.5099326579508142, + "learning_rate": 1.4885684723418897e-05, + "loss": 0.2951, + "step": 6221 + }, + { + "epoch": 0.36, + "grad_norm": 0.3622752534920921, + "learning_rate": 1.4884060945028495e-05, + "loss": 0.3238, + "step": 6222 + }, + { + "epoch": 0.36, + "grad_norm": 0.4769109545150248, + "learning_rate": 1.4882436997503273e-05, + "loss": 0.3067, + "step": 6223 + }, + { + "epoch": 0.36, + "grad_norm": 0.31122680816110204, + "learning_rate": 1.4880812880899472e-05, + "loss": 0.2261, + "step": 6224 + }, + { + "epoch": 0.36, + "grad_norm": 0.44219289230671227, + "learning_rate": 1.4879188595273326e-05, + "loss": 0.3029, + "step": 6225 + }, + { + "epoch": 0.36, + "grad_norm": 0.566041714859357, + "learning_rate": 1.487756414068109e-05, + "loss": 0.334, + "step": 6226 + }, + { + "epoch": 0.36, + "grad_norm": 0.3654004821252147, + "learning_rate": 1.4875939517179016e-05, + "loss": 0.2639, + "step": 6227 + }, + { + "epoch": 0.36, + "grad_norm": 0.755425769942834, + "learning_rate": 1.4874314724823368e-05, + "loss": 0.5034, + "step": 6228 + }, + { + "epoch": 0.36, + "grad_norm": 0.3716823951023819, + "learning_rate": 1.487268976367041e-05, + "loss": 0.3237, + "step": 6229 + }, + { + "epoch": 0.36, + "grad_norm": 0.2941511105541551, + "learning_rate": 1.4871064633776418e-05, + "loss": 0.2806, + "step": 6230 + }, + { + "epoch": 0.36, + "grad_norm": 0.31363827129766925, + "learning_rate": 1.4869439335197661e-05, + "loss": 0.1919, + "step": 6231 + }, + { + "epoch": 0.36, + "grad_norm": 0.38920044645943885, + "learning_rate": 1.4867813867990435e-05, + "loss": 0.2882, + "step": 6232 + }, + { + "epoch": 0.36, + "grad_norm": 0.5297592578525103, + "learning_rate": 1.486618823221102e-05, + "loss": 0.4331, + "step": 6233 + }, + { + "epoch": 0.36, + "grad_norm": 0.34389016761519975, + "learning_rate": 1.4864562427915722e-05, + "loss": 0.2951, + "step": 6234 + }, + { + "epoch": 0.36, + "grad_norm": 0.4098129452025419, + "learning_rate": 1.486293645516083e-05, + "loss": 0.3112, + "step": 6235 + }, + { + "epoch": 0.36, + "grad_norm": 0.5460341175814037, + "learning_rate": 1.4861310314002659e-05, + "loss": 0.339, + "step": 6236 + }, + { + "epoch": 0.36, + "grad_norm": 0.24296990938486865, + "learning_rate": 1.485968400449752e-05, + "loss": 0.1272, + "step": 6237 + }, + { + "epoch": 0.36, + "grad_norm": 0.4041372596227764, + "learning_rate": 1.485805752670174e-05, + "loss": 0.2752, + "step": 6238 + }, + { + "epoch": 0.36, + "grad_norm": 0.31120053524746066, + "learning_rate": 1.4856430880671628e-05, + "loss": 0.3153, + "step": 6239 + }, + { + "epoch": 0.36, + "grad_norm": 0.5829267478852111, + "learning_rate": 1.485480406646353e-05, + "loss": 0.3675, + "step": 6240 + }, + { + "epoch": 0.36, + "grad_norm": 0.5853392794597301, + "learning_rate": 1.485317708413377e-05, + "loss": 0.2736, + "step": 6241 + }, + { + "epoch": 0.36, + "grad_norm": 0.3633480158282146, + "learning_rate": 1.48515499337387e-05, + "loss": 0.2897, + "step": 6242 + }, + { + "epoch": 0.36, + "grad_norm": 0.3294481476064836, + "learning_rate": 1.4849922615334662e-05, + "loss": 0.2107, + "step": 6243 + }, + { + "epoch": 0.36, + "grad_norm": 0.596863758511093, + "learning_rate": 1.4848295128978016e-05, + "loss": 0.3861, + "step": 6244 + }, + { + "epoch": 0.36, + "grad_norm": 0.30007973784979886, + "learning_rate": 1.4846667474725115e-05, + "loss": 0.2746, + "step": 6245 + }, + { + "epoch": 0.36, + "grad_norm": 0.4615792309021516, + "learning_rate": 1.484503965263233e-05, + "loss": 0.3483, + "step": 6246 + }, + { + "epoch": 0.36, + "grad_norm": 0.35022336479364224, + "learning_rate": 1.4843411662756028e-05, + "loss": 0.2043, + "step": 6247 + }, + { + "epoch": 0.36, + "grad_norm": 0.4208558050427091, + "learning_rate": 1.484178350515259e-05, + "loss": 0.3449, + "step": 6248 + }, + { + "epoch": 0.36, + "grad_norm": 1.038883513719484, + "learning_rate": 1.4840155179878398e-05, + "loss": 0.7463, + "step": 6249 + }, + { + "epoch": 0.36, + "grad_norm": 0.2896134927667209, + "learning_rate": 1.4838526686989836e-05, + "loss": 0.2265, + "step": 6250 + }, + { + "epoch": 0.36, + "grad_norm": 0.2974585469109081, + "learning_rate": 1.4836898026543307e-05, + "loss": 0.2787, + "step": 6251 + }, + { + "epoch": 0.36, + "grad_norm": 0.4006467030665213, + "learning_rate": 1.4835269198595206e-05, + "loss": 0.3279, + "step": 6252 + }, + { + "epoch": 0.36, + "grad_norm": 0.33309182783458663, + "learning_rate": 1.483364020320194e-05, + "loss": 0.2246, + "step": 6253 + }, + { + "epoch": 0.36, + "grad_norm": 0.36433501957512965, + "learning_rate": 1.4832011040419922e-05, + "loss": 0.3392, + "step": 6254 + }, + { + "epoch": 0.36, + "grad_norm": 0.5059114706018922, + "learning_rate": 1.4830381710305572e-05, + "loss": 0.4148, + "step": 6255 + }, + { + "epoch": 0.36, + "grad_norm": 0.4326822693331583, + "learning_rate": 1.4828752212915309e-05, + "loss": 0.2464, + "step": 6256 + }, + { + "epoch": 0.36, + "grad_norm": 0.346803304093782, + "learning_rate": 1.4827122548305566e-05, + "loss": 0.3429, + "step": 6257 + }, + { + "epoch": 0.36, + "grad_norm": 0.24676986871220055, + "learning_rate": 1.4825492716532772e-05, + "loss": 0.2387, + "step": 6258 + }, + { + "epoch": 0.36, + "grad_norm": 0.8936383019306559, + "learning_rate": 1.4823862717653377e-05, + "loss": 0.4183, + "step": 6259 + }, + { + "epoch": 0.36, + "grad_norm": 0.3613517629923259, + "learning_rate": 1.4822232551723824e-05, + "loss": 0.2395, + "step": 6260 + }, + { + "epoch": 0.36, + "grad_norm": 0.7576742857770312, + "learning_rate": 1.4820602218800562e-05, + "loss": 0.604, + "step": 6261 + }, + { + "epoch": 0.36, + "grad_norm": 0.36124256710193003, + "learning_rate": 1.4818971718940053e-05, + "loss": 0.282, + "step": 6262 + }, + { + "epoch": 0.36, + "grad_norm": 0.3117081152835165, + "learning_rate": 1.4817341052198763e-05, + "loss": 0.2732, + "step": 6263 + }, + { + "epoch": 0.36, + "grad_norm": 0.4077993695451762, + "learning_rate": 1.481571021863316e-05, + "loss": 0.2519, + "step": 6264 + }, + { + "epoch": 0.36, + "grad_norm": 0.36274755930196695, + "learning_rate": 1.4814079218299715e-05, + "loss": 0.2588, + "step": 6265 + }, + { + "epoch": 0.36, + "grad_norm": 0.31755716594371836, + "learning_rate": 1.4812448051254914e-05, + "loss": 0.2515, + "step": 6266 + }, + { + "epoch": 0.36, + "grad_norm": 1.2379686903604816, + "learning_rate": 1.4810816717555248e-05, + "loss": 0.8207, + "step": 6267 + }, + { + "epoch": 0.36, + "grad_norm": 0.6200789147881792, + "learning_rate": 1.4809185217257205e-05, + "loss": 0.4401, + "step": 6268 + }, + { + "epoch": 0.36, + "grad_norm": 0.3576042390007011, + "learning_rate": 1.4807553550417281e-05, + "loss": 0.236, + "step": 6269 + }, + { + "epoch": 0.36, + "grad_norm": 0.3135412551606865, + "learning_rate": 1.4805921717091989e-05, + "loss": 0.2926, + "step": 6270 + }, + { + "epoch": 0.36, + "grad_norm": 0.33504011374279746, + "learning_rate": 1.480428971733783e-05, + "loss": 0.2396, + "step": 6271 + }, + { + "epoch": 0.36, + "grad_norm": 0.395484364592832, + "learning_rate": 1.4802657551211331e-05, + "loss": 0.2966, + "step": 6272 + }, + { + "epoch": 0.36, + "grad_norm": 0.7334203945138051, + "learning_rate": 1.4801025218769001e-05, + "loss": 0.3673, + "step": 6273 + }, + { + "epoch": 0.36, + "grad_norm": 0.3095981068865218, + "learning_rate": 1.4799392720067378e-05, + "loss": 0.2805, + "step": 6274 + }, + { + "epoch": 0.36, + "grad_norm": 0.37500869393675135, + "learning_rate": 1.4797760055162988e-05, + "loss": 0.3432, + "step": 6275 + }, + { + "epoch": 0.36, + "grad_norm": 0.21332335884529122, + "learning_rate": 1.4796127224112378e-05, + "loss": 0.1254, + "step": 6276 + }, + { + "epoch": 0.36, + "grad_norm": 1.1738522008194672, + "learning_rate": 1.4794494226972088e-05, + "loss": 0.7157, + "step": 6277 + }, + { + "epoch": 0.36, + "grad_norm": 0.2622768681089619, + "learning_rate": 1.4792861063798664e-05, + "loss": 0.2618, + "step": 6278 + }, + { + "epoch": 0.36, + "grad_norm": 0.5248605026141624, + "learning_rate": 1.4791227734648672e-05, + "loss": 0.3278, + "step": 6279 + }, + { + "epoch": 0.36, + "grad_norm": 0.5791380522386198, + "learning_rate": 1.4789594239578668e-05, + "loss": 0.3969, + "step": 6280 + }, + { + "epoch": 0.36, + "grad_norm": 0.40689169491954263, + "learning_rate": 1.4787960578645222e-05, + "loss": 0.2991, + "step": 6281 + }, + { + "epoch": 0.36, + "grad_norm": 0.35430059420501625, + "learning_rate": 1.4786326751904907e-05, + "loss": 0.2538, + "step": 6282 + }, + { + "epoch": 0.36, + "grad_norm": 0.25326613938208253, + "learning_rate": 1.4784692759414303e-05, + "loss": 0.1784, + "step": 6283 + }, + { + "epoch": 0.36, + "grad_norm": 0.3879921231259588, + "learning_rate": 1.4783058601229994e-05, + "loss": 0.3079, + "step": 6284 + }, + { + "epoch": 0.36, + "grad_norm": 0.7226592658190863, + "learning_rate": 1.4781424277408572e-05, + "loss": 0.4558, + "step": 6285 + }, + { + "epoch": 0.36, + "grad_norm": 0.30693828632115694, + "learning_rate": 1.4779789788006632e-05, + "loss": 0.2734, + "step": 6286 + }, + { + "epoch": 0.36, + "grad_norm": 0.3880005622421795, + "learning_rate": 1.4778155133080776e-05, + "loss": 0.2958, + "step": 6287 + }, + { + "epoch": 0.36, + "grad_norm": 1.1057367174861266, + "learning_rate": 1.4776520312687614e-05, + "loss": 0.6752, + "step": 6288 + }, + { + "epoch": 0.36, + "grad_norm": 0.21447372968479209, + "learning_rate": 1.477488532688376e-05, + "loss": 0.0726, + "step": 6289 + }, + { + "epoch": 0.36, + "grad_norm": 0.35395443621688044, + "learning_rate": 1.4773250175725833e-05, + "loss": 0.3048, + "step": 6290 + }, + { + "epoch": 0.36, + "grad_norm": 0.5289778545984578, + "learning_rate": 1.4771614859270458e-05, + "loss": 0.3525, + "step": 6291 + }, + { + "epoch": 0.36, + "grad_norm": 0.48440354407701286, + "learning_rate": 1.4769979377574264e-05, + "loss": 0.2645, + "step": 6292 + }, + { + "epoch": 0.36, + "grad_norm": 0.4060500797659063, + "learning_rate": 1.4768343730693888e-05, + "loss": 0.3174, + "step": 6293 + }, + { + "epoch": 0.36, + "grad_norm": 0.3819313122256466, + "learning_rate": 1.4766707918685974e-05, + "loss": 0.3538, + "step": 6294 + }, + { + "epoch": 0.36, + "grad_norm": 0.3470332126651621, + "learning_rate": 1.4765071941607172e-05, + "loss": 0.2331, + "step": 6295 + }, + { + "epoch": 0.36, + "grad_norm": 0.306715059456966, + "learning_rate": 1.4763435799514132e-05, + "loss": 0.2315, + "step": 6296 + }, + { + "epoch": 0.36, + "grad_norm": 0.5344701290774023, + "learning_rate": 1.4761799492463516e-05, + "loss": 0.381, + "step": 6297 + }, + { + "epoch": 0.36, + "grad_norm": 0.45486931737761455, + "learning_rate": 1.4760163020511986e-05, + "loss": 0.3881, + "step": 6298 + }, + { + "epoch": 0.36, + "grad_norm": 0.31770712221775865, + "learning_rate": 1.4758526383716219e-05, + "loss": 0.2241, + "step": 6299 + }, + { + "epoch": 0.36, + "grad_norm": 1.1779532369599066, + "learning_rate": 1.4756889582132886e-05, + "loss": 0.7267, + "step": 6300 + }, + { + "epoch": 0.36, + "grad_norm": 0.25416641771855647, + "learning_rate": 1.4755252615818671e-05, + "loss": 0.2218, + "step": 6301 + }, + { + "epoch": 0.36, + "grad_norm": 0.28196065175567037, + "learning_rate": 1.4753615484830261e-05, + "loss": 0.2354, + "step": 6302 + }, + { + "epoch": 0.36, + "grad_norm": 0.6902396292135428, + "learning_rate": 1.4751978189224354e-05, + "loss": 0.4771, + "step": 6303 + }, + { + "epoch": 0.36, + "grad_norm": 1.0221054595371608, + "learning_rate": 1.4750340729057646e-05, + "loss": 0.728, + "step": 6304 + }, + { + "epoch": 0.36, + "grad_norm": 0.3009926847405351, + "learning_rate": 1.4748703104386843e-05, + "loss": 0.215, + "step": 6305 + }, + { + "epoch": 0.36, + "grad_norm": 0.32641194854409294, + "learning_rate": 1.4747065315268655e-05, + "loss": 0.2941, + "step": 6306 + }, + { + "epoch": 0.36, + "grad_norm": 0.4241970288360463, + "learning_rate": 1.4745427361759801e-05, + "loss": 0.3376, + "step": 6307 + }, + { + "epoch": 0.36, + "grad_norm": 0.45701611260847547, + "learning_rate": 1.4743789243916999e-05, + "loss": 0.2903, + "step": 6308 + }, + { + "epoch": 0.36, + "grad_norm": 0.2999184950849396, + "learning_rate": 1.4742150961796981e-05, + "loss": 0.2455, + "step": 6309 + }, + { + "epoch": 0.36, + "grad_norm": 0.45425353186377854, + "learning_rate": 1.4740512515456479e-05, + "loss": 0.3645, + "step": 6310 + }, + { + "epoch": 0.36, + "grad_norm": 0.374441697495103, + "learning_rate": 1.4738873904952232e-05, + "loss": 0.2927, + "step": 6311 + }, + { + "epoch": 0.36, + "grad_norm": 0.8017812489117565, + "learning_rate": 1.4737235130340985e-05, + "loss": 0.4057, + "step": 6312 + }, + { + "epoch": 0.36, + "grad_norm": 0.4483513029265171, + "learning_rate": 1.473559619167949e-05, + "loss": 0.3609, + "step": 6313 + }, + { + "epoch": 0.36, + "grad_norm": 0.30337256483409936, + "learning_rate": 1.4733957089024502e-05, + "loss": 0.2775, + "step": 6314 + }, + { + "epoch": 0.36, + "grad_norm": 0.27788517511323146, + "learning_rate": 1.4732317822432782e-05, + "loss": 0.1849, + "step": 6315 + }, + { + "epoch": 0.36, + "grad_norm": 1.2877189024995006, + "learning_rate": 1.4730678391961102e-05, + "loss": 0.7865, + "step": 6316 + }, + { + "epoch": 0.36, + "grad_norm": 0.3273282971439404, + "learning_rate": 1.472903879766623e-05, + "loss": 0.2822, + "step": 6317 + }, + { + "epoch": 0.36, + "grad_norm": 0.36865587604384104, + "learning_rate": 1.4727399039604951e-05, + "loss": 0.2766, + "step": 6318 + }, + { + "epoch": 0.36, + "grad_norm": 0.6217638120111219, + "learning_rate": 1.4725759117834045e-05, + "loss": 0.4275, + "step": 6319 + }, + { + "epoch": 0.36, + "grad_norm": 0.34169170317511, + "learning_rate": 1.4724119032410305e-05, + "loss": 0.3136, + "step": 6320 + }, + { + "epoch": 0.36, + "grad_norm": 0.30920422479643384, + "learning_rate": 1.4722478783390522e-05, + "loss": 0.1771, + "step": 6321 + }, + { + "epoch": 0.36, + "grad_norm": 0.34775308402691785, + "learning_rate": 1.472083837083151e-05, + "loss": 0.2888, + "step": 6322 + }, + { + "epoch": 0.36, + "grad_norm": 0.3193141427715202, + "learning_rate": 1.471919779479006e-05, + "loss": 0.256, + "step": 6323 + }, + { + "epoch": 0.36, + "grad_norm": 1.1400092958513408, + "learning_rate": 1.4717557055322997e-05, + "loss": 0.7106, + "step": 6324 + }, + { + "epoch": 0.36, + "grad_norm": 0.40323223147712445, + "learning_rate": 1.4715916152487135e-05, + "loss": 0.3131, + "step": 6325 + }, + { + "epoch": 0.36, + "grad_norm": 0.4250005070452697, + "learning_rate": 1.47142750863393e-05, + "loss": 0.2852, + "step": 6326 + }, + { + "epoch": 0.36, + "grad_norm": 0.3343011248837186, + "learning_rate": 1.471263385693632e-05, + "loss": 0.2307, + "step": 6327 + }, + { + "epoch": 0.36, + "grad_norm": 0.3055552288905154, + "learning_rate": 1.4710992464335034e-05, + "loss": 0.1793, + "step": 6328 + }, + { + "epoch": 0.36, + "grad_norm": 0.40306747549933275, + "learning_rate": 1.4709350908592281e-05, + "loss": 0.2806, + "step": 6329 + }, + { + "epoch": 0.36, + "grad_norm": 0.32815408215415465, + "learning_rate": 1.4707709189764909e-05, + "loss": 0.3094, + "step": 6330 + }, + { + "epoch": 0.36, + "grad_norm": 0.6956160911594513, + "learning_rate": 1.470606730790977e-05, + "loss": 0.356, + "step": 6331 + }, + { + "epoch": 0.36, + "grad_norm": 0.3454136921848711, + "learning_rate": 1.4704425263083722e-05, + "loss": 0.2861, + "step": 6332 + }, + { + "epoch": 0.36, + "grad_norm": 0.30035263482807145, + "learning_rate": 1.470278305534363e-05, + "loss": 0.2579, + "step": 6333 + }, + { + "epoch": 0.36, + "grad_norm": 0.4375086593799831, + "learning_rate": 1.4701140684746363e-05, + "loss": 0.3229, + "step": 6334 + }, + { + "epoch": 0.36, + "grad_norm": 0.32172857440332125, + "learning_rate": 1.4699498151348797e-05, + "loss": 0.2142, + "step": 6335 + }, + { + "epoch": 0.36, + "grad_norm": 0.6809092715571192, + "learning_rate": 1.469785545520781e-05, + "loss": 0.4272, + "step": 6336 + }, + { + "epoch": 0.36, + "grad_norm": 0.3487777939185845, + "learning_rate": 1.469621259638029e-05, + "loss": 0.3407, + "step": 6337 + }, + { + "epoch": 0.36, + "grad_norm": 0.29453832274472436, + "learning_rate": 1.4694569574923132e-05, + "loss": 0.2134, + "step": 6338 + }, + { + "epoch": 0.36, + "grad_norm": 0.8308626976224194, + "learning_rate": 1.4692926390893234e-05, + "loss": 0.5887, + "step": 6339 + }, + { + "epoch": 0.36, + "grad_norm": 0.39159026271164604, + "learning_rate": 1.469128304434749e-05, + "loss": 0.3049, + "step": 6340 + }, + { + "epoch": 0.36, + "grad_norm": 0.24883858751617285, + "learning_rate": 1.4689639535342823e-05, + "loss": 0.1906, + "step": 6341 + }, + { + "epoch": 0.36, + "grad_norm": 0.5107631135808215, + "learning_rate": 1.4687995863936135e-05, + "loss": 0.3797, + "step": 6342 + }, + { + "epoch": 0.36, + "grad_norm": 0.6558962140201424, + "learning_rate": 1.4686352030184354e-05, + "loss": 0.4598, + "step": 6343 + }, + { + "epoch": 0.36, + "grad_norm": 0.32642046172243194, + "learning_rate": 1.4684708034144403e-05, + "loss": 0.174, + "step": 6344 + }, + { + "epoch": 0.36, + "grad_norm": 0.3823814532754202, + "learning_rate": 1.4683063875873215e-05, + "loss": 0.3208, + "step": 6345 + }, + { + "epoch": 0.36, + "grad_norm": 0.4802433585379294, + "learning_rate": 1.4681419555427727e-05, + "loss": 0.4175, + "step": 6346 + }, + { + "epoch": 0.36, + "grad_norm": 0.521892647783289, + "learning_rate": 1.467977507286488e-05, + "loss": 0.3415, + "step": 6347 + }, + { + "epoch": 0.36, + "grad_norm": 0.3458481345611792, + "learning_rate": 1.4678130428241623e-05, + "loss": 0.2438, + "step": 6348 + }, + { + "epoch": 0.36, + "grad_norm": 0.3032108143688598, + "learning_rate": 1.4676485621614913e-05, + "loss": 0.2495, + "step": 6349 + }, + { + "epoch": 0.36, + "grad_norm": 0.5573975802181028, + "learning_rate": 1.4674840653041706e-05, + "loss": 0.3531, + "step": 6350 + }, + { + "epoch": 0.36, + "grad_norm": 0.37910874435426417, + "learning_rate": 1.4673195522578967e-05, + "loss": 0.2586, + "step": 6351 + }, + { + "epoch": 0.36, + "grad_norm": 1.1453819020562075, + "learning_rate": 1.467155023028367e-05, + "loss": 0.8029, + "step": 6352 + }, + { + "epoch": 0.37, + "grad_norm": 0.3225394768457753, + "learning_rate": 1.4669904776212786e-05, + "loss": 0.301, + "step": 6353 + }, + { + "epoch": 0.37, + "grad_norm": 0.4135206494047781, + "learning_rate": 1.4668259160423305e-05, + "loss": 0.2995, + "step": 6354 + }, + { + "epoch": 0.37, + "grad_norm": 0.30632042423602535, + "learning_rate": 1.4666613382972205e-05, + "loss": 0.2069, + "step": 6355 + }, + { + "epoch": 0.37, + "grad_norm": 0.38142377744009864, + "learning_rate": 1.4664967443916489e-05, + "loss": 0.3089, + "step": 6356 + }, + { + "epoch": 0.37, + "grad_norm": 0.3386038426663132, + "learning_rate": 1.4663321343313148e-05, + "loss": 0.2622, + "step": 6357 + }, + { + "epoch": 0.37, + "grad_norm": 0.45717911991857507, + "learning_rate": 1.4661675081219191e-05, + "loss": 0.3994, + "step": 6358 + }, + { + "epoch": 0.37, + "grad_norm": 0.39907584698503695, + "learning_rate": 1.4660028657691626e-05, + "loss": 0.3111, + "step": 6359 + }, + { + "epoch": 0.37, + "grad_norm": 0.4939593127745104, + "learning_rate": 1.465838207278747e-05, + "loss": 0.3689, + "step": 6360 + }, + { + "epoch": 0.37, + "grad_norm": 0.2352579898444018, + "learning_rate": 1.4656735326563738e-05, + "loss": 0.1747, + "step": 6361 + }, + { + "epoch": 0.37, + "grad_norm": 0.59732590291894, + "learning_rate": 1.4655088419077466e-05, + "loss": 0.3185, + "step": 6362 + }, + { + "epoch": 0.37, + "grad_norm": 0.39752606248880057, + "learning_rate": 1.4653441350385682e-05, + "loss": 0.3149, + "step": 6363 + }, + { + "epoch": 0.37, + "grad_norm": 0.4368499204854432, + "learning_rate": 1.4651794120545424e-05, + "loss": 0.3275, + "step": 6364 + }, + { + "epoch": 0.37, + "grad_norm": 0.3946428941104599, + "learning_rate": 1.4650146729613735e-05, + "loss": 0.311, + "step": 6365 + }, + { + "epoch": 0.37, + "grad_norm": 0.4052584218015384, + "learning_rate": 1.4648499177647665e-05, + "loss": 0.3519, + "step": 6366 + }, + { + "epoch": 0.37, + "grad_norm": 0.2228152253683647, + "learning_rate": 1.4646851464704269e-05, + "loss": 0.0727, + "step": 6367 + }, + { + "epoch": 0.37, + "grad_norm": 0.55650450715017, + "learning_rate": 1.4645203590840607e-05, + "loss": 0.3699, + "step": 6368 + }, + { + "epoch": 0.37, + "grad_norm": 0.2782351877624767, + "learning_rate": 1.4643555556113742e-05, + "loss": 0.2925, + "step": 6369 + }, + { + "epoch": 0.37, + "grad_norm": 0.6856577889773536, + "learning_rate": 1.4641907360580749e-05, + "loss": 0.3796, + "step": 6370 + }, + { + "epoch": 0.37, + "grad_norm": 0.49911463412678736, + "learning_rate": 1.4640259004298706e-05, + "loss": 0.3838, + "step": 6371 + }, + { + "epoch": 0.37, + "grad_norm": 0.3727450596138779, + "learning_rate": 1.4638610487324688e-05, + "loss": 0.333, + "step": 6372 + }, + { + "epoch": 0.37, + "grad_norm": 0.2839276525223571, + "learning_rate": 1.4636961809715793e-05, + "loss": 0.2624, + "step": 6373 + }, + { + "epoch": 0.37, + "grad_norm": 0.24988639619999695, + "learning_rate": 1.463531297152911e-05, + "loss": 0.1668, + "step": 6374 + }, + { + "epoch": 0.37, + "grad_norm": 0.5883872688338556, + "learning_rate": 1.4633663972821737e-05, + "loss": 0.4016, + "step": 6375 + }, + { + "epoch": 0.37, + "grad_norm": 0.969636159928934, + "learning_rate": 1.4632014813650779e-05, + "loss": 0.4825, + "step": 6376 + }, + { + "epoch": 0.37, + "grad_norm": 0.27400554273030164, + "learning_rate": 1.4630365494073348e-05, + "loss": 0.2349, + "step": 6377 + }, + { + "epoch": 0.37, + "grad_norm": 0.5706865974074818, + "learning_rate": 1.4628716014146558e-05, + "loss": 0.4436, + "step": 6378 + }, + { + "epoch": 0.37, + "grad_norm": 0.28185065914961815, + "learning_rate": 1.4627066373927534e-05, + "loss": 0.1703, + "step": 6379 + }, + { + "epoch": 0.37, + "grad_norm": 0.5918352536787485, + "learning_rate": 1.4625416573473397e-05, + "loss": 0.2241, + "step": 6380 + }, + { + "epoch": 0.37, + "grad_norm": 0.2945602690656717, + "learning_rate": 1.4623766612841286e-05, + "loss": 0.2792, + "step": 6381 + }, + { + "epoch": 0.37, + "grad_norm": 1.245141698601791, + "learning_rate": 1.4622116492088335e-05, + "loss": 0.4757, + "step": 6382 + }, + { + "epoch": 0.37, + "grad_norm": 0.7575767671387406, + "learning_rate": 1.4620466211271686e-05, + "loss": 0.4512, + "step": 6383 + }, + { + "epoch": 0.37, + "grad_norm": 0.42239162484195003, + "learning_rate": 1.4618815770448493e-05, + "loss": 0.2349, + "step": 6384 + }, + { + "epoch": 0.37, + "grad_norm": 0.28014002852446207, + "learning_rate": 1.461716516967591e-05, + "loss": 0.2347, + "step": 6385 + }, + { + "epoch": 0.37, + "grad_norm": 0.792038715756322, + "learning_rate": 1.4615514409011093e-05, + "loss": 0.4452, + "step": 6386 + }, + { + "epoch": 0.37, + "grad_norm": 0.7042697472461529, + "learning_rate": 1.4613863488511214e-05, + "loss": 0.2951, + "step": 6387 + }, + { + "epoch": 0.37, + "grad_norm": 1.090984118378016, + "learning_rate": 1.4612212408233438e-05, + "loss": 0.5841, + "step": 6388 + }, + { + "epoch": 0.37, + "grad_norm": 0.3318163237944138, + "learning_rate": 1.4610561168234942e-05, + "loss": 0.3015, + "step": 6389 + }, + { + "epoch": 0.37, + "grad_norm": 0.36470561968264775, + "learning_rate": 1.4608909768572917e-05, + "loss": 0.2485, + "step": 6390 + }, + { + "epoch": 0.37, + "grad_norm": 0.5483411965792786, + "learning_rate": 1.4607258209304542e-05, + "loss": 0.2347, + "step": 6391 + }, + { + "epoch": 0.37, + "grad_norm": 0.4024074470010744, + "learning_rate": 1.4605606490487013e-05, + "loss": 0.2603, + "step": 6392 + }, + { + "epoch": 0.37, + "grad_norm": 0.3917554003411267, + "learning_rate": 1.4603954612177532e-05, + "loss": 0.231, + "step": 6393 + }, + { + "epoch": 0.37, + "grad_norm": 1.2772938766587616, + "learning_rate": 1.4602302574433297e-05, + "loss": 0.486, + "step": 6394 + }, + { + "epoch": 0.37, + "grad_norm": 0.6668301917465416, + "learning_rate": 1.4600650377311523e-05, + "loss": 0.4474, + "step": 6395 + }, + { + "epoch": 0.37, + "grad_norm": 0.39712967225015333, + "learning_rate": 1.4598998020869426e-05, + "loss": 0.3148, + "step": 6396 + }, + { + "epoch": 0.37, + "grad_norm": 0.28444972237384936, + "learning_rate": 1.4597345505164222e-05, + "loss": 0.218, + "step": 6397 + }, + { + "epoch": 0.37, + "grad_norm": 0.5753183483183651, + "learning_rate": 1.4595692830253143e-05, + "loss": 0.2875, + "step": 6398 + }, + { + "epoch": 0.37, + "grad_norm": 0.5209561480298429, + "learning_rate": 1.459403999619342e-05, + "loss": 0.3033, + "step": 6399 + }, + { + "epoch": 0.37, + "grad_norm": 0.47041709352127614, + "learning_rate": 1.4592387003042287e-05, + "loss": 0.2843, + "step": 6400 + }, + { + "epoch": 0.37, + "grad_norm": 1.4217366843048336, + "learning_rate": 1.4590733850856989e-05, + "loss": 0.7602, + "step": 6401 + }, + { + "epoch": 0.37, + "grad_norm": 0.3419329387584038, + "learning_rate": 1.4589080539694778e-05, + "loss": 0.2929, + "step": 6402 + }, + { + "epoch": 0.37, + "grad_norm": 0.6029781463091936, + "learning_rate": 1.4587427069612902e-05, + "loss": 0.3747, + "step": 6403 + }, + { + "epoch": 0.37, + "grad_norm": 0.3860877146598658, + "learning_rate": 1.4585773440668626e-05, + "loss": 0.3044, + "step": 6404 + }, + { + "epoch": 0.37, + "grad_norm": 0.3332149808196481, + "learning_rate": 1.4584119652919213e-05, + "loss": 0.2862, + "step": 6405 + }, + { + "epoch": 0.37, + "grad_norm": 0.2517107564009192, + "learning_rate": 1.4582465706421935e-05, + "loss": 0.1132, + "step": 6406 + }, + { + "epoch": 0.37, + "grad_norm": 0.9180317484308079, + "learning_rate": 1.4580811601234067e-05, + "loss": 0.55, + "step": 6407 + }, + { + "epoch": 0.37, + "grad_norm": 0.4199511381911743, + "learning_rate": 1.4579157337412886e-05, + "loss": 0.2734, + "step": 6408 + }, + { + "epoch": 0.37, + "grad_norm": 0.40497014891679506, + "learning_rate": 1.4577502915015687e-05, + "loss": 0.3179, + "step": 6409 + }, + { + "epoch": 0.37, + "grad_norm": 0.7870272373752119, + "learning_rate": 1.4575848334099756e-05, + "loss": 0.3261, + "step": 6410 + }, + { + "epoch": 0.37, + "grad_norm": 0.30582763377378736, + "learning_rate": 1.4574193594722394e-05, + "loss": 0.256, + "step": 6411 + }, + { + "epoch": 0.37, + "grad_norm": 0.3369040954406996, + "learning_rate": 1.4572538696940908e-05, + "loss": 0.2748, + "step": 6412 + }, + { + "epoch": 0.37, + "grad_norm": 0.36361392670595094, + "learning_rate": 1.4570883640812602e-05, + "loss": 0.2604, + "step": 6413 + }, + { + "epoch": 0.37, + "grad_norm": 0.3875128108510935, + "learning_rate": 1.456922842639479e-05, + "loss": 0.2852, + "step": 6414 + }, + { + "epoch": 0.37, + "grad_norm": 0.8988967262988671, + "learning_rate": 1.4567573053744797e-05, + "loss": 0.4492, + "step": 6415 + }, + { + "epoch": 0.37, + "grad_norm": 0.40165688387675075, + "learning_rate": 1.4565917522919944e-05, + "loss": 0.2684, + "step": 6416 + }, + { + "epoch": 0.37, + "grad_norm": 0.38594474836512377, + "learning_rate": 1.4564261833977563e-05, + "loss": 0.3048, + "step": 6417 + }, + { + "epoch": 0.37, + "grad_norm": 0.2507980036522095, + "learning_rate": 1.4562605986974991e-05, + "loss": 0.194, + "step": 6418 + }, + { + "epoch": 0.37, + "grad_norm": 0.872875765589793, + "learning_rate": 1.456094998196957e-05, + "loss": 0.4096, + "step": 6419 + }, + { + "epoch": 0.37, + "grad_norm": 0.358181410493378, + "learning_rate": 1.4559293819018648e-05, + "loss": 0.2836, + "step": 6420 + }, + { + "epoch": 0.37, + "grad_norm": 0.45407158678411264, + "learning_rate": 1.4557637498179577e-05, + "loss": 0.346, + "step": 6421 + }, + { + "epoch": 0.37, + "grad_norm": 0.9255722275983588, + "learning_rate": 1.4555981019509716e-05, + "loss": 0.5238, + "step": 6422 + }, + { + "epoch": 0.37, + "grad_norm": 0.29691564293618045, + "learning_rate": 1.4554324383066427e-05, + "loss": 0.2093, + "step": 6423 + }, + { + "epoch": 0.37, + "grad_norm": 0.4785704057795955, + "learning_rate": 1.4552667588907082e-05, + "loss": 0.3371, + "step": 6424 + }, + { + "epoch": 0.37, + "grad_norm": 0.2952211128804953, + "learning_rate": 1.4551010637089054e-05, + "loss": 0.2773, + "step": 6425 + }, + { + "epoch": 0.37, + "grad_norm": 0.36035741477242134, + "learning_rate": 1.454935352766972e-05, + "loss": 0.2268, + "step": 6426 + }, + { + "epoch": 0.37, + "grad_norm": 0.8603431400721523, + "learning_rate": 1.4547696260706474e-05, + "loss": 0.4749, + "step": 6427 + }, + { + "epoch": 0.37, + "grad_norm": 0.45518616624458275, + "learning_rate": 1.4546038836256698e-05, + "loss": 0.3162, + "step": 6428 + }, + { + "epoch": 0.37, + "grad_norm": 0.32397378870781374, + "learning_rate": 1.4544381254377794e-05, + "loss": 0.2011, + "step": 6429 + }, + { + "epoch": 0.37, + "grad_norm": 0.48798619337408167, + "learning_rate": 1.4542723515127161e-05, + "loss": 0.2987, + "step": 6430 + }, + { + "epoch": 0.37, + "grad_norm": 0.46757065365840805, + "learning_rate": 1.4541065618562211e-05, + "loss": 0.301, + "step": 6431 + }, + { + "epoch": 0.37, + "grad_norm": 0.39111537126054996, + "learning_rate": 1.4539407564740353e-05, + "loss": 0.2013, + "step": 6432 + }, + { + "epoch": 0.37, + "grad_norm": 0.4772229613888993, + "learning_rate": 1.4537749353719006e-05, + "loss": 0.3447, + "step": 6433 + }, + { + "epoch": 0.37, + "grad_norm": 0.7128445538916925, + "learning_rate": 1.4536090985555595e-05, + "loss": 0.4374, + "step": 6434 + }, + { + "epoch": 0.37, + "grad_norm": 0.37996740838238047, + "learning_rate": 1.4534432460307546e-05, + "loss": 0.284, + "step": 6435 + }, + { + "epoch": 0.37, + "grad_norm": 0.3521223528144348, + "learning_rate": 1.4532773778032297e-05, + "loss": 0.2757, + "step": 6436 + }, + { + "epoch": 0.37, + "grad_norm": 0.4727501869932283, + "learning_rate": 1.4531114938787285e-05, + "loss": 0.3093, + "step": 6437 + }, + { + "epoch": 0.37, + "grad_norm": 0.34669175481499864, + "learning_rate": 1.452945594262996e-05, + "loss": 0.2832, + "step": 6438 + }, + { + "epoch": 0.37, + "grad_norm": 0.3986170161984597, + "learning_rate": 1.452779678961777e-05, + "loss": 0.2385, + "step": 6439 + }, + { + "epoch": 0.37, + "grad_norm": 0.37509714723728876, + "learning_rate": 1.4526137479808173e-05, + "loss": 0.3058, + "step": 6440 + }, + { + "epoch": 0.37, + "grad_norm": 0.3784935823336974, + "learning_rate": 1.452447801325863e-05, + "loss": 0.2805, + "step": 6441 + }, + { + "epoch": 0.37, + "grad_norm": 0.9648337594394374, + "learning_rate": 1.4522818390026605e-05, + "loss": 0.4174, + "step": 6442 + }, + { + "epoch": 0.37, + "grad_norm": 0.36332205508671817, + "learning_rate": 1.4521158610169575e-05, + "loss": 0.3118, + "step": 6443 + }, + { + "epoch": 0.37, + "grad_norm": 0.3043988410895945, + "learning_rate": 1.4519498673745019e-05, + "loss": 0.2756, + "step": 6444 + }, + { + "epoch": 0.37, + "grad_norm": 0.3661883748618258, + "learning_rate": 1.4517838580810413e-05, + "loss": 0.2444, + "step": 6445 + }, + { + "epoch": 0.37, + "grad_norm": 0.8097337394192086, + "learning_rate": 1.4516178331423257e-05, + "loss": 0.576, + "step": 6446 + }, + { + "epoch": 0.37, + "grad_norm": 0.33163644933954645, + "learning_rate": 1.4514517925641035e-05, + "loss": 0.3033, + "step": 6447 + }, + { + "epoch": 0.37, + "grad_norm": 0.5218645713887373, + "learning_rate": 1.4512857363521254e-05, + "loss": 0.393, + "step": 6448 + }, + { + "epoch": 0.37, + "grad_norm": 0.3793598909180065, + "learning_rate": 1.4511196645121414e-05, + "loss": 0.319, + "step": 6449 + }, + { + "epoch": 0.37, + "grad_norm": 0.3896431397236735, + "learning_rate": 1.450953577049903e-05, + "loss": 0.2771, + "step": 6450 + }, + { + "epoch": 0.37, + "grad_norm": 0.2705214298693227, + "learning_rate": 1.4507874739711616e-05, + "loss": 0.2298, + "step": 6451 + }, + { + "epoch": 0.37, + "grad_norm": 0.34851249948794466, + "learning_rate": 1.4506213552816694e-05, + "loss": 0.2555, + "step": 6452 + }, + { + "epoch": 0.37, + "grad_norm": 0.39110676315148374, + "learning_rate": 1.4504552209871791e-05, + "loss": 0.2659, + "step": 6453 + }, + { + "epoch": 0.37, + "grad_norm": 0.5909154208429732, + "learning_rate": 1.4502890710934438e-05, + "loss": 0.4545, + "step": 6454 + }, + { + "epoch": 0.37, + "grad_norm": 0.8661006074737951, + "learning_rate": 1.4501229056062174e-05, + "loss": 0.36, + "step": 6455 + }, + { + "epoch": 0.37, + "grad_norm": 0.2980737565737271, + "learning_rate": 1.449956724531254e-05, + "loss": 0.2735, + "step": 6456 + }, + { + "epoch": 0.37, + "grad_norm": 0.37860627191593116, + "learning_rate": 1.4497905278743086e-05, + "loss": 0.3064, + "step": 6457 + }, + { + "epoch": 0.37, + "grad_norm": 0.4986638746583317, + "learning_rate": 1.4496243156411367e-05, + "loss": 0.3235, + "step": 6458 + }, + { + "epoch": 0.37, + "grad_norm": 0.36881140079123487, + "learning_rate": 1.4494580878374942e-05, + "loss": 0.2306, + "step": 6459 + }, + { + "epoch": 0.37, + "grad_norm": 0.4396370739515429, + "learning_rate": 1.4492918444691371e-05, + "loss": 0.3544, + "step": 6460 + }, + { + "epoch": 0.37, + "grad_norm": 1.4881242453846941, + "learning_rate": 1.4491255855418228e-05, + "loss": 0.4526, + "step": 6461 + }, + { + "epoch": 0.37, + "grad_norm": 0.31509315415532924, + "learning_rate": 1.4489593110613087e-05, + "loss": 0.2344, + "step": 6462 + }, + { + "epoch": 0.37, + "grad_norm": 0.4766555392210357, + "learning_rate": 1.4487930210333532e-05, + "loss": 0.29, + "step": 6463 + }, + { + "epoch": 0.37, + "grad_norm": 0.3242238172436043, + "learning_rate": 1.4486267154637146e-05, + "loss": 0.2684, + "step": 6464 + }, + { + "epoch": 0.37, + "grad_norm": 0.38454949832911445, + "learning_rate": 1.4484603943581523e-05, + "loss": 0.1761, + "step": 6465 + }, + { + "epoch": 0.37, + "grad_norm": 0.9162557910018109, + "learning_rate": 1.4482940577224254e-05, + "loss": 0.4149, + "step": 6466 + }, + { + "epoch": 0.37, + "grad_norm": 0.9944619174618472, + "learning_rate": 1.4481277055622948e-05, + "loss": 0.3897, + "step": 6467 + }, + { + "epoch": 0.37, + "grad_norm": 0.3133210801429484, + "learning_rate": 1.4479613378835211e-05, + "loss": 0.2176, + "step": 6468 + }, + { + "epoch": 0.37, + "grad_norm": 0.49623665028789815, + "learning_rate": 1.4477949546918655e-05, + "loss": 0.4053, + "step": 6469 + }, + { + "epoch": 0.37, + "grad_norm": 0.28227034323549527, + "learning_rate": 1.4476285559930899e-05, + "loss": 0.1809, + "step": 6470 + }, + { + "epoch": 0.37, + "grad_norm": 0.5943690582874419, + "learning_rate": 1.4474621417929566e-05, + "loss": 0.2999, + "step": 6471 + }, + { + "epoch": 0.37, + "grad_norm": 0.4368804350447025, + "learning_rate": 1.4472957120972284e-05, + "loss": 0.2854, + "step": 6472 + }, + { + "epoch": 0.37, + "grad_norm": 1.0839653605743804, + "learning_rate": 1.4471292669116692e-05, + "loss": 0.4932, + "step": 6473 + }, + { + "epoch": 0.37, + "grad_norm": 0.6103092418262205, + "learning_rate": 1.4469628062420427e-05, + "loss": 0.3787, + "step": 6474 + }, + { + "epoch": 0.37, + "grad_norm": 0.2815049852060284, + "learning_rate": 1.4467963300941135e-05, + "loss": 0.1699, + "step": 6475 + }, + { + "epoch": 0.37, + "grad_norm": 0.42956074118826143, + "learning_rate": 1.4466298384736463e-05, + "loss": 0.32, + "step": 6476 + }, + { + "epoch": 0.37, + "grad_norm": 0.42963144754338745, + "learning_rate": 1.4464633313864073e-05, + "loss": 0.2869, + "step": 6477 + }, + { + "epoch": 0.37, + "grad_norm": 0.5424138447706254, + "learning_rate": 1.4462968088381621e-05, + "loss": 0.327, + "step": 6478 + }, + { + "epoch": 0.37, + "grad_norm": 0.5458547157279315, + "learning_rate": 1.4461302708346778e-05, + "loss": 0.3884, + "step": 6479 + }, + { + "epoch": 0.37, + "grad_norm": 0.32194605311238567, + "learning_rate": 1.4459637173817214e-05, + "loss": 0.2684, + "step": 6480 + }, + { + "epoch": 0.37, + "grad_norm": 0.4947001893698327, + "learning_rate": 1.4457971484850604e-05, + "loss": 0.2686, + "step": 6481 + }, + { + "epoch": 0.37, + "grad_norm": 0.30792978417290967, + "learning_rate": 1.4456305641504636e-05, + "loss": 0.1929, + "step": 6482 + }, + { + "epoch": 0.37, + "grad_norm": 0.39304218999952134, + "learning_rate": 1.4454639643836993e-05, + "loss": 0.3088, + "step": 6483 + }, + { + "epoch": 0.37, + "grad_norm": 0.350822152919814, + "learning_rate": 1.4452973491905372e-05, + "loss": 0.3195, + "step": 6484 + }, + { + "epoch": 0.37, + "grad_norm": 0.7457583251157986, + "learning_rate": 1.4451307185767469e-05, + "loss": 0.4278, + "step": 6485 + }, + { + "epoch": 0.37, + "grad_norm": 0.9576714232110599, + "learning_rate": 1.4449640725480991e-05, + "loss": 0.471, + "step": 6486 + }, + { + "epoch": 0.37, + "grad_norm": 0.38426895057800664, + "learning_rate": 1.4447974111103645e-05, + "loss": 0.285, + "step": 6487 + }, + { + "epoch": 0.37, + "grad_norm": 0.22401458190615092, + "learning_rate": 1.4446307342693149e-05, + "loss": 0.2138, + "step": 6488 + }, + { + "epoch": 0.37, + "grad_norm": 0.5444407977701945, + "learning_rate": 1.4444640420307217e-05, + "loss": 0.319, + "step": 6489 + }, + { + "epoch": 0.37, + "grad_norm": 0.4500081741351894, + "learning_rate": 1.4442973344003582e-05, + "loss": 0.3505, + "step": 6490 + }, + { + "epoch": 0.37, + "grad_norm": 1.0057233925466538, + "learning_rate": 1.4441306113839971e-05, + "loss": 0.5737, + "step": 6491 + }, + { + "epoch": 0.37, + "grad_norm": 0.34070622377314463, + "learning_rate": 1.4439638729874119e-05, + "loss": 0.2789, + "step": 6492 + }, + { + "epoch": 0.37, + "grad_norm": 0.39274503048589104, + "learning_rate": 1.4437971192163768e-05, + "loss": 0.3363, + "step": 6493 + }, + { + "epoch": 0.37, + "grad_norm": 0.260970503088254, + "learning_rate": 1.4436303500766667e-05, + "loss": 0.139, + "step": 6494 + }, + { + "epoch": 0.37, + "grad_norm": 0.4353457494525623, + "learning_rate": 1.4434635655740566e-05, + "loss": 0.3152, + "step": 6495 + }, + { + "epoch": 0.37, + "grad_norm": 0.36338683442474146, + "learning_rate": 1.4432967657143223e-05, + "loss": 0.3243, + "step": 6496 + }, + { + "epoch": 0.37, + "grad_norm": 1.4910761065746299, + "learning_rate": 1.44312995050324e-05, + "loss": 0.8249, + "step": 6497 + }, + { + "epoch": 0.37, + "grad_norm": 0.3636983831741691, + "learning_rate": 1.4429631199465866e-05, + "loss": 0.2165, + "step": 6498 + }, + { + "epoch": 0.37, + "grad_norm": 0.6184276802434224, + "learning_rate": 1.4427962740501396e-05, + "loss": 0.4225, + "step": 6499 + }, + { + "epoch": 0.37, + "grad_norm": 0.4139216454007609, + "learning_rate": 1.4426294128196763e-05, + "loss": 0.3305, + "step": 6500 + }, + { + "epoch": 0.37, + "grad_norm": 0.49001496573354486, + "learning_rate": 1.4424625362609757e-05, + "loss": 0.2682, + "step": 6501 + }, + { + "epoch": 0.37, + "grad_norm": 0.27947362207932114, + "learning_rate": 1.4422956443798165e-05, + "loss": 0.2087, + "step": 6502 + }, + { + "epoch": 0.37, + "grad_norm": 0.5741699302860873, + "learning_rate": 1.4421287371819781e-05, + "loss": 0.4186, + "step": 6503 + }, + { + "epoch": 0.37, + "grad_norm": 0.7860385506372445, + "learning_rate": 1.4419618146732404e-05, + "loss": 0.2432, + "step": 6504 + }, + { + "epoch": 0.37, + "grad_norm": 0.4018869658660575, + "learning_rate": 1.4417948768593842e-05, + "loss": 0.3231, + "step": 6505 + }, + { + "epoch": 0.37, + "grad_norm": 0.5974194600768572, + "learning_rate": 1.4416279237461903e-05, + "loss": 0.4459, + "step": 6506 + }, + { + "epoch": 0.37, + "grad_norm": 0.5043837478835471, + "learning_rate": 1.4414609553394408e-05, + "loss": 0.2003, + "step": 6507 + }, + { + "epoch": 0.37, + "grad_norm": 0.3785285725739497, + "learning_rate": 1.441293971644917e-05, + "loss": 0.2199, + "step": 6508 + }, + { + "epoch": 0.37, + "grad_norm": 1.1012730369361647, + "learning_rate": 1.4411269726684022e-05, + "loss": 0.7594, + "step": 6509 + }, + { + "epoch": 0.37, + "grad_norm": 0.8161801633283232, + "learning_rate": 1.4409599584156791e-05, + "loss": 0.4433, + "step": 6510 + }, + { + "epoch": 0.37, + "grad_norm": 0.3898556198374485, + "learning_rate": 1.4407929288925316e-05, + "loss": 0.2459, + "step": 6511 + }, + { + "epoch": 0.37, + "grad_norm": 0.5468015768810729, + "learning_rate": 1.440625884104744e-05, + "loss": 0.3783, + "step": 6512 + }, + { + "epoch": 0.37, + "grad_norm": 0.734511890686694, + "learning_rate": 1.440458824058101e-05, + "loss": 0.3978, + "step": 6513 + }, + { + "epoch": 0.37, + "grad_norm": 0.2729669946052368, + "learning_rate": 1.4402917487583876e-05, + "loss": 0.1975, + "step": 6514 + }, + { + "epoch": 0.37, + "grad_norm": 0.5058855067236363, + "learning_rate": 1.4401246582113904e-05, + "loss": 0.4243, + "step": 6515 + }, + { + "epoch": 0.37, + "grad_norm": 0.2797481012857069, + "learning_rate": 1.4399575524228949e-05, + "loss": 0.237, + "step": 6516 + }, + { + "epoch": 0.37, + "grad_norm": 0.36853983359838033, + "learning_rate": 1.4397904313986881e-05, + "loss": 0.2095, + "step": 6517 + }, + { + "epoch": 0.37, + "grad_norm": 0.8808245431323464, + "learning_rate": 1.439623295144558e-05, + "loss": 0.4242, + "step": 6518 + }, + { + "epoch": 0.37, + "grad_norm": 0.32676114481551416, + "learning_rate": 1.4394561436662917e-05, + "loss": 0.282, + "step": 6519 + }, + { + "epoch": 0.37, + "grad_norm": 0.32006809977392964, + "learning_rate": 1.4392889769696783e-05, + "loss": 0.2367, + "step": 6520 + }, + { + "epoch": 0.37, + "grad_norm": 0.4047428854960159, + "learning_rate": 1.4391217950605063e-05, + "loss": 0.3227, + "step": 6521 + }, + { + "epoch": 0.37, + "grad_norm": 0.36465854255387276, + "learning_rate": 1.4389545979445655e-05, + "loss": 0.245, + "step": 6522 + }, + { + "epoch": 0.37, + "grad_norm": 0.35563849101387063, + "learning_rate": 1.438787385627646e-05, + "loss": 0.3014, + "step": 6523 + }, + { + "epoch": 0.37, + "grad_norm": 0.3814440985840305, + "learning_rate": 1.438620158115538e-05, + "loss": 0.2927, + "step": 6524 + }, + { + "epoch": 0.37, + "grad_norm": 1.1983742937891095, + "learning_rate": 1.438452915414033e-05, + "loss": 0.7023, + "step": 6525 + }, + { + "epoch": 0.37, + "grad_norm": 0.34701945637472476, + "learning_rate": 1.4382856575289223e-05, + "loss": 0.2919, + "step": 6526 + }, + { + "epoch": 0.38, + "grad_norm": 0.35292559953145647, + "learning_rate": 1.438118384465998e-05, + "loss": 0.2924, + "step": 6527 + }, + { + "epoch": 0.38, + "grad_norm": 0.26027699865809883, + "learning_rate": 1.4379510962310532e-05, + "loss": 0.1848, + "step": 6528 + }, + { + "epoch": 0.38, + "grad_norm": 0.35662767213458074, + "learning_rate": 1.4377837928298804e-05, + "loss": 0.3079, + "step": 6529 + }, + { + "epoch": 0.38, + "grad_norm": 0.6596498854621378, + "learning_rate": 1.4376164742682738e-05, + "loss": 0.3599, + "step": 6530 + }, + { + "epoch": 0.38, + "grad_norm": 0.36191791630131526, + "learning_rate": 1.4374491405520274e-05, + "loss": 0.2928, + "step": 6531 + }, + { + "epoch": 0.38, + "grad_norm": 0.3860443428448384, + "learning_rate": 1.4372817916869364e-05, + "loss": 0.3045, + "step": 6532 + }, + { + "epoch": 0.38, + "grad_norm": 0.9970264359008559, + "learning_rate": 1.4371144276787954e-05, + "loss": 0.5673, + "step": 6533 + }, + { + "epoch": 0.38, + "grad_norm": 0.4185340140823584, + "learning_rate": 1.436947048533401e-05, + "loss": 0.2696, + "step": 6534 + }, + { + "epoch": 0.38, + "grad_norm": 0.37333837957749794, + "learning_rate": 1.4367796542565486e-05, + "loss": 0.2877, + "step": 6535 + }, + { + "epoch": 0.38, + "grad_norm": 0.38025057980172355, + "learning_rate": 1.4366122448540361e-05, + "loss": 0.266, + "step": 6536 + }, + { + "epoch": 0.38, + "grad_norm": 0.6973816542088787, + "learning_rate": 1.4364448203316599e-05, + "loss": 0.2136, + "step": 6537 + }, + { + "epoch": 0.38, + "grad_norm": 0.354013344957813, + "learning_rate": 1.4362773806952184e-05, + "loss": 0.2847, + "step": 6538 + }, + { + "epoch": 0.38, + "grad_norm": 0.42870882657679155, + "learning_rate": 1.4361099259505102e-05, + "loss": 0.3461, + "step": 6539 + }, + { + "epoch": 0.38, + "grad_norm": 0.7161912133243448, + "learning_rate": 1.4359424561033337e-05, + "loss": 0.3663, + "step": 6540 + }, + { + "epoch": 0.38, + "grad_norm": 0.32702988657639787, + "learning_rate": 1.435774971159489e-05, + "loss": 0.2704, + "step": 6541 + }, + { + "epoch": 0.38, + "grad_norm": 0.2874840199994187, + "learning_rate": 1.4356074711247759e-05, + "loss": 0.1933, + "step": 6542 + }, + { + "epoch": 0.38, + "grad_norm": 0.3422143523196144, + "learning_rate": 1.4354399560049943e-05, + "loss": 0.2543, + "step": 6543 + }, + { + "epoch": 0.38, + "grad_norm": 0.3620610486020329, + "learning_rate": 1.4352724258059461e-05, + "loss": 0.2882, + "step": 6544 + }, + { + "epoch": 0.38, + "grad_norm": 0.6999998230635959, + "learning_rate": 1.4351048805334325e-05, + "loss": 0.5029, + "step": 6545 + }, + { + "epoch": 0.38, + "grad_norm": 0.7040615510823431, + "learning_rate": 1.4349373201932553e-05, + "loss": 0.4828, + "step": 6546 + }, + { + "epoch": 0.38, + "grad_norm": 0.34328707423243926, + "learning_rate": 1.4347697447912176e-05, + "loss": 0.2311, + "step": 6547 + }, + { + "epoch": 0.38, + "grad_norm": 0.34255042274766273, + "learning_rate": 1.4346021543331224e-05, + "loss": 0.2109, + "step": 6548 + }, + { + "epoch": 0.38, + "grad_norm": 0.9416350640193081, + "learning_rate": 1.4344345488247733e-05, + "loss": 0.5087, + "step": 6549 + }, + { + "epoch": 0.38, + "grad_norm": 0.3524115680822469, + "learning_rate": 1.4342669282719741e-05, + "loss": 0.2236, + "step": 6550 + }, + { + "epoch": 0.38, + "grad_norm": 0.34939936511288033, + "learning_rate": 1.4340992926805304e-05, + "loss": 0.3067, + "step": 6551 + }, + { + "epoch": 0.38, + "grad_norm": 0.9389882276144642, + "learning_rate": 1.4339316420562464e-05, + "loss": 0.4627, + "step": 6552 + }, + { + "epoch": 0.38, + "grad_norm": 0.3572146276189607, + "learning_rate": 1.4337639764049285e-05, + "loss": 0.2025, + "step": 6553 + }, + { + "epoch": 0.38, + "grad_norm": 0.3440940221274594, + "learning_rate": 1.4335962957323827e-05, + "loss": 0.2088, + "step": 6554 + }, + { + "epoch": 0.38, + "grad_norm": 0.43469991652072393, + "learning_rate": 1.433428600044416e-05, + "loss": 0.3331, + "step": 6555 + }, + { + "epoch": 0.38, + "grad_norm": 0.30077868298122107, + "learning_rate": 1.4332608893468351e-05, + "loss": 0.1945, + "step": 6556 + }, + { + "epoch": 0.38, + "grad_norm": 0.6253365617274365, + "learning_rate": 1.4330931636454489e-05, + "loss": 0.4415, + "step": 6557 + }, + { + "epoch": 0.38, + "grad_norm": 0.47640301786281314, + "learning_rate": 1.4329254229460645e-05, + "loss": 0.3846, + "step": 6558 + }, + { + "epoch": 0.38, + "grad_norm": 0.30769115910174644, + "learning_rate": 1.4327576672544917e-05, + "loss": 0.2608, + "step": 6559 + }, + { + "epoch": 0.38, + "grad_norm": 0.24923379904346613, + "learning_rate": 1.432589896576539e-05, + "loss": 0.152, + "step": 6560 + }, + { + "epoch": 0.38, + "grad_norm": 1.1160575763268, + "learning_rate": 1.4324221109180173e-05, + "loss": 0.6775, + "step": 6561 + }, + { + "epoch": 0.38, + "grad_norm": 0.39827529165073156, + "learning_rate": 1.4322543102847362e-05, + "loss": 0.2889, + "step": 6562 + }, + { + "epoch": 0.38, + "grad_norm": 0.4097091571777183, + "learning_rate": 1.432086494682507e-05, + "loss": 0.3128, + "step": 6563 + }, + { + "epoch": 0.38, + "grad_norm": 1.4122864900557979, + "learning_rate": 1.4319186641171412e-05, + "loss": 0.7525, + "step": 6564 + }, + { + "epoch": 0.38, + "grad_norm": 0.345257798485318, + "learning_rate": 1.4317508185944504e-05, + "loss": 0.2947, + "step": 6565 + }, + { + "epoch": 0.38, + "grad_norm": 0.18780929074416594, + "learning_rate": 1.4315829581202474e-05, + "loss": 0.0706, + "step": 6566 + }, + { + "epoch": 0.38, + "grad_norm": 0.37982000063976756, + "learning_rate": 1.431415082700345e-05, + "loss": 0.326, + "step": 6567 + }, + { + "epoch": 0.38, + "grad_norm": 0.37529555056231606, + "learning_rate": 1.4312471923405571e-05, + "loss": 0.2817, + "step": 6568 + }, + { + "epoch": 0.38, + "grad_norm": 0.7558881857203945, + "learning_rate": 1.4310792870466973e-05, + "loss": 0.3752, + "step": 6569 + }, + { + "epoch": 0.38, + "grad_norm": 0.4981673910095424, + "learning_rate": 1.4309113668245804e-05, + "loss": 0.4285, + "step": 6570 + }, + { + "epoch": 0.38, + "grad_norm": 0.32064816753904357, + "learning_rate": 1.4307434316800213e-05, + "loss": 0.2705, + "step": 6571 + }, + { + "epoch": 0.38, + "grad_norm": 0.30413949280789165, + "learning_rate": 1.4305754816188358e-05, + "loss": 0.2559, + "step": 6572 + }, + { + "epoch": 0.38, + "grad_norm": 0.4309987650378867, + "learning_rate": 1.4304075166468396e-05, + "loss": 0.2168, + "step": 6573 + }, + { + "epoch": 0.38, + "grad_norm": 0.40311342743064493, + "learning_rate": 1.43023953676985e-05, + "loss": 0.2705, + "step": 6574 + }, + { + "epoch": 0.38, + "grad_norm": 0.2954206348253972, + "learning_rate": 1.4300715419936834e-05, + "loss": 0.3028, + "step": 6575 + }, + { + "epoch": 0.38, + "grad_norm": 0.9026406553036119, + "learning_rate": 1.4299035323241583e-05, + "loss": 0.5688, + "step": 6576 + }, + { + "epoch": 0.38, + "grad_norm": 0.3587211236933703, + "learning_rate": 1.429735507767092e-05, + "loss": 0.2561, + "step": 6577 + }, + { + "epoch": 0.38, + "grad_norm": 0.26006833779910876, + "learning_rate": 1.4295674683283037e-05, + "loss": 0.2293, + "step": 6578 + }, + { + "epoch": 0.38, + "grad_norm": 0.3906207143635448, + "learning_rate": 1.4293994140136123e-05, + "loss": 0.2902, + "step": 6579 + }, + { + "epoch": 0.38, + "grad_norm": 0.3336786346711805, + "learning_rate": 1.4292313448288377e-05, + "loss": 0.2849, + "step": 6580 + }, + { + "epoch": 0.38, + "grad_norm": 0.7731830224890437, + "learning_rate": 1.4290632607797998e-05, + "loss": 0.4842, + "step": 6581 + }, + { + "epoch": 0.38, + "grad_norm": 0.46191721868987656, + "learning_rate": 1.4288951618723201e-05, + "loss": 0.2651, + "step": 6582 + }, + { + "epoch": 0.38, + "grad_norm": 0.3088949080198425, + "learning_rate": 1.428727048112219e-05, + "loss": 0.2683, + "step": 6583 + }, + { + "epoch": 0.38, + "grad_norm": 1.0472390701706424, + "learning_rate": 1.4285589195053191e-05, + "loss": 0.6867, + "step": 6584 + }, + { + "epoch": 0.38, + "grad_norm": 0.3409265332982283, + "learning_rate": 1.428390776057442e-05, + "loss": 0.2472, + "step": 6585 + }, + { + "epoch": 0.38, + "grad_norm": 0.35059501414858735, + "learning_rate": 1.4282226177744107e-05, + "loss": 0.2534, + "step": 6586 + }, + { + "epoch": 0.38, + "grad_norm": 0.3968681655453618, + "learning_rate": 1.4280544446620485e-05, + "loss": 0.3418, + "step": 6587 + }, + { + "epoch": 0.38, + "grad_norm": 1.3699405066952637, + "learning_rate": 1.4278862567261796e-05, + "loss": 0.8169, + "step": 6588 + }, + { + "epoch": 0.38, + "grad_norm": 0.36354057094968956, + "learning_rate": 1.4277180539726278e-05, + "loss": 0.1807, + "step": 6589 + }, + { + "epoch": 0.38, + "grad_norm": 0.37436367140991333, + "learning_rate": 1.427549836407218e-05, + "loss": 0.274, + "step": 6590 + }, + { + "epoch": 0.38, + "grad_norm": 0.4031137876109123, + "learning_rate": 1.4273816040357762e-05, + "loss": 0.3387, + "step": 6591 + }, + { + "epoch": 0.38, + "grad_norm": 0.39913767829709507, + "learning_rate": 1.4272133568641273e-05, + "loss": 0.2186, + "step": 6592 + }, + { + "epoch": 0.38, + "grad_norm": 0.34901425704231603, + "learning_rate": 1.4270450948980989e-05, + "loss": 0.2849, + "step": 6593 + }, + { + "epoch": 0.38, + "grad_norm": 0.43807677930615996, + "learning_rate": 1.4268768181435166e-05, + "loss": 0.3457, + "step": 6594 + }, + { + "epoch": 0.38, + "grad_norm": 0.33170476466487625, + "learning_rate": 1.4267085266062088e-05, + "loss": 0.1751, + "step": 6595 + }, + { + "epoch": 0.38, + "grad_norm": 0.47900650476435713, + "learning_rate": 1.4265402202920029e-05, + "loss": 0.3583, + "step": 6596 + }, + { + "epoch": 0.38, + "grad_norm": 0.6101026898693607, + "learning_rate": 1.4263718992067276e-05, + "loss": 0.44, + "step": 6597 + }, + { + "epoch": 0.38, + "grad_norm": 0.3100847262447763, + "learning_rate": 1.4262035633562117e-05, + "loss": 0.2415, + "step": 6598 + }, + { + "epoch": 0.38, + "grad_norm": 0.29612737366854475, + "learning_rate": 1.4260352127462848e-05, + "loss": 0.2147, + "step": 6599 + }, + { + "epoch": 0.38, + "grad_norm": 1.0381415145756596, + "learning_rate": 1.4258668473827766e-05, + "loss": 0.7037, + "step": 6600 + }, + { + "epoch": 0.38, + "grad_norm": 0.40518343031761545, + "learning_rate": 1.4256984672715182e-05, + "loss": 0.2977, + "step": 6601 + }, + { + "epoch": 0.38, + "grad_norm": 0.4991942601955228, + "learning_rate": 1.4255300724183396e-05, + "loss": 0.2844, + "step": 6602 + }, + { + "epoch": 0.38, + "grad_norm": 0.43905524147113645, + "learning_rate": 1.4253616628290735e-05, + "loss": 0.3412, + "step": 6603 + }, + { + "epoch": 0.38, + "grad_norm": 0.32293505302934256, + "learning_rate": 1.425193238509551e-05, + "loss": 0.2129, + "step": 6604 + }, + { + "epoch": 0.38, + "grad_norm": 0.30523910076856414, + "learning_rate": 1.425024799465605e-05, + "loss": 0.1843, + "step": 6605 + }, + { + "epoch": 0.38, + "grad_norm": 0.7134300357593555, + "learning_rate": 1.4248563457030684e-05, + "loss": 0.3323, + "step": 6606 + }, + { + "epoch": 0.38, + "grad_norm": 0.3821121733080358, + "learning_rate": 1.4246878772277748e-05, + "loss": 0.2799, + "step": 6607 + }, + { + "epoch": 0.38, + "grad_norm": 0.5362957512770763, + "learning_rate": 1.4245193940455583e-05, + "loss": 0.2907, + "step": 6608 + }, + { + "epoch": 0.38, + "grad_norm": 0.7015232324733971, + "learning_rate": 1.4243508961622536e-05, + "loss": 0.4711, + "step": 6609 + }, + { + "epoch": 0.38, + "grad_norm": 0.36283666869718023, + "learning_rate": 1.4241823835836957e-05, + "loss": 0.231, + "step": 6610 + }, + { + "epoch": 0.38, + "grad_norm": 0.3676225906926481, + "learning_rate": 1.4240138563157197e-05, + "loss": 0.2852, + "step": 6611 + }, + { + "epoch": 0.38, + "grad_norm": 0.3206621409131723, + "learning_rate": 1.4238453143641623e-05, + "loss": 0.1736, + "step": 6612 + }, + { + "epoch": 0.38, + "grad_norm": 0.6024596594608613, + "learning_rate": 1.4236767577348597e-05, + "loss": 0.3085, + "step": 6613 + }, + { + "epoch": 0.38, + "grad_norm": 0.3601264112579988, + "learning_rate": 1.4235081864336495e-05, + "loss": 0.3118, + "step": 6614 + }, + { + "epoch": 0.38, + "grad_norm": 0.4754139334883257, + "learning_rate": 1.4233396004663686e-05, + "loss": 0.2908, + "step": 6615 + }, + { + "epoch": 0.38, + "grad_norm": 0.6780714577012267, + "learning_rate": 1.423170999838856e-05, + "loss": 0.3664, + "step": 6616 + }, + { + "epoch": 0.38, + "grad_norm": 0.4271908689523186, + "learning_rate": 1.4230023845569497e-05, + "loss": 0.375, + "step": 6617 + }, + { + "epoch": 0.38, + "grad_norm": 0.2646098949172002, + "learning_rate": 1.422833754626489e-05, + "loss": 0.2093, + "step": 6618 + }, + { + "epoch": 0.38, + "grad_norm": 0.36022570135160514, + "learning_rate": 1.4226651100533136e-05, + "loss": 0.2529, + "step": 6619 + }, + { + "epoch": 0.38, + "grad_norm": 0.4342374594084135, + "learning_rate": 1.4224964508432635e-05, + "loss": 0.3454, + "step": 6620 + }, + { + "epoch": 0.38, + "grad_norm": 1.0431596454133143, + "learning_rate": 1.4223277770021794e-05, + "loss": 0.4782, + "step": 6621 + }, + { + "epoch": 0.38, + "grad_norm": 0.2630385189545252, + "learning_rate": 1.4221590885359029e-05, + "loss": 0.2208, + "step": 6622 + }, + { + "epoch": 0.38, + "grad_norm": 0.37912554241453955, + "learning_rate": 1.421990385450275e-05, + "loss": 0.3119, + "step": 6623 + }, + { + "epoch": 0.38, + "grad_norm": 0.31620577959594415, + "learning_rate": 1.4218216677511383e-05, + "loss": 0.2133, + "step": 6624 + }, + { + "epoch": 0.38, + "grad_norm": 0.7817419011776064, + "learning_rate": 1.4216529354443355e-05, + "loss": 0.2797, + "step": 6625 + }, + { + "epoch": 0.38, + "grad_norm": 0.3580806273138164, + "learning_rate": 1.4214841885357096e-05, + "loss": 0.3249, + "step": 6626 + }, + { + "epoch": 0.38, + "grad_norm": 0.47803700129219767, + "learning_rate": 1.4213154270311043e-05, + "loss": 0.4245, + "step": 6627 + }, + { + "epoch": 0.38, + "grad_norm": 0.29809480038275465, + "learning_rate": 1.421146650936364e-05, + "loss": 0.1387, + "step": 6628 + }, + { + "epoch": 0.38, + "grad_norm": 0.3552015102693456, + "learning_rate": 1.4209778602573332e-05, + "loss": 0.2911, + "step": 6629 + }, + { + "epoch": 0.38, + "grad_norm": 0.371671833280386, + "learning_rate": 1.4208090549998572e-05, + "loss": 0.3407, + "step": 6630 + }, + { + "epoch": 0.38, + "grad_norm": 0.774988864764028, + "learning_rate": 1.420640235169782e-05, + "loss": 0.3303, + "step": 6631 + }, + { + "epoch": 0.38, + "grad_norm": 0.2656032968656245, + "learning_rate": 1.420471400772953e-05, + "loss": 0.2185, + "step": 6632 + }, + { + "epoch": 0.38, + "grad_norm": 0.7574900916209983, + "learning_rate": 1.4203025518152178e-05, + "loss": 0.4876, + "step": 6633 + }, + { + "epoch": 0.38, + "grad_norm": 0.3571697183187376, + "learning_rate": 1.420133688302423e-05, + "loss": 0.3068, + "step": 6634 + }, + { + "epoch": 0.38, + "grad_norm": 0.3627025977269262, + "learning_rate": 1.419964810240417e-05, + "loss": 0.2454, + "step": 6635 + }, + { + "epoch": 0.38, + "grad_norm": 0.9722718639281581, + "learning_rate": 1.4197959176350476e-05, + "loss": 0.5431, + "step": 6636 + }, + { + "epoch": 0.38, + "grad_norm": 0.45319845739485237, + "learning_rate": 1.4196270104921637e-05, + "loss": 0.2968, + "step": 6637 + }, + { + "epoch": 0.38, + "grad_norm": 0.3168864543772265, + "learning_rate": 1.4194580888176141e-05, + "loss": 0.2416, + "step": 6638 + }, + { + "epoch": 0.38, + "grad_norm": 0.4402300020745205, + "learning_rate": 1.4192891526172494e-05, + "loss": 0.2527, + "step": 6639 + }, + { + "epoch": 0.38, + "grad_norm": 0.7619509636775692, + "learning_rate": 1.419120201896919e-05, + "loss": 0.4639, + "step": 6640 + }, + { + "epoch": 0.38, + "grad_norm": 0.4094808720491614, + "learning_rate": 1.4189512366624745e-05, + "loss": 0.2223, + "step": 6641 + }, + { + "epoch": 0.38, + "grad_norm": 0.49203402514236594, + "learning_rate": 1.4187822569197662e-05, + "loss": 0.3586, + "step": 6642 + }, + { + "epoch": 0.38, + "grad_norm": 0.6042575546599518, + "learning_rate": 1.4186132626746466e-05, + "loss": 0.3776, + "step": 6643 + }, + { + "epoch": 0.38, + "grad_norm": 0.2850850597785273, + "learning_rate": 1.4184442539329677e-05, + "loss": 0.1775, + "step": 6644 + }, + { + "epoch": 0.38, + "grad_norm": 0.41097360765078583, + "learning_rate": 1.4182752307005822e-05, + "loss": 0.2656, + "step": 6645 + }, + { + "epoch": 0.38, + "grad_norm": 0.3749280168749956, + "learning_rate": 1.4181061929833435e-05, + "loss": 0.3093, + "step": 6646 + }, + { + "epoch": 0.38, + "grad_norm": 0.36949049484555013, + "learning_rate": 1.4179371407871054e-05, + "loss": 0.2755, + "step": 6647 + }, + { + "epoch": 0.38, + "grad_norm": 0.8305477195457657, + "learning_rate": 1.4177680741177217e-05, + "loss": 0.3878, + "step": 6648 + }, + { + "epoch": 0.38, + "grad_norm": 0.9615488498518213, + "learning_rate": 1.4175989929810481e-05, + "loss": 0.5805, + "step": 6649 + }, + { + "epoch": 0.38, + "grad_norm": 0.2774945282091035, + "learning_rate": 1.417429897382939e-05, + "loss": 0.2652, + "step": 6650 + }, + { + "epoch": 0.38, + "grad_norm": 0.21826789390351636, + "learning_rate": 1.4172607873292505e-05, + "loss": 0.1444, + "step": 6651 + }, + { + "epoch": 0.38, + "grad_norm": 0.7917466348827511, + "learning_rate": 1.4170916628258392e-05, + "loss": 0.5094, + "step": 6652 + }, + { + "epoch": 0.38, + "grad_norm": 0.4402092475426011, + "learning_rate": 1.4169225238785611e-05, + "loss": 0.3204, + "step": 6653 + }, + { + "epoch": 0.38, + "grad_norm": 0.48816511867608736, + "learning_rate": 1.4167533704932743e-05, + "loss": 0.2935, + "step": 6654 + }, + { + "epoch": 0.38, + "grad_norm": 0.7306057653718944, + "learning_rate": 1.416584202675836e-05, + "loss": 0.44, + "step": 6655 + }, + { + "epoch": 0.38, + "grad_norm": 0.32525920456793944, + "learning_rate": 1.4164150204321046e-05, + "loss": 0.2374, + "step": 6656 + }, + { + "epoch": 0.38, + "grad_norm": 0.3296501402587944, + "learning_rate": 1.4162458237679389e-05, + "loss": 0.1485, + "step": 6657 + }, + { + "epoch": 0.38, + "grad_norm": 0.36981045940986673, + "learning_rate": 1.4160766126891985e-05, + "loss": 0.3043, + "step": 6658 + }, + { + "epoch": 0.38, + "grad_norm": 0.3883473725235902, + "learning_rate": 1.4159073872017427e-05, + "loss": 0.2873, + "step": 6659 + }, + { + "epoch": 0.38, + "grad_norm": 0.8316799239970233, + "learning_rate": 1.4157381473114323e-05, + "loss": 0.4332, + "step": 6660 + }, + { + "epoch": 0.38, + "grad_norm": 0.40166375360596235, + "learning_rate": 1.4155688930241274e-05, + "loss": 0.218, + "step": 6661 + }, + { + "epoch": 0.38, + "grad_norm": 0.375701225006493, + "learning_rate": 1.4153996243456898e-05, + "loss": 0.2991, + "step": 6662 + }, + { + "epoch": 0.38, + "grad_norm": 0.2775566069575302, + "learning_rate": 1.4152303412819808e-05, + "loss": 0.2017, + "step": 6663 + }, + { + "epoch": 0.38, + "grad_norm": 0.7846062358543829, + "learning_rate": 1.4150610438388633e-05, + "loss": 0.3898, + "step": 6664 + }, + { + "epoch": 0.38, + "grad_norm": 0.35540285474583694, + "learning_rate": 1.4148917320221992e-05, + "loss": 0.2879, + "step": 6665 + }, + { + "epoch": 0.38, + "grad_norm": 0.36580621119695167, + "learning_rate": 1.4147224058378525e-05, + "loss": 0.3285, + "step": 6666 + }, + { + "epoch": 0.38, + "grad_norm": 0.8255865521291464, + "learning_rate": 1.4145530652916868e-05, + "loss": 0.3617, + "step": 6667 + }, + { + "epoch": 0.38, + "grad_norm": 0.35219097025016854, + "learning_rate": 1.4143837103895663e-05, + "loss": 0.278, + "step": 6668 + }, + { + "epoch": 0.38, + "grad_norm": 0.29669356912673234, + "learning_rate": 1.4142143411373559e-05, + "loss": 0.2179, + "step": 6669 + }, + { + "epoch": 0.38, + "grad_norm": 0.4085564007164582, + "learning_rate": 1.4140449575409203e-05, + "loss": 0.2935, + "step": 6670 + }, + { + "epoch": 0.38, + "grad_norm": 0.3408342696053366, + "learning_rate": 1.4138755596061257e-05, + "loss": 0.2819, + "step": 6671 + }, + { + "epoch": 0.38, + "grad_norm": 0.8311186813808776, + "learning_rate": 1.4137061473388383e-05, + "loss": 0.617, + "step": 6672 + }, + { + "epoch": 0.38, + "grad_norm": 0.5219159840804157, + "learning_rate": 1.4135367207449248e-05, + "loss": 0.3674, + "step": 6673 + }, + { + "epoch": 0.38, + "grad_norm": 0.29763937422590947, + "learning_rate": 1.4133672798302525e-05, + "loss": 0.2223, + "step": 6674 + }, + { + "epoch": 0.38, + "grad_norm": 0.25470244060159813, + "learning_rate": 1.4131978246006892e-05, + "loss": 0.1708, + "step": 6675 + }, + { + "epoch": 0.38, + "grad_norm": 0.6082879025289717, + "learning_rate": 1.4130283550621027e-05, + "loss": 0.4386, + "step": 6676 + }, + { + "epoch": 0.38, + "grad_norm": 0.3082473816953717, + "learning_rate": 1.4128588712203626e-05, + "loss": 0.1922, + "step": 6677 + }, + { + "epoch": 0.38, + "grad_norm": 0.40828261098657215, + "learning_rate": 1.4126893730813369e-05, + "loss": 0.3571, + "step": 6678 + }, + { + "epoch": 0.38, + "grad_norm": 1.284813336317904, + "learning_rate": 1.4125198606508963e-05, + "loss": 0.587, + "step": 6679 + }, + { + "epoch": 0.38, + "grad_norm": 0.3249047265979789, + "learning_rate": 1.4123503339349105e-05, + "loss": 0.2049, + "step": 6680 + }, + { + "epoch": 0.38, + "grad_norm": 0.27026754771556555, + "learning_rate": 1.4121807929392505e-05, + "loss": 0.2416, + "step": 6681 + }, + { + "epoch": 0.38, + "grad_norm": 0.4587471969379784, + "learning_rate": 1.4120112376697873e-05, + "loss": 0.3916, + "step": 6682 + }, + { + "epoch": 0.38, + "grad_norm": 0.30805966674421015, + "learning_rate": 1.4118416681323925e-05, + "loss": 0.1992, + "step": 6683 + }, + { + "epoch": 0.38, + "grad_norm": 1.2854672987600315, + "learning_rate": 1.4116720843329385e-05, + "loss": 0.8468, + "step": 6684 + }, + { + "epoch": 0.38, + "grad_norm": 0.5225364984100592, + "learning_rate": 1.4115024862772981e-05, + "loss": 0.3391, + "step": 6685 + }, + { + "epoch": 0.38, + "grad_norm": 0.3050026501666236, + "learning_rate": 1.4113328739713442e-05, + "loss": 0.2727, + "step": 6686 + }, + { + "epoch": 0.38, + "grad_norm": 0.7048532439613371, + "learning_rate": 1.4111632474209506e-05, + "loss": 0.3857, + "step": 6687 + }, + { + "epoch": 0.38, + "grad_norm": 0.2856578199252998, + "learning_rate": 1.4109936066319915e-05, + "loss": 0.2344, + "step": 6688 + }, + { + "epoch": 0.38, + "grad_norm": 0.38790911949598295, + "learning_rate": 1.4108239516103412e-05, + "loss": 0.2793, + "step": 6689 + }, + { + "epoch": 0.38, + "grad_norm": 0.42389402150691363, + "learning_rate": 1.4106542823618754e-05, + "loss": 0.2845, + "step": 6690 + }, + { + "epoch": 0.38, + "grad_norm": 1.1114232450461785, + "learning_rate": 1.4104845988924694e-05, + "loss": 0.6795, + "step": 6691 + }, + { + "epoch": 0.38, + "grad_norm": 0.31314498273119074, + "learning_rate": 1.4103149012079994e-05, + "loss": 0.2629, + "step": 6692 + }, + { + "epoch": 0.38, + "grad_norm": 0.4220924174883716, + "learning_rate": 1.4101451893143418e-05, + "loss": 0.2873, + "step": 6693 + }, + { + "epoch": 0.38, + "grad_norm": 0.34138109317922455, + "learning_rate": 1.4099754632173744e-05, + "loss": 0.2751, + "step": 6694 + }, + { + "epoch": 0.38, + "grad_norm": 0.404482535836102, + "learning_rate": 1.409805722922974e-05, + "loss": 0.2508, + "step": 6695 + }, + { + "epoch": 0.38, + "grad_norm": 0.33534481802465277, + "learning_rate": 1.409635968437019e-05, + "loss": 0.2233, + "step": 6696 + }, + { + "epoch": 0.38, + "grad_norm": 0.3605268365663909, + "learning_rate": 1.409466199765388e-05, + "loss": 0.3063, + "step": 6697 + }, + { + "epoch": 0.38, + "grad_norm": 0.39065776412591735, + "learning_rate": 1.4092964169139603e-05, + "loss": 0.2807, + "step": 6698 + }, + { + "epoch": 0.38, + "grad_norm": 0.44905297298310376, + "learning_rate": 1.409126619888615e-05, + "loss": 0.3731, + "step": 6699 + }, + { + "epoch": 0.38, + "grad_norm": 0.34506324518643255, + "learning_rate": 1.4089568086952327e-05, + "loss": 0.2082, + "step": 6700 + }, + { + "epoch": 0.39, + "grad_norm": 0.39996919181229135, + "learning_rate": 1.4087869833396936e-05, + "loss": 0.2956, + "step": 6701 + }, + { + "epoch": 0.39, + "grad_norm": 0.3419182623503886, + "learning_rate": 1.408617143827879e-05, + "loss": 0.2966, + "step": 6702 + }, + { + "epoch": 0.39, + "grad_norm": 0.35602932018929834, + "learning_rate": 1.40844729016567e-05, + "loss": 0.1709, + "step": 6703 + }, + { + "epoch": 0.39, + "grad_norm": 0.33780685153209516, + "learning_rate": 1.4082774223589492e-05, + "loss": 0.2794, + "step": 6704 + }, + { + "epoch": 0.39, + "grad_norm": 0.3433742555355464, + "learning_rate": 1.4081075404135987e-05, + "loss": 0.3471, + "step": 6705 + }, + { + "epoch": 0.39, + "grad_norm": 0.698436131545571, + "learning_rate": 1.4079376443355016e-05, + "loss": 0.0464, + "step": 6706 + }, + { + "epoch": 0.39, + "grad_norm": 0.3513574365202577, + "learning_rate": 1.4077677341305414e-05, + "loss": 0.2612, + "step": 6707 + }, + { + "epoch": 0.39, + "grad_norm": 1.17360579573699, + "learning_rate": 1.4075978098046022e-05, + "loss": 0.7839, + "step": 6708 + }, + { + "epoch": 0.39, + "grad_norm": 0.26317161420211993, + "learning_rate": 1.4074278713635683e-05, + "loss": 0.2323, + "step": 6709 + }, + { + "epoch": 0.39, + "grad_norm": 0.2971917186585945, + "learning_rate": 1.4072579188133247e-05, + "loss": 0.2247, + "step": 6710 + }, + { + "epoch": 0.39, + "grad_norm": 0.6271548854008792, + "learning_rate": 1.407087952159757e-05, + "loss": 0.4772, + "step": 6711 + }, + { + "epoch": 0.39, + "grad_norm": 0.871101430569621, + "learning_rate": 1.406917971408751e-05, + "loss": 0.6613, + "step": 6712 + }, + { + "epoch": 0.39, + "grad_norm": 0.3915942662772745, + "learning_rate": 1.4067479765661929e-05, + "loss": 0.2208, + "step": 6713 + }, + { + "epoch": 0.39, + "grad_norm": 0.33841239049328914, + "learning_rate": 1.4065779676379702e-05, + "loss": 0.3083, + "step": 6714 + }, + { + "epoch": 0.39, + "grad_norm": 0.30049951964028676, + "learning_rate": 1.4064079446299699e-05, + "loss": 0.2178, + "step": 6715 + }, + { + "epoch": 0.39, + "grad_norm": 0.38831455943753335, + "learning_rate": 1.4062379075480799e-05, + "loss": 0.2218, + "step": 6716 + }, + { + "epoch": 0.39, + "grad_norm": 0.291218423496063, + "learning_rate": 1.4060678563981886e-05, + "loss": 0.317, + "step": 6717 + }, + { + "epoch": 0.39, + "grad_norm": 1.0342262669644686, + "learning_rate": 1.4058977911861846e-05, + "loss": 0.6031, + "step": 6718 + }, + { + "epoch": 0.39, + "grad_norm": 0.35540629091607656, + "learning_rate": 1.405727711917958e-05, + "loss": 0.2125, + "step": 6719 + }, + { + "epoch": 0.39, + "grad_norm": 0.3154034414211785, + "learning_rate": 1.405557618599398e-05, + "loss": 0.2661, + "step": 6720 + }, + { + "epoch": 0.39, + "grad_norm": 0.3914215878276067, + "learning_rate": 1.4053875112363953e-05, + "loss": 0.3172, + "step": 6721 + }, + { + "epoch": 0.39, + "grad_norm": 0.5473122810280361, + "learning_rate": 1.40521738983484e-05, + "loss": 0.3438, + "step": 6722 + }, + { + "epoch": 0.39, + "grad_norm": 0.2795370643280423, + "learning_rate": 1.4050472544006243e-05, + "loss": 0.2083, + "step": 6723 + }, + { + "epoch": 0.39, + "grad_norm": 1.4035101146683626, + "learning_rate": 1.4048771049396397e-05, + "loss": 0.7028, + "step": 6724 + }, + { + "epoch": 0.39, + "grad_norm": 0.2976029002338224, + "learning_rate": 1.4047069414577782e-05, + "loss": 0.2697, + "step": 6725 + }, + { + "epoch": 0.39, + "grad_norm": 0.35788245762048565, + "learning_rate": 1.4045367639609326e-05, + "loss": 0.265, + "step": 6726 + }, + { + "epoch": 0.39, + "grad_norm": 0.8534396855898208, + "learning_rate": 1.4043665724549967e-05, + "loss": 0.5141, + "step": 6727 + }, + { + "epoch": 0.39, + "grad_norm": 0.2832789254792039, + "learning_rate": 1.4041963669458633e-05, + "loss": 0.2318, + "step": 6728 + }, + { + "epoch": 0.39, + "grad_norm": 0.3004873857950981, + "learning_rate": 1.4040261474394275e-05, + "loss": 0.1911, + "step": 6729 + }, + { + "epoch": 0.39, + "grad_norm": 0.48748591817012543, + "learning_rate": 1.4038559139415832e-05, + "loss": 0.3871, + "step": 6730 + }, + { + "epoch": 0.39, + "grad_norm": 0.66305837104567, + "learning_rate": 1.4036856664582263e-05, + "loss": 0.4026, + "step": 6731 + }, + { + "epoch": 0.39, + "grad_norm": 0.4050811425920618, + "learning_rate": 1.403515404995252e-05, + "loss": 0.2587, + "step": 6732 + }, + { + "epoch": 0.39, + "grad_norm": 0.41824055428183066, + "learning_rate": 1.4033451295585565e-05, + "loss": 0.3413, + "step": 6733 + }, + { + "epoch": 0.39, + "grad_norm": 0.37066586491062015, + "learning_rate": 1.4031748401540366e-05, + "loss": 0.2305, + "step": 6734 + }, + { + "epoch": 0.39, + "grad_norm": 0.29315654585139084, + "learning_rate": 1.4030045367875893e-05, + "loss": 0.2388, + "step": 6735 + }, + { + "epoch": 0.39, + "grad_norm": 0.8745220798442661, + "learning_rate": 1.4028342194651123e-05, + "loss": 0.4191, + "step": 6736 + }, + { + "epoch": 0.39, + "grad_norm": 0.30931597379685477, + "learning_rate": 1.4026638881925032e-05, + "loss": 0.2733, + "step": 6737 + }, + { + "epoch": 0.39, + "grad_norm": 0.37488368146363077, + "learning_rate": 1.4024935429756614e-05, + "loss": 0.3517, + "step": 6738 + }, + { + "epoch": 0.39, + "grad_norm": 0.548881557197987, + "learning_rate": 1.4023231838204854e-05, + "loss": 0.372, + "step": 6739 + }, + { + "epoch": 0.39, + "grad_norm": 0.31575575686443624, + "learning_rate": 1.4021528107328749e-05, + "loss": 0.1803, + "step": 6740 + }, + { + "epoch": 0.39, + "grad_norm": 0.2777086707952842, + "learning_rate": 1.4019824237187296e-05, + "loss": 0.2545, + "step": 6741 + }, + { + "epoch": 0.39, + "grad_norm": 1.014625121565217, + "learning_rate": 1.4018120227839505e-05, + "loss": 0.3794, + "step": 6742 + }, + { + "epoch": 0.39, + "grad_norm": 0.5247262023251844, + "learning_rate": 1.4016416079344382e-05, + "loss": 0.373, + "step": 6743 + }, + { + "epoch": 0.39, + "grad_norm": 0.3867050547624269, + "learning_rate": 1.4014711791760944e-05, + "loss": 0.3268, + "step": 6744 + }, + { + "epoch": 0.39, + "grad_norm": 0.35604241188905456, + "learning_rate": 1.401300736514821e-05, + "loss": 0.2967, + "step": 6745 + }, + { + "epoch": 0.39, + "grad_norm": 0.4154383606703178, + "learning_rate": 1.4011302799565205e-05, + "loss": 0.3106, + "step": 6746 + }, + { + "epoch": 0.39, + "grad_norm": 0.250345897544341, + "learning_rate": 1.4009598095070951e-05, + "loss": 0.1927, + "step": 6747 + }, + { + "epoch": 0.39, + "grad_norm": 1.2188115276047022, + "learning_rate": 1.4007893251724491e-05, + "loss": 0.7797, + "step": 6748 + }, + { + "epoch": 0.39, + "grad_norm": 0.29690370719853887, + "learning_rate": 1.400618826958486e-05, + "loss": 0.2267, + "step": 6749 + }, + { + "epoch": 0.39, + "grad_norm": 0.40303575141635606, + "learning_rate": 1.4004483148711101e-05, + "loss": 0.3435, + "step": 6750 + }, + { + "epoch": 0.39, + "grad_norm": 0.9008436992328396, + "learning_rate": 1.4002777889162262e-05, + "loss": 0.4848, + "step": 6751 + }, + { + "epoch": 0.39, + "grad_norm": 0.2933102711442764, + "learning_rate": 1.4001072490997399e-05, + "loss": 0.1314, + "step": 6752 + }, + { + "epoch": 0.39, + "grad_norm": 0.2828190568585241, + "learning_rate": 1.3999366954275566e-05, + "loss": 0.2767, + "step": 6753 + }, + { + "epoch": 0.39, + "grad_norm": 0.362682035058518, + "learning_rate": 1.3997661279055826e-05, + "loss": 0.247, + "step": 6754 + }, + { + "epoch": 0.39, + "grad_norm": 0.6611501611701132, + "learning_rate": 1.399595546539725e-05, + "loss": 0.3083, + "step": 6755 + }, + { + "epoch": 0.39, + "grad_norm": 0.3474578684143908, + "learning_rate": 1.3994249513358907e-05, + "loss": 0.3028, + "step": 6756 + }, + { + "epoch": 0.39, + "grad_norm": 0.38528825517022264, + "learning_rate": 1.3992543422999876e-05, + "loss": 0.3243, + "step": 6757 + }, + { + "epoch": 0.39, + "grad_norm": 0.3992245764777032, + "learning_rate": 1.3990837194379236e-05, + "loss": 0.1723, + "step": 6758 + }, + { + "epoch": 0.39, + "grad_norm": 0.24633718428561147, + "learning_rate": 1.3989130827556077e-05, + "loss": 0.2043, + "step": 6759 + }, + { + "epoch": 0.39, + "grad_norm": 1.7078159518251719, + "learning_rate": 1.398742432258949e-05, + "loss": 0.8635, + "step": 6760 + }, + { + "epoch": 0.39, + "grad_norm": 0.386580417191464, + "learning_rate": 1.398571767953857e-05, + "loss": 0.3186, + "step": 6761 + }, + { + "epoch": 0.39, + "grad_norm": 0.4046253648674059, + "learning_rate": 1.3984010898462417e-05, + "loss": 0.2438, + "step": 6762 + }, + { + "epoch": 0.39, + "grad_norm": 0.8689183740842735, + "learning_rate": 1.398230397942014e-05, + "loss": 0.5434, + "step": 6763 + }, + { + "epoch": 0.39, + "grad_norm": 0.4089398395770051, + "learning_rate": 1.3980596922470844e-05, + "loss": 0.3337, + "step": 6764 + }, + { + "epoch": 0.39, + "grad_norm": 0.29438732424321856, + "learning_rate": 1.397888972767365e-05, + "loss": 0.2378, + "step": 6765 + }, + { + "epoch": 0.39, + "grad_norm": 0.3592290968908871, + "learning_rate": 1.3977182395087674e-05, + "loss": 0.2221, + "step": 6766 + }, + { + "epoch": 0.39, + "grad_norm": 0.6970432495304754, + "learning_rate": 1.3975474924772043e-05, + "loss": 0.3725, + "step": 6767 + }, + { + "epoch": 0.39, + "grad_norm": 0.4009812575724056, + "learning_rate": 1.3973767316785887e-05, + "loss": 0.2633, + "step": 6768 + }, + { + "epoch": 0.39, + "grad_norm": 0.37223344888144816, + "learning_rate": 1.397205957118834e-05, + "loss": 0.3468, + "step": 6769 + }, + { + "epoch": 0.39, + "grad_norm": 1.57109691824902, + "learning_rate": 1.397035168803854e-05, + "loss": 0.762, + "step": 6770 + }, + { + "epoch": 0.39, + "grad_norm": 0.2339637760874292, + "learning_rate": 1.3968643667395634e-05, + "loss": 0.1738, + "step": 6771 + }, + { + "epoch": 0.39, + "grad_norm": 0.32562666939438184, + "learning_rate": 1.3966935509318766e-05, + "loss": 0.2632, + "step": 6772 + }, + { + "epoch": 0.39, + "grad_norm": 0.7221752928764033, + "learning_rate": 1.3965227213867093e-05, + "loss": 0.4831, + "step": 6773 + }, + { + "epoch": 0.39, + "grad_norm": 0.33874134003021095, + "learning_rate": 1.3963518781099774e-05, + "loss": 0.2641, + "step": 6774 + }, + { + "epoch": 0.39, + "grad_norm": 0.9198892609799831, + "learning_rate": 1.3961810211075965e-05, + "loss": 0.4748, + "step": 6775 + }, + { + "epoch": 0.39, + "grad_norm": 0.41627147942332504, + "learning_rate": 1.3960101503854843e-05, + "loss": 0.314, + "step": 6776 + }, + { + "epoch": 0.39, + "grad_norm": 0.32450432247327654, + "learning_rate": 1.3958392659495575e-05, + "loss": 0.2792, + "step": 6777 + }, + { + "epoch": 0.39, + "grad_norm": 0.3360326187425709, + "learning_rate": 1.3956683678057342e-05, + "loss": 0.1615, + "step": 6778 + }, + { + "epoch": 0.39, + "grad_norm": 0.7677364221803948, + "learning_rate": 1.395497455959932e-05, + "loss": 0.3848, + "step": 6779 + }, + { + "epoch": 0.39, + "grad_norm": 0.3772113181888643, + "learning_rate": 1.39532653041807e-05, + "loss": 0.2806, + "step": 6780 + }, + { + "epoch": 0.39, + "grad_norm": 0.6590453255458756, + "learning_rate": 1.3951555911860672e-05, + "loss": 0.2858, + "step": 6781 + }, + { + "epoch": 0.39, + "grad_norm": 0.8705894982837394, + "learning_rate": 1.3949846382698433e-05, + "loss": 0.4125, + "step": 6782 + }, + { + "epoch": 0.39, + "grad_norm": 0.3819991956597844, + "learning_rate": 1.3948136716753183e-05, + "loss": 0.2831, + "step": 6783 + }, + { + "epoch": 0.39, + "grad_norm": 0.4511179604074903, + "learning_rate": 1.394642691408413e-05, + "loss": 0.3855, + "step": 6784 + }, + { + "epoch": 0.39, + "grad_norm": 0.23375528534045187, + "learning_rate": 1.394471697475048e-05, + "loss": 0.1753, + "step": 6785 + }, + { + "epoch": 0.39, + "grad_norm": 0.3867215053995416, + "learning_rate": 1.3943006898811453e-05, + "loss": 0.2849, + "step": 6786 + }, + { + "epoch": 0.39, + "grad_norm": 1.1350554649970255, + "learning_rate": 1.3941296686326266e-05, + "loss": 0.8263, + "step": 6787 + }, + { + "epoch": 0.39, + "grad_norm": 0.3652869785590026, + "learning_rate": 1.3939586337354146e-05, + "loss": 0.2795, + "step": 6788 + }, + { + "epoch": 0.39, + "grad_norm": 0.3622856983259643, + "learning_rate": 1.3937875851954316e-05, + "loss": 0.2834, + "step": 6789 + }, + { + "epoch": 0.39, + "grad_norm": 0.5852464787006104, + "learning_rate": 1.3936165230186018e-05, + "loss": 0.4318, + "step": 6790 + }, + { + "epoch": 0.39, + "grad_norm": 0.2725903077636448, + "learning_rate": 1.3934454472108488e-05, + "loss": 0.1255, + "step": 6791 + }, + { + "epoch": 0.39, + "grad_norm": 0.3542987477803968, + "learning_rate": 1.3932743577780967e-05, + "loss": 0.2905, + "step": 6792 + }, + { + "epoch": 0.39, + "grad_norm": 0.3643810295443851, + "learning_rate": 1.3931032547262707e-05, + "loss": 0.2526, + "step": 6793 + }, + { + "epoch": 0.39, + "grad_norm": 0.6951650698020321, + "learning_rate": 1.3929321380612955e-05, + "loss": 0.3213, + "step": 6794 + }, + { + "epoch": 0.39, + "grad_norm": 0.3221414003316139, + "learning_rate": 1.3927610077890976e-05, + "loss": 0.2688, + "step": 6795 + }, + { + "epoch": 0.39, + "grad_norm": 0.6841427532021138, + "learning_rate": 1.3925898639156028e-05, + "loss": 0.4703, + "step": 6796 + }, + { + "epoch": 0.39, + "grad_norm": 0.2893061685049914, + "learning_rate": 1.3924187064467378e-05, + "loss": 0.2869, + "step": 6797 + }, + { + "epoch": 0.39, + "grad_norm": 0.3150811635114615, + "learning_rate": 1.3922475353884302e-05, + "loss": 0.2133, + "step": 6798 + }, + { + "epoch": 0.39, + "grad_norm": 0.39697628073105073, + "learning_rate": 1.3920763507466071e-05, + "loss": 0.2859, + "step": 6799 + }, + { + "epoch": 0.39, + "grad_norm": 0.3824519346405898, + "learning_rate": 1.3919051525271968e-05, + "loss": 0.3316, + "step": 6800 + }, + { + "epoch": 0.39, + "grad_norm": 0.3120471764511407, + "learning_rate": 1.3917339407361278e-05, + "loss": 0.1981, + "step": 6801 + }, + { + "epoch": 0.39, + "grad_norm": 0.7719291568422046, + "learning_rate": 1.3915627153793294e-05, + "loss": 0.4703, + "step": 6802 + }, + { + "epoch": 0.39, + "grad_norm": 1.0406178609357577, + "learning_rate": 1.3913914764627311e-05, + "loss": 0.7213, + "step": 6803 + }, + { + "epoch": 0.39, + "grad_norm": 0.3082881655989037, + "learning_rate": 1.3912202239922627e-05, + "loss": 0.1947, + "step": 6804 + }, + { + "epoch": 0.39, + "grad_norm": 0.269808735942457, + "learning_rate": 1.3910489579738548e-05, + "loss": 0.2328, + "step": 6805 + }, + { + "epoch": 0.39, + "grad_norm": 0.8449664005507034, + "learning_rate": 1.3908776784134382e-05, + "loss": 0.4975, + "step": 6806 + }, + { + "epoch": 0.39, + "grad_norm": 0.3668433609014333, + "learning_rate": 1.3907063853169445e-05, + "loss": 0.2513, + "step": 6807 + }, + { + "epoch": 0.39, + "grad_norm": 0.3828472985391112, + "learning_rate": 1.3905350786903053e-05, + "loss": 0.3159, + "step": 6808 + }, + { + "epoch": 0.39, + "grad_norm": 0.9917346778146909, + "learning_rate": 1.3903637585394534e-05, + "loss": 0.6777, + "step": 6809 + }, + { + "epoch": 0.39, + "grad_norm": 0.34320658920451136, + "learning_rate": 1.390192424870321e-05, + "loss": 0.2877, + "step": 6810 + }, + { + "epoch": 0.39, + "grad_norm": 0.22123271814015424, + "learning_rate": 1.3900210776888421e-05, + "loss": 0.1037, + "step": 6811 + }, + { + "epoch": 0.39, + "grad_norm": 0.43597104711910406, + "learning_rate": 1.38984971700095e-05, + "loss": 0.3482, + "step": 6812 + }, + { + "epoch": 0.39, + "grad_norm": 0.3876067777475436, + "learning_rate": 1.3896783428125789e-05, + "loss": 0.2848, + "step": 6813 + }, + { + "epoch": 0.39, + "grad_norm": 0.814104832790337, + "learning_rate": 1.3895069551296634e-05, + "loss": 0.3313, + "step": 6814 + }, + { + "epoch": 0.39, + "grad_norm": 1.0191123045537875, + "learning_rate": 1.389335553958139e-05, + "loss": 0.7098, + "step": 6815 + }, + { + "epoch": 0.39, + "grad_norm": 0.3189777727484337, + "learning_rate": 1.3891641393039408e-05, + "loss": 0.2773, + "step": 6816 + }, + { + "epoch": 0.39, + "grad_norm": 0.3103356437264256, + "learning_rate": 1.3889927111730056e-05, + "loss": 0.1763, + "step": 6817 + }, + { + "epoch": 0.39, + "grad_norm": 0.6834230208026675, + "learning_rate": 1.3888212695712693e-05, + "loss": 0.3407, + "step": 6818 + }, + { + "epoch": 0.39, + "grad_norm": 0.4373386910283233, + "learning_rate": 1.3886498145046689e-05, + "loss": 0.2706, + "step": 6819 + }, + { + "epoch": 0.39, + "grad_norm": 0.33425168275995765, + "learning_rate": 1.3884783459791427e-05, + "loss": 0.2554, + "step": 6820 + }, + { + "epoch": 0.39, + "grad_norm": 0.9990408288161097, + "learning_rate": 1.3883068640006277e-05, + "loss": 0.6442, + "step": 6821 + }, + { + "epoch": 0.39, + "grad_norm": 0.3974999866738344, + "learning_rate": 1.3881353685750627e-05, + "loss": 0.2808, + "step": 6822 + }, + { + "epoch": 0.39, + "grad_norm": 0.48249560737044717, + "learning_rate": 1.3879638597083864e-05, + "loss": 0.3902, + "step": 6823 + }, + { + "epoch": 0.39, + "grad_norm": 0.406224053807835, + "learning_rate": 1.387792337406539e-05, + "loss": 0.2945, + "step": 6824 + }, + { + "epoch": 0.39, + "grad_norm": 0.2699756257137786, + "learning_rate": 1.3876208016754589e-05, + "loss": 0.2, + "step": 6825 + }, + { + "epoch": 0.39, + "grad_norm": 0.35033622402874065, + "learning_rate": 1.3874492525210874e-05, + "loss": 0.2657, + "step": 6826 + }, + { + "epoch": 0.39, + "grad_norm": 0.923795988793285, + "learning_rate": 1.387277689949365e-05, + "loss": 0.4945, + "step": 6827 + }, + { + "epoch": 0.39, + "grad_norm": 0.29261064970858314, + "learning_rate": 1.3871061139662328e-05, + "loss": 0.2662, + "step": 6828 + }, + { + "epoch": 0.39, + "grad_norm": 0.5097583064881249, + "learning_rate": 1.3869345245776326e-05, + "loss": 0.3505, + "step": 6829 + }, + { + "epoch": 0.39, + "grad_norm": 0.3765657158030522, + "learning_rate": 1.3867629217895067e-05, + "loss": 0.2203, + "step": 6830 + }, + { + "epoch": 0.39, + "grad_norm": 0.3066487278928788, + "learning_rate": 1.3865913056077968e-05, + "loss": 0.2238, + "step": 6831 + }, + { + "epoch": 0.39, + "grad_norm": 0.49241965675255883, + "learning_rate": 1.3864196760384471e-05, + "loss": 0.3577, + "step": 6832 + }, + { + "epoch": 0.39, + "grad_norm": 0.47556465523233393, + "learning_rate": 1.3862480330874004e-05, + "loss": 0.2933, + "step": 6833 + }, + { + "epoch": 0.39, + "grad_norm": 0.3757822961632396, + "learning_rate": 1.3860763767606012e-05, + "loss": 0.2787, + "step": 6834 + }, + { + "epoch": 0.39, + "grad_norm": 0.7440805037762387, + "learning_rate": 1.3859047070639933e-05, + "loss": 0.4623, + "step": 6835 + }, + { + "epoch": 0.39, + "grad_norm": 0.33732921252747394, + "learning_rate": 1.3857330240035223e-05, + "loss": 0.3338, + "step": 6836 + }, + { + "epoch": 0.39, + "grad_norm": 0.18536598268984897, + "learning_rate": 1.385561327585133e-05, + "loss": 0.0748, + "step": 6837 + }, + { + "epoch": 0.39, + "grad_norm": 0.30528617287475135, + "learning_rate": 1.3853896178147717e-05, + "loss": 0.2542, + "step": 6838 + }, + { + "epoch": 0.39, + "grad_norm": 0.48011544732265776, + "learning_rate": 1.3852178946983845e-05, + "loss": 0.4014, + "step": 6839 + }, + { + "epoch": 0.39, + "grad_norm": 0.32213207603181976, + "learning_rate": 1.385046158241918e-05, + "loss": 0.2102, + "step": 6840 + }, + { + "epoch": 0.39, + "grad_norm": 0.6276796859084134, + "learning_rate": 1.3848744084513197e-05, + "loss": 0.3767, + "step": 6841 + }, + { + "epoch": 0.39, + "grad_norm": 0.9645205479290853, + "learning_rate": 1.3847026453325371e-05, + "loss": 0.6006, + "step": 6842 + }, + { + "epoch": 0.39, + "grad_norm": 0.18739949526105895, + "learning_rate": 1.3845308688915187e-05, + "loss": 0.1148, + "step": 6843 + }, + { + "epoch": 0.39, + "grad_norm": 0.29294272479257805, + "learning_rate": 1.3843590791342125e-05, + "loss": 0.2939, + "step": 6844 + }, + { + "epoch": 0.39, + "grad_norm": 1.0560695569212466, + "learning_rate": 1.3841872760665682e-05, + "loss": 0.7409, + "step": 6845 + }, + { + "epoch": 0.39, + "grad_norm": 0.40634206076714197, + "learning_rate": 1.3840154596945352e-05, + "loss": 0.2672, + "step": 6846 + }, + { + "epoch": 0.39, + "grad_norm": 0.5534173099885904, + "learning_rate": 1.383843630024063e-05, + "loss": 0.4431, + "step": 6847 + }, + { + "epoch": 0.39, + "grad_norm": 0.35250494584408637, + "learning_rate": 1.3836717870611025e-05, + "loss": 0.3131, + "step": 6848 + }, + { + "epoch": 0.39, + "grad_norm": 0.3773994520684638, + "learning_rate": 1.3834999308116045e-05, + "loss": 0.2855, + "step": 6849 + }, + { + "epoch": 0.39, + "grad_norm": 0.22077046425320704, + "learning_rate": 1.3833280612815204e-05, + "loss": 0.1251, + "step": 6850 + }, + { + "epoch": 0.39, + "grad_norm": 0.4475654638391578, + "learning_rate": 1.383156178476802e-05, + "loss": 0.42, + "step": 6851 + }, + { + "epoch": 0.39, + "grad_norm": 0.3264987173804153, + "learning_rate": 1.3829842824034016e-05, + "loss": 0.3119, + "step": 6852 + }, + { + "epoch": 0.39, + "grad_norm": 0.4714716770949217, + "learning_rate": 1.382812373067272e-05, + "loss": 0.3138, + "step": 6853 + }, + { + "epoch": 0.39, + "grad_norm": 0.8911497620521466, + "learning_rate": 1.3826404504743662e-05, + "loss": 0.57, + "step": 6854 + }, + { + "epoch": 0.39, + "grad_norm": 0.33170515753882024, + "learning_rate": 1.3824685146306385e-05, + "loss": 0.2292, + "step": 6855 + }, + { + "epoch": 0.39, + "grad_norm": 0.28216547828961547, + "learning_rate": 1.3822965655420422e-05, + "loss": 0.2139, + "step": 6856 + }, + { + "epoch": 0.39, + "grad_norm": 0.735537001474033, + "learning_rate": 1.3821246032145324e-05, + "loss": 0.4583, + "step": 6857 + }, + { + "epoch": 0.39, + "grad_norm": 0.5961310337286932, + "learning_rate": 1.381952627654064e-05, + "loss": 0.4259, + "step": 6858 + }, + { + "epoch": 0.39, + "grad_norm": 0.3900981399016567, + "learning_rate": 1.3817806388665926e-05, + "loss": 0.2397, + "step": 6859 + }, + { + "epoch": 0.39, + "grad_norm": 0.37331750009126125, + "learning_rate": 1.381608636858074e-05, + "loss": 0.3157, + "step": 6860 + }, + { + "epoch": 0.39, + "grad_norm": 0.5407319177531821, + "learning_rate": 1.3814366216344647e-05, + "loss": 0.3703, + "step": 6861 + }, + { + "epoch": 0.39, + "grad_norm": 0.22703111036296625, + "learning_rate": 1.3812645932017217e-05, + "loss": 0.1943, + "step": 6862 + }, + { + "epoch": 0.39, + "grad_norm": 0.653693685219927, + "learning_rate": 1.3810925515658022e-05, + "loss": 0.3258, + "step": 6863 + }, + { + "epoch": 0.39, + "grad_norm": 0.4038868654554686, + "learning_rate": 1.3809204967326641e-05, + "loss": 0.3235, + "step": 6864 + }, + { + "epoch": 0.39, + "grad_norm": 0.40131278450352875, + "learning_rate": 1.3807484287082655e-05, + "loss": 0.3271, + "step": 6865 + }, + { + "epoch": 0.39, + "grad_norm": 0.8587126265813383, + "learning_rate": 1.3805763474985651e-05, + "loss": 0.2415, + "step": 6866 + }, + { + "epoch": 0.39, + "grad_norm": 0.3657681750663915, + "learning_rate": 1.3804042531095223e-05, + "loss": 0.3043, + "step": 6867 + }, + { + "epoch": 0.39, + "grad_norm": 0.31855912993435875, + "learning_rate": 1.3802321455470967e-05, + "loss": 0.2917, + "step": 6868 + }, + { + "epoch": 0.39, + "grad_norm": 0.4853379729560478, + "learning_rate": 1.3800600248172478e-05, + "loss": 0.2178, + "step": 6869 + }, + { + "epoch": 0.39, + "grad_norm": 0.39579207655536114, + "learning_rate": 1.3798878909259368e-05, + "loss": 0.3008, + "step": 6870 + }, + { + "epoch": 0.39, + "grad_norm": 0.5117750783727549, + "learning_rate": 1.3797157438791244e-05, + "loss": 0.4075, + "step": 6871 + }, + { + "epoch": 0.39, + "grad_norm": 0.38032603698465695, + "learning_rate": 1.3795435836827724e-05, + "loss": 0.3377, + "step": 6872 + }, + { + "epoch": 0.39, + "grad_norm": 0.5204418610641315, + "learning_rate": 1.3793714103428421e-05, + "loss": 0.1617, + "step": 6873 + }, + { + "epoch": 0.39, + "grad_norm": 0.2955007971124637, + "learning_rate": 1.3791992238652965e-05, + "loss": 0.2467, + "step": 6874 + }, + { + "epoch": 0.4, + "grad_norm": 0.3308877907838149, + "learning_rate": 1.3790270242560977e-05, + "loss": 0.2977, + "step": 6875 + }, + { + "epoch": 0.4, + "grad_norm": 0.38992600679999917, + "learning_rate": 1.3788548115212095e-05, + "loss": 0.2571, + "step": 6876 + }, + { + "epoch": 0.4, + "grad_norm": 0.38401415930593574, + "learning_rate": 1.3786825856665958e-05, + "loss": 0.3058, + "step": 6877 + }, + { + "epoch": 0.4, + "grad_norm": 0.7709559475093668, + "learning_rate": 1.3785103466982199e-05, + "loss": 0.5874, + "step": 6878 + }, + { + "epoch": 0.4, + "grad_norm": 0.3114444710448705, + "learning_rate": 1.3783380946220474e-05, + "loss": 0.2394, + "step": 6879 + }, + { + "epoch": 0.4, + "grad_norm": 0.3432166855056662, + "learning_rate": 1.3781658294440427e-05, + "loss": 0.3115, + "step": 6880 + }, + { + "epoch": 0.4, + "grad_norm": 0.6522356404592244, + "learning_rate": 1.3779935511701717e-05, + "loss": 0.4908, + "step": 6881 + }, + { + "epoch": 0.4, + "grad_norm": 0.21070036258642488, + "learning_rate": 1.3778212598064002e-05, + "loss": 0.1286, + "step": 6882 + }, + { + "epoch": 0.4, + "grad_norm": 0.27838016788335157, + "learning_rate": 1.3776489553586949e-05, + "loss": 0.2634, + "step": 6883 + }, + { + "epoch": 0.4, + "grad_norm": 0.5184621555490639, + "learning_rate": 1.3774766378330221e-05, + "loss": 0.4013, + "step": 6884 + }, + { + "epoch": 0.4, + "grad_norm": 0.615158987776567, + "learning_rate": 1.3773043072353503e-05, + "loss": 0.4176, + "step": 6885 + }, + { + "epoch": 0.4, + "grad_norm": 0.32917103198871883, + "learning_rate": 1.3771319635716459e-05, + "loss": 0.2412, + "step": 6886 + }, + { + "epoch": 0.4, + "grad_norm": 0.48537442050696666, + "learning_rate": 1.376959606847878e-05, + "loss": 0.3794, + "step": 6887 + }, + { + "epoch": 0.4, + "grad_norm": 0.437233197013368, + "learning_rate": 1.376787237070015e-05, + "loss": 0.3628, + "step": 6888 + }, + { + "epoch": 0.4, + "grad_norm": 0.3363888104947281, + "learning_rate": 1.3766148542440265e-05, + "loss": 0.2289, + "step": 6889 + }, + { + "epoch": 0.4, + "grad_norm": 0.2641410487640712, + "learning_rate": 1.3764424583758816e-05, + "loss": 0.1768, + "step": 6890 + }, + { + "epoch": 0.4, + "grad_norm": 0.3430245182460297, + "learning_rate": 1.3762700494715506e-05, + "loss": 0.3178, + "step": 6891 + }, + { + "epoch": 0.4, + "grad_norm": 0.3694883870067206, + "learning_rate": 1.3760976275370039e-05, + "loss": 0.2562, + "step": 6892 + }, + { + "epoch": 0.4, + "grad_norm": 0.6870317668549498, + "learning_rate": 1.3759251925782127e-05, + "loss": 0.4852, + "step": 6893 + }, + { + "epoch": 0.4, + "grad_norm": 1.2038599651172182, + "learning_rate": 1.3757527446011479e-05, + "loss": 0.8427, + "step": 6894 + }, + { + "epoch": 0.4, + "grad_norm": 0.2434935746268074, + "learning_rate": 1.375580283611782e-05, + "loss": 0.1944, + "step": 6895 + }, + { + "epoch": 0.4, + "grad_norm": 0.3256197247270627, + "learning_rate": 1.3754078096160871e-05, + "loss": 0.2661, + "step": 6896 + }, + { + "epoch": 0.4, + "grad_norm": 0.7596028889004636, + "learning_rate": 1.3752353226200359e-05, + "loss": 0.4223, + "step": 6897 + }, + { + "epoch": 0.4, + "grad_norm": 0.3579206665965782, + "learning_rate": 1.3750628226296012e-05, + "loss": 0.2608, + "step": 6898 + }, + { + "epoch": 0.4, + "grad_norm": 0.42991083239765604, + "learning_rate": 1.3748903096507576e-05, + "loss": 0.2457, + "step": 6899 + }, + { + "epoch": 0.4, + "grad_norm": 0.46276695668103407, + "learning_rate": 1.3747177836894783e-05, + "loss": 0.4311, + "step": 6900 + }, + { + "epoch": 0.4, + "grad_norm": 0.33938255692060043, + "learning_rate": 1.3745452447517384e-05, + "loss": 0.253, + "step": 6901 + }, + { + "epoch": 0.4, + "grad_norm": 0.33146128208950326, + "learning_rate": 1.3743726928435129e-05, + "loss": 0.1271, + "step": 6902 + }, + { + "epoch": 0.4, + "grad_norm": 0.4516192355561343, + "learning_rate": 1.3742001279707771e-05, + "loss": 0.3194, + "step": 6903 + }, + { + "epoch": 0.4, + "grad_norm": 0.3831757519966381, + "learning_rate": 1.3740275501395068e-05, + "loss": 0.2835, + "step": 6904 + }, + { + "epoch": 0.4, + "grad_norm": 0.7703354652013342, + "learning_rate": 1.3738549593556787e-05, + "loss": 0.255, + "step": 6905 + }, + { + "epoch": 0.4, + "grad_norm": 0.6825758957008587, + "learning_rate": 1.3736823556252694e-05, + "loss": 0.5165, + "step": 6906 + }, + { + "epoch": 0.4, + "grad_norm": 0.3866563201141378, + "learning_rate": 1.373509738954256e-05, + "loss": 0.3017, + "step": 6907 + }, + { + "epoch": 0.4, + "grad_norm": 0.2428939493291775, + "learning_rate": 1.3733371093486168e-05, + "loss": 0.2285, + "step": 6908 + }, + { + "epoch": 0.4, + "grad_norm": 0.6721461211190297, + "learning_rate": 1.3731644668143291e-05, + "loss": 0.4576, + "step": 6909 + }, + { + "epoch": 0.4, + "grad_norm": 0.3443745352273169, + "learning_rate": 1.3729918113573723e-05, + "loss": 0.2499, + "step": 6910 + }, + { + "epoch": 0.4, + "grad_norm": 0.3699209177126759, + "learning_rate": 1.3728191429837247e-05, + "loss": 0.3297, + "step": 6911 + }, + { + "epoch": 0.4, + "grad_norm": 0.9218896511237294, + "learning_rate": 1.3726464616993667e-05, + "loss": 0.5562, + "step": 6912 + }, + { + "epoch": 0.4, + "grad_norm": 0.36061835491190963, + "learning_rate": 1.3724737675102774e-05, + "loss": 0.251, + "step": 6913 + }, + { + "epoch": 0.4, + "grad_norm": 0.6697499130698259, + "learning_rate": 1.3723010604224381e-05, + "loss": 0.4177, + "step": 6914 + }, + { + "epoch": 0.4, + "grad_norm": 0.22123181145234125, + "learning_rate": 1.3721283404418283e-05, + "loss": 0.1735, + "step": 6915 + }, + { + "epoch": 0.4, + "grad_norm": 0.31925352597321077, + "learning_rate": 1.3719556075744306e-05, + "loss": 0.2455, + "step": 6916 + }, + { + "epoch": 0.4, + "grad_norm": 0.9239303856324638, + "learning_rate": 1.3717828618262261e-05, + "loss": 0.5894, + "step": 6917 + }, + { + "epoch": 0.4, + "grad_norm": 0.5162709979789786, + "learning_rate": 1.3716101032031972e-05, + "loss": 0.3261, + "step": 6918 + }, + { + "epoch": 0.4, + "grad_norm": 0.29582580195856606, + "learning_rate": 1.371437331711326e-05, + "loss": 0.2552, + "step": 6919 + }, + { + "epoch": 0.4, + "grad_norm": 0.8585276824390256, + "learning_rate": 1.3712645473565964e-05, + "loss": 0.4624, + "step": 6920 + }, + { + "epoch": 0.4, + "grad_norm": 0.3036535597617637, + "learning_rate": 1.3710917501449911e-05, + "loss": 0.1622, + "step": 6921 + }, + { + "epoch": 0.4, + "grad_norm": 0.375439624708647, + "learning_rate": 1.3709189400824948e-05, + "loss": 0.2807, + "step": 6922 + }, + { + "epoch": 0.4, + "grad_norm": 0.38903684691791973, + "learning_rate": 1.3707461171750916e-05, + "loss": 0.2963, + "step": 6923 + }, + { + "epoch": 0.4, + "grad_norm": 1.3030138881612745, + "learning_rate": 1.370573281428766e-05, + "loss": 0.8033, + "step": 6924 + }, + { + "epoch": 0.4, + "grad_norm": 0.3366170348258912, + "learning_rate": 1.3704004328495037e-05, + "loss": 0.2032, + "step": 6925 + }, + { + "epoch": 0.4, + "grad_norm": 0.8395341835724895, + "learning_rate": 1.3702275714432905e-05, + "loss": 0.4634, + "step": 6926 + }, + { + "epoch": 0.4, + "grad_norm": 0.22712153378717959, + "learning_rate": 1.3700546972161121e-05, + "loss": 0.2303, + "step": 6927 + }, + { + "epoch": 0.4, + "grad_norm": 0.3062410578990784, + "learning_rate": 1.3698818101739554e-05, + "loss": 0.1977, + "step": 6928 + }, + { + "epoch": 0.4, + "grad_norm": 1.0418163317880769, + "learning_rate": 1.3697089103228081e-05, + "loss": 0.7044, + "step": 6929 + }, + { + "epoch": 0.4, + "grad_norm": 0.7665050260008015, + "learning_rate": 1.3695359976686568e-05, + "loss": 0.52, + "step": 6930 + }, + { + "epoch": 0.4, + "grad_norm": 0.2718208821347409, + "learning_rate": 1.3693630722174898e-05, + "loss": 0.225, + "step": 6931 + }, + { + "epoch": 0.4, + "grad_norm": 0.5994959857163412, + "learning_rate": 1.3691901339752955e-05, + "loss": 0.48, + "step": 6932 + }, + { + "epoch": 0.4, + "grad_norm": 0.42763359444722393, + "learning_rate": 1.3690171829480628e-05, + "loss": 0.279, + "step": 6933 + }, + { + "epoch": 0.4, + "grad_norm": 0.2819399462682956, + "learning_rate": 1.3688442191417805e-05, + "loss": 0.169, + "step": 6934 + }, + { + "epoch": 0.4, + "grad_norm": 0.3855701660647398, + "learning_rate": 1.3686712425624393e-05, + "loss": 0.3225, + "step": 6935 + }, + { + "epoch": 0.4, + "grad_norm": 0.9462468602813544, + "learning_rate": 1.3684982532160285e-05, + "loss": 0.5966, + "step": 6936 + }, + { + "epoch": 0.4, + "grad_norm": 0.34783075864199253, + "learning_rate": 1.3683252511085391e-05, + "loss": 0.2796, + "step": 6937 + }, + { + "epoch": 0.4, + "grad_norm": 0.702993674672384, + "learning_rate": 1.3681522362459623e-05, + "loss": 0.3554, + "step": 6938 + }, + { + "epoch": 0.4, + "grad_norm": 0.2971323853404743, + "learning_rate": 1.3679792086342892e-05, + "loss": 0.2779, + "step": 6939 + }, + { + "epoch": 0.4, + "grad_norm": 0.2884143111686183, + "learning_rate": 1.367806168279512e-05, + "loss": 0.2304, + "step": 6940 + }, + { + "epoch": 0.4, + "grad_norm": 0.5212153825592891, + "learning_rate": 1.3676331151876227e-05, + "loss": 0.2851, + "step": 6941 + }, + { + "epoch": 0.4, + "grad_norm": 0.4431896490235908, + "learning_rate": 1.3674600493646146e-05, + "loss": 0.3398, + "step": 6942 + }, + { + "epoch": 0.4, + "grad_norm": 0.4475954113556971, + "learning_rate": 1.367286970816481e-05, + "loss": 0.2763, + "step": 6943 + }, + { + "epoch": 0.4, + "grad_norm": 0.669305965541524, + "learning_rate": 1.3671138795492155e-05, + "loss": 0.3324, + "step": 6944 + }, + { + "epoch": 0.4, + "grad_norm": 0.3080889359061354, + "learning_rate": 1.3669407755688117e-05, + "loss": 0.1614, + "step": 6945 + }, + { + "epoch": 0.4, + "grad_norm": 0.40932396074492605, + "learning_rate": 1.366767658881265e-05, + "loss": 0.2977, + "step": 6946 + }, + { + "epoch": 0.4, + "grad_norm": 0.32701274366526195, + "learning_rate": 1.36659452949257e-05, + "loss": 0.2876, + "step": 6947 + }, + { + "epoch": 0.4, + "grad_norm": 0.7057822249995881, + "learning_rate": 1.3664213874087223e-05, + "loss": 0.3661, + "step": 6948 + }, + { + "epoch": 0.4, + "grad_norm": 0.3797999918281963, + "learning_rate": 1.3662482326357172e-05, + "loss": 0.28, + "step": 6949 + }, + { + "epoch": 0.4, + "grad_norm": 0.5054705360107299, + "learning_rate": 1.366075065179552e-05, + "loss": 0.373, + "step": 6950 + }, + { + "epoch": 0.4, + "grad_norm": 0.4181068108631131, + "learning_rate": 1.3659018850462226e-05, + "loss": 0.2382, + "step": 6951 + }, + { + "epoch": 0.4, + "grad_norm": 0.2658521324344886, + "learning_rate": 1.3657286922417272e-05, + "loss": 0.1952, + "step": 6952 + }, + { + "epoch": 0.4, + "grad_norm": 0.33355700539346833, + "learning_rate": 1.3655554867720623e-05, + "loss": 0.2598, + "step": 6953 + }, + { + "epoch": 0.4, + "grad_norm": 0.7434397980880477, + "learning_rate": 1.3653822686432271e-05, + "loss": 0.3192, + "step": 6954 + }, + { + "epoch": 0.4, + "grad_norm": 0.30312485770426095, + "learning_rate": 1.3652090378612198e-05, + "loss": 0.2706, + "step": 6955 + }, + { + "epoch": 0.4, + "grad_norm": 1.0086654469423801, + "learning_rate": 1.3650357944320387e-05, + "loss": 0.6798, + "step": 6956 + }, + { + "epoch": 0.4, + "grad_norm": 0.2880087205351191, + "learning_rate": 1.3648625383616841e-05, + "loss": 0.1437, + "step": 6957 + }, + { + "epoch": 0.4, + "grad_norm": 0.29183807559357594, + "learning_rate": 1.3646892696561554e-05, + "loss": 0.2153, + "step": 6958 + }, + { + "epoch": 0.4, + "grad_norm": 0.36485493480482983, + "learning_rate": 1.3645159883214528e-05, + "loss": 0.3043, + "step": 6959 + }, + { + "epoch": 0.4, + "grad_norm": 0.6233292098008584, + "learning_rate": 1.3643426943635774e-05, + "loss": 0.4441, + "step": 6960 + }, + { + "epoch": 0.4, + "grad_norm": 0.35202902887011656, + "learning_rate": 1.36416938778853e-05, + "loss": 0.1655, + "step": 6961 + }, + { + "epoch": 0.4, + "grad_norm": 0.4126226172553717, + "learning_rate": 1.3639960686023126e-05, + "loss": 0.3344, + "step": 6962 + }, + { + "epoch": 0.4, + "grad_norm": 0.37428085092700025, + "learning_rate": 1.3638227368109268e-05, + "loss": 0.3312, + "step": 6963 + }, + { + "epoch": 0.4, + "grad_norm": 0.30610032926280945, + "learning_rate": 1.3636493924203756e-05, + "loss": 0.1934, + "step": 6964 + }, + { + "epoch": 0.4, + "grad_norm": 0.31909149409640514, + "learning_rate": 1.3634760354366612e-05, + "loss": 0.2953, + "step": 6965 + }, + { + "epoch": 0.4, + "grad_norm": 0.816373087029119, + "learning_rate": 1.3633026658657872e-05, + "loss": 0.6193, + "step": 6966 + }, + { + "epoch": 0.4, + "grad_norm": 0.2950274786523385, + "learning_rate": 1.3631292837137577e-05, + "loss": 0.2384, + "step": 6967 + }, + { + "epoch": 0.4, + "grad_norm": 0.34463380268308075, + "learning_rate": 1.3629558889865768e-05, + "loss": 0.2693, + "step": 6968 + }, + { + "epoch": 0.4, + "grad_norm": 1.445343970358979, + "learning_rate": 1.3627824816902494e-05, + "loss": 0.7602, + "step": 6969 + }, + { + "epoch": 0.4, + "grad_norm": 0.2910854511165072, + "learning_rate": 1.3626090618307796e-05, + "loss": 0.2366, + "step": 6970 + }, + { + "epoch": 0.4, + "grad_norm": 0.3727574615944739, + "learning_rate": 1.3624356294141738e-05, + "loss": 0.347, + "step": 6971 + }, + { + "epoch": 0.4, + "grad_norm": 0.4251921909257409, + "learning_rate": 1.3622621844464379e-05, + "loss": 0.2888, + "step": 6972 + }, + { + "epoch": 0.4, + "grad_norm": 0.3357843692589245, + "learning_rate": 1.362088726933578e-05, + "loss": 0.2384, + "step": 6973 + }, + { + "epoch": 0.4, + "grad_norm": 0.36993298536045216, + "learning_rate": 1.361915256881601e-05, + "loss": 0.2396, + "step": 6974 + }, + { + "epoch": 0.4, + "grad_norm": 0.3614041412393671, + "learning_rate": 1.3617417742965144e-05, + "loss": 0.3096, + "step": 6975 + }, + { + "epoch": 0.4, + "grad_norm": 0.37952388985458196, + "learning_rate": 1.3615682791843257e-05, + "loss": 0.3231, + "step": 6976 + }, + { + "epoch": 0.4, + "grad_norm": 0.3901412474468786, + "learning_rate": 1.3613947715510429e-05, + "loss": 0.3072, + "step": 6977 + }, + { + "epoch": 0.4, + "grad_norm": 0.36218398221854425, + "learning_rate": 1.3612212514026746e-05, + "loss": 0.3417, + "step": 6978 + }, + { + "epoch": 0.4, + "grad_norm": 0.8055590911647899, + "learning_rate": 1.3610477187452303e-05, + "loss": 0.4594, + "step": 6979 + }, + { + "epoch": 0.4, + "grad_norm": 0.2289632841507798, + "learning_rate": 1.3608741735847186e-05, + "loss": 0.1503, + "step": 6980 + }, + { + "epoch": 0.4, + "grad_norm": 0.6391920773085977, + "learning_rate": 1.3607006159271503e-05, + "loss": 0.4503, + "step": 6981 + }, + { + "epoch": 0.4, + "grad_norm": 0.3301091566046304, + "learning_rate": 1.3605270457785346e-05, + "loss": 0.3016, + "step": 6982 + }, + { + "epoch": 0.4, + "grad_norm": 0.3309863876921998, + "learning_rate": 1.3603534631448831e-05, + "loss": 0.2646, + "step": 6983 + }, + { + "epoch": 0.4, + "grad_norm": 1.2898443402936977, + "learning_rate": 1.3601798680322068e-05, + "loss": 0.7883, + "step": 6984 + }, + { + "epoch": 0.4, + "grad_norm": 0.34812557437044317, + "learning_rate": 1.3600062604465168e-05, + "loss": 0.2385, + "step": 6985 + }, + { + "epoch": 0.4, + "grad_norm": 0.277167302217143, + "learning_rate": 1.3598326403938255e-05, + "loss": 0.2632, + "step": 6986 + }, + { + "epoch": 0.4, + "grad_norm": 0.5174657148756132, + "learning_rate": 1.3596590078801458e-05, + "loss": 0.3016, + "step": 6987 + }, + { + "epoch": 0.4, + "grad_norm": 0.4792618762975345, + "learning_rate": 1.3594853629114896e-05, + "loss": 0.3499, + "step": 6988 + }, + { + "epoch": 0.4, + "grad_norm": 0.41025013782946984, + "learning_rate": 1.359311705493871e-05, + "loss": 0.312, + "step": 6989 + }, + { + "epoch": 0.4, + "grad_norm": 0.3738632934541031, + "learning_rate": 1.3591380356333038e-05, + "loss": 0.2848, + "step": 6990 + }, + { + "epoch": 0.4, + "grad_norm": 0.5999221057686619, + "learning_rate": 1.3589643533358013e-05, + "loss": 0.3353, + "step": 6991 + }, + { + "epoch": 0.4, + "grad_norm": 0.32699926243927424, + "learning_rate": 1.358790658607379e-05, + "loss": 0.2355, + "step": 6992 + }, + { + "epoch": 0.4, + "grad_norm": 0.43293676610358867, + "learning_rate": 1.3586169514540512e-05, + "loss": 0.2488, + "step": 6993 + }, + { + "epoch": 0.4, + "grad_norm": 0.32886181991925645, + "learning_rate": 1.3584432318818344e-05, + "loss": 0.3064, + "step": 6994 + }, + { + "epoch": 0.4, + "grad_norm": 0.35377350978253397, + "learning_rate": 1.3582694998967434e-05, + "loss": 0.316, + "step": 6995 + }, + { + "epoch": 0.4, + "grad_norm": 0.685901668579314, + "learning_rate": 1.3580957555047953e-05, + "loss": 0.4758, + "step": 6996 + }, + { + "epoch": 0.4, + "grad_norm": 0.6877362501402533, + "learning_rate": 1.3579219987120065e-05, + "loss": 0.4549, + "step": 6997 + }, + { + "epoch": 0.4, + "grad_norm": 0.27094743803612736, + "learning_rate": 1.3577482295243944e-05, + "loss": 0.2349, + "step": 6998 + }, + { + "epoch": 0.4, + "grad_norm": 0.3247502461542125, + "learning_rate": 1.3575744479479764e-05, + "loss": 0.2856, + "step": 6999 + }, + { + "epoch": 0.4, + "grad_norm": 0.6147448714871042, + "learning_rate": 1.3574006539887707e-05, + "loss": 0.2349, + "step": 7000 + }, + { + "epoch": 0.4, + "grad_norm": 0.38562569593393564, + "learning_rate": 1.3572268476527954e-05, + "loss": 0.2908, + "step": 7001 + }, + { + "epoch": 0.4, + "grad_norm": 0.3642951593240954, + "learning_rate": 1.3570530289460701e-05, + "loss": 0.3264, + "step": 7002 + }, + { + "epoch": 0.4, + "grad_norm": 0.5235074663626574, + "learning_rate": 1.3568791978746137e-05, + "loss": 0.2494, + "step": 7003 + }, + { + "epoch": 0.4, + "grad_norm": 0.2952802191250271, + "learning_rate": 1.356705354444446e-05, + "loss": 0.2524, + "step": 7004 + }, + { + "epoch": 0.4, + "grad_norm": 0.946358029879268, + "learning_rate": 1.3565314986615871e-05, + "loss": 0.4903, + "step": 7005 + }, + { + "epoch": 0.4, + "grad_norm": 0.23563139165705427, + "learning_rate": 1.3563576305320579e-05, + "loss": 0.1791, + "step": 7006 + }, + { + "epoch": 0.4, + "grad_norm": 0.40333290157473234, + "learning_rate": 1.356183750061879e-05, + "loss": 0.3251, + "step": 7007 + }, + { + "epoch": 0.4, + "grad_norm": 1.2051917632433689, + "learning_rate": 1.3560098572570725e-05, + "loss": 0.821, + "step": 7008 + }, + { + "epoch": 0.4, + "grad_norm": 0.37974187511583635, + "learning_rate": 1.35583595212366e-05, + "loss": 0.1655, + "step": 7009 + }, + { + "epoch": 0.4, + "grad_norm": 0.3522486679741345, + "learning_rate": 1.3556620346676633e-05, + "loss": 0.2931, + "step": 7010 + }, + { + "epoch": 0.4, + "grad_norm": 0.6771158949431482, + "learning_rate": 1.355488104895106e-05, + "loss": 0.3763, + "step": 7011 + }, + { + "epoch": 0.4, + "grad_norm": 0.2299304342135175, + "learning_rate": 1.3553141628120107e-05, + "loss": 0.1646, + "step": 7012 + }, + { + "epoch": 0.4, + "grad_norm": 0.33815698261225796, + "learning_rate": 1.3551402084244014e-05, + "loss": 0.2277, + "step": 7013 + }, + { + "epoch": 0.4, + "grad_norm": 0.3550244974142543, + "learning_rate": 1.3549662417383018e-05, + "loss": 0.3468, + "step": 7014 + }, + { + "epoch": 0.4, + "grad_norm": 1.499478939324147, + "learning_rate": 1.354792262759737e-05, + "loss": 0.7654, + "step": 7015 + }, + { + "epoch": 0.4, + "grad_norm": 0.33123396540469313, + "learning_rate": 1.3546182714947309e-05, + "loss": 0.2452, + "step": 7016 + }, + { + "epoch": 0.4, + "grad_norm": 0.30626982106831946, + "learning_rate": 1.3544442679493095e-05, + "loss": 0.2432, + "step": 7017 + }, + { + "epoch": 0.4, + "grad_norm": 0.36381971525818, + "learning_rate": 1.3542702521294981e-05, + "loss": 0.3203, + "step": 7018 + }, + { + "epoch": 0.4, + "grad_norm": 0.2968935067872526, + "learning_rate": 1.3540962240413233e-05, + "loss": 0.2144, + "step": 7019 + }, + { + "epoch": 0.4, + "grad_norm": 0.9758630593144416, + "learning_rate": 1.3539221836908113e-05, + "loss": 0.6957, + "step": 7020 + }, + { + "epoch": 0.4, + "grad_norm": 0.47204156166840916, + "learning_rate": 1.3537481310839897e-05, + "loss": 0.351, + "step": 7021 + }, + { + "epoch": 0.4, + "grad_norm": 0.31181724975707065, + "learning_rate": 1.353574066226885e-05, + "loss": 0.2339, + "step": 7022 + }, + { + "epoch": 0.4, + "grad_norm": 0.8749376420778082, + "learning_rate": 1.353399989125526e-05, + "loss": 0.4849, + "step": 7023 + }, + { + "epoch": 0.4, + "grad_norm": 0.2859353718730429, + "learning_rate": 1.3532258997859404e-05, + "loss": 0.197, + "step": 7024 + }, + { + "epoch": 0.4, + "grad_norm": 0.3325129256222981, + "learning_rate": 1.3530517982141574e-05, + "loss": 0.2746, + "step": 7025 + }, + { + "epoch": 0.4, + "grad_norm": 0.3507133854637492, + "learning_rate": 1.3528776844162052e-05, + "loss": 0.277, + "step": 7026 + }, + { + "epoch": 0.4, + "grad_norm": 0.5776205911314057, + "learning_rate": 1.3527035583981145e-05, + "loss": 0.4077, + "step": 7027 + }, + { + "epoch": 0.4, + "grad_norm": 0.3720017793660261, + "learning_rate": 1.3525294201659145e-05, + "loss": 0.2941, + "step": 7028 + }, + { + "epoch": 0.4, + "grad_norm": 0.6266844281338557, + "learning_rate": 1.3523552697256359e-05, + "loss": 0.3631, + "step": 7029 + }, + { + "epoch": 0.4, + "grad_norm": 0.3606261379793363, + "learning_rate": 1.3521811070833095e-05, + "loss": 0.2194, + "step": 7030 + }, + { + "epoch": 0.4, + "grad_norm": 0.37657437220486956, + "learning_rate": 1.3520069322449663e-05, + "loss": 0.2835, + "step": 7031 + }, + { + "epoch": 0.4, + "grad_norm": 0.8620238092878418, + "learning_rate": 1.3518327452166385e-05, + "loss": 0.4009, + "step": 7032 + }, + { + "epoch": 0.4, + "grad_norm": 0.654820668093201, + "learning_rate": 1.3516585460043576e-05, + "loss": 0.4591, + "step": 7033 + }, + { + "epoch": 0.4, + "grad_norm": 0.3013622890908199, + "learning_rate": 1.3514843346141566e-05, + "loss": 0.2671, + "step": 7034 + }, + { + "epoch": 0.4, + "grad_norm": 0.4458870442813115, + "learning_rate": 1.3513101110520678e-05, + "loss": 0.3693, + "step": 7035 + }, + { + "epoch": 0.4, + "grad_norm": 0.19906848799136267, + "learning_rate": 1.3511358753241254e-05, + "loss": 0.1152, + "step": 7036 + }, + { + "epoch": 0.4, + "grad_norm": 0.32092656053178614, + "learning_rate": 1.3509616274363623e-05, + "loss": 0.2576, + "step": 7037 + }, + { + "epoch": 0.4, + "grad_norm": 0.3677600497999778, + "learning_rate": 1.3507873673948137e-05, + "loss": 0.3478, + "step": 7038 + }, + { + "epoch": 0.4, + "grad_norm": 0.9170207065182442, + "learning_rate": 1.3506130952055132e-05, + "loss": 0.4109, + "step": 7039 + }, + { + "epoch": 0.4, + "grad_norm": 0.35086745820449605, + "learning_rate": 1.3504388108744966e-05, + "loss": 0.3033, + "step": 7040 + }, + { + "epoch": 0.4, + "grad_norm": 1.099592946999764, + "learning_rate": 1.3502645144077987e-05, + "loss": 0.5843, + "step": 7041 + }, + { + "epoch": 0.4, + "grad_norm": 0.212123843237808, + "learning_rate": 1.350090205811456e-05, + "loss": 0.1806, + "step": 7042 + }, + { + "epoch": 0.4, + "grad_norm": 0.33900519569136367, + "learning_rate": 1.3499158850915044e-05, + "loss": 0.2861, + "step": 7043 + }, + { + "epoch": 0.4, + "grad_norm": 0.8621314063592519, + "learning_rate": 1.3497415522539807e-05, + "loss": 0.6942, + "step": 7044 + }, + { + "epoch": 0.4, + "grad_norm": 0.3746493330911956, + "learning_rate": 1.3495672073049221e-05, + "loss": 0.3091, + "step": 7045 + }, + { + "epoch": 0.4, + "grad_norm": 0.3969087626583301, + "learning_rate": 1.3493928502503664e-05, + "loss": 0.2964, + "step": 7046 + }, + { + "epoch": 0.4, + "grad_norm": 0.48381748985794387, + "learning_rate": 1.3492184810963512e-05, + "loss": 0.3625, + "step": 7047 + }, + { + "epoch": 0.4, + "grad_norm": 0.3345842253885811, + "learning_rate": 1.349044099848915e-05, + "loss": 0.2105, + "step": 7048 + }, + { + "epoch": 0.4, + "grad_norm": 0.30381301666422506, + "learning_rate": 1.3488697065140964e-05, + "loss": 0.1968, + "step": 7049 + }, + { + "epoch": 0.41, + "grad_norm": 0.41334840598392936, + "learning_rate": 1.348695301097935e-05, + "loss": 0.3586, + "step": 7050 + }, + { + "epoch": 0.41, + "grad_norm": 0.7832888653278328, + "learning_rate": 1.3485208836064705e-05, + "loss": 0.4857, + "step": 7051 + }, + { + "epoch": 0.41, + "grad_norm": 0.33241008219443263, + "learning_rate": 1.3483464540457428e-05, + "loss": 0.2076, + "step": 7052 + }, + { + "epoch": 0.41, + "grad_norm": 0.49777611048133946, + "learning_rate": 1.348172012421792e-05, + "loss": 0.3706, + "step": 7053 + }, + { + "epoch": 0.41, + "grad_norm": 0.5163034090447539, + "learning_rate": 1.3479975587406595e-05, + "loss": 0.3745, + "step": 7054 + }, + { + "epoch": 0.41, + "grad_norm": 0.22940568788016039, + "learning_rate": 1.3478230930083868e-05, + "loss": 0.1524, + "step": 7055 + }, + { + "epoch": 0.41, + "grad_norm": 0.9077703091017053, + "learning_rate": 1.3476486152310152e-05, + "loss": 0.444, + "step": 7056 + }, + { + "epoch": 0.41, + "grad_norm": 0.5325747150064066, + "learning_rate": 1.3474741254145868e-05, + "loss": 0.397, + "step": 7057 + }, + { + "epoch": 0.41, + "grad_norm": 0.2938576127412327, + "learning_rate": 1.3472996235651446e-05, + "loss": 0.227, + "step": 7058 + }, + { + "epoch": 0.41, + "grad_norm": 1.2376508865292748, + "learning_rate": 1.3471251096887312e-05, + "loss": 0.6284, + "step": 7059 + }, + { + "epoch": 0.41, + "grad_norm": 0.400031588718824, + "learning_rate": 1.3469505837913903e-05, + "loss": 0.2609, + "step": 7060 + }, + { + "epoch": 0.41, + "grad_norm": 0.3362257964775687, + "learning_rate": 1.3467760458791656e-05, + "loss": 0.2845, + "step": 7061 + }, + { + "epoch": 0.41, + "grad_norm": 0.4483199711195374, + "learning_rate": 1.3466014959581013e-05, + "loss": 0.2888, + "step": 7062 + }, + { + "epoch": 0.41, + "grad_norm": 1.0500941292938073, + "learning_rate": 1.3464269340342422e-05, + "loss": 0.6551, + "step": 7063 + }, + { + "epoch": 0.41, + "grad_norm": 0.35210336824936767, + "learning_rate": 1.346252360113633e-05, + "loss": 0.2358, + "step": 7064 + }, + { + "epoch": 0.41, + "grad_norm": 0.3657147809379971, + "learning_rate": 1.3460777742023202e-05, + "loss": 0.2663, + "step": 7065 + }, + { + "epoch": 0.41, + "grad_norm": 0.503373542099447, + "learning_rate": 1.3459031763063482e-05, + "loss": 0.365, + "step": 7066 + }, + { + "epoch": 0.41, + "grad_norm": 0.42739082768414866, + "learning_rate": 1.3457285664317645e-05, + "loss": 0.3174, + "step": 7067 + }, + { + "epoch": 0.41, + "grad_norm": 0.358482347181096, + "learning_rate": 1.3455539445846151e-05, + "loss": 0.2484, + "step": 7068 + }, + { + "epoch": 0.41, + "grad_norm": 0.3661152740979383, + "learning_rate": 1.3453793107709476e-05, + "loss": 0.3302, + "step": 7069 + }, + { + "epoch": 0.41, + "grad_norm": 0.3814990470872192, + "learning_rate": 1.3452046649968091e-05, + "loss": 0.2572, + "step": 7070 + }, + { + "epoch": 0.41, + "grad_norm": 0.3260559821888633, + "learning_rate": 1.3450300072682485e-05, + "loss": 0.1877, + "step": 7071 + }, + { + "epoch": 0.41, + "grad_norm": 0.823857743699866, + "learning_rate": 1.3448553375913132e-05, + "loss": 0.4651, + "step": 7072 + }, + { + "epoch": 0.41, + "grad_norm": 0.35011088470733437, + "learning_rate": 1.3446806559720525e-05, + "loss": 0.3223, + "step": 7073 + }, + { + "epoch": 0.41, + "grad_norm": 0.35841773511333275, + "learning_rate": 1.3445059624165156e-05, + "loss": 0.3149, + "step": 7074 + }, + { + "epoch": 0.41, + "grad_norm": 0.9612427340225015, + "learning_rate": 1.3443312569307517e-05, + "loss": 0.5688, + "step": 7075 + }, + { + "epoch": 0.41, + "grad_norm": 0.25064310434852977, + "learning_rate": 1.3441565395208114e-05, + "loss": 0.1798, + "step": 7076 + }, + { + "epoch": 0.41, + "grad_norm": 0.5838118579171294, + "learning_rate": 1.343981810192745e-05, + "loss": 0.3781, + "step": 7077 + }, + { + "epoch": 0.41, + "grad_norm": 0.40143952342120387, + "learning_rate": 1.3438070689526033e-05, + "loss": 0.2896, + "step": 7078 + }, + { + "epoch": 0.41, + "grad_norm": 0.36313108466885224, + "learning_rate": 1.3436323158064373e-05, + "loss": 0.2885, + "step": 7079 + }, + { + "epoch": 0.41, + "grad_norm": 0.7995956838032602, + "learning_rate": 1.3434575507602991e-05, + "loss": 0.5193, + "step": 7080 + }, + { + "epoch": 0.41, + "grad_norm": 0.4270076800549399, + "learning_rate": 1.3432827738202407e-05, + "loss": 0.2954, + "step": 7081 + }, + { + "epoch": 0.41, + "grad_norm": 0.3050621499863082, + "learning_rate": 1.3431079849923153e-05, + "loss": 0.1848, + "step": 7082 + }, + { + "epoch": 0.41, + "grad_norm": 0.3510960749297572, + "learning_rate": 1.3429331842825742e-05, + "loss": 0.2506, + "step": 7083 + }, + { + "epoch": 0.41, + "grad_norm": 0.6715341783485381, + "learning_rate": 1.342758371697072e-05, + "loss": 0.3628, + "step": 7084 + }, + { + "epoch": 0.41, + "grad_norm": 0.4085613591910861, + "learning_rate": 1.342583547241862e-05, + "loss": 0.2762, + "step": 7085 + }, + { + "epoch": 0.41, + "grad_norm": 0.32885154493809926, + "learning_rate": 1.3424087109229986e-05, + "loss": 0.3057, + "step": 7086 + }, + { + "epoch": 0.41, + "grad_norm": 1.1248597962403253, + "learning_rate": 1.3422338627465364e-05, + "loss": 0.7771, + "step": 7087 + }, + { + "epoch": 0.41, + "grad_norm": 0.2362658079430318, + "learning_rate": 1.3420590027185301e-05, + "loss": 0.1333, + "step": 7088 + }, + { + "epoch": 0.41, + "grad_norm": 0.27743124046471523, + "learning_rate": 1.3418841308450353e-05, + "loss": 0.2448, + "step": 7089 + }, + { + "epoch": 0.41, + "grad_norm": 1.0089479383803033, + "learning_rate": 1.3417092471321076e-05, + "loss": 0.439, + "step": 7090 + }, + { + "epoch": 0.41, + "grad_norm": 0.34819709868420223, + "learning_rate": 1.3415343515858035e-05, + "loss": 0.2192, + "step": 7091 + }, + { + "epoch": 0.41, + "grad_norm": 0.5070973840014404, + "learning_rate": 1.3413594442121796e-05, + "loss": 0.3956, + "step": 7092 + }, + { + "epoch": 0.41, + "grad_norm": 0.3532321667025554, + "learning_rate": 1.3411845250172928e-05, + "loss": 0.2936, + "step": 7093 + }, + { + "epoch": 0.41, + "grad_norm": 0.3256958107973817, + "learning_rate": 1.3410095940072004e-05, + "loss": 0.1915, + "step": 7094 + }, + { + "epoch": 0.41, + "grad_norm": 0.26218402620283915, + "learning_rate": 1.3408346511879604e-05, + "loss": 0.1904, + "step": 7095 + }, + { + "epoch": 0.41, + "grad_norm": 0.9728861537090818, + "learning_rate": 1.340659696565631e-05, + "loss": 0.461, + "step": 7096 + }, + { + "epoch": 0.41, + "grad_norm": 0.29866630802365746, + "learning_rate": 1.3404847301462713e-05, + "loss": 0.2247, + "step": 7097 + }, + { + "epoch": 0.41, + "grad_norm": 0.5127063422711399, + "learning_rate": 1.3403097519359397e-05, + "loss": 0.3778, + "step": 7098 + }, + { + "epoch": 0.41, + "grad_norm": 1.0938596974308756, + "learning_rate": 1.3401347619406966e-05, + "loss": 0.8696, + "step": 7099 + }, + { + "epoch": 0.41, + "grad_norm": 0.3440710027431961, + "learning_rate": 1.3399597601666008e-05, + "loss": 0.214, + "step": 7100 + }, + { + "epoch": 0.41, + "grad_norm": 0.33249458020431705, + "learning_rate": 1.3397847466197133e-05, + "loss": 0.2699, + "step": 7101 + }, + { + "epoch": 0.41, + "grad_norm": 0.3150673143037075, + "learning_rate": 1.3396097213060943e-05, + "loss": 0.3037, + "step": 7102 + }, + { + "epoch": 0.41, + "grad_norm": 0.9226442021708258, + "learning_rate": 1.3394346842318058e-05, + "loss": 0.5105, + "step": 7103 + }, + { + "epoch": 0.41, + "grad_norm": 0.3788209793940605, + "learning_rate": 1.3392596354029084e-05, + "loss": 0.2471, + "step": 7104 + }, + { + "epoch": 0.41, + "grad_norm": 0.36634771446717296, + "learning_rate": 1.3390845748254645e-05, + "loss": 0.3481, + "step": 7105 + }, + { + "epoch": 0.41, + "grad_norm": 0.6158781133726381, + "learning_rate": 1.3389095025055363e-05, + "loss": 0.4241, + "step": 7106 + }, + { + "epoch": 0.41, + "grad_norm": 0.3631589753231446, + "learning_rate": 1.3387344184491869e-05, + "loss": 0.2813, + "step": 7107 + }, + { + "epoch": 0.41, + "grad_norm": 0.27955123400877324, + "learning_rate": 1.3385593226624787e-05, + "loss": 0.1621, + "step": 7108 + }, + { + "epoch": 0.41, + "grad_norm": 0.3783695100880318, + "learning_rate": 1.338384215151476e-05, + "loss": 0.2813, + "step": 7109 + }, + { + "epoch": 0.41, + "grad_norm": 0.6041099103124075, + "learning_rate": 1.3382090959222425e-05, + "loss": 0.3282, + "step": 7110 + }, + { + "epoch": 0.41, + "grad_norm": 1.1035000418322451, + "learning_rate": 1.3380339649808425e-05, + "loss": 0.4968, + "step": 7111 + }, + { + "epoch": 0.41, + "grad_norm": 0.4884716979614059, + "learning_rate": 1.337858822333341e-05, + "loss": 0.3715, + "step": 7112 + }, + { + "epoch": 0.41, + "grad_norm": 0.3107210491119011, + "learning_rate": 1.3376836679858026e-05, + "loss": 0.2974, + "step": 7113 + }, + { + "epoch": 0.41, + "grad_norm": 0.22911849054150063, + "learning_rate": 1.3375085019442937e-05, + "loss": 0.1643, + "step": 7114 + }, + { + "epoch": 0.41, + "grad_norm": 0.5691552869077963, + "learning_rate": 1.3373333242148796e-05, + "loss": 0.3392, + "step": 7115 + }, + { + "epoch": 0.41, + "grad_norm": 0.49816342959997756, + "learning_rate": 1.337158134803627e-05, + "loss": 0.3428, + "step": 7116 + }, + { + "epoch": 0.41, + "grad_norm": 0.4375022231297627, + "learning_rate": 1.3369829337166031e-05, + "loss": 0.3051, + "step": 7117 + }, + { + "epoch": 0.41, + "grad_norm": 0.5267758958092926, + "learning_rate": 1.3368077209598744e-05, + "loss": 0.3613, + "step": 7118 + }, + { + "epoch": 0.41, + "grad_norm": 0.3807706588481771, + "learning_rate": 1.3366324965395088e-05, + "loss": 0.3324, + "step": 7119 + }, + { + "epoch": 0.41, + "grad_norm": 0.23055407507354644, + "learning_rate": 1.3364572604615744e-05, + "loss": 0.156, + "step": 7120 + }, + { + "epoch": 0.41, + "grad_norm": 0.47250351493960807, + "learning_rate": 1.3362820127321391e-05, + "loss": 0.3263, + "step": 7121 + }, + { + "epoch": 0.41, + "grad_norm": 0.40387974042821045, + "learning_rate": 1.3361067533572726e-05, + "loss": 0.2879, + "step": 7122 + }, + { + "epoch": 0.41, + "grad_norm": 0.6065089402375893, + "learning_rate": 1.3359314823430436e-05, + "loss": 0.5049, + "step": 7123 + }, + { + "epoch": 0.41, + "grad_norm": 0.48384521210853454, + "learning_rate": 1.335756199695522e-05, + "loss": 0.3052, + "step": 7124 + }, + { + "epoch": 0.41, + "grad_norm": 0.30084930743731136, + "learning_rate": 1.3355809054207774e-05, + "loss": 0.2649, + "step": 7125 + }, + { + "epoch": 0.41, + "grad_norm": 0.9024344911983133, + "learning_rate": 1.3354055995248805e-05, + "loss": 0.6037, + "step": 7126 + }, + { + "epoch": 0.41, + "grad_norm": 0.19286218457362345, + "learning_rate": 1.335230282013902e-05, + "loss": 0.1028, + "step": 7127 + }, + { + "epoch": 0.41, + "grad_norm": 0.3553929943930889, + "learning_rate": 1.3350549528939135e-05, + "loss": 0.3059, + "step": 7128 + }, + { + "epoch": 0.41, + "grad_norm": 0.3556388786278789, + "learning_rate": 1.3348796121709862e-05, + "loss": 0.3285, + "step": 7129 + }, + { + "epoch": 0.41, + "grad_norm": 0.6547730919119816, + "learning_rate": 1.3347042598511926e-05, + "loss": 0.2819, + "step": 7130 + }, + { + "epoch": 0.41, + "grad_norm": 0.36796600659289685, + "learning_rate": 1.3345288959406045e-05, + "loss": 0.2986, + "step": 7131 + }, + { + "epoch": 0.41, + "grad_norm": 0.37729407521418595, + "learning_rate": 1.3343535204452953e-05, + "loss": 0.224, + "step": 7132 + }, + { + "epoch": 0.41, + "grad_norm": 0.2750629669350354, + "learning_rate": 1.3341781333713381e-05, + "loss": 0.1908, + "step": 7133 + }, + { + "epoch": 0.41, + "grad_norm": 0.44690214839091336, + "learning_rate": 1.3340027347248068e-05, + "loss": 0.3296, + "step": 7134 + }, + { + "epoch": 0.41, + "grad_norm": 1.1545043346285377, + "learning_rate": 1.3338273245117745e-05, + "loss": 0.4603, + "step": 7135 + }, + { + "epoch": 0.41, + "grad_norm": 0.40566944235161556, + "learning_rate": 1.3336519027383168e-05, + "loss": 0.3405, + "step": 7136 + }, + { + "epoch": 0.41, + "grad_norm": 0.31670328181132457, + "learning_rate": 1.3334764694105079e-05, + "loss": 0.2335, + "step": 7137 + }, + { + "epoch": 0.41, + "grad_norm": 0.8579819449562148, + "learning_rate": 1.3333010245344232e-05, + "loss": 0.5027, + "step": 7138 + }, + { + "epoch": 0.41, + "grad_norm": 0.30341933853948494, + "learning_rate": 1.3331255681161386e-05, + "loss": 0.17, + "step": 7139 + }, + { + "epoch": 0.41, + "grad_norm": 0.38527026955972754, + "learning_rate": 1.3329501001617294e-05, + "loss": 0.2346, + "step": 7140 + }, + { + "epoch": 0.41, + "grad_norm": 0.5092949576398142, + "learning_rate": 1.332774620677273e-05, + "loss": 0.3329, + "step": 7141 + }, + { + "epoch": 0.41, + "grad_norm": 1.7811634793100883, + "learning_rate": 1.3325991296688455e-05, + "loss": 0.7817, + "step": 7142 + }, + { + "epoch": 0.41, + "grad_norm": 0.3345933775300051, + "learning_rate": 1.3324236271425245e-05, + "loss": 0.2055, + "step": 7143 + }, + { + "epoch": 0.41, + "grad_norm": 1.1826649808614909, + "learning_rate": 1.3322481131043876e-05, + "loss": 0.6385, + "step": 7144 + }, + { + "epoch": 0.41, + "grad_norm": 0.25436360208374936, + "learning_rate": 1.332072587560513e-05, + "loss": 0.2162, + "step": 7145 + }, + { + "epoch": 0.41, + "grad_norm": 0.3245351746881674, + "learning_rate": 1.3318970505169786e-05, + "loss": 0.2445, + "step": 7146 + }, + { + "epoch": 0.41, + "grad_norm": 0.7222227073899474, + "learning_rate": 1.3317215019798639e-05, + "loss": 0.497, + "step": 7147 + }, + { + "epoch": 0.41, + "grad_norm": 0.3898074555572452, + "learning_rate": 1.3315459419552477e-05, + "loss": 0.3174, + "step": 7148 + }, + { + "epoch": 0.41, + "grad_norm": 0.357549111885812, + "learning_rate": 1.33137037044921e-05, + "loss": 0.2622, + "step": 7149 + }, + { + "epoch": 0.41, + "grad_norm": 0.8507678942147663, + "learning_rate": 1.3311947874678306e-05, + "loss": 0.2878, + "step": 7150 + }, + { + "epoch": 0.41, + "grad_norm": 0.33806956641060104, + "learning_rate": 1.3310191930171898e-05, + "loss": 0.2648, + "step": 7151 + }, + { + "epoch": 0.41, + "grad_norm": 0.41027889521502914, + "learning_rate": 1.3308435871033687e-05, + "loss": 0.2778, + "step": 7152 + }, + { + "epoch": 0.41, + "grad_norm": 0.3561677104384658, + "learning_rate": 1.3306679697324485e-05, + "loss": 0.3, + "step": 7153 + }, + { + "epoch": 0.41, + "grad_norm": 0.4096178462824716, + "learning_rate": 1.3304923409105104e-05, + "loss": 0.2723, + "step": 7154 + }, + { + "epoch": 0.41, + "grad_norm": 0.38934669547306056, + "learning_rate": 1.3303167006436371e-05, + "loss": 0.278, + "step": 7155 + }, + { + "epoch": 0.41, + "grad_norm": 0.37412796658975134, + "learning_rate": 1.3301410489379103e-05, + "loss": 0.2778, + "step": 7156 + }, + { + "epoch": 0.41, + "grad_norm": 0.7818448980631783, + "learning_rate": 1.3299653857994135e-05, + "loss": 0.4927, + "step": 7157 + }, + { + "epoch": 0.41, + "grad_norm": 0.3268093806978687, + "learning_rate": 1.3297897112342294e-05, + "loss": 0.2912, + "step": 7158 + }, + { + "epoch": 0.41, + "grad_norm": 0.686057582809523, + "learning_rate": 1.3296140252484417e-05, + "loss": 0.419, + "step": 7159 + }, + { + "epoch": 0.41, + "grad_norm": 0.2794205927146474, + "learning_rate": 1.3294383278481346e-05, + "loss": 0.2731, + "step": 7160 + }, + { + "epoch": 0.41, + "grad_norm": 0.306089163013698, + "learning_rate": 1.3292626190393923e-05, + "loss": 0.2232, + "step": 7161 + }, + { + "epoch": 0.41, + "grad_norm": 1.0595231500511526, + "learning_rate": 1.3290868988282999e-05, + "loss": 0.5485, + "step": 7162 + }, + { + "epoch": 0.41, + "grad_norm": 0.7733042625073977, + "learning_rate": 1.328911167220942e-05, + "loss": 0.3675, + "step": 7163 + }, + { + "epoch": 0.41, + "grad_norm": 0.3460305444373183, + "learning_rate": 1.3287354242234047e-05, + "loss": 0.2629, + "step": 7164 + }, + { + "epoch": 0.41, + "grad_norm": 0.3792220485449821, + "learning_rate": 1.3285596698417738e-05, + "loss": 0.3372, + "step": 7165 + }, + { + "epoch": 0.41, + "grad_norm": 0.2128910770084968, + "learning_rate": 1.3283839040821355e-05, + "loss": 0.1166, + "step": 7166 + }, + { + "epoch": 0.41, + "grad_norm": 0.35463706912168896, + "learning_rate": 1.3282081269505771e-05, + "loss": 0.2832, + "step": 7167 + }, + { + "epoch": 0.41, + "grad_norm": 0.5153840336512678, + "learning_rate": 1.3280323384531852e-05, + "loss": 0.3804, + "step": 7168 + }, + { + "epoch": 0.41, + "grad_norm": 0.5089755863319722, + "learning_rate": 1.3278565385960476e-05, + "loss": 0.3391, + "step": 7169 + }, + { + "epoch": 0.41, + "grad_norm": 0.4031836019221641, + "learning_rate": 1.3276807273852522e-05, + "loss": 0.2773, + "step": 7170 + }, + { + "epoch": 0.41, + "grad_norm": 0.48049808170008806, + "learning_rate": 1.3275049048268869e-05, + "loss": 0.4095, + "step": 7171 + }, + { + "epoch": 0.41, + "grad_norm": 0.2558281539455274, + "learning_rate": 1.327329070927041e-05, + "loss": 0.1998, + "step": 7172 + }, + { + "epoch": 0.41, + "grad_norm": 0.38630900592037487, + "learning_rate": 1.3271532256918036e-05, + "loss": 0.295, + "step": 7173 + }, + { + "epoch": 0.41, + "grad_norm": 0.39008618040812376, + "learning_rate": 1.326977369127264e-05, + "loss": 0.2581, + "step": 7174 + }, + { + "epoch": 0.41, + "grad_norm": 0.6205933584816866, + "learning_rate": 1.326801501239512e-05, + "loss": 0.4402, + "step": 7175 + }, + { + "epoch": 0.41, + "grad_norm": 0.31882272613528434, + "learning_rate": 1.3266256220346383e-05, + "loss": 0.1977, + "step": 7176 + }, + { + "epoch": 0.41, + "grad_norm": 0.4407270029368251, + "learning_rate": 1.3264497315187334e-05, + "loss": 0.3436, + "step": 7177 + }, + { + "epoch": 0.41, + "grad_norm": 0.38673637353069634, + "learning_rate": 1.326273829697888e-05, + "loss": 0.244, + "step": 7178 + }, + { + "epoch": 0.41, + "grad_norm": 0.27113858546133, + "learning_rate": 1.3260979165781942e-05, + "loss": 0.1703, + "step": 7179 + }, + { + "epoch": 0.41, + "grad_norm": 0.4419932070737161, + "learning_rate": 1.3259219921657436e-05, + "loss": 0.3672, + "step": 7180 + }, + { + "epoch": 0.41, + "grad_norm": 0.45245896039209665, + "learning_rate": 1.3257460564666283e-05, + "loss": 0.3697, + "step": 7181 + }, + { + "epoch": 0.41, + "grad_norm": 0.32808870623561165, + "learning_rate": 1.3255701094869408e-05, + "loss": 0.1927, + "step": 7182 + }, + { + "epoch": 0.41, + "grad_norm": 1.1927858978015542, + "learning_rate": 1.325394151232775e-05, + "loss": 0.7927, + "step": 7183 + }, + { + "epoch": 0.41, + "grad_norm": 0.4014879560091093, + "learning_rate": 1.3252181817102235e-05, + "loss": 0.3279, + "step": 7184 + }, + { + "epoch": 0.41, + "grad_norm": 0.23244466150471954, + "learning_rate": 1.3250422009253802e-05, + "loss": 0.1628, + "step": 7185 + }, + { + "epoch": 0.41, + "grad_norm": 0.4222245281834194, + "learning_rate": 1.3248662088843395e-05, + "loss": 0.2875, + "step": 7186 + }, + { + "epoch": 0.41, + "grad_norm": 0.6467322169281083, + "learning_rate": 1.3246902055931961e-05, + "loss": 0.4428, + "step": 7187 + }, + { + "epoch": 0.41, + "grad_norm": 0.38186519736687974, + "learning_rate": 1.3245141910580446e-05, + "loss": 0.3217, + "step": 7188 + }, + { + "epoch": 0.41, + "grad_norm": 0.3362029065075712, + "learning_rate": 1.324338165284981e-05, + "loss": 0.2656, + "step": 7189 + }, + { + "epoch": 0.41, + "grad_norm": 0.40943953574665665, + "learning_rate": 1.3241621282801002e-05, + "loss": 0.267, + "step": 7190 + }, + { + "epoch": 0.41, + "grad_norm": 0.4200553653950379, + "learning_rate": 1.3239860800494993e-05, + "loss": 0.3042, + "step": 7191 + }, + { + "epoch": 0.41, + "grad_norm": 0.4031697492165725, + "learning_rate": 1.3238100205992739e-05, + "loss": 0.286, + "step": 7192 + }, + { + "epoch": 0.41, + "grad_norm": 0.4600727603852913, + "learning_rate": 1.3236339499355217e-05, + "loss": 0.3152, + "step": 7193 + }, + { + "epoch": 0.41, + "grad_norm": 0.3986539572582897, + "learning_rate": 1.3234578680643394e-05, + "loss": 0.3215, + "step": 7194 + }, + { + "epoch": 0.41, + "grad_norm": 0.4169919148480683, + "learning_rate": 1.3232817749918256e-05, + "loss": 0.3092, + "step": 7195 + }, + { + "epoch": 0.41, + "grad_norm": 0.3654175654707768, + "learning_rate": 1.3231056707240775e-05, + "loss": 0.3141, + "step": 7196 + }, + { + "epoch": 0.41, + "grad_norm": 0.36406119272639237, + "learning_rate": 1.322929555267194e-05, + "loss": 0.3191, + "step": 7197 + }, + { + "epoch": 0.41, + "grad_norm": 0.31941887730802226, + "learning_rate": 1.3227534286272741e-05, + "loss": 0.2926, + "step": 7198 + }, + { + "epoch": 0.41, + "grad_norm": 0.2620408151493418, + "learning_rate": 1.3225772908104165e-05, + "loss": 0.0687, + "step": 7199 + }, + { + "epoch": 0.41, + "grad_norm": 0.30135243734827055, + "learning_rate": 1.3224011418227215e-05, + "loss": 0.268, + "step": 7200 + }, + { + "epoch": 0.41, + "grad_norm": 0.5366585174326642, + "learning_rate": 1.3222249816702885e-05, + "loss": 0.3695, + "step": 7201 + }, + { + "epoch": 0.41, + "grad_norm": 0.590033821849479, + "learning_rate": 1.3220488103592184e-05, + "loss": 0.381, + "step": 7202 + }, + { + "epoch": 0.41, + "grad_norm": 0.3385399492527422, + "learning_rate": 1.3218726278956117e-05, + "loss": 0.2911, + "step": 7203 + }, + { + "epoch": 0.41, + "grad_norm": 0.39860811495487514, + "learning_rate": 1.32169643428557e-05, + "loss": 0.393, + "step": 7204 + }, + { + "epoch": 0.41, + "grad_norm": 0.25190821405668745, + "learning_rate": 1.3215202295351946e-05, + "loss": 0.1631, + "step": 7205 + }, + { + "epoch": 0.41, + "grad_norm": 0.5825783438242356, + "learning_rate": 1.3213440136505872e-05, + "loss": 0.3373, + "step": 7206 + }, + { + "epoch": 0.41, + "grad_norm": 0.46201854347371046, + "learning_rate": 1.3211677866378505e-05, + "loss": 0.3525, + "step": 7207 + }, + { + "epoch": 0.41, + "grad_norm": 0.33494447559061813, + "learning_rate": 1.3209915485030872e-05, + "loss": 0.2768, + "step": 7208 + }, + { + "epoch": 0.41, + "grad_norm": 0.5209714849109807, + "learning_rate": 1.3208152992524004e-05, + "loss": 0.334, + "step": 7209 + }, + { + "epoch": 0.41, + "grad_norm": 0.37421682429920466, + "learning_rate": 1.3206390388918937e-05, + "loss": 0.3097, + "step": 7210 + }, + { + "epoch": 0.41, + "grad_norm": 0.30114301634692187, + "learning_rate": 1.3204627674276706e-05, + "loss": 0.2041, + "step": 7211 + }, + { + "epoch": 0.41, + "grad_norm": 0.30929089763537454, + "learning_rate": 1.320286484865836e-05, + "loss": 0.2373, + "step": 7212 + }, + { + "epoch": 0.41, + "grad_norm": 0.5349211331103826, + "learning_rate": 1.3201101912124938e-05, + "loss": 0.4244, + "step": 7213 + }, + { + "epoch": 0.41, + "grad_norm": 0.7905658849528135, + "learning_rate": 1.31993388647375e-05, + "loss": 0.4311, + "step": 7214 + }, + { + "epoch": 0.41, + "grad_norm": 0.36725135920394, + "learning_rate": 1.3197575706557089e-05, + "loss": 0.2345, + "step": 7215 + }, + { + "epoch": 0.41, + "grad_norm": 0.32845897394315215, + "learning_rate": 1.3195812437644771e-05, + "loss": 0.3105, + "step": 7216 + }, + { + "epoch": 0.41, + "grad_norm": 0.29800837631567817, + "learning_rate": 1.3194049058061606e-05, + "loss": 0.1937, + "step": 7217 + }, + { + "epoch": 0.41, + "grad_norm": 0.3439065437611227, + "learning_rate": 1.3192285567868662e-05, + "loss": 0.2248, + "step": 7218 + }, + { + "epoch": 0.41, + "grad_norm": 0.5481495321388145, + "learning_rate": 1.3190521967127e-05, + "loss": 0.425, + "step": 7219 + }, + { + "epoch": 0.41, + "grad_norm": 0.5796551641203623, + "learning_rate": 1.3188758255897705e-05, + "loss": 0.3267, + "step": 7220 + }, + { + "epoch": 0.41, + "grad_norm": 0.5415045151729231, + "learning_rate": 1.3186994434241845e-05, + "loss": 0.2433, + "step": 7221 + }, + { + "epoch": 0.41, + "grad_norm": 0.5321969246900784, + "learning_rate": 1.3185230502220508e-05, + "loss": 0.3406, + "step": 7222 + }, + { + "epoch": 0.41, + "grad_norm": 0.2422636781051529, + "learning_rate": 1.3183466459894774e-05, + "loss": 0.2162, + "step": 7223 + }, + { + "epoch": 0.42, + "grad_norm": 0.4012142713831807, + "learning_rate": 1.3181702307325732e-05, + "loss": 0.3343, + "step": 7224 + }, + { + "epoch": 0.42, + "grad_norm": 0.4794094927724806, + "learning_rate": 1.3179938044574478e-05, + "loss": 0.2723, + "step": 7225 + }, + { + "epoch": 0.42, + "grad_norm": 0.9303568882746546, + "learning_rate": 1.3178173671702106e-05, + "loss": 0.4344, + "step": 7226 + }, + { + "epoch": 0.42, + "grad_norm": 0.6859661533697369, + "learning_rate": 1.3176409188769715e-05, + "loss": 0.4222, + "step": 7227 + }, + { + "epoch": 0.42, + "grad_norm": 0.27217942492278485, + "learning_rate": 1.3174644595838411e-05, + "loss": 0.2338, + "step": 7228 + }, + { + "epoch": 0.42, + "grad_norm": 0.3033095269716523, + "learning_rate": 1.3172879892969302e-05, + "loss": 0.1795, + "step": 7229 + }, + { + "epoch": 0.42, + "grad_norm": 0.7835735876374561, + "learning_rate": 1.3171115080223498e-05, + "loss": 0.4627, + "step": 7230 + }, + { + "epoch": 0.42, + "grad_norm": 0.6595555477893355, + "learning_rate": 1.3169350157662115e-05, + "loss": 0.2827, + "step": 7231 + }, + { + "epoch": 0.42, + "grad_norm": 0.45468179409236015, + "learning_rate": 1.3167585125346271e-05, + "loss": 0.3494, + "step": 7232 + }, + { + "epoch": 0.42, + "grad_norm": 0.6287105703435774, + "learning_rate": 1.3165819983337093e-05, + "loss": 0.3983, + "step": 7233 + }, + { + "epoch": 0.42, + "grad_norm": 0.4100430193967899, + "learning_rate": 1.3164054731695706e-05, + "loss": 0.2633, + "step": 7234 + }, + { + "epoch": 0.42, + "grad_norm": 0.3711852345127576, + "learning_rate": 1.3162289370483239e-05, + "loss": 0.2647, + "step": 7235 + }, + { + "epoch": 0.42, + "grad_norm": 0.37139285226115515, + "learning_rate": 1.3160523899760824e-05, + "loss": 0.3054, + "step": 7236 + }, + { + "epoch": 0.42, + "grad_norm": 0.4224536498097396, + "learning_rate": 1.3158758319589604e-05, + "loss": 0.3383, + "step": 7237 + }, + { + "epoch": 0.42, + "grad_norm": 0.5765797692400455, + "learning_rate": 1.3156992630030719e-05, + "loss": 0.2837, + "step": 7238 + }, + { + "epoch": 0.42, + "grad_norm": 0.38557712218290724, + "learning_rate": 1.3155226831145316e-05, + "loss": 0.2873, + "step": 7239 + }, + { + "epoch": 0.42, + "grad_norm": 0.31621584525691937, + "learning_rate": 1.3153460922994543e-05, + "loss": 0.2885, + "step": 7240 + }, + { + "epoch": 0.42, + "grad_norm": 0.29877105273955934, + "learning_rate": 1.3151694905639553e-05, + "loss": 0.164, + "step": 7241 + }, + { + "epoch": 0.42, + "grad_norm": 0.5334188626320423, + "learning_rate": 1.3149928779141506e-05, + "loss": 0.4122, + "step": 7242 + }, + { + "epoch": 0.42, + "grad_norm": 0.3828319018088226, + "learning_rate": 1.3148162543561557e-05, + "loss": 0.3108, + "step": 7243 + }, + { + "epoch": 0.42, + "grad_norm": 0.27405350590100247, + "learning_rate": 1.3146396198960881e-05, + "loss": 0.2322, + "step": 7244 + }, + { + "epoch": 0.42, + "grad_norm": 0.7653235271564569, + "learning_rate": 1.3144629745400632e-05, + "loss": 0.3892, + "step": 7245 + }, + { + "epoch": 0.42, + "grad_norm": 0.40187932943275734, + "learning_rate": 1.3142863182941996e-05, + "loss": 0.2975, + "step": 7246 + }, + { + "epoch": 0.42, + "grad_norm": 0.3764866541385056, + "learning_rate": 1.3141096511646141e-05, + "loss": 0.256, + "step": 7247 + }, + { + "epoch": 0.42, + "grad_norm": 0.5491287247419514, + "learning_rate": 1.3139329731574248e-05, + "loss": 0.402, + "step": 7248 + }, + { + "epoch": 0.42, + "grad_norm": 0.33817869550264207, + "learning_rate": 1.3137562842787502e-05, + "loss": 0.314, + "step": 7249 + }, + { + "epoch": 0.42, + "grad_norm": 1.1455881396931549, + "learning_rate": 1.3135795845347091e-05, + "loss": 0.786, + "step": 7250 + }, + { + "epoch": 0.42, + "grad_norm": 0.260305077900469, + "learning_rate": 1.3134028739314204e-05, + "loss": 0.1759, + "step": 7251 + }, + { + "epoch": 0.42, + "grad_norm": 0.3841956692601113, + "learning_rate": 1.3132261524750038e-05, + "loss": 0.2806, + "step": 7252 + }, + { + "epoch": 0.42, + "grad_norm": 1.0728971327464343, + "learning_rate": 1.3130494201715786e-05, + "loss": 0.5737, + "step": 7253 + }, + { + "epoch": 0.42, + "grad_norm": 0.4996657504738954, + "learning_rate": 1.312872677027266e-05, + "loss": 0.2811, + "step": 7254 + }, + { + "epoch": 0.42, + "grad_norm": 0.4259459115097978, + "learning_rate": 1.3126959230481855e-05, + "loss": 0.3429, + "step": 7255 + }, + { + "epoch": 0.42, + "grad_norm": 0.3836148834285065, + "learning_rate": 1.312519158240459e-05, + "loss": 0.3439, + "step": 7256 + }, + { + "epoch": 0.42, + "grad_norm": 0.195603144277449, + "learning_rate": 1.3123423826102074e-05, + "loss": 0.1038, + "step": 7257 + }, + { + "epoch": 0.42, + "grad_norm": 0.3946116029677441, + "learning_rate": 1.3121655961635523e-05, + "loss": 0.3305, + "step": 7258 + }, + { + "epoch": 0.42, + "grad_norm": 0.429020225728886, + "learning_rate": 1.311988798906616e-05, + "loss": 0.3417, + "step": 7259 + }, + { + "epoch": 0.42, + "grad_norm": 0.43981050593951315, + "learning_rate": 1.3118119908455214e-05, + "loss": 0.2962, + "step": 7260 + }, + { + "epoch": 0.42, + "grad_norm": 0.36302632134687873, + "learning_rate": 1.3116351719863906e-05, + "loss": 0.297, + "step": 7261 + }, + { + "epoch": 0.42, + "grad_norm": 0.44066367931176303, + "learning_rate": 1.3114583423353476e-05, + "loss": 0.3472, + "step": 7262 + }, + { + "epoch": 0.42, + "grad_norm": 0.301064575674817, + "learning_rate": 1.3112815018985154e-05, + "loss": 0.2503, + "step": 7263 + }, + { + "epoch": 0.42, + "grad_norm": 0.31585777990855474, + "learning_rate": 1.311104650682018e-05, + "loss": 0.2119, + "step": 7264 + }, + { + "epoch": 0.42, + "grad_norm": 0.7170533675281239, + "learning_rate": 1.3109277886919802e-05, + "loss": 0.4184, + "step": 7265 + }, + { + "epoch": 0.42, + "grad_norm": 0.7914186842307775, + "learning_rate": 1.3107509159345262e-05, + "loss": 0.5291, + "step": 7266 + }, + { + "epoch": 0.42, + "grad_norm": 0.267323184964963, + "learning_rate": 1.3105740324157817e-05, + "loss": 0.2167, + "step": 7267 + }, + { + "epoch": 0.42, + "grad_norm": 0.4627191275190162, + "learning_rate": 1.3103971381418713e-05, + "loss": 0.3843, + "step": 7268 + }, + { + "epoch": 0.42, + "grad_norm": 0.2892535892426708, + "learning_rate": 1.310220233118922e-05, + "loss": 0.1798, + "step": 7269 + }, + { + "epoch": 0.42, + "grad_norm": 0.42638022005244247, + "learning_rate": 1.3100433173530589e-05, + "loss": 0.222, + "step": 7270 + }, + { + "epoch": 0.42, + "grad_norm": 0.6288156366173488, + "learning_rate": 1.3098663908504091e-05, + "loss": 0.3793, + "step": 7271 + }, + { + "epoch": 0.42, + "grad_norm": 0.5008266918389817, + "learning_rate": 1.3096894536170994e-05, + "loss": 0.3658, + "step": 7272 + }, + { + "epoch": 0.42, + "grad_norm": 0.34497131150322446, + "learning_rate": 1.3095125056592575e-05, + "loss": 0.2732, + "step": 7273 + }, + { + "epoch": 0.42, + "grad_norm": 0.9667430455753272, + "learning_rate": 1.3093355469830107e-05, + "loss": 0.5544, + "step": 7274 + }, + { + "epoch": 0.42, + "grad_norm": 0.2635365400899789, + "learning_rate": 1.3091585775944873e-05, + "loss": 0.2102, + "step": 7275 + }, + { + "epoch": 0.42, + "grad_norm": 0.3788616946807546, + "learning_rate": 1.3089815974998154e-05, + "loss": 0.275, + "step": 7276 + }, + { + "epoch": 0.42, + "grad_norm": 0.897185868184525, + "learning_rate": 1.3088046067051243e-05, + "loss": 0.3707, + "step": 7277 + }, + { + "epoch": 0.42, + "grad_norm": 0.9592855330664415, + "learning_rate": 1.308627605216543e-05, + "loss": 0.624, + "step": 7278 + }, + { + "epoch": 0.42, + "grad_norm": 0.347671499106493, + "learning_rate": 1.308450593040201e-05, + "loss": 0.2543, + "step": 7279 + }, + { + "epoch": 0.42, + "grad_norm": 0.4070033059546702, + "learning_rate": 1.3082735701822281e-05, + "loss": 0.2806, + "step": 7280 + }, + { + "epoch": 0.42, + "grad_norm": 0.30304605216105446, + "learning_rate": 1.3080965366487548e-05, + "loss": 0.1902, + "step": 7281 + }, + { + "epoch": 0.42, + "grad_norm": 0.45872537263094, + "learning_rate": 1.3079194924459118e-05, + "loss": 0.2992, + "step": 7282 + }, + { + "epoch": 0.42, + "grad_norm": 0.5951235637057674, + "learning_rate": 1.3077424375798295e-05, + "loss": 0.2988, + "step": 7283 + }, + { + "epoch": 0.42, + "grad_norm": 1.182568248414101, + "learning_rate": 1.3075653720566404e-05, + "loss": 0.7021, + "step": 7284 + }, + { + "epoch": 0.42, + "grad_norm": 0.34373368560518014, + "learning_rate": 1.3073882958824755e-05, + "loss": 0.2923, + "step": 7285 + }, + { + "epoch": 0.42, + "grad_norm": 1.157375136004901, + "learning_rate": 1.307211209063467e-05, + "loss": 0.7641, + "step": 7286 + }, + { + "epoch": 0.42, + "grad_norm": 0.2623129816600877, + "learning_rate": 1.3070341116057476e-05, + "loss": 0.2098, + "step": 7287 + }, + { + "epoch": 0.42, + "grad_norm": 0.3824276930333361, + "learning_rate": 1.3068570035154503e-05, + "loss": 0.2981, + "step": 7288 + }, + { + "epoch": 0.42, + "grad_norm": 0.4109022486432033, + "learning_rate": 1.306679884798708e-05, + "loss": 0.2912, + "step": 7289 + }, + { + "epoch": 0.42, + "grad_norm": 0.5582932916423002, + "learning_rate": 1.3065027554616547e-05, + "loss": 0.302, + "step": 7290 + }, + { + "epoch": 0.42, + "grad_norm": 0.34401297473123493, + "learning_rate": 1.3063256155104239e-05, + "loss": 0.2613, + "step": 7291 + }, + { + "epoch": 0.42, + "grad_norm": 0.5353972926544106, + "learning_rate": 1.3061484649511503e-05, + "loss": 0.4175, + "step": 7292 + }, + { + "epoch": 0.42, + "grad_norm": 0.5389049209563237, + "learning_rate": 1.3059713037899683e-05, + "loss": 0.3414, + "step": 7293 + }, + { + "epoch": 0.42, + "grad_norm": 0.40682939769048926, + "learning_rate": 1.3057941320330134e-05, + "loss": 0.2576, + "step": 7294 + }, + { + "epoch": 0.42, + "grad_norm": 0.270271026585561, + "learning_rate": 1.3056169496864208e-05, + "loss": 0.2742, + "step": 7295 + }, + { + "epoch": 0.42, + "grad_norm": 0.28083438824338586, + "learning_rate": 1.3054397567563266e-05, + "loss": 0.134, + "step": 7296 + }, + { + "epoch": 0.42, + "grad_norm": 0.3797228091663637, + "learning_rate": 1.3052625532488663e-05, + "loss": 0.2747, + "step": 7297 + }, + { + "epoch": 0.42, + "grad_norm": 0.8666369485609857, + "learning_rate": 1.3050853391701774e-05, + "loss": 0.6257, + "step": 7298 + }, + { + "epoch": 0.42, + "grad_norm": 0.3821053180321841, + "learning_rate": 1.304908114526396e-05, + "loss": 0.3131, + "step": 7299 + }, + { + "epoch": 0.42, + "grad_norm": 0.327705833281437, + "learning_rate": 1.3047308793236599e-05, + "loss": 0.2423, + "step": 7300 + }, + { + "epoch": 0.42, + "grad_norm": 0.32553038184470945, + "learning_rate": 1.3045536335681064e-05, + "loss": 0.2019, + "step": 7301 + }, + { + "epoch": 0.42, + "grad_norm": 1.0963164215457855, + "learning_rate": 1.3043763772658739e-05, + "loss": 0.7163, + "step": 7302 + }, + { + "epoch": 0.42, + "grad_norm": 0.2779405820004573, + "learning_rate": 1.3041991104231004e-05, + "loss": 0.2391, + "step": 7303 + }, + { + "epoch": 0.42, + "grad_norm": 1.1690442501518032, + "learning_rate": 1.3040218330459249e-05, + "loss": 0.795, + "step": 7304 + }, + { + "epoch": 0.42, + "grad_norm": 0.6931092185091348, + "learning_rate": 1.3038445451404862e-05, + "loss": 0.4273, + "step": 7305 + }, + { + "epoch": 0.42, + "grad_norm": 0.33132146745020336, + "learning_rate": 1.3036672467129241e-05, + "loss": 0.2127, + "step": 7306 + }, + { + "epoch": 0.42, + "grad_norm": 0.3629514950779212, + "learning_rate": 1.3034899377693782e-05, + "loss": 0.313, + "step": 7307 + }, + { + "epoch": 0.42, + "grad_norm": 0.297556760270983, + "learning_rate": 1.3033126183159887e-05, + "loss": 0.2063, + "step": 7308 + }, + { + "epoch": 0.42, + "grad_norm": 0.2941708532876371, + "learning_rate": 1.3031352883588965e-05, + "loss": 0.2085, + "step": 7309 + }, + { + "epoch": 0.42, + "grad_norm": 0.7852519937723851, + "learning_rate": 1.3029579479042423e-05, + "loss": 0.5261, + "step": 7310 + }, + { + "epoch": 0.42, + "grad_norm": 0.4725984968530518, + "learning_rate": 1.3027805969581674e-05, + "loss": 0.327, + "step": 7311 + }, + { + "epoch": 0.42, + "grad_norm": 0.758050405985813, + "learning_rate": 1.3026032355268132e-05, + "loss": 0.3737, + "step": 7312 + }, + { + "epoch": 0.42, + "grad_norm": 0.27727090117219044, + "learning_rate": 1.3024258636163221e-05, + "loss": 0.2148, + "step": 7313 + }, + { + "epoch": 0.42, + "grad_norm": 0.27413178777534775, + "learning_rate": 1.3022484812328365e-05, + "loss": 0.2706, + "step": 7314 + }, + { + "epoch": 0.42, + "grad_norm": 0.5412617681591296, + "learning_rate": 1.3020710883824987e-05, + "loss": 0.3523, + "step": 7315 + }, + { + "epoch": 0.42, + "grad_norm": 0.45275149176644774, + "learning_rate": 1.3018936850714524e-05, + "loss": 0.2881, + "step": 7316 + }, + { + "epoch": 0.42, + "grad_norm": 0.7923988739792034, + "learning_rate": 1.3017162713058404e-05, + "loss": 0.4677, + "step": 7317 + }, + { + "epoch": 0.42, + "grad_norm": 0.39895495917725043, + "learning_rate": 1.3015388470918072e-05, + "loss": 0.2999, + "step": 7318 + }, + { + "epoch": 0.42, + "grad_norm": 0.23533187055257196, + "learning_rate": 1.3013614124354969e-05, + "loss": 0.201, + "step": 7319 + }, + { + "epoch": 0.42, + "grad_norm": 0.41061255560760335, + "learning_rate": 1.3011839673430536e-05, + "loss": 0.2942, + "step": 7320 + }, + { + "epoch": 0.42, + "grad_norm": 0.3836948488403221, + "learning_rate": 1.3010065118206223e-05, + "loss": 0.3036, + "step": 7321 + }, + { + "epoch": 0.42, + "grad_norm": 0.8039043999491806, + "learning_rate": 1.3008290458743486e-05, + "loss": 0.3215, + "step": 7322 + }, + { + "epoch": 0.42, + "grad_norm": 0.3526267061237842, + "learning_rate": 1.3006515695103779e-05, + "loss": 0.3402, + "step": 7323 + }, + { + "epoch": 0.42, + "grad_norm": 0.3874565419841915, + "learning_rate": 1.3004740827348563e-05, + "loss": 0.2971, + "step": 7324 + }, + { + "epoch": 0.42, + "grad_norm": 0.4916981175451748, + "learning_rate": 1.3002965855539303e-05, + "loss": 0.3836, + "step": 7325 + }, + { + "epoch": 0.42, + "grad_norm": 0.1963129197249394, + "learning_rate": 1.300119077973746e-05, + "loss": 0.1841, + "step": 7326 + }, + { + "epoch": 0.42, + "grad_norm": 0.3374193053149697, + "learning_rate": 1.2999415600004515e-05, + "loss": 0.2758, + "step": 7327 + }, + { + "epoch": 0.42, + "grad_norm": 0.9488092760590442, + "learning_rate": 1.2997640316401934e-05, + "loss": 0.4981, + "step": 7328 + }, + { + "epoch": 0.42, + "grad_norm": 0.6829150533818018, + "learning_rate": 1.2995864928991198e-05, + "loss": 0.3648, + "step": 7329 + }, + { + "epoch": 0.42, + "grad_norm": 0.5376235231735698, + "learning_rate": 1.2994089437833788e-05, + "loss": 0.3335, + "step": 7330 + }, + { + "epoch": 0.42, + "grad_norm": 0.30612015196427483, + "learning_rate": 1.2992313842991189e-05, + "loss": 0.2881, + "step": 7331 + }, + { + "epoch": 0.42, + "grad_norm": 0.35369907883631363, + "learning_rate": 1.2990538144524894e-05, + "loss": 0.2176, + "step": 7332 + }, + { + "epoch": 0.42, + "grad_norm": 0.6509986938076566, + "learning_rate": 1.2988762342496386e-05, + "loss": 0.366, + "step": 7333 + }, + { + "epoch": 0.42, + "grad_norm": 0.4988119608715147, + "learning_rate": 1.298698643696717e-05, + "loss": 0.3416, + "step": 7334 + }, + { + "epoch": 0.42, + "grad_norm": 0.2772638591631809, + "learning_rate": 1.2985210427998743e-05, + "loss": 0.2226, + "step": 7335 + }, + { + "epoch": 0.42, + "grad_norm": 0.3951269297273973, + "learning_rate": 1.2983434315652606e-05, + "loss": 0.2974, + "step": 7336 + }, + { + "epoch": 0.42, + "grad_norm": 0.5102487142923199, + "learning_rate": 1.2981658099990266e-05, + "loss": 0.3945, + "step": 7337 + }, + { + "epoch": 0.42, + "grad_norm": 0.4414157270716968, + "learning_rate": 1.2979881781073235e-05, + "loss": 0.3039, + "step": 7338 + }, + { + "epoch": 0.42, + "grad_norm": 0.2909371552105041, + "learning_rate": 1.2978105358963026e-05, + "loss": 0.2469, + "step": 7339 + }, + { + "epoch": 0.42, + "grad_norm": 0.5649634546078088, + "learning_rate": 1.2976328833721157e-05, + "loss": 0.3579, + "step": 7340 + }, + { + "epoch": 0.42, + "grad_norm": 0.3848501314342395, + "learning_rate": 1.2974552205409147e-05, + "loss": 0.2769, + "step": 7341 + }, + { + "epoch": 0.42, + "grad_norm": 0.3049961532656385, + "learning_rate": 1.2972775474088524e-05, + "loss": 0.2088, + "step": 7342 + }, + { + "epoch": 0.42, + "grad_norm": 0.34617722088590974, + "learning_rate": 1.297099863982081e-05, + "loss": 0.3243, + "step": 7343 + }, + { + "epoch": 0.42, + "grad_norm": 0.7259079907052219, + "learning_rate": 1.2969221702667547e-05, + "loss": 0.5144, + "step": 7344 + }, + { + "epoch": 0.42, + "grad_norm": 0.38372525596694496, + "learning_rate": 1.2967444662690261e-05, + "loss": 0.2622, + "step": 7345 + }, + { + "epoch": 0.42, + "grad_norm": 0.5209288626419487, + "learning_rate": 1.2965667519950494e-05, + "loss": 0.3831, + "step": 7346 + }, + { + "epoch": 0.42, + "grad_norm": 0.23884674114338472, + "learning_rate": 1.2963890274509789e-05, + "loss": 0.2358, + "step": 7347 + }, + { + "epoch": 0.42, + "grad_norm": 0.38743989121030176, + "learning_rate": 1.2962112926429691e-05, + "loss": 0.1331, + "step": 7348 + }, + { + "epoch": 0.42, + "grad_norm": 0.43142029339243576, + "learning_rate": 1.2960335475771748e-05, + "loss": 0.3201, + "step": 7349 + }, + { + "epoch": 0.42, + "grad_norm": 0.3556038132379808, + "learning_rate": 1.2958557922597516e-05, + "loss": 0.34, + "step": 7350 + }, + { + "epoch": 0.42, + "grad_norm": 0.5629136553826217, + "learning_rate": 1.2956780266968552e-05, + "loss": 0.409, + "step": 7351 + }, + { + "epoch": 0.42, + "grad_norm": 0.3375499747099566, + "learning_rate": 1.2955002508946413e-05, + "loss": 0.2461, + "step": 7352 + }, + { + "epoch": 0.42, + "grad_norm": 0.26063028491401785, + "learning_rate": 1.2953224648592664e-05, + "loss": 0.1594, + "step": 7353 + }, + { + "epoch": 0.42, + "grad_norm": 0.3286104317910206, + "learning_rate": 1.2951446685968874e-05, + "loss": 0.2815, + "step": 7354 + }, + { + "epoch": 0.42, + "grad_norm": 0.33363688183935714, + "learning_rate": 1.294966862113661e-05, + "loss": 0.2217, + "step": 7355 + }, + { + "epoch": 0.42, + "grad_norm": 0.6185356603885251, + "learning_rate": 1.2947890454157448e-05, + "loss": 0.4454, + "step": 7356 + }, + { + "epoch": 0.42, + "grad_norm": 0.5052412376086751, + "learning_rate": 1.294611218509297e-05, + "loss": 0.3758, + "step": 7357 + }, + { + "epoch": 0.42, + "grad_norm": 0.31425887157261073, + "learning_rate": 1.2944333814004748e-05, + "loss": 0.2567, + "step": 7358 + }, + { + "epoch": 0.42, + "grad_norm": 0.3024510934501898, + "learning_rate": 1.2942555340954377e-05, + "loss": 0.2408, + "step": 7359 + }, + { + "epoch": 0.42, + "grad_norm": 0.2721480122115268, + "learning_rate": 1.294077676600344e-05, + "loss": 0.1685, + "step": 7360 + }, + { + "epoch": 0.42, + "grad_norm": 0.4334281254358977, + "learning_rate": 1.293899808921353e-05, + "loss": 0.3266, + "step": 7361 + }, + { + "epoch": 0.42, + "grad_norm": 0.33199319188044935, + "learning_rate": 1.2937219310646242e-05, + "loss": 0.2896, + "step": 7362 + }, + { + "epoch": 0.42, + "grad_norm": 0.5113244424261909, + "learning_rate": 1.2935440430363177e-05, + "loss": 0.3779, + "step": 7363 + }, + { + "epoch": 0.42, + "grad_norm": 0.4415669571896982, + "learning_rate": 1.2933661448425933e-05, + "loss": 0.3187, + "step": 7364 + }, + { + "epoch": 0.42, + "grad_norm": 0.21104509788054693, + "learning_rate": 1.2931882364896125e-05, + "loss": 0.1266, + "step": 7365 + }, + { + "epoch": 0.42, + "grad_norm": 0.39429657285673275, + "learning_rate": 1.2930103179835352e-05, + "loss": 0.3014, + "step": 7366 + }, + { + "epoch": 0.42, + "grad_norm": 0.34096887677103693, + "learning_rate": 1.2928323893305233e-05, + "loss": 0.2779, + "step": 7367 + }, + { + "epoch": 0.42, + "grad_norm": 0.5398849722107039, + "learning_rate": 1.2926544505367384e-05, + "loss": 0.3547, + "step": 7368 + }, + { + "epoch": 0.42, + "grad_norm": 0.7583480793502161, + "learning_rate": 1.2924765016083427e-05, + "loss": 0.5297, + "step": 7369 + }, + { + "epoch": 0.42, + "grad_norm": 0.30692372439687576, + "learning_rate": 1.2922985425514977e-05, + "loss": 0.279, + "step": 7370 + }, + { + "epoch": 0.42, + "grad_norm": 0.270890793342731, + "learning_rate": 1.2921205733723672e-05, + "loss": 0.1777, + "step": 7371 + }, + { + "epoch": 0.42, + "grad_norm": 0.33009884553113095, + "learning_rate": 1.2919425940771138e-05, + "loss": 0.2584, + "step": 7372 + }, + { + "epoch": 0.42, + "grad_norm": 0.3529582845492279, + "learning_rate": 1.2917646046719007e-05, + "loss": 0.304, + "step": 7373 + }, + { + "epoch": 0.42, + "grad_norm": 0.4422830853360155, + "learning_rate": 1.2915866051628923e-05, + "loss": 0.3514, + "step": 7374 + }, + { + "epoch": 0.42, + "grad_norm": 0.3936738090536721, + "learning_rate": 1.291408595556252e-05, + "loss": 0.258, + "step": 7375 + }, + { + "epoch": 0.42, + "grad_norm": 0.3645622895731119, + "learning_rate": 1.2912305758581444e-05, + "loss": 0.3064, + "step": 7376 + }, + { + "epoch": 0.42, + "grad_norm": 0.8760977353616266, + "learning_rate": 1.2910525460747346e-05, + "loss": 0.4878, + "step": 7377 + }, + { + "epoch": 0.42, + "grad_norm": 0.1923060008162484, + "learning_rate": 1.290874506212188e-05, + "loss": 0.1669, + "step": 7378 + }, + { + "epoch": 0.42, + "grad_norm": 0.402353938670631, + "learning_rate": 1.2906964562766691e-05, + "loss": 0.3496, + "step": 7379 + }, + { + "epoch": 0.42, + "grad_norm": 0.7286714606166612, + "learning_rate": 1.290518396274345e-05, + "loss": 0.437, + "step": 7380 + }, + { + "epoch": 0.42, + "grad_norm": 0.35226470412228356, + "learning_rate": 1.290340326211381e-05, + "loss": 0.2338, + "step": 7381 + }, + { + "epoch": 0.42, + "grad_norm": 0.3621217534989928, + "learning_rate": 1.290162246093944e-05, + "loss": 0.2912, + "step": 7382 + }, + { + "epoch": 0.42, + "grad_norm": 0.4861495391034015, + "learning_rate": 1.289984155928201e-05, + "loss": 0.3546, + "step": 7383 + }, + { + "epoch": 0.42, + "grad_norm": 0.64343988125234, + "learning_rate": 1.289806055720319e-05, + "loss": 0.2483, + "step": 7384 + }, + { + "epoch": 0.42, + "grad_norm": 0.3655898241983234, + "learning_rate": 1.2896279454764659e-05, + "loss": 0.2909, + "step": 7385 + }, + { + "epoch": 0.42, + "grad_norm": 0.29398028827370154, + "learning_rate": 1.28944982520281e-05, + "loss": 0.2718, + "step": 7386 + }, + { + "epoch": 0.42, + "grad_norm": 1.1800139635163578, + "learning_rate": 1.2892716949055184e-05, + "loss": 0.7545, + "step": 7387 + }, + { + "epoch": 0.42, + "grad_norm": 0.3023709880759031, + "learning_rate": 1.2890935545907608e-05, + "loss": 0.2228, + "step": 7388 + }, + { + "epoch": 0.42, + "grad_norm": 0.6965321600545177, + "learning_rate": 1.2889154042647056e-05, + "loss": 0.4642, + "step": 7389 + }, + { + "epoch": 0.42, + "grad_norm": 0.35911214934551505, + "learning_rate": 1.2887372439335224e-05, + "loss": 0.3357, + "step": 7390 + }, + { + "epoch": 0.42, + "grad_norm": 0.29662533744407216, + "learning_rate": 1.2885590736033808e-05, + "loss": 0.2187, + "step": 7391 + }, + { + "epoch": 0.42, + "grad_norm": 0.27718955489176644, + "learning_rate": 1.2883808932804512e-05, + "loss": 0.1913, + "step": 7392 + }, + { + "epoch": 0.42, + "grad_norm": 0.4086851848096156, + "learning_rate": 1.2882027029709034e-05, + "loss": 0.3497, + "step": 7393 + }, + { + "epoch": 0.42, + "grad_norm": 0.3038815146416857, + "learning_rate": 1.2880245026809085e-05, + "loss": 0.2307, + "step": 7394 + }, + { + "epoch": 0.42, + "grad_norm": 1.500569565176652, + "learning_rate": 1.2878462924166374e-05, + "loss": 0.661, + "step": 7395 + }, + { + "epoch": 0.42, + "grad_norm": 0.6963263635658477, + "learning_rate": 1.2876680721842616e-05, + "loss": 0.4307, + "step": 7396 + }, + { + "epoch": 0.42, + "grad_norm": 0.333813168785862, + "learning_rate": 1.2874898419899528e-05, + "loss": 0.2022, + "step": 7397 + }, + { + "epoch": 0.43, + "grad_norm": 0.24787304670295343, + "learning_rate": 1.287311601839883e-05, + "loss": 0.2155, + "step": 7398 + }, + { + "epoch": 0.43, + "grad_norm": 0.7433576356848729, + "learning_rate": 1.2871333517402251e-05, + "loss": 0.481, + "step": 7399 + }, + { + "epoch": 0.43, + "grad_norm": 0.3939320064161549, + "learning_rate": 1.2869550916971512e-05, + "loss": 0.2881, + "step": 7400 + }, + { + "epoch": 0.43, + "grad_norm": 0.8308971616875137, + "learning_rate": 1.2867768217168353e-05, + "loss": 0.3781, + "step": 7401 + }, + { + "epoch": 0.43, + "grad_norm": 0.3497647575246814, + "learning_rate": 1.28659854180545e-05, + "loss": 0.3359, + "step": 7402 + }, + { + "epoch": 0.43, + "grad_norm": 0.36233416373040706, + "learning_rate": 1.2864202519691698e-05, + "loss": 0.2455, + "step": 7403 + }, + { + "epoch": 0.43, + "grad_norm": 0.21250382304746632, + "learning_rate": 1.2862419522141684e-05, + "loss": 0.1056, + "step": 7404 + }, + { + "epoch": 0.43, + "grad_norm": 0.48055525319680503, + "learning_rate": 1.2860636425466207e-05, + "loss": 0.353, + "step": 7405 + }, + { + "epoch": 0.43, + "grad_norm": 0.2929548307001326, + "learning_rate": 1.285885322972701e-05, + "loss": 0.267, + "step": 7406 + }, + { + "epoch": 0.43, + "grad_norm": 0.9742707277009065, + "learning_rate": 1.2857069934985851e-05, + "loss": 0.3784, + "step": 7407 + }, + { + "epoch": 0.43, + "grad_norm": 0.7845705775706029, + "learning_rate": 1.2855286541304481e-05, + "loss": 0.4938, + "step": 7408 + }, + { + "epoch": 0.43, + "grad_norm": 0.3324160422402967, + "learning_rate": 1.2853503048744664e-05, + "loss": 0.2625, + "step": 7409 + }, + { + "epoch": 0.43, + "grad_norm": 0.2891167340341053, + "learning_rate": 1.2851719457368157e-05, + "loss": 0.2068, + "step": 7410 + }, + { + "epoch": 0.43, + "grad_norm": 0.42210199453028185, + "learning_rate": 1.2849935767236729e-05, + "loss": 0.3184, + "step": 7411 + }, + { + "epoch": 0.43, + "grad_norm": 0.36308547586520445, + "learning_rate": 1.284815197841215e-05, + "loss": 0.3046, + "step": 7412 + }, + { + "epoch": 0.43, + "grad_norm": 1.1643204041845043, + "learning_rate": 1.2846368090956185e-05, + "loss": 0.4728, + "step": 7413 + }, + { + "epoch": 0.43, + "grad_norm": 0.31753775412680657, + "learning_rate": 1.284458410493062e-05, + "loss": 0.2662, + "step": 7414 + }, + { + "epoch": 0.43, + "grad_norm": 0.38438890919644864, + "learning_rate": 1.2842800020397226e-05, + "loss": 0.2863, + "step": 7415 + }, + { + "epoch": 0.43, + "grad_norm": 0.23075903001714582, + "learning_rate": 1.2841015837417792e-05, + "loss": 0.1904, + "step": 7416 + }, + { + "epoch": 0.43, + "grad_norm": 0.3437999763100494, + "learning_rate": 1.2839231556054101e-05, + "loss": 0.2819, + "step": 7417 + }, + { + "epoch": 0.43, + "grad_norm": 0.3856629904474758, + "learning_rate": 1.2837447176367944e-05, + "loss": 0.2964, + "step": 7418 + }, + { + "epoch": 0.43, + "grad_norm": 0.6560835963344139, + "learning_rate": 1.2835662698421112e-05, + "loss": 0.3934, + "step": 7419 + }, + { + "epoch": 0.43, + "grad_norm": 0.7348691624408498, + "learning_rate": 1.2833878122275407e-05, + "loss": 0.2447, + "step": 7420 + }, + { + "epoch": 0.43, + "grad_norm": 0.34732153973656654, + "learning_rate": 1.283209344799262e-05, + "loss": 0.2776, + "step": 7421 + }, + { + "epoch": 0.43, + "grad_norm": 0.29962271123904893, + "learning_rate": 1.283030867563456e-05, + "loss": 0.2691, + "step": 7422 + }, + { + "epoch": 0.43, + "grad_norm": 0.7215084346836231, + "learning_rate": 1.282852380526303e-05, + "loss": 0.4093, + "step": 7423 + }, + { + "epoch": 0.43, + "grad_norm": 0.3187780573476562, + "learning_rate": 1.2826738836939844e-05, + "loss": 0.2666, + "step": 7424 + }, + { + "epoch": 0.43, + "grad_norm": 0.3051997295007057, + "learning_rate": 1.2824953770726813e-05, + "loss": 0.2971, + "step": 7425 + }, + { + "epoch": 0.43, + "grad_norm": 0.5136628961032486, + "learning_rate": 1.2823168606685756e-05, + "loss": 0.3615, + "step": 7426 + }, + { + "epoch": 0.43, + "grad_norm": 0.3726773739401942, + "learning_rate": 1.2821383344878491e-05, + "loss": 0.2014, + "step": 7427 + }, + { + "epoch": 0.43, + "grad_norm": 1.1260492763383276, + "learning_rate": 1.2819597985366843e-05, + "loss": 0.5823, + "step": 7428 + }, + { + "epoch": 0.43, + "grad_norm": 0.49804903503850456, + "learning_rate": 1.2817812528212635e-05, + "loss": 0.3953, + "step": 7429 + }, + { + "epoch": 0.43, + "grad_norm": 0.278617679213449, + "learning_rate": 1.2816026973477702e-05, + "loss": 0.2334, + "step": 7430 + }, + { + "epoch": 0.43, + "grad_norm": 0.8077408746603993, + "learning_rate": 1.2814241321223876e-05, + "loss": 0.5714, + "step": 7431 + }, + { + "epoch": 0.43, + "grad_norm": 0.2938940132824784, + "learning_rate": 1.2812455571512996e-05, + "loss": 0.178, + "step": 7432 + }, + { + "epoch": 0.43, + "grad_norm": 0.31693402945068383, + "learning_rate": 1.28106697244069e-05, + "loss": 0.1713, + "step": 7433 + }, + { + "epoch": 0.43, + "grad_norm": 0.39056029624267147, + "learning_rate": 1.2808883779967429e-05, + "loss": 0.2973, + "step": 7434 + }, + { + "epoch": 0.43, + "grad_norm": 0.8784300285705219, + "learning_rate": 1.2807097738256436e-05, + "loss": 0.4679, + "step": 7435 + }, + { + "epoch": 0.43, + "grad_norm": 0.6096405649192743, + "learning_rate": 1.2805311599335768e-05, + "loss": 0.3627, + "step": 7436 + }, + { + "epoch": 0.43, + "grad_norm": 0.280902071047559, + "learning_rate": 1.2803525363267281e-05, + "loss": 0.2223, + "step": 7437 + }, + { + "epoch": 0.43, + "grad_norm": 0.33005696907181864, + "learning_rate": 1.2801739030112833e-05, + "loss": 0.2775, + "step": 7438 + }, + { + "epoch": 0.43, + "grad_norm": 0.5559933672937305, + "learning_rate": 1.279995259993428e-05, + "loss": 0.3064, + "step": 7439 + }, + { + "epoch": 0.43, + "grad_norm": 0.39420878524487984, + "learning_rate": 1.279816607279349e-05, + "loss": 0.2677, + "step": 7440 + }, + { + "epoch": 0.43, + "grad_norm": 0.5102506758920938, + "learning_rate": 1.279637944875233e-05, + "loss": 0.3898, + "step": 7441 + }, + { + "epoch": 0.43, + "grad_norm": 0.3100867254059578, + "learning_rate": 1.2794592727872665e-05, + "loss": 0.2904, + "step": 7442 + }, + { + "epoch": 0.43, + "grad_norm": 0.2778272955079186, + "learning_rate": 1.279280591021638e-05, + "loss": 0.1926, + "step": 7443 + }, + { + "epoch": 0.43, + "grad_norm": 0.3652081080576812, + "learning_rate": 1.2791018995845343e-05, + "loss": 0.2066, + "step": 7444 + }, + { + "epoch": 0.43, + "grad_norm": 0.4003295351901793, + "learning_rate": 1.278923198482144e-05, + "loss": 0.2906, + "step": 7445 + }, + { + "epoch": 0.43, + "grad_norm": 0.31584749571079734, + "learning_rate": 1.2787444877206552e-05, + "loss": 0.2526, + "step": 7446 + }, + { + "epoch": 0.43, + "grad_norm": 0.6723729764280729, + "learning_rate": 1.2785657673062567e-05, + "loss": 0.4328, + "step": 7447 + }, + { + "epoch": 0.43, + "grad_norm": 0.4069163041642835, + "learning_rate": 1.2783870372451377e-05, + "loss": 0.3099, + "step": 7448 + }, + { + "epoch": 0.43, + "grad_norm": 0.5471645897491344, + "learning_rate": 1.278208297543488e-05, + "loss": 0.3764, + "step": 7449 + }, + { + "epoch": 0.43, + "grad_norm": 0.22582651123557201, + "learning_rate": 1.2780295482074965e-05, + "loss": 0.1677, + "step": 7450 + }, + { + "epoch": 0.43, + "grad_norm": 0.38504551814048305, + "learning_rate": 1.2778507892433538e-05, + "loss": 0.2699, + "step": 7451 + }, + { + "epoch": 0.43, + "grad_norm": 0.5300421426970018, + "learning_rate": 1.2776720206572502e-05, + "loss": 0.3693, + "step": 7452 + }, + { + "epoch": 0.43, + "grad_norm": 0.4316438122749434, + "learning_rate": 1.2774932424553763e-05, + "loss": 0.3068, + "step": 7453 + }, + { + "epoch": 0.43, + "grad_norm": 0.6039224321850404, + "learning_rate": 1.2773144546439235e-05, + "loss": 0.3833, + "step": 7454 + }, + { + "epoch": 0.43, + "grad_norm": 0.41397690083601735, + "learning_rate": 1.2771356572290834e-05, + "loss": 0.3263, + "step": 7455 + }, + { + "epoch": 0.43, + "grad_norm": 0.22875703408665155, + "learning_rate": 1.2769568502170469e-05, + "loss": 0.1126, + "step": 7456 + }, + { + "epoch": 0.43, + "grad_norm": 0.38290759108779704, + "learning_rate": 1.276778033614007e-05, + "loss": 0.2816, + "step": 7457 + }, + { + "epoch": 0.43, + "grad_norm": 0.3645825413303027, + "learning_rate": 1.2765992074261555e-05, + "loss": 0.3286, + "step": 7458 + }, + { + "epoch": 0.43, + "grad_norm": 0.7066287348903324, + "learning_rate": 1.276420371659685e-05, + "loss": 0.3722, + "step": 7459 + }, + { + "epoch": 0.43, + "grad_norm": 0.6244111465506637, + "learning_rate": 1.2762415263207895e-05, + "loss": 0.3676, + "step": 7460 + }, + { + "epoch": 0.43, + "grad_norm": 0.2867319890884178, + "learning_rate": 1.2760626714156614e-05, + "loss": 0.2733, + "step": 7461 + }, + { + "epoch": 0.43, + "grad_norm": 0.37856860799168346, + "learning_rate": 1.2758838069504952e-05, + "loss": 0.2388, + "step": 7462 + }, + { + "epoch": 0.43, + "grad_norm": 0.27983356612817223, + "learning_rate": 1.2757049329314842e-05, + "loss": 0.1823, + "step": 7463 + }, + { + "epoch": 0.43, + "grad_norm": 0.675249528883432, + "learning_rate": 1.2755260493648235e-05, + "loss": 0.3907, + "step": 7464 + }, + { + "epoch": 0.43, + "grad_norm": 0.3990296921748976, + "learning_rate": 1.2753471562567074e-05, + "loss": 0.3226, + "step": 7465 + }, + { + "epoch": 0.43, + "grad_norm": 0.3446096915891851, + "learning_rate": 1.2751682536133313e-05, + "loss": 0.2236, + "step": 7466 + }, + { + "epoch": 0.43, + "grad_norm": 0.5313072118472417, + "learning_rate": 1.2749893414408903e-05, + "loss": 0.3751, + "step": 7467 + }, + { + "epoch": 0.43, + "grad_norm": 0.265255518955803, + "learning_rate": 1.2748104197455804e-05, + "loss": 0.1889, + "step": 7468 + }, + { + "epoch": 0.43, + "grad_norm": 0.3086628494538964, + "learning_rate": 1.274631488533597e-05, + "loss": 0.235, + "step": 7469 + }, + { + "epoch": 0.43, + "grad_norm": 0.3955694497758381, + "learning_rate": 1.2744525478111379e-05, + "loss": 0.3382, + "step": 7470 + }, + { + "epoch": 0.43, + "grad_norm": 0.7593369964854633, + "learning_rate": 1.274273597584398e-05, + "loss": 0.5741, + "step": 7471 + }, + { + "epoch": 0.43, + "grad_norm": 0.38222913352863014, + "learning_rate": 1.2740946378595758e-05, + "loss": 0.1657, + "step": 7472 + }, + { + "epoch": 0.43, + "grad_norm": 0.3301246287937124, + "learning_rate": 1.2739156686428675e-05, + "loss": 0.2843, + "step": 7473 + }, + { + "epoch": 0.43, + "grad_norm": 0.47079369177307934, + "learning_rate": 1.2737366899404718e-05, + "loss": 0.309, + "step": 7474 + }, + { + "epoch": 0.43, + "grad_norm": 0.6375823621608645, + "learning_rate": 1.273557701758586e-05, + "loss": 0.4102, + "step": 7475 + }, + { + "epoch": 0.43, + "grad_norm": 0.3015373234296537, + "learning_rate": 1.2733787041034092e-05, + "loss": 0.2052, + "step": 7476 + }, + { + "epoch": 0.43, + "grad_norm": 0.3550705089146975, + "learning_rate": 1.2731996969811393e-05, + "loss": 0.2863, + "step": 7477 + }, + { + "epoch": 0.43, + "grad_norm": 0.7378788393900801, + "learning_rate": 1.2730206803979754e-05, + "loss": 0.5696, + "step": 7478 + }, + { + "epoch": 0.43, + "grad_norm": 0.3263003907992609, + "learning_rate": 1.2728416543601177e-05, + "loss": 0.2251, + "step": 7479 + }, + { + "epoch": 0.43, + "grad_norm": 0.775695398769665, + "learning_rate": 1.2726626188737647e-05, + "loss": 0.4716, + "step": 7480 + }, + { + "epoch": 0.43, + "grad_norm": 0.33048671823166975, + "learning_rate": 1.2724835739451172e-05, + "loss": 0.2793, + "step": 7481 + }, + { + "epoch": 0.43, + "grad_norm": 0.2443173066345539, + "learning_rate": 1.272304519580375e-05, + "loss": 0.1975, + "step": 7482 + }, + { + "epoch": 0.43, + "grad_norm": 0.37993338584071, + "learning_rate": 1.2721254557857394e-05, + "loss": 0.255, + "step": 7483 + }, + { + "epoch": 0.43, + "grad_norm": 0.5994907663851344, + "learning_rate": 1.2719463825674105e-05, + "loss": 0.3759, + "step": 7484 + }, + { + "epoch": 0.43, + "grad_norm": 0.35639175499384124, + "learning_rate": 1.2717672999315904e-05, + "loss": 0.2441, + "step": 7485 + }, + { + "epoch": 0.43, + "grad_norm": 0.5549223599700548, + "learning_rate": 1.2715882078844804e-05, + "loss": 0.3907, + "step": 7486 + }, + { + "epoch": 0.43, + "grad_norm": 0.5760563913885721, + "learning_rate": 1.2714091064322824e-05, + "loss": 0.3488, + "step": 7487 + }, + { + "epoch": 0.43, + "grad_norm": 0.30595159376311326, + "learning_rate": 1.2712299955811987e-05, + "loss": 0.2508, + "step": 7488 + }, + { + "epoch": 0.43, + "grad_norm": 0.2638652449536984, + "learning_rate": 1.2710508753374317e-05, + "loss": 0.2126, + "step": 7489 + }, + { + "epoch": 0.43, + "grad_norm": 1.237358245192641, + "learning_rate": 1.2708717457071849e-05, + "loss": 0.7199, + "step": 7490 + }, + { + "epoch": 0.43, + "grad_norm": 0.36818599307685185, + "learning_rate": 1.270692606696661e-05, + "loss": 0.2631, + "step": 7491 + }, + { + "epoch": 0.43, + "grad_norm": 0.471986003217928, + "learning_rate": 1.2705134583120638e-05, + "loss": 0.3152, + "step": 7492 + }, + { + "epoch": 0.43, + "grad_norm": 0.42344872331296374, + "learning_rate": 1.2703343005595972e-05, + "loss": 0.3291, + "step": 7493 + }, + { + "epoch": 0.43, + "grad_norm": 0.36850684943447043, + "learning_rate": 1.2701551334454652e-05, + "loss": 0.2939, + "step": 7494 + }, + { + "epoch": 0.43, + "grad_norm": 0.21811476150078793, + "learning_rate": 1.2699759569758727e-05, + "loss": 0.1028, + "step": 7495 + }, + { + "epoch": 0.43, + "grad_norm": 0.6464980054789515, + "learning_rate": 1.2697967711570243e-05, + "loss": 0.4877, + "step": 7496 + }, + { + "epoch": 0.43, + "grad_norm": 0.28044990754809157, + "learning_rate": 1.2696175759951254e-05, + "loss": 0.2786, + "step": 7497 + }, + { + "epoch": 0.43, + "grad_norm": 0.6001584093412066, + "learning_rate": 1.2694383714963818e-05, + "loss": 0.3558, + "step": 7498 + }, + { + "epoch": 0.43, + "grad_norm": 0.6795937624798115, + "learning_rate": 1.2692591576669983e-05, + "loss": 0.443, + "step": 7499 + }, + { + "epoch": 0.43, + "grad_norm": 0.2836693554883349, + "learning_rate": 1.2690799345131824e-05, + "loss": 0.2405, + "step": 7500 + }, + { + "epoch": 0.43, + "grad_norm": 0.3201123731313662, + "learning_rate": 1.2689007020411394e-05, + "loss": 0.2822, + "step": 7501 + }, + { + "epoch": 0.43, + "grad_norm": 0.36880094607275504, + "learning_rate": 1.268721460257077e-05, + "loss": 0.2344, + "step": 7502 + }, + { + "epoch": 0.43, + "grad_norm": 0.42415078679604923, + "learning_rate": 1.2685422091672017e-05, + "loss": 0.3176, + "step": 7503 + }, + { + "epoch": 0.43, + "grad_norm": 1.0388651965272275, + "learning_rate": 1.2683629487777219e-05, + "loss": 0.4227, + "step": 7504 + }, + { + "epoch": 0.43, + "grad_norm": 0.26138461355474946, + "learning_rate": 1.268183679094844e-05, + "loss": 0.2142, + "step": 7505 + }, + { + "epoch": 0.43, + "grad_norm": 0.38880693292751656, + "learning_rate": 1.2680044001247774e-05, + "loss": 0.3062, + "step": 7506 + }, + { + "epoch": 0.43, + "grad_norm": 0.28854957691060595, + "learning_rate": 1.2678251118737293e-05, + "loss": 0.2059, + "step": 7507 + }, + { + "epoch": 0.43, + "grad_norm": 0.6409188766262047, + "learning_rate": 1.2676458143479095e-05, + "loss": 0.306, + "step": 7508 + }, + { + "epoch": 0.43, + "grad_norm": 0.3106921649626693, + "learning_rate": 1.2674665075535264e-05, + "loss": 0.2657, + "step": 7509 + }, + { + "epoch": 0.43, + "grad_norm": 0.9982593919162711, + "learning_rate": 1.26728719149679e-05, + "loss": 0.4393, + "step": 7510 + }, + { + "epoch": 0.43, + "grad_norm": 1.0040110349618188, + "learning_rate": 1.2671078661839093e-05, + "loss": 0.2504, + "step": 7511 + }, + { + "epoch": 0.43, + "grad_norm": 0.27760287035235526, + "learning_rate": 1.2669285316210948e-05, + "loss": 0.2207, + "step": 7512 + }, + { + "epoch": 0.43, + "grad_norm": 0.33738457628735213, + "learning_rate": 1.2667491878145568e-05, + "loss": 0.2674, + "step": 7513 + }, + { + "epoch": 0.43, + "grad_norm": 0.8728551700618518, + "learning_rate": 1.266569834770506e-05, + "loss": 0.4935, + "step": 7514 + }, + { + "epoch": 0.43, + "grad_norm": 0.5237880393460295, + "learning_rate": 1.266390472495153e-05, + "loss": 0.2501, + "step": 7515 + }, + { + "epoch": 0.43, + "grad_norm": 1.2321177960808543, + "learning_rate": 1.2662111009947096e-05, + "loss": 0.7564, + "step": 7516 + }, + { + "epoch": 0.43, + "grad_norm": 0.3852958785860179, + "learning_rate": 1.2660317202753873e-05, + "loss": 0.308, + "step": 7517 + }, + { + "epoch": 0.43, + "grad_norm": 0.2878776513060847, + "learning_rate": 1.2658523303433979e-05, + "loss": 0.2114, + "step": 7518 + }, + { + "epoch": 0.43, + "grad_norm": 0.3866565714290425, + "learning_rate": 1.2656729312049536e-05, + "loss": 0.2787, + "step": 7519 + }, + { + "epoch": 0.43, + "grad_norm": 0.44929076242454746, + "learning_rate": 1.265493522866267e-05, + "loss": 0.3211, + "step": 7520 + }, + { + "epoch": 0.43, + "grad_norm": 0.31019924286079437, + "learning_rate": 1.2653141053335513e-05, + "loss": 0.2021, + "step": 7521 + }, + { + "epoch": 0.43, + "grad_norm": 0.5257400746389835, + "learning_rate": 1.2651346786130199e-05, + "loss": 0.3338, + "step": 7522 + }, + { + "epoch": 0.43, + "grad_norm": 0.7700473026585529, + "learning_rate": 1.2649552427108856e-05, + "loss": 0.4551, + "step": 7523 + }, + { + "epoch": 0.43, + "grad_norm": 0.37836722990428473, + "learning_rate": 1.2647757976333626e-05, + "loss": 0.2697, + "step": 7524 + }, + { + "epoch": 0.43, + "grad_norm": 0.40109303353556514, + "learning_rate": 1.2645963433866653e-05, + "loss": 0.2891, + "step": 7525 + }, + { + "epoch": 0.43, + "grad_norm": 0.5801053269128108, + "learning_rate": 1.2644168799770075e-05, + "loss": 0.4479, + "step": 7526 + }, + { + "epoch": 0.43, + "grad_norm": 0.3038245233952448, + "learning_rate": 1.2642374074106052e-05, + "loss": 0.2673, + "step": 7527 + }, + { + "epoch": 0.43, + "grad_norm": 0.23955365444396562, + "learning_rate": 1.2640579256936723e-05, + "loss": 0.195, + "step": 7528 + }, + { + "epoch": 0.43, + "grad_norm": 0.9894905671520695, + "learning_rate": 1.263878434832425e-05, + "loss": 0.5351, + "step": 7529 + }, + { + "epoch": 0.43, + "grad_norm": 0.33128274197373925, + "learning_rate": 1.2636989348330791e-05, + "loss": 0.2585, + "step": 7530 + }, + { + "epoch": 0.43, + "grad_norm": 0.7002081987136007, + "learning_rate": 1.2635194257018501e-05, + "loss": 0.3638, + "step": 7531 + }, + { + "epoch": 0.43, + "grad_norm": 0.3775323790192974, + "learning_rate": 1.2633399074449548e-05, + "loss": 0.3146, + "step": 7532 + }, + { + "epoch": 0.43, + "grad_norm": 0.33698150077075945, + "learning_rate": 1.2631603800686099e-05, + "loss": 0.279, + "step": 7533 + }, + { + "epoch": 0.43, + "grad_norm": 0.18218076324549323, + "learning_rate": 1.2629808435790322e-05, + "loss": 0.1041, + "step": 7534 + }, + { + "epoch": 0.43, + "grad_norm": 1.185980333284646, + "learning_rate": 1.2628012979824394e-05, + "loss": 0.7281, + "step": 7535 + }, + { + "epoch": 0.43, + "grad_norm": 0.34323992321440966, + "learning_rate": 1.262621743285049e-05, + "loss": 0.262, + "step": 7536 + }, + { + "epoch": 0.43, + "grad_norm": 0.37375107807679175, + "learning_rate": 1.2624421794930785e-05, + "loss": 0.3399, + "step": 7537 + }, + { + "epoch": 0.43, + "grad_norm": 0.9873288871171293, + "learning_rate": 1.262262606612747e-05, + "loss": 0.3846, + "step": 7538 + }, + { + "epoch": 0.43, + "grad_norm": 0.4163728309097746, + "learning_rate": 1.2620830246502729e-05, + "loss": 0.2865, + "step": 7539 + }, + { + "epoch": 0.43, + "grad_norm": 0.29402576393144825, + "learning_rate": 1.2619034336118742e-05, + "loss": 0.2692, + "step": 7540 + }, + { + "epoch": 0.43, + "grad_norm": 0.3024954021730123, + "learning_rate": 1.2617238335037717e-05, + "loss": 0.222, + "step": 7541 + }, + { + "epoch": 0.43, + "grad_norm": 0.4399043946526552, + "learning_rate": 1.2615442243321837e-05, + "loss": 0.2605, + "step": 7542 + }, + { + "epoch": 0.43, + "grad_norm": 0.6267856143786402, + "learning_rate": 1.2613646061033303e-05, + "loss": 0.3875, + "step": 7543 + }, + { + "epoch": 0.43, + "grad_norm": 0.383120587325467, + "learning_rate": 1.261184978823432e-05, + "loss": 0.2875, + "step": 7544 + }, + { + "epoch": 0.43, + "grad_norm": 0.3409458484018371, + "learning_rate": 1.261005342498709e-05, + "loss": 0.277, + "step": 7545 + }, + { + "epoch": 0.43, + "grad_norm": 0.25057745497441336, + "learning_rate": 1.2608256971353825e-05, + "loss": 0.1904, + "step": 7546 + }, + { + "epoch": 0.43, + "grad_norm": 0.9892133398630651, + "learning_rate": 1.2606460427396729e-05, + "loss": 0.0705, + "step": 7547 + }, + { + "epoch": 0.43, + "grad_norm": 0.3220890480663572, + "learning_rate": 1.2604663793178025e-05, + "loss": 0.2779, + "step": 7548 + }, + { + "epoch": 0.43, + "grad_norm": 0.3346067280468399, + "learning_rate": 1.2602867068759921e-05, + "loss": 0.3242, + "step": 7549 + }, + { + "epoch": 0.43, + "grad_norm": 0.9802497675330882, + "learning_rate": 1.2601070254204645e-05, + "loss": 0.7108, + "step": 7550 + }, + { + "epoch": 0.43, + "grad_norm": 0.3310026474654711, + "learning_rate": 1.2599273349574417e-05, + "loss": 0.2248, + "step": 7551 + }, + { + "epoch": 0.43, + "grad_norm": 0.4692480509388019, + "learning_rate": 1.2597476354931466e-05, + "loss": 0.3678, + "step": 7552 + }, + { + "epoch": 0.43, + "grad_norm": 0.26325901123873297, + "learning_rate": 1.2595679270338019e-05, + "loss": 0.2001, + "step": 7553 + }, + { + "epoch": 0.43, + "grad_norm": 0.3018764077905086, + "learning_rate": 1.2593882095856314e-05, + "loss": 0.2367, + "step": 7554 + }, + { + "epoch": 0.43, + "grad_norm": 0.6035545300935379, + "learning_rate": 1.259208483154858e-05, + "loss": 0.4614, + "step": 7555 + }, + { + "epoch": 0.43, + "grad_norm": 0.3817271648715765, + "learning_rate": 1.2590287477477063e-05, + "loss": 0.3635, + "step": 7556 + }, + { + "epoch": 0.43, + "grad_norm": 0.3606104677787802, + "learning_rate": 1.2588490033703999e-05, + "loss": 0.1954, + "step": 7557 + }, + { + "epoch": 0.43, + "grad_norm": 0.33357966342409084, + "learning_rate": 1.2586692500291639e-05, + "loss": 0.2763, + "step": 7558 + }, + { + "epoch": 0.43, + "grad_norm": 0.4054777299779856, + "learning_rate": 1.2584894877302224e-05, + "loss": 0.3124, + "step": 7559 + }, + { + "epoch": 0.43, + "grad_norm": 0.2770112585553754, + "learning_rate": 1.2583097164798015e-05, + "loss": 0.2086, + "step": 7560 + }, + { + "epoch": 0.43, + "grad_norm": 0.33701165254254206, + "learning_rate": 1.2581299362841262e-05, + "loss": 0.3406, + "step": 7561 + }, + { + "epoch": 0.43, + "grad_norm": 1.5593734746120151, + "learning_rate": 1.2579501471494222e-05, + "loss": 0.7475, + "step": 7562 + }, + { + "epoch": 0.43, + "grad_norm": 0.6377691899908655, + "learning_rate": 1.2577703490819157e-05, + "loss": 0.3114, + "step": 7563 + }, + { + "epoch": 0.43, + "grad_norm": 0.27445686268636504, + "learning_rate": 1.2575905420878332e-05, + "loss": 0.2575, + "step": 7564 + }, + { + "epoch": 0.43, + "grad_norm": 0.26622696284658587, + "learning_rate": 1.2574107261734012e-05, + "loss": 0.2002, + "step": 7565 + }, + { + "epoch": 0.43, + "grad_norm": 0.37561943751948673, + "learning_rate": 1.2572309013448467e-05, + "loss": 0.2893, + "step": 7566 + }, + { + "epoch": 0.43, + "grad_norm": 0.4144161359543641, + "learning_rate": 1.2570510676083974e-05, + "loss": 0.2954, + "step": 7567 + }, + { + "epoch": 0.43, + "grad_norm": 0.36156343975898747, + "learning_rate": 1.2568712249702806e-05, + "loss": 0.3277, + "step": 7568 + }, + { + "epoch": 0.43, + "grad_norm": 0.41364106807789164, + "learning_rate": 1.2566913734367244e-05, + "loss": 0.2982, + "step": 7569 + }, + { + "epoch": 0.43, + "grad_norm": 0.4642819832894536, + "learning_rate": 1.2565115130139565e-05, + "loss": 0.2253, + "step": 7570 + }, + { + "epoch": 0.43, + "grad_norm": 0.5140464591085009, + "learning_rate": 1.2563316437082065e-05, + "loss": 0.4326, + "step": 7571 + }, + { + "epoch": 0.44, + "grad_norm": 0.26369296742189074, + "learning_rate": 1.2561517655257025e-05, + "loss": 0.2555, + "step": 7572 + }, + { + "epoch": 0.44, + "grad_norm": 0.25556862333416436, + "learning_rate": 1.255971878472674e-05, + "loss": 0.1509, + "step": 7573 + }, + { + "epoch": 0.44, + "grad_norm": 1.0276481868351715, + "learning_rate": 1.2557919825553502e-05, + "loss": 0.6818, + "step": 7574 + }, + { + "epoch": 0.44, + "grad_norm": 0.6329669628544323, + "learning_rate": 1.255612077779961e-05, + "loss": 0.3306, + "step": 7575 + }, + { + "epoch": 0.44, + "grad_norm": 0.32794636640693425, + "learning_rate": 1.2554321641527367e-05, + "loss": 0.2998, + "step": 7576 + }, + { + "epoch": 0.44, + "grad_norm": 0.5110473328508144, + "learning_rate": 1.2552522416799073e-05, + "loss": 0.3151, + "step": 7577 + }, + { + "epoch": 0.44, + "grad_norm": 0.41058569117740423, + "learning_rate": 1.2550723103677039e-05, + "loss": 0.2154, + "step": 7578 + }, + { + "epoch": 0.44, + "grad_norm": 0.32708584999844575, + "learning_rate": 1.2548923702223574e-05, + "loss": 0.251, + "step": 7579 + }, + { + "epoch": 0.44, + "grad_norm": 0.3670176202785912, + "learning_rate": 1.254712421250099e-05, + "loss": 0.2829, + "step": 7580 + }, + { + "epoch": 0.44, + "grad_norm": 0.5745557867723556, + "learning_rate": 1.2545324634571604e-05, + "loss": 0.3194, + "step": 7581 + }, + { + "epoch": 0.44, + "grad_norm": 0.40854419253185176, + "learning_rate": 1.2543524968497738e-05, + "loss": 0.3315, + "step": 7582 + }, + { + "epoch": 0.44, + "grad_norm": 0.6312594387495574, + "learning_rate": 1.2541725214341708e-05, + "loss": 0.3503, + "step": 7583 + }, + { + "epoch": 0.44, + "grad_norm": 0.3346389065280756, + "learning_rate": 1.2539925372165847e-05, + "loss": 0.2932, + "step": 7584 + }, + { + "epoch": 0.44, + "grad_norm": 0.23633536868367974, + "learning_rate": 1.2538125442032474e-05, + "loss": 0.2035, + "step": 7585 + }, + { + "epoch": 0.44, + "grad_norm": 0.9129265657545105, + "learning_rate": 1.2536325424003932e-05, + "loss": 0.5529, + "step": 7586 + }, + { + "epoch": 0.44, + "grad_norm": 0.38335161706444687, + "learning_rate": 1.2534525318142546e-05, + "loss": 0.3163, + "step": 7587 + }, + { + "epoch": 0.44, + "grad_norm": 0.37815106189321185, + "learning_rate": 1.2532725124510658e-05, + "loss": 0.3199, + "step": 7588 + }, + { + "epoch": 0.44, + "grad_norm": 0.45945715714105756, + "learning_rate": 1.2530924843170609e-05, + "loss": 0.3619, + "step": 7589 + }, + { + "epoch": 0.44, + "grad_norm": 0.25719376284057816, + "learning_rate": 1.2529124474184742e-05, + "loss": 0.154, + "step": 7590 + }, + { + "epoch": 0.44, + "grad_norm": 0.352646974082044, + "learning_rate": 1.25273240176154e-05, + "loss": 0.2434, + "step": 7591 + }, + { + "epoch": 0.44, + "grad_norm": 0.40108702245255196, + "learning_rate": 1.2525523473524938e-05, + "loss": 0.3153, + "step": 7592 + }, + { + "epoch": 0.44, + "grad_norm": 0.5606921794556875, + "learning_rate": 1.2523722841975707e-05, + "loss": 0.153, + "step": 7593 + }, + { + "epoch": 0.44, + "grad_norm": 0.35881739246190164, + "learning_rate": 1.2521922123030061e-05, + "loss": 0.3164, + "step": 7594 + }, + { + "epoch": 0.44, + "grad_norm": 0.4775366714403366, + "learning_rate": 1.252012131675036e-05, + "loss": 0.3781, + "step": 7595 + }, + { + "epoch": 0.44, + "grad_norm": 0.3417720512306077, + "learning_rate": 1.2518320423198968e-05, + "loss": 0.2006, + "step": 7596 + }, + { + "epoch": 0.44, + "grad_norm": 0.2217803683706333, + "learning_rate": 1.251651944243825e-05, + "loss": 0.207, + "step": 7597 + }, + { + "epoch": 0.44, + "grad_norm": 0.9289910609233664, + "learning_rate": 1.2514718374530568e-05, + "loss": 0.6753, + "step": 7598 + }, + { + "epoch": 0.44, + "grad_norm": 0.5061395116029092, + "learning_rate": 1.2512917219538297e-05, + "loss": 0.2249, + "step": 7599 + }, + { + "epoch": 0.44, + "grad_norm": 0.3169813552029995, + "learning_rate": 1.2511115977523813e-05, + "loss": 0.3035, + "step": 7600 + }, + { + "epoch": 0.44, + "grad_norm": 0.7382839722814659, + "learning_rate": 1.2509314648549491e-05, + "loss": 0.5035, + "step": 7601 + }, + { + "epoch": 0.44, + "grad_norm": 0.7547998463825323, + "learning_rate": 1.2507513232677707e-05, + "loss": 0.397, + "step": 7602 + }, + { + "epoch": 0.44, + "grad_norm": 0.2598908072442766, + "learning_rate": 1.250571172997085e-05, + "loss": 0.1986, + "step": 7603 + }, + { + "epoch": 0.44, + "grad_norm": 0.40470480560147853, + "learning_rate": 1.2503910140491305e-05, + "loss": 0.3349, + "step": 7604 + }, + { + "epoch": 0.44, + "grad_norm": 0.5543636608248991, + "learning_rate": 1.2502108464301456e-05, + "loss": 0.3531, + "step": 7605 + }, + { + "epoch": 0.44, + "grad_norm": 0.4062889895351509, + "learning_rate": 1.2500306701463702e-05, + "loss": 0.2834, + "step": 7606 + }, + { + "epoch": 0.44, + "grad_norm": 0.3447667781654027, + "learning_rate": 1.2498504852040433e-05, + "loss": 0.2808, + "step": 7607 + }, + { + "epoch": 0.44, + "grad_norm": 0.363996062685284, + "learning_rate": 1.2496702916094048e-05, + "loss": 0.2969, + "step": 7608 + }, + { + "epoch": 0.44, + "grad_norm": 0.26788663410836405, + "learning_rate": 1.2494900893686949e-05, + "loss": 0.1817, + "step": 7609 + }, + { + "epoch": 0.44, + "grad_norm": 0.9228599503181674, + "learning_rate": 1.2493098784881539e-05, + "loss": 0.5611, + "step": 7610 + }, + { + "epoch": 0.44, + "grad_norm": 0.3804507345579722, + "learning_rate": 1.2491296589740224e-05, + "loss": 0.3399, + "step": 7611 + }, + { + "epoch": 0.44, + "grad_norm": 0.2948051354691912, + "learning_rate": 1.2489494308325415e-05, + "loss": 0.2742, + "step": 7612 + }, + { + "epoch": 0.44, + "grad_norm": 0.3363350955735608, + "learning_rate": 1.2487691940699529e-05, + "loss": 0.2011, + "step": 7613 + }, + { + "epoch": 0.44, + "grad_norm": 1.1140463461241292, + "learning_rate": 1.2485889486924974e-05, + "loss": 0.6561, + "step": 7614 + }, + { + "epoch": 0.44, + "grad_norm": 0.3030326221824844, + "learning_rate": 1.2484086947064175e-05, + "loss": 0.2679, + "step": 7615 + }, + { + "epoch": 0.44, + "grad_norm": 0.36992495041043094, + "learning_rate": 1.2482284321179552e-05, + "loss": 0.2854, + "step": 7616 + }, + { + "epoch": 0.44, + "grad_norm": 0.7049813104276725, + "learning_rate": 1.248048160933353e-05, + "loss": 0.3893, + "step": 7617 + }, + { + "epoch": 0.44, + "grad_norm": 0.2756630402796699, + "learning_rate": 1.2478678811588535e-05, + "loss": 0.2303, + "step": 7618 + }, + { + "epoch": 0.44, + "grad_norm": 0.26760176350237963, + "learning_rate": 1.2476875928006999e-05, + "loss": 0.1611, + "step": 7619 + }, + { + "epoch": 0.44, + "grad_norm": 0.41829783407336724, + "learning_rate": 1.2475072958651358e-05, + "loss": 0.3304, + "step": 7620 + }, + { + "epoch": 0.44, + "grad_norm": 0.35645273997227866, + "learning_rate": 1.2473269903584046e-05, + "loss": 0.2964, + "step": 7621 + }, + { + "epoch": 0.44, + "grad_norm": 0.8604531292814376, + "learning_rate": 1.2471466762867506e-05, + "loss": 0.3573, + "step": 7622 + }, + { + "epoch": 0.44, + "grad_norm": 0.3153244327743291, + "learning_rate": 1.2469663536564177e-05, + "loss": 0.3042, + "step": 7623 + }, + { + "epoch": 0.44, + "grad_norm": 0.3374073034408949, + "learning_rate": 1.2467860224736501e-05, + "loss": 0.2689, + "step": 7624 + }, + { + "epoch": 0.44, + "grad_norm": 0.2715934504776164, + "learning_rate": 1.2466056827446937e-05, + "loss": 0.1519, + "step": 7625 + }, + { + "epoch": 0.44, + "grad_norm": 0.4920404791571963, + "learning_rate": 1.2464253344757929e-05, + "loss": 0.1767, + "step": 7626 + }, + { + "epoch": 0.44, + "grad_norm": 0.40765441951874637, + "learning_rate": 1.2462449776731935e-05, + "loss": 0.3151, + "step": 7627 + }, + { + "epoch": 0.44, + "grad_norm": 0.4929507662219086, + "learning_rate": 1.246064612343141e-05, + "loss": 0.3614, + "step": 7628 + }, + { + "epoch": 0.44, + "grad_norm": 0.8465290944998883, + "learning_rate": 1.2458842384918815e-05, + "loss": 0.3429, + "step": 7629 + }, + { + "epoch": 0.44, + "grad_norm": 0.37771971215198696, + "learning_rate": 1.2457038561256616e-05, + "loss": 0.2933, + "step": 7630 + }, + { + "epoch": 0.44, + "grad_norm": 0.25817417503857, + "learning_rate": 1.2455234652507276e-05, + "loss": 0.2179, + "step": 7631 + }, + { + "epoch": 0.44, + "grad_norm": 1.003852559033286, + "learning_rate": 1.2453430658733265e-05, + "loss": 0.2929, + "step": 7632 + }, + { + "epoch": 0.44, + "grad_norm": 0.33167390172321803, + "learning_rate": 1.2451626579997056e-05, + "loss": 0.2602, + "step": 7633 + }, + { + "epoch": 0.44, + "grad_norm": 0.8273407599930446, + "learning_rate": 1.2449822416361123e-05, + "loss": 0.4349, + "step": 7634 + }, + { + "epoch": 0.44, + "grad_norm": 0.3353811089623452, + "learning_rate": 1.2448018167887947e-05, + "loss": 0.264, + "step": 7635 + }, + { + "epoch": 0.44, + "grad_norm": 0.34648324200161335, + "learning_rate": 1.2446213834640007e-05, + "loss": 0.2492, + "step": 7636 + }, + { + "epoch": 0.44, + "grad_norm": 0.30946096151194946, + "learning_rate": 1.2444409416679786e-05, + "loss": 0.2142, + "step": 7637 + }, + { + "epoch": 0.44, + "grad_norm": 0.813049180954248, + "learning_rate": 1.2442604914069773e-05, + "loss": 0.4697, + "step": 7638 + }, + { + "epoch": 0.44, + "grad_norm": 0.3757559066901632, + "learning_rate": 1.2440800326872457e-05, + "loss": 0.2301, + "step": 7639 + }, + { + "epoch": 0.44, + "grad_norm": 0.4310357553833105, + "learning_rate": 1.2438995655150332e-05, + "loss": 0.3578, + "step": 7640 + }, + { + "epoch": 0.44, + "grad_norm": 1.0897373835543203, + "learning_rate": 1.243719089896589e-05, + "loss": 0.7668, + "step": 7641 + }, + { + "epoch": 0.44, + "grad_norm": 0.2947300212873115, + "learning_rate": 1.2435386058381634e-05, + "loss": 0.2155, + "step": 7642 + }, + { + "epoch": 0.44, + "grad_norm": 0.2607968228356934, + "learning_rate": 1.243358113346006e-05, + "loss": 0.2048, + "step": 7643 + }, + { + "epoch": 0.44, + "grad_norm": 0.444883989342041, + "learning_rate": 1.243177612426368e-05, + "loss": 0.368, + "step": 7644 + }, + { + "epoch": 0.44, + "grad_norm": 0.3069080743699087, + "learning_rate": 1.2429971030854993e-05, + "loss": 0.1943, + "step": 7645 + }, + { + "epoch": 0.44, + "grad_norm": 0.8069208624859093, + "learning_rate": 1.2428165853296517e-05, + "loss": 0.4731, + "step": 7646 + }, + { + "epoch": 0.44, + "grad_norm": 0.3425899314113828, + "learning_rate": 1.2426360591650761e-05, + "loss": 0.2992, + "step": 7647 + }, + { + "epoch": 0.44, + "grad_norm": 0.3365953563747308, + "learning_rate": 1.2424555245980241e-05, + "loss": 0.2127, + "step": 7648 + }, + { + "epoch": 0.44, + "grad_norm": 0.281929789738832, + "learning_rate": 1.2422749816347479e-05, + "loss": 0.1837, + "step": 7649 + }, + { + "epoch": 0.44, + "grad_norm": 0.8104273831878697, + "learning_rate": 1.2420944302814992e-05, + "loss": 0.4816, + "step": 7650 + }, + { + "epoch": 0.44, + "grad_norm": 0.34569478851091506, + "learning_rate": 1.2419138705445314e-05, + "loss": 0.2962, + "step": 7651 + }, + { + "epoch": 0.44, + "grad_norm": 0.3634484166137741, + "learning_rate": 1.241733302430096e-05, + "loss": 0.2862, + "step": 7652 + }, + { + "epoch": 0.44, + "grad_norm": 1.0261468850569297, + "learning_rate": 1.2415527259444471e-05, + "loss": 0.6782, + "step": 7653 + }, + { + "epoch": 0.44, + "grad_norm": 0.3215611242030219, + "learning_rate": 1.2413721410938373e-05, + "loss": 0.2798, + "step": 7654 + }, + { + "epoch": 0.44, + "grad_norm": 0.2449067046131677, + "learning_rate": 1.2411915478845211e-05, + "loss": 0.1634, + "step": 7655 + }, + { + "epoch": 0.44, + "grad_norm": 0.4588366813097026, + "learning_rate": 1.2410109463227519e-05, + "loss": 0.3376, + "step": 7656 + }, + { + "epoch": 0.44, + "grad_norm": 0.3169675064491585, + "learning_rate": 1.240830336414784e-05, + "loss": 0.3055, + "step": 7657 + }, + { + "epoch": 0.44, + "grad_norm": 0.8641165712233401, + "learning_rate": 1.2406497181668717e-05, + "loss": 0.4004, + "step": 7658 + }, + { + "epoch": 0.44, + "grad_norm": 0.34837061522076307, + "learning_rate": 1.2404690915852701e-05, + "loss": 0.3421, + "step": 7659 + }, + { + "epoch": 0.44, + "grad_norm": 0.3212739319107255, + "learning_rate": 1.2402884566762341e-05, + "loss": 0.2886, + "step": 7660 + }, + { + "epoch": 0.44, + "grad_norm": 0.2776234930677816, + "learning_rate": 1.2401078134460194e-05, + "loss": 0.1348, + "step": 7661 + }, + { + "epoch": 0.44, + "grad_norm": 0.41273755963460756, + "learning_rate": 1.2399271619008812e-05, + "loss": 0.3599, + "step": 7662 + }, + { + "epoch": 0.44, + "grad_norm": 0.42377632401126736, + "learning_rate": 1.2397465020470757e-05, + "loss": 0.2785, + "step": 7663 + }, + { + "epoch": 0.44, + "grad_norm": 0.3625811131545241, + "learning_rate": 1.2395658338908594e-05, + "loss": 0.3151, + "step": 7664 + }, + { + "epoch": 0.44, + "grad_norm": 0.7998829300948974, + "learning_rate": 1.2393851574384886e-05, + "loss": 0.4532, + "step": 7665 + }, + { + "epoch": 0.44, + "grad_norm": 0.40151280132662404, + "learning_rate": 1.23920447269622e-05, + "loss": 0.2937, + "step": 7666 + }, + { + "epoch": 0.44, + "grad_norm": 0.3260653072198325, + "learning_rate": 1.2390237796703107e-05, + "loss": 0.3084, + "step": 7667 + }, + { + "epoch": 0.44, + "grad_norm": 0.5939761261701787, + "learning_rate": 1.238843078367018e-05, + "loss": 0.3467, + "step": 7668 + }, + { + "epoch": 0.44, + "grad_norm": 0.2672933275218854, + "learning_rate": 1.2386623687926001e-05, + "loss": 0.2261, + "step": 7669 + }, + { + "epoch": 0.44, + "grad_norm": 0.4603784285403028, + "learning_rate": 1.2384816509533145e-05, + "loss": 0.3053, + "step": 7670 + }, + { + "epoch": 0.44, + "grad_norm": 0.32124928699513156, + "learning_rate": 1.2383009248554195e-05, + "loss": 0.245, + "step": 7671 + }, + { + "epoch": 0.44, + "grad_norm": 0.34815018711320994, + "learning_rate": 1.2381201905051738e-05, + "loss": 0.2903, + "step": 7672 + }, + { + "epoch": 0.44, + "grad_norm": 0.630884248029316, + "learning_rate": 1.2379394479088363e-05, + "loss": 0.4409, + "step": 7673 + }, + { + "epoch": 0.44, + "grad_norm": 0.37651909244520654, + "learning_rate": 1.2377586970726658e-05, + "loss": 0.2451, + "step": 7674 + }, + { + "epoch": 0.44, + "grad_norm": 0.282839422926257, + "learning_rate": 1.2375779380029218e-05, + "loss": 0.2584, + "step": 7675 + }, + { + "epoch": 0.44, + "grad_norm": 0.4063717632353682, + "learning_rate": 1.2373971707058643e-05, + "loss": 0.2808, + "step": 7676 + }, + { + "epoch": 0.44, + "grad_norm": 0.9496003441841634, + "learning_rate": 1.2372163951877525e-05, + "loss": 0.6778, + "step": 7677 + }, + { + "epoch": 0.44, + "grad_norm": 0.2875371706876149, + "learning_rate": 1.2370356114548476e-05, + "loss": 0.2067, + "step": 7678 + }, + { + "epoch": 0.44, + "grad_norm": 0.3650829049658475, + "learning_rate": 1.2368548195134094e-05, + "loss": 0.3043, + "step": 7679 + }, + { + "epoch": 0.44, + "grad_norm": 0.7485902503018145, + "learning_rate": 1.2366740193696991e-05, + "loss": 0.4163, + "step": 7680 + }, + { + "epoch": 0.44, + "grad_norm": 0.23644972909627532, + "learning_rate": 1.2364932110299775e-05, + "loss": 0.1315, + "step": 7681 + }, + { + "epoch": 0.44, + "grad_norm": 0.3908523069948069, + "learning_rate": 1.2363123945005064e-05, + "loss": 0.2984, + "step": 7682 + }, + { + "epoch": 0.44, + "grad_norm": 0.36885901856842995, + "learning_rate": 1.2361315697875469e-05, + "loss": 0.3257, + "step": 7683 + }, + { + "epoch": 0.44, + "grad_norm": 0.3138719168813281, + "learning_rate": 1.2359507368973618e-05, + "loss": 0.1951, + "step": 7684 + }, + { + "epoch": 0.44, + "grad_norm": 0.6338918277920743, + "learning_rate": 1.2357698958362123e-05, + "loss": 0.3682, + "step": 7685 + }, + { + "epoch": 0.44, + "grad_norm": 1.075166134031894, + "learning_rate": 1.2355890466103619e-05, + "loss": 0.6696, + "step": 7686 + }, + { + "epoch": 0.44, + "grad_norm": 0.2538342119211804, + "learning_rate": 1.2354081892260728e-05, + "loss": 0.2455, + "step": 7687 + }, + { + "epoch": 0.44, + "grad_norm": 0.2697614085126694, + "learning_rate": 1.2352273236896082e-05, + "loss": 0.2123, + "step": 7688 + }, + { + "epoch": 0.44, + "grad_norm": 1.115602045075998, + "learning_rate": 1.2350464500072314e-05, + "loss": 0.5415, + "step": 7689 + }, + { + "epoch": 0.44, + "grad_norm": 0.40017360263197127, + "learning_rate": 1.2348655681852064e-05, + "loss": 0.299, + "step": 7690 + }, + { + "epoch": 0.44, + "grad_norm": 0.27748444108314924, + "learning_rate": 1.2346846782297966e-05, + "loss": 0.2675, + "step": 7691 + }, + { + "epoch": 0.44, + "grad_norm": 1.024308978329414, + "learning_rate": 1.2345037801472669e-05, + "loss": 0.5259, + "step": 7692 + }, + { + "epoch": 0.44, + "grad_norm": 0.2658736169881998, + "learning_rate": 1.2343228739438811e-05, + "loss": 0.2082, + "step": 7693 + }, + { + "epoch": 0.44, + "grad_norm": 0.5155626562375746, + "learning_rate": 1.2341419596259038e-05, + "loss": 0.295, + "step": 7694 + }, + { + "epoch": 0.44, + "grad_norm": 0.26905222497297376, + "learning_rate": 1.2339610371996012e-05, + "loss": 0.2577, + "step": 7695 + }, + { + "epoch": 0.44, + "grad_norm": 0.3606434958043496, + "learning_rate": 1.2337801066712376e-05, + "loss": 0.3027, + "step": 7696 + }, + { + "epoch": 0.44, + "grad_norm": 0.49890333651717567, + "learning_rate": 1.2335991680470792e-05, + "loss": 0.2738, + "step": 7697 + }, + { + "epoch": 0.44, + "grad_norm": 0.39467269562385565, + "learning_rate": 1.2334182213333912e-05, + "loss": 0.3395, + "step": 7698 + }, + { + "epoch": 0.44, + "grad_norm": 0.5731151803650315, + "learning_rate": 1.2332372665364406e-05, + "loss": 0.3394, + "step": 7699 + }, + { + "epoch": 0.44, + "grad_norm": 0.4065624798269097, + "learning_rate": 1.2330563036624934e-05, + "loss": 0.3214, + "step": 7700 + }, + { + "epoch": 0.44, + "grad_norm": 0.36053450120448083, + "learning_rate": 1.2328753327178164e-05, + "loss": 0.2515, + "step": 7701 + }, + { + "epoch": 0.44, + "grad_norm": 0.5689075010121288, + "learning_rate": 1.2326943537086766e-05, + "loss": 0.3741, + "step": 7702 + }, + { + "epoch": 0.44, + "grad_norm": 0.2783654959597047, + "learning_rate": 1.2325133666413414e-05, + "loss": 0.2591, + "step": 7703 + }, + { + "epoch": 0.44, + "grad_norm": 1.3908455924860992, + "learning_rate": 1.2323323715220783e-05, + "loss": 0.2327, + "step": 7704 + }, + { + "epoch": 0.44, + "grad_norm": 0.47826736293204375, + "learning_rate": 1.2321513683571553e-05, + "loss": 0.3268, + "step": 7705 + }, + { + "epoch": 0.44, + "grad_norm": 0.4121896236281999, + "learning_rate": 1.2319703571528403e-05, + "loss": 0.3567, + "step": 7706 + }, + { + "epoch": 0.44, + "grad_norm": 0.35279691238740696, + "learning_rate": 1.231789337915402e-05, + "loss": 0.2763, + "step": 7707 + }, + { + "epoch": 0.44, + "grad_norm": 0.37557769110490014, + "learning_rate": 1.2316083106511085e-05, + "loss": 0.2277, + "step": 7708 + }, + { + "epoch": 0.44, + "grad_norm": 0.3360999778532407, + "learning_rate": 1.2314272753662295e-05, + "loss": 0.2438, + "step": 7709 + }, + { + "epoch": 0.44, + "grad_norm": 0.46270153126000013, + "learning_rate": 1.2312462320670335e-05, + "loss": 0.2576, + "step": 7710 + }, + { + "epoch": 0.44, + "grad_norm": 0.3572732447396155, + "learning_rate": 1.2310651807597909e-05, + "loss": 0.233, + "step": 7711 + }, + { + "epoch": 0.44, + "grad_norm": 0.4912302945710772, + "learning_rate": 1.2308841214507708e-05, + "loss": 0.3823, + "step": 7712 + }, + { + "epoch": 0.44, + "grad_norm": 0.5918930747562702, + "learning_rate": 1.2307030541462435e-05, + "loss": 0.4092, + "step": 7713 + }, + { + "epoch": 0.44, + "grad_norm": 0.27316585495061246, + "learning_rate": 1.2305219788524794e-05, + "loss": 0.224, + "step": 7714 + }, + { + "epoch": 0.44, + "grad_norm": 0.2925029836392248, + "learning_rate": 1.230340895575749e-05, + "loss": 0.248, + "step": 7715 + }, + { + "epoch": 0.44, + "grad_norm": 0.5948366275202696, + "learning_rate": 1.2301598043223233e-05, + "loss": 0.2956, + "step": 7716 + }, + { + "epoch": 0.44, + "grad_norm": 0.5585403674560185, + "learning_rate": 1.2299787050984736e-05, + "loss": 0.2098, + "step": 7717 + }, + { + "epoch": 0.44, + "grad_norm": 0.357036950905096, + "learning_rate": 1.2297975979104711e-05, + "loss": 0.3173, + "step": 7718 + }, + { + "epoch": 0.44, + "grad_norm": 0.3489053320187153, + "learning_rate": 1.2296164827645875e-05, + "loss": 0.3219, + "step": 7719 + }, + { + "epoch": 0.44, + "grad_norm": 0.26655390566547604, + "learning_rate": 1.2294353596670954e-05, + "loss": 0.1146, + "step": 7720 + }, + { + "epoch": 0.44, + "grad_norm": 0.22260566173399035, + "learning_rate": 1.2292542286242663e-05, + "loss": 0.1856, + "step": 7721 + }, + { + "epoch": 0.44, + "grad_norm": 0.5210406943968808, + "learning_rate": 1.2290730896423733e-05, + "loss": 0.3649, + "step": 7722 + }, + { + "epoch": 0.44, + "grad_norm": 0.30909289929601924, + "learning_rate": 1.2288919427276889e-05, + "loss": 0.2107, + "step": 7723 + }, + { + "epoch": 0.44, + "grad_norm": 0.3394742788735173, + "learning_rate": 1.2287107878864868e-05, + "loss": 0.3283, + "step": 7724 + }, + { + "epoch": 0.44, + "grad_norm": 0.6441934367210518, + "learning_rate": 1.2285296251250396e-05, + "loss": 0.4857, + "step": 7725 + }, + { + "epoch": 0.44, + "grad_norm": 0.33547352374751016, + "learning_rate": 1.2283484544496214e-05, + "loss": 0.3212, + "step": 7726 + }, + { + "epoch": 0.44, + "grad_norm": 0.22535303095029596, + "learning_rate": 1.2281672758665057e-05, + "loss": 0.1664, + "step": 7727 + }, + { + "epoch": 0.44, + "grad_norm": 1.1198897473384355, + "learning_rate": 1.2279860893819677e-05, + "loss": 0.5423, + "step": 7728 + }, + { + "epoch": 0.44, + "grad_norm": 0.5315956158941079, + "learning_rate": 1.2278048950022807e-05, + "loss": 0.4195, + "step": 7729 + }, + { + "epoch": 0.44, + "grad_norm": 0.3316998991025966, + "learning_rate": 1.2276236927337201e-05, + "loss": 0.2396, + "step": 7730 + }, + { + "epoch": 0.44, + "grad_norm": 0.3949812567558907, + "learning_rate": 1.2274424825825608e-05, + "loss": 0.3378, + "step": 7731 + }, + { + "epoch": 0.44, + "grad_norm": 0.5902242463857145, + "learning_rate": 1.2272612645550783e-05, + "loss": 0.4106, + "step": 7732 + }, + { + "epoch": 0.44, + "grad_norm": 0.27268811226890377, + "learning_rate": 1.2270800386575479e-05, + "loss": 0.182, + "step": 7733 + }, + { + "epoch": 0.44, + "grad_norm": 0.30230494310639255, + "learning_rate": 1.2268988048962454e-05, + "loss": 0.2506, + "step": 7734 + }, + { + "epoch": 0.44, + "grad_norm": 0.7007259407472975, + "learning_rate": 1.2267175632774472e-05, + "loss": 0.4328, + "step": 7735 + }, + { + "epoch": 0.44, + "grad_norm": 0.3078098222416652, + "learning_rate": 1.2265363138074294e-05, + "loss": 0.2572, + "step": 7736 + }, + { + "epoch": 0.44, + "grad_norm": 0.7233622730142071, + "learning_rate": 1.226355056492469e-05, + "loss": 0.5583, + "step": 7737 + }, + { + "epoch": 0.44, + "grad_norm": 0.3706652593809387, + "learning_rate": 1.2261737913388424e-05, + "loss": 0.3142, + "step": 7738 + }, + { + "epoch": 0.44, + "grad_norm": 0.3325676696521458, + "learning_rate": 1.2259925183528275e-05, + "loss": 0.2802, + "step": 7739 + }, + { + "epoch": 0.44, + "grad_norm": 0.285868913478341, + "learning_rate": 1.225811237540701e-05, + "loss": 0.1341, + "step": 7740 + }, + { + "epoch": 0.44, + "grad_norm": 0.6615062653955808, + "learning_rate": 1.2256299489087416e-05, + "loss": 0.3796, + "step": 7741 + }, + { + "epoch": 0.44, + "grad_norm": 0.29847265124901856, + "learning_rate": 1.2254486524632263e-05, + "loss": 0.2744, + "step": 7742 + }, + { + "epoch": 0.44, + "grad_norm": 0.39855067333182725, + "learning_rate": 1.225267348210434e-05, + "loss": 0.2633, + "step": 7743 + }, + { + "epoch": 0.44, + "grad_norm": 0.6883013273514436, + "learning_rate": 1.225086036156643e-05, + "loss": 0.475, + "step": 7744 + }, + { + "epoch": 0.44, + "grad_norm": 0.3010695239911448, + "learning_rate": 1.2249047163081325e-05, + "loss": 0.2446, + "step": 7745 + }, + { + "epoch": 0.45, + "grad_norm": 0.265481799290773, + "learning_rate": 1.2247233886711811e-05, + "loss": 0.1961, + "step": 7746 + }, + { + "epoch": 0.45, + "grad_norm": 0.6405863195516796, + "learning_rate": 1.2245420532520687e-05, + "loss": 0.3769, + "step": 7747 + }, + { + "epoch": 0.45, + "grad_norm": 0.3782124832205112, + "learning_rate": 1.2243607100570743e-05, + "loss": 0.3033, + "step": 7748 + }, + { + "epoch": 0.45, + "grad_norm": 0.8096337104484459, + "learning_rate": 1.2241793590924785e-05, + "loss": 0.4681, + "step": 7749 + }, + { + "epoch": 0.45, + "grad_norm": 0.2962372025335575, + "learning_rate": 1.2239980003645606e-05, + "loss": 0.2728, + "step": 7750 + }, + { + "epoch": 0.45, + "grad_norm": 0.4048492625045128, + "learning_rate": 1.2238166338796021e-05, + "loss": 0.3013, + "step": 7751 + }, + { + "epoch": 0.45, + "grad_norm": 0.307624574114885, + "learning_rate": 1.2236352596438832e-05, + "loss": 0.1997, + "step": 7752 + }, + { + "epoch": 0.45, + "grad_norm": 1.1069574333300105, + "learning_rate": 1.2234538776636846e-05, + "loss": 0.3514, + "step": 7753 + }, + { + "epoch": 0.45, + "grad_norm": 0.31546488061455996, + "learning_rate": 1.223272487945288e-05, + "loss": 0.2837, + "step": 7754 + }, + { + "epoch": 0.45, + "grad_norm": 0.4835867800375805, + "learning_rate": 1.2230910904949747e-05, + "loss": 0.3979, + "step": 7755 + }, + { + "epoch": 0.45, + "grad_norm": 0.959949705436744, + "learning_rate": 1.2229096853190265e-05, + "loss": 0.3485, + "step": 7756 + }, + { + "epoch": 0.45, + "grad_norm": 0.3530160623767107, + "learning_rate": 1.2227282724237254e-05, + "loss": 0.2498, + "step": 7757 + }, + { + "epoch": 0.45, + "grad_norm": 0.5273841705816225, + "learning_rate": 1.2225468518153543e-05, + "loss": 0.2861, + "step": 7758 + }, + { + "epoch": 0.45, + "grad_norm": 0.9535026076828771, + "learning_rate": 1.2223654235001948e-05, + "loss": 0.2452, + "step": 7759 + }, + { + "epoch": 0.45, + "grad_norm": 0.36487906220355076, + "learning_rate": 1.2221839874845303e-05, + "loss": 0.2854, + "step": 7760 + }, + { + "epoch": 0.45, + "grad_norm": 1.1723647175488505, + "learning_rate": 1.2220025437746437e-05, + "loss": 0.8119, + "step": 7761 + }, + { + "epoch": 0.45, + "grad_norm": 0.395196663415754, + "learning_rate": 1.221821092376819e-05, + "loss": 0.2566, + "step": 7762 + }, + { + "epoch": 0.45, + "grad_norm": 0.43346883361783417, + "learning_rate": 1.2216396332973391e-05, + "loss": 0.3076, + "step": 7763 + }, + { + "epoch": 0.45, + "grad_norm": 0.8082014848351015, + "learning_rate": 1.2214581665424883e-05, + "loss": 0.442, + "step": 7764 + }, + { + "epoch": 0.45, + "grad_norm": 0.2674446403474515, + "learning_rate": 1.2212766921185506e-05, + "loss": 0.2374, + "step": 7765 + }, + { + "epoch": 0.45, + "grad_norm": 0.28619470020545024, + "learning_rate": 1.2210952100318107e-05, + "loss": 0.1977, + "step": 7766 + }, + { + "epoch": 0.45, + "grad_norm": 0.5161481861718609, + "learning_rate": 1.220913720288553e-05, + "loss": 0.4137, + "step": 7767 + }, + { + "epoch": 0.45, + "grad_norm": 1.4577512973298372, + "learning_rate": 1.2207322228950628e-05, + "loss": 0.7202, + "step": 7768 + }, + { + "epoch": 0.45, + "grad_norm": 0.3080743765473305, + "learning_rate": 1.2205507178576252e-05, + "loss": 0.1984, + "step": 7769 + }, + { + "epoch": 0.45, + "grad_norm": 0.439692529348634, + "learning_rate": 1.220369205182526e-05, + "loss": 0.3539, + "step": 7770 + }, + { + "epoch": 0.45, + "grad_norm": 0.2810154705395384, + "learning_rate": 1.2201876848760505e-05, + "loss": 0.2121, + "step": 7771 + }, + { + "epoch": 0.45, + "grad_norm": 0.34147306795497445, + "learning_rate": 1.2200061569444848e-05, + "loss": 0.2114, + "step": 7772 + }, + { + "epoch": 0.45, + "grad_norm": 1.1078075037797748, + "learning_rate": 1.2198246213941156e-05, + "loss": 0.6738, + "step": 7773 + }, + { + "epoch": 0.45, + "grad_norm": 0.43075098792032634, + "learning_rate": 1.2196430782312292e-05, + "loss": 0.3043, + "step": 7774 + }, + { + "epoch": 0.45, + "grad_norm": 0.3794600600488902, + "learning_rate": 1.2194615274621122e-05, + "loss": 0.2685, + "step": 7775 + }, + { + "epoch": 0.45, + "grad_norm": 0.5827109851181835, + "learning_rate": 1.2192799690930521e-05, + "loss": 0.3391, + "step": 7776 + }, + { + "epoch": 0.45, + "grad_norm": 0.35902548591666494, + "learning_rate": 1.2190984031303361e-05, + "loss": 0.2472, + "step": 7777 + }, + { + "epoch": 0.45, + "grad_norm": 0.3640745890902366, + "learning_rate": 1.2189168295802519e-05, + "loss": 0.2622, + "step": 7778 + }, + { + "epoch": 0.45, + "grad_norm": 0.34088465488055364, + "learning_rate": 1.218735248449087e-05, + "loss": 0.1616, + "step": 7779 + }, + { + "epoch": 0.45, + "grad_norm": 0.7991062158846808, + "learning_rate": 1.21855365974313e-05, + "loss": 0.4441, + "step": 7780 + }, + { + "epoch": 0.45, + "grad_norm": 0.3664535149552145, + "learning_rate": 1.2183720634686693e-05, + "loss": 0.2753, + "step": 7781 + }, + { + "epoch": 0.45, + "grad_norm": 0.37060343525461587, + "learning_rate": 1.2181904596319933e-05, + "loss": 0.2746, + "step": 7782 + }, + { + "epoch": 0.45, + "grad_norm": 0.26177238030517913, + "learning_rate": 1.2180088482393911e-05, + "loss": 0.1789, + "step": 7783 + }, + { + "epoch": 0.45, + "grad_norm": 0.3335241264892765, + "learning_rate": 1.2178272292971519e-05, + "loss": 0.2824, + "step": 7784 + }, + { + "epoch": 0.45, + "grad_norm": 0.7427978808671635, + "learning_rate": 1.217645602811565e-05, + "loss": 0.484, + "step": 7785 + }, + { + "epoch": 0.45, + "grad_norm": 0.4443305120052916, + "learning_rate": 1.2174639687889202e-05, + "loss": 0.3144, + "step": 7786 + }, + { + "epoch": 0.45, + "grad_norm": 0.39557505313196073, + "learning_rate": 1.2172823272355077e-05, + "loss": 0.2631, + "step": 7787 + }, + { + "epoch": 0.45, + "grad_norm": 0.6536016348918546, + "learning_rate": 1.2171006781576173e-05, + "loss": 0.3575, + "step": 7788 + }, + { + "epoch": 0.45, + "grad_norm": 0.2895298768999007, + "learning_rate": 1.2169190215615401e-05, + "loss": 0.1717, + "step": 7789 + }, + { + "epoch": 0.45, + "grad_norm": 0.3057102143177872, + "learning_rate": 1.2167373574535661e-05, + "loss": 0.258, + "step": 7790 + }, + { + "epoch": 0.45, + "grad_norm": 0.5624184239293547, + "learning_rate": 1.2165556858399874e-05, + "loss": 0.443, + "step": 7791 + }, + { + "epoch": 0.45, + "grad_norm": 0.4411597730654119, + "learning_rate": 1.2163740067270941e-05, + "loss": 0.2398, + "step": 7792 + }, + { + "epoch": 0.45, + "grad_norm": 0.33775600014775914, + "learning_rate": 1.2161923201211783e-05, + "loss": 0.273, + "step": 7793 + }, + { + "epoch": 0.45, + "grad_norm": 0.354861755129926, + "learning_rate": 1.2160106260285316e-05, + "loss": 0.293, + "step": 7794 + }, + { + "epoch": 0.45, + "grad_norm": 0.7224381640648487, + "learning_rate": 1.2158289244554467e-05, + "loss": 0.3158, + "step": 7795 + }, + { + "epoch": 0.45, + "grad_norm": 0.34561253546241905, + "learning_rate": 1.2156472154082149e-05, + "loss": 0.255, + "step": 7796 + }, + { + "epoch": 0.45, + "grad_norm": 1.1561107168749716, + "learning_rate": 1.2154654988931296e-05, + "loss": 0.7711, + "step": 7797 + }, + { + "epoch": 0.45, + "grad_norm": 0.37902420717857044, + "learning_rate": 1.2152837749164834e-05, + "loss": 0.2881, + "step": 7798 + }, + { + "epoch": 0.45, + "grad_norm": 0.3027625498266799, + "learning_rate": 1.215102043484569e-05, + "loss": 0.224, + "step": 7799 + }, + { + "epoch": 0.45, + "grad_norm": 0.4722136725028722, + "learning_rate": 1.2149203046036803e-05, + "loss": 0.2767, + "step": 7800 + }, + { + "epoch": 0.45, + "grad_norm": 0.47601793386394814, + "learning_rate": 1.2147385582801106e-05, + "loss": 0.3417, + "step": 7801 + }, + { + "epoch": 0.45, + "grad_norm": 0.30416584956647924, + "learning_rate": 1.214556804520154e-05, + "loss": 0.2099, + "step": 7802 + }, + { + "epoch": 0.45, + "grad_norm": 0.5322689707569284, + "learning_rate": 1.2143750433301043e-05, + "loss": 0.3731, + "step": 7803 + }, + { + "epoch": 0.45, + "grad_norm": 0.48119429572557804, + "learning_rate": 1.2141932747162564e-05, + "loss": 0.3049, + "step": 7804 + }, + { + "epoch": 0.45, + "grad_norm": 0.2579226364490064, + "learning_rate": 1.2140114986849043e-05, + "loss": 0.138, + "step": 7805 + }, + { + "epoch": 0.45, + "grad_norm": 0.32117319040447845, + "learning_rate": 1.2138297152423432e-05, + "loss": 0.2956, + "step": 7806 + }, + { + "epoch": 0.45, + "grad_norm": 0.9370941699181503, + "learning_rate": 1.2136479243948683e-05, + "loss": 0.6731, + "step": 7807 + }, + { + "epoch": 0.45, + "grad_norm": 0.3129022751134737, + "learning_rate": 1.2134661261487752e-05, + "loss": 0.1771, + "step": 7808 + }, + { + "epoch": 0.45, + "grad_norm": 0.3969024582867962, + "learning_rate": 1.2132843205103591e-05, + "loss": 0.322, + "step": 7809 + }, + { + "epoch": 0.45, + "grad_norm": 0.4938649637259429, + "learning_rate": 1.2131025074859164e-05, + "loss": 0.3403, + "step": 7810 + }, + { + "epoch": 0.45, + "grad_norm": 0.21944148398620011, + "learning_rate": 1.2129206870817428e-05, + "loss": 0.1346, + "step": 7811 + }, + { + "epoch": 0.45, + "grad_norm": 0.3246560530948743, + "learning_rate": 1.2127388593041348e-05, + "loss": 0.24, + "step": 7812 + }, + { + "epoch": 0.45, + "grad_norm": 0.48767280290232373, + "learning_rate": 1.2125570241593894e-05, + "loss": 0.3602, + "step": 7813 + }, + { + "epoch": 0.45, + "grad_norm": 0.33893538071778284, + "learning_rate": 1.2123751816538037e-05, + "loss": 0.2849, + "step": 7814 + }, + { + "epoch": 0.45, + "grad_norm": 0.49358569403480806, + "learning_rate": 1.2121933317936741e-05, + "loss": 0.3024, + "step": 7815 + }, + { + "epoch": 0.45, + "grad_norm": 0.6051859788689352, + "learning_rate": 1.2120114745852989e-05, + "loss": 0.4997, + "step": 7816 + }, + { + "epoch": 0.45, + "grad_norm": 0.2764055452780025, + "learning_rate": 1.2118296100349753e-05, + "loss": 0.2308, + "step": 7817 + }, + { + "epoch": 0.45, + "grad_norm": 0.25970750334520953, + "learning_rate": 1.2116477381490013e-05, + "loss": 0.1922, + "step": 7818 + }, + { + "epoch": 0.45, + "grad_norm": 0.8639977301196575, + "learning_rate": 1.2114658589336754e-05, + "loss": 0.5905, + "step": 7819 + }, + { + "epoch": 0.45, + "grad_norm": 0.4017731721362114, + "learning_rate": 1.2112839723952958e-05, + "loss": 0.2922, + "step": 7820 + }, + { + "epoch": 0.45, + "grad_norm": 0.3604215580984441, + "learning_rate": 1.2111020785401615e-05, + "loss": 0.2731, + "step": 7821 + }, + { + "epoch": 0.45, + "grad_norm": 0.6402061764469947, + "learning_rate": 1.210920177374571e-05, + "loss": 0.3718, + "step": 7822 + }, + { + "epoch": 0.45, + "grad_norm": 0.6287622775741888, + "learning_rate": 1.210738268904824e-05, + "loss": 0.3071, + "step": 7823 + }, + { + "epoch": 0.45, + "grad_norm": 0.23030235373629632, + "learning_rate": 1.21055635313722e-05, + "loss": 0.1565, + "step": 7824 + }, + { + "epoch": 0.45, + "grad_norm": 0.36849606576333455, + "learning_rate": 1.2103744300780586e-05, + "loss": 0.3311, + "step": 7825 + }, + { + "epoch": 0.45, + "grad_norm": 0.5539057972494684, + "learning_rate": 1.2101924997336393e-05, + "loss": 0.3156, + "step": 7826 + }, + { + "epoch": 0.45, + "grad_norm": 0.39605459800732945, + "learning_rate": 1.2100105621102631e-05, + "loss": 0.3243, + "step": 7827 + }, + { + "epoch": 0.45, + "grad_norm": 0.8298682413471936, + "learning_rate": 1.20982861721423e-05, + "loss": 0.462, + "step": 7828 + }, + { + "epoch": 0.45, + "grad_norm": 0.38887716172679526, + "learning_rate": 1.209646665051841e-05, + "loss": 0.2785, + "step": 7829 + }, + { + "epoch": 0.45, + "grad_norm": 0.22922781355910388, + "learning_rate": 1.2094647056293969e-05, + "loss": 0.2256, + "step": 7830 + }, + { + "epoch": 0.45, + "grad_norm": 0.6853732992858654, + "learning_rate": 1.2092827389531992e-05, + "loss": 0.3065, + "step": 7831 + }, + { + "epoch": 0.45, + "grad_norm": 0.5288262475482187, + "learning_rate": 1.209100765029549e-05, + "loss": 0.3367, + "step": 7832 + }, + { + "epoch": 0.45, + "grad_norm": 0.3683993478374688, + "learning_rate": 1.2089187838647484e-05, + "loss": 0.2997, + "step": 7833 + }, + { + "epoch": 0.45, + "grad_norm": 0.46618559440736795, + "learning_rate": 1.2087367954650992e-05, + "loss": 0.3075, + "step": 7834 + }, + { + "epoch": 0.45, + "grad_norm": 0.3954069825574937, + "learning_rate": 1.208554799836904e-05, + "loss": 0.2891, + "step": 7835 + }, + { + "epoch": 0.45, + "grad_norm": 0.2476792511905212, + "learning_rate": 1.2083727969864652e-05, + "loss": 0.2011, + "step": 7836 + }, + { + "epoch": 0.45, + "grad_norm": 0.4296079969449888, + "learning_rate": 1.208190786920085e-05, + "loss": 0.2942, + "step": 7837 + }, + { + "epoch": 0.45, + "grad_norm": 0.6835585045500028, + "learning_rate": 1.208008769644067e-05, + "loss": 0.3775, + "step": 7838 + }, + { + "epoch": 0.45, + "grad_norm": 0.429431380987189, + "learning_rate": 1.2078267451647141e-05, + "loss": 0.319, + "step": 7839 + }, + { + "epoch": 0.45, + "grad_norm": 0.9934331425210245, + "learning_rate": 1.20764471348833e-05, + "loss": 0.6671, + "step": 7840 + }, + { + "epoch": 0.45, + "grad_norm": 0.35648383354079055, + "learning_rate": 1.2074626746212183e-05, + "loss": 0.2447, + "step": 7841 + }, + { + "epoch": 0.45, + "grad_norm": 0.2711537520251157, + "learning_rate": 1.2072806285696836e-05, + "loss": 0.2366, + "step": 7842 + }, + { + "epoch": 0.45, + "grad_norm": 0.5954785393208696, + "learning_rate": 1.2070985753400293e-05, + "loss": 0.3112, + "step": 7843 + }, + { + "epoch": 0.45, + "grad_norm": 0.506182932788288, + "learning_rate": 1.2069165149385603e-05, + "loss": 0.1937, + "step": 7844 + }, + { + "epoch": 0.45, + "grad_norm": 0.3082872471946073, + "learning_rate": 1.2067344473715813e-05, + "loss": 0.2891, + "step": 7845 + }, + { + "epoch": 0.45, + "grad_norm": 1.3423903304220317, + "learning_rate": 1.2065523726453971e-05, + "loss": 0.7992, + "step": 7846 + }, + { + "epoch": 0.45, + "grad_norm": 0.4917154874110692, + "learning_rate": 1.206370290766313e-05, + "loss": 0.1749, + "step": 7847 + }, + { + "epoch": 0.45, + "grad_norm": 0.241763552867306, + "learning_rate": 1.206188201740635e-05, + "loss": 0.2102, + "step": 7848 + }, + { + "epoch": 0.45, + "grad_norm": 0.3783400126203032, + "learning_rate": 1.206006105574668e-05, + "loss": 0.3344, + "step": 7849 + }, + { + "epoch": 0.45, + "grad_norm": 0.73194324677599, + "learning_rate": 1.205824002274719e-05, + "loss": 0.4001, + "step": 7850 + }, + { + "epoch": 0.45, + "grad_norm": 0.31024938969861154, + "learning_rate": 1.2056418918470931e-05, + "loss": 0.2304, + "step": 7851 + }, + { + "epoch": 0.45, + "grad_norm": 0.9587907228244877, + "learning_rate": 1.2054597742980975e-05, + "loss": 0.6535, + "step": 7852 + }, + { + "epoch": 0.45, + "grad_norm": 0.353162787883959, + "learning_rate": 1.2052776496340389e-05, + "loss": 0.2805, + "step": 7853 + }, + { + "epoch": 0.45, + "grad_norm": 0.32028660740744386, + "learning_rate": 1.205095517861224e-05, + "loss": 0.2061, + "step": 7854 + }, + { + "epoch": 0.45, + "grad_norm": 0.43666200782343506, + "learning_rate": 1.2049133789859602e-05, + "loss": 0.3048, + "step": 7855 + }, + { + "epoch": 0.45, + "grad_norm": 0.364837631487842, + "learning_rate": 1.2047312330145549e-05, + "loss": 0.2357, + "step": 7856 + }, + { + "epoch": 0.45, + "grad_norm": 0.2910287527496673, + "learning_rate": 1.204549079953316e-05, + "loss": 0.253, + "step": 7857 + }, + { + "epoch": 0.45, + "grad_norm": 0.8730968420317119, + "learning_rate": 1.2043669198085509e-05, + "loss": 0.6054, + "step": 7858 + }, + { + "epoch": 0.45, + "grad_norm": 1.4669922543739415, + "learning_rate": 1.2041847525865681e-05, + "loss": 0.7951, + "step": 7859 + }, + { + "epoch": 0.45, + "grad_norm": 0.3773913635551674, + "learning_rate": 1.2040025782936766e-05, + "loss": 0.2398, + "step": 7860 + }, + { + "epoch": 0.45, + "grad_norm": 0.2937030728988133, + "learning_rate": 1.2038203969361841e-05, + "loss": 0.2758, + "step": 7861 + }, + { + "epoch": 0.45, + "grad_norm": 0.39829030321756786, + "learning_rate": 1.2036382085204004e-05, + "loss": 0.2639, + "step": 7862 + }, + { + "epoch": 0.45, + "grad_norm": 0.35718100232276506, + "learning_rate": 1.2034560130526341e-05, + "loss": 0.2594, + "step": 7863 + }, + { + "epoch": 0.45, + "grad_norm": 0.8819076750989475, + "learning_rate": 1.2032738105391945e-05, + "loss": 0.4058, + "step": 7864 + }, + { + "epoch": 0.45, + "grad_norm": 0.37511149307403135, + "learning_rate": 1.2030916009863921e-05, + "loss": 0.3146, + "step": 7865 + }, + { + "epoch": 0.45, + "grad_norm": 0.3427182179012295, + "learning_rate": 1.2029093844005359e-05, + "loss": 0.2962, + "step": 7866 + }, + { + "epoch": 0.45, + "grad_norm": 0.6080154357699522, + "learning_rate": 1.2027271607879368e-05, + "loss": 0.362, + "step": 7867 + }, + { + "epoch": 0.45, + "grad_norm": 0.24903988506822783, + "learning_rate": 1.2025449301549046e-05, + "loss": 0.1698, + "step": 7868 + }, + { + "epoch": 0.45, + "grad_norm": 0.3080660901489215, + "learning_rate": 1.2023626925077503e-05, + "loss": 0.272, + "step": 7869 + }, + { + "epoch": 0.45, + "grad_norm": 1.1014977729702664, + "learning_rate": 1.2021804478527845e-05, + "loss": 0.401, + "step": 7870 + }, + { + "epoch": 0.45, + "grad_norm": 0.6438849569428409, + "learning_rate": 1.2019981961963185e-05, + "loss": 0.4141, + "step": 7871 + }, + { + "epoch": 0.45, + "grad_norm": 0.3717529729312076, + "learning_rate": 1.2018159375446636e-05, + "loss": 0.2579, + "step": 7872 + }, + { + "epoch": 0.45, + "grad_norm": 0.3212381811770286, + "learning_rate": 1.2016336719041318e-05, + "loss": 0.2701, + "step": 7873 + }, + { + "epoch": 0.45, + "grad_norm": 0.38203566172353803, + "learning_rate": 1.2014513992810344e-05, + "loss": 0.2637, + "step": 7874 + }, + { + "epoch": 0.45, + "grad_norm": 0.3642964971611716, + "learning_rate": 1.2012691196816836e-05, + "loss": 0.275, + "step": 7875 + }, + { + "epoch": 0.45, + "grad_norm": 0.37088618583675204, + "learning_rate": 1.2010868331123922e-05, + "loss": 0.2343, + "step": 7876 + }, + { + "epoch": 0.45, + "grad_norm": 0.40984124709798897, + "learning_rate": 1.2009045395794723e-05, + "loss": 0.283, + "step": 7877 + }, + { + "epoch": 0.45, + "grad_norm": 0.31850183923540143, + "learning_rate": 1.2007222390892365e-05, + "loss": 0.2847, + "step": 7878 + }, + { + "epoch": 0.45, + "grad_norm": 0.6904602282111612, + "learning_rate": 1.2005399316479984e-05, + "loss": 0.4745, + "step": 7879 + }, + { + "epoch": 0.45, + "grad_norm": 0.28231886869155365, + "learning_rate": 1.200357617262071e-05, + "loss": 0.1993, + "step": 7880 + }, + { + "epoch": 0.45, + "grad_norm": 0.2864075909628802, + "learning_rate": 1.2001752959377681e-05, + "loss": 0.2689, + "step": 7881 + }, + { + "epoch": 0.45, + "grad_norm": 0.5250714104982417, + "learning_rate": 1.1999929676814036e-05, + "loss": 0.2763, + "step": 7882 + }, + { + "epoch": 0.45, + "grad_norm": 0.6084203947302514, + "learning_rate": 1.1998106324992906e-05, + "loss": 0.3492, + "step": 7883 + }, + { + "epoch": 0.45, + "grad_norm": 0.3261697352200898, + "learning_rate": 1.1996282903977442e-05, + "loss": 0.3021, + "step": 7884 + }, + { + "epoch": 0.45, + "grad_norm": 0.3600944615199071, + "learning_rate": 1.1994459413830785e-05, + "loss": 0.3117, + "step": 7885 + }, + { + "epoch": 0.45, + "grad_norm": 0.43975626688178493, + "learning_rate": 1.1992635854616088e-05, + "loss": 0.2537, + "step": 7886 + }, + { + "epoch": 0.45, + "grad_norm": 0.3502728085639639, + "learning_rate": 1.1990812226396496e-05, + "loss": 0.3062, + "step": 7887 + }, + { + "epoch": 0.45, + "grad_norm": 0.43356883571119836, + "learning_rate": 1.1988988529235161e-05, + "loss": 0.305, + "step": 7888 + }, + { + "epoch": 0.45, + "grad_norm": 0.29485545554777076, + "learning_rate": 1.198716476319524e-05, + "loss": 0.2916, + "step": 7889 + }, + { + "epoch": 0.45, + "grad_norm": 0.31064543092327446, + "learning_rate": 1.198534092833989e-05, + "loss": 0.2028, + "step": 7890 + }, + { + "epoch": 0.45, + "grad_norm": 0.8897180222566816, + "learning_rate": 1.1983517024732266e-05, + "loss": 0.59, + "step": 7891 + }, + { + "epoch": 0.45, + "grad_norm": 0.3955907200246761, + "learning_rate": 1.1981693052435537e-05, + "loss": 0.3412, + "step": 7892 + }, + { + "epoch": 0.45, + "grad_norm": 0.29039852394862853, + "learning_rate": 1.1979869011512859e-05, + "loss": 0.2017, + "step": 7893 + }, + { + "epoch": 0.45, + "grad_norm": 0.39928201842865546, + "learning_rate": 1.1978044902027403e-05, + "loss": 0.3085, + "step": 7894 + }, + { + "epoch": 0.45, + "grad_norm": 0.4353633841389933, + "learning_rate": 1.1976220724042336e-05, + "loss": 0.2926, + "step": 7895 + }, + { + "epoch": 0.45, + "grad_norm": 0.32887704414837243, + "learning_rate": 1.1974396477620833e-05, + "loss": 0.1961, + "step": 7896 + }, + { + "epoch": 0.45, + "grad_norm": 0.34286785641908873, + "learning_rate": 1.1972572162826061e-05, + "loss": 0.314, + "step": 7897 + }, + { + "epoch": 0.45, + "grad_norm": 1.3682061627663276, + "learning_rate": 1.1970747779721203e-05, + "loss": 0.7447, + "step": 7898 + }, + { + "epoch": 0.45, + "grad_norm": 0.29070012715612625, + "learning_rate": 1.1968923328369433e-05, + "loss": 0.219, + "step": 7899 + }, + { + "epoch": 0.45, + "grad_norm": 0.4741431492632248, + "learning_rate": 1.1967098808833935e-05, + "loss": 0.3583, + "step": 7900 + }, + { + "epoch": 0.45, + "grad_norm": 0.298824920933953, + "learning_rate": 1.196527422117789e-05, + "loss": 0.2645, + "step": 7901 + }, + { + "epoch": 0.45, + "grad_norm": 0.24902107345337846, + "learning_rate": 1.1963449565464478e-05, + "loss": 0.2042, + "step": 7902 + }, + { + "epoch": 0.45, + "grad_norm": 1.0625977661145665, + "learning_rate": 1.1961624841756896e-05, + "loss": 0.3038, + "step": 7903 + }, + { + "epoch": 0.45, + "grad_norm": 0.42317283623319885, + "learning_rate": 1.1959800050118328e-05, + "loss": 0.3523, + "step": 7904 + }, + { + "epoch": 0.45, + "grad_norm": 0.3517066275803351, + "learning_rate": 1.195797519061197e-05, + "loss": 0.2577, + "step": 7905 + }, + { + "epoch": 0.45, + "grad_norm": 0.7735868151817158, + "learning_rate": 1.1956150263301014e-05, + "loss": 0.3493, + "step": 7906 + }, + { + "epoch": 0.45, + "grad_norm": 0.310163938160124, + "learning_rate": 1.1954325268248662e-05, + "loss": 0.1994, + "step": 7907 + }, + { + "epoch": 0.45, + "grad_norm": 0.397983349336781, + "learning_rate": 1.1952500205518107e-05, + "loss": 0.241, + "step": 7908 + }, + { + "epoch": 0.45, + "grad_norm": 0.3896181687123652, + "learning_rate": 1.1950675075172557e-05, + "loss": 0.2484, + "step": 7909 + }, + { + "epoch": 0.45, + "grad_norm": 0.9368903933280383, + "learning_rate": 1.1948849877275209e-05, + "loss": 0.5781, + "step": 7910 + }, + { + "epoch": 0.45, + "grad_norm": 0.39187076583274866, + "learning_rate": 1.1947024611889276e-05, + "loss": 0.2909, + "step": 7911 + }, + { + "epoch": 0.45, + "grad_norm": 0.3288277529256562, + "learning_rate": 1.1945199279077962e-05, + "loss": 0.2785, + "step": 7912 + }, + { + "epoch": 0.45, + "grad_norm": 0.5272823310712186, + "learning_rate": 1.1943373878904482e-05, + "loss": 0.2873, + "step": 7913 + }, + { + "epoch": 0.45, + "grad_norm": 0.29573508428328504, + "learning_rate": 1.194154841143205e-05, + "loss": 0.1921, + "step": 7914 + }, + { + "epoch": 0.45, + "grad_norm": 0.6374701531641034, + "learning_rate": 1.1939722876723878e-05, + "loss": 0.4224, + "step": 7915 + }, + { + "epoch": 0.45, + "grad_norm": 0.3905831304108459, + "learning_rate": 1.1937897274843184e-05, + "loss": 0.2766, + "step": 7916 + }, + { + "epoch": 0.45, + "grad_norm": 0.41233010441397044, + "learning_rate": 1.1936071605853195e-05, + "loss": 0.286, + "step": 7917 + }, + { + "epoch": 0.45, + "grad_norm": 1.0236772511840988, + "learning_rate": 1.1934245869817127e-05, + "loss": 0.4651, + "step": 7918 + }, + { + "epoch": 0.45, + "grad_norm": 0.27799086007368734, + "learning_rate": 1.193242006679821e-05, + "loss": 0.1277, + "step": 7919 + }, + { + "epoch": 0.46, + "grad_norm": 0.2699895568941243, + "learning_rate": 1.1930594196859667e-05, + "loss": 0.2208, + "step": 7920 + }, + { + "epoch": 0.46, + "grad_norm": 0.4589470961563092, + "learning_rate": 1.1928768260064732e-05, + "loss": 0.3091, + "step": 7921 + }, + { + "epoch": 0.46, + "grad_norm": 1.4691715710811628, + "learning_rate": 1.1926942256476632e-05, + "loss": 0.3769, + "step": 7922 + }, + { + "epoch": 0.46, + "grad_norm": 0.37149958982181935, + "learning_rate": 1.1925116186158605e-05, + "loss": 0.2761, + "step": 7923 + }, + { + "epoch": 0.46, + "grad_norm": 0.5020463855050367, + "learning_rate": 1.192329004917389e-05, + "loss": 0.3984, + "step": 7924 + }, + { + "epoch": 0.46, + "grad_norm": 0.31379760183411154, + "learning_rate": 1.1921463845585722e-05, + "loss": 0.2095, + "step": 7925 + }, + { + "epoch": 0.46, + "grad_norm": 0.304322085714914, + "learning_rate": 1.1919637575457343e-05, + "loss": 0.1815, + "step": 7926 + }, + { + "epoch": 0.46, + "grad_norm": 0.4280709827379945, + "learning_rate": 1.1917811238852e-05, + "loss": 0.3252, + "step": 7927 + }, + { + "epoch": 0.46, + "grad_norm": 0.5484844773978607, + "learning_rate": 1.1915984835832934e-05, + "loss": 0.3202, + "step": 7928 + }, + { + "epoch": 0.46, + "grad_norm": 0.49450017929156964, + "learning_rate": 1.1914158366463392e-05, + "loss": 0.2197, + "step": 7929 + }, + { + "epoch": 0.46, + "grad_norm": 0.6972758363104524, + "learning_rate": 1.1912331830806634e-05, + "loss": 0.4078, + "step": 7930 + }, + { + "epoch": 0.46, + "grad_norm": 1.163113774171106, + "learning_rate": 1.1910505228925903e-05, + "loss": 0.6485, + "step": 7931 + }, + { + "epoch": 0.46, + "grad_norm": 0.23145347746277942, + "learning_rate": 1.1908678560884462e-05, + "loss": 0.1715, + "step": 7932 + }, + { + "epoch": 0.46, + "grad_norm": 0.30815546200945876, + "learning_rate": 1.190685182674556e-05, + "loss": 0.2405, + "step": 7933 + }, + { + "epoch": 0.46, + "grad_norm": 0.6610420498319607, + "learning_rate": 1.1905025026572466e-05, + "loss": 0.4497, + "step": 7934 + }, + { + "epoch": 0.46, + "grad_norm": 0.3484729311489025, + "learning_rate": 1.1903198160428433e-05, + "loss": 0.2436, + "step": 7935 + }, + { + "epoch": 0.46, + "grad_norm": 0.37479821138076064, + "learning_rate": 1.190137122837673e-05, + "loss": 0.3104, + "step": 7936 + }, + { + "epoch": 0.46, + "grad_norm": 0.903404747222253, + "learning_rate": 1.1899544230480623e-05, + "loss": 0.5389, + "step": 7937 + }, + { + "epoch": 0.46, + "grad_norm": 0.3793020472891658, + "learning_rate": 1.1897717166803384e-05, + "loss": 0.2626, + "step": 7938 + }, + { + "epoch": 0.46, + "grad_norm": 0.21536537499616923, + "learning_rate": 1.189589003740828e-05, + "loss": 0.158, + "step": 7939 + }, + { + "epoch": 0.46, + "grad_norm": 0.37013074965928505, + "learning_rate": 1.1894062842358585e-05, + "loss": 0.3346, + "step": 7940 + }, + { + "epoch": 0.46, + "grad_norm": 0.4442486156485244, + "learning_rate": 1.1892235581717575e-05, + "loss": 0.2893, + "step": 7941 + }, + { + "epoch": 0.46, + "grad_norm": 0.5788706585727145, + "learning_rate": 1.1890408255548526e-05, + "loss": 0.2968, + "step": 7942 + }, + { + "epoch": 0.46, + "grad_norm": 1.205983665744992, + "learning_rate": 1.1888580863914724e-05, + "loss": 0.613, + "step": 7943 + }, + { + "epoch": 0.46, + "grad_norm": 0.3300548857885787, + "learning_rate": 1.188675340687945e-05, + "loss": 0.2639, + "step": 7944 + }, + { + "epoch": 0.46, + "grad_norm": 0.23304407517691592, + "learning_rate": 1.188492588450598e-05, + "loss": 0.1743, + "step": 7945 + }, + { + "epoch": 0.46, + "grad_norm": 0.6386065237872193, + "learning_rate": 1.1883098296857614e-05, + "loss": 0.444, + "step": 7946 + }, + { + "epoch": 0.46, + "grad_norm": 0.5258883988322739, + "learning_rate": 1.188127064399763e-05, + "loss": 0.2976, + "step": 7947 + }, + { + "epoch": 0.46, + "grad_norm": 0.30515301220401475, + "learning_rate": 1.1879442925989327e-05, + "loss": 0.2492, + "step": 7948 + }, + { + "epoch": 0.46, + "grad_norm": 0.9557336485183242, + "learning_rate": 1.1877615142895995e-05, + "loss": 0.5884, + "step": 7949 + }, + { + "epoch": 0.46, + "grad_norm": 0.58178474444118, + "learning_rate": 1.1875787294780932e-05, + "loss": 0.324, + "step": 7950 + }, + { + "epoch": 0.46, + "grad_norm": 0.4263233009317428, + "learning_rate": 1.1873959381707437e-05, + "loss": 0.323, + "step": 7951 + }, + { + "epoch": 0.46, + "grad_norm": 0.2837105656508334, + "learning_rate": 1.1872131403738807e-05, + "loss": 0.2144, + "step": 7952 + }, + { + "epoch": 0.46, + "grad_norm": 0.3091377428079506, + "learning_rate": 1.1870303360938349e-05, + "loss": 0.2136, + "step": 7953 + }, + { + "epoch": 0.46, + "grad_norm": 0.38310899231449197, + "learning_rate": 1.1868475253369362e-05, + "loss": 0.313, + "step": 7954 + }, + { + "epoch": 0.46, + "grad_norm": 0.9306600839745718, + "learning_rate": 1.1866647081095162e-05, + "loss": 0.2579, + "step": 7955 + }, + { + "epoch": 0.46, + "grad_norm": 0.3050132731937429, + "learning_rate": 1.186481884417905e-05, + "loss": 0.2768, + "step": 7956 + }, + { + "epoch": 0.46, + "grad_norm": 0.42166795211866515, + "learning_rate": 1.1862990542684345e-05, + "loss": 0.3322, + "step": 7957 + }, + { + "epoch": 0.46, + "grad_norm": 0.2862765123153871, + "learning_rate": 1.1861162176674354e-05, + "loss": 0.1944, + "step": 7958 + }, + { + "epoch": 0.46, + "grad_norm": 0.3983928765816261, + "learning_rate": 1.1859333746212403e-05, + "loss": 0.3066, + "step": 7959 + }, + { + "epoch": 0.46, + "grad_norm": 0.37464310126796313, + "learning_rate": 1.1857505251361801e-05, + "loss": 0.3424, + "step": 7960 + }, + { + "epoch": 0.46, + "grad_norm": 0.4032873786677103, + "learning_rate": 1.1855676692185872e-05, + "loss": 0.2819, + "step": 7961 + }, + { + "epoch": 0.46, + "grad_norm": 0.4679357898041959, + "learning_rate": 1.1853848068747938e-05, + "loss": 0.3131, + "step": 7962 + }, + { + "epoch": 0.46, + "grad_norm": 0.3412178021471056, + "learning_rate": 1.1852019381111326e-05, + "loss": 0.3276, + "step": 7963 + }, + { + "epoch": 0.46, + "grad_norm": 0.2875821893013411, + "learning_rate": 1.185019062933936e-05, + "loss": 0.2642, + "step": 7964 + }, + { + "epoch": 0.46, + "grad_norm": 0.4411680471167332, + "learning_rate": 1.1848361813495376e-05, + "loss": 0.2624, + "step": 7965 + }, + { + "epoch": 0.46, + "grad_norm": 0.2880036099235008, + "learning_rate": 1.18465329336427e-05, + "loss": 0.2542, + "step": 7966 + }, + { + "epoch": 0.46, + "grad_norm": 0.4021660522898735, + "learning_rate": 1.1844703989844666e-05, + "loss": 0.3471, + "step": 7967 + }, + { + "epoch": 0.46, + "grad_norm": 0.33634227392857685, + "learning_rate": 1.1842874982164616e-05, + "loss": 0.2271, + "step": 7968 + }, + { + "epoch": 0.46, + "grad_norm": 0.3303900188393209, + "learning_rate": 1.1841045910665881e-05, + "loss": 0.3177, + "step": 7969 + }, + { + "epoch": 0.46, + "grad_norm": 0.9671616145587876, + "learning_rate": 1.1839216775411808e-05, + "loss": 0.609, + "step": 7970 + }, + { + "epoch": 0.46, + "grad_norm": 0.35308979972001275, + "learning_rate": 1.1837387576465735e-05, + "loss": 0.2297, + "step": 7971 + }, + { + "epoch": 0.46, + "grad_norm": 0.24389530242129417, + "learning_rate": 1.183555831389101e-05, + "loss": 0.2464, + "step": 7972 + }, + { + "epoch": 0.46, + "grad_norm": 0.4290048907771581, + "learning_rate": 1.183372898775098e-05, + "loss": 0.2936, + "step": 7973 + }, + { + "epoch": 0.46, + "grad_norm": 0.4132601964727188, + "learning_rate": 1.1831899598108993e-05, + "loss": 0.2794, + "step": 7974 + }, + { + "epoch": 0.46, + "grad_norm": 0.4167507268336926, + "learning_rate": 1.1830070145028404e-05, + "loss": 0.3419, + "step": 7975 + }, + { + "epoch": 0.46, + "grad_norm": 0.4073307537683485, + "learning_rate": 1.1828240628572563e-05, + "loss": 0.3289, + "step": 7976 + }, + { + "epoch": 0.46, + "grad_norm": 0.7557884514763259, + "learning_rate": 1.182641104880483e-05, + "loss": 0.4952, + "step": 7977 + }, + { + "epoch": 0.46, + "grad_norm": 0.26140284114058343, + "learning_rate": 1.1824581405788558e-05, + "loss": 0.1731, + "step": 7978 + }, + { + "epoch": 0.46, + "grad_norm": 0.39551359172857126, + "learning_rate": 1.1822751699587109e-05, + "loss": 0.2648, + "step": 7979 + }, + { + "epoch": 0.46, + "grad_norm": 0.3736897265914453, + "learning_rate": 1.1820921930263851e-05, + "loss": 0.2989, + "step": 7980 + }, + { + "epoch": 0.46, + "grad_norm": 0.38382012833460544, + "learning_rate": 1.1819092097882141e-05, + "loss": 0.231, + "step": 7981 + }, + { + "epoch": 0.46, + "grad_norm": 1.296301622420184, + "learning_rate": 1.1817262202505353e-05, + "loss": 0.4792, + "step": 7982 + }, + { + "epoch": 0.46, + "grad_norm": 0.5060071372814816, + "learning_rate": 1.1815432244196849e-05, + "loss": 0.3495, + "step": 7983 + }, + { + "epoch": 0.46, + "grad_norm": 0.285491659038382, + "learning_rate": 1.181360222302001e-05, + "loss": 0.2228, + "step": 7984 + }, + { + "epoch": 0.46, + "grad_norm": 0.43293902333173934, + "learning_rate": 1.18117721390382e-05, + "loss": 0.2953, + "step": 7985 + }, + { + "epoch": 0.46, + "grad_norm": 0.42922801670662475, + "learning_rate": 1.1809941992314799e-05, + "loss": 0.2395, + "step": 7986 + }, + { + "epoch": 0.46, + "grad_norm": 0.33588685351110065, + "learning_rate": 1.1808111782913188e-05, + "loss": 0.2126, + "step": 7987 + }, + { + "epoch": 0.46, + "grad_norm": 0.40812800461735654, + "learning_rate": 1.180628151089674e-05, + "loss": 0.3165, + "step": 7988 + }, + { + "epoch": 0.46, + "grad_norm": 0.6953194615167572, + "learning_rate": 1.1804451176328844e-05, + "loss": 0.524, + "step": 7989 + }, + { + "epoch": 0.46, + "grad_norm": 0.38056672727143354, + "learning_rate": 1.1802620779272877e-05, + "loss": 0.3416, + "step": 7990 + }, + { + "epoch": 0.46, + "grad_norm": 0.4374708121864055, + "learning_rate": 1.1800790319792234e-05, + "loss": 0.3152, + "step": 7991 + }, + { + "epoch": 0.46, + "grad_norm": 0.24034260281344746, + "learning_rate": 1.1798959797950298e-05, + "loss": 0.1912, + "step": 7992 + }, + { + "epoch": 0.46, + "grad_norm": 0.3737299347380316, + "learning_rate": 1.1797129213810462e-05, + "loss": 0.3117, + "step": 7993 + }, + { + "epoch": 0.46, + "grad_norm": 0.8087123591015803, + "learning_rate": 1.179529856743612e-05, + "loss": 0.3435, + "step": 7994 + }, + { + "epoch": 0.46, + "grad_norm": 0.3312821317125005, + "learning_rate": 1.1793467858890666e-05, + "loss": 0.2908, + "step": 7995 + }, + { + "epoch": 0.46, + "grad_norm": 0.37260236653991463, + "learning_rate": 1.1791637088237493e-05, + "loss": 0.2907, + "step": 7996 + }, + { + "epoch": 0.46, + "grad_norm": 0.5993123611192235, + "learning_rate": 1.1789806255540008e-05, + "loss": 0.3494, + "step": 7997 + }, + { + "epoch": 0.46, + "grad_norm": 0.23486965767951468, + "learning_rate": 1.1787975360861607e-05, + "loss": 0.1612, + "step": 7998 + }, + { + "epoch": 0.46, + "grad_norm": 0.36941228758433226, + "learning_rate": 1.1786144404265701e-05, + "loss": 0.3084, + "step": 7999 + }, + { + "epoch": 0.46, + "grad_norm": 0.38811622550655456, + "learning_rate": 1.1784313385815685e-05, + "loss": 0.2744, + "step": 8000 + }, + { + "epoch": 0.46, + "grad_norm": 0.8777252991056873, + "learning_rate": 1.1782482305574976e-05, + "loss": 0.5783, + "step": 8001 + }, + { + "epoch": 0.46, + "grad_norm": 0.33300708206449414, + "learning_rate": 1.178065116360698e-05, + "loss": 0.2683, + "step": 8002 + }, + { + "epoch": 0.46, + "grad_norm": 0.3506847663910355, + "learning_rate": 1.1778819959975114e-05, + "loss": 0.3212, + "step": 8003 + }, + { + "epoch": 0.46, + "grad_norm": 0.2039203231034237, + "learning_rate": 1.1776988694742786e-05, + "loss": 0.0902, + "step": 8004 + }, + { + "epoch": 0.46, + "grad_norm": 0.32754667156190614, + "learning_rate": 1.1775157367973417e-05, + "loss": 0.2659, + "step": 8005 + }, + { + "epoch": 0.46, + "grad_norm": 0.9840863478213498, + "learning_rate": 1.1773325979730428e-05, + "loss": 0.6268, + "step": 8006 + }, + { + "epoch": 0.46, + "grad_norm": 0.4432248748342312, + "learning_rate": 1.1771494530077233e-05, + "loss": 0.3049, + "step": 8007 + }, + { + "epoch": 0.46, + "grad_norm": 0.3207501681884594, + "learning_rate": 1.1769663019077258e-05, + "loss": 0.2882, + "step": 8008 + }, + { + "epoch": 0.46, + "grad_norm": 0.8120058617367432, + "learning_rate": 1.176783144679393e-05, + "loss": 0.5619, + "step": 8009 + }, + { + "epoch": 0.46, + "grad_norm": 0.20825762019102417, + "learning_rate": 1.1765999813290674e-05, + "loss": 0.0947, + "step": 8010 + }, + { + "epoch": 0.46, + "grad_norm": 0.4053812780070775, + "learning_rate": 1.1764168118630922e-05, + "loss": 0.277, + "step": 8011 + }, + { + "epoch": 0.46, + "grad_norm": 0.3988311894219567, + "learning_rate": 1.1762336362878104e-05, + "loss": 0.3246, + "step": 8012 + }, + { + "epoch": 0.46, + "grad_norm": 0.6500249623410269, + "learning_rate": 1.1760504546095653e-05, + "loss": 0.4227, + "step": 8013 + }, + { + "epoch": 0.46, + "grad_norm": 0.29550813912741236, + "learning_rate": 1.1758672668347005e-05, + "loss": 0.19, + "step": 8014 + }, + { + "epoch": 0.46, + "grad_norm": 0.34503398766813, + "learning_rate": 1.1756840729695598e-05, + "loss": 0.342, + "step": 8015 + }, + { + "epoch": 0.46, + "grad_norm": 0.28919837746058225, + "learning_rate": 1.1755008730204873e-05, + "loss": 0.2108, + "step": 8016 + }, + { + "epoch": 0.46, + "grad_norm": 0.31556082209556957, + "learning_rate": 1.1753176669938269e-05, + "loss": 0.1865, + "step": 8017 + }, + { + "epoch": 0.46, + "grad_norm": 0.8571833591537861, + "learning_rate": 1.1751344548959233e-05, + "loss": 0.5349, + "step": 8018 + }, + { + "epoch": 0.46, + "grad_norm": 0.3899301745997571, + "learning_rate": 1.174951236733121e-05, + "loss": 0.3314, + "step": 8019 + }, + { + "epoch": 0.46, + "grad_norm": 0.2910575101476483, + "learning_rate": 1.1747680125117654e-05, + "loss": 0.2382, + "step": 8020 + }, + { + "epoch": 0.46, + "grad_norm": 0.9408216363736895, + "learning_rate": 1.1745847822382004e-05, + "loss": 0.6722, + "step": 8021 + }, + { + "epoch": 0.46, + "grad_norm": 0.3182946462400599, + "learning_rate": 1.174401545918772e-05, + "loss": 0.2214, + "step": 8022 + }, + { + "epoch": 0.46, + "grad_norm": 0.2663466323044907, + "learning_rate": 1.1742183035598258e-05, + "loss": 0.2274, + "step": 8023 + }, + { + "epoch": 0.46, + "grad_norm": 0.49731607824725593, + "learning_rate": 1.1740350551677073e-05, + "loss": 0.3698, + "step": 8024 + }, + { + "epoch": 0.46, + "grad_norm": 0.9450188765195661, + "learning_rate": 1.1738518007487621e-05, + "loss": 0.4426, + "step": 8025 + }, + { + "epoch": 0.46, + "grad_norm": 0.31989191955177276, + "learning_rate": 1.1736685403093367e-05, + "loss": 0.2602, + "step": 8026 + }, + { + "epoch": 0.46, + "grad_norm": 0.4274492080687954, + "learning_rate": 1.1734852738557772e-05, + "loss": 0.2698, + "step": 8027 + }, + { + "epoch": 0.46, + "grad_norm": 0.32934834987845174, + "learning_rate": 1.1733020013944301e-05, + "loss": 0.2708, + "step": 8028 + }, + { + "epoch": 0.46, + "grad_norm": 0.2622235332469892, + "learning_rate": 1.1731187229316418e-05, + "loss": 0.2088, + "step": 8029 + }, + { + "epoch": 0.46, + "grad_norm": 1.6319701577728358, + "learning_rate": 1.1729354384737602e-05, + "loss": 0.3641, + "step": 8030 + }, + { + "epoch": 0.46, + "grad_norm": 0.39856243611000813, + "learning_rate": 1.1727521480271315e-05, + "loss": 0.3438, + "step": 8031 + }, + { + "epoch": 0.46, + "grad_norm": 0.3740268094501319, + "learning_rate": 1.172568851598103e-05, + "loss": 0.2601, + "step": 8032 + }, + { + "epoch": 0.46, + "grad_norm": 0.49836553108072196, + "learning_rate": 1.1723855491930232e-05, + "loss": 0.3013, + "step": 8033 + }, + { + "epoch": 0.46, + "grad_norm": 0.3562893968298132, + "learning_rate": 1.1722022408182388e-05, + "loss": 0.288, + "step": 8034 + }, + { + "epoch": 0.46, + "grad_norm": 0.3640195155365308, + "learning_rate": 1.1720189264800983e-05, + "loss": 0.2428, + "step": 8035 + }, + { + "epoch": 0.46, + "grad_norm": 0.706087521346157, + "learning_rate": 1.1718356061849496e-05, + "loss": 0.2559, + "step": 8036 + }, + { + "epoch": 0.46, + "grad_norm": 1.2503475403720288, + "learning_rate": 1.1716522799391417e-05, + "loss": 0.5743, + "step": 8037 + }, + { + "epoch": 0.46, + "grad_norm": 0.4138155307392456, + "learning_rate": 1.1714689477490224e-05, + "loss": 0.3201, + "step": 8038 + }, + { + "epoch": 0.46, + "grad_norm": 0.3644446915489398, + "learning_rate": 1.1712856096209411e-05, + "loss": 0.292, + "step": 8039 + }, + { + "epoch": 0.46, + "grad_norm": 0.7135645309500714, + "learning_rate": 1.1711022655612461e-05, + "loss": 0.2894, + "step": 8040 + }, + { + "epoch": 0.46, + "grad_norm": 0.28773646997277114, + "learning_rate": 1.1709189155762872e-05, + "loss": 0.2276, + "step": 8041 + }, + { + "epoch": 0.46, + "grad_norm": 0.8314554766196716, + "learning_rate": 1.1707355596724135e-05, + "loss": 0.4062, + "step": 8042 + }, + { + "epoch": 0.46, + "grad_norm": 0.34693532859102244, + "learning_rate": 1.1705521978559748e-05, + "loss": 0.215, + "step": 8043 + }, + { + "epoch": 0.46, + "grad_norm": 0.3694538811668248, + "learning_rate": 1.1703688301333211e-05, + "loss": 0.2752, + "step": 8044 + }, + { + "epoch": 0.46, + "grad_norm": 1.349991276656821, + "learning_rate": 1.1701854565108019e-05, + "loss": 0.7906, + "step": 8045 + }, + { + "epoch": 0.46, + "grad_norm": 0.4942671800850725, + "learning_rate": 1.1700020769947675e-05, + "loss": 0.3041, + "step": 8046 + }, + { + "epoch": 0.46, + "grad_norm": 0.28376124490302723, + "learning_rate": 1.1698186915915689e-05, + "loss": 0.258, + "step": 8047 + }, + { + "epoch": 0.46, + "grad_norm": 0.5974320339927423, + "learning_rate": 1.1696353003075558e-05, + "loss": 0.4175, + "step": 8048 + }, + { + "epoch": 0.46, + "grad_norm": 0.25196002764368464, + "learning_rate": 1.16945190314908e-05, + "loss": 0.1402, + "step": 8049 + }, + { + "epoch": 0.46, + "grad_norm": 0.5777653906969942, + "learning_rate": 1.1692685001224918e-05, + "loss": 0.335, + "step": 8050 + }, + { + "epoch": 0.46, + "grad_norm": 0.45541649563077186, + "learning_rate": 1.1690850912341427e-05, + "loss": 0.2842, + "step": 8051 + }, + { + "epoch": 0.46, + "grad_norm": 0.9243817653395691, + "learning_rate": 1.1689016764903841e-05, + "loss": 0.4757, + "step": 8052 + }, + { + "epoch": 0.46, + "grad_norm": 0.4541786271595083, + "learning_rate": 1.168718255897568e-05, + "loss": 0.2181, + "step": 8053 + }, + { + "epoch": 0.46, + "grad_norm": 0.3405747715638682, + "learning_rate": 1.1685348294620457e-05, + "loss": 0.2851, + "step": 8054 + }, + { + "epoch": 0.46, + "grad_norm": 0.31051270814141485, + "learning_rate": 1.1683513971901697e-05, + "loss": 0.2689, + "step": 8055 + }, + { + "epoch": 0.46, + "grad_norm": 0.3512555459275891, + "learning_rate": 1.168167959088292e-05, + "loss": 0.2056, + "step": 8056 + }, + { + "epoch": 0.46, + "grad_norm": 0.5055296842698704, + "learning_rate": 1.1679845151627648e-05, + "loss": 0.3748, + "step": 8057 + }, + { + "epoch": 0.46, + "grad_norm": 0.7602648728429585, + "learning_rate": 1.1678010654199417e-05, + "loss": 0.4803, + "step": 8058 + }, + { + "epoch": 0.46, + "grad_norm": 0.41065482545930426, + "learning_rate": 1.1676176098661742e-05, + "loss": 0.2424, + "step": 8059 + }, + { + "epoch": 0.46, + "grad_norm": 0.47665228757374284, + "learning_rate": 1.1674341485078167e-05, + "loss": 0.3772, + "step": 8060 + }, + { + "epoch": 0.46, + "grad_norm": 0.4010258063879982, + "learning_rate": 1.1672506813512217e-05, + "loss": 0.3169, + "step": 8061 + }, + { + "epoch": 0.46, + "grad_norm": 0.3090499814345548, + "learning_rate": 1.1670672084027425e-05, + "loss": 0.2161, + "step": 8062 + }, + { + "epoch": 0.46, + "grad_norm": 0.32229611953247095, + "learning_rate": 1.1668837296687332e-05, + "loss": 0.2474, + "step": 8063 + }, + { + "epoch": 0.46, + "grad_norm": 0.723866973557168, + "learning_rate": 1.1667002451555476e-05, + "loss": 0.4764, + "step": 8064 + }, + { + "epoch": 0.46, + "grad_norm": 0.38413806263337497, + "learning_rate": 1.1665167548695395e-05, + "loss": 0.2978, + "step": 8065 + }, + { + "epoch": 0.46, + "grad_norm": 0.443860016995288, + "learning_rate": 1.1663332588170637e-05, + "loss": 0.2366, + "step": 8066 + }, + { + "epoch": 0.46, + "grad_norm": 0.34763785680591497, + "learning_rate": 1.1661497570044737e-05, + "loss": 0.3424, + "step": 8067 + }, + { + "epoch": 0.46, + "grad_norm": 0.8125107733973791, + "learning_rate": 1.1659662494381255e-05, + "loss": 0.4655, + "step": 8068 + }, + { + "epoch": 0.46, + "grad_norm": 0.23319806165726573, + "learning_rate": 1.1657827361243725e-05, + "loss": 0.1526, + "step": 8069 + }, + { + "epoch": 0.46, + "grad_norm": 0.38548295708245645, + "learning_rate": 1.1655992170695709e-05, + "loss": 0.3449, + "step": 8070 + }, + { + "epoch": 0.46, + "grad_norm": 0.6652817103965797, + "learning_rate": 1.1654156922800757e-05, + "loss": 0.4387, + "step": 8071 + }, + { + "epoch": 0.46, + "grad_norm": 0.40606341336922763, + "learning_rate": 1.1652321617622418e-05, + "loss": 0.2247, + "step": 8072 + }, + { + "epoch": 0.46, + "grad_norm": 1.2211351878797165, + "learning_rate": 1.1650486255224254e-05, + "loss": 0.7863, + "step": 8073 + }, + { + "epoch": 0.46, + "grad_norm": 0.2775457316895789, + "learning_rate": 1.1648650835669821e-05, + "loss": 0.1874, + "step": 8074 + }, + { + "epoch": 0.46, + "grad_norm": 0.23958314554752064, + "learning_rate": 1.1646815359022683e-05, + "loss": 0.2295, + "step": 8075 + }, + { + "epoch": 0.46, + "grad_norm": 0.5807486031966581, + "learning_rate": 1.1644979825346397e-05, + "loss": 0.4551, + "step": 8076 + }, + { + "epoch": 0.46, + "grad_norm": 0.5713200590272652, + "learning_rate": 1.1643144234704531e-05, + "loss": 0.3726, + "step": 8077 + }, + { + "epoch": 0.46, + "grad_norm": 0.4064945951489773, + "learning_rate": 1.1641308587160654e-05, + "loss": 0.3286, + "step": 8078 + }, + { + "epoch": 0.46, + "grad_norm": 0.3521036176468557, + "learning_rate": 1.1639472882778328e-05, + "loss": 0.2556, + "step": 8079 + }, + { + "epoch": 0.46, + "grad_norm": 0.563728121201371, + "learning_rate": 1.1637637121621126e-05, + "loss": 0.3345, + "step": 8080 + }, + { + "epoch": 0.46, + "grad_norm": 0.38309492806397877, + "learning_rate": 1.1635801303752622e-05, + "loss": 0.3351, + "step": 8081 + }, + { + "epoch": 0.46, + "grad_norm": 0.3018626635818792, + "learning_rate": 1.1633965429236389e-05, + "loss": 0.2433, + "step": 8082 + }, + { + "epoch": 0.46, + "grad_norm": 0.27316504593409413, + "learning_rate": 1.1632129498136005e-05, + "loss": 0.2163, + "step": 8083 + }, + { + "epoch": 0.46, + "grad_norm": 0.5375141536539753, + "learning_rate": 1.1630293510515043e-05, + "loss": 0.3382, + "step": 8084 + }, + { + "epoch": 0.46, + "grad_norm": 0.9905869281952622, + "learning_rate": 1.1628457466437091e-05, + "loss": 0.5418, + "step": 8085 + }, + { + "epoch": 0.46, + "grad_norm": 0.43572613249512804, + "learning_rate": 1.1626621365965725e-05, + "loss": 0.2925, + "step": 8086 + }, + { + "epoch": 0.46, + "grad_norm": 0.29459633182802186, + "learning_rate": 1.162478520916453e-05, + "loss": 0.267, + "step": 8087 + }, + { + "epoch": 0.46, + "grad_norm": 0.35473443242054237, + "learning_rate": 1.1622948996097095e-05, + "loss": 0.2073, + "step": 8088 + }, + { + "epoch": 0.46, + "grad_norm": 0.6581069323859764, + "learning_rate": 1.1621112726827004e-05, + "loss": 0.4389, + "step": 8089 + }, + { + "epoch": 0.46, + "grad_norm": 0.36928184336908865, + "learning_rate": 1.161927640141785e-05, + "loss": 0.306, + "step": 8090 + }, + { + "epoch": 0.46, + "grad_norm": 0.3721782809106861, + "learning_rate": 1.1617440019933226e-05, + "loss": 0.3317, + "step": 8091 + }, + { + "epoch": 0.46, + "grad_norm": 0.4796932591821559, + "learning_rate": 1.1615603582436723e-05, + "loss": 0.2115, + "step": 8092 + }, + { + "epoch": 0.46, + "grad_norm": 0.3904149073219585, + "learning_rate": 1.1613767088991935e-05, + "loss": 0.3234, + "step": 8093 + }, + { + "epoch": 0.47, + "grad_norm": 0.36772963613224796, + "learning_rate": 1.1611930539662463e-05, + "loss": 0.2398, + "step": 8094 + }, + { + "epoch": 0.47, + "grad_norm": 0.23813307940661996, + "learning_rate": 1.1610093934511908e-05, + "loss": 0.1875, + "step": 8095 + }, + { + "epoch": 0.47, + "grad_norm": 0.4083627297184952, + "learning_rate": 1.1608257273603864e-05, + "loss": 0.3244, + "step": 8096 + }, + { + "epoch": 0.47, + "grad_norm": 1.3618253703159187, + "learning_rate": 1.1606420557001945e-05, + "loss": 0.7872, + "step": 8097 + }, + { + "epoch": 0.47, + "grad_norm": 0.282233577421707, + "learning_rate": 1.160458378476975e-05, + "loss": 0.2427, + "step": 8098 + }, + { + "epoch": 0.47, + "grad_norm": 0.3926305634323061, + "learning_rate": 1.1602746956970886e-05, + "loss": 0.3108, + "step": 8099 + }, + { + "epoch": 0.47, + "grad_norm": 0.8545494870759214, + "learning_rate": 1.1600910073668964e-05, + "loss": 0.5018, + "step": 8100 + }, + { + "epoch": 0.47, + "grad_norm": 0.2593570986614807, + "learning_rate": 1.1599073134927597e-05, + "loss": 0.1651, + "step": 8101 + }, + { + "epoch": 0.47, + "grad_norm": 0.34342933446892515, + "learning_rate": 1.1597236140810394e-05, + "loss": 0.217, + "step": 8102 + }, + { + "epoch": 0.47, + "grad_norm": 0.3786898079471971, + "learning_rate": 1.1595399091380972e-05, + "loss": 0.3624, + "step": 8103 + }, + { + "epoch": 0.47, + "grad_norm": 0.661006336761479, + "learning_rate": 1.159356198670295e-05, + "loss": 0.4046, + "step": 8104 + }, + { + "epoch": 0.47, + "grad_norm": 0.33321983617894935, + "learning_rate": 1.1591724826839943e-05, + "loss": 0.2469, + "step": 8105 + }, + { + "epoch": 0.47, + "grad_norm": 0.3513617843380767, + "learning_rate": 1.1589887611855574e-05, + "loss": 0.3298, + "step": 8106 + }, + { + "epoch": 0.47, + "grad_norm": 0.2512576604987261, + "learning_rate": 1.1588050341813466e-05, + "loss": 0.1484, + "step": 8107 + }, + { + "epoch": 0.47, + "grad_norm": 0.2924953972160349, + "learning_rate": 1.1586213016777244e-05, + "loss": 0.2089, + "step": 8108 + }, + { + "epoch": 0.47, + "grad_norm": 0.7746893833833915, + "learning_rate": 1.158437563681053e-05, + "loss": 0.4779, + "step": 8109 + }, + { + "epoch": 0.47, + "grad_norm": 0.3601070577324971, + "learning_rate": 1.1582538201976958e-05, + "loss": 0.3198, + "step": 8110 + }, + { + "epoch": 0.47, + "grad_norm": 0.27886876485206846, + "learning_rate": 1.1580700712340159e-05, + "loss": 0.2167, + "step": 8111 + }, + { + "epoch": 0.47, + "grad_norm": 0.8927279441564796, + "learning_rate": 1.1578863167963761e-05, + "loss": 0.58, + "step": 8112 + }, + { + "epoch": 0.47, + "grad_norm": 0.25440229767955796, + "learning_rate": 1.1577025568911395e-05, + "loss": 0.1733, + "step": 8113 + }, + { + "epoch": 0.47, + "grad_norm": 0.28882003772125364, + "learning_rate": 1.1575187915246706e-05, + "loss": 0.262, + "step": 8114 + }, + { + "epoch": 0.47, + "grad_norm": 0.47526159258141804, + "learning_rate": 1.1573350207033324e-05, + "loss": 0.3023, + "step": 8115 + }, + { + "epoch": 0.47, + "grad_norm": 0.7238904758177274, + "learning_rate": 1.1571512444334894e-05, + "loss": 0.4253, + "step": 8116 + }, + { + "epoch": 0.47, + "grad_norm": 0.3652372666944264, + "learning_rate": 1.1569674627215057e-05, + "loss": 0.2946, + "step": 8117 + }, + { + "epoch": 0.47, + "grad_norm": 0.3398527200099904, + "learning_rate": 1.1567836755737452e-05, + "loss": 0.2535, + "step": 8118 + }, + { + "epoch": 0.47, + "grad_norm": 0.25940567321868835, + "learning_rate": 1.156599882996573e-05, + "loss": 0.1895, + "step": 8119 + }, + { + "epoch": 0.47, + "grad_norm": 0.41358852547742536, + "learning_rate": 1.1564160849963533e-05, + "loss": 0.2941, + "step": 8120 + }, + { + "epoch": 0.47, + "grad_norm": 0.6890446909789003, + "learning_rate": 1.1562322815794516e-05, + "loss": 0.3646, + "step": 8121 + }, + { + "epoch": 0.47, + "grad_norm": 0.39128113875580217, + "learning_rate": 1.1560484727522323e-05, + "loss": 0.3401, + "step": 8122 + }, + { + "epoch": 0.47, + "grad_norm": 0.3501552171817226, + "learning_rate": 1.1558646585210615e-05, + "loss": 0.2678, + "step": 8123 + }, + { + "epoch": 0.47, + "grad_norm": 0.8310288981112209, + "learning_rate": 1.1556808388923043e-05, + "loss": 0.3499, + "step": 8124 + }, + { + "epoch": 0.47, + "grad_norm": 0.32194066075690564, + "learning_rate": 1.155497013872326e-05, + "loss": 0.2148, + "step": 8125 + }, + { + "epoch": 0.47, + "grad_norm": 0.3097208230540672, + "learning_rate": 1.1553131834674929e-05, + "loss": 0.2589, + "step": 8126 + }, + { + "epoch": 0.47, + "grad_norm": 0.5418465886898102, + "learning_rate": 1.1551293476841712e-05, + "loss": 0.3533, + "step": 8127 + }, + { + "epoch": 0.47, + "grad_norm": 0.9933329488460488, + "learning_rate": 1.1549455065287267e-05, + "loss": 0.5601, + "step": 8128 + }, + { + "epoch": 0.47, + "grad_norm": 0.33251164007014095, + "learning_rate": 1.1547616600075262e-05, + "loss": 0.2702, + "step": 8129 + }, + { + "epoch": 0.47, + "grad_norm": 0.48264580396110907, + "learning_rate": 1.1545778081269356e-05, + "loss": 0.3385, + "step": 8130 + }, + { + "epoch": 0.47, + "grad_norm": 0.26561831151431287, + "learning_rate": 1.1543939508933226e-05, + "loss": 0.1731, + "step": 8131 + }, + { + "epoch": 0.47, + "grad_norm": 0.38986731532730423, + "learning_rate": 1.1542100883130534e-05, + "loss": 0.2678, + "step": 8132 + }, + { + "epoch": 0.47, + "grad_norm": 0.8541658768168554, + "learning_rate": 1.1540262203924957e-05, + "loss": 0.4704, + "step": 8133 + }, + { + "epoch": 0.47, + "grad_norm": 0.37102575891994377, + "learning_rate": 1.1538423471380162e-05, + "loss": 0.2628, + "step": 8134 + }, + { + "epoch": 0.47, + "grad_norm": 0.4084559161761699, + "learning_rate": 1.1536584685559833e-05, + "loss": 0.2971, + "step": 8135 + }, + { + "epoch": 0.47, + "grad_norm": 1.090623885447246, + "learning_rate": 1.1534745846527643e-05, + "loss": 0.5709, + "step": 8136 + }, + { + "epoch": 0.47, + "grad_norm": 0.5691462758797022, + "learning_rate": 1.1532906954347265e-05, + "loss": 0.2653, + "step": 8137 + }, + { + "epoch": 0.47, + "grad_norm": 0.3616131380469849, + "learning_rate": 1.1531068009082388e-05, + "loss": 0.2771, + "step": 8138 + }, + { + "epoch": 0.47, + "grad_norm": 0.35732383039321136, + "learning_rate": 1.1529229010796693e-05, + "loss": 0.2351, + "step": 8139 + }, + { + "epoch": 0.47, + "grad_norm": 1.2853466987734752, + "learning_rate": 1.152738995955386e-05, + "loss": 0.8433, + "step": 8140 + }, + { + "epoch": 0.47, + "grad_norm": 0.2917511597672303, + "learning_rate": 1.1525550855417579e-05, + "loss": 0.2001, + "step": 8141 + }, + { + "epoch": 0.47, + "grad_norm": 0.41454996938168326, + "learning_rate": 1.152371169845154e-05, + "loss": 0.322, + "step": 8142 + }, + { + "epoch": 0.47, + "grad_norm": 0.7889175233489341, + "learning_rate": 1.152187248871943e-05, + "loss": 0.4598, + "step": 8143 + }, + { + "epoch": 0.47, + "grad_norm": 0.39376249011371567, + "learning_rate": 1.1520033226284942e-05, + "loss": 0.2423, + "step": 8144 + }, + { + "epoch": 0.47, + "grad_norm": 0.42249860389750094, + "learning_rate": 1.1518193911211763e-05, + "loss": 0.3088, + "step": 8145 + }, + { + "epoch": 0.47, + "grad_norm": 0.3832570482048342, + "learning_rate": 1.1516354543563603e-05, + "loss": 0.3023, + "step": 8146 + }, + { + "epoch": 0.47, + "grad_norm": 0.27028430753402555, + "learning_rate": 1.1514515123404144e-05, + "loss": 0.1744, + "step": 8147 + }, + { + "epoch": 0.47, + "grad_norm": 0.9814466672326471, + "learning_rate": 1.1512675650797093e-05, + "loss": 0.5945, + "step": 8148 + }, + { + "epoch": 0.47, + "grad_norm": 0.5236065950219225, + "learning_rate": 1.1510836125806148e-05, + "loss": 0.3632, + "step": 8149 + }, + { + "epoch": 0.47, + "grad_norm": 0.2567177557850539, + "learning_rate": 1.1508996548495015e-05, + "loss": 0.2272, + "step": 8150 + }, + { + "epoch": 0.47, + "grad_norm": 0.48664804419956864, + "learning_rate": 1.1507156918927396e-05, + "loss": 0.2979, + "step": 8151 + }, + { + "epoch": 0.47, + "grad_norm": 0.4403235305691835, + "learning_rate": 1.1505317237166997e-05, + "loss": 0.342, + "step": 8152 + }, + { + "epoch": 0.47, + "grad_norm": 0.4455779532667353, + "learning_rate": 1.1503477503277526e-05, + "loss": 0.304, + "step": 8153 + }, + { + "epoch": 0.47, + "grad_norm": 0.3233973459258135, + "learning_rate": 1.1501637717322695e-05, + "loss": 0.2459, + "step": 8154 + }, + { + "epoch": 0.47, + "grad_norm": 0.6969251101798604, + "learning_rate": 1.1499797879366214e-05, + "loss": 0.4513, + "step": 8155 + }, + { + "epoch": 0.47, + "grad_norm": 0.4039155950028945, + "learning_rate": 1.1497957989471798e-05, + "loss": 0.2988, + "step": 8156 + }, + { + "epoch": 0.47, + "grad_norm": 0.5008602945483102, + "learning_rate": 1.1496118047703162e-05, + "loss": 0.2993, + "step": 8157 + }, + { + "epoch": 0.47, + "grad_norm": 0.29040380927590353, + "learning_rate": 1.1494278054124019e-05, + "loss": 0.2655, + "step": 8158 + }, + { + "epoch": 0.47, + "grad_norm": 0.39909885373138854, + "learning_rate": 1.1492438008798093e-05, + "loss": 0.264, + "step": 8159 + }, + { + "epoch": 0.47, + "grad_norm": 0.4596488600608055, + "learning_rate": 1.1490597911789104e-05, + "loss": 0.2365, + "step": 8160 + }, + { + "epoch": 0.47, + "grad_norm": 0.5358031775312709, + "learning_rate": 1.1488757763160771e-05, + "loss": 0.3619, + "step": 8161 + }, + { + "epoch": 0.47, + "grad_norm": 0.289051100704963, + "learning_rate": 1.148691756297682e-05, + "loss": 0.2643, + "step": 8162 + }, + { + "epoch": 0.47, + "grad_norm": 0.46616362480908935, + "learning_rate": 1.1485077311300983e-05, + "loss": 0.2833, + "step": 8163 + }, + { + "epoch": 0.47, + "grad_norm": 0.43853018014503176, + "learning_rate": 1.1483237008196978e-05, + "loss": 0.3051, + "step": 8164 + }, + { + "epoch": 0.47, + "grad_norm": 0.27216529737512757, + "learning_rate": 1.1481396653728542e-05, + "loss": 0.2282, + "step": 8165 + }, + { + "epoch": 0.47, + "grad_norm": 0.3954700014749519, + "learning_rate": 1.14795562479594e-05, + "loss": 0.3425, + "step": 8166 + }, + { + "epoch": 0.47, + "grad_norm": 0.639378666232857, + "learning_rate": 1.1477715790953293e-05, + "loss": 0.3417, + "step": 8167 + }, + { + "epoch": 0.47, + "grad_norm": 0.3230244326253486, + "learning_rate": 1.1475875282773948e-05, + "loss": 0.2642, + "step": 8168 + }, + { + "epoch": 0.47, + "grad_norm": 1.0780614602536482, + "learning_rate": 1.1474034723485108e-05, + "loss": 0.6736, + "step": 8169 + }, + { + "epoch": 0.47, + "grad_norm": 0.299867058226962, + "learning_rate": 1.1472194113150507e-05, + "loss": 0.2598, + "step": 8170 + }, + { + "epoch": 0.47, + "grad_norm": 0.2996405103989052, + "learning_rate": 1.1470353451833889e-05, + "loss": 0.2208, + "step": 8171 + }, + { + "epoch": 0.47, + "grad_norm": 0.62197502201416, + "learning_rate": 1.1468512739598991e-05, + "loss": 0.3585, + "step": 8172 + }, + { + "epoch": 0.47, + "grad_norm": 0.28867732255449474, + "learning_rate": 1.1466671976509564e-05, + "loss": 0.2173, + "step": 8173 + }, + { + "epoch": 0.47, + "grad_norm": 0.613279875657845, + "learning_rate": 1.1464831162629346e-05, + "loss": 0.3476, + "step": 8174 + }, + { + "epoch": 0.47, + "grad_norm": 0.3803647956262368, + "learning_rate": 1.146299029802209e-05, + "loss": 0.3359, + "step": 8175 + }, + { + "epoch": 0.47, + "grad_norm": 1.0181850562146808, + "learning_rate": 1.1461149382751544e-05, + "loss": 0.6673, + "step": 8176 + }, + { + "epoch": 0.47, + "grad_norm": 0.7603144529101236, + "learning_rate": 1.1459308416881454e-05, + "loss": 0.2102, + "step": 8177 + }, + { + "epoch": 0.47, + "grad_norm": 0.30168260451446044, + "learning_rate": 1.145746740047558e-05, + "loss": 0.2995, + "step": 8178 + }, + { + "epoch": 0.47, + "grad_norm": 0.3321460276508209, + "learning_rate": 1.1455626333597672e-05, + "loss": 0.2085, + "step": 8179 + }, + { + "epoch": 0.47, + "grad_norm": 0.34455866728257245, + "learning_rate": 1.1453785216311484e-05, + "loss": 0.2043, + "step": 8180 + }, + { + "epoch": 0.47, + "grad_norm": 0.3685518357521418, + "learning_rate": 1.1451944048680779e-05, + "loss": 0.3475, + "step": 8181 + }, + { + "epoch": 0.47, + "grad_norm": 0.4831933139625984, + "learning_rate": 1.1450102830769314e-05, + "loss": 0.3685, + "step": 8182 + }, + { + "epoch": 0.47, + "grad_norm": 0.3531600940862889, + "learning_rate": 1.1448261562640848e-05, + "loss": 0.2355, + "step": 8183 + }, + { + "epoch": 0.47, + "grad_norm": 0.439610405654752, + "learning_rate": 1.1446420244359148e-05, + "loss": 0.3821, + "step": 8184 + }, + { + "epoch": 0.47, + "grad_norm": 0.26939221681386455, + "learning_rate": 1.1444578875987978e-05, + "loss": 0.2208, + "step": 8185 + }, + { + "epoch": 0.47, + "grad_norm": 0.27650453372703404, + "learning_rate": 1.1442737457591102e-05, + "loss": 0.2099, + "step": 8186 + }, + { + "epoch": 0.47, + "grad_norm": 1.3629929252444442, + "learning_rate": 1.144089598923229e-05, + "loss": 0.7209, + "step": 8187 + }, + { + "epoch": 0.47, + "grad_norm": 0.6214268287048519, + "learning_rate": 1.1439054470975312e-05, + "loss": 0.506, + "step": 8188 + }, + { + "epoch": 0.47, + "grad_norm": 0.3119158539908204, + "learning_rate": 1.143721290288394e-05, + "loss": 0.2878, + "step": 8189 + }, + { + "epoch": 0.47, + "grad_norm": 0.3318970742581742, + "learning_rate": 1.1435371285021948e-05, + "loss": 0.262, + "step": 8190 + }, + { + "epoch": 0.47, + "grad_norm": 0.2773017863670644, + "learning_rate": 1.1433529617453108e-05, + "loss": 0.1932, + "step": 8191 + }, + { + "epoch": 0.47, + "grad_norm": 0.8760997978912737, + "learning_rate": 1.1431687900241201e-05, + "loss": 0.4663, + "step": 8192 + }, + { + "epoch": 0.47, + "grad_norm": 0.3367584318928723, + "learning_rate": 1.142984613345e-05, + "loss": 0.2346, + "step": 8193 + }, + { + "epoch": 0.47, + "grad_norm": 0.4579062321186222, + "learning_rate": 1.1428004317143293e-05, + "loss": 0.3515, + "step": 8194 + }, + { + "epoch": 0.47, + "grad_norm": 0.6150169647018473, + "learning_rate": 1.1426162451384857e-05, + "loss": 0.3871, + "step": 8195 + }, + { + "epoch": 0.47, + "grad_norm": 0.3216805849443447, + "learning_rate": 1.1424320536238478e-05, + "loss": 0.2268, + "step": 8196 + }, + { + "epoch": 0.47, + "grad_norm": 0.25155788365831927, + "learning_rate": 1.1422478571767937e-05, + "loss": 0.206, + "step": 8197 + }, + { + "epoch": 0.47, + "grad_norm": 0.34325115785592575, + "learning_rate": 1.1420636558037026e-05, + "loss": 0.2802, + "step": 8198 + }, + { + "epoch": 0.47, + "grad_norm": 0.3558996480535589, + "learning_rate": 1.1418794495109528e-05, + "loss": 0.2655, + "step": 8199 + }, + { + "epoch": 0.47, + "grad_norm": 0.7393643667231588, + "learning_rate": 1.1416952383049244e-05, + "loss": 0.4348, + "step": 8200 + }, + { + "epoch": 0.47, + "grad_norm": 0.3308899478444616, + "learning_rate": 1.1415110221919958e-05, + "loss": 0.2869, + "step": 8201 + }, + { + "epoch": 0.47, + "grad_norm": 0.4330551569974486, + "learning_rate": 1.1413268011785463e-05, + "loss": 0.3299, + "step": 8202 + }, + { + "epoch": 0.47, + "grad_norm": 0.16466774797088232, + "learning_rate": 1.1411425752709561e-05, + "loss": 0.0933, + "step": 8203 + }, + { + "epoch": 0.47, + "grad_norm": 0.5658043381450948, + "learning_rate": 1.1409583444756043e-05, + "loss": 0.3813, + "step": 8204 + }, + { + "epoch": 0.47, + "grad_norm": 0.39733186363612516, + "learning_rate": 1.1407741087988713e-05, + "loss": 0.311, + "step": 8205 + }, + { + "epoch": 0.47, + "grad_norm": 0.5293473050356182, + "learning_rate": 1.1405898682471367e-05, + "loss": 0.3052, + "step": 8206 + }, + { + "epoch": 0.47, + "grad_norm": 0.3863878784564842, + "learning_rate": 1.1404056228267813e-05, + "loss": 0.3073, + "step": 8207 + }, + { + "epoch": 0.47, + "grad_norm": 0.5500023359847825, + "learning_rate": 1.140221372544185e-05, + "loss": 0.3302, + "step": 8208 + }, + { + "epoch": 0.47, + "grad_norm": 0.26832299732179304, + "learning_rate": 1.1400371174057287e-05, + "loss": 0.2081, + "step": 8209 + }, + { + "epoch": 0.47, + "grad_norm": 0.3489058273162678, + "learning_rate": 1.139852857417793e-05, + "loss": 0.2296, + "step": 8210 + }, + { + "epoch": 0.47, + "grad_norm": 0.35245208526279237, + "learning_rate": 1.139668592586759e-05, + "loss": 0.2888, + "step": 8211 + }, + { + "epoch": 0.47, + "grad_norm": 0.6348054017588185, + "learning_rate": 1.1394843229190076e-05, + "loss": 0.3672, + "step": 8212 + }, + { + "epoch": 0.47, + "grad_norm": 0.3565702444562063, + "learning_rate": 1.1393000484209202e-05, + "loss": 0.2839, + "step": 8213 + }, + { + "epoch": 0.47, + "grad_norm": 0.35892910995714983, + "learning_rate": 1.139115769098878e-05, + "loss": 0.2883, + "step": 8214 + }, + { + "epoch": 0.47, + "grad_norm": 0.4652118043069549, + "learning_rate": 1.1389314849592626e-05, + "loss": 0.2623, + "step": 8215 + }, + { + "epoch": 0.47, + "grad_norm": 0.31534488534631117, + "learning_rate": 1.1387471960084557e-05, + "loss": 0.1603, + "step": 8216 + }, + { + "epoch": 0.47, + "grad_norm": 0.3043071870765772, + "learning_rate": 1.1385629022528397e-05, + "loss": 0.2729, + "step": 8217 + }, + { + "epoch": 0.47, + "grad_norm": 0.791220490302, + "learning_rate": 1.1383786036987963e-05, + "loss": 0.433, + "step": 8218 + }, + { + "epoch": 0.47, + "grad_norm": 0.3690113603730788, + "learning_rate": 1.1381943003527077e-05, + "loss": 0.1939, + "step": 8219 + }, + { + "epoch": 0.47, + "grad_norm": 0.3844076124295093, + "learning_rate": 1.1380099922209564e-05, + "loss": 0.3177, + "step": 8220 + }, + { + "epoch": 0.47, + "grad_norm": 0.3525552292700522, + "learning_rate": 1.1378256793099251e-05, + "loss": 0.3186, + "step": 8221 + }, + { + "epoch": 0.47, + "grad_norm": 0.2606543369638635, + "learning_rate": 1.1376413616259965e-05, + "loss": 0.1299, + "step": 8222 + }, + { + "epoch": 0.47, + "grad_norm": 0.45789735825217354, + "learning_rate": 1.1374570391755532e-05, + "loss": 0.3216, + "step": 8223 + }, + { + "epoch": 0.47, + "grad_norm": 0.7984697434570868, + "learning_rate": 1.137272711964979e-05, + "loss": 0.4888, + "step": 8224 + }, + { + "epoch": 0.47, + "grad_norm": 0.31774601929632706, + "learning_rate": 1.1370883800006562e-05, + "loss": 0.2252, + "step": 8225 + }, + { + "epoch": 0.47, + "grad_norm": 0.42364764216329703, + "learning_rate": 1.1369040432889691e-05, + "loss": 0.3239, + "step": 8226 + }, + { + "epoch": 0.47, + "grad_norm": 0.43654263319431763, + "learning_rate": 1.1367197018363005e-05, + "loss": 0.2615, + "step": 8227 + }, + { + "epoch": 0.47, + "grad_norm": 0.5638770827996676, + "learning_rate": 1.1365353556490348e-05, + "loss": 0.3034, + "step": 8228 + }, + { + "epoch": 0.47, + "grad_norm": 0.26140699917356136, + "learning_rate": 1.1363510047335553e-05, + "loss": 0.2156, + "step": 8229 + }, + { + "epoch": 0.47, + "grad_norm": 0.8208422504947541, + "learning_rate": 1.1361666490962468e-05, + "loss": 0.4557, + "step": 8230 + }, + { + "epoch": 0.47, + "grad_norm": 0.9822644350984603, + "learning_rate": 1.1359822887434927e-05, + "loss": 0.7397, + "step": 8231 + }, + { + "epoch": 0.47, + "grad_norm": 0.3314402613856236, + "learning_rate": 1.1357979236816781e-05, + "loss": 0.2076, + "step": 8232 + }, + { + "epoch": 0.47, + "grad_norm": 0.4487027140708006, + "learning_rate": 1.135613553917187e-05, + "loss": 0.3122, + "step": 8233 + }, + { + "epoch": 0.47, + "grad_norm": 0.9682417335085425, + "learning_rate": 1.1354291794564045e-05, + "loss": 0.4434, + "step": 8234 + }, + { + "epoch": 0.47, + "grad_norm": 0.2746870381544497, + "learning_rate": 1.1352448003057153e-05, + "loss": 0.1845, + "step": 8235 + }, + { + "epoch": 0.47, + "grad_norm": 0.3943326196154475, + "learning_rate": 1.1350604164715044e-05, + "loss": 0.237, + "step": 8236 + }, + { + "epoch": 0.47, + "grad_norm": 0.3423516469836019, + "learning_rate": 1.1348760279601572e-05, + "loss": 0.3115, + "step": 8237 + }, + { + "epoch": 0.47, + "grad_norm": 0.3458819009956223, + "learning_rate": 1.134691634778059e-05, + "loss": 0.219, + "step": 8238 + }, + { + "epoch": 0.47, + "grad_norm": 0.8749680292286744, + "learning_rate": 1.1345072369315951e-05, + "loss": 0.4917, + "step": 8239 + }, + { + "epoch": 0.47, + "grad_norm": 0.444178078876986, + "learning_rate": 1.1343228344271515e-05, + "loss": 0.352, + "step": 8240 + }, + { + "epoch": 0.47, + "grad_norm": 0.7944684734767636, + "learning_rate": 1.1341384272711138e-05, + "loss": 0.2822, + "step": 8241 + }, + { + "epoch": 0.47, + "grad_norm": 0.28858447543694066, + "learning_rate": 1.1339540154698682e-05, + "loss": 0.1913, + "step": 8242 + }, + { + "epoch": 0.47, + "grad_norm": 0.33636234607029175, + "learning_rate": 1.133769599029801e-05, + "loss": 0.2435, + "step": 8243 + }, + { + "epoch": 0.47, + "grad_norm": 0.37425119175237026, + "learning_rate": 1.1335851779572979e-05, + "loss": 0.2879, + "step": 8244 + }, + { + "epoch": 0.47, + "grad_norm": 0.3420784443581969, + "learning_rate": 1.1334007522587462e-05, + "loss": 0.2764, + "step": 8245 + }, + { + "epoch": 0.47, + "grad_norm": 0.626268923474981, + "learning_rate": 1.1332163219405318e-05, + "loss": 0.4184, + "step": 8246 + }, + { + "epoch": 0.47, + "grad_norm": 0.33327889942284983, + "learning_rate": 1.1330318870090427e-05, + "loss": 0.2542, + "step": 8247 + }, + { + "epoch": 0.47, + "grad_norm": 0.3209884215108911, + "learning_rate": 1.1328474474706643e-05, + "loss": 0.224, + "step": 8248 + }, + { + "epoch": 0.47, + "grad_norm": 0.38396143689121437, + "learning_rate": 1.132663003331785e-05, + "loss": 0.2768, + "step": 8249 + }, + { + "epoch": 0.47, + "grad_norm": 0.34285888950900517, + "learning_rate": 1.1324785545987911e-05, + "loss": 0.2586, + "step": 8250 + }, + { + "epoch": 0.47, + "grad_norm": 0.7016183186742044, + "learning_rate": 1.1322941012780707e-05, + "loss": 0.3433, + "step": 8251 + }, + { + "epoch": 0.47, + "grad_norm": 0.4224569861018966, + "learning_rate": 1.1321096433760116e-05, + "loss": 0.3503, + "step": 8252 + }, + { + "epoch": 0.47, + "grad_norm": 0.2995614695059995, + "learning_rate": 1.1319251808990009e-05, + "loss": 0.2572, + "step": 8253 + }, + { + "epoch": 0.47, + "grad_norm": 1.2672865639780329, + "learning_rate": 1.1317407138534268e-05, + "loss": 0.714, + "step": 8254 + }, + { + "epoch": 0.47, + "grad_norm": 0.20313439742018613, + "learning_rate": 1.1315562422456776e-05, + "loss": 0.1281, + "step": 8255 + }, + { + "epoch": 0.47, + "grad_norm": 0.37114924445860703, + "learning_rate": 1.1313717660821413e-05, + "loss": 0.2819, + "step": 8256 + }, + { + "epoch": 0.47, + "grad_norm": 0.4130806594357466, + "learning_rate": 1.1311872853692065e-05, + "loss": 0.3215, + "step": 8257 + }, + { + "epoch": 0.47, + "grad_norm": 0.7315074555515565, + "learning_rate": 1.1310028001132615e-05, + "loss": 0.2889, + "step": 8258 + }, + { + "epoch": 0.47, + "grad_norm": 0.3518162924006683, + "learning_rate": 1.1308183103206956e-05, + "loss": 0.2678, + "step": 8259 + }, + { + "epoch": 0.47, + "grad_norm": 1.263822710183851, + "learning_rate": 1.1306338159978968e-05, + "loss": 0.8082, + "step": 8260 + }, + { + "epoch": 0.47, + "grad_norm": 0.2290140963302349, + "learning_rate": 1.1304493171512548e-05, + "loss": 0.1771, + "step": 8261 + }, + { + "epoch": 0.47, + "grad_norm": 0.3609070041446921, + "learning_rate": 1.1302648137871584e-05, + "loss": 0.2591, + "step": 8262 + }, + { + "epoch": 0.47, + "grad_norm": 0.7500955160282625, + "learning_rate": 1.1300803059119969e-05, + "loss": 0.4461, + "step": 8263 + }, + { + "epoch": 0.47, + "grad_norm": 0.47247522252554397, + "learning_rate": 1.1298957935321604e-05, + "loss": 0.4008, + "step": 8264 + }, + { + "epoch": 0.47, + "grad_norm": 0.29187071697633193, + "learning_rate": 1.129711276654038e-05, + "loss": 0.2102, + "step": 8265 + }, + { + "epoch": 0.47, + "grad_norm": 0.9689601512300138, + "learning_rate": 1.1295267552840198e-05, + "loss": 0.7327, + "step": 8266 + }, + { + "epoch": 0.47, + "grad_norm": 0.39459373406513565, + "learning_rate": 1.1293422294284955e-05, + "loss": 0.255, + "step": 8267 + }, + { + "epoch": 0.48, + "grad_norm": 0.2841527542011416, + "learning_rate": 1.1291576990938556e-05, + "loss": 0.234, + "step": 8268 + }, + { + "epoch": 0.48, + "grad_norm": 0.26846990348811833, + "learning_rate": 1.1289731642864896e-05, + "loss": 0.2712, + "step": 8269 + }, + { + "epoch": 0.48, + "grad_norm": 1.2160805062145421, + "learning_rate": 1.1287886250127888e-05, + "loss": 0.7619, + "step": 8270 + }, + { + "epoch": 0.48, + "grad_norm": 0.3180662479564137, + "learning_rate": 1.1286040812791431e-05, + "loss": 0.2088, + "step": 8271 + }, + { + "epoch": 0.48, + "grad_norm": 1.2780419499498312, + "learning_rate": 1.1284195330919443e-05, + "loss": 0.763, + "step": 8272 + }, + { + "epoch": 0.48, + "grad_norm": 0.34751151421346865, + "learning_rate": 1.128234980457582e-05, + "loss": 0.3257, + "step": 8273 + }, + { + "epoch": 0.48, + "grad_norm": 0.29037446054088023, + "learning_rate": 1.1280504233824481e-05, + "loss": 0.2382, + "step": 8274 + }, + { + "epoch": 0.48, + "grad_norm": 0.400642610476773, + "learning_rate": 1.1278658618729334e-05, + "loss": 0.2889, + "step": 8275 + }, + { + "epoch": 0.48, + "grad_norm": 0.28690418079254904, + "learning_rate": 1.1276812959354295e-05, + "loss": 0.2466, + "step": 8276 + }, + { + "epoch": 0.48, + "grad_norm": 0.3976401918567556, + "learning_rate": 1.1274967255763278e-05, + "loss": 0.2747, + "step": 8277 + }, + { + "epoch": 0.48, + "grad_norm": 0.4454863521161837, + "learning_rate": 1.1273121508020202e-05, + "loss": 0.2982, + "step": 8278 + }, + { + "epoch": 0.48, + "grad_norm": 0.5729809893476822, + "learning_rate": 1.1271275716188978e-05, + "loss": 0.4256, + "step": 8279 + }, + { + "epoch": 0.48, + "grad_norm": 0.410761763093016, + "learning_rate": 1.1269429880333533e-05, + "loss": 0.2653, + "step": 8280 + }, + { + "epoch": 0.48, + "grad_norm": 0.2517747570353783, + "learning_rate": 1.1267584000517788e-05, + "loss": 0.2289, + "step": 8281 + }, + { + "epoch": 0.48, + "grad_norm": 0.4175701025964965, + "learning_rate": 1.1265738076805663e-05, + "loss": 0.2465, + "step": 8282 + }, + { + "epoch": 0.48, + "grad_norm": 0.4040133712318601, + "learning_rate": 1.1263892109261081e-05, + "loss": 0.2884, + "step": 8283 + }, + { + "epoch": 0.48, + "grad_norm": 0.3315685666860372, + "learning_rate": 1.126204609794797e-05, + "loss": 0.2601, + "step": 8284 + }, + { + "epoch": 0.48, + "grad_norm": 0.6738959505588366, + "learning_rate": 1.1260200042930257e-05, + "loss": 0.4439, + "step": 8285 + }, + { + "epoch": 0.48, + "grad_norm": 0.3713996287944544, + "learning_rate": 1.125835394427187e-05, + "loss": 0.3126, + "step": 8286 + }, + { + "epoch": 0.48, + "grad_norm": 0.26116662030586907, + "learning_rate": 1.1256507802036742e-05, + "loss": 0.1798, + "step": 8287 + }, + { + "epoch": 0.48, + "grad_norm": 0.2923099580818043, + "learning_rate": 1.12546616162888e-05, + "loss": 0.241, + "step": 8288 + }, + { + "epoch": 0.48, + "grad_norm": 0.4591105534812718, + "learning_rate": 1.1252815387091984e-05, + "loss": 0.293, + "step": 8289 + }, + { + "epoch": 0.48, + "grad_norm": 0.5414758905754559, + "learning_rate": 1.1250969114510221e-05, + "loss": 0.4012, + "step": 8290 + }, + { + "epoch": 0.48, + "grad_norm": 0.8942911584626979, + "learning_rate": 1.1249122798607454e-05, + "loss": 0.3676, + "step": 8291 + }, + { + "epoch": 0.48, + "grad_norm": 0.35940592339450533, + "learning_rate": 1.1247276439447616e-05, + "loss": 0.2682, + "step": 8292 + }, + { + "epoch": 0.48, + "grad_norm": 0.3883606131416445, + "learning_rate": 1.124543003709465e-05, + "loss": 0.3171, + "step": 8293 + }, + { + "epoch": 0.48, + "grad_norm": 0.2069420790155224, + "learning_rate": 1.1243583591612495e-05, + "loss": 0.1109, + "step": 8294 + }, + { + "epoch": 0.48, + "grad_norm": 0.3893463873924719, + "learning_rate": 1.1241737103065096e-05, + "loss": 0.2684, + "step": 8295 + }, + { + "epoch": 0.48, + "grad_norm": 0.44638528446469994, + "learning_rate": 1.1239890571516389e-05, + "loss": 0.3573, + "step": 8296 + }, + { + "epoch": 0.48, + "grad_norm": 0.9358041698450383, + "learning_rate": 1.123804399703033e-05, + "loss": 0.3285, + "step": 8297 + }, + { + "epoch": 0.48, + "grad_norm": 0.6012185686365062, + "learning_rate": 1.1236197379670861e-05, + "loss": 0.3471, + "step": 8298 + }, + { + "epoch": 0.48, + "grad_norm": 0.36799061504790626, + "learning_rate": 1.1234350719501927e-05, + "loss": 0.3477, + "step": 8299 + }, + { + "epoch": 0.48, + "grad_norm": 0.22412798408524393, + "learning_rate": 1.1232504016587482e-05, + "loss": 0.1747, + "step": 8300 + }, + { + "epoch": 0.48, + "grad_norm": 0.4149487453115047, + "learning_rate": 1.1230657270991476e-05, + "loss": 0.304, + "step": 8301 + }, + { + "epoch": 0.48, + "grad_norm": 0.5459475622031438, + "learning_rate": 1.1228810482777859e-05, + "loss": 0.3991, + "step": 8302 + }, + { + "epoch": 0.48, + "grad_norm": 0.9562637175288264, + "learning_rate": 1.1226963652010592e-05, + "loss": 0.5005, + "step": 8303 + }, + { + "epoch": 0.48, + "grad_norm": 0.27131366109251476, + "learning_rate": 1.1225116778753622e-05, + "loss": 0.2124, + "step": 8304 + }, + { + "epoch": 0.48, + "grad_norm": 0.37477020048094534, + "learning_rate": 1.1223269863070913e-05, + "loss": 0.3101, + "step": 8305 + }, + { + "epoch": 0.48, + "grad_norm": 0.37990887770967674, + "learning_rate": 1.1221422905026424e-05, + "loss": 0.1822, + "step": 8306 + }, + { + "epoch": 0.48, + "grad_norm": 0.4007847446197605, + "learning_rate": 1.1219575904684109e-05, + "loss": 0.2274, + "step": 8307 + }, + { + "epoch": 0.48, + "grad_norm": 0.3859141568843464, + "learning_rate": 1.1217728862107932e-05, + "loss": 0.3088, + "step": 8308 + }, + { + "epoch": 0.48, + "grad_norm": 0.5814554045581455, + "learning_rate": 1.1215881777361858e-05, + "loss": 0.3877, + "step": 8309 + }, + { + "epoch": 0.48, + "grad_norm": 0.31317402052160326, + "learning_rate": 1.1214034650509853e-05, + "loss": 0.2069, + "step": 8310 + }, + { + "epoch": 0.48, + "grad_norm": 1.3864302814986327, + "learning_rate": 1.1212187481615875e-05, + "loss": 0.6831, + "step": 8311 + }, + { + "epoch": 0.48, + "grad_norm": 0.24158373982842526, + "learning_rate": 1.1210340270743903e-05, + "loss": 0.2158, + "step": 8312 + }, + { + "epoch": 0.48, + "grad_norm": 0.38673341680171747, + "learning_rate": 1.1208493017957893e-05, + "loss": 0.2194, + "step": 8313 + }, + { + "epoch": 0.48, + "grad_norm": 0.3480815756689843, + "learning_rate": 1.1206645723321825e-05, + "loss": 0.3104, + "step": 8314 + }, + { + "epoch": 0.48, + "grad_norm": 1.142989272685945, + "learning_rate": 1.1204798386899669e-05, + "loss": 0.8243, + "step": 8315 + }, + { + "epoch": 0.48, + "grad_norm": 0.334355108496965, + "learning_rate": 1.1202951008755395e-05, + "loss": 0.2723, + "step": 8316 + }, + { + "epoch": 0.48, + "grad_norm": 0.342405895746166, + "learning_rate": 1.1201103588952979e-05, + "loss": 0.2292, + "step": 8317 + }, + { + "epoch": 0.48, + "grad_norm": 0.35075106621619223, + "learning_rate": 1.11992561275564e-05, + "loss": 0.2068, + "step": 8318 + }, + { + "epoch": 0.48, + "grad_norm": 0.5953512589583299, + "learning_rate": 1.1197408624629626e-05, + "loss": 0.4117, + "step": 8319 + }, + { + "epoch": 0.48, + "grad_norm": 0.27076852878493896, + "learning_rate": 1.119556108023665e-05, + "loss": 0.2509, + "step": 8320 + }, + { + "epoch": 0.48, + "grad_norm": 1.2379250749740365, + "learning_rate": 1.119371349444144e-05, + "loss": 0.8403, + "step": 8321 + }, + { + "epoch": 0.48, + "grad_norm": 0.6147005884618181, + "learning_rate": 1.1191865867307987e-05, + "loss": 0.4054, + "step": 8322 + }, + { + "epoch": 0.48, + "grad_norm": 0.3673949040504266, + "learning_rate": 1.1190018198900267e-05, + "loss": 0.2359, + "step": 8323 + }, + { + "epoch": 0.48, + "grad_norm": 0.2579786089458528, + "learning_rate": 1.118817048928227e-05, + "loss": 0.2306, + "step": 8324 + }, + { + "epoch": 0.48, + "grad_norm": 0.5153506800417209, + "learning_rate": 1.1186322738517983e-05, + "loss": 0.3424, + "step": 8325 + }, + { + "epoch": 0.48, + "grad_norm": 0.2880843628827029, + "learning_rate": 1.1184474946671384e-05, + "loss": 0.2012, + "step": 8326 + }, + { + "epoch": 0.48, + "grad_norm": 1.1330484123308262, + "learning_rate": 1.1182627113806475e-05, + "loss": 0.7822, + "step": 8327 + }, + { + "epoch": 0.48, + "grad_norm": 0.29700312651214944, + "learning_rate": 1.1180779239987233e-05, + "loss": 0.2688, + "step": 8328 + }, + { + "epoch": 0.48, + "grad_norm": 0.3926248997185353, + "learning_rate": 1.1178931325277662e-05, + "loss": 0.3018, + "step": 8329 + }, + { + "epoch": 0.48, + "grad_norm": 0.4880065387650843, + "learning_rate": 1.1177083369741749e-05, + "loss": 0.3457, + "step": 8330 + }, + { + "epoch": 0.48, + "grad_norm": 0.4622472597385853, + "learning_rate": 1.117523537344349e-05, + "loss": 0.3208, + "step": 8331 + }, + { + "epoch": 0.48, + "grad_norm": 0.26773122651731995, + "learning_rate": 1.1173387336446879e-05, + "loss": 0.246, + "step": 8332 + }, + { + "epoch": 0.48, + "grad_norm": 0.27469813985436253, + "learning_rate": 1.1171539258815916e-05, + "loss": 0.2141, + "step": 8333 + }, + { + "epoch": 0.48, + "grad_norm": 0.6729771240210307, + "learning_rate": 1.1169691140614597e-05, + "loss": 0.4272, + "step": 8334 + }, + { + "epoch": 0.48, + "grad_norm": 0.40650217240748127, + "learning_rate": 1.1167842981906927e-05, + "loss": 0.3098, + "step": 8335 + }, + { + "epoch": 0.48, + "grad_norm": 0.33788023906972464, + "learning_rate": 1.1165994782756902e-05, + "loss": 0.2863, + "step": 8336 + }, + { + "epoch": 0.48, + "grad_norm": 0.8965463163501143, + "learning_rate": 1.1164146543228529e-05, + "loss": 0.4183, + "step": 8337 + }, + { + "epoch": 0.48, + "grad_norm": 0.37608376944922794, + "learning_rate": 1.116229826338581e-05, + "loss": 0.2967, + "step": 8338 + }, + { + "epoch": 0.48, + "grad_norm": 1.1709144692691702, + "learning_rate": 1.1160449943292754e-05, + "loss": 0.7408, + "step": 8339 + }, + { + "epoch": 0.48, + "grad_norm": 0.20606643679960182, + "learning_rate": 1.1158601583013365e-05, + "loss": 0.1803, + "step": 8340 + }, + { + "epoch": 0.48, + "grad_norm": 0.3825272882807248, + "learning_rate": 1.1156753182611655e-05, + "loss": 0.2924, + "step": 8341 + }, + { + "epoch": 0.48, + "grad_norm": 0.757439344922993, + "learning_rate": 1.1154904742151628e-05, + "loss": 0.408, + "step": 8342 + }, + { + "epoch": 0.48, + "grad_norm": 0.47983847004271224, + "learning_rate": 1.1153056261697303e-05, + "loss": 0.2154, + "step": 8343 + }, + { + "epoch": 0.48, + "grad_norm": 0.31408770009618836, + "learning_rate": 1.1151207741312688e-05, + "loss": 0.2766, + "step": 8344 + }, + { + "epoch": 0.48, + "grad_norm": 1.1777256065853905, + "learning_rate": 1.11493591810618e-05, + "loss": 0.8841, + "step": 8345 + }, + { + "epoch": 0.48, + "grad_norm": 0.14795722092036293, + "learning_rate": 1.1147510581008654e-05, + "loss": 0.0727, + "step": 8346 + }, + { + "epoch": 0.48, + "grad_norm": 0.44407525535508613, + "learning_rate": 1.114566194121726e-05, + "loss": 0.3331, + "step": 8347 + }, + { + "epoch": 0.48, + "grad_norm": 0.4429171903471908, + "learning_rate": 1.1143813261751648e-05, + "loss": 0.3368, + "step": 8348 + }, + { + "epoch": 0.48, + "grad_norm": 0.5326441778644128, + "learning_rate": 1.1141964542675831e-05, + "loss": 0.2415, + "step": 8349 + }, + { + "epoch": 0.48, + "grad_norm": 0.35629373156998423, + "learning_rate": 1.1140115784053828e-05, + "loss": 0.2704, + "step": 8350 + }, + { + "epoch": 0.48, + "grad_norm": 0.473832740368251, + "learning_rate": 1.1138266985949668e-05, + "loss": 0.3667, + "step": 8351 + }, + { + "epoch": 0.48, + "grad_norm": 0.24716345117047317, + "learning_rate": 1.113641814842737e-05, + "loss": 0.1892, + "step": 8352 + }, + { + "epoch": 0.48, + "grad_norm": 0.37177419017363733, + "learning_rate": 1.1134569271550959e-05, + "loss": 0.2241, + "step": 8353 + }, + { + "epoch": 0.48, + "grad_norm": 0.6795655987417271, + "learning_rate": 1.1132720355384466e-05, + "loss": 0.433, + "step": 8354 + }, + { + "epoch": 0.48, + "grad_norm": 0.4546558880720396, + "learning_rate": 1.1130871399991912e-05, + "loss": 0.3476, + "step": 8355 + }, + { + "epoch": 0.48, + "grad_norm": 0.2735908648714051, + "learning_rate": 1.1129022405437333e-05, + "loss": 0.2299, + "step": 8356 + }, + { + "epoch": 0.48, + "grad_norm": 1.1504777351766975, + "learning_rate": 1.1127173371784755e-05, + "loss": 0.7199, + "step": 8357 + }, + { + "epoch": 0.48, + "grad_norm": 0.320853590136227, + "learning_rate": 1.112532429909821e-05, + "loss": 0.22, + "step": 8358 + }, + { + "epoch": 0.48, + "grad_norm": 0.2922420371940266, + "learning_rate": 1.1123475187441735e-05, + "loss": 0.2365, + "step": 8359 + }, + { + "epoch": 0.48, + "grad_norm": 0.4621379502333083, + "learning_rate": 1.1121626036879362e-05, + "loss": 0.3615, + "step": 8360 + }, + { + "epoch": 0.48, + "grad_norm": 0.7091643725104544, + "learning_rate": 1.1119776847475128e-05, + "loss": 0.3798, + "step": 8361 + }, + { + "epoch": 0.48, + "grad_norm": 0.32042337282411815, + "learning_rate": 1.1117927619293072e-05, + "loss": 0.2344, + "step": 8362 + }, + { + "epoch": 0.48, + "grad_norm": 0.46225350017426375, + "learning_rate": 1.1116078352397226e-05, + "loss": 0.3815, + "step": 8363 + }, + { + "epoch": 0.48, + "grad_norm": 0.44794763153718986, + "learning_rate": 1.1114229046851639e-05, + "loss": 0.3283, + "step": 8364 + }, + { + "epoch": 0.48, + "grad_norm": 0.35863928811423473, + "learning_rate": 1.1112379702720346e-05, + "loss": 0.2993, + "step": 8365 + }, + { + "epoch": 0.48, + "grad_norm": 0.2997559968892406, + "learning_rate": 1.1110530320067395e-05, + "loss": 0.1826, + "step": 8366 + }, + { + "epoch": 0.48, + "grad_norm": 0.31177442430328983, + "learning_rate": 1.110868089895682e-05, + "loss": 0.2818, + "step": 8367 + }, + { + "epoch": 0.48, + "grad_norm": 0.34426675271911766, + "learning_rate": 1.1106831439452678e-05, + "loss": 0.2813, + "step": 8368 + }, + { + "epoch": 0.48, + "grad_norm": 1.159122972842692, + "learning_rate": 1.1104981941619008e-05, + "loss": 0.5404, + "step": 8369 + }, + { + "epoch": 0.48, + "grad_norm": 0.6308084552291687, + "learning_rate": 1.1103132405519866e-05, + "loss": 0.4178, + "step": 8370 + }, + { + "epoch": 0.48, + "grad_norm": 0.3508909818733613, + "learning_rate": 1.1101282831219292e-05, + "loss": 0.2844, + "step": 8371 + }, + { + "epoch": 0.48, + "grad_norm": 0.24925963458866746, + "learning_rate": 1.1099433218781342e-05, + "loss": 0.1824, + "step": 8372 + }, + { + "epoch": 0.48, + "grad_norm": 1.0146895770631508, + "learning_rate": 1.1097583568270068e-05, + "loss": 0.5618, + "step": 8373 + }, + { + "epoch": 0.48, + "grad_norm": 0.3402111836733542, + "learning_rate": 1.1095733879749517e-05, + "loss": 0.2561, + "step": 8374 + }, + { + "epoch": 0.48, + "grad_norm": 0.44394453768906555, + "learning_rate": 1.1093884153283755e-05, + "loss": 0.2666, + "step": 8375 + }, + { + "epoch": 0.48, + "grad_norm": 0.5030724555789422, + "learning_rate": 1.1092034388936827e-05, + "loss": 0.3637, + "step": 8376 + }, + { + "epoch": 0.48, + "grad_norm": 0.3053604077245766, + "learning_rate": 1.1090184586772798e-05, + "loss": 0.2541, + "step": 8377 + }, + { + "epoch": 0.48, + "grad_norm": 0.26072680670778176, + "learning_rate": 1.1088334746855724e-05, + "loss": 0.1699, + "step": 8378 + }, + { + "epoch": 0.48, + "grad_norm": 0.33213697117973734, + "learning_rate": 1.1086484869249664e-05, + "loss": 0.2501, + "step": 8379 + }, + { + "epoch": 0.48, + "grad_norm": 0.3411697209711852, + "learning_rate": 1.1084634954018679e-05, + "loss": 0.2749, + "step": 8380 + }, + { + "epoch": 0.48, + "grad_norm": 0.7937308936897097, + "learning_rate": 1.1082785001226833e-05, + "loss": 0.527, + "step": 8381 + }, + { + "epoch": 0.48, + "grad_norm": 0.5427995751624755, + "learning_rate": 1.108093501093819e-05, + "loss": 0.2829, + "step": 8382 + }, + { + "epoch": 0.48, + "grad_norm": 0.37423001686239854, + "learning_rate": 1.1079084983216812e-05, + "loss": 0.294, + "step": 8383 + }, + { + "epoch": 0.48, + "grad_norm": 0.2610551272973318, + "learning_rate": 1.107723491812677e-05, + "loss": 0.2371, + "step": 8384 + }, + { + "epoch": 0.48, + "grad_norm": 0.2848863442961672, + "learning_rate": 1.1075384815732126e-05, + "loss": 0.1655, + "step": 8385 + }, + { + "epoch": 0.48, + "grad_norm": 0.36620040221470535, + "learning_rate": 1.1073534676096953e-05, + "loss": 0.3269, + "step": 8386 + }, + { + "epoch": 0.48, + "grad_norm": 0.40895567597138205, + "learning_rate": 1.107168449928532e-05, + "loss": 0.3493, + "step": 8387 + }, + { + "epoch": 0.48, + "grad_norm": 0.9694638436873771, + "learning_rate": 1.1069834285361299e-05, + "loss": 0.3581, + "step": 8388 + }, + { + "epoch": 0.48, + "grad_norm": 0.3307306363709223, + "learning_rate": 1.1067984034388963e-05, + "loss": 0.2843, + "step": 8389 + }, + { + "epoch": 0.48, + "grad_norm": 0.2628285063571431, + "learning_rate": 1.1066133746432388e-05, + "loss": 0.1729, + "step": 8390 + }, + { + "epoch": 0.48, + "grad_norm": 0.39208782700167216, + "learning_rate": 1.1064283421555643e-05, + "loss": 0.2981, + "step": 8391 + }, + { + "epoch": 0.48, + "grad_norm": 0.32337473433052144, + "learning_rate": 1.1062433059822813e-05, + "loss": 0.2115, + "step": 8392 + }, + { + "epoch": 0.48, + "grad_norm": 0.9751414436431275, + "learning_rate": 1.106058266129797e-05, + "loss": 0.4711, + "step": 8393 + }, + { + "epoch": 0.48, + "grad_norm": 0.917279257161967, + "learning_rate": 1.1058732226045195e-05, + "loss": 0.504, + "step": 8394 + }, + { + "epoch": 0.48, + "grad_norm": 0.27816877474586016, + "learning_rate": 1.1056881754128568e-05, + "loss": 0.2211, + "step": 8395 + }, + { + "epoch": 0.48, + "grad_norm": 0.4602594483651638, + "learning_rate": 1.1055031245612172e-05, + "loss": 0.3818, + "step": 8396 + }, + { + "epoch": 0.48, + "grad_norm": 0.2664228010575755, + "learning_rate": 1.1053180700560086e-05, + "loss": 0.1863, + "step": 8397 + }, + { + "epoch": 0.48, + "grad_norm": 0.36439137476338745, + "learning_rate": 1.1051330119036404e-05, + "loss": 0.2241, + "step": 8398 + }, + { + "epoch": 0.48, + "grad_norm": 0.638885939720972, + "learning_rate": 1.1049479501105202e-05, + "loss": 0.3925, + "step": 8399 + }, + { + "epoch": 0.48, + "grad_norm": 0.4978465949232517, + "learning_rate": 1.1047628846830571e-05, + "loss": 0.3931, + "step": 8400 + }, + { + "epoch": 0.48, + "grad_norm": 0.33079743434334197, + "learning_rate": 1.1045778156276596e-05, + "loss": 0.1901, + "step": 8401 + }, + { + "epoch": 0.48, + "grad_norm": 0.3101247786660534, + "learning_rate": 1.104392742950737e-05, + "loss": 0.2588, + "step": 8402 + }, + { + "epoch": 0.48, + "grad_norm": 0.2805944667132338, + "learning_rate": 1.104207666658698e-05, + "loss": 0.2896, + "step": 8403 + }, + { + "epoch": 0.48, + "grad_norm": 0.3730808898739765, + "learning_rate": 1.1040225867579522e-05, + "loss": 0.2604, + "step": 8404 + }, + { + "epoch": 0.48, + "grad_norm": 0.5112786993131957, + "learning_rate": 1.1038375032549085e-05, + "loss": 0.33, + "step": 8405 + }, + { + "epoch": 0.48, + "grad_norm": 0.9322563183427266, + "learning_rate": 1.1036524161559767e-05, + "loss": 0.5786, + "step": 8406 + }, + { + "epoch": 0.48, + "grad_norm": 0.355904824217904, + "learning_rate": 1.103467325467566e-05, + "loss": 0.2834, + "step": 8407 + }, + { + "epoch": 0.48, + "grad_norm": 0.2743782333531795, + "learning_rate": 1.1032822311960866e-05, + "loss": 0.2112, + "step": 8408 + }, + { + "epoch": 0.48, + "grad_norm": 0.4259052635545149, + "learning_rate": 1.1030971333479477e-05, + "loss": 0.3138, + "step": 8409 + }, + { + "epoch": 0.48, + "grad_norm": 0.37998773225911436, + "learning_rate": 1.1029120319295597e-05, + "loss": 0.3443, + "step": 8410 + }, + { + "epoch": 0.48, + "grad_norm": 0.34994216114806836, + "learning_rate": 1.1027269269473324e-05, + "loss": 0.2692, + "step": 8411 + }, + { + "epoch": 0.48, + "grad_norm": 1.0286726329274982, + "learning_rate": 1.102541818407676e-05, + "loss": 0.6472, + "step": 8412 + }, + { + "epoch": 0.48, + "grad_norm": 0.39969223773471535, + "learning_rate": 1.1023567063170008e-05, + "loss": 0.3095, + "step": 8413 + }, + { + "epoch": 0.48, + "grad_norm": 0.4792364386637519, + "learning_rate": 1.1021715906817172e-05, + "loss": 0.2102, + "step": 8414 + }, + { + "epoch": 0.48, + "grad_norm": 0.24998225105266594, + "learning_rate": 1.101986471508236e-05, + "loss": 0.2621, + "step": 8415 + }, + { + "epoch": 0.48, + "grad_norm": 0.3296303617196106, + "learning_rate": 1.1018013488029675e-05, + "loss": 0.272, + "step": 8416 + }, + { + "epoch": 0.48, + "grad_norm": 0.831873250616347, + "learning_rate": 1.1016162225723227e-05, + "loss": 0.5823, + "step": 8417 + }, + { + "epoch": 0.48, + "grad_norm": 0.3007402335020122, + "learning_rate": 1.1014310928227125e-05, + "loss": 0.2131, + "step": 8418 + }, + { + "epoch": 0.48, + "grad_norm": 0.364565455538296, + "learning_rate": 1.101245959560548e-05, + "loss": 0.2672, + "step": 8419 + }, + { + "epoch": 0.48, + "grad_norm": 0.3912456702815622, + "learning_rate": 1.1010608227922401e-05, + "loss": 0.3275, + "step": 8420 + }, + { + "epoch": 0.48, + "grad_norm": 0.4532113401173461, + "learning_rate": 1.1008756825242007e-05, + "loss": 0.2709, + "step": 8421 + }, + { + "epoch": 0.48, + "grad_norm": 0.5767832010251636, + "learning_rate": 1.10069053876284e-05, + "loss": 0.3767, + "step": 8422 + }, + { + "epoch": 0.48, + "grad_norm": 0.3190711161352703, + "learning_rate": 1.100505391514571e-05, + "loss": 0.309, + "step": 8423 + }, + { + "epoch": 0.48, + "grad_norm": 0.2566852014475039, + "learning_rate": 1.1003202407858042e-05, + "loss": 0.1297, + "step": 8424 + }, + { + "epoch": 0.48, + "grad_norm": 0.3887013888366776, + "learning_rate": 1.1001350865829519e-05, + "loss": 0.2731, + "step": 8425 + }, + { + "epoch": 0.48, + "grad_norm": 0.49546719830955543, + "learning_rate": 1.0999499289124259e-05, + "loss": 0.371, + "step": 8426 + }, + { + "epoch": 0.48, + "grad_norm": 0.3685020261407154, + "learning_rate": 1.0997647677806381e-05, + "loss": 0.318, + "step": 8427 + }, + { + "epoch": 0.48, + "grad_norm": 0.323905474118738, + "learning_rate": 1.0995796031940004e-05, + "loss": 0.2349, + "step": 8428 + }, + { + "epoch": 0.48, + "grad_norm": 0.5429412404412678, + "learning_rate": 1.0993944351589257e-05, + "loss": 0.4474, + "step": 8429 + }, + { + "epoch": 0.48, + "grad_norm": 0.2698732310913593, + "learning_rate": 1.0992092636818261e-05, + "loss": 0.1615, + "step": 8430 + }, + { + "epoch": 0.48, + "grad_norm": 0.2698012591809379, + "learning_rate": 1.0990240887691135e-05, + "loss": 0.2257, + "step": 8431 + }, + { + "epoch": 0.48, + "grad_norm": 0.5174891868486703, + "learning_rate": 1.0988389104272012e-05, + "loss": 0.3601, + "step": 8432 + }, + { + "epoch": 0.48, + "grad_norm": 0.6007606961336437, + "learning_rate": 1.098653728662502e-05, + "loss": 0.4257, + "step": 8433 + }, + { + "epoch": 0.48, + "grad_norm": 0.32474766627803614, + "learning_rate": 1.098468543481428e-05, + "loss": 0.2144, + "step": 8434 + }, + { + "epoch": 0.48, + "grad_norm": 0.39732993885930923, + "learning_rate": 1.0982833548903926e-05, + "loss": 0.3183, + "step": 8435 + }, + { + "epoch": 0.48, + "grad_norm": 0.26185582493114695, + "learning_rate": 1.0980981628958091e-05, + "loss": 0.2002, + "step": 8436 + }, + { + "epoch": 0.48, + "grad_norm": 0.3630987442855988, + "learning_rate": 1.0979129675040902e-05, + "loss": 0.1708, + "step": 8437 + }, + { + "epoch": 0.48, + "grad_norm": 0.4030675818976453, + "learning_rate": 1.0977277687216497e-05, + "loss": 0.3411, + "step": 8438 + }, + { + "epoch": 0.48, + "grad_norm": 0.37339136570184556, + "learning_rate": 1.0975425665549005e-05, + "loss": 0.3173, + "step": 8439 + }, + { + "epoch": 0.48, + "grad_norm": 0.8845059457123733, + "learning_rate": 1.0973573610102566e-05, + "loss": 0.4562, + "step": 8440 + }, + { + "epoch": 0.48, + "grad_norm": 0.31660898433428125, + "learning_rate": 1.0971721520941312e-05, + "loss": 0.239, + "step": 8441 + }, + { + "epoch": 0.49, + "grad_norm": 0.23668371578828956, + "learning_rate": 1.0969869398129385e-05, + "loss": 0.1924, + "step": 8442 + }, + { + "epoch": 0.49, + "grad_norm": 0.40438627675862854, + "learning_rate": 1.0968017241730922e-05, + "loss": 0.3176, + "step": 8443 + }, + { + "epoch": 0.49, + "grad_norm": 0.4396742144328353, + "learning_rate": 1.0966165051810066e-05, + "loss": 0.2638, + "step": 8444 + }, + { + "epoch": 0.49, + "grad_norm": 0.8565682090328727, + "learning_rate": 1.0964312828430952e-05, + "loss": 0.4941, + "step": 8445 + }, + { + "epoch": 0.49, + "grad_norm": 0.644386355055355, + "learning_rate": 1.096246057165773e-05, + "loss": 0.3571, + "step": 8446 + }, + { + "epoch": 0.49, + "grad_norm": 0.2632623269668236, + "learning_rate": 1.0960608281554536e-05, + "loss": 0.258, + "step": 8447 + }, + { + "epoch": 0.49, + "grad_norm": 0.3898501466449026, + "learning_rate": 1.0958755958185521e-05, + "loss": 0.2782, + "step": 8448 + }, + { + "epoch": 0.49, + "grad_norm": 0.39076503964584275, + "learning_rate": 1.0956903601614827e-05, + "loss": 0.2519, + "step": 8449 + }, + { + "epoch": 0.49, + "grad_norm": 0.34799939748395714, + "learning_rate": 1.0955051211906607e-05, + "loss": 0.2699, + "step": 8450 + }, + { + "epoch": 0.49, + "grad_norm": 0.34692010363267356, + "learning_rate": 1.0953198789125e-05, + "loss": 0.3284, + "step": 8451 + }, + { + "epoch": 0.49, + "grad_norm": 0.5508850692361121, + "learning_rate": 1.095134633333416e-05, + "loss": 0.3315, + "step": 8452 + }, + { + "epoch": 0.49, + "grad_norm": 0.38972264224855135, + "learning_rate": 1.0949493844598237e-05, + "loss": 0.3421, + "step": 8453 + }, + { + "epoch": 0.49, + "grad_norm": 0.2861316486348512, + "learning_rate": 1.0947641322981387e-05, + "loss": 0.2046, + "step": 8454 + }, + { + "epoch": 0.49, + "grad_norm": 0.4901872853125752, + "learning_rate": 1.0945788768547754e-05, + "loss": 0.3408, + "step": 8455 + }, + { + "epoch": 0.49, + "grad_norm": 0.34229442973648827, + "learning_rate": 1.0943936181361501e-05, + "loss": 0.3144, + "step": 8456 + }, + { + "epoch": 0.49, + "grad_norm": 0.30739139755930395, + "learning_rate": 1.0942083561486775e-05, + "loss": 0.1327, + "step": 8457 + }, + { + "epoch": 0.49, + "grad_norm": 0.3992928835707834, + "learning_rate": 1.0940230908987737e-05, + "loss": 0.3175, + "step": 8458 + }, + { + "epoch": 0.49, + "grad_norm": 0.2975848893390521, + "learning_rate": 1.0938378223928545e-05, + "loss": 0.2855, + "step": 8459 + }, + { + "epoch": 0.49, + "grad_norm": 0.6888931226475903, + "learning_rate": 1.0936525506373353e-05, + "loss": 0.3482, + "step": 8460 + }, + { + "epoch": 0.49, + "grad_norm": 0.5397706730536465, + "learning_rate": 1.0934672756386324e-05, + "loss": 0.4013, + "step": 8461 + }, + { + "epoch": 0.49, + "grad_norm": 0.2018669105288497, + "learning_rate": 1.0932819974031616e-05, + "loss": 0.1768, + "step": 8462 + }, + { + "epoch": 0.49, + "grad_norm": 0.3994888465967652, + "learning_rate": 1.0930967159373393e-05, + "loss": 0.2612, + "step": 8463 + }, + { + "epoch": 0.49, + "grad_norm": 0.7874372028522038, + "learning_rate": 1.0929114312475818e-05, + "loss": 0.4882, + "step": 8464 + }, + { + "epoch": 0.49, + "grad_norm": 0.3514672262598415, + "learning_rate": 1.0927261433403055e-05, + "loss": 0.3112, + "step": 8465 + }, + { + "epoch": 0.49, + "grad_norm": 0.5423505007021057, + "learning_rate": 1.092540852221927e-05, + "loss": 0.3962, + "step": 8466 + }, + { + "epoch": 0.49, + "grad_norm": 0.47049811026281263, + "learning_rate": 1.0923555578988624e-05, + "loss": 0.2611, + "step": 8467 + }, + { + "epoch": 0.49, + "grad_norm": 0.39710438531386655, + "learning_rate": 1.0921702603775288e-05, + "loss": 0.2879, + "step": 8468 + }, + { + "epoch": 0.49, + "grad_norm": 0.2792052805726842, + "learning_rate": 1.0919849596643434e-05, + "loss": 0.1829, + "step": 8469 + }, + { + "epoch": 0.49, + "grad_norm": 0.3047677379880632, + "learning_rate": 1.0917996557657224e-05, + "loss": 0.2355, + "step": 8470 + }, + { + "epoch": 0.49, + "grad_norm": 0.40934080266730394, + "learning_rate": 1.0916143486880836e-05, + "loss": 0.3056, + "step": 8471 + }, + { + "epoch": 0.49, + "grad_norm": 0.8182633785891162, + "learning_rate": 1.0914290384378436e-05, + "loss": 0.4636, + "step": 8472 + }, + { + "epoch": 0.49, + "grad_norm": 0.5738428159353489, + "learning_rate": 1.09124372502142e-05, + "loss": 0.284, + "step": 8473 + }, + { + "epoch": 0.49, + "grad_norm": 0.2757029445511652, + "learning_rate": 1.09105840844523e-05, + "loss": 0.2102, + "step": 8474 + }, + { + "epoch": 0.49, + "grad_norm": 0.24592269658330476, + "learning_rate": 1.0908730887156915e-05, + "loss": 0.2472, + "step": 8475 + }, + { + "epoch": 0.49, + "grad_norm": 0.4877436876419471, + "learning_rate": 1.090687765839222e-05, + "loss": 0.2014, + "step": 8476 + }, + { + "epoch": 0.49, + "grad_norm": 0.3949928498732002, + "learning_rate": 1.0905024398222386e-05, + "loss": 0.3212, + "step": 8477 + }, + { + "epoch": 0.49, + "grad_norm": 0.5094201877628184, + "learning_rate": 1.09031711067116e-05, + "loss": 0.3381, + "step": 8478 + }, + { + "epoch": 0.49, + "grad_norm": 0.9709515948124112, + "learning_rate": 1.0901317783924032e-05, + "loss": 0.4758, + "step": 8479 + }, + { + "epoch": 0.49, + "grad_norm": 0.3008208066982179, + "learning_rate": 1.0899464429923874e-05, + "loss": 0.2053, + "step": 8480 + }, + { + "epoch": 0.49, + "grad_norm": 0.2425174984408598, + "learning_rate": 1.0897611044775299e-05, + "loss": 0.1676, + "step": 8481 + }, + { + "epoch": 0.49, + "grad_norm": 0.3567217475863464, + "learning_rate": 1.0895757628542492e-05, + "loss": 0.3243, + "step": 8482 + }, + { + "epoch": 0.49, + "grad_norm": 0.3865743507662977, + "learning_rate": 1.0893904181289637e-05, + "loss": 0.2332, + "step": 8483 + }, + { + "epoch": 0.49, + "grad_norm": 0.9081131636400291, + "learning_rate": 1.0892050703080918e-05, + "loss": 0.4564, + "step": 8484 + }, + { + "epoch": 0.49, + "grad_norm": 1.337498254558959, + "learning_rate": 1.0890197193980523e-05, + "loss": 0.8662, + "step": 8485 + }, + { + "epoch": 0.49, + "grad_norm": 0.3029460382510471, + "learning_rate": 1.0888343654052636e-05, + "loss": 0.2082, + "step": 8486 + }, + { + "epoch": 0.49, + "grad_norm": 0.22595061674438283, + "learning_rate": 1.0886490083361445e-05, + "loss": 0.2207, + "step": 8487 + }, + { + "epoch": 0.49, + "grad_norm": 0.6513624736080968, + "learning_rate": 1.0884636481971145e-05, + "loss": 0.4554, + "step": 8488 + }, + { + "epoch": 0.49, + "grad_norm": 0.3546626256247301, + "learning_rate": 1.0882782849945917e-05, + "loss": 0.2338, + "step": 8489 + }, + { + "epoch": 0.49, + "grad_norm": 0.4004379725262802, + "learning_rate": 1.088092918734996e-05, + "loss": 0.3369, + "step": 8490 + }, + { + "epoch": 0.49, + "grad_norm": 1.06392676133309, + "learning_rate": 1.0879075494247459e-05, + "loss": 0.6164, + "step": 8491 + }, + { + "epoch": 0.49, + "grad_norm": 0.33643344970466654, + "learning_rate": 1.0877221770702618e-05, + "loss": 0.2818, + "step": 8492 + }, + { + "epoch": 0.49, + "grad_norm": 0.20948852930825718, + "learning_rate": 1.087536801677962e-05, + "loss": 0.1037, + "step": 8493 + }, + { + "epoch": 0.49, + "grad_norm": 0.3585673923696074, + "learning_rate": 1.0873514232542665e-05, + "loss": 0.3189, + "step": 8494 + }, + { + "epoch": 0.49, + "grad_norm": 0.35729435806128756, + "learning_rate": 1.0871660418055954e-05, + "loss": 0.2793, + "step": 8495 + }, + { + "epoch": 0.49, + "grad_norm": 0.616608207430578, + "learning_rate": 1.0869806573383675e-05, + "loss": 0.362, + "step": 8496 + }, + { + "epoch": 0.49, + "grad_norm": 1.3825893173323816, + "learning_rate": 1.0867952698590036e-05, + "loss": 0.6839, + "step": 8497 + }, + { + "epoch": 0.49, + "grad_norm": 0.27447229366512443, + "learning_rate": 1.0866098793739229e-05, + "loss": 0.2518, + "step": 8498 + }, + { + "epoch": 0.49, + "grad_norm": 0.21800662896904763, + "learning_rate": 1.0864244858895461e-05, + "loss": 0.1464, + "step": 8499 + }, + { + "epoch": 0.49, + "grad_norm": 0.5799123938435271, + "learning_rate": 1.086239089412293e-05, + "loss": 0.4081, + "step": 8500 + }, + { + "epoch": 0.49, + "grad_norm": 0.3538200919550498, + "learning_rate": 1.086053689948584e-05, + "loss": 0.2998, + "step": 8501 + }, + { + "epoch": 0.49, + "grad_norm": 0.441401374507026, + "learning_rate": 1.085868287504839e-05, + "loss": 0.3884, + "step": 8502 + }, + { + "epoch": 0.49, + "grad_norm": 0.4445663999073777, + "learning_rate": 1.0856828820874794e-05, + "loss": 0.2894, + "step": 8503 + }, + { + "epoch": 0.49, + "grad_norm": 0.3221303501831563, + "learning_rate": 1.0854974737029248e-05, + "loss": 0.2564, + "step": 8504 + }, + { + "epoch": 0.49, + "grad_norm": 0.2526415990634436, + "learning_rate": 1.0853120623575968e-05, + "loss": 0.1755, + "step": 8505 + }, + { + "epoch": 0.49, + "grad_norm": 0.3396586056153518, + "learning_rate": 1.0851266480579155e-05, + "loss": 0.2631, + "step": 8506 + }, + { + "epoch": 0.49, + "grad_norm": 0.32619397347087004, + "learning_rate": 1.0849412308103023e-05, + "loss": 0.2644, + "step": 8507 + }, + { + "epoch": 0.49, + "grad_norm": 0.8049881277903056, + "learning_rate": 1.0847558106211775e-05, + "loss": 0.6005, + "step": 8508 + }, + { + "epoch": 0.49, + "grad_norm": 0.39477084899592646, + "learning_rate": 1.0845703874969629e-05, + "loss": 0.2584, + "step": 8509 + }, + { + "epoch": 0.49, + "grad_norm": 0.3284641100010038, + "learning_rate": 1.0843849614440793e-05, + "loss": 0.2518, + "step": 8510 + }, + { + "epoch": 0.49, + "grad_norm": 0.3164534348135442, + "learning_rate": 1.0841995324689482e-05, + "loss": 0.2347, + "step": 8511 + }, + { + "epoch": 0.49, + "grad_norm": 0.6746338727776406, + "learning_rate": 1.0840141005779907e-05, + "loss": 0.3469, + "step": 8512 + }, + { + "epoch": 0.49, + "grad_norm": 0.3406299935814668, + "learning_rate": 1.0838286657776289e-05, + "loss": 0.2758, + "step": 8513 + }, + { + "epoch": 0.49, + "grad_norm": 0.368507523241613, + "learning_rate": 1.0836432280742837e-05, + "loss": 0.3614, + "step": 8514 + }, + { + "epoch": 0.49, + "grad_norm": 0.4196989763584233, + "learning_rate": 1.0834577874743772e-05, + "loss": 0.2835, + "step": 8515 + }, + { + "epoch": 0.49, + "grad_norm": 0.286344189141085, + "learning_rate": 1.0832723439843313e-05, + "loss": 0.2135, + "step": 8516 + }, + { + "epoch": 0.49, + "grad_norm": 0.8103735048771367, + "learning_rate": 1.0830868976105677e-05, + "loss": 0.4709, + "step": 8517 + }, + { + "epoch": 0.49, + "grad_norm": 0.3806559083984567, + "learning_rate": 1.0829014483595081e-05, + "loss": 0.3363, + "step": 8518 + }, + { + "epoch": 0.49, + "grad_norm": 0.22981873993975727, + "learning_rate": 1.0827159962375753e-05, + "loss": 0.1717, + "step": 8519 + }, + { + "epoch": 0.49, + "grad_norm": 1.1201661013138515, + "learning_rate": 1.0825305412511906e-05, + "loss": 0.7243, + "step": 8520 + }, + { + "epoch": 0.49, + "grad_norm": 0.29503616397144117, + "learning_rate": 1.0823450834067772e-05, + "loss": 0.2378, + "step": 8521 + }, + { + "epoch": 0.49, + "grad_norm": 0.2551033595031021, + "learning_rate": 1.0821596227107572e-05, + "loss": 0.2024, + "step": 8522 + }, + { + "epoch": 0.49, + "grad_norm": 0.7899047872546144, + "learning_rate": 1.0819741591695526e-05, + "loss": 0.4307, + "step": 8523 + }, + { + "epoch": 0.49, + "grad_norm": 0.6356314127947628, + "learning_rate": 1.0817886927895866e-05, + "loss": 0.4684, + "step": 8524 + }, + { + "epoch": 0.49, + "grad_norm": 0.344138473561077, + "learning_rate": 1.0816032235772816e-05, + "loss": 0.194, + "step": 8525 + }, + { + "epoch": 0.49, + "grad_norm": 0.3089788067804815, + "learning_rate": 1.0814177515390605e-05, + "loss": 0.297, + "step": 8526 + }, + { + "epoch": 0.49, + "grad_norm": 0.32417727591946566, + "learning_rate": 1.081232276681346e-05, + "loss": 0.2092, + "step": 8527 + }, + { + "epoch": 0.49, + "grad_norm": 0.4153972292522417, + "learning_rate": 1.0810467990105617e-05, + "loss": 0.2856, + "step": 8528 + }, + { + "epoch": 0.49, + "grad_norm": 0.655625681624118, + "learning_rate": 1.0808613185331297e-05, + "loss": 0.3203, + "step": 8529 + }, + { + "epoch": 0.49, + "grad_norm": 0.3785813411354443, + "learning_rate": 1.0806758352554743e-05, + "loss": 0.3061, + "step": 8530 + }, + { + "epoch": 0.49, + "grad_norm": 0.37824841761056816, + "learning_rate": 1.0804903491840178e-05, + "loss": 0.2755, + "step": 8531 + }, + { + "epoch": 0.49, + "grad_norm": 0.21373532506861836, + "learning_rate": 1.080304860325184e-05, + "loss": 0.1536, + "step": 8532 + }, + { + "epoch": 0.49, + "grad_norm": 0.4860820363746777, + "learning_rate": 1.0801193686853964e-05, + "loss": 0.3688, + "step": 8533 + }, + { + "epoch": 0.49, + "grad_norm": 0.2978394874490292, + "learning_rate": 1.0799338742710788e-05, + "loss": 0.24, + "step": 8534 + }, + { + "epoch": 0.49, + "grad_norm": 0.48456413182199815, + "learning_rate": 1.0797483770886542e-05, + "loss": 0.2991, + "step": 8535 + }, + { + "epoch": 0.49, + "grad_norm": 0.6425265613421929, + "learning_rate": 1.0795628771445467e-05, + "loss": 0.4898, + "step": 8536 + }, + { + "epoch": 0.49, + "grad_norm": 0.3882124012170663, + "learning_rate": 1.0793773744451804e-05, + "loss": 0.2956, + "step": 8537 + }, + { + "epoch": 0.49, + "grad_norm": 0.29591018567833727, + "learning_rate": 1.079191868996979e-05, + "loss": 0.2537, + "step": 8538 + }, + { + "epoch": 0.49, + "grad_norm": 0.275873528349889, + "learning_rate": 1.0790063608063664e-05, + "loss": 0.2111, + "step": 8539 + }, + { + "epoch": 0.49, + "grad_norm": 0.309022281433369, + "learning_rate": 1.078820849879767e-05, + "loss": 0.2702, + "step": 8540 + }, + { + "epoch": 0.49, + "grad_norm": 0.7002771887809298, + "learning_rate": 1.0786353362236051e-05, + "loss": 0.4943, + "step": 8541 + }, + { + "epoch": 0.49, + "grad_norm": 0.3367810286858073, + "learning_rate": 1.0784498198443048e-05, + "loss": 0.267, + "step": 8542 + }, + { + "epoch": 0.49, + "grad_norm": 0.6251041919918563, + "learning_rate": 1.0782643007482908e-05, + "loss": 0.3424, + "step": 8543 + }, + { + "epoch": 0.49, + "grad_norm": 0.4196247509404695, + "learning_rate": 1.0780787789419868e-05, + "loss": 0.3126, + "step": 8544 + }, + { + "epoch": 0.49, + "grad_norm": 0.222604196010212, + "learning_rate": 1.0778932544318185e-05, + "loss": 0.1972, + "step": 8545 + }, + { + "epoch": 0.49, + "grad_norm": 0.5323880097633098, + "learning_rate": 1.0777077272242103e-05, + "loss": 0.3519, + "step": 8546 + }, + { + "epoch": 0.49, + "grad_norm": 0.3522018391320144, + "learning_rate": 1.0775221973255866e-05, + "loss": 0.3304, + "step": 8547 + }, + { + "epoch": 0.49, + "grad_norm": 0.7935920052225027, + "learning_rate": 1.0773366647423724e-05, + "loss": 0.3471, + "step": 8548 + }, + { + "epoch": 0.49, + "grad_norm": 0.3614599034269359, + "learning_rate": 1.0771511294809933e-05, + "loss": 0.2682, + "step": 8549 + }, + { + "epoch": 0.49, + "grad_norm": 0.30816488025273653, + "learning_rate": 1.0769655915478734e-05, + "loss": 0.2914, + "step": 8550 + }, + { + "epoch": 0.49, + "grad_norm": 0.41552829457555635, + "learning_rate": 1.076780050949439e-05, + "loss": 0.2073, + "step": 8551 + }, + { + "epoch": 0.49, + "grad_norm": 0.3957754702487912, + "learning_rate": 1.0765945076921143e-05, + "loss": 0.3081, + "step": 8552 + }, + { + "epoch": 0.49, + "grad_norm": 0.3438857205809205, + "learning_rate": 1.0764089617823252e-05, + "loss": 0.2495, + "step": 8553 + }, + { + "epoch": 0.49, + "grad_norm": 0.3916725126911459, + "learning_rate": 1.0762234132264969e-05, + "loss": 0.3022, + "step": 8554 + }, + { + "epoch": 0.49, + "grad_norm": 0.3472478565503047, + "learning_rate": 1.0760378620310551e-05, + "loss": 0.2252, + "step": 8555 + }, + { + "epoch": 0.49, + "grad_norm": 0.5140747501812202, + "learning_rate": 1.0758523082024255e-05, + "loss": 0.39, + "step": 8556 + }, + { + "epoch": 0.49, + "grad_norm": 0.46369297394140513, + "learning_rate": 1.0756667517470337e-05, + "loss": 0.3532, + "step": 8557 + }, + { + "epoch": 0.49, + "grad_norm": 0.2763335707004524, + "learning_rate": 1.0754811926713053e-05, + "loss": 0.233, + "step": 8558 + }, + { + "epoch": 0.49, + "grad_norm": 0.2642442656817401, + "learning_rate": 1.075295630981667e-05, + "loss": 0.1955, + "step": 8559 + }, + { + "epoch": 0.49, + "grad_norm": 0.8901340050690897, + "learning_rate": 1.0751100666845437e-05, + "loss": 0.5739, + "step": 8560 + }, + { + "epoch": 0.49, + "grad_norm": 0.33780611056603577, + "learning_rate": 1.0749244997863624e-05, + "loss": 0.1916, + "step": 8561 + }, + { + "epoch": 0.49, + "grad_norm": 0.314626187389568, + "learning_rate": 1.0747389302935487e-05, + "loss": 0.2898, + "step": 8562 + }, + { + "epoch": 0.49, + "grad_norm": 0.5661987660912312, + "learning_rate": 1.074553358212529e-05, + "loss": 0.4068, + "step": 8563 + }, + { + "epoch": 0.49, + "grad_norm": 0.22613700731661518, + "learning_rate": 1.07436778354973e-05, + "loss": 0.1042, + "step": 8564 + }, + { + "epoch": 0.49, + "grad_norm": 0.26126861145961344, + "learning_rate": 1.0741822063115774e-05, + "loss": 0.2493, + "step": 8565 + }, + { + "epoch": 0.49, + "grad_norm": 0.48082663751471855, + "learning_rate": 1.0739966265044985e-05, + "loss": 0.3952, + "step": 8566 + }, + { + "epoch": 0.49, + "grad_norm": 0.776697669372256, + "learning_rate": 1.0738110441349194e-05, + "loss": 0.568, + "step": 8567 + }, + { + "epoch": 0.49, + "grad_norm": 0.35344836698827203, + "learning_rate": 1.0736254592092674e-05, + "loss": 0.2445, + "step": 8568 + }, + { + "epoch": 0.49, + "grad_norm": 0.41759160329707223, + "learning_rate": 1.0734398717339687e-05, + "loss": 0.3361, + "step": 8569 + }, + { + "epoch": 0.49, + "grad_norm": 0.36406742647874546, + "learning_rate": 1.0732542817154505e-05, + "loss": 0.268, + "step": 8570 + }, + { + "epoch": 0.49, + "grad_norm": 0.21027776814527477, + "learning_rate": 1.0730686891601394e-05, + "loss": 0.1567, + "step": 8571 + }, + { + "epoch": 0.49, + "grad_norm": 1.0486613084863996, + "learning_rate": 1.072883094074463e-05, + "loss": 0.66, + "step": 8572 + }, + { + "epoch": 0.49, + "grad_norm": 0.30409813856147094, + "learning_rate": 1.0726974964648478e-05, + "loss": 0.2823, + "step": 8573 + }, + { + "epoch": 0.49, + "grad_norm": 0.3632538787528237, + "learning_rate": 1.072511896337722e-05, + "loss": 0.2477, + "step": 8574 + }, + { + "epoch": 0.49, + "grad_norm": 0.6178813686029212, + "learning_rate": 1.0723262936995118e-05, + "loss": 0.4058, + "step": 8575 + }, + { + "epoch": 0.49, + "grad_norm": 0.5408942484954933, + "learning_rate": 1.0721406885566455e-05, + "loss": 0.3104, + "step": 8576 + }, + { + "epoch": 0.49, + "grad_norm": 0.22758907901569544, + "learning_rate": 1.07195508091555e-05, + "loss": 0.1969, + "step": 8577 + }, + { + "epoch": 0.49, + "grad_norm": 0.29891880470997145, + "learning_rate": 1.0717694707826534e-05, + "loss": 0.2564, + "step": 8578 + }, + { + "epoch": 0.49, + "grad_norm": 0.6764541746498948, + "learning_rate": 1.0715838581643829e-05, + "loss": 0.4602, + "step": 8579 + }, + { + "epoch": 0.49, + "grad_norm": 0.3694126858365215, + "learning_rate": 1.0713982430671668e-05, + "loss": 0.3039, + "step": 8580 + }, + { + "epoch": 0.49, + "grad_norm": 0.3388532266528095, + "learning_rate": 1.0712126254974325e-05, + "loss": 0.2766, + "step": 8581 + }, + { + "epoch": 0.49, + "grad_norm": 0.8196371361869945, + "learning_rate": 1.0710270054616077e-05, + "loss": 0.3736, + "step": 8582 + }, + { + "epoch": 0.49, + "grad_norm": 0.2575004355415229, + "learning_rate": 1.070841382966121e-05, + "loss": 0.2168, + "step": 8583 + }, + { + "epoch": 0.49, + "grad_norm": 0.3064963483696533, + "learning_rate": 1.0706557580174002e-05, + "loss": 0.1623, + "step": 8584 + }, + { + "epoch": 0.49, + "grad_norm": 0.30059761729633716, + "learning_rate": 1.0704701306218737e-05, + "loss": 0.2986, + "step": 8585 + }, + { + "epoch": 0.49, + "grad_norm": 0.3834769871882269, + "learning_rate": 1.0702845007859697e-05, + "loss": 0.2874, + "step": 8586 + }, + { + "epoch": 0.49, + "grad_norm": 0.7050746353337347, + "learning_rate": 1.0700988685161162e-05, + "loss": 0.3589, + "step": 8587 + }, + { + "epoch": 0.49, + "grad_norm": 0.8554268457849118, + "learning_rate": 1.069913233818742e-05, + "loss": 0.5188, + "step": 8588 + }, + { + "epoch": 0.49, + "grad_norm": 0.2600277824616293, + "learning_rate": 1.0697275967002754e-05, + "loss": 0.26, + "step": 8589 + }, + { + "epoch": 0.49, + "grad_norm": 0.2661738871990709, + "learning_rate": 1.069541957167145e-05, + "loss": 0.1824, + "step": 8590 + }, + { + "epoch": 0.49, + "grad_norm": 0.5728659494558793, + "learning_rate": 1.06935631522578e-05, + "loss": 0.2859, + "step": 8591 + }, + { + "epoch": 0.49, + "grad_norm": 0.36724855285541735, + "learning_rate": 1.0691706708826084e-05, + "loss": 0.3184, + "step": 8592 + }, + { + "epoch": 0.49, + "grad_norm": 0.36684883389905065, + "learning_rate": 1.0689850241440598e-05, + "loss": 0.3254, + "step": 8593 + }, + { + "epoch": 0.49, + "grad_norm": 0.448707775246444, + "learning_rate": 1.0687993750165623e-05, + "loss": 0.2209, + "step": 8594 + }, + { + "epoch": 0.49, + "grad_norm": 0.40597125420971497, + "learning_rate": 1.0686137235065458e-05, + "loss": 0.3169, + "step": 8595 + }, + { + "epoch": 0.49, + "grad_norm": 0.34023890211065355, + "learning_rate": 1.068428069620439e-05, + "loss": 0.2522, + "step": 8596 + }, + { + "epoch": 0.49, + "grad_norm": 0.41372972524648555, + "learning_rate": 1.0682424133646712e-05, + "loss": 0.275, + "step": 8597 + }, + { + "epoch": 0.49, + "grad_norm": 0.3490980753166338, + "learning_rate": 1.068056754745671e-05, + "loss": 0.2738, + "step": 8598 + }, + { + "epoch": 0.49, + "grad_norm": 0.9919442403924039, + "learning_rate": 1.0678710937698689e-05, + "loss": 0.7263, + "step": 8599 + }, + { + "epoch": 0.49, + "grad_norm": 0.477348484888539, + "learning_rate": 1.0676854304436936e-05, + "loss": 0.1544, + "step": 8600 + }, + { + "epoch": 0.49, + "grad_norm": 0.29238634288399545, + "learning_rate": 1.0674997647735745e-05, + "loss": 0.2659, + "step": 8601 + }, + { + "epoch": 0.49, + "grad_norm": 0.4137595889493302, + "learning_rate": 1.0673140967659418e-05, + "loss": 0.2705, + "step": 8602 + }, + { + "epoch": 0.49, + "grad_norm": 0.7976162801001311, + "learning_rate": 1.0671284264272249e-05, + "loss": 0.4373, + "step": 8603 + }, + { + "epoch": 0.49, + "grad_norm": 0.35688291511303527, + "learning_rate": 1.066942753763853e-05, + "loss": 0.2226, + "step": 8604 + }, + { + "epoch": 0.49, + "grad_norm": 0.28563980083841095, + "learning_rate": 1.0667570787822568e-05, + "loss": 0.2589, + "step": 8605 + }, + { + "epoch": 0.49, + "grad_norm": 0.8815030483266307, + "learning_rate": 1.0665714014888657e-05, + "loss": 0.5098, + "step": 8606 + }, + { + "epoch": 0.49, + "grad_norm": 0.3215761513411512, + "learning_rate": 1.0663857218901097e-05, + "loss": 0.2103, + "step": 8607 + }, + { + "epoch": 0.49, + "grad_norm": 0.9842923959569092, + "learning_rate": 1.0662000399924193e-05, + "loss": 0.4303, + "step": 8608 + }, + { + "epoch": 0.49, + "grad_norm": 0.34745505755938316, + "learning_rate": 1.066014355802224e-05, + "loss": 0.3268, + "step": 8609 + }, + { + "epoch": 0.49, + "grad_norm": 0.31255754616013104, + "learning_rate": 1.0658286693259544e-05, + "loss": 0.1985, + "step": 8610 + }, + { + "epoch": 0.49, + "grad_norm": 0.273327258396647, + "learning_rate": 1.065642980570041e-05, + "loss": 0.1981, + "step": 8611 + }, + { + "epoch": 0.49, + "grad_norm": 0.4237158009498341, + "learning_rate": 1.0654572895409142e-05, + "loss": 0.3071, + "step": 8612 + }, + { + "epoch": 0.49, + "grad_norm": 0.3323909110807824, + "learning_rate": 1.065271596245004e-05, + "loss": 0.2247, + "step": 8613 + }, + { + "epoch": 0.49, + "grad_norm": 1.197242401674588, + "learning_rate": 1.0650859006887412e-05, + "loss": 0.4439, + "step": 8614 + }, + { + "epoch": 0.49, + "grad_norm": 1.1780449474286145, + "learning_rate": 1.0649002028785564e-05, + "loss": 0.7566, + "step": 8615 + }, + { + "epoch": 0.5, + "grad_norm": 0.3025044117036119, + "learning_rate": 1.0647145028208808e-05, + "loss": 0.2614, + "step": 8616 + }, + { + "epoch": 0.5, + "grad_norm": 0.21862780541713553, + "learning_rate": 1.0645288005221443e-05, + "loss": 0.1848, + "step": 8617 + }, + { + "epoch": 0.5, + "grad_norm": 0.9973641936045772, + "learning_rate": 1.0643430959887786e-05, + "loss": 0.5432, + "step": 8618 + }, + { + "epoch": 0.5, + "grad_norm": 0.3686939049542852, + "learning_rate": 1.064157389227214e-05, + "loss": 0.2805, + "step": 8619 + }, + { + "epoch": 0.5, + "grad_norm": 1.3729969071937, + "learning_rate": 1.063971680243882e-05, + "loss": 0.3485, + "step": 8620 + }, + { + "epoch": 0.5, + "grad_norm": 0.41373947900301006, + "learning_rate": 1.063785969045213e-05, + "loss": 0.3296, + "step": 8621 + }, + { + "epoch": 0.5, + "grad_norm": 0.3333660307427822, + "learning_rate": 1.063600255637639e-05, + "loss": 0.2753, + "step": 8622 + }, + { + "epoch": 0.5, + "grad_norm": 0.1846235788447922, + "learning_rate": 1.0634145400275906e-05, + "loss": 0.0857, + "step": 8623 + }, + { + "epoch": 0.5, + "grad_norm": 0.4198005388095872, + "learning_rate": 1.0632288222214998e-05, + "loss": 0.333, + "step": 8624 + }, + { + "epoch": 0.5, + "grad_norm": 0.4751118320166682, + "learning_rate": 1.0630431022257975e-05, + "loss": 0.2883, + "step": 8625 + }, + { + "epoch": 0.5, + "grad_norm": 1.5403740119919342, + "learning_rate": 1.062857380046915e-05, + "loss": 0.3654, + "step": 8626 + }, + { + "epoch": 0.5, + "grad_norm": 0.906474785955322, + "learning_rate": 1.0626716556912845e-05, + "loss": 0.656, + "step": 8627 + }, + { + "epoch": 0.5, + "grad_norm": 0.40934538461942965, + "learning_rate": 1.062485929165337e-05, + "loss": 0.3045, + "step": 8628 + }, + { + "epoch": 0.5, + "grad_norm": 0.2483934265695432, + "learning_rate": 1.0623002004755045e-05, + "loss": 0.2398, + "step": 8629 + }, + { + "epoch": 0.5, + "grad_norm": 0.8469478281869853, + "learning_rate": 1.0621144696282187e-05, + "loss": 0.2506, + "step": 8630 + }, + { + "epoch": 0.5, + "grad_norm": 0.41460984148063096, + "learning_rate": 1.0619287366299116e-05, + "loss": 0.2685, + "step": 8631 + }, + { + "epoch": 0.5, + "grad_norm": 0.7057658416397729, + "learning_rate": 1.061743001487015e-05, + "loss": 0.3861, + "step": 8632 + }, + { + "epoch": 0.5, + "grad_norm": 0.44006599321635453, + "learning_rate": 1.0615572642059608e-05, + "loss": 0.2791, + "step": 8633 + }, + { + "epoch": 0.5, + "grad_norm": 0.35366409363198736, + "learning_rate": 1.0613715247931811e-05, + "loss": 0.2718, + "step": 8634 + }, + { + "epoch": 0.5, + "grad_norm": 0.28522502179300874, + "learning_rate": 1.0611857832551088e-05, + "loss": 0.1987, + "step": 8635 + }, + { + "epoch": 0.5, + "grad_norm": 0.4444119647481306, + "learning_rate": 1.0610000395981748e-05, + "loss": 0.2836, + "step": 8636 + }, + { + "epoch": 0.5, + "grad_norm": 0.29795136052669713, + "learning_rate": 1.0608142938288122e-05, + "loss": 0.2719, + "step": 8637 + }, + { + "epoch": 0.5, + "grad_norm": 0.7921438236185548, + "learning_rate": 1.0606285459534531e-05, + "loss": 0.4997, + "step": 8638 + }, + { + "epoch": 0.5, + "grad_norm": 1.113545353925166, + "learning_rate": 1.0604427959785305e-05, + "loss": 0.5553, + "step": 8639 + }, + { + "epoch": 0.5, + "grad_norm": 0.35533823163207906, + "learning_rate": 1.0602570439104758e-05, + "loss": 0.2753, + "step": 8640 + }, + { + "epoch": 0.5, + "grad_norm": 0.4131396122160858, + "learning_rate": 1.0600712897557229e-05, + "loss": 0.3235, + "step": 8641 + }, + { + "epoch": 0.5, + "grad_norm": 0.578787262125609, + "learning_rate": 1.0598855335207032e-05, + "loss": 0.3251, + "step": 8642 + }, + { + "epoch": 0.5, + "grad_norm": 0.2622329097119207, + "learning_rate": 1.0596997752118505e-05, + "loss": 0.1882, + "step": 8643 + }, + { + "epoch": 0.5, + "grad_norm": 1.6319084606170982, + "learning_rate": 1.0595140148355971e-05, + "loss": 0.7732, + "step": 8644 + }, + { + "epoch": 0.5, + "grad_norm": 0.38998737359035557, + "learning_rate": 1.059328252398376e-05, + "loss": 0.329, + "step": 8645 + }, + { + "epoch": 0.5, + "grad_norm": 0.3543340025838131, + "learning_rate": 1.0591424879066199e-05, + "loss": 0.1859, + "step": 8646 + }, + { + "epoch": 0.5, + "grad_norm": 0.4751509222241394, + "learning_rate": 1.058956721366762e-05, + "loss": 0.4029, + "step": 8647 + }, + { + "epoch": 0.5, + "grad_norm": 0.40849080293214024, + "learning_rate": 1.0587709527852354e-05, + "loss": 0.3217, + "step": 8648 + }, + { + "epoch": 0.5, + "grad_norm": 0.2225141351938953, + "learning_rate": 1.0585851821684731e-05, + "loss": 0.1289, + "step": 8649 + }, + { + "epoch": 0.5, + "grad_norm": 0.35872861250269855, + "learning_rate": 1.0583994095229086e-05, + "loss": 0.2722, + "step": 8650 + }, + { + "epoch": 0.5, + "grad_norm": 1.3321392581467473, + "learning_rate": 1.0582136348549751e-05, + "loss": 0.7329, + "step": 8651 + }, + { + "epoch": 0.5, + "grad_norm": 0.2864610099756312, + "learning_rate": 1.0580278581711062e-05, + "loss": 0.1966, + "step": 8652 + }, + { + "epoch": 0.5, + "grad_norm": 0.36678325852272914, + "learning_rate": 1.0578420794777347e-05, + "loss": 0.3385, + "step": 8653 + }, + { + "epoch": 0.5, + "grad_norm": 0.6607280902420078, + "learning_rate": 1.0576562987812946e-05, + "loss": 0.4297, + "step": 8654 + }, + { + "epoch": 0.5, + "grad_norm": 0.26838225983595704, + "learning_rate": 1.057470516088219e-05, + "loss": 0.2015, + "step": 8655 + }, + { + "epoch": 0.5, + "grad_norm": 0.23625021150066397, + "learning_rate": 1.0572847314049424e-05, + "loss": 0.2171, + "step": 8656 + }, + { + "epoch": 0.5, + "grad_norm": 1.3904254222964338, + "learning_rate": 1.0570989447378977e-05, + "loss": 0.8084, + "step": 8657 + }, + { + "epoch": 0.5, + "grad_norm": 0.3996383469966337, + "learning_rate": 1.056913156093519e-05, + "loss": 0.3012, + "step": 8658 + }, + { + "epoch": 0.5, + "grad_norm": 0.47744810436214125, + "learning_rate": 1.0567273654782402e-05, + "loss": 0.2646, + "step": 8659 + }, + { + "epoch": 0.5, + "grad_norm": 0.37398280158270103, + "learning_rate": 1.0565415728984954e-05, + "loss": 0.3085, + "step": 8660 + }, + { + "epoch": 0.5, + "grad_norm": 0.2956477490756631, + "learning_rate": 1.0563557783607182e-05, + "loss": 0.173, + "step": 8661 + }, + { + "epoch": 0.5, + "grad_norm": 0.26479803730153023, + "learning_rate": 1.0561699818713427e-05, + "loss": 0.1764, + "step": 8662 + }, + { + "epoch": 0.5, + "grad_norm": 1.1463907876064328, + "learning_rate": 1.0559841834368032e-05, + "loss": 0.6209, + "step": 8663 + }, + { + "epoch": 0.5, + "grad_norm": 0.3129762774498092, + "learning_rate": 1.055798383063534e-05, + "loss": 0.2687, + "step": 8664 + }, + { + "epoch": 0.5, + "grad_norm": 0.3635218135502235, + "learning_rate": 1.0556125807579691e-05, + "loss": 0.3221, + "step": 8665 + }, + { + "epoch": 0.5, + "grad_norm": 0.692553553878542, + "learning_rate": 1.0554267765265428e-05, + "loss": 0.3318, + "step": 8666 + }, + { + "epoch": 0.5, + "grad_norm": 0.3232964697528742, + "learning_rate": 1.0552409703756896e-05, + "loss": 0.2057, + "step": 8667 + }, + { + "epoch": 0.5, + "grad_norm": 0.2696835374164365, + "learning_rate": 1.0550551623118442e-05, + "loss": 0.2448, + "step": 8668 + }, + { + "epoch": 0.5, + "grad_norm": 0.4353801277831827, + "learning_rate": 1.0548693523414408e-05, + "loss": 0.3043, + "step": 8669 + }, + { + "epoch": 0.5, + "grad_norm": 0.6110278457480871, + "learning_rate": 1.0546835404709142e-05, + "loss": 0.3705, + "step": 8670 + }, + { + "epoch": 0.5, + "grad_norm": 0.4375873666361446, + "learning_rate": 1.0544977267066986e-05, + "loss": 0.3336, + "step": 8671 + }, + { + "epoch": 0.5, + "grad_norm": 0.3254332303228954, + "learning_rate": 1.0543119110552293e-05, + "loss": 0.2477, + "step": 8672 + }, + { + "epoch": 0.5, + "grad_norm": 0.4354120294911844, + "learning_rate": 1.054126093522941e-05, + "loss": 0.3037, + "step": 8673 + }, + { + "epoch": 0.5, + "grad_norm": 0.2439818838843636, + "learning_rate": 1.053940274116268e-05, + "loss": 0.1891, + "step": 8674 + }, + { + "epoch": 0.5, + "grad_norm": 1.0463292921514271, + "learning_rate": 1.0537544528416462e-05, + "loss": 0.396, + "step": 8675 + }, + { + "epoch": 0.5, + "grad_norm": 0.2922783263343222, + "learning_rate": 1.0535686297055095e-05, + "loss": 0.2758, + "step": 8676 + }, + { + "epoch": 0.5, + "grad_norm": 0.33915434052209564, + "learning_rate": 1.0533828047142936e-05, + "loss": 0.313, + "step": 8677 + }, + { + "epoch": 0.5, + "grad_norm": 1.2238073735137502, + "learning_rate": 1.0531969778744333e-05, + "loss": 0.7319, + "step": 8678 + }, + { + "epoch": 0.5, + "grad_norm": 0.24234025299640638, + "learning_rate": 1.0530111491923642e-05, + "loss": 0.1626, + "step": 8679 + }, + { + "epoch": 0.5, + "grad_norm": 0.2732911508199734, + "learning_rate": 1.0528253186745212e-05, + "loss": 0.2467, + "step": 8680 + }, + { + "epoch": 0.5, + "grad_norm": 0.5100215847515718, + "learning_rate": 1.05263948632734e-05, + "loss": 0.4121, + "step": 8681 + }, + { + "epoch": 0.5, + "grad_norm": 0.7570537100397893, + "learning_rate": 1.052453652157255e-05, + "loss": 0.2958, + "step": 8682 + }, + { + "epoch": 0.5, + "grad_norm": 0.40521850584754937, + "learning_rate": 1.0522678161707028e-05, + "loss": 0.3425, + "step": 8683 + }, + { + "epoch": 0.5, + "grad_norm": 0.35722274671332144, + "learning_rate": 1.0520819783741183e-05, + "loss": 0.3157, + "step": 8684 + }, + { + "epoch": 0.5, + "grad_norm": 0.378465025509005, + "learning_rate": 1.0518961387739371e-05, + "loss": 0.1266, + "step": 8685 + }, + { + "epoch": 0.5, + "grad_norm": 0.28385498048625774, + "learning_rate": 1.0517102973765947e-05, + "loss": 0.2468, + "step": 8686 + }, + { + "epoch": 0.5, + "grad_norm": 0.6575638629333442, + "learning_rate": 1.0515244541885272e-05, + "loss": 0.4461, + "step": 8687 + }, + { + "epoch": 0.5, + "grad_norm": 0.2883829375610862, + "learning_rate": 1.0513386092161698e-05, + "loss": 0.2233, + "step": 8688 + }, + { + "epoch": 0.5, + "grad_norm": 0.2855510500547569, + "learning_rate": 1.0511527624659585e-05, + "loss": 0.2701, + "step": 8689 + }, + { + "epoch": 0.5, + "grad_norm": 0.9124086442455803, + "learning_rate": 1.0509669139443298e-05, + "loss": 0.5854, + "step": 8690 + }, + { + "epoch": 0.5, + "grad_norm": 0.3336901059066564, + "learning_rate": 1.0507810636577183e-05, + "loss": 0.1928, + "step": 8691 + }, + { + "epoch": 0.5, + "grad_norm": 0.263736365371567, + "learning_rate": 1.0505952116125613e-05, + "loss": 0.2334, + "step": 8692 + }, + { + "epoch": 0.5, + "grad_norm": 0.731716892313344, + "learning_rate": 1.0504093578152939e-05, + "loss": 0.4945, + "step": 8693 + }, + { + "epoch": 0.5, + "grad_norm": 0.5263262316670798, + "learning_rate": 1.050223502272353e-05, + "loss": 0.3654, + "step": 8694 + }, + { + "epoch": 0.5, + "grad_norm": 0.2641345245374191, + "learning_rate": 1.050037644990174e-05, + "loss": 0.1886, + "step": 8695 + }, + { + "epoch": 0.5, + "grad_norm": 0.36928392656217834, + "learning_rate": 1.0498517859751937e-05, + "loss": 0.3137, + "step": 8696 + }, + { + "epoch": 0.5, + "grad_norm": 0.609889087343759, + "learning_rate": 1.0496659252338481e-05, + "loss": 0.3196, + "step": 8697 + }, + { + "epoch": 0.5, + "grad_norm": 0.32459381957992633, + "learning_rate": 1.049480062772574e-05, + "loss": 0.2378, + "step": 8698 + }, + { + "epoch": 0.5, + "grad_norm": 0.7666747207357101, + "learning_rate": 1.0492941985978068e-05, + "loss": 0.4289, + "step": 8699 + }, + { + "epoch": 0.5, + "grad_norm": 0.3039731492970164, + "learning_rate": 1.049108332715984e-05, + "loss": 0.2918, + "step": 8700 + }, + { + "epoch": 0.5, + "grad_norm": 0.30935907905723153, + "learning_rate": 1.048922465133542e-05, + "loss": 0.1984, + "step": 8701 + }, + { + "epoch": 0.5, + "grad_norm": 0.3842240130182527, + "learning_rate": 1.0487365958569168e-05, + "loss": 0.2887, + "step": 8702 + }, + { + "epoch": 0.5, + "grad_norm": 1.4564138910898665, + "learning_rate": 1.0485507248925455e-05, + "loss": 0.7804, + "step": 8703 + }, + { + "epoch": 0.5, + "grad_norm": 0.2959826982997899, + "learning_rate": 1.0483648522468648e-05, + "loss": 0.2633, + "step": 8704 + }, + { + "epoch": 0.5, + "grad_norm": 0.6727655285294992, + "learning_rate": 1.0481789779263112e-05, + "loss": 0.3068, + "step": 8705 + }, + { + "epoch": 0.5, + "grad_norm": 0.8354526542235158, + "learning_rate": 1.0479931019373218e-05, + "loss": 0.5327, + "step": 8706 + }, + { + "epoch": 0.5, + "grad_norm": 0.28734045291129795, + "learning_rate": 1.0478072242863329e-05, + "loss": 0.2392, + "step": 8707 + }, + { + "epoch": 0.5, + "grad_norm": 0.2747471101459275, + "learning_rate": 1.0476213449797823e-05, + "loss": 0.2075, + "step": 8708 + }, + { + "epoch": 0.5, + "grad_norm": 0.7019374127703849, + "learning_rate": 1.0474354640241065e-05, + "loss": 0.4068, + "step": 8709 + }, + { + "epoch": 0.5, + "grad_norm": 0.36357077422014367, + "learning_rate": 1.0472495814257426e-05, + "loss": 0.294, + "step": 8710 + }, + { + "epoch": 0.5, + "grad_norm": 0.6829568268139392, + "learning_rate": 1.0470636971911277e-05, + "loss": 0.3449, + "step": 8711 + }, + { + "epoch": 0.5, + "grad_norm": 0.3321892388960226, + "learning_rate": 1.046877811326699e-05, + "loss": 0.3047, + "step": 8712 + }, + { + "epoch": 0.5, + "grad_norm": 0.3844356675302627, + "learning_rate": 1.0466919238388937e-05, + "loss": 0.2823, + "step": 8713 + }, + { + "epoch": 0.5, + "grad_norm": 0.2569912841925734, + "learning_rate": 1.046506034734149e-05, + "loss": 0.1119, + "step": 8714 + }, + { + "epoch": 0.5, + "grad_norm": 0.5010661824434921, + "learning_rate": 1.0463201440189026e-05, + "loss": 0.3222, + "step": 8715 + }, + { + "epoch": 0.5, + "grad_norm": 0.3171331861477249, + "learning_rate": 1.0461342516995911e-05, + "loss": 0.2934, + "step": 8716 + }, + { + "epoch": 0.5, + "grad_norm": 0.43958228142956374, + "learning_rate": 1.0459483577826531e-05, + "loss": 0.3712, + "step": 8717 + }, + { + "epoch": 0.5, + "grad_norm": 0.3325947509858215, + "learning_rate": 1.0457624622745249e-05, + "loss": 0.2327, + "step": 8718 + }, + { + "epoch": 0.5, + "grad_norm": 0.4079029698952522, + "learning_rate": 1.0455765651816447e-05, + "loss": 0.2972, + "step": 8719 + }, + { + "epoch": 0.5, + "grad_norm": 0.23506748013692239, + "learning_rate": 1.0453906665104503e-05, + "loss": 0.2195, + "step": 8720 + }, + { + "epoch": 0.5, + "grad_norm": 0.5733380652274306, + "learning_rate": 1.045204766267379e-05, + "loss": 0.3302, + "step": 8721 + }, + { + "epoch": 0.5, + "grad_norm": 0.3160060820791429, + "learning_rate": 1.0450188644588684e-05, + "loss": 0.2767, + "step": 8722 + }, + { + "epoch": 0.5, + "grad_norm": 0.4478524014302271, + "learning_rate": 1.0448329610913566e-05, + "loss": 0.3633, + "step": 8723 + }, + { + "epoch": 0.5, + "grad_norm": 0.3882152266604375, + "learning_rate": 1.0446470561712811e-05, + "loss": 0.2507, + "step": 8724 + }, + { + "epoch": 0.5, + "grad_norm": 0.34054329567829944, + "learning_rate": 1.0444611497050802e-05, + "loss": 0.2896, + "step": 8725 + }, + { + "epoch": 0.5, + "grad_norm": 0.2992192854650521, + "learning_rate": 1.0442752416991912e-05, + "loss": 0.1979, + "step": 8726 + }, + { + "epoch": 0.5, + "grad_norm": 0.45370200964194424, + "learning_rate": 1.0440893321600529e-05, + "loss": 0.2756, + "step": 8727 + }, + { + "epoch": 0.5, + "grad_norm": 0.32295994917909365, + "learning_rate": 1.0439034210941029e-05, + "loss": 0.2951, + "step": 8728 + }, + { + "epoch": 0.5, + "grad_norm": 1.5271479543748887, + "learning_rate": 1.043717508507779e-05, + "loss": 0.6307, + "step": 8729 + }, + { + "epoch": 0.5, + "grad_norm": 1.2596569844559131, + "learning_rate": 1.0435315944075202e-05, + "loss": 0.8572, + "step": 8730 + }, + { + "epoch": 0.5, + "grad_norm": 0.3131215205253095, + "learning_rate": 1.0433456787997636e-05, + "loss": 0.198, + "step": 8731 + }, + { + "epoch": 0.5, + "grad_norm": 0.24968167949562506, + "learning_rate": 1.0431597616909483e-05, + "loss": 0.2025, + "step": 8732 + }, + { + "epoch": 0.5, + "grad_norm": 0.48335571779497233, + "learning_rate": 1.0429738430875123e-05, + "loss": 0.4133, + "step": 8733 + }, + { + "epoch": 0.5, + "grad_norm": 0.31200844208398004, + "learning_rate": 1.042787922995894e-05, + "loss": 0.1918, + "step": 8734 + }, + { + "epoch": 0.5, + "grad_norm": 0.45930456874730524, + "learning_rate": 1.0426020014225313e-05, + "loss": 0.3733, + "step": 8735 + }, + { + "epoch": 0.5, + "grad_norm": 0.40373907057363134, + "learning_rate": 1.0424160783738637e-05, + "loss": 0.342, + "step": 8736 + }, + { + "epoch": 0.5, + "grad_norm": 0.3350587314333596, + "learning_rate": 1.042230153856329e-05, + "loss": 0.2144, + "step": 8737 + }, + { + "epoch": 0.5, + "grad_norm": 0.3563876657519294, + "learning_rate": 1.0420442278763658e-05, + "loss": 0.2922, + "step": 8738 + }, + { + "epoch": 0.5, + "grad_norm": 0.31504314484019597, + "learning_rate": 1.0418583004404128e-05, + "loss": 0.2687, + "step": 8739 + }, + { + "epoch": 0.5, + "grad_norm": 0.30162193745736154, + "learning_rate": 1.0416723715549086e-05, + "loss": 0.2186, + "step": 8740 + }, + { + "epoch": 0.5, + "grad_norm": 1.3170050829277982, + "learning_rate": 1.041486441226292e-05, + "loss": 0.8277, + "step": 8741 + }, + { + "epoch": 0.5, + "grad_norm": 1.3552064424071086, + "learning_rate": 1.0413005094610018e-05, + "loss": 0.8414, + "step": 8742 + }, + { + "epoch": 0.5, + "grad_norm": 0.380534543132684, + "learning_rate": 1.0411145762654767e-05, + "loss": 0.2638, + "step": 8743 + }, + { + "epoch": 0.5, + "grad_norm": 0.36920878357651454, + "learning_rate": 1.0409286416461557e-05, + "loss": 0.2848, + "step": 8744 + }, + { + "epoch": 0.5, + "grad_norm": 0.2959147152051227, + "learning_rate": 1.0407427056094772e-05, + "loss": 0.2261, + "step": 8745 + }, + { + "epoch": 0.5, + "grad_norm": 0.34193258538459975, + "learning_rate": 1.040556768161881e-05, + "loss": 0.2593, + "step": 8746 + }, + { + "epoch": 0.5, + "grad_norm": 0.45975870258676144, + "learning_rate": 1.0403708293098054e-05, + "loss": 0.2501, + "step": 8747 + }, + { + "epoch": 0.5, + "grad_norm": 0.4738679517364789, + "learning_rate": 1.04018488905969e-05, + "loss": 0.3933, + "step": 8748 + }, + { + "epoch": 0.5, + "grad_norm": 0.3316696976128548, + "learning_rate": 1.0399989474179735e-05, + "loss": 0.2681, + "step": 8749 + }, + { + "epoch": 0.5, + "grad_norm": 0.567887887250414, + "learning_rate": 1.0398130043910949e-05, + "loss": 0.3258, + "step": 8750 + }, + { + "epoch": 0.5, + "grad_norm": 0.2892458714833695, + "learning_rate": 1.0396270599854939e-05, + "loss": 0.2585, + "step": 8751 + }, + { + "epoch": 0.5, + "grad_norm": 0.4389296101630068, + "learning_rate": 1.0394411142076092e-05, + "loss": 0.2814, + "step": 8752 + }, + { + "epoch": 0.5, + "grad_norm": 0.35415454086871245, + "learning_rate": 1.039255167063881e-05, + "loss": 0.2628, + "step": 8753 + }, + { + "epoch": 0.5, + "grad_norm": 0.6841238312775954, + "learning_rate": 1.0390692185607479e-05, + "loss": 0.3674, + "step": 8754 + }, + { + "epoch": 0.5, + "grad_norm": 0.41943936748101157, + "learning_rate": 1.0388832687046493e-05, + "loss": 0.3115, + "step": 8755 + }, + { + "epoch": 0.5, + "grad_norm": 0.3136131249500698, + "learning_rate": 1.0386973175020248e-05, + "loss": 0.3109, + "step": 8756 + }, + { + "epoch": 0.5, + "grad_norm": 0.2241031296136957, + "learning_rate": 1.0385113649593137e-05, + "loss": 0.1017, + "step": 8757 + }, + { + "epoch": 0.5, + "grad_norm": 0.25406672869397945, + "learning_rate": 1.0383254110829557e-05, + "loss": 0.2081, + "step": 8758 + }, + { + "epoch": 0.5, + "grad_norm": 0.3567819115485671, + "learning_rate": 1.0381394558793907e-05, + "loss": 0.3352, + "step": 8759 + }, + { + "epoch": 0.5, + "grad_norm": 0.8288749721657916, + "learning_rate": 1.0379534993550574e-05, + "loss": 0.4522, + "step": 8760 + }, + { + "epoch": 0.5, + "grad_norm": 0.35424045018318084, + "learning_rate": 1.0377675415163965e-05, + "loss": 0.2828, + "step": 8761 + }, + { + "epoch": 0.5, + "grad_norm": 0.7610154560364215, + "learning_rate": 1.0375815823698471e-05, + "loss": 0.4219, + "step": 8762 + }, + { + "epoch": 0.5, + "grad_norm": 0.3571386552082526, + "learning_rate": 1.0373956219218495e-05, + "loss": 0.2569, + "step": 8763 + }, + { + "epoch": 0.5, + "grad_norm": 0.27089130035377157, + "learning_rate": 1.0372096601788426e-05, + "loss": 0.2144, + "step": 8764 + }, + { + "epoch": 0.5, + "grad_norm": 0.4068884998575498, + "learning_rate": 1.0370236971472671e-05, + "loss": 0.251, + "step": 8765 + }, + { + "epoch": 0.5, + "grad_norm": 0.7412488012692768, + "learning_rate": 1.0368377328335623e-05, + "loss": 0.5212, + "step": 8766 + }, + { + "epoch": 0.5, + "grad_norm": 0.27216313191409564, + "learning_rate": 1.0366517672441687e-05, + "loss": 0.2245, + "step": 8767 + }, + { + "epoch": 0.5, + "grad_norm": 0.429758312937244, + "learning_rate": 1.0364658003855256e-05, + "loss": 0.3467, + "step": 8768 + }, + { + "epoch": 0.5, + "grad_norm": 0.4300319006050432, + "learning_rate": 1.0362798322640736e-05, + "loss": 0.2656, + "step": 8769 + }, + { + "epoch": 0.5, + "grad_norm": 0.2356666590382897, + "learning_rate": 1.0360938628862527e-05, + "loss": 0.1391, + "step": 8770 + }, + { + "epoch": 0.5, + "grad_norm": 0.3233756808129512, + "learning_rate": 1.0359078922585029e-05, + "loss": 0.2775, + "step": 8771 + }, + { + "epoch": 0.5, + "grad_norm": 0.8330387344459389, + "learning_rate": 1.0357219203872641e-05, + "loss": 0.4525, + "step": 8772 + }, + { + "epoch": 0.5, + "grad_norm": 0.34761547025612005, + "learning_rate": 1.035535947278977e-05, + "loss": 0.2182, + "step": 8773 + }, + { + "epoch": 0.5, + "grad_norm": 0.4854843573284205, + "learning_rate": 1.035349972940081e-05, + "loss": 0.3956, + "step": 8774 + }, + { + "epoch": 0.5, + "grad_norm": 0.3489720860066471, + "learning_rate": 1.0351639973770175e-05, + "loss": 0.2953, + "step": 8775 + }, + { + "epoch": 0.5, + "grad_norm": 0.29659453460115837, + "learning_rate": 1.0349780205962264e-05, + "loss": 0.1995, + "step": 8776 + }, + { + "epoch": 0.5, + "grad_norm": 0.2555721583072774, + "learning_rate": 1.0347920426041475e-05, + "loss": 0.188, + "step": 8777 + }, + { + "epoch": 0.5, + "grad_norm": 1.1484304454055863, + "learning_rate": 1.034606063407222e-05, + "loss": 0.4487, + "step": 8778 + }, + { + "epoch": 0.5, + "grad_norm": 0.3225354838444197, + "learning_rate": 1.0344200830118899e-05, + "loss": 0.2934, + "step": 8779 + }, + { + "epoch": 0.5, + "grad_norm": 0.36739556613949326, + "learning_rate": 1.0342341014245918e-05, + "loss": 0.2392, + "step": 8780 + }, + { + "epoch": 0.5, + "grad_norm": 0.7747310744381001, + "learning_rate": 1.0340481186517678e-05, + "loss": 0.4761, + "step": 8781 + }, + { + "epoch": 0.5, + "grad_norm": 0.27809320794294334, + "learning_rate": 1.0338621346998596e-05, + "loss": 0.2168, + "step": 8782 + }, + { + "epoch": 0.5, + "grad_norm": 0.399339632887001, + "learning_rate": 1.0336761495753067e-05, + "loss": 0.2646, + "step": 8783 + }, + { + "epoch": 0.5, + "grad_norm": 0.7950306926717371, + "learning_rate": 1.0334901632845504e-05, + "loss": 0.3684, + "step": 8784 + }, + { + "epoch": 0.5, + "grad_norm": 0.37529210841211885, + "learning_rate": 1.0333041758340312e-05, + "loss": 0.2767, + "step": 8785 + }, + { + "epoch": 0.5, + "grad_norm": 0.2740845981263997, + "learning_rate": 1.0331181872301898e-05, + "loss": 0.1405, + "step": 8786 + }, + { + "epoch": 0.5, + "grad_norm": 0.3420145569049249, + "learning_rate": 1.0329321974794671e-05, + "loss": 0.2961, + "step": 8787 + }, + { + "epoch": 0.5, + "grad_norm": 0.5831138037987301, + "learning_rate": 1.0327462065883036e-05, + "loss": 0.3274, + "step": 8788 + }, + { + "epoch": 0.5, + "grad_norm": 0.2896592046347293, + "learning_rate": 1.0325602145631403e-05, + "loss": 0.223, + "step": 8789 + }, + { + "epoch": 0.51, + "grad_norm": 0.43954585956748027, + "learning_rate": 1.0323742214104185e-05, + "loss": 0.3921, + "step": 8790 + }, + { + "epoch": 0.51, + "grad_norm": 0.3625988463122244, + "learning_rate": 1.0321882271365786e-05, + "loss": 0.2806, + "step": 8791 + }, + { + "epoch": 0.51, + "grad_norm": 0.3160804484447476, + "learning_rate": 1.0320022317480618e-05, + "loss": 0.2677, + "step": 8792 + }, + { + "epoch": 0.51, + "grad_norm": 1.0366060736983413, + "learning_rate": 1.031816235251309e-05, + "loss": 0.3159, + "step": 8793 + }, + { + "epoch": 0.51, + "grad_norm": 0.5889250828589246, + "learning_rate": 1.0316302376527616e-05, + "loss": 0.3359, + "step": 8794 + }, + { + "epoch": 0.51, + "grad_norm": 0.270751161601078, + "learning_rate": 1.0314442389588603e-05, + "loss": 0.2913, + "step": 8795 + }, + { + "epoch": 0.51, + "grad_norm": 0.6323219923443495, + "learning_rate": 1.0312582391760462e-05, + "loss": 0.3475, + "step": 8796 + }, + { + "epoch": 0.51, + "grad_norm": 0.4125990956262591, + "learning_rate": 1.0310722383107608e-05, + "loss": 0.2992, + "step": 8797 + }, + { + "epoch": 0.51, + "grad_norm": 0.23697489736616656, + "learning_rate": 1.030886236369445e-05, + "loss": 0.1779, + "step": 8798 + }, + { + "epoch": 0.51, + "grad_norm": 0.3704019970521971, + "learning_rate": 1.0307002333585404e-05, + "loss": 0.2598, + "step": 8799 + }, + { + "epoch": 0.51, + "grad_norm": 0.44374597809610156, + "learning_rate": 1.0305142292844876e-05, + "loss": 0.2887, + "step": 8800 + }, + { + "epoch": 0.51, + "grad_norm": 0.4554383476417022, + "learning_rate": 1.0303282241537287e-05, + "loss": 0.3395, + "step": 8801 + }, + { + "epoch": 0.51, + "grad_norm": 0.3874203617171426, + "learning_rate": 1.0301422179727045e-05, + "loss": 0.3243, + "step": 8802 + }, + { + "epoch": 0.51, + "grad_norm": 0.3125693395295528, + "learning_rate": 1.0299562107478569e-05, + "loss": 0.2679, + "step": 8803 + }, + { + "epoch": 0.51, + "grad_norm": 0.25728668270074057, + "learning_rate": 1.0297702024856268e-05, + "loss": 0.1966, + "step": 8804 + }, + { + "epoch": 0.51, + "grad_norm": 1.0480113904388246, + "learning_rate": 1.0295841931924559e-05, + "loss": 0.5405, + "step": 8805 + }, + { + "epoch": 0.51, + "grad_norm": 0.663364129540765, + "learning_rate": 1.0293981828747857e-05, + "loss": 0.2988, + "step": 8806 + }, + { + "epoch": 0.51, + "grad_norm": 0.2966797163973741, + "learning_rate": 1.0292121715390576e-05, + "loss": 0.2727, + "step": 8807 + }, + { + "epoch": 0.51, + "grad_norm": 0.6970721071112972, + "learning_rate": 1.0290261591917132e-05, + "loss": 0.5367, + "step": 8808 + }, + { + "epoch": 0.51, + "grad_norm": 0.38124847024879227, + "learning_rate": 1.0288401458391943e-05, + "loss": 0.1685, + "step": 8809 + }, + { + "epoch": 0.51, + "grad_norm": 0.2504095296687366, + "learning_rate": 1.0286541314879424e-05, + "loss": 0.2031, + "step": 8810 + }, + { + "epoch": 0.51, + "grad_norm": 0.3809688844646517, + "learning_rate": 1.028468116144399e-05, + "loss": 0.3369, + "step": 8811 + }, + { + "epoch": 0.51, + "grad_norm": 0.4955121367154073, + "learning_rate": 1.028282099815006e-05, + "loss": 0.2312, + "step": 8812 + }, + { + "epoch": 0.51, + "grad_norm": 0.4093725872251935, + "learning_rate": 1.0280960825062054e-05, + "loss": 0.3284, + "step": 8813 + }, + { + "epoch": 0.51, + "grad_norm": 1.1636539511340667, + "learning_rate": 1.0279100642244382e-05, + "loss": 0.8014, + "step": 8814 + }, + { + "epoch": 0.51, + "grad_norm": 0.28726866422333247, + "learning_rate": 1.027724044976147e-05, + "loss": 0.2295, + "step": 8815 + }, + { + "epoch": 0.51, + "grad_norm": 0.24419854470114155, + "learning_rate": 1.0275380247677733e-05, + "loss": 0.1984, + "step": 8816 + }, + { + "epoch": 0.51, + "grad_norm": 0.9213705414253189, + "learning_rate": 1.0273520036057587e-05, + "loss": 0.517, + "step": 8817 + }, + { + "epoch": 0.51, + "grad_norm": 0.4880627076951752, + "learning_rate": 1.0271659814965457e-05, + "loss": 0.3437, + "step": 8818 + }, + { + "epoch": 0.51, + "grad_norm": 0.25429892494500267, + "learning_rate": 1.0269799584465758e-05, + "loss": 0.2382, + "step": 8819 + }, + { + "epoch": 0.51, + "grad_norm": 1.1443560930208179, + "learning_rate": 1.0267939344622912e-05, + "loss": 0.7465, + "step": 8820 + }, + { + "epoch": 0.51, + "grad_norm": 0.47916608663820603, + "learning_rate": 1.0266079095501338e-05, + "loss": 0.3078, + "step": 8821 + }, + { + "epoch": 0.51, + "grad_norm": 0.22200518903738178, + "learning_rate": 1.0264218837165459e-05, + "loss": 0.1626, + "step": 8822 + }, + { + "epoch": 0.51, + "grad_norm": 0.38383767303997884, + "learning_rate": 1.0262358569679686e-05, + "loss": 0.3217, + "step": 8823 + }, + { + "epoch": 0.51, + "grad_norm": 0.6558207841443644, + "learning_rate": 1.0260498293108452e-05, + "loss": 0.4036, + "step": 8824 + }, + { + "epoch": 0.51, + "grad_norm": 0.3591505119739801, + "learning_rate": 1.025863800751617e-05, + "loss": 0.2283, + "step": 8825 + }, + { + "epoch": 0.51, + "grad_norm": 0.36957746518241613, + "learning_rate": 1.025677771296727e-05, + "loss": 0.3354, + "step": 8826 + }, + { + "epoch": 0.51, + "grad_norm": 1.4188426062019395, + "learning_rate": 1.0254917409526163e-05, + "loss": 0.7654, + "step": 8827 + }, + { + "epoch": 0.51, + "grad_norm": 0.30075617474235133, + "learning_rate": 1.0253057097257281e-05, + "loss": 0.2479, + "step": 8828 + }, + { + "epoch": 0.51, + "grad_norm": 0.3521081179987658, + "learning_rate": 1.025119677622504e-05, + "loss": 0.2155, + "step": 8829 + }, + { + "epoch": 0.51, + "grad_norm": 0.4290538039923925, + "learning_rate": 1.0249336446493869e-05, + "loss": 0.3082, + "step": 8830 + }, + { + "epoch": 0.51, + "grad_norm": 0.28813142501039557, + "learning_rate": 1.0247476108128183e-05, + "loss": 0.2763, + "step": 8831 + }, + { + "epoch": 0.51, + "grad_norm": 0.7845090507131877, + "learning_rate": 1.0245615761192414e-05, + "loss": 0.4415, + "step": 8832 + }, + { + "epoch": 0.51, + "grad_norm": 0.8072316896704939, + "learning_rate": 1.024375540575098e-05, + "loss": 0.4035, + "step": 8833 + }, + { + "epoch": 0.51, + "grad_norm": 0.2726303021214843, + "learning_rate": 1.0241895041868306e-05, + "loss": 0.2369, + "step": 8834 + }, + { + "epoch": 0.51, + "grad_norm": 0.36492778874634096, + "learning_rate": 1.024003466960882e-05, + "loss": 0.2715, + "step": 8835 + }, + { + "epoch": 0.51, + "grad_norm": 0.360313855746173, + "learning_rate": 1.0238174289036942e-05, + "loss": 0.2589, + "step": 8836 + }, + { + "epoch": 0.51, + "grad_norm": 0.36305815876103154, + "learning_rate": 1.0236313900217099e-05, + "loss": 0.2774, + "step": 8837 + }, + { + "epoch": 0.51, + "grad_norm": 0.35402837649284924, + "learning_rate": 1.0234453503213715e-05, + "loss": 0.286, + "step": 8838 + }, + { + "epoch": 0.51, + "grad_norm": 0.6168437114744936, + "learning_rate": 1.0232593098091215e-05, + "loss": 0.3838, + "step": 8839 + }, + { + "epoch": 0.51, + "grad_norm": 0.4155028386232177, + "learning_rate": 1.0230732684914029e-05, + "loss": 0.3072, + "step": 8840 + }, + { + "epoch": 0.51, + "grad_norm": 0.5305176733751163, + "learning_rate": 1.022887226374658e-05, + "loss": 0.4033, + "step": 8841 + }, + { + "epoch": 0.51, + "grad_norm": 0.20583299188599177, + "learning_rate": 1.022701183465329e-05, + "loss": 0.1745, + "step": 8842 + }, + { + "epoch": 0.51, + "grad_norm": 0.3890161245544743, + "learning_rate": 1.0225151397698597e-05, + "loss": 0.3028, + "step": 8843 + }, + { + "epoch": 0.51, + "grad_norm": 0.9808698140562502, + "learning_rate": 1.0223290952946914e-05, + "loss": 0.6994, + "step": 8844 + }, + { + "epoch": 0.51, + "grad_norm": 0.6121490823974144, + "learning_rate": 1.0221430500462677e-05, + "loss": 0.3077, + "step": 8845 + }, + { + "epoch": 0.51, + "grad_norm": 0.3066633720162846, + "learning_rate": 1.0219570040310312e-05, + "loss": 0.2655, + "step": 8846 + }, + { + "epoch": 0.51, + "grad_norm": 0.34538918095313825, + "learning_rate": 1.0217709572554247e-05, + "loss": 0.3273, + "step": 8847 + }, + { + "epoch": 0.51, + "grad_norm": 0.1859650911764527, + "learning_rate": 1.0215849097258905e-05, + "loss": 0.0884, + "step": 8848 + }, + { + "epoch": 0.51, + "grad_norm": 0.3703599398367777, + "learning_rate": 1.0213988614488721e-05, + "loss": 0.289, + "step": 8849 + }, + { + "epoch": 0.51, + "grad_norm": 0.4604316756843436, + "learning_rate": 1.0212128124308121e-05, + "loss": 0.3816, + "step": 8850 + }, + { + "epoch": 0.51, + "grad_norm": 0.38250000158150516, + "learning_rate": 1.0210267626781532e-05, + "loss": 0.2585, + "step": 8851 + }, + { + "epoch": 0.51, + "grad_norm": 0.350693722826386, + "learning_rate": 1.0208407121973383e-05, + "loss": 0.3005, + "step": 8852 + }, + { + "epoch": 0.51, + "grad_norm": 0.7957509715262783, + "learning_rate": 1.0206546609948107e-05, + "loss": 0.4827, + "step": 8853 + }, + { + "epoch": 0.51, + "grad_norm": 0.235842723340732, + "learning_rate": 1.020468609077013e-05, + "loss": 0.216, + "step": 8854 + }, + { + "epoch": 0.51, + "grad_norm": 0.30607668526376075, + "learning_rate": 1.0202825564503885e-05, + "loss": 0.2021, + "step": 8855 + }, + { + "epoch": 0.51, + "grad_norm": 1.1314724300561396, + "learning_rate": 1.0200965031213795e-05, + "loss": 0.7834, + "step": 8856 + }, + { + "epoch": 0.51, + "grad_norm": 0.7721586850426669, + "learning_rate": 1.0199104490964296e-05, + "loss": 0.4185, + "step": 8857 + }, + { + "epoch": 0.51, + "grad_norm": 0.3153541338301299, + "learning_rate": 1.0197243943819816e-05, + "loss": 0.2, + "step": 8858 + }, + { + "epoch": 0.51, + "grad_norm": 0.3830402764051056, + "learning_rate": 1.0195383389844789e-05, + "loss": 0.3396, + "step": 8859 + }, + { + "epoch": 0.51, + "grad_norm": 0.2716412665480148, + "learning_rate": 1.0193522829103643e-05, + "loss": 0.183, + "step": 8860 + }, + { + "epoch": 0.51, + "grad_norm": 0.3956289122519081, + "learning_rate": 1.0191662261660809e-05, + "loss": 0.2267, + "step": 8861 + }, + { + "epoch": 0.51, + "grad_norm": 0.3719950768589626, + "learning_rate": 1.018980168758072e-05, + "loss": 0.3198, + "step": 8862 + }, + { + "epoch": 0.51, + "grad_norm": 1.073061393618599, + "learning_rate": 1.0187941106927803e-05, + "loss": 0.4103, + "step": 8863 + }, + { + "epoch": 0.51, + "grad_norm": 0.3268498389256042, + "learning_rate": 1.0186080519766499e-05, + "loss": 0.2171, + "step": 8864 + }, + { + "epoch": 0.51, + "grad_norm": 0.932570603873562, + "learning_rate": 1.0184219926161229e-05, + "loss": 0.5862, + "step": 8865 + }, + { + "epoch": 0.51, + "grad_norm": 0.2876069287491476, + "learning_rate": 1.0182359326176437e-05, + "loss": 0.2418, + "step": 8866 + }, + { + "epoch": 0.51, + "grad_norm": 0.2760938844417518, + "learning_rate": 1.0180498719876546e-05, + "loss": 0.2243, + "step": 8867 + }, + { + "epoch": 0.51, + "grad_norm": 0.9738896002035565, + "learning_rate": 1.0178638107325993e-05, + "loss": 0.5697, + "step": 8868 + }, + { + "epoch": 0.51, + "grad_norm": 0.9192592820798557, + "learning_rate": 1.0176777488589206e-05, + "loss": 0.4044, + "step": 8869 + }, + { + "epoch": 0.51, + "grad_norm": 0.2846380947105442, + "learning_rate": 1.0174916863730628e-05, + "loss": 0.244, + "step": 8870 + }, + { + "epoch": 0.51, + "grad_norm": 0.4158621955491559, + "learning_rate": 1.0173056232814684e-05, + "loss": 0.2808, + "step": 8871 + }, + { + "epoch": 0.51, + "grad_norm": 0.4635268387198839, + "learning_rate": 1.0171195595905811e-05, + "loss": 0.2996, + "step": 8872 + }, + { + "epoch": 0.51, + "grad_norm": 0.33203805455122837, + "learning_rate": 1.0169334953068442e-05, + "loss": 0.2639, + "step": 8873 + }, + { + "epoch": 0.51, + "grad_norm": 0.3697048151949442, + "learning_rate": 1.0167474304367011e-05, + "loss": 0.2824, + "step": 8874 + }, + { + "epoch": 0.51, + "grad_norm": 0.4844233725817165, + "learning_rate": 1.0165613649865951e-05, + "loss": 0.3137, + "step": 8875 + }, + { + "epoch": 0.51, + "grad_norm": 0.3984242851912228, + "learning_rate": 1.0163752989629698e-05, + "loss": 0.2471, + "step": 8876 + }, + { + "epoch": 0.51, + "grad_norm": 0.47180498037132856, + "learning_rate": 1.0161892323722684e-05, + "loss": 0.274, + "step": 8877 + }, + { + "epoch": 0.51, + "grad_norm": 0.3669597020882024, + "learning_rate": 1.0160031652209348e-05, + "loss": 0.2947, + "step": 8878 + }, + { + "epoch": 0.51, + "grad_norm": 0.386735586286777, + "learning_rate": 1.0158170975154121e-05, + "loss": 0.2804, + "step": 8879 + }, + { + "epoch": 0.51, + "grad_norm": 0.472778223664258, + "learning_rate": 1.015631029262144e-05, + "loss": 0.3634, + "step": 8880 + }, + { + "epoch": 0.51, + "grad_norm": 0.31498560183851326, + "learning_rate": 1.0154449604675745e-05, + "loss": 0.2056, + "step": 8881 + }, + { + "epoch": 0.51, + "grad_norm": 0.31106123240309985, + "learning_rate": 1.015258891138146e-05, + "loss": 0.2597, + "step": 8882 + }, + { + "epoch": 0.51, + "grad_norm": 0.32163140709549864, + "learning_rate": 1.0150728212803034e-05, + "loss": 0.248, + "step": 8883 + }, + { + "epoch": 0.51, + "grad_norm": 1.0163468760891674, + "learning_rate": 1.0148867509004892e-05, + "loss": 0.3417, + "step": 8884 + }, + { + "epoch": 0.51, + "grad_norm": 0.3535521516167228, + "learning_rate": 1.0147006800051475e-05, + "loss": 0.273, + "step": 8885 + }, + { + "epoch": 0.51, + "grad_norm": 0.32974085632081673, + "learning_rate": 1.0145146086007219e-05, + "loss": 0.3063, + "step": 8886 + }, + { + "epoch": 0.51, + "grad_norm": 0.37848673394257054, + "learning_rate": 1.0143285366936562e-05, + "loss": 0.2336, + "step": 8887 + }, + { + "epoch": 0.51, + "grad_norm": 0.27078939239282207, + "learning_rate": 1.0141424642903936e-05, + "loss": 0.208, + "step": 8888 + }, + { + "epoch": 0.51, + "grad_norm": 1.0221622920030797, + "learning_rate": 1.0139563913973787e-05, + "loss": 0.5142, + "step": 8889 + }, + { + "epoch": 0.51, + "grad_norm": 0.34970550608599016, + "learning_rate": 1.0137703180210538e-05, + "loss": 0.2834, + "step": 8890 + }, + { + "epoch": 0.51, + "grad_norm": 0.35189717797611997, + "learning_rate": 1.0135842441678639e-05, + "loss": 0.2754, + "step": 8891 + }, + { + "epoch": 0.51, + "grad_norm": 0.6613352038736564, + "learning_rate": 1.0133981698442519e-05, + "loss": 0.411, + "step": 8892 + }, + { + "epoch": 0.51, + "grad_norm": 0.49333120019293597, + "learning_rate": 1.013212095056662e-05, + "loss": 0.4239, + "step": 8893 + }, + { + "epoch": 0.51, + "grad_norm": 0.2691281560347184, + "learning_rate": 1.0130260198115376e-05, + "loss": 0.2087, + "step": 8894 + }, + { + "epoch": 0.51, + "grad_norm": 0.25133156816965974, + "learning_rate": 1.012839944115323e-05, + "loss": 0.1896, + "step": 8895 + }, + { + "epoch": 0.51, + "grad_norm": 0.644446483143422, + "learning_rate": 1.0126538679744615e-05, + "loss": 0.4346, + "step": 8896 + }, + { + "epoch": 0.51, + "grad_norm": 0.31080535051286085, + "learning_rate": 1.0124677913953971e-05, + "loss": 0.2094, + "step": 8897 + }, + { + "epoch": 0.51, + "grad_norm": 0.2925056576358622, + "learning_rate": 1.0122817143845736e-05, + "loss": 0.29, + "step": 8898 + }, + { + "epoch": 0.51, + "grad_norm": 1.1700861136940273, + "learning_rate": 1.0120956369484352e-05, + "loss": 0.8421, + "step": 8899 + }, + { + "epoch": 0.51, + "grad_norm": 0.18646929013358124, + "learning_rate": 1.011909559093425e-05, + "loss": 0.1259, + "step": 8900 + }, + { + "epoch": 0.51, + "grad_norm": 0.5031315658177309, + "learning_rate": 1.0117234808259875e-05, + "loss": 0.3329, + "step": 8901 + }, + { + "epoch": 0.51, + "grad_norm": 0.3837696227726515, + "learning_rate": 1.0115374021525664e-05, + "loss": 0.2908, + "step": 8902 + }, + { + "epoch": 0.51, + "grad_norm": 0.3224777264779916, + "learning_rate": 1.0113513230796052e-05, + "loss": 0.2378, + "step": 8903 + }, + { + "epoch": 0.51, + "grad_norm": 0.7486941413631881, + "learning_rate": 1.0111652436135486e-05, + "loss": 0.4879, + "step": 8904 + }, + { + "epoch": 0.51, + "grad_norm": 0.4577781436154072, + "learning_rate": 1.01097916376084e-05, + "loss": 0.4012, + "step": 8905 + }, + { + "epoch": 0.51, + "grad_norm": 0.3227090641491666, + "learning_rate": 1.0107930835279234e-05, + "loss": 0.2838, + "step": 8906 + }, + { + "epoch": 0.51, + "grad_norm": 0.25374936217959587, + "learning_rate": 1.0106070029212424e-05, + "loss": 0.1262, + "step": 8907 + }, + { + "epoch": 0.51, + "grad_norm": 0.6524870029201337, + "learning_rate": 1.0104209219472418e-05, + "loss": 0.3958, + "step": 8908 + }, + { + "epoch": 0.51, + "grad_norm": 0.37586082968520385, + "learning_rate": 1.010234840612365e-05, + "loss": 0.2897, + "step": 8909 + }, + { + "epoch": 0.51, + "grad_norm": 0.3307571873747576, + "learning_rate": 1.010048758923056e-05, + "loss": 0.2648, + "step": 8910 + }, + { + "epoch": 0.51, + "grad_norm": 0.8419980738165997, + "learning_rate": 1.0098626768857591e-05, + "loss": 0.6101, + "step": 8911 + }, + { + "epoch": 0.51, + "grad_norm": 0.6204223015862753, + "learning_rate": 1.009676594506918e-05, + "loss": 0.3407, + "step": 8912 + }, + { + "epoch": 0.51, + "grad_norm": 0.28043986557667766, + "learning_rate": 1.0094905117929767e-05, + "loss": 0.1755, + "step": 8913 + }, + { + "epoch": 0.51, + "grad_norm": 0.35706107090418515, + "learning_rate": 1.0093044287503797e-05, + "loss": 0.3173, + "step": 8914 + }, + { + "epoch": 0.51, + "grad_norm": 0.564795786408195, + "learning_rate": 1.0091183453855706e-05, + "loss": 0.2933, + "step": 8915 + }, + { + "epoch": 0.51, + "grad_norm": 0.37689587316033274, + "learning_rate": 1.0089322617049936e-05, + "loss": 0.3088, + "step": 8916 + }, + { + "epoch": 0.51, + "grad_norm": 0.4513654962243863, + "learning_rate": 1.0087461777150926e-05, + "loss": 0.2704, + "step": 8917 + }, + { + "epoch": 0.51, + "grad_norm": 0.3927024710302205, + "learning_rate": 1.0085600934223121e-05, + "loss": 0.2996, + "step": 8918 + }, + { + "epoch": 0.51, + "grad_norm": 0.26729943984490195, + "learning_rate": 1.008374008833096e-05, + "loss": 0.2113, + "step": 8919 + }, + { + "epoch": 0.51, + "grad_norm": 0.6478463082866031, + "learning_rate": 1.0081879239538881e-05, + "loss": 0.332, + "step": 8920 + }, + { + "epoch": 0.51, + "grad_norm": 0.3996713800799195, + "learning_rate": 1.0080018387911328e-05, + "loss": 0.2707, + "step": 8921 + }, + { + "epoch": 0.51, + "grad_norm": 0.3221994879121444, + "learning_rate": 1.0078157533512742e-05, + "loss": 0.3001, + "step": 8922 + }, + { + "epoch": 0.51, + "grad_norm": 0.8538514540552568, + "learning_rate": 1.0076296676407565e-05, + "loss": 0.404, + "step": 8923 + }, + { + "epoch": 0.51, + "grad_norm": 0.41161279723696287, + "learning_rate": 1.0074435816660235e-05, + "loss": 0.2951, + "step": 8924 + }, + { + "epoch": 0.51, + "grad_norm": 0.35715732608291045, + "learning_rate": 1.00725749543352e-05, + "loss": 0.2687, + "step": 8925 + }, + { + "epoch": 0.51, + "grad_norm": 0.3368658460212253, + "learning_rate": 1.0070714089496891e-05, + "loss": 0.2479, + "step": 8926 + }, + { + "epoch": 0.51, + "grad_norm": 0.44449844022617024, + "learning_rate": 1.006885322220976e-05, + "loss": 0.3089, + "step": 8927 + }, + { + "epoch": 0.51, + "grad_norm": 0.5794718664509232, + "learning_rate": 1.0066992352538245e-05, + "loss": 0.3989, + "step": 8928 + }, + { + "epoch": 0.51, + "grad_norm": 0.40439067942242224, + "learning_rate": 1.0065131480546788e-05, + "loss": 0.327, + "step": 8929 + }, + { + "epoch": 0.51, + "grad_norm": 0.7094782461959981, + "learning_rate": 1.006327060629983e-05, + "loss": 0.2071, + "step": 8930 + }, + { + "epoch": 0.51, + "grad_norm": 0.32681962706815443, + "learning_rate": 1.0061409729861814e-05, + "loss": 0.3041, + "step": 8931 + }, + { + "epoch": 0.51, + "grad_norm": 0.29148296068996754, + "learning_rate": 1.0059548851297178e-05, + "loss": 0.2023, + "step": 8932 + }, + { + "epoch": 0.51, + "grad_norm": 0.3429142652730741, + "learning_rate": 1.0057687970670372e-05, + "loss": 0.2153, + "step": 8933 + }, + { + "epoch": 0.51, + "grad_norm": 0.34103484654619365, + "learning_rate": 1.005582708804583e-05, + "loss": 0.2975, + "step": 8934 + }, + { + "epoch": 0.51, + "grad_norm": 1.0197996496950272, + "learning_rate": 1.0053966203488003e-05, + "loss": 0.677, + "step": 8935 + }, + { + "epoch": 0.51, + "grad_norm": 0.36527832406679867, + "learning_rate": 1.0052105317061327e-05, + "loss": 0.182, + "step": 8936 + }, + { + "epoch": 0.51, + "grad_norm": 0.29033341390121514, + "learning_rate": 1.0050244428830246e-05, + "loss": 0.2933, + "step": 8937 + }, + { + "epoch": 0.51, + "grad_norm": 0.42647799922546814, + "learning_rate": 1.0048383538859202e-05, + "loss": 0.354, + "step": 8938 + }, + { + "epoch": 0.51, + "grad_norm": 0.1574000968867463, + "learning_rate": 1.0046522647212642e-05, + "loss": 0.087, + "step": 8939 + }, + { + "epoch": 0.51, + "grad_norm": 0.41188216839380715, + "learning_rate": 1.0044661753955001e-05, + "loss": 0.3561, + "step": 8940 + }, + { + "epoch": 0.51, + "grad_norm": 0.4381041332600563, + "learning_rate": 1.0042800859150726e-05, + "loss": 0.3384, + "step": 8941 + }, + { + "epoch": 0.51, + "grad_norm": 0.3689628610234497, + "learning_rate": 1.0040939962864258e-05, + "loss": 0.311, + "step": 8942 + }, + { + "epoch": 0.51, + "grad_norm": 0.3731679048957679, + "learning_rate": 1.0039079065160042e-05, + "loss": 0.2616, + "step": 8943 + }, + { + "epoch": 0.51, + "grad_norm": 0.41815692396401, + "learning_rate": 1.0037218166102518e-05, + "loss": 0.2991, + "step": 8944 + }, + { + "epoch": 0.51, + "grad_norm": 0.2680388741950821, + "learning_rate": 1.0035357265756134e-05, + "loss": 0.2294, + "step": 8945 + }, + { + "epoch": 0.51, + "grad_norm": 0.3967633759634121, + "learning_rate": 1.003349636418533e-05, + "loss": 0.2559, + "step": 8946 + }, + { + "epoch": 0.51, + "grad_norm": 0.7089779370160822, + "learning_rate": 1.0031635461454544e-05, + "loss": 0.5292, + "step": 8947 + }, + { + "epoch": 0.51, + "grad_norm": 0.8142086961246247, + "learning_rate": 1.0029774557628224e-05, + "loss": 0.3636, + "step": 8948 + }, + { + "epoch": 0.51, + "grad_norm": 0.3104295700200426, + "learning_rate": 1.0027913652770813e-05, + "loss": 0.2351, + "step": 8949 + }, + { + "epoch": 0.51, + "grad_norm": 0.3667967386304453, + "learning_rate": 1.0026052746946756e-05, + "loss": 0.2733, + "step": 8950 + }, + { + "epoch": 0.51, + "grad_norm": 0.45760292541987996, + "learning_rate": 1.002419184022049e-05, + "loss": 0.282, + "step": 8951 + }, + { + "epoch": 0.51, + "grad_norm": 0.2975732477040278, + "learning_rate": 1.0022330932656463e-05, + "loss": 0.2097, + "step": 8952 + }, + { + "epoch": 0.51, + "grad_norm": 0.36123061495278136, + "learning_rate": 1.0020470024319115e-05, + "loss": 0.2913, + "step": 8953 + }, + { + "epoch": 0.51, + "grad_norm": 0.6903017856807934, + "learning_rate": 1.0018609115272896e-05, + "loss": 0.3646, + "step": 8954 + }, + { + "epoch": 0.51, + "grad_norm": 0.34900373266645707, + "learning_rate": 1.0016748205582238e-05, + "loss": 0.2853, + "step": 8955 + }, + { + "epoch": 0.51, + "grad_norm": 0.6639895995416654, + "learning_rate": 1.0014887295311595e-05, + "loss": 0.3339, + "step": 8956 + }, + { + "epoch": 0.51, + "grad_norm": 0.21779948373374317, + "learning_rate": 1.0013026384525404e-05, + "loss": 0.2023, + "step": 8957 + }, + { + "epoch": 0.51, + "grad_norm": 0.3989690662409945, + "learning_rate": 1.0011165473288108e-05, + "loss": 0.3183, + "step": 8958 + }, + { + "epoch": 0.51, + "grad_norm": 0.9027589397240701, + "learning_rate": 1.0009304561664154e-05, + "loss": 0.359, + "step": 8959 + }, + { + "epoch": 0.51, + "grad_norm": 0.6025413935849674, + "learning_rate": 1.0007443649717985e-05, + "loss": 0.3867, + "step": 8960 + }, + { + "epoch": 0.51, + "grad_norm": 0.3535480063313833, + "learning_rate": 1.0005582737514039e-05, + "loss": 0.2922, + "step": 8961 + }, + { + "epoch": 0.51, + "grad_norm": 0.34940809513436644, + "learning_rate": 1.0003721825116766e-05, + "loss": 0.2565, + "step": 8962 + }, + { + "epoch": 0.51, + "grad_norm": 0.23572550353217794, + "learning_rate": 1.0001860912590604e-05, + "loss": 0.1681, + "step": 8963 + }, + { + "epoch": 0.52, + "grad_norm": 0.4371986239915237, + "learning_rate": 1e-05, + "loss": 0.3286, + "step": 8964 + }, + { + "epoch": 0.52, + "grad_norm": 0.5882681268821435, + "learning_rate": 9.998139087409399e-06, + "loss": 0.3003, + "step": 8965 + }, + { + "epoch": 0.52, + "grad_norm": 0.998562898170738, + "learning_rate": 9.996278174883236e-06, + "loss": 0.4324, + "step": 8966 + }, + { + "epoch": 0.52, + "grad_norm": 0.32535718394732693, + "learning_rate": 9.994417262485963e-06, + "loss": 0.2808, + "step": 8967 + }, + { + "epoch": 0.52, + "grad_norm": 0.8845442812310932, + "learning_rate": 9.992556350282018e-06, + "loss": 0.5168, + "step": 8968 + }, + { + "epoch": 0.52, + "grad_norm": 0.2605945618913011, + "learning_rate": 9.990695438335847e-06, + "loss": 0.2, + "step": 8969 + }, + { + "epoch": 0.52, + "grad_norm": 0.3912375752348602, + "learning_rate": 9.988834526711893e-06, + "loss": 0.267, + "step": 8970 + }, + { + "epoch": 0.52, + "grad_norm": 0.8202549742402676, + "learning_rate": 9.9869736154746e-06, + "loss": 0.4242, + "step": 8971 + }, + { + "epoch": 0.52, + "grad_norm": 0.2838554077787522, + "learning_rate": 9.985112704688406e-06, + "loss": 0.1891, + "step": 8972 + }, + { + "epoch": 0.52, + "grad_norm": 0.2958047596139349, + "learning_rate": 9.983251794417763e-06, + "loss": 0.2796, + "step": 8973 + }, + { + "epoch": 0.52, + "grad_norm": 1.146356366991916, + "learning_rate": 9.981390884727106e-06, + "loss": 0.6581, + "step": 8974 + }, + { + "epoch": 0.52, + "grad_norm": 0.4452802708748933, + "learning_rate": 9.979529975680885e-06, + "loss": 0.2432, + "step": 8975 + }, + { + "epoch": 0.52, + "grad_norm": 0.38232368072779344, + "learning_rate": 9.977669067343537e-06, + "loss": 0.3037, + "step": 8976 + }, + { + "epoch": 0.52, + "grad_norm": 0.3709480557645404, + "learning_rate": 9.975808159779512e-06, + "loss": 0.2857, + "step": 8977 + }, + { + "epoch": 0.52, + "grad_norm": 0.2848080628979088, + "learning_rate": 9.973947253053248e-06, + "loss": 0.1488, + "step": 8978 + }, + { + "epoch": 0.52, + "grad_norm": 0.36194839241988114, + "learning_rate": 9.972086347229187e-06, + "loss": 0.3028, + "step": 8979 + }, + { + "epoch": 0.52, + "grad_norm": 0.9172902205994092, + "learning_rate": 9.970225442371778e-06, + "loss": 0.5909, + "step": 8980 + }, + { + "epoch": 0.52, + "grad_norm": 0.3406603301566798, + "learning_rate": 9.968364538545461e-06, + "loss": 0.3361, + "step": 8981 + }, + { + "epoch": 0.52, + "grad_norm": 0.3044827668809311, + "learning_rate": 9.966503635814677e-06, + "loss": 0.2007, + "step": 8982 + }, + { + "epoch": 0.52, + "grad_norm": 0.5498909403562776, + "learning_rate": 9.96464273424387e-06, + "loss": 0.4395, + "step": 8983 + }, + { + "epoch": 0.52, + "grad_norm": 0.2707344192696009, + "learning_rate": 9.962781833897484e-06, + "loss": 0.2111, + "step": 8984 + }, + { + "epoch": 0.52, + "grad_norm": 0.27766021070239455, + "learning_rate": 9.960920934839963e-06, + "loss": 0.2124, + "step": 8985 + }, + { + "epoch": 0.52, + "grad_norm": 1.0646904612166812, + "learning_rate": 9.959060037135745e-06, + "loss": 0.5268, + "step": 8986 + }, + { + "epoch": 0.52, + "grad_norm": 0.7117116899008378, + "learning_rate": 9.95719914084928e-06, + "loss": 0.4817, + "step": 8987 + }, + { + "epoch": 0.52, + "grad_norm": 0.2762098719031825, + "learning_rate": 9.955338246045004e-06, + "loss": 0.2282, + "step": 8988 + }, + { + "epoch": 0.52, + "grad_norm": 0.3723392795742885, + "learning_rate": 9.953477352787363e-06, + "loss": 0.3297, + "step": 8989 + }, + { + "epoch": 0.52, + "grad_norm": 0.4044795205958676, + "learning_rate": 9.9516164611408e-06, + "loss": 0.2681, + "step": 8990 + }, + { + "epoch": 0.52, + "grad_norm": 0.2391132673895818, + "learning_rate": 9.949755571169757e-06, + "loss": 0.2128, + "step": 8991 + }, + { + "epoch": 0.52, + "grad_norm": 0.9771484815283424, + "learning_rate": 9.947894682938676e-06, + "loss": 0.2187, + "step": 8992 + }, + { + "epoch": 0.52, + "grad_norm": 0.3742803679785533, + "learning_rate": 9.946033796511999e-06, + "loss": 0.3154, + "step": 8993 + }, + { + "epoch": 0.52, + "grad_norm": 0.36770528833525723, + "learning_rate": 9.944172911954173e-06, + "loss": 0.2913, + "step": 8994 + }, + { + "epoch": 0.52, + "grad_norm": 0.7614209857492386, + "learning_rate": 9.942312029329631e-06, + "loss": 0.4018, + "step": 8995 + }, + { + "epoch": 0.52, + "grad_norm": 0.27484815842745103, + "learning_rate": 9.940451148702826e-06, + "loss": 0.2612, + "step": 8996 + }, + { + "epoch": 0.52, + "grad_norm": 0.31502147368921235, + "learning_rate": 9.938590270138191e-06, + "loss": 0.2551, + "step": 8997 + }, + { + "epoch": 0.52, + "grad_norm": 0.43320342033214754, + "learning_rate": 9.936729393700176e-06, + "loss": 0.1657, + "step": 8998 + }, + { + "epoch": 0.52, + "grad_norm": 0.7310020498893697, + "learning_rate": 9.934868519453215e-06, + "loss": 0.4141, + "step": 8999 + }, + { + "epoch": 0.52, + "grad_norm": 0.3868343055255914, + "learning_rate": 9.933007647461758e-06, + "loss": 0.2496, + "step": 9000 + }, + { + "epoch": 0.52, + "grad_norm": 0.304909472398119, + "learning_rate": 9.931146777790241e-06, + "loss": 0.2584, + "step": 9001 + }, + { + "epoch": 0.52, + "grad_norm": 0.43745718846557247, + "learning_rate": 9.929285910503112e-06, + "loss": 0.2569, + "step": 9002 + }, + { + "epoch": 0.52, + "grad_norm": 0.2749356993094986, + "learning_rate": 9.927425045664804e-06, + "loss": 0.202, + "step": 9003 + }, + { + "epoch": 0.52, + "grad_norm": 0.44467210424965653, + "learning_rate": 9.925564183339768e-06, + "loss": 0.3516, + "step": 9004 + }, + { + "epoch": 0.52, + "grad_norm": 0.3609803021729023, + "learning_rate": 9.92370332359244e-06, + "loss": 0.2799, + "step": 9005 + }, + { + "epoch": 0.52, + "grad_norm": 0.32493117789295467, + "learning_rate": 9.92184246648726e-06, + "loss": 0.2776, + "step": 9006 + }, + { + "epoch": 0.52, + "grad_norm": 1.0347469550216204, + "learning_rate": 9.919981612088676e-06, + "loss": 0.7681, + "step": 9007 + }, + { + "epoch": 0.52, + "grad_norm": 0.33748772354766315, + "learning_rate": 9.91812076046112e-06, + "loss": 0.2435, + "step": 9008 + }, + { + "epoch": 0.52, + "grad_norm": 0.26513883317931297, + "learning_rate": 9.916259911669044e-06, + "loss": 0.2091, + "step": 9009 + }, + { + "epoch": 0.52, + "grad_norm": 0.46014096183328146, + "learning_rate": 9.914399065776879e-06, + "loss": 0.2877, + "step": 9010 + }, + { + "epoch": 0.52, + "grad_norm": 0.6082292573499609, + "learning_rate": 9.912538222849074e-06, + "loss": 0.3486, + "step": 9011 + }, + { + "epoch": 0.52, + "grad_norm": 0.34178859785901966, + "learning_rate": 9.910677382950064e-06, + "loss": 0.2791, + "step": 9012 + }, + { + "epoch": 0.52, + "grad_norm": 0.3524107406208595, + "learning_rate": 9.908816546144296e-06, + "loss": 0.3128, + "step": 9013 + }, + { + "epoch": 0.52, + "grad_norm": 0.2773592503100069, + "learning_rate": 9.906955712496203e-06, + "loss": 0.1512, + "step": 9014 + }, + { + "epoch": 0.52, + "grad_norm": 0.3265634728938896, + "learning_rate": 9.905094882070234e-06, + "loss": 0.2592, + "step": 9015 + }, + { + "epoch": 0.52, + "grad_norm": 0.40937969659931533, + "learning_rate": 9.903234054930824e-06, + "loss": 0.3167, + "step": 9016 + }, + { + "epoch": 0.52, + "grad_norm": 0.33132514739426283, + "learning_rate": 9.901373231142416e-06, + "loss": 0.3214, + "step": 9017 + }, + { + "epoch": 0.52, + "grad_norm": 0.35046526567785413, + "learning_rate": 9.899512410769443e-06, + "loss": 0.1871, + "step": 9018 + }, + { + "epoch": 0.52, + "grad_norm": 0.4758272095091834, + "learning_rate": 9.897651593876356e-06, + "loss": 0.4102, + "step": 9019 + }, + { + "epoch": 0.52, + "grad_norm": 0.32645946398502895, + "learning_rate": 9.895790780527585e-06, + "loss": 0.2765, + "step": 9020 + }, + { + "epoch": 0.52, + "grad_norm": 0.33879706747924987, + "learning_rate": 9.89392997078758e-06, + "loss": 0.1954, + "step": 9021 + }, + { + "epoch": 0.52, + "grad_norm": 0.420195059981251, + "learning_rate": 9.892069164720771e-06, + "loss": 0.3489, + "step": 9022 + }, + { + "epoch": 0.52, + "grad_norm": 0.27953102565373356, + "learning_rate": 9.890208362391606e-06, + "loss": 0.1685, + "step": 9023 + }, + { + "epoch": 0.52, + "grad_norm": 0.33324986313949784, + "learning_rate": 9.888347563864517e-06, + "loss": 0.2261, + "step": 9024 + }, + { + "epoch": 0.52, + "grad_norm": 0.4158775305414562, + "learning_rate": 9.886486769203951e-06, + "loss": 0.3704, + "step": 9025 + }, + { + "epoch": 0.52, + "grad_norm": 1.1118021729267935, + "learning_rate": 9.884625978474341e-06, + "loss": 0.6197, + "step": 9026 + }, + { + "epoch": 0.52, + "grad_norm": 0.2940205641682392, + "learning_rate": 9.88276519174013e-06, + "loss": 0.2177, + "step": 9027 + }, + { + "epoch": 0.52, + "grad_norm": 0.632974645298959, + "learning_rate": 9.880904409065753e-06, + "loss": 0.4462, + "step": 9028 + }, + { + "epoch": 0.52, + "grad_norm": 0.21338348316802783, + "learning_rate": 9.879043630515651e-06, + "loss": 0.2196, + "step": 9029 + }, + { + "epoch": 0.52, + "grad_norm": 0.4146681584696092, + "learning_rate": 9.877182856154267e-06, + "loss": 0.3047, + "step": 9030 + }, + { + "epoch": 0.52, + "grad_norm": 0.4489759474178878, + "learning_rate": 9.87532208604603e-06, + "loss": 0.2851, + "step": 9031 + }, + { + "epoch": 0.52, + "grad_norm": 0.36796592998808814, + "learning_rate": 9.873461320255388e-06, + "loss": 0.3119, + "step": 9032 + }, + { + "epoch": 0.52, + "grad_norm": 0.3727674532757231, + "learning_rate": 9.871600558846772e-06, + "loss": 0.2888, + "step": 9033 + }, + { + "epoch": 0.52, + "grad_norm": 0.4707617476263641, + "learning_rate": 9.869739801884627e-06, + "loss": 0.2876, + "step": 9034 + }, + { + "epoch": 0.52, + "grad_norm": 0.2443750988765055, + "learning_rate": 9.867879049433383e-06, + "loss": 0.1874, + "step": 9035 + }, + { + "epoch": 0.52, + "grad_norm": 0.39422198057692154, + "learning_rate": 9.866018301557484e-06, + "loss": 0.2668, + "step": 9036 + }, + { + "epoch": 0.52, + "grad_norm": 0.29803699521115307, + "learning_rate": 9.864157558321364e-06, + "loss": 0.2592, + "step": 9037 + }, + { + "epoch": 0.52, + "grad_norm": 0.4831922205500076, + "learning_rate": 9.862296819789464e-06, + "loss": 0.3992, + "step": 9038 + }, + { + "epoch": 0.52, + "grad_norm": 0.5305541289799832, + "learning_rate": 9.860436086026218e-06, + "loss": 0.3633, + "step": 9039 + }, + { + "epoch": 0.52, + "grad_norm": 0.2684294559842977, + "learning_rate": 9.858575357096064e-06, + "loss": 0.2333, + "step": 9040 + }, + { + "epoch": 0.52, + "grad_norm": 0.25245525380867007, + "learning_rate": 9.85671463306344e-06, + "loss": 0.1646, + "step": 9041 + }, + { + "epoch": 0.52, + "grad_norm": 0.5212038708909201, + "learning_rate": 9.854853913992783e-06, + "loss": 0.3353, + "step": 9042 + }, + { + "epoch": 0.52, + "grad_norm": 0.3956254025587982, + "learning_rate": 9.852993199948527e-06, + "loss": 0.3331, + "step": 9043 + }, + { + "epoch": 0.52, + "grad_norm": 0.3427049716407958, + "learning_rate": 9.85113249099511e-06, + "loss": 0.276, + "step": 9044 + }, + { + "epoch": 0.52, + "grad_norm": 0.46855115446728846, + "learning_rate": 9.849271787196971e-06, + "loss": 0.3368, + "step": 9045 + }, + { + "epoch": 0.52, + "grad_norm": 0.3783074851487932, + "learning_rate": 9.847411088618539e-06, + "loss": 0.3037, + "step": 9046 + }, + { + "epoch": 0.52, + "grad_norm": 0.18974821301931535, + "learning_rate": 9.845550395324259e-06, + "loss": 0.0865, + "step": 9047 + }, + { + "epoch": 0.52, + "grad_norm": 0.2976246395263267, + "learning_rate": 9.843689707378558e-06, + "loss": 0.2641, + "step": 9048 + }, + { + "epoch": 0.52, + "grad_norm": 0.3856321462242667, + "learning_rate": 9.841829024845882e-06, + "loss": 0.3264, + "step": 9049 + }, + { + "epoch": 0.52, + "grad_norm": 0.8229565405098527, + "learning_rate": 9.839968347790657e-06, + "loss": 0.3381, + "step": 9050 + }, + { + "epoch": 0.52, + "grad_norm": 0.5464022009127502, + "learning_rate": 9.83810767627732e-06, + "loss": 0.3216, + "step": 9051 + }, + { + "epoch": 0.52, + "grad_norm": 0.3146404356039652, + "learning_rate": 9.836247010370308e-06, + "loss": 0.2941, + "step": 9052 + }, + { + "epoch": 0.52, + "grad_norm": 0.29591631958317766, + "learning_rate": 9.834386350134052e-06, + "loss": 0.1869, + "step": 9053 + }, + { + "epoch": 0.52, + "grad_norm": 0.7751683065263755, + "learning_rate": 9.832525695632994e-06, + "loss": 0.4217, + "step": 9054 + }, + { + "epoch": 0.52, + "grad_norm": 0.4249413501512791, + "learning_rate": 9.830665046931563e-06, + "loss": 0.2905, + "step": 9055 + }, + { + "epoch": 0.52, + "grad_norm": 0.5985386409545421, + "learning_rate": 9.828804404094192e-06, + "loss": 0.3324, + "step": 9056 + }, + { + "epoch": 0.52, + "grad_norm": 0.20868793648929493, + "learning_rate": 9.82694376718532e-06, + "loss": 0.0741, + "step": 9057 + }, + { + "epoch": 0.52, + "grad_norm": 0.3726465468468453, + "learning_rate": 9.825083136269375e-06, + "loss": 0.3055, + "step": 9058 + }, + { + "epoch": 0.52, + "grad_norm": 0.4228458238529731, + "learning_rate": 9.823222511410795e-06, + "loss": 0.2778, + "step": 9059 + }, + { + "epoch": 0.52, + "grad_norm": 0.29394146644376107, + "learning_rate": 9.82136189267401e-06, + "loss": 0.2364, + "step": 9060 + }, + { + "epoch": 0.52, + "grad_norm": 0.43066936972759234, + "learning_rate": 9.819501280123458e-06, + "loss": 0.3554, + "step": 9061 + }, + { + "epoch": 0.52, + "grad_norm": 0.6720219973395819, + "learning_rate": 9.817640673823566e-06, + "loss": 0.4414, + "step": 9062 + }, + { + "epoch": 0.52, + "grad_norm": 0.22057283222630394, + "learning_rate": 9.815780073838773e-06, + "loss": 0.0968, + "step": 9063 + }, + { + "epoch": 0.52, + "grad_norm": 0.27013861542171497, + "learning_rate": 9.813919480233503e-06, + "loss": 0.283, + "step": 9064 + }, + { + "epoch": 0.52, + "grad_norm": 0.7716994426637789, + "learning_rate": 9.812058893072199e-06, + "loss": 0.5575, + "step": 9065 + }, + { + "epoch": 0.52, + "grad_norm": 0.34090727687569566, + "learning_rate": 9.810198312419284e-06, + "loss": 0.2088, + "step": 9066 + }, + { + "epoch": 0.52, + "grad_norm": 0.4536499925740262, + "learning_rate": 9.808337738339194e-06, + "loss": 0.355, + "step": 9067 + }, + { + "epoch": 0.52, + "grad_norm": 0.3564000526722853, + "learning_rate": 9.80647717089636e-06, + "loss": 0.3234, + "step": 9068 + }, + { + "epoch": 0.52, + "grad_norm": 0.23213456762043275, + "learning_rate": 9.804616610155215e-06, + "loss": 0.1778, + "step": 9069 + }, + { + "epoch": 0.52, + "grad_norm": 0.3640646952779975, + "learning_rate": 9.802756056180187e-06, + "loss": 0.2301, + "step": 9070 + }, + { + "epoch": 0.52, + "grad_norm": 1.2155181820109497, + "learning_rate": 9.800895509035708e-06, + "loss": 0.8132, + "step": 9071 + }, + { + "epoch": 0.52, + "grad_norm": 0.3652622733588144, + "learning_rate": 9.799034968786209e-06, + "loss": 0.2981, + "step": 9072 + }, + { + "epoch": 0.52, + "grad_norm": 0.3160744756567268, + "learning_rate": 9.797174435496119e-06, + "loss": 0.2346, + "step": 9073 + }, + { + "epoch": 0.52, + "grad_norm": 0.3198722182667888, + "learning_rate": 9.795313909229872e-06, + "loss": 0.2408, + "step": 9074 + }, + { + "epoch": 0.52, + "grad_norm": 0.43873354265547654, + "learning_rate": 9.793453390051894e-06, + "loss": 0.3458, + "step": 9075 + }, + { + "epoch": 0.52, + "grad_norm": 0.27745929926602364, + "learning_rate": 9.791592878026617e-06, + "loss": 0.2203, + "step": 9076 + }, + { + "epoch": 0.52, + "grad_norm": 0.962358758398474, + "learning_rate": 9.789732373218468e-06, + "loss": 0.5281, + "step": 9077 + }, + { + "epoch": 0.52, + "grad_norm": 0.5943049810697584, + "learning_rate": 9.78787187569188e-06, + "loss": 0.3826, + "step": 9078 + }, + { + "epoch": 0.52, + "grad_norm": 0.3278974230441489, + "learning_rate": 9.786011385511279e-06, + "loss": 0.2963, + "step": 9079 + }, + { + "epoch": 0.52, + "grad_norm": 0.3273067224989936, + "learning_rate": 9.784150902741095e-06, + "loss": 0.2385, + "step": 9080 + }, + { + "epoch": 0.52, + "grad_norm": 0.27902999532971506, + "learning_rate": 9.782290427445755e-06, + "loss": 0.1677, + "step": 9081 + }, + { + "epoch": 0.52, + "grad_norm": 0.3192097580526225, + "learning_rate": 9.78042995968969e-06, + "loss": 0.2732, + "step": 9082 + }, + { + "epoch": 0.52, + "grad_norm": 0.6567345097224239, + "learning_rate": 9.778569499537327e-06, + "loss": 0.3247, + "step": 9083 + }, + { + "epoch": 0.52, + "grad_norm": 0.30540297396342236, + "learning_rate": 9.77670904705309e-06, + "loss": 0.2962, + "step": 9084 + }, + { + "epoch": 0.52, + "grad_norm": 0.368732504569253, + "learning_rate": 9.77484860230141e-06, + "loss": 0.292, + "step": 9085 + }, + { + "epoch": 0.52, + "grad_norm": 0.8805145259459297, + "learning_rate": 9.772988165346715e-06, + "loss": 0.4137, + "step": 9086 + }, + { + "epoch": 0.52, + "grad_norm": 0.22124704588757194, + "learning_rate": 9.771127736253426e-06, + "loss": 0.1537, + "step": 9087 + }, + { + "epoch": 0.52, + "grad_norm": 0.3510303095693387, + "learning_rate": 9.769267315085976e-06, + "loss": 0.2932, + "step": 9088 + }, + { + "epoch": 0.52, + "grad_norm": 0.4614132599295984, + "learning_rate": 9.767406901908787e-06, + "loss": 0.2882, + "step": 9089 + }, + { + "epoch": 0.52, + "grad_norm": 0.8664585977799852, + "learning_rate": 9.76554649678629e-06, + "loss": 0.4554, + "step": 9090 + }, + { + "epoch": 0.52, + "grad_norm": 0.3287018378571944, + "learning_rate": 9.763686099782905e-06, + "loss": 0.2582, + "step": 9091 + }, + { + "epoch": 0.52, + "grad_norm": 0.31568488663477967, + "learning_rate": 9.761825710963063e-06, + "loss": 0.2942, + "step": 9092 + }, + { + "epoch": 0.52, + "grad_norm": 0.20206609002791392, + "learning_rate": 9.759965330391182e-06, + "loss": 0.0898, + "step": 9093 + }, + { + "epoch": 0.52, + "grad_norm": 0.3151578729799421, + "learning_rate": 9.758104958131696e-06, + "loss": 0.2367, + "step": 9094 + }, + { + "epoch": 0.52, + "grad_norm": 0.741103555846677, + "learning_rate": 9.756244594249024e-06, + "loss": 0.4884, + "step": 9095 + }, + { + "epoch": 0.52, + "grad_norm": 0.368566194853679, + "learning_rate": 9.754384238807589e-06, + "loss": 0.2917, + "step": 9096 + }, + { + "epoch": 0.52, + "grad_norm": 0.32908075448316854, + "learning_rate": 9.752523891871819e-06, + "loss": 0.2796, + "step": 9097 + }, + { + "epoch": 0.52, + "grad_norm": 0.9739561754616147, + "learning_rate": 9.750663553506134e-06, + "loss": 0.6682, + "step": 9098 + }, + { + "epoch": 0.52, + "grad_norm": 0.19140722525879253, + "learning_rate": 9.748803223774962e-06, + "loss": 0.1553, + "step": 9099 + }, + { + "epoch": 0.52, + "grad_norm": 0.29655050143646866, + "learning_rate": 9.746942902742722e-06, + "loss": 0.2873, + "step": 9100 + }, + { + "epoch": 0.52, + "grad_norm": 0.8291151161681248, + "learning_rate": 9.745082590473839e-06, + "loss": 0.4544, + "step": 9101 + }, + { + "epoch": 0.52, + "grad_norm": 0.6396141518635055, + "learning_rate": 9.743222287032734e-06, + "loss": 0.3046, + "step": 9102 + }, + { + "epoch": 0.52, + "grad_norm": 0.3562190167610645, + "learning_rate": 9.741361992483832e-06, + "loss": 0.2791, + "step": 9103 + }, + { + "epoch": 0.52, + "grad_norm": 0.3584912974204283, + "learning_rate": 9.739501706891551e-06, + "loss": 0.3235, + "step": 9104 + }, + { + "epoch": 0.52, + "grad_norm": 0.28189812469588293, + "learning_rate": 9.737641430320315e-06, + "loss": 0.1743, + "step": 9105 + }, + { + "epoch": 0.52, + "grad_norm": 0.30844703586005295, + "learning_rate": 9.735781162834546e-06, + "loss": 0.1798, + "step": 9106 + }, + { + "epoch": 0.52, + "grad_norm": 0.8857786094208216, + "learning_rate": 9.733920904498664e-06, + "loss": 0.3613, + "step": 9107 + }, + { + "epoch": 0.52, + "grad_norm": 0.4731519913769104, + "learning_rate": 9.73206065537709e-06, + "loss": 0.3541, + "step": 9108 + }, + { + "epoch": 0.52, + "grad_norm": 0.291075703344724, + "learning_rate": 9.730200415534242e-06, + "loss": 0.1994, + "step": 9109 + }, + { + "epoch": 0.52, + "grad_norm": 1.2261999869951392, + "learning_rate": 9.728340185034545e-06, + "loss": 0.7143, + "step": 9110 + }, + { + "epoch": 0.52, + "grad_norm": 0.31050079628050553, + "learning_rate": 9.726479963942412e-06, + "loss": 0.2554, + "step": 9111 + }, + { + "epoch": 0.52, + "grad_norm": 0.26466322118910507, + "learning_rate": 9.72461975232227e-06, + "loss": 0.1923, + "step": 9112 + }, + { + "epoch": 0.52, + "grad_norm": 0.6613580619617312, + "learning_rate": 9.72275955023853e-06, + "loss": 0.4344, + "step": 9113 + }, + { + "epoch": 0.52, + "grad_norm": 1.3485587659150748, + "learning_rate": 9.720899357755618e-06, + "loss": 0.8047, + "step": 9114 + }, + { + "epoch": 0.52, + "grad_norm": 0.29397938026410353, + "learning_rate": 9.719039174937948e-06, + "loss": 0.2338, + "step": 9115 + }, + { + "epoch": 0.52, + "grad_norm": 0.4540959149910563, + "learning_rate": 9.717179001849942e-06, + "loss": 0.3446, + "step": 9116 + }, + { + "epoch": 0.52, + "grad_norm": 0.7506127708683249, + "learning_rate": 9.715318838556014e-06, + "loss": 0.4407, + "step": 9117 + }, + { + "epoch": 0.52, + "grad_norm": 0.3762547929412095, + "learning_rate": 9.71345868512058e-06, + "loss": 0.2911, + "step": 9118 + }, + { + "epoch": 0.52, + "grad_norm": 0.32214352086262227, + "learning_rate": 9.711598541608062e-06, + "loss": 0.2218, + "step": 9119 + }, + { + "epoch": 0.52, + "grad_norm": 0.3327082599332806, + "learning_rate": 9.709738408082873e-06, + "loss": 0.2906, + "step": 9120 + }, + { + "epoch": 0.52, + "grad_norm": 0.3958060769664337, + "learning_rate": 9.707878284609429e-06, + "loss": 0.2693, + "step": 9121 + }, + { + "epoch": 0.52, + "grad_norm": 0.4592079526507075, + "learning_rate": 9.706018171252148e-06, + "loss": 0.2672, + "step": 9122 + }, + { + "epoch": 0.52, + "grad_norm": 0.34486135538857204, + "learning_rate": 9.704158068075445e-06, + "loss": 0.314, + "step": 9123 + }, + { + "epoch": 0.52, + "grad_norm": 0.3742732158714571, + "learning_rate": 9.702297975143737e-06, + "loss": 0.2434, + "step": 9124 + }, + { + "epoch": 0.52, + "grad_norm": 0.2477396220146489, + "learning_rate": 9.700437892521434e-06, + "loss": 0.1747, + "step": 9125 + }, + { + "epoch": 0.52, + "grad_norm": 1.2186332134145217, + "learning_rate": 9.698577820272958e-06, + "loss": 0.81, + "step": 9126 + }, + { + "epoch": 0.52, + "grad_norm": 0.4225976525036132, + "learning_rate": 9.696717758462716e-06, + "loss": 0.2788, + "step": 9127 + }, + { + "epoch": 0.52, + "grad_norm": 0.2970628514194815, + "learning_rate": 9.694857707155126e-06, + "loss": 0.2496, + "step": 9128 + }, + { + "epoch": 0.52, + "grad_norm": 0.8264539934908051, + "learning_rate": 9.6929976664146e-06, + "loss": 0.4638, + "step": 9129 + }, + { + "epoch": 0.52, + "grad_norm": 0.33276779372939486, + "learning_rate": 9.691137636305554e-06, + "loss": 0.2866, + "step": 9130 + }, + { + "epoch": 0.52, + "grad_norm": 0.3552299923460887, + "learning_rate": 9.689277616892396e-06, + "loss": 0.3332, + "step": 9131 + }, + { + "epoch": 0.52, + "grad_norm": 0.21249879155980383, + "learning_rate": 9.687417608239541e-06, + "loss": 0.1038, + "step": 9132 + }, + { + "epoch": 0.52, + "grad_norm": 0.3190251853080398, + "learning_rate": 9.6855576104114e-06, + "loss": 0.2811, + "step": 9133 + }, + { + "epoch": 0.52, + "grad_norm": 1.1275659444191837, + "learning_rate": 9.683697623472387e-06, + "loss": 0.6108, + "step": 9134 + }, + { + "epoch": 0.52, + "grad_norm": 0.4722264674934552, + "learning_rate": 9.681837647486912e-06, + "loss": 0.2835, + "step": 9135 + }, + { + "epoch": 0.52, + "grad_norm": 0.3269246466042898, + "learning_rate": 9.679977682519385e-06, + "loss": 0.2699, + "step": 9136 + }, + { + "epoch": 0.52, + "grad_norm": 1.6250715059284526, + "learning_rate": 9.678117728634217e-06, + "loss": 0.6513, + "step": 9137 + }, + { + "epoch": 0.53, + "grad_norm": 0.23143067024166109, + "learning_rate": 9.676257785895817e-06, + "loss": 0.134, + "step": 9138 + }, + { + "epoch": 0.53, + "grad_norm": 0.39965635338532934, + "learning_rate": 9.674397854368598e-06, + "loss": 0.2694, + "step": 9139 + }, + { + "epoch": 0.53, + "grad_norm": 0.3096227242107919, + "learning_rate": 9.672537934116966e-06, + "loss": 0.3045, + "step": 9140 + }, + { + "epoch": 0.53, + "grad_norm": 0.5551639828567845, + "learning_rate": 9.670678025205332e-06, + "loss": 0.3214, + "step": 9141 + }, + { + "epoch": 0.53, + "grad_norm": 0.3793778776155998, + "learning_rate": 9.668818127698103e-06, + "loss": 0.2593, + "step": 9142 + }, + { + "epoch": 0.53, + "grad_norm": 0.3052842446630093, + "learning_rate": 9.66695824165969e-06, + "loss": 0.2919, + "step": 9143 + }, + { + "epoch": 0.53, + "grad_norm": 0.3974048517058976, + "learning_rate": 9.665098367154496e-06, + "loss": 0.2662, + "step": 9144 + }, + { + "epoch": 0.53, + "grad_norm": 0.30419717373936883, + "learning_rate": 9.663238504246933e-06, + "loss": 0.2007, + "step": 9145 + }, + { + "epoch": 0.53, + "grad_norm": 0.3827763477642118, + "learning_rate": 9.661378653001404e-06, + "loss": 0.2769, + "step": 9146 + }, + { + "epoch": 0.53, + "grad_norm": 0.3473256096785234, + "learning_rate": 9.65951881348232e-06, + "loss": 0.3079, + "step": 9147 + }, + { + "epoch": 0.53, + "grad_norm": 0.3339712098319444, + "learning_rate": 9.657658985754085e-06, + "loss": 0.162, + "step": 9148 + }, + { + "epoch": 0.53, + "grad_norm": 0.507041220843939, + "learning_rate": 9.655799169881103e-06, + "loss": 0.3754, + "step": 9149 + }, + { + "epoch": 0.53, + "grad_norm": 1.3073406626204165, + "learning_rate": 9.653939365927785e-06, + "loss": 0.7838, + "step": 9150 + }, + { + "epoch": 0.53, + "grad_norm": 0.28633539371312033, + "learning_rate": 9.652079573958529e-06, + "loss": 0.2202, + "step": 9151 + }, + { + "epoch": 0.53, + "grad_norm": 0.3691162201113786, + "learning_rate": 9.650219794037741e-06, + "loss": 0.238, + "step": 9152 + }, + { + "epoch": 0.53, + "grad_norm": 0.41623336946895095, + "learning_rate": 9.648360026229828e-06, + "loss": 0.3067, + "step": 9153 + }, + { + "epoch": 0.53, + "grad_norm": 0.35646845738442257, + "learning_rate": 9.646500270599191e-06, + "loss": 0.2647, + "step": 9154 + }, + { + "epoch": 0.53, + "grad_norm": 0.28283345328135584, + "learning_rate": 9.644640527210235e-06, + "loss": 0.2443, + "step": 9155 + }, + { + "epoch": 0.53, + "grad_norm": 1.1947336670116657, + "learning_rate": 9.642780796127362e-06, + "loss": 0.6928, + "step": 9156 + }, + { + "epoch": 0.53, + "grad_norm": 0.5466204845370135, + "learning_rate": 9.640921077414975e-06, + "loss": 0.3285, + "step": 9157 + }, + { + "epoch": 0.53, + "grad_norm": 0.35102520321136116, + "learning_rate": 9.639061371137475e-06, + "loss": 0.2716, + "step": 9158 + }, + { + "epoch": 0.53, + "grad_norm": 0.24634222677994205, + "learning_rate": 9.637201677359266e-06, + "loss": 0.2251, + "step": 9159 + }, + { + "epoch": 0.53, + "grad_norm": 0.5712291239043696, + "learning_rate": 9.635341996144747e-06, + "loss": 0.3271, + "step": 9160 + }, + { + "epoch": 0.53, + "grad_norm": 0.34496339742721027, + "learning_rate": 9.633482327558316e-06, + "loss": 0.2331, + "step": 9161 + }, + { + "epoch": 0.53, + "grad_norm": 0.8177903308392893, + "learning_rate": 9.63162267166438e-06, + "loss": 0.6087, + "step": 9162 + }, + { + "epoch": 0.53, + "grad_norm": 0.3266062325787849, + "learning_rate": 9.629763028527332e-06, + "loss": 0.2651, + "step": 9163 + }, + { + "epoch": 0.53, + "grad_norm": 0.42567311427029086, + "learning_rate": 9.627903398211577e-06, + "loss": 0.2801, + "step": 9164 + }, + { + "epoch": 0.53, + "grad_norm": 0.3037908672972681, + "learning_rate": 9.626043780781508e-06, + "loss": 0.1905, + "step": 9165 + }, + { + "epoch": 0.53, + "grad_norm": 0.623542999401843, + "learning_rate": 9.62418417630153e-06, + "loss": 0.3826, + "step": 9166 + }, + { + "epoch": 0.53, + "grad_norm": 0.31378311285481736, + "learning_rate": 9.622324584836036e-06, + "loss": 0.2816, + "step": 9167 + }, + { + "epoch": 0.53, + "grad_norm": 1.04329856978667, + "learning_rate": 9.620465006449427e-06, + "loss": 0.4929, + "step": 9168 + }, + { + "epoch": 0.53, + "grad_norm": 0.6656212816613781, + "learning_rate": 9.618605441206098e-06, + "loss": 0.3594, + "step": 9169 + }, + { + "epoch": 0.53, + "grad_norm": 0.3644528569994132, + "learning_rate": 9.616745889170446e-06, + "loss": 0.3054, + "step": 9170 + }, + { + "epoch": 0.53, + "grad_norm": 0.1996589631527687, + "learning_rate": 9.614886350406865e-06, + "loss": 0.1666, + "step": 9171 + }, + { + "epoch": 0.53, + "grad_norm": 0.3726302590229509, + "learning_rate": 9.613026824979757e-06, + "loss": 0.2799, + "step": 9172 + }, + { + "epoch": 0.53, + "grad_norm": 0.5874746934630642, + "learning_rate": 9.61116731295351e-06, + "loss": 0.3598, + "step": 9173 + }, + { + "epoch": 0.53, + "grad_norm": 0.4754181343382663, + "learning_rate": 9.609307814392525e-06, + "loss": 0.3157, + "step": 9174 + }, + { + "epoch": 0.53, + "grad_norm": 0.3358430754062846, + "learning_rate": 9.607448329361193e-06, + "loss": 0.2765, + "step": 9175 + }, + { + "epoch": 0.53, + "grad_norm": 0.368547482274709, + "learning_rate": 9.605588857923906e-06, + "loss": 0.3226, + "step": 9176 + }, + { + "epoch": 0.53, + "grad_norm": 0.24855846818078145, + "learning_rate": 9.603729400145063e-06, + "loss": 0.1304, + "step": 9177 + }, + { + "epoch": 0.53, + "grad_norm": 0.8221257736419765, + "learning_rate": 9.601869956089051e-06, + "loss": 0.3965, + "step": 9178 + }, + { + "epoch": 0.53, + "grad_norm": 0.2844103771443083, + "learning_rate": 9.60001052582027e-06, + "loss": 0.2756, + "step": 9179 + }, + { + "epoch": 0.53, + "grad_norm": 0.5660900591073189, + "learning_rate": 9.598151109403102e-06, + "loss": 0.4276, + "step": 9180 + }, + { + "epoch": 0.53, + "grad_norm": 0.4704157923383228, + "learning_rate": 9.596291706901946e-06, + "loss": 0.2179, + "step": 9181 + }, + { + "epoch": 0.53, + "grad_norm": 0.4820397687697669, + "learning_rate": 9.59443231838119e-06, + "loss": 0.3491, + "step": 9182 + }, + { + "epoch": 0.53, + "grad_norm": 0.38983163889496963, + "learning_rate": 9.59257294390523e-06, + "loss": 0.3285, + "step": 9183 + }, + { + "epoch": 0.53, + "grad_norm": 0.2131714527848812, + "learning_rate": 9.59071358353845e-06, + "loss": 0.1013, + "step": 9184 + }, + { + "epoch": 0.53, + "grad_norm": 0.3957734107129791, + "learning_rate": 9.588854237345238e-06, + "loss": 0.3476, + "step": 9185 + }, + { + "epoch": 0.53, + "grad_norm": 0.9320135709989978, + "learning_rate": 9.586994905389985e-06, + "loss": 0.4693, + "step": 9186 + }, + { + "epoch": 0.53, + "grad_norm": 0.3124984898779434, + "learning_rate": 9.585135587737085e-06, + "loss": 0.2266, + "step": 9187 + }, + { + "epoch": 0.53, + "grad_norm": 0.45270701652538803, + "learning_rate": 9.583276284450917e-06, + "loss": 0.3355, + "step": 9188 + }, + { + "epoch": 0.53, + "grad_norm": 0.44931577222450564, + "learning_rate": 9.581416995595877e-06, + "loss": 0.2828, + "step": 9189 + }, + { + "epoch": 0.53, + "grad_norm": 0.23532191378371217, + "learning_rate": 9.579557721236345e-06, + "loss": 0.1269, + "step": 9190 + }, + { + "epoch": 0.53, + "grad_norm": 0.34700214444362804, + "learning_rate": 9.577698461436715e-06, + "loss": 0.2821, + "step": 9191 + }, + { + "epoch": 0.53, + "grad_norm": 1.1069957482466108, + "learning_rate": 9.575839216261366e-06, + "loss": 0.4473, + "step": 9192 + }, + { + "epoch": 0.53, + "grad_norm": 0.539025147105325, + "learning_rate": 9.573979985774689e-06, + "loss": 0.3448, + "step": 9193 + }, + { + "epoch": 0.53, + "grad_norm": 0.4325392202671178, + "learning_rate": 9.572120770041065e-06, + "loss": 0.2551, + "step": 9194 + }, + { + "epoch": 0.53, + "grad_norm": 0.37278555312821754, + "learning_rate": 9.570261569124882e-06, + "loss": 0.3115, + "step": 9195 + }, + { + "epoch": 0.53, + "grad_norm": 0.31715591922240705, + "learning_rate": 9.568402383090519e-06, + "loss": 0.1692, + "step": 9196 + }, + { + "epoch": 0.53, + "grad_norm": 0.3931850717706773, + "learning_rate": 9.566543212002365e-06, + "loss": 0.2381, + "step": 9197 + }, + { + "epoch": 0.53, + "grad_norm": 0.4006785161291414, + "learning_rate": 9.564684055924801e-06, + "loss": 0.3082, + "step": 9198 + }, + { + "epoch": 0.53, + "grad_norm": 0.9563471812307568, + "learning_rate": 9.562824914922211e-06, + "loss": 0.6652, + "step": 9199 + }, + { + "epoch": 0.53, + "grad_norm": 0.29294573478695535, + "learning_rate": 9.560965789058975e-06, + "loss": 0.2166, + "step": 9200 + }, + { + "epoch": 0.53, + "grad_norm": 1.056246726061743, + "learning_rate": 9.559106678399473e-06, + "loss": 0.5694, + "step": 9201 + }, + { + "epoch": 0.53, + "grad_norm": 0.27936225502157397, + "learning_rate": 9.55724758300809e-06, + "loss": 0.1959, + "step": 9202 + }, + { + "epoch": 0.53, + "grad_norm": 0.3162580378410517, + "learning_rate": 9.555388502949201e-06, + "loss": 0.2596, + "step": 9203 + }, + { + "epoch": 0.53, + "grad_norm": 0.6572106044140896, + "learning_rate": 9.553529438287192e-06, + "loss": 0.4197, + "step": 9204 + }, + { + "epoch": 0.53, + "grad_norm": 0.8055700596054751, + "learning_rate": 9.551670389086438e-06, + "loss": 0.5843, + "step": 9205 + }, + { + "epoch": 0.53, + "grad_norm": 0.372228470556821, + "learning_rate": 9.54981135541132e-06, + "loss": 0.2797, + "step": 9206 + }, + { + "epoch": 0.53, + "grad_norm": 0.3561420751933189, + "learning_rate": 9.547952337326214e-06, + "loss": 0.2531, + "step": 9207 + }, + { + "epoch": 0.53, + "grad_norm": 0.37148428650642595, + "learning_rate": 9.546093334895498e-06, + "loss": 0.2567, + "step": 9208 + }, + { + "epoch": 0.53, + "grad_norm": 0.36363796098136053, + "learning_rate": 9.544234348183553e-06, + "loss": 0.279, + "step": 9209 + }, + { + "epoch": 0.53, + "grad_norm": 0.2959650585604881, + "learning_rate": 9.542375377254753e-06, + "loss": 0.238, + "step": 9210 + }, + { + "epoch": 0.53, + "grad_norm": 0.6060688718356312, + "learning_rate": 9.54051642217347e-06, + "loss": 0.399, + "step": 9211 + }, + { + "epoch": 0.53, + "grad_norm": 0.3963007962854568, + "learning_rate": 9.538657483004088e-06, + "loss": 0.3126, + "step": 9212 + }, + { + "epoch": 0.53, + "grad_norm": 0.7595275160558781, + "learning_rate": 9.536798559810978e-06, + "loss": 0.3205, + "step": 9213 + }, + { + "epoch": 0.53, + "grad_norm": 0.3413631598251066, + "learning_rate": 9.53493965265851e-06, + "loss": 0.3021, + "step": 9214 + }, + { + "epoch": 0.53, + "grad_norm": 0.327046762074509, + "learning_rate": 9.533080761611066e-06, + "loss": 0.2814, + "step": 9215 + }, + { + "epoch": 0.53, + "grad_norm": 0.21901892281945962, + "learning_rate": 9.53122188673301e-06, + "loss": 0.1508, + "step": 9216 + }, + { + "epoch": 0.53, + "grad_norm": 1.0974836030044757, + "learning_rate": 9.529363028088725e-06, + "loss": 0.7183, + "step": 9217 + }, + { + "epoch": 0.53, + "grad_norm": 0.34926035459810145, + "learning_rate": 9.52750418574258e-06, + "loss": 0.265, + "step": 9218 + }, + { + "epoch": 0.53, + "grad_norm": 0.3801483295500776, + "learning_rate": 9.525645359758939e-06, + "loss": 0.3289, + "step": 9219 + }, + { + "epoch": 0.53, + "grad_norm": 1.1846574314137546, + "learning_rate": 9.523786550202182e-06, + "loss": 0.3465, + "step": 9220 + }, + { + "epoch": 0.53, + "grad_norm": 0.35928822346783335, + "learning_rate": 9.521927757136673e-06, + "loss": 0.2593, + "step": 9221 + }, + { + "epoch": 0.53, + "grad_norm": 0.25623013450271037, + "learning_rate": 9.520068980626789e-06, + "loss": 0.1978, + "step": 9222 + }, + { + "epoch": 0.53, + "grad_norm": 0.37505431562660235, + "learning_rate": 9.518210220736892e-06, + "loss": 0.2664, + "step": 9223 + }, + { + "epoch": 0.53, + "grad_norm": 0.37891600500710415, + "learning_rate": 9.516351477531357e-06, + "loss": 0.2994, + "step": 9224 + }, + { + "epoch": 0.53, + "grad_norm": 0.9228496278274424, + "learning_rate": 9.51449275107455e-06, + "loss": 0.4677, + "step": 9225 + }, + { + "epoch": 0.53, + "grad_norm": 0.3824480382733829, + "learning_rate": 9.512634041430835e-06, + "loss": 0.2635, + "step": 9226 + }, + { + "epoch": 0.53, + "grad_norm": 0.41175544519260576, + "learning_rate": 9.510775348664584e-06, + "loss": 0.3034, + "step": 9227 + }, + { + "epoch": 0.53, + "grad_norm": 0.26939742805578254, + "learning_rate": 9.508916672840161e-06, + "loss": 0.1751, + "step": 9228 + }, + { + "epoch": 0.53, + "grad_norm": 1.172159570979836, + "learning_rate": 9.507058014021933e-06, + "loss": 0.5465, + "step": 9229 + }, + { + "epoch": 0.53, + "grad_norm": 0.35814475799141565, + "learning_rate": 9.505199372274264e-06, + "loss": 0.27, + "step": 9230 + }, + { + "epoch": 0.53, + "grad_norm": 0.43551068922587277, + "learning_rate": 9.50334074766152e-06, + "loss": 0.3128, + "step": 9231 + }, + { + "epoch": 0.53, + "grad_norm": 0.7961712293430698, + "learning_rate": 9.501482140248064e-06, + "loss": 0.4733, + "step": 9232 + }, + { + "epoch": 0.53, + "grad_norm": 0.3121898207774114, + "learning_rate": 9.499623550098262e-06, + "loss": 0.1957, + "step": 9233 + }, + { + "epoch": 0.53, + "grad_norm": 0.2227896224663921, + "learning_rate": 9.497764977276473e-06, + "loss": 0.2213, + "step": 9234 + }, + { + "epoch": 0.53, + "grad_norm": 1.5631757758484857, + "learning_rate": 9.495906421847063e-06, + "loss": 0.792, + "step": 9235 + }, + { + "epoch": 0.53, + "grad_norm": 0.3107139886169333, + "learning_rate": 9.49404788387439e-06, + "loss": 0.2195, + "step": 9236 + }, + { + "epoch": 0.53, + "grad_norm": 0.7244261519601902, + "learning_rate": 9.492189363422819e-06, + "loss": 0.4441, + "step": 9237 + }, + { + "epoch": 0.53, + "grad_norm": 0.3605582089400721, + "learning_rate": 9.490330860556707e-06, + "loss": 0.3125, + "step": 9238 + }, + { + "epoch": 0.53, + "grad_norm": 0.2873919445850608, + "learning_rate": 9.488472375340417e-06, + "loss": 0.2106, + "step": 9239 + }, + { + "epoch": 0.53, + "grad_norm": 0.39888390668387463, + "learning_rate": 9.486613907838306e-06, + "loss": 0.2769, + "step": 9240 + }, + { + "epoch": 0.53, + "grad_norm": 0.4867865905958677, + "learning_rate": 9.484755458114732e-06, + "loss": 0.3455, + "step": 9241 + }, + { + "epoch": 0.53, + "grad_norm": 0.310940342891282, + "learning_rate": 9.482897026234056e-06, + "loss": 0.2665, + "step": 9242 + }, + { + "epoch": 0.53, + "grad_norm": 0.501526397698931, + "learning_rate": 9.48103861226063e-06, + "loss": 0.3035, + "step": 9243 + }, + { + "epoch": 0.53, + "grad_norm": 0.7521124857164626, + "learning_rate": 9.47918021625882e-06, + "loss": 0.5031, + "step": 9244 + }, + { + "epoch": 0.53, + "grad_norm": 0.393541576127973, + "learning_rate": 9.477321838292972e-06, + "loss": 0.279, + "step": 9245 + }, + { + "epoch": 0.53, + "grad_norm": 0.3212033534001573, + "learning_rate": 9.475463478427451e-06, + "loss": 0.2417, + "step": 9246 + }, + { + "epoch": 0.53, + "grad_norm": 0.7874596505559587, + "learning_rate": 9.473605136726602e-06, + "loss": 0.6043, + "step": 9247 + }, + { + "epoch": 0.53, + "grad_norm": 0.3961777779705941, + "learning_rate": 9.471746813254788e-06, + "loss": 0.2912, + "step": 9248 + }, + { + "epoch": 0.53, + "grad_norm": 0.27725278619850396, + "learning_rate": 9.469888508076357e-06, + "loss": 0.1882, + "step": 9249 + }, + { + "epoch": 0.53, + "grad_norm": 0.3362006708506701, + "learning_rate": 9.468030221255667e-06, + "loss": 0.3169, + "step": 9250 + }, + { + "epoch": 0.53, + "grad_norm": 0.38625457503857924, + "learning_rate": 9.46617195285707e-06, + "loss": 0.2824, + "step": 9251 + }, + { + "epoch": 0.53, + "grad_norm": 0.4559670161202051, + "learning_rate": 9.464313702944912e-06, + "loss": 0.272, + "step": 9252 + }, + { + "epoch": 0.53, + "grad_norm": 0.8027237790991724, + "learning_rate": 9.462455471583545e-06, + "loss": 0.5867, + "step": 9253 + }, + { + "epoch": 0.53, + "grad_norm": 0.2779288614551335, + "learning_rate": 9.460597258837325e-06, + "loss": 0.2458, + "step": 9254 + }, + { + "epoch": 0.53, + "grad_norm": 0.43085408967342126, + "learning_rate": 9.458739064770595e-06, + "loss": 0.351, + "step": 9255 + }, + { + "epoch": 0.53, + "grad_norm": 0.22237328741189238, + "learning_rate": 9.456880889447712e-06, + "loss": 0.093, + "step": 9256 + }, + { + "epoch": 0.53, + "grad_norm": 0.3228147343712859, + "learning_rate": 9.455022732933017e-06, + "loss": 0.2727, + "step": 9257 + }, + { + "epoch": 0.53, + "grad_norm": 0.4869726882315915, + "learning_rate": 9.453164595290865e-06, + "loss": 0.3442, + "step": 9258 + }, + { + "epoch": 0.53, + "grad_norm": 0.4765999794871249, + "learning_rate": 9.451306476585595e-06, + "loss": 0.3115, + "step": 9259 + }, + { + "epoch": 0.53, + "grad_norm": 0.37056859041713597, + "learning_rate": 9.449448376881563e-06, + "loss": 0.2723, + "step": 9260 + }, + { + "epoch": 0.53, + "grad_norm": 0.9036552153671678, + "learning_rate": 9.447590296243106e-06, + "loss": 0.4958, + "step": 9261 + }, + { + "epoch": 0.53, + "grad_norm": 0.21923658796039047, + "learning_rate": 9.445732234734576e-06, + "loss": 0.1774, + "step": 9262 + }, + { + "epoch": 0.53, + "grad_norm": 0.36316978936914124, + "learning_rate": 9.443874192420312e-06, + "loss": 0.2585, + "step": 9263 + }, + { + "epoch": 0.53, + "grad_norm": 0.5077816681609485, + "learning_rate": 9.442016169364664e-06, + "loss": 0.3649, + "step": 9264 + }, + { + "epoch": 0.53, + "grad_norm": 0.33319325267717365, + "learning_rate": 9.440158165631972e-06, + "loss": 0.2898, + "step": 9265 + }, + { + "epoch": 0.53, + "grad_norm": 0.4992737656729488, + "learning_rate": 9.438300181286576e-06, + "loss": 0.284, + "step": 9266 + }, + { + "epoch": 0.53, + "grad_norm": 0.4975012780414002, + "learning_rate": 9.436442216392823e-06, + "loss": 0.4145, + "step": 9267 + }, + { + "epoch": 0.53, + "grad_norm": 0.2694784425314511, + "learning_rate": 9.43458427101505e-06, + "loss": 0.1441, + "step": 9268 + }, + { + "epoch": 0.53, + "grad_norm": 0.3240022249400918, + "learning_rate": 9.4327263452176e-06, + "loss": 0.1771, + "step": 9269 + }, + { + "epoch": 0.53, + "grad_norm": 0.3343235297742924, + "learning_rate": 9.430868439064813e-06, + "loss": 0.3089, + "step": 9270 + }, + { + "epoch": 0.53, + "grad_norm": 0.6910139457529847, + "learning_rate": 9.429010552621027e-06, + "loss": 0.4781, + "step": 9271 + }, + { + "epoch": 0.53, + "grad_norm": 0.35862362126432507, + "learning_rate": 9.42715268595058e-06, + "loss": 0.243, + "step": 9272 + }, + { + "epoch": 0.53, + "grad_norm": 0.49851971643361387, + "learning_rate": 9.425294839117812e-06, + "loss": 0.391, + "step": 9273 + }, + { + "epoch": 0.53, + "grad_norm": 0.23422338164763837, + "learning_rate": 9.423437012187057e-06, + "loss": 0.2075, + "step": 9274 + }, + { + "epoch": 0.53, + "grad_norm": 0.3086926262711173, + "learning_rate": 9.421579205222657e-06, + "loss": 0.2067, + "step": 9275 + }, + { + "epoch": 0.53, + "grad_norm": 1.310073728398527, + "learning_rate": 9.41972141828894e-06, + "loss": 0.6654, + "step": 9276 + }, + { + "epoch": 0.53, + "grad_norm": 0.5841986899196692, + "learning_rate": 9.41786365145025e-06, + "loss": 0.3337, + "step": 9277 + }, + { + "epoch": 0.53, + "grad_norm": 0.2536795044961282, + "learning_rate": 9.416005904770916e-06, + "loss": 0.2111, + "step": 9278 + }, + { + "epoch": 0.53, + "grad_norm": 1.2215967277636537, + "learning_rate": 9.414148178315268e-06, + "loss": 0.8136, + "step": 9279 + }, + { + "epoch": 0.53, + "grad_norm": 0.3058690669119812, + "learning_rate": 9.412290472147648e-06, + "loss": 0.1969, + "step": 9280 + }, + { + "epoch": 0.53, + "grad_norm": 0.7589007743382404, + "learning_rate": 9.41043278633238e-06, + "loss": 0.409, + "step": 9281 + }, + { + "epoch": 0.53, + "grad_norm": 0.3057591447527336, + "learning_rate": 9.408575120933804e-06, + "loss": 0.2498, + "step": 9282 + }, + { + "epoch": 0.53, + "grad_norm": 0.6995045368657311, + "learning_rate": 9.406717476016242e-06, + "loss": 0.4688, + "step": 9283 + }, + { + "epoch": 0.53, + "grad_norm": 0.5984000260809333, + "learning_rate": 9.40485985164403e-06, + "loss": 0.3472, + "step": 9284 + }, + { + "epoch": 0.53, + "grad_norm": 0.375960869641853, + "learning_rate": 9.403002247881499e-06, + "loss": 0.2511, + "step": 9285 + }, + { + "epoch": 0.53, + "grad_norm": 0.2200191456962238, + "learning_rate": 9.40114466479297e-06, + "loss": 0.209, + "step": 9286 + }, + { + "epoch": 0.53, + "grad_norm": 0.5939482634092973, + "learning_rate": 9.399287102442776e-06, + "loss": 0.333, + "step": 9287 + }, + { + "epoch": 0.53, + "grad_norm": 0.3786700230040074, + "learning_rate": 9.397429560895243e-06, + "loss": 0.2799, + "step": 9288 + }, + { + "epoch": 0.53, + "grad_norm": 0.47843379765938265, + "learning_rate": 9.395572040214702e-06, + "loss": 0.3526, + "step": 9289 + }, + { + "epoch": 0.53, + "grad_norm": 0.3725937972585066, + "learning_rate": 9.393714540465474e-06, + "loss": 0.2929, + "step": 9290 + }, + { + "epoch": 0.53, + "grad_norm": 0.34822543851312543, + "learning_rate": 9.391857061711883e-06, + "loss": 0.2587, + "step": 9291 + }, + { + "epoch": 0.53, + "grad_norm": 0.2866067925550406, + "learning_rate": 9.389999604018258e-06, + "loss": 0.1637, + "step": 9292 + }, + { + "epoch": 0.53, + "grad_norm": 0.3738816323207938, + "learning_rate": 9.388142167448917e-06, + "loss": 0.3084, + "step": 9293 + }, + { + "epoch": 0.53, + "grad_norm": 0.29667286372725454, + "learning_rate": 9.38628475206819e-06, + "loss": 0.2788, + "step": 9294 + }, + { + "epoch": 0.53, + "grad_norm": 0.6007497819778793, + "learning_rate": 9.384427357940394e-06, + "loss": 0.3506, + "step": 9295 + }, + { + "epoch": 0.53, + "grad_norm": 0.41917867119031543, + "learning_rate": 9.382569985129854e-06, + "loss": 0.3215, + "step": 9296 + }, + { + "epoch": 0.53, + "grad_norm": 0.5345523972264065, + "learning_rate": 9.380712633700887e-06, + "loss": 0.37, + "step": 9297 + }, + { + "epoch": 0.53, + "grad_norm": 0.20961728479650352, + "learning_rate": 9.378855303717817e-06, + "loss": 0.1741, + "step": 9298 + }, + { + "epoch": 0.53, + "grad_norm": 0.5007677585180572, + "learning_rate": 9.376997995244957e-06, + "loss": 0.3596, + "step": 9299 + }, + { + "epoch": 0.53, + "grad_norm": 0.30960347623118034, + "learning_rate": 9.375140708346634e-06, + "loss": 0.2864, + "step": 9300 + }, + { + "epoch": 0.53, + "grad_norm": 0.3443777763500569, + "learning_rate": 9.373283443087159e-06, + "loss": 0.2645, + "step": 9301 + }, + { + "epoch": 0.53, + "grad_norm": 0.7207968905840805, + "learning_rate": 9.371426199530853e-06, + "loss": 0.4504, + "step": 9302 + }, + { + "epoch": 0.53, + "grad_norm": 0.3588460744152287, + "learning_rate": 9.369568977742028e-06, + "loss": 0.3213, + "step": 9303 + }, + { + "epoch": 0.53, + "grad_norm": 0.6060702782939552, + "learning_rate": 9.367711777785004e-06, + "loss": 0.324, + "step": 9304 + }, + { + "epoch": 0.53, + "grad_norm": 0.3480442813272276, + "learning_rate": 9.365854599724096e-06, + "loss": 0.2939, + "step": 9305 + }, + { + "epoch": 0.53, + "grad_norm": 0.23986228711116675, + "learning_rate": 9.363997443623612e-06, + "loss": 0.253, + "step": 9306 + }, + { + "epoch": 0.53, + "grad_norm": 0.4254434470268579, + "learning_rate": 9.362140309547873e-06, + "loss": 0.2891, + "step": 9307 + }, + { + "epoch": 0.53, + "grad_norm": 0.4305528719251428, + "learning_rate": 9.360283197561185e-06, + "loss": 0.1963, + "step": 9308 + }, + { + "epoch": 0.53, + "grad_norm": 0.32013724856043474, + "learning_rate": 9.358426107727862e-06, + "loss": 0.2812, + "step": 9309 + }, + { + "epoch": 0.53, + "grad_norm": 0.49958189678242726, + "learning_rate": 9.356569040112216e-06, + "loss": 0.351, + "step": 9310 + }, + { + "epoch": 0.53, + "grad_norm": 0.4889885463858359, + "learning_rate": 9.354711994778558e-06, + "loss": 0.2716, + "step": 9311 + }, + { + "epoch": 0.54, + "grad_norm": 0.2855203325007413, + "learning_rate": 9.352854971791192e-06, + "loss": 0.2248, + "step": 9312 + }, + { + "epoch": 0.54, + "grad_norm": 0.27880863601709716, + "learning_rate": 9.350997971214434e-06, + "loss": 0.2507, + "step": 9313 + }, + { + "epoch": 0.54, + "grad_norm": 0.3792599482042661, + "learning_rate": 9.349140993112588e-06, + "loss": 0.2169, + "step": 9314 + }, + { + "epoch": 0.54, + "grad_norm": 0.38684532344974176, + "learning_rate": 9.347284037549962e-06, + "loss": 0.3065, + "step": 9315 + }, + { + "epoch": 0.54, + "grad_norm": 0.8003084968892227, + "learning_rate": 9.34542710459086e-06, + "loss": 0.4174, + "step": 9316 + }, + { + "epoch": 0.54, + "grad_norm": 0.3193010117283392, + "learning_rate": 9.343570194299591e-06, + "loss": 0.2854, + "step": 9317 + }, + { + "epoch": 0.54, + "grad_norm": 0.32379332391684124, + "learning_rate": 9.341713306740457e-06, + "loss": 0.2349, + "step": 9318 + }, + { + "epoch": 0.54, + "grad_norm": 0.30827057912219974, + "learning_rate": 9.339856441977767e-06, + "loss": 0.2344, + "step": 9319 + }, + { + "epoch": 0.54, + "grad_norm": 1.1331119563905843, + "learning_rate": 9.337999600075814e-06, + "loss": 0.6408, + "step": 9320 + }, + { + "epoch": 0.54, + "grad_norm": 0.2831832360300413, + "learning_rate": 9.336142781098908e-06, + "loss": 0.2151, + "step": 9321 + }, + { + "epoch": 0.54, + "grad_norm": 0.5405839352824342, + "learning_rate": 9.33428598511135e-06, + "loss": 0.3459, + "step": 9322 + }, + { + "epoch": 0.54, + "grad_norm": 0.9654227292116724, + "learning_rate": 9.332429212177438e-06, + "loss": 0.4725, + "step": 9323 + }, + { + "epoch": 0.54, + "grad_norm": 0.23482472356001713, + "learning_rate": 9.330572462361474e-06, + "loss": 0.1729, + "step": 9324 + }, + { + "epoch": 0.54, + "grad_norm": 0.4684689393595989, + "learning_rate": 9.328715735727758e-06, + "loss": 0.3859, + "step": 9325 + }, + { + "epoch": 0.54, + "grad_norm": 0.297606052083825, + "learning_rate": 9.326859032340585e-06, + "loss": 0.2425, + "step": 9326 + }, + { + "epoch": 0.54, + "grad_norm": 0.4108345649916392, + "learning_rate": 9.325002352264257e-06, + "loss": 0.2403, + "step": 9327 + }, + { + "epoch": 0.54, + "grad_norm": 0.9941058792175076, + "learning_rate": 9.323145695563067e-06, + "loss": 0.4418, + "step": 9328 + }, + { + "epoch": 0.54, + "grad_norm": 0.32359946359922775, + "learning_rate": 9.321289062301313e-06, + "loss": 0.2928, + "step": 9329 + }, + { + "epoch": 0.54, + "grad_norm": 0.3409617393200946, + "learning_rate": 9.319432452543292e-06, + "loss": 0.2698, + "step": 9330 + }, + { + "epoch": 0.54, + "grad_norm": 0.24943573775102337, + "learning_rate": 9.317575866353293e-06, + "loss": 0.151, + "step": 9331 + }, + { + "epoch": 0.54, + "grad_norm": 0.47506315731861976, + "learning_rate": 9.315719303795614e-06, + "loss": 0.2959, + "step": 9332 + }, + { + "epoch": 0.54, + "grad_norm": 0.36402547396764845, + "learning_rate": 9.313862764934543e-06, + "loss": 0.2759, + "step": 9333 + }, + { + "epoch": 0.54, + "grad_norm": 0.5439363586977312, + "learning_rate": 9.312006249834378e-06, + "loss": 0.2909, + "step": 9334 + }, + { + "epoch": 0.54, + "grad_norm": 0.7236335627092366, + "learning_rate": 9.310149758559405e-06, + "loss": 0.3895, + "step": 9335 + }, + { + "epoch": 0.54, + "grad_norm": 0.36068005871087383, + "learning_rate": 9.30829329117392e-06, + "loss": 0.2747, + "step": 9336 + }, + { + "epoch": 0.54, + "grad_norm": 0.3165484712620228, + "learning_rate": 9.306436847742203e-06, + "loss": 0.2673, + "step": 9337 + }, + { + "epoch": 0.54, + "grad_norm": 0.38534131183340886, + "learning_rate": 9.304580428328552e-06, + "loss": 0.2414, + "step": 9338 + }, + { + "epoch": 0.54, + "grad_norm": 0.3115474788513869, + "learning_rate": 9.30272403299725e-06, + "loss": 0.2647, + "step": 9339 + }, + { + "epoch": 0.54, + "grad_norm": 0.3285219921891418, + "learning_rate": 9.300867661812585e-06, + "loss": 0.1915, + "step": 9340 + }, + { + "epoch": 0.54, + "grad_norm": 0.40017987691315415, + "learning_rate": 9.29901131483884e-06, + "loss": 0.3231, + "step": 9341 + }, + { + "epoch": 0.54, + "grad_norm": 0.34327243727753615, + "learning_rate": 9.297154992140307e-06, + "loss": 0.2567, + "step": 9342 + }, + { + "epoch": 0.54, + "grad_norm": 1.2467554100617364, + "learning_rate": 9.295298693781267e-06, + "loss": 0.7486, + "step": 9343 + }, + { + "epoch": 0.54, + "grad_norm": 0.3513490054263849, + "learning_rate": 9.293442419825998e-06, + "loss": 0.2287, + "step": 9344 + }, + { + "epoch": 0.54, + "grad_norm": 0.29446477079644806, + "learning_rate": 9.291586170338793e-06, + "loss": 0.2604, + "step": 9345 + }, + { + "epoch": 0.54, + "grad_norm": 0.3906785495034913, + "learning_rate": 9.289729945383924e-06, + "loss": 0.2681, + "step": 9346 + }, + { + "epoch": 0.54, + "grad_norm": 0.693760446359131, + "learning_rate": 9.28787374502568e-06, + "loss": 0.2316, + "step": 9347 + }, + { + "epoch": 0.54, + "grad_norm": 0.34998798570881384, + "learning_rate": 9.286017569328334e-06, + "loss": 0.2665, + "step": 9348 + }, + { + "epoch": 0.54, + "grad_norm": 0.3792652287586318, + "learning_rate": 9.284161418356171e-06, + "loss": 0.3214, + "step": 9349 + }, + { + "epoch": 0.54, + "grad_norm": 0.6013268840193028, + "learning_rate": 9.282305292173467e-06, + "loss": 0.3168, + "step": 9350 + }, + { + "epoch": 0.54, + "grad_norm": 0.40681134523342405, + "learning_rate": 9.280449190844501e-06, + "loss": 0.2882, + "step": 9351 + }, + { + "epoch": 0.54, + "grad_norm": 0.32759946772814447, + "learning_rate": 9.278593114433547e-06, + "loss": 0.2228, + "step": 9352 + }, + { + "epoch": 0.54, + "grad_norm": 0.38246028250584957, + "learning_rate": 9.276737063004884e-06, + "loss": 0.2469, + "step": 9353 + }, + { + "epoch": 0.54, + "grad_norm": 0.3570238331886624, + "learning_rate": 9.274881036622785e-06, + "loss": 0.2766, + "step": 9354 + }, + { + "epoch": 0.54, + "grad_norm": 0.8536521626653547, + "learning_rate": 9.273025035351526e-06, + "loss": 0.5547, + "step": 9355 + }, + { + "epoch": 0.54, + "grad_norm": 0.6642206879063963, + "learning_rate": 9.271169059255376e-06, + "loss": 0.4194, + "step": 9356 + }, + { + "epoch": 0.54, + "grad_norm": 0.2544235207525964, + "learning_rate": 9.269313108398611e-06, + "loss": 0.2227, + "step": 9357 + }, + { + "epoch": 0.54, + "grad_norm": 0.27520770254521076, + "learning_rate": 9.2674571828455e-06, + "loss": 0.1867, + "step": 9358 + }, + { + "epoch": 0.54, + "grad_norm": 0.9026816205634214, + "learning_rate": 9.265601282660318e-06, + "loss": 0.5432, + "step": 9359 + }, + { + "epoch": 0.54, + "grad_norm": 0.3427639614610134, + "learning_rate": 9.263745407907329e-06, + "loss": 0.2059, + "step": 9360 + }, + { + "epoch": 0.54, + "grad_norm": 0.4234555347458187, + "learning_rate": 9.261889558650809e-06, + "loss": 0.3324, + "step": 9361 + }, + { + "epoch": 0.54, + "grad_norm": 1.0330872273748664, + "learning_rate": 9.260033734955018e-06, + "loss": 0.421, + "step": 9362 + }, + { + "epoch": 0.54, + "grad_norm": 0.3320606941363923, + "learning_rate": 9.25817793688423e-06, + "loss": 0.2124, + "step": 9363 + }, + { + "epoch": 0.54, + "grad_norm": 0.32162030886140774, + "learning_rate": 9.256322164502704e-06, + "loss": 0.1961, + "step": 9364 + }, + { + "epoch": 0.54, + "grad_norm": 0.3775542765593719, + "learning_rate": 9.254466417874714e-06, + "loss": 0.3093, + "step": 9365 + }, + { + "epoch": 0.54, + "grad_norm": 0.3245176121995681, + "learning_rate": 9.252610697064516e-06, + "loss": 0.1544, + "step": 9366 + }, + { + "epoch": 0.54, + "grad_norm": 0.6532298265771281, + "learning_rate": 9.25075500213638e-06, + "loss": 0.3814, + "step": 9367 + }, + { + "epoch": 0.54, + "grad_norm": 0.39797220000426015, + "learning_rate": 9.248899333154565e-06, + "loss": 0.3394, + "step": 9368 + }, + { + "epoch": 0.54, + "grad_norm": 0.3901046536766521, + "learning_rate": 9.247043690183334e-06, + "loss": 0.2712, + "step": 9369 + }, + { + "epoch": 0.54, + "grad_norm": 0.2983187384453381, + "learning_rate": 9.245188073286949e-06, + "loss": 0.1997, + "step": 9370 + }, + { + "epoch": 0.54, + "grad_norm": 0.4306936639740022, + "learning_rate": 9.243332482529665e-06, + "loss": 0.3333, + "step": 9371 + }, + { + "epoch": 0.54, + "grad_norm": 0.4124403701710802, + "learning_rate": 9.241476917975748e-06, + "loss": 0.2959, + "step": 9372 + }, + { + "epoch": 0.54, + "grad_norm": 0.3857979000578478, + "learning_rate": 9.239621379689452e-06, + "loss": 0.2773, + "step": 9373 + }, + { + "epoch": 0.54, + "grad_norm": 0.7877835736477719, + "learning_rate": 9.237765867735035e-06, + "loss": 0.5544, + "step": 9374 + }, + { + "epoch": 0.54, + "grad_norm": 0.3805108940267185, + "learning_rate": 9.235910382176751e-06, + "loss": 0.2811, + "step": 9375 + }, + { + "epoch": 0.54, + "grad_norm": 0.2183372803331831, + "learning_rate": 9.234054923078862e-06, + "loss": 0.1758, + "step": 9376 + }, + { + "epoch": 0.54, + "grad_norm": 0.44649442283170376, + "learning_rate": 9.232199490505613e-06, + "loss": 0.3603, + "step": 9377 + }, + { + "epoch": 0.54, + "grad_norm": 0.3408016275041556, + "learning_rate": 9.230344084521266e-06, + "loss": 0.2609, + "step": 9378 + }, + { + "epoch": 0.54, + "grad_norm": 0.8404217815307761, + "learning_rate": 9.228488705190069e-06, + "loss": 0.3583, + "step": 9379 + }, + { + "epoch": 0.54, + "grad_norm": 0.34473996252645167, + "learning_rate": 9.226633352576276e-06, + "loss": 0.3451, + "step": 9380 + }, + { + "epoch": 0.54, + "grad_norm": 0.3742015521440866, + "learning_rate": 9.224778026744135e-06, + "loss": 0.2612, + "step": 9381 + }, + { + "epoch": 0.54, + "grad_norm": 0.9236809927403062, + "learning_rate": 9.222922727757899e-06, + "loss": 0.5384, + "step": 9382 + }, + { + "epoch": 0.54, + "grad_norm": 0.2849500682429431, + "learning_rate": 9.221067455681817e-06, + "loss": 0.1571, + "step": 9383 + }, + { + "epoch": 0.54, + "grad_norm": 0.4049939669342111, + "learning_rate": 9.219212210580132e-06, + "loss": 0.318, + "step": 9384 + }, + { + "epoch": 0.54, + "grad_norm": 0.29913254424780833, + "learning_rate": 9.217356992517097e-06, + "loss": 0.2886, + "step": 9385 + }, + { + "epoch": 0.54, + "grad_norm": 1.0583760086865348, + "learning_rate": 9.215501801556954e-06, + "loss": 0.4293, + "step": 9386 + }, + { + "epoch": 0.54, + "grad_norm": 0.61751445391918, + "learning_rate": 9.213646637763954e-06, + "loss": 0.3031, + "step": 9387 + }, + { + "epoch": 0.54, + "grad_norm": 0.4187489970847992, + "learning_rate": 9.211791501202333e-06, + "loss": 0.3314, + "step": 9388 + }, + { + "epoch": 0.54, + "grad_norm": 0.2328530107323132, + "learning_rate": 9.209936391936339e-06, + "loss": 0.2078, + "step": 9389 + }, + { + "epoch": 0.54, + "grad_norm": 0.42827604942936515, + "learning_rate": 9.208081310030216e-06, + "loss": 0.273, + "step": 9390 + }, + { + "epoch": 0.54, + "grad_norm": 0.5354159507363013, + "learning_rate": 9.2062262555482e-06, + "loss": 0.3933, + "step": 9391 + }, + { + "epoch": 0.54, + "grad_norm": 0.4463089490948037, + "learning_rate": 9.204371228554538e-06, + "loss": 0.2974, + "step": 9392 + }, + { + "epoch": 0.54, + "grad_norm": 0.3010100233617082, + "learning_rate": 9.202516229113462e-06, + "loss": 0.2662, + "step": 9393 + }, + { + "epoch": 0.54, + "grad_norm": 0.508972715684767, + "learning_rate": 9.200661257289217e-06, + "loss": 0.3506, + "step": 9394 + }, + { + "epoch": 0.54, + "grad_norm": 0.3217573305101514, + "learning_rate": 9.19880631314604e-06, + "loss": 0.2201, + "step": 9395 + }, + { + "epoch": 0.54, + "grad_norm": 0.29336545819378834, + "learning_rate": 9.196951396748164e-06, + "loss": 0.2371, + "step": 9396 + }, + { + "epoch": 0.54, + "grad_norm": 0.4756760206599183, + "learning_rate": 9.195096508159826e-06, + "loss": 0.323, + "step": 9397 + }, + { + "epoch": 0.54, + "grad_norm": 1.0804796916074983, + "learning_rate": 9.193241647445262e-06, + "loss": 0.77, + "step": 9398 + }, + { + "epoch": 0.54, + "grad_norm": 0.3241278568868132, + "learning_rate": 9.191386814668704e-06, + "loss": 0.201, + "step": 9399 + }, + { + "epoch": 0.54, + "grad_norm": 1.8403532514129441, + "learning_rate": 9.189532009894387e-06, + "loss": 0.7033, + "step": 9400 + }, + { + "epoch": 0.54, + "grad_norm": 0.32997349588142116, + "learning_rate": 9.187677233186541e-06, + "loss": 0.318, + "step": 9401 + }, + { + "epoch": 0.54, + "grad_norm": 0.37909775087026976, + "learning_rate": 9.185822484609397e-06, + "loss": 0.2387, + "step": 9402 + }, + { + "epoch": 0.54, + "grad_norm": 0.26048756876240237, + "learning_rate": 9.183967764227188e-06, + "loss": 0.2072, + "step": 9403 + }, + { + "epoch": 0.54, + "grad_norm": 0.3638139283443146, + "learning_rate": 9.182113072104137e-06, + "loss": 0.3321, + "step": 9404 + }, + { + "epoch": 0.54, + "grad_norm": 0.7221700317212014, + "learning_rate": 9.180258408304478e-06, + "loss": 0.3916, + "step": 9405 + }, + { + "epoch": 0.54, + "grad_norm": 0.3277302216404148, + "learning_rate": 9.178403772892433e-06, + "loss": 0.2227, + "step": 9406 + }, + { + "epoch": 0.54, + "grad_norm": 0.6393616769102479, + "learning_rate": 9.176549165932231e-06, + "loss": 0.3885, + "step": 9407 + }, + { + "epoch": 0.54, + "grad_norm": 0.3951954935016409, + "learning_rate": 9.174694587488097e-06, + "loss": 0.2902, + "step": 9408 + }, + { + "epoch": 0.54, + "grad_norm": 0.22636610131498916, + "learning_rate": 9.17284003762425e-06, + "loss": 0.1794, + "step": 9409 + }, + { + "epoch": 0.54, + "grad_norm": 1.2675540511328809, + "learning_rate": 9.170985516404922e-06, + "loss": 0.7407, + "step": 9410 + }, + { + "epoch": 0.54, + "grad_norm": 0.59893272808341, + "learning_rate": 9.169131023894325e-06, + "loss": 0.2856, + "step": 9411 + }, + { + "epoch": 0.54, + "grad_norm": 0.28230999589030675, + "learning_rate": 9.16727656015669e-06, + "loss": 0.2509, + "step": 9412 + }, + { + "epoch": 0.54, + "grad_norm": 0.8586813691539498, + "learning_rate": 9.165422125256228e-06, + "loss": 0.4289, + "step": 9413 + }, + { + "epoch": 0.54, + "grad_norm": 0.5314017571523516, + "learning_rate": 9.163567719257164e-06, + "loss": 0.3422, + "step": 9414 + }, + { + "epoch": 0.54, + "grad_norm": 0.31946600380526713, + "learning_rate": 9.161713342223711e-06, + "loss": 0.2, + "step": 9415 + }, + { + "epoch": 0.54, + "grad_norm": 0.3639625388147878, + "learning_rate": 9.159858994220092e-06, + "loss": 0.3099, + "step": 9416 + }, + { + "epoch": 0.54, + "grad_norm": 0.2937050738295473, + "learning_rate": 9.15800467531052e-06, + "loss": 0.1719, + "step": 9417 + }, + { + "epoch": 0.54, + "grad_norm": 0.42166472914570036, + "learning_rate": 9.156150385559208e-06, + "loss": 0.3034, + "step": 9418 + }, + { + "epoch": 0.54, + "grad_norm": 1.4713264661584862, + "learning_rate": 9.154296125030371e-06, + "loss": 0.3491, + "step": 9419 + }, + { + "epoch": 0.54, + "grad_norm": 0.28848784892017665, + "learning_rate": 9.15244189378823e-06, + "loss": 0.2629, + "step": 9420 + }, + { + "epoch": 0.54, + "grad_norm": 0.5160241916906537, + "learning_rate": 9.150587691896984e-06, + "loss": 0.344, + "step": 9421 + }, + { + "epoch": 0.54, + "grad_norm": 0.183426975433477, + "learning_rate": 9.14873351942085e-06, + "loss": 0.1326, + "step": 9422 + }, + { + "epoch": 0.54, + "grad_norm": 0.7740940117037703, + "learning_rate": 9.146879376424037e-06, + "loss": 0.3699, + "step": 9423 + }, + { + "epoch": 0.54, + "grad_norm": 0.36265406185090515, + "learning_rate": 9.145025262970757e-06, + "loss": 0.2904, + "step": 9424 + }, + { + "epoch": 0.54, + "grad_norm": 0.7197643231905059, + "learning_rate": 9.143171179125212e-06, + "loss": 0.2937, + "step": 9425 + }, + { + "epoch": 0.54, + "grad_norm": 0.6953661847377259, + "learning_rate": 9.141317124951613e-06, + "loss": 0.349, + "step": 9426 + }, + { + "epoch": 0.54, + "grad_norm": 0.32798387026370746, + "learning_rate": 9.139463100514166e-06, + "loss": 0.2851, + "step": 9427 + }, + { + "epoch": 0.54, + "grad_norm": 0.3583724552430915, + "learning_rate": 9.137609105877075e-06, + "loss": 0.2518, + "step": 9428 + }, + { + "epoch": 0.54, + "grad_norm": 0.21343072518400563, + "learning_rate": 9.135755141104544e-06, + "loss": 0.1528, + "step": 9429 + }, + { + "epoch": 0.54, + "grad_norm": 0.4644618331947836, + "learning_rate": 9.133901206260773e-06, + "loss": 0.3436, + "step": 9430 + }, + { + "epoch": 0.54, + "grad_norm": 0.8170445704875847, + "learning_rate": 9.132047301409968e-06, + "loss": 0.4853, + "step": 9431 + }, + { + "epoch": 0.54, + "grad_norm": 0.27928105311252444, + "learning_rate": 9.130193426616327e-06, + "loss": 0.2082, + "step": 9432 + }, + { + "epoch": 0.54, + "grad_norm": 0.41359155985409485, + "learning_rate": 9.12833958194405e-06, + "loss": 0.31, + "step": 9433 + }, + { + "epoch": 0.54, + "grad_norm": 1.1546078883310094, + "learning_rate": 9.126485767457336e-06, + "loss": 0.7811, + "step": 9434 + }, + { + "epoch": 0.54, + "grad_norm": 0.3413134026821165, + "learning_rate": 9.124631983220384e-06, + "loss": 0.1997, + "step": 9435 + }, + { + "epoch": 0.54, + "grad_norm": 0.3295985332780704, + "learning_rate": 9.122778229297387e-06, + "loss": 0.2877, + "step": 9436 + }, + { + "epoch": 0.54, + "grad_norm": 0.3604904526688763, + "learning_rate": 9.120924505752543e-06, + "loss": 0.2817, + "step": 9437 + }, + { + "epoch": 0.54, + "grad_norm": 0.45457565674240885, + "learning_rate": 9.119070812650044e-06, + "loss": 0.155, + "step": 9438 + }, + { + "epoch": 0.54, + "grad_norm": 0.4349575002134113, + "learning_rate": 9.117217150054087e-06, + "loss": 0.3293, + "step": 9439 + }, + { + "epoch": 0.54, + "grad_norm": 0.40328109153502684, + "learning_rate": 9.115363518028858e-06, + "loss": 0.3267, + "step": 9440 + }, + { + "epoch": 0.54, + "grad_norm": 0.7613078562304955, + "learning_rate": 9.113509916638557e-06, + "loss": 0.2908, + "step": 9441 + }, + { + "epoch": 0.54, + "grad_norm": 0.27332115307343685, + "learning_rate": 9.111656345947367e-06, + "loss": 0.2487, + "step": 9442 + }, + { + "epoch": 0.54, + "grad_norm": 0.4121158185394958, + "learning_rate": 9.10980280601948e-06, + "loss": 0.2577, + "step": 9443 + }, + { + "epoch": 0.54, + "grad_norm": 0.34272848135119255, + "learning_rate": 9.107949296919084e-06, + "loss": 0.3144, + "step": 9444 + }, + { + "epoch": 0.54, + "grad_norm": 0.3178424358331697, + "learning_rate": 9.106095818710367e-06, + "loss": 0.2214, + "step": 9445 + }, + { + "epoch": 0.54, + "grad_norm": 0.8274851482459838, + "learning_rate": 9.10424237145751e-06, + "loss": 0.4761, + "step": 9446 + }, + { + "epoch": 0.54, + "grad_norm": 0.4605336475712775, + "learning_rate": 9.102388955224703e-06, + "loss": 0.3781, + "step": 9447 + }, + { + "epoch": 0.54, + "grad_norm": 0.2601899726482571, + "learning_rate": 9.10053557007613e-06, + "loss": 0.2162, + "step": 9448 + }, + { + "epoch": 0.54, + "grad_norm": 0.2737288914687979, + "learning_rate": 9.098682216075968e-06, + "loss": 0.1548, + "step": 9449 + }, + { + "epoch": 0.54, + "grad_norm": 0.7389906969176646, + "learning_rate": 9.096828893288404e-06, + "loss": 0.3916, + "step": 9450 + }, + { + "epoch": 0.54, + "grad_norm": 0.35483877986484735, + "learning_rate": 9.094975601777615e-06, + "loss": 0.231, + "step": 9451 + }, + { + "epoch": 0.54, + "grad_norm": 0.38272983487780293, + "learning_rate": 9.093122341607782e-06, + "loss": 0.3315, + "step": 9452 + }, + { + "epoch": 0.54, + "grad_norm": 0.7223595246219131, + "learning_rate": 9.091269112843084e-06, + "loss": 0.4254, + "step": 9453 + }, + { + "epoch": 0.54, + "grad_norm": 0.34796801665638943, + "learning_rate": 9.089415915547702e-06, + "loss": 0.2188, + "step": 9454 + }, + { + "epoch": 0.54, + "grad_norm": 0.30394563246761563, + "learning_rate": 9.087562749785805e-06, + "loss": 0.1796, + "step": 9455 + }, + { + "epoch": 0.54, + "grad_norm": 0.3333343056858731, + "learning_rate": 9.085709615621567e-06, + "loss": 0.2878, + "step": 9456 + }, + { + "epoch": 0.54, + "grad_norm": 0.3613631132845819, + "learning_rate": 9.083856513119169e-06, + "loss": 0.2873, + "step": 9457 + }, + { + "epoch": 0.54, + "grad_norm": 0.6852597899585396, + "learning_rate": 9.082003442342779e-06, + "loss": 0.3644, + "step": 9458 + }, + { + "epoch": 0.54, + "grad_norm": 0.4615449689732116, + "learning_rate": 9.080150403356571e-06, + "loss": 0.3572, + "step": 9459 + }, + { + "epoch": 0.54, + "grad_norm": 0.27980817096660626, + "learning_rate": 9.078297396224716e-06, + "loss": 0.2563, + "step": 9460 + }, + { + "epoch": 0.54, + "grad_norm": 0.25145383400299104, + "learning_rate": 9.07644442101138e-06, + "loss": 0.1177, + "step": 9461 + }, + { + "epoch": 0.54, + "grad_norm": 1.4001380538324593, + "learning_rate": 9.074591477780736e-06, + "loss": 0.7624, + "step": 9462 + }, + { + "epoch": 0.54, + "grad_norm": 0.322723513333794, + "learning_rate": 9.072738566596948e-06, + "loss": 0.2826, + "step": 9463 + }, + { + "epoch": 0.54, + "grad_norm": 0.3481210569513449, + "learning_rate": 9.070885687524184e-06, + "loss": 0.2795, + "step": 9464 + }, + { + "epoch": 0.54, + "grad_norm": 1.013606524113796, + "learning_rate": 9.069032840626608e-06, + "loss": 0.6801, + "step": 9465 + }, + { + "epoch": 0.54, + "grad_norm": 0.3272263147205583, + "learning_rate": 9.067180025968387e-06, + "loss": 0.2762, + "step": 9466 + }, + { + "epoch": 0.54, + "grad_norm": 0.20755738352285572, + "learning_rate": 9.065327243613679e-06, + "loss": 0.0841, + "step": 9467 + }, + { + "epoch": 0.54, + "grad_norm": 0.35855047633435044, + "learning_rate": 9.06347449362665e-06, + "loss": 0.3212, + "step": 9468 + }, + { + "epoch": 0.54, + "grad_norm": 0.3261481520927009, + "learning_rate": 9.061621776071458e-06, + "loss": 0.2756, + "step": 9469 + }, + { + "epoch": 0.54, + "grad_norm": 0.6254025257211152, + "learning_rate": 9.059769091012265e-06, + "loss": 0.3968, + "step": 9470 + }, + { + "epoch": 0.54, + "grad_norm": 0.3530171497321099, + "learning_rate": 9.057916438513226e-06, + "loss": 0.2633, + "step": 9471 + }, + { + "epoch": 0.54, + "grad_norm": 0.36080066395287175, + "learning_rate": 9.056063818638502e-06, + "loss": 0.3188, + "step": 9472 + }, + { + "epoch": 0.54, + "grad_norm": 0.2664798556440693, + "learning_rate": 9.054211231452248e-06, + "loss": 0.2171, + "step": 9473 + }, + { + "epoch": 0.54, + "grad_norm": 0.5965216255280918, + "learning_rate": 9.052358677018615e-06, + "loss": 0.2632, + "step": 9474 + }, + { + "epoch": 0.54, + "grad_norm": 0.376392912726472, + "learning_rate": 9.050506155401764e-06, + "loss": 0.2729, + "step": 9475 + }, + { + "epoch": 0.54, + "grad_norm": 0.3404934744042724, + "learning_rate": 9.048653666665841e-06, + "loss": 0.3248, + "step": 9476 + }, + { + "epoch": 0.54, + "grad_norm": 1.040043861261599, + "learning_rate": 9.046801210875002e-06, + "loss": 0.4977, + "step": 9477 + }, + { + "epoch": 0.54, + "grad_norm": 0.3204967432613155, + "learning_rate": 9.044948788093396e-06, + "loss": 0.2685, + "step": 9478 + }, + { + "epoch": 0.54, + "grad_norm": 0.22589665188179028, + "learning_rate": 9.043096398385174e-06, + "loss": 0.208, + "step": 9479 + }, + { + "epoch": 0.54, + "grad_norm": 0.4867779438407952, + "learning_rate": 9.041244041814479e-06, + "loss": 0.3523, + "step": 9480 + }, + { + "epoch": 0.54, + "grad_norm": 0.3340647249545658, + "learning_rate": 9.039391718445466e-06, + "loss": 0.2391, + "step": 9481 + }, + { + "epoch": 0.54, + "grad_norm": 0.8060812933217341, + "learning_rate": 9.03753942834227e-06, + "loss": 0.5286, + "step": 9482 + }, + { + "epoch": 0.54, + "grad_norm": 0.518908014472298, + "learning_rate": 9.03568717156905e-06, + "loss": 0.3698, + "step": 9483 + }, + { + "epoch": 0.54, + "grad_norm": 0.28165729999453204, + "learning_rate": 9.033834948189936e-06, + "loss": 0.2253, + "step": 9484 + }, + { + "epoch": 0.54, + "grad_norm": 1.099977564464965, + "learning_rate": 9.031982758269078e-06, + "loss": 0.626, + "step": 9485 + }, + { + "epoch": 0.55, + "grad_norm": 0.40518128341349524, + "learning_rate": 9.030130601870615e-06, + "loss": 0.2398, + "step": 9486 + }, + { + "epoch": 0.55, + "grad_norm": 0.27175186242628224, + "learning_rate": 9.02827847905869e-06, + "loss": 0.2198, + "step": 9487 + }, + { + "epoch": 0.55, + "grad_norm": 0.47784160961695693, + "learning_rate": 9.02642638989744e-06, + "loss": 0.3479, + "step": 9488 + }, + { + "epoch": 0.55, + "grad_norm": 1.1214209878128412, + "learning_rate": 9.024574334451002e-06, + "loss": 0.7194, + "step": 9489 + }, + { + "epoch": 0.55, + "grad_norm": 0.32215965041453914, + "learning_rate": 9.02272231278351e-06, + "loss": 0.1616, + "step": 9490 + }, + { + "epoch": 0.55, + "grad_norm": 0.28186208190846546, + "learning_rate": 9.020870324959103e-06, + "loss": 0.2528, + "step": 9491 + }, + { + "epoch": 0.55, + "grad_norm": 0.4286246139050462, + "learning_rate": 9.019018371041914e-06, + "loss": 0.3669, + "step": 9492 + }, + { + "epoch": 0.55, + "grad_norm": 0.433043538846795, + "learning_rate": 9.017166451096077e-06, + "loss": 0.2856, + "step": 9493 + }, + { + "epoch": 0.55, + "grad_norm": 0.252036692216471, + "learning_rate": 9.015314565185724e-06, + "loss": 0.1777, + "step": 9494 + }, + { + "epoch": 0.55, + "grad_norm": 0.3567792880638821, + "learning_rate": 9.013462713374986e-06, + "loss": 0.3031, + "step": 9495 + }, + { + "epoch": 0.55, + "grad_norm": 0.4525301068269673, + "learning_rate": 9.01161089572799e-06, + "loss": 0.3151, + "step": 9496 + }, + { + "epoch": 0.55, + "grad_norm": 0.4602195249201141, + "learning_rate": 9.009759112308867e-06, + "loss": 0.2832, + "step": 9497 + }, + { + "epoch": 0.55, + "grad_norm": 0.6256387629393889, + "learning_rate": 9.007907363181742e-06, + "loss": 0.4464, + "step": 9498 + }, + { + "epoch": 0.55, + "grad_norm": 0.2897790404691268, + "learning_rate": 9.006055648410745e-06, + "loss": 0.2242, + "step": 9499 + }, + { + "epoch": 0.55, + "grad_norm": 0.2947409062044821, + "learning_rate": 9.004203968059997e-06, + "loss": 0.203, + "step": 9500 + }, + { + "epoch": 0.55, + "grad_norm": 0.8779701484144261, + "learning_rate": 9.002352322193622e-06, + "loss": 0.5903, + "step": 9501 + }, + { + "epoch": 0.55, + "grad_norm": 0.31908124321909886, + "learning_rate": 9.000500710875746e-06, + "loss": 0.2542, + "step": 9502 + }, + { + "epoch": 0.55, + "grad_norm": 0.450366955751652, + "learning_rate": 8.998649134170484e-06, + "loss": 0.2701, + "step": 9503 + }, + { + "epoch": 0.55, + "grad_norm": 0.5082350399387127, + "learning_rate": 8.996797592141962e-06, + "loss": 0.337, + "step": 9504 + }, + { + "epoch": 0.55, + "grad_norm": 0.2484863804487214, + "learning_rate": 8.994946084854294e-06, + "loss": 0.2122, + "step": 9505 + }, + { + "epoch": 0.55, + "grad_norm": 0.3488933962122594, + "learning_rate": 8.9930946123716e-06, + "loss": 0.2352, + "step": 9506 + }, + { + "epoch": 0.55, + "grad_norm": 0.3152421095593874, + "learning_rate": 8.991243174757997e-06, + "loss": 0.2676, + "step": 9507 + }, + { + "epoch": 0.55, + "grad_norm": 0.4266176579389591, + "learning_rate": 8.9893917720776e-06, + "loss": 0.3114, + "step": 9508 + }, + { + "epoch": 0.55, + "grad_norm": 0.4938255856927612, + "learning_rate": 8.987540404394521e-06, + "loss": 0.3598, + "step": 9509 + }, + { + "epoch": 0.55, + "grad_norm": 0.3394737874270989, + "learning_rate": 8.985689071772877e-06, + "loss": 0.2153, + "step": 9510 + }, + { + "epoch": 0.55, + "grad_norm": 0.37853025059595635, + "learning_rate": 8.983837774276774e-06, + "loss": 0.2724, + "step": 9511 + }, + { + "epoch": 0.55, + "grad_norm": 0.2888364011937344, + "learning_rate": 8.981986511970327e-06, + "loss": 0.2864, + "step": 9512 + }, + { + "epoch": 0.55, + "grad_norm": 0.28390144484902724, + "learning_rate": 8.980135284917644e-06, + "loss": 0.1234, + "step": 9513 + }, + { + "epoch": 0.55, + "grad_norm": 0.35464412918329336, + "learning_rate": 8.97828409318283e-06, + "loss": 0.2683, + "step": 9514 + }, + { + "epoch": 0.55, + "grad_norm": 0.286350130274255, + "learning_rate": 8.976432936829995e-06, + "loss": 0.2999, + "step": 9515 + }, + { + "epoch": 0.55, + "grad_norm": 0.6943883642781716, + "learning_rate": 8.974581815923242e-06, + "loss": 0.3359, + "step": 9516 + }, + { + "epoch": 0.55, + "grad_norm": 0.40022381769441084, + "learning_rate": 8.972730730526679e-06, + "loss": 0.243, + "step": 9517 + }, + { + "epoch": 0.55, + "grad_norm": 0.5276954077075993, + "learning_rate": 8.970879680704404e-06, + "loss": 0.4043, + "step": 9518 + }, + { + "epoch": 0.55, + "grad_norm": 0.28500783585719586, + "learning_rate": 8.969028666520524e-06, + "loss": 0.2542, + "step": 9519 + }, + { + "epoch": 0.55, + "grad_norm": 0.24384593736610496, + "learning_rate": 8.967177688039135e-06, + "loss": 0.1619, + "step": 9520 + }, + { + "epoch": 0.55, + "grad_norm": 0.48283497815577264, + "learning_rate": 8.96532674532434e-06, + "loss": 0.3448, + "step": 9521 + }, + { + "epoch": 0.55, + "grad_norm": 0.8189859030284328, + "learning_rate": 8.963475838440237e-06, + "loss": 0.4574, + "step": 9522 + }, + { + "epoch": 0.55, + "grad_norm": 0.2617952258223295, + "learning_rate": 8.961624967450917e-06, + "loss": 0.2188, + "step": 9523 + }, + { + "epoch": 0.55, + "grad_norm": 0.48019246869526944, + "learning_rate": 8.959774132420481e-06, + "loss": 0.3712, + "step": 9524 + }, + { + "epoch": 0.55, + "grad_norm": 0.29000128064258673, + "learning_rate": 8.957923333413024e-06, + "loss": 0.2129, + "step": 9525 + }, + { + "epoch": 0.55, + "grad_norm": 0.526363917290331, + "learning_rate": 8.956072570492635e-06, + "loss": 0.2352, + "step": 9526 + }, + { + "epoch": 0.55, + "grad_norm": 0.2704870327838035, + "learning_rate": 8.954221843723409e-06, + "loss": 0.2755, + "step": 9527 + }, + { + "epoch": 0.55, + "grad_norm": 1.0257404142025073, + "learning_rate": 8.952371153169435e-06, + "loss": 0.4987, + "step": 9528 + }, + { + "epoch": 0.55, + "grad_norm": 0.2940447139600711, + "learning_rate": 8.950520498894803e-06, + "loss": 0.1177, + "step": 9529 + }, + { + "epoch": 0.55, + "grad_norm": 0.38604536877936113, + "learning_rate": 8.9486698809636e-06, + "loss": 0.3443, + "step": 9530 + }, + { + "epoch": 0.55, + "grad_norm": 0.33418039366402746, + "learning_rate": 8.946819299439915e-06, + "loss": 0.3424, + "step": 9531 + }, + { + "epoch": 0.55, + "grad_norm": 0.5842996172246837, + "learning_rate": 8.944968754387832e-06, + "loss": 0.3918, + "step": 9532 + }, + { + "epoch": 0.55, + "grad_norm": 0.20671343129546188, + "learning_rate": 8.943118245871437e-06, + "loss": 0.1549, + "step": 9533 + }, + { + "epoch": 0.55, + "grad_norm": 0.8778299994060808, + "learning_rate": 8.941267773954809e-06, + "loss": 0.475, + "step": 9534 + }, + { + "epoch": 0.55, + "grad_norm": 0.3016216513617832, + "learning_rate": 8.939417338702034e-06, + "loss": 0.285, + "step": 9535 + }, + { + "epoch": 0.55, + "grad_norm": 0.3695515460906184, + "learning_rate": 8.93756694017719e-06, + "loss": 0.2673, + "step": 9536 + }, + { + "epoch": 0.55, + "grad_norm": 0.7614491699387651, + "learning_rate": 8.935716578444358e-06, + "loss": 0.4398, + "step": 9537 + }, + { + "epoch": 0.55, + "grad_norm": 0.341722836767795, + "learning_rate": 8.933866253567615e-06, + "loss": 0.2352, + "step": 9538 + }, + { + "epoch": 0.55, + "grad_norm": 0.25011501068791103, + "learning_rate": 8.932015965611039e-06, + "loss": 0.2378, + "step": 9539 + }, + { + "epoch": 0.55, + "grad_norm": 0.39852031588177406, + "learning_rate": 8.930165714638705e-06, + "loss": 0.1926, + "step": 9540 + }, + { + "epoch": 0.55, + "grad_norm": 0.5183265685725699, + "learning_rate": 8.928315500714682e-06, + "loss": 0.3365, + "step": 9541 + }, + { + "epoch": 0.55, + "grad_norm": 0.3523020413766487, + "learning_rate": 8.92646532390305e-06, + "loss": 0.2559, + "step": 9542 + }, + { + "epoch": 0.55, + "grad_norm": 0.4905252780482375, + "learning_rate": 8.924615184267876e-06, + "loss": 0.3216, + "step": 9543 + }, + { + "epoch": 0.55, + "grad_norm": 0.5270482680361739, + "learning_rate": 8.922765081873235e-06, + "loss": 0.3081, + "step": 9544 + }, + { + "epoch": 0.55, + "grad_norm": 0.25970806818561254, + "learning_rate": 8.92091501678319e-06, + "loss": 0.2239, + "step": 9545 + }, + { + "epoch": 0.55, + "grad_norm": 0.3026984576541297, + "learning_rate": 8.919064989061813e-06, + "loss": 0.194, + "step": 9546 + }, + { + "epoch": 0.55, + "grad_norm": 0.4164454229618692, + "learning_rate": 8.917214998773169e-06, + "loss": 0.3015, + "step": 9547 + }, + { + "epoch": 0.55, + "grad_norm": 0.32582574801627273, + "learning_rate": 8.915365045981323e-06, + "loss": 0.285, + "step": 9548 + }, + { + "epoch": 0.55, + "grad_norm": 0.7217319400615111, + "learning_rate": 8.913515130750336e-06, + "loss": 0.3338, + "step": 9549 + }, + { + "epoch": 0.55, + "grad_norm": 0.7820623728911443, + "learning_rate": 8.911665253144277e-06, + "loss": 0.4671, + "step": 9550 + }, + { + "epoch": 0.55, + "grad_norm": 0.24558093961398544, + "learning_rate": 8.9098154132272e-06, + "loss": 0.2646, + "step": 9551 + }, + { + "epoch": 0.55, + "grad_norm": 0.19719586673338096, + "learning_rate": 8.907965611063173e-06, + "loss": 0.073, + "step": 9552 + }, + { + "epoch": 0.55, + "grad_norm": 0.5547086944121592, + "learning_rate": 8.906115846716247e-06, + "loss": 0.3461, + "step": 9553 + }, + { + "epoch": 0.55, + "grad_norm": 0.3694099603084277, + "learning_rate": 8.904266120250483e-06, + "loss": 0.3258, + "step": 9554 + }, + { + "epoch": 0.55, + "grad_norm": 0.4425574975466733, + "learning_rate": 8.902416431729939e-06, + "loss": 0.2769, + "step": 9555 + }, + { + "epoch": 0.55, + "grad_norm": 0.5256141648470348, + "learning_rate": 8.900566781218665e-06, + "loss": 0.3479, + "step": 9556 + }, + { + "epoch": 0.55, + "grad_norm": 0.3022772910244815, + "learning_rate": 8.898717168780713e-06, + "loss": 0.2375, + "step": 9557 + }, + { + "epoch": 0.55, + "grad_norm": 0.409281807309569, + "learning_rate": 8.896867594480141e-06, + "loss": 0.2499, + "step": 9558 + }, + { + "epoch": 0.55, + "grad_norm": 0.3148261067696356, + "learning_rate": 8.895018058380995e-06, + "loss": 0.2314, + "step": 9559 + }, + { + "epoch": 0.55, + "grad_norm": 0.3935927137427299, + "learning_rate": 8.893168560547327e-06, + "loss": 0.3369, + "step": 9560 + }, + { + "epoch": 0.55, + "grad_norm": 0.6398746892434829, + "learning_rate": 8.891319101043181e-06, + "loss": 0.4199, + "step": 9561 + }, + { + "epoch": 0.55, + "grad_norm": 0.39721004210862404, + "learning_rate": 8.889469679932612e-06, + "loss": 0.2113, + "step": 9562 + }, + { + "epoch": 0.55, + "grad_norm": 0.28574456478066207, + "learning_rate": 8.887620297279656e-06, + "loss": 0.2957, + "step": 9563 + }, + { + "epoch": 0.55, + "grad_norm": 0.26193897143332595, + "learning_rate": 8.885770953148364e-06, + "loss": 0.1744, + "step": 9564 + }, + { + "epoch": 0.55, + "grad_norm": 0.6740258025951248, + "learning_rate": 8.883921647602777e-06, + "loss": 0.2764, + "step": 9565 + }, + { + "epoch": 0.55, + "grad_norm": 0.3760220070940269, + "learning_rate": 8.882072380706931e-06, + "loss": 0.3045, + "step": 9566 + }, + { + "epoch": 0.55, + "grad_norm": 0.3810517214167689, + "learning_rate": 8.880223152524875e-06, + "loss": 0.3385, + "step": 9567 + }, + { + "epoch": 0.55, + "grad_norm": 0.9536008985955754, + "learning_rate": 8.87837396312064e-06, + "loss": 0.5612, + "step": 9568 + }, + { + "epoch": 0.55, + "grad_norm": 0.22293813348609617, + "learning_rate": 8.876524812558269e-06, + "loss": 0.1594, + "step": 9569 + }, + { + "epoch": 0.55, + "grad_norm": 0.3957805630650527, + "learning_rate": 8.874675700901791e-06, + "loss": 0.2501, + "step": 9570 + }, + { + "epoch": 0.55, + "grad_norm": 0.38770512159826404, + "learning_rate": 8.87282662821525e-06, + "loss": 0.3198, + "step": 9571 + }, + { + "epoch": 0.55, + "grad_norm": 0.3391420199028762, + "learning_rate": 8.87097759456267e-06, + "loss": 0.2365, + "step": 9572 + }, + { + "epoch": 0.55, + "grad_norm": 1.202499855422108, + "learning_rate": 8.869128600008092e-06, + "loss": 0.818, + "step": 9573 + }, + { + "epoch": 0.55, + "grad_norm": 0.45885122466342415, + "learning_rate": 8.867279644615537e-06, + "loss": 0.3281, + "step": 9574 + }, + { + "epoch": 0.55, + "grad_norm": 0.31119385570500957, + "learning_rate": 8.865430728449043e-06, + "loss": 0.2231, + "step": 9575 + }, + { + "epoch": 0.55, + "grad_norm": 0.2595652472913027, + "learning_rate": 8.863581851572633e-06, + "loss": 0.1539, + "step": 9576 + }, + { + "epoch": 0.55, + "grad_norm": 0.4554085077648678, + "learning_rate": 8.861733014050334e-06, + "loss": 0.3657, + "step": 9577 + }, + { + "epoch": 0.55, + "grad_norm": 0.3500096854930678, + "learning_rate": 8.859884215946174e-06, + "loss": 0.2237, + "step": 9578 + }, + { + "epoch": 0.55, + "grad_norm": 0.3774828819594551, + "learning_rate": 8.858035457324172e-06, + "loss": 0.3163, + "step": 9579 + }, + { + "epoch": 0.55, + "grad_norm": 0.7309407493660592, + "learning_rate": 8.856186738248355e-06, + "loss": 0.4135, + "step": 9580 + }, + { + "epoch": 0.55, + "grad_norm": 0.3706488154985834, + "learning_rate": 8.85433805878274e-06, + "loss": 0.288, + "step": 9581 + }, + { + "epoch": 0.55, + "grad_norm": 0.21088115991413472, + "learning_rate": 8.85248941899135e-06, + "loss": 0.1959, + "step": 9582 + }, + { + "epoch": 0.55, + "grad_norm": 0.850973894592171, + "learning_rate": 8.850640818938202e-06, + "loss": 0.4281, + "step": 9583 + }, + { + "epoch": 0.55, + "grad_norm": 0.31743512024875603, + "learning_rate": 8.848792258687312e-06, + "loss": 0.2678, + "step": 9584 + }, + { + "epoch": 0.55, + "grad_norm": 0.773444547415871, + "learning_rate": 8.846943738302697e-06, + "loss": 0.5385, + "step": 9585 + }, + { + "epoch": 0.55, + "grad_norm": 0.40078285238849626, + "learning_rate": 8.845095257848372e-06, + "loss": 0.3058, + "step": 9586 + }, + { + "epoch": 0.55, + "grad_norm": 0.364521054432038, + "learning_rate": 8.843246817388345e-06, + "loss": 0.2559, + "step": 9587 + }, + { + "epoch": 0.55, + "grad_norm": 1.3184618070839194, + "learning_rate": 8.841398416986635e-06, + "loss": 0.3191, + "step": 9588 + }, + { + "epoch": 0.55, + "grad_norm": 0.3073635659674038, + "learning_rate": 8.83955005670725e-06, + "loss": 0.2267, + "step": 9589 + }, + { + "epoch": 0.55, + "grad_norm": 0.2940086234938814, + "learning_rate": 8.837701736614194e-06, + "loss": 0.2482, + "step": 9590 + }, + { + "epoch": 0.55, + "grad_norm": 0.29719764315455904, + "learning_rate": 8.835853456771476e-06, + "loss": 0.2412, + "step": 9591 + }, + { + "epoch": 0.55, + "grad_norm": 0.9759795203201609, + "learning_rate": 8.834005217243103e-06, + "loss": 0.593, + "step": 9592 + }, + { + "epoch": 0.55, + "grad_norm": 0.3481287823907185, + "learning_rate": 8.832157018093078e-06, + "loss": 0.2459, + "step": 9593 + }, + { + "epoch": 0.55, + "grad_norm": 0.5531869767322063, + "learning_rate": 8.830308859385408e-06, + "loss": 0.3371, + "step": 9594 + }, + { + "epoch": 0.55, + "grad_norm": 0.34635142391335155, + "learning_rate": 8.828460741184089e-06, + "loss": 0.2135, + "step": 9595 + }, + { + "epoch": 0.55, + "grad_norm": 0.4122181775243568, + "learning_rate": 8.826612663553126e-06, + "loss": 0.2867, + "step": 9596 + }, + { + "epoch": 0.55, + "grad_norm": 0.4546733222643512, + "learning_rate": 8.824764626556514e-06, + "loss": 0.3519, + "step": 9597 + }, + { + "epoch": 0.55, + "grad_norm": 0.30491457595218185, + "learning_rate": 8.822916630258255e-06, + "loss": 0.2441, + "step": 9598 + }, + { + "epoch": 0.55, + "grad_norm": 0.35806364278988395, + "learning_rate": 8.82106867472234e-06, + "loss": 0.2882, + "step": 9599 + }, + { + "epoch": 0.55, + "grad_norm": 0.6824198286469831, + "learning_rate": 8.819220760012768e-06, + "loss": 0.4241, + "step": 9600 + }, + { + "epoch": 0.55, + "grad_norm": 0.2923506374089054, + "learning_rate": 8.81737288619353e-06, + "loss": 0.1254, + "step": 9601 + }, + { + "epoch": 0.55, + "grad_norm": 0.3244416849157942, + "learning_rate": 8.815525053328617e-06, + "loss": 0.2596, + "step": 9602 + }, + { + "epoch": 0.55, + "grad_norm": 0.28496715222960844, + "learning_rate": 8.81367726148202e-06, + "loss": 0.2617, + "step": 9603 + }, + { + "epoch": 0.55, + "grad_norm": 0.7812979157651535, + "learning_rate": 8.811829510717731e-06, + "loss": 0.3444, + "step": 9604 + }, + { + "epoch": 0.55, + "grad_norm": 0.31375336364159706, + "learning_rate": 8.809981801099735e-06, + "loss": 0.2695, + "step": 9605 + }, + { + "epoch": 0.55, + "grad_norm": 0.392233640141547, + "learning_rate": 8.808134132692015e-06, + "loss": 0.3176, + "step": 9606 + }, + { + "epoch": 0.55, + "grad_norm": 1.3762085157918527, + "learning_rate": 8.806286505558563e-06, + "loss": 0.6777, + "step": 9607 + }, + { + "epoch": 0.55, + "grad_norm": 0.3082493565251616, + "learning_rate": 8.804438919763352e-06, + "loss": 0.219, + "step": 9608 + }, + { + "epoch": 0.55, + "grad_norm": 0.4539330693613155, + "learning_rate": 8.802591375370375e-06, + "loss": 0.2814, + "step": 9609 + }, + { + "epoch": 0.55, + "grad_norm": 0.30749822562633533, + "learning_rate": 8.800743872443605e-06, + "loss": 0.2762, + "step": 9610 + }, + { + "epoch": 0.55, + "grad_norm": 0.3004335057158179, + "learning_rate": 8.798896411047024e-06, + "loss": 0.2093, + "step": 9611 + }, + { + "epoch": 0.55, + "grad_norm": 0.651005317404619, + "learning_rate": 8.797048991244606e-06, + "loss": 0.4288, + "step": 9612 + }, + { + "epoch": 0.55, + "grad_norm": 0.9232587293760873, + "learning_rate": 8.795201613100334e-06, + "loss": 0.627, + "step": 9613 + }, + { + "epoch": 0.55, + "grad_norm": 0.2907613834716408, + "learning_rate": 8.793354276678176e-06, + "loss": 0.1997, + "step": 9614 + }, + { + "epoch": 0.55, + "grad_norm": 0.3186326974408788, + "learning_rate": 8.791506982042107e-06, + "loss": 0.2948, + "step": 9615 + }, + { + "epoch": 0.55, + "grad_norm": 0.5159427141535858, + "learning_rate": 8.789659729256099e-06, + "loss": 0.2764, + "step": 9616 + }, + { + "epoch": 0.55, + "grad_norm": 0.3412013433538205, + "learning_rate": 8.787812518384125e-06, + "loss": 0.1942, + "step": 9617 + }, + { + "epoch": 0.55, + "grad_norm": 0.35941120379477665, + "learning_rate": 8.78596534949015e-06, + "loss": 0.283, + "step": 9618 + }, + { + "epoch": 0.55, + "grad_norm": 1.0007832706509505, + "learning_rate": 8.784118222638142e-06, + "loss": 0.6069, + "step": 9619 + }, + { + "epoch": 0.55, + "grad_norm": 0.4190917204278424, + "learning_rate": 8.78227113789207e-06, + "loss": 0.3474, + "step": 9620 + }, + { + "epoch": 0.55, + "grad_norm": 0.25232330543617304, + "learning_rate": 8.780424095315893e-06, + "loss": 0.2104, + "step": 9621 + }, + { + "epoch": 0.55, + "grad_norm": 0.3665822398797408, + "learning_rate": 8.778577094973579e-06, + "loss": 0.3238, + "step": 9622 + }, + { + "epoch": 0.55, + "grad_norm": 0.41883098987828316, + "learning_rate": 8.77673013692909e-06, + "loss": 0.3045, + "step": 9623 + }, + { + "epoch": 0.55, + "grad_norm": 0.3007795637019158, + "learning_rate": 8.77488322124638e-06, + "loss": 0.1753, + "step": 9624 + }, + { + "epoch": 0.55, + "grad_norm": 1.344525945362686, + "learning_rate": 8.773036347989413e-06, + "loss": 0.7523, + "step": 9625 + }, + { + "epoch": 0.55, + "grad_norm": 0.26911175627901446, + "learning_rate": 8.771189517222143e-06, + "loss": 0.265, + "step": 9626 + }, + { + "epoch": 0.55, + "grad_norm": 0.42709579508130063, + "learning_rate": 8.769342729008529e-06, + "loss": 0.2923, + "step": 9627 + }, + { + "epoch": 0.55, + "grad_norm": 0.6199825882275222, + "learning_rate": 8.767495983412521e-06, + "loss": 0.3981, + "step": 9628 + }, + { + "epoch": 0.55, + "grad_norm": 0.25178532363408446, + "learning_rate": 8.765649280498076e-06, + "loss": 0.197, + "step": 9629 + }, + { + "epoch": 0.55, + "grad_norm": 0.2845849191707805, + "learning_rate": 8.763802620329146e-06, + "loss": 0.2088, + "step": 9630 + }, + { + "epoch": 0.55, + "grad_norm": 1.0896261704932444, + "learning_rate": 8.761956002969672e-06, + "loss": 0.7473, + "step": 9631 + }, + { + "epoch": 0.55, + "grad_norm": 0.5818223951584665, + "learning_rate": 8.760109428483613e-06, + "loss": 0.3236, + "step": 9632 + }, + { + "epoch": 0.55, + "grad_norm": 0.41990213718374075, + "learning_rate": 8.758262896934909e-06, + "loss": 0.3098, + "step": 9633 + }, + { + "epoch": 0.55, + "grad_norm": 0.399969974428762, + "learning_rate": 8.756416408387507e-06, + "loss": 0.2806, + "step": 9634 + }, + { + "epoch": 0.55, + "grad_norm": 0.3299745775632892, + "learning_rate": 8.754569962905351e-06, + "loss": 0.2081, + "step": 9635 + }, + { + "epoch": 0.55, + "grad_norm": 0.3008769243224494, + "learning_rate": 8.752723560552386e-06, + "loss": 0.2482, + "step": 9636 + }, + { + "epoch": 0.55, + "grad_norm": 0.9013635666683019, + "learning_rate": 8.750877201392547e-06, + "loss": 0.3913, + "step": 9637 + }, + { + "epoch": 0.55, + "grad_norm": 0.28342819302573247, + "learning_rate": 8.749030885489782e-06, + "loss": 0.2537, + "step": 9638 + }, + { + "epoch": 0.55, + "grad_norm": 0.5810023181066697, + "learning_rate": 8.747184612908019e-06, + "loss": 0.3466, + "step": 9639 + }, + { + "epoch": 0.55, + "grad_norm": 0.6679312576871723, + "learning_rate": 8.745338383711202e-06, + "loss": 0.3194, + "step": 9640 + }, + { + "epoch": 0.55, + "grad_norm": 0.29402336429450904, + "learning_rate": 8.74349219796326e-06, + "loss": 0.2023, + "step": 9641 + }, + { + "epoch": 0.55, + "grad_norm": 0.26832438600589964, + "learning_rate": 8.741646055728133e-06, + "loss": 0.2494, + "step": 9642 + }, + { + "epoch": 0.55, + "grad_norm": 1.104583917262513, + "learning_rate": 8.739799957069747e-06, + "loss": 0.681, + "step": 9643 + }, + { + "epoch": 0.55, + "grad_norm": 0.35403449470833537, + "learning_rate": 8.737953902052031e-06, + "loss": 0.2109, + "step": 9644 + }, + { + "epoch": 0.55, + "grad_norm": 0.6163331604318542, + "learning_rate": 8.736107890738922e-06, + "loss": 0.3423, + "step": 9645 + }, + { + "epoch": 0.55, + "grad_norm": 0.44648031150704204, + "learning_rate": 8.73426192319434e-06, + "loss": 0.3083, + "step": 9646 + }, + { + "epoch": 0.55, + "grad_norm": 0.32973426498978503, + "learning_rate": 8.732415999482214e-06, + "loss": 0.2018, + "step": 9647 + }, + { + "epoch": 0.55, + "grad_norm": 0.29871322069317574, + "learning_rate": 8.730570119666465e-06, + "loss": 0.18, + "step": 9648 + }, + { + "epoch": 0.55, + "grad_norm": 0.5177502319297812, + "learning_rate": 8.728724283811024e-06, + "loss": 0.3988, + "step": 9649 + }, + { + "epoch": 0.55, + "grad_norm": 0.3241226683385959, + "learning_rate": 8.7268784919798e-06, + "loss": 0.2099, + "step": 9650 + }, + { + "epoch": 0.55, + "grad_norm": 0.45021331398071146, + "learning_rate": 8.725032744236723e-06, + "loss": 0.3137, + "step": 9651 + }, + { + "epoch": 0.55, + "grad_norm": 0.7969625967830009, + "learning_rate": 8.723187040645704e-06, + "loss": 0.5277, + "step": 9652 + }, + { + "epoch": 0.55, + "grad_norm": 0.27981093470443125, + "learning_rate": 8.721341381270668e-06, + "loss": 0.0984, + "step": 9653 + }, + { + "epoch": 0.55, + "grad_norm": 0.2597047101805497, + "learning_rate": 8.719495766175519e-06, + "loss": 0.2336, + "step": 9654 + }, + { + "epoch": 0.55, + "grad_norm": 1.1135473319190738, + "learning_rate": 8.717650195424182e-06, + "loss": 0.6095, + "step": 9655 + }, + { + "epoch": 0.55, + "grad_norm": 0.6191392989069763, + "learning_rate": 8.715804669080559e-06, + "loss": 0.3826, + "step": 9656 + }, + { + "epoch": 0.55, + "grad_norm": 0.3043730092485221, + "learning_rate": 8.713959187208572e-06, + "loss": 0.2419, + "step": 9657 + }, + { + "epoch": 0.55, + "grad_norm": 0.4603338423923025, + "learning_rate": 8.712113749872117e-06, + "loss": 0.3772, + "step": 9658 + }, + { + "epoch": 0.55, + "grad_norm": 0.5274180041754517, + "learning_rate": 8.710268357135109e-06, + "loss": 0.3372, + "step": 9659 + }, + { + "epoch": 0.56, + "grad_norm": 0.20931527382171677, + "learning_rate": 8.70842300906145e-06, + "loss": 0.1499, + "step": 9660 + }, + { + "epoch": 0.56, + "grad_norm": 0.45032579061938405, + "learning_rate": 8.70657770571505e-06, + "loss": 0.3645, + "step": 9661 + }, + { + "epoch": 0.56, + "grad_norm": 0.4065200216698055, + "learning_rate": 8.704732447159807e-06, + "loss": 0.2926, + "step": 9662 + }, + { + "epoch": 0.56, + "grad_norm": 0.41762152764098875, + "learning_rate": 8.702887233459625e-06, + "loss": 0.2756, + "step": 9663 + }, + { + "epoch": 0.56, + "grad_norm": 1.3246494443880283, + "learning_rate": 8.7010420646784e-06, + "loss": 0.8345, + "step": 9664 + }, + { + "epoch": 0.56, + "grad_norm": 0.34809237419193545, + "learning_rate": 8.699196940880032e-06, + "loss": 0.2729, + "step": 9665 + }, + { + "epoch": 0.56, + "grad_norm": 0.2492864391185059, + "learning_rate": 8.69735186212842e-06, + "loss": 0.2019, + "step": 9666 + }, + { + "epoch": 0.56, + "grad_norm": 0.5684745090978655, + "learning_rate": 8.695506828487457e-06, + "loss": 0.285, + "step": 9667 + }, + { + "epoch": 0.56, + "grad_norm": 0.4835198325712554, + "learning_rate": 8.693661840021035e-06, + "loss": 0.3404, + "step": 9668 + }, + { + "epoch": 0.56, + "grad_norm": 0.41002756921375183, + "learning_rate": 8.691816896793049e-06, + "loss": 0.3262, + "step": 9669 + }, + { + "epoch": 0.56, + "grad_norm": 0.3301324479001951, + "learning_rate": 8.689971998867386e-06, + "loss": 0.271, + "step": 9670 + }, + { + "epoch": 0.56, + "grad_norm": 0.8899810603812406, + "learning_rate": 8.688127146307938e-06, + "loss": 0.4589, + "step": 9671 + }, + { + "epoch": 0.56, + "grad_norm": 0.26078813677723967, + "learning_rate": 8.68628233917859e-06, + "loss": 0.2169, + "step": 9672 + }, + { + "epoch": 0.56, + "grad_norm": 0.2721884189437094, + "learning_rate": 8.684437577543227e-06, + "loss": 0.2406, + "step": 9673 + }, + { + "epoch": 0.56, + "grad_norm": 0.5468489073374028, + "learning_rate": 8.682592861465735e-06, + "loss": 0.3952, + "step": 9674 + }, + { + "epoch": 0.56, + "grad_norm": 0.3647467307258367, + "learning_rate": 8.680748191009995e-06, + "loss": 0.295, + "step": 9675 + }, + { + "epoch": 0.56, + "grad_norm": 0.7706378980657368, + "learning_rate": 8.67890356623989e-06, + "loss": 0.3905, + "step": 9676 + }, + { + "epoch": 0.56, + "grad_norm": 0.3748336812202746, + "learning_rate": 8.677058987219294e-06, + "loss": 0.3049, + "step": 9677 + }, + { + "epoch": 0.56, + "grad_norm": 0.2454214308007065, + "learning_rate": 8.675214454012092e-06, + "loss": 0.233, + "step": 9678 + }, + { + "epoch": 0.56, + "grad_norm": 0.45322202546424467, + "learning_rate": 8.673369966682154e-06, + "loss": 0.2341, + "step": 9679 + }, + { + "epoch": 0.56, + "grad_norm": 0.5365199502435255, + "learning_rate": 8.67152552529336e-06, + "loss": 0.3428, + "step": 9680 + }, + { + "epoch": 0.56, + "grad_norm": 0.4117158142211994, + "learning_rate": 8.669681129909578e-06, + "loss": 0.3221, + "step": 9681 + }, + { + "epoch": 0.56, + "grad_norm": 0.34998858461194654, + "learning_rate": 8.667836780594682e-06, + "loss": 0.336, + "step": 9682 + }, + { + "epoch": 0.56, + "grad_norm": 0.3850112115715607, + "learning_rate": 8.665992477412541e-06, + "loss": 0.1246, + "step": 9683 + }, + { + "epoch": 0.56, + "grad_norm": 0.3888304992876215, + "learning_rate": 8.664148220427023e-06, + "loss": 0.3117, + "step": 9684 + }, + { + "epoch": 0.56, + "grad_norm": 0.38035497224060644, + "learning_rate": 8.662304009701994e-06, + "loss": 0.3066, + "step": 9685 + }, + { + "epoch": 0.56, + "grad_norm": 0.2546173089481329, + "learning_rate": 8.66045984530132e-06, + "loss": 0.1335, + "step": 9686 + }, + { + "epoch": 0.56, + "grad_norm": 0.35621827719168836, + "learning_rate": 8.658615727288863e-06, + "loss": 0.2854, + "step": 9687 + }, + { + "epoch": 0.56, + "grad_norm": 1.0338373334263178, + "learning_rate": 8.656771655728487e-06, + "loss": 0.6266, + "step": 9688 + }, + { + "epoch": 0.56, + "grad_norm": 0.3064225676834638, + "learning_rate": 8.65492763068405e-06, + "loss": 0.2283, + "step": 9689 + }, + { + "epoch": 0.56, + "grad_norm": 0.33605010056283907, + "learning_rate": 8.653083652219417e-06, + "loss": 0.302, + "step": 9690 + }, + { + "epoch": 0.56, + "grad_norm": 0.6969521748952202, + "learning_rate": 8.651239720398433e-06, + "loss": 0.4025, + "step": 9691 + }, + { + "epoch": 0.56, + "grad_norm": 0.22137321403803517, + "learning_rate": 8.64939583528496e-06, + "loss": 0.114, + "step": 9692 + }, + { + "epoch": 0.56, + "grad_norm": 0.31067351486742123, + "learning_rate": 8.647551996942852e-06, + "loss": 0.2666, + "step": 9693 + }, + { + "epoch": 0.56, + "grad_norm": 0.3017237884993278, + "learning_rate": 8.645708205435959e-06, + "loss": 0.237, + "step": 9694 + }, + { + "epoch": 0.56, + "grad_norm": 0.7947918989546467, + "learning_rate": 8.643864460828135e-06, + "loss": 0.4867, + "step": 9695 + }, + { + "epoch": 0.56, + "grad_norm": 0.30815653925994546, + "learning_rate": 8.642020763183224e-06, + "loss": 0.2316, + "step": 9696 + }, + { + "epoch": 0.56, + "grad_norm": 0.4541266781213096, + "learning_rate": 8.640177112565078e-06, + "loss": 0.3681, + "step": 9697 + }, + { + "epoch": 0.56, + "grad_norm": 0.2920545878428287, + "learning_rate": 8.638333509037537e-06, + "loss": 0.2506, + "step": 9698 + }, + { + "epoch": 0.56, + "grad_norm": 0.3173736734585809, + "learning_rate": 8.63648995266445e-06, + "loss": 0.211, + "step": 9699 + }, + { + "epoch": 0.56, + "grad_norm": 0.38440130858759364, + "learning_rate": 8.634646443509656e-06, + "loss": 0.2679, + "step": 9700 + }, + { + "epoch": 0.56, + "grad_norm": 0.32061793907154607, + "learning_rate": 8.632802981636998e-06, + "loss": 0.3013, + "step": 9701 + }, + { + "epoch": 0.56, + "grad_norm": 0.31661079459501085, + "learning_rate": 8.630959567110314e-06, + "loss": 0.2308, + "step": 9702 + }, + { + "epoch": 0.56, + "grad_norm": 0.839441115520653, + "learning_rate": 8.629116199993441e-06, + "loss": 0.4207, + "step": 9703 + }, + { + "epoch": 0.56, + "grad_norm": 1.1032675562865573, + "learning_rate": 8.627272880350214e-06, + "loss": 0.7347, + "step": 9704 + }, + { + "epoch": 0.56, + "grad_norm": 0.26791299634997684, + "learning_rate": 8.62542960824447e-06, + "loss": 0.212, + "step": 9705 + }, + { + "epoch": 0.56, + "grad_norm": 0.2568316260810858, + "learning_rate": 8.623586383740037e-06, + "loss": 0.2, + "step": 9706 + }, + { + "epoch": 0.56, + "grad_norm": 1.0746648875443938, + "learning_rate": 8.621743206900752e-06, + "loss": 0.407, + "step": 9707 + }, + { + "epoch": 0.56, + "grad_norm": 0.3060183253023324, + "learning_rate": 8.619900077790439e-06, + "loss": 0.2603, + "step": 9708 + }, + { + "epoch": 0.56, + "grad_norm": 0.3354086747479937, + "learning_rate": 8.618056996472925e-06, + "loss": 0.2808, + "step": 9709 + }, + { + "epoch": 0.56, + "grad_norm": 1.25565098457294, + "learning_rate": 8.616213963012042e-06, + "loss": 0.7637, + "step": 9710 + }, + { + "epoch": 0.56, + "grad_norm": 0.34005069327044857, + "learning_rate": 8.614370977471604e-06, + "loss": 0.27, + "step": 9711 + }, + { + "epoch": 0.56, + "grad_norm": 0.18000557563902286, + "learning_rate": 8.612528039915444e-06, + "loss": 0.0875, + "step": 9712 + }, + { + "epoch": 0.56, + "grad_norm": 0.5228609692059021, + "learning_rate": 8.610685150407376e-06, + "loss": 0.3247, + "step": 9713 + }, + { + "epoch": 0.56, + "grad_norm": 0.3874828243771713, + "learning_rate": 8.608842309011224e-06, + "loss": 0.2822, + "step": 9714 + }, + { + "epoch": 0.56, + "grad_norm": 0.9765194619225318, + "learning_rate": 8.606999515790801e-06, + "loss": 0.2146, + "step": 9715 + }, + { + "epoch": 0.56, + "grad_norm": 0.4766873680274542, + "learning_rate": 8.605156770809926e-06, + "loss": 0.3851, + "step": 9716 + }, + { + "epoch": 0.56, + "grad_norm": 0.33428061403293535, + "learning_rate": 8.603314074132411e-06, + "loss": 0.2666, + "step": 9717 + }, + { + "epoch": 0.56, + "grad_norm": 0.29411656533246155, + "learning_rate": 8.60147142582207e-06, + "loss": 0.2137, + "step": 9718 + }, + { + "epoch": 0.56, + "grad_norm": 0.45824528014285415, + "learning_rate": 8.599628825942713e-06, + "loss": 0.2705, + "step": 9719 + }, + { + "epoch": 0.56, + "grad_norm": 0.37961707152946494, + "learning_rate": 8.597786274558152e-06, + "loss": 0.2672, + "step": 9720 + }, + { + "epoch": 0.56, + "grad_norm": 0.375183535077221, + "learning_rate": 8.595943771732187e-06, + "loss": 0.3041, + "step": 9721 + }, + { + "epoch": 0.56, + "grad_norm": 1.2665256641960105, + "learning_rate": 8.594101317528634e-06, + "loss": 0.4257, + "step": 9722 + }, + { + "epoch": 0.56, + "grad_norm": 0.3198305090286384, + "learning_rate": 8.59225891201129e-06, + "loss": 0.227, + "step": 9723 + }, + { + "epoch": 0.56, + "grad_norm": 0.5603332070743713, + "learning_rate": 8.590416555243962e-06, + "loss": 0.4177, + "step": 9724 + }, + { + "epoch": 0.56, + "grad_norm": 0.34251976719886146, + "learning_rate": 8.588574247290444e-06, + "loss": 0.2623, + "step": 9725 + }, + { + "epoch": 0.56, + "grad_norm": 0.26791062951199285, + "learning_rate": 8.586731988214542e-06, + "loss": 0.21, + "step": 9726 + }, + { + "epoch": 0.56, + "grad_norm": 0.43863608264963566, + "learning_rate": 8.584889778080049e-06, + "loss": 0.2461, + "step": 9727 + }, + { + "epoch": 0.56, + "grad_norm": 0.4854419789559304, + "learning_rate": 8.583047616950761e-06, + "loss": 0.3134, + "step": 9728 + }, + { + "epoch": 0.56, + "grad_norm": 0.32119730040485317, + "learning_rate": 8.581205504890474e-06, + "loss": 0.2772, + "step": 9729 + }, + { + "epoch": 0.56, + "grad_norm": 0.8707909117456779, + "learning_rate": 8.57936344196298e-06, + "loss": 0.4635, + "step": 9730 + }, + { + "epoch": 0.56, + "grad_norm": 0.49345414113964153, + "learning_rate": 8.577521428232067e-06, + "loss": 0.2946, + "step": 9731 + }, + { + "epoch": 0.56, + "grad_norm": 0.23250652886101789, + "learning_rate": 8.575679463761527e-06, + "loss": 0.1565, + "step": 9732 + }, + { + "epoch": 0.56, + "grad_norm": 0.3752357585942652, + "learning_rate": 8.573837548615144e-06, + "loss": 0.3268, + "step": 9733 + }, + { + "epoch": 0.56, + "grad_norm": 1.1076609438672396, + "learning_rate": 8.57199568285671e-06, + "loss": 0.6822, + "step": 9734 + }, + { + "epoch": 0.56, + "grad_norm": 0.31491087231924286, + "learning_rate": 8.570153866550002e-06, + "loss": 0.2089, + "step": 9735 + }, + { + "epoch": 0.56, + "grad_norm": 1.3889797767245986, + "learning_rate": 8.568312099758802e-06, + "loss": 0.4492, + "step": 9736 + }, + { + "epoch": 0.56, + "grad_norm": 0.36769358781100087, + "learning_rate": 8.566470382546896e-06, + "loss": 0.3141, + "step": 9737 + }, + { + "epoch": 0.56, + "grad_norm": 0.21757133357345376, + "learning_rate": 8.564628714978055e-06, + "loss": 0.1293, + "step": 9738 + }, + { + "epoch": 0.56, + "grad_norm": 0.34371264630242915, + "learning_rate": 8.562787097116063e-06, + "loss": 0.2301, + "step": 9739 + }, + { + "epoch": 0.56, + "grad_norm": 0.36254630132426113, + "learning_rate": 8.56094552902469e-06, + "loss": 0.354, + "step": 9740 + }, + { + "epoch": 0.56, + "grad_norm": 0.321207211418575, + "learning_rate": 8.559104010767713e-06, + "loss": 0.1934, + "step": 9741 + }, + { + "epoch": 0.56, + "grad_norm": 1.018284315811994, + "learning_rate": 8.5572625424089e-06, + "loss": 0.4464, + "step": 9742 + }, + { + "epoch": 0.56, + "grad_norm": 1.050265768211607, + "learning_rate": 8.555421124012026e-06, + "loss": 0.5515, + "step": 9743 + }, + { + "epoch": 0.56, + "grad_norm": 0.23753029210938723, + "learning_rate": 8.553579755640853e-06, + "loss": 0.1735, + "step": 9744 + }, + { + "epoch": 0.56, + "grad_norm": 0.3188849467880673, + "learning_rate": 8.551738437359154e-06, + "loss": 0.2574, + "step": 9745 + }, + { + "epoch": 0.56, + "grad_norm": 0.7259221869316915, + "learning_rate": 8.549897169230689e-06, + "loss": 0.5058, + "step": 9746 + }, + { + "epoch": 0.56, + "grad_norm": 0.4847398425340026, + "learning_rate": 8.548055951319223e-06, + "loss": 0.3058, + "step": 9747 + }, + { + "epoch": 0.56, + "grad_norm": 0.4390574813917582, + "learning_rate": 8.546214783688518e-06, + "loss": 0.2942, + "step": 9748 + }, + { + "epoch": 0.56, + "grad_norm": 0.3452402290361021, + "learning_rate": 8.544373666402331e-06, + "loss": 0.3128, + "step": 9749 + }, + { + "epoch": 0.56, + "grad_norm": 0.3458191816422412, + "learning_rate": 8.542532599524422e-06, + "loss": 0.2582, + "step": 9750 + }, + { + "epoch": 0.56, + "grad_norm": 0.18158561259795064, + "learning_rate": 8.540691583118545e-06, + "loss": 0.0899, + "step": 9751 + }, + { + "epoch": 0.56, + "grad_norm": 0.4406492459517275, + "learning_rate": 8.53885061724846e-06, + "loss": 0.3035, + "step": 9752 + }, + { + "epoch": 0.56, + "grad_norm": 0.3702396463379801, + "learning_rate": 8.537009701977909e-06, + "loss": 0.276, + "step": 9753 + }, + { + "epoch": 0.56, + "grad_norm": 0.8471022600906709, + "learning_rate": 8.535168837370656e-06, + "loss": 0.357, + "step": 9754 + }, + { + "epoch": 0.56, + "grad_norm": 0.8915584935820783, + "learning_rate": 8.533328023490438e-06, + "loss": 0.5318, + "step": 9755 + }, + { + "epoch": 0.56, + "grad_norm": 0.3460314169075071, + "learning_rate": 8.531487260401009e-06, + "loss": 0.2315, + "step": 9756 + }, + { + "epoch": 0.56, + "grad_norm": 0.2540429650530294, + "learning_rate": 8.529646548166113e-06, + "loss": 0.2456, + "step": 9757 + }, + { + "epoch": 0.56, + "grad_norm": 0.8162276591758156, + "learning_rate": 8.527805886849496e-06, + "loss": 0.3404, + "step": 9758 + }, + { + "epoch": 0.56, + "grad_norm": 0.6224711881741437, + "learning_rate": 8.525965276514897e-06, + "loss": 0.4223, + "step": 9759 + }, + { + "epoch": 0.56, + "grad_norm": 0.41310833028155763, + "learning_rate": 8.524124717226057e-06, + "loss": 0.3289, + "step": 9760 + }, + { + "epoch": 0.56, + "grad_norm": 0.3572582038454073, + "learning_rate": 8.522284209046713e-06, + "loss": 0.2494, + "step": 9761 + }, + { + "epoch": 0.56, + "grad_norm": 0.3742903886740477, + "learning_rate": 8.520443752040604e-06, + "loss": 0.2831, + "step": 9762 + }, + { + "epoch": 0.56, + "grad_norm": 0.2706597483671145, + "learning_rate": 8.518603346271463e-06, + "loss": 0.2184, + "step": 9763 + }, + { + "epoch": 0.56, + "grad_norm": 0.38218243499476895, + "learning_rate": 8.516762991803027e-06, + "loss": 0.2738, + "step": 9764 + }, + { + "epoch": 0.56, + "grad_norm": 0.4708346428419067, + "learning_rate": 8.51492268869902e-06, + "loss": 0.2916, + "step": 9765 + }, + { + "epoch": 0.56, + "grad_norm": 0.5177037495364069, + "learning_rate": 8.513082437023182e-06, + "loss": 0.4067, + "step": 9766 + }, + { + "epoch": 0.56, + "grad_norm": 0.8805166112100425, + "learning_rate": 8.511242236839232e-06, + "loss": 0.2842, + "step": 9767 + }, + { + "epoch": 0.56, + "grad_norm": 0.32887364777368694, + "learning_rate": 8.509402088210901e-06, + "loss": 0.2746, + "step": 9768 + }, + { + "epoch": 0.56, + "grad_norm": 0.26738679639436275, + "learning_rate": 8.507561991201908e-06, + "loss": 0.2453, + "step": 9769 + }, + { + "epoch": 0.56, + "grad_norm": 0.41725219260940183, + "learning_rate": 8.505721945875985e-06, + "loss": 0.3137, + "step": 9770 + }, + { + "epoch": 0.56, + "grad_norm": 0.3031029677592879, + "learning_rate": 8.503881952296842e-06, + "loss": 0.2186, + "step": 9771 + }, + { + "epoch": 0.56, + "grad_norm": 1.254047894329755, + "learning_rate": 8.502042010528205e-06, + "loss": 0.7768, + "step": 9772 + }, + { + "epoch": 0.56, + "grad_norm": 0.3559282773742175, + "learning_rate": 8.50020212063379e-06, + "loss": 0.3021, + "step": 9773 + }, + { + "epoch": 0.56, + "grad_norm": 0.3623989422829272, + "learning_rate": 8.498362282677308e-06, + "loss": 0.1237, + "step": 9774 + }, + { + "epoch": 0.56, + "grad_norm": 0.33483177445849416, + "learning_rate": 8.496522496722476e-06, + "loss": 0.2634, + "step": 9775 + }, + { + "epoch": 0.56, + "grad_norm": 0.32082119699761735, + "learning_rate": 8.494682762833004e-06, + "loss": 0.2741, + "step": 9776 + }, + { + "epoch": 0.56, + "grad_norm": 0.3887285850178219, + "learning_rate": 8.492843081072609e-06, + "loss": 0.1544, + "step": 9777 + }, + { + "epoch": 0.56, + "grad_norm": 0.4023507557929457, + "learning_rate": 8.491003451504987e-06, + "loss": 0.3336, + "step": 9778 + }, + { + "epoch": 0.56, + "grad_norm": 0.8923515179116659, + "learning_rate": 8.489163874193854e-06, + "loss": 0.5183, + "step": 9779 + }, + { + "epoch": 0.56, + "grad_norm": 0.32433786243686613, + "learning_rate": 8.487324349202909e-06, + "loss": 0.2238, + "step": 9780 + }, + { + "epoch": 0.56, + "grad_norm": 0.3837847982567723, + "learning_rate": 8.485484876595859e-06, + "loss": 0.3084, + "step": 9781 + }, + { + "epoch": 0.56, + "grad_norm": 0.45428916728106616, + "learning_rate": 8.4836454564364e-06, + "loss": 0.2587, + "step": 9782 + }, + { + "epoch": 0.56, + "grad_norm": 0.5838701470386773, + "learning_rate": 8.481806088788235e-06, + "loss": 0.3887, + "step": 9783 + }, + { + "epoch": 0.56, + "grad_norm": 0.23669142326985326, + "learning_rate": 8.47996677371506e-06, + "loss": 0.2022, + "step": 9784 + }, + { + "epoch": 0.56, + "grad_norm": 0.4869737162516183, + "learning_rate": 8.478127511280571e-06, + "loss": 0.399, + "step": 9785 + }, + { + "epoch": 0.56, + "grad_norm": 0.5174517432230594, + "learning_rate": 8.476288301548458e-06, + "loss": 0.3913, + "step": 9786 + }, + { + "epoch": 0.56, + "grad_norm": 0.36426826902545983, + "learning_rate": 8.474449144582419e-06, + "loss": 0.2611, + "step": 9787 + }, + { + "epoch": 0.56, + "grad_norm": 0.339033501439424, + "learning_rate": 8.472610040446142e-06, + "loss": 0.3378, + "step": 9788 + }, + { + "epoch": 0.56, + "grad_norm": 0.6482947546336639, + "learning_rate": 8.470770989203309e-06, + "loss": 0.4807, + "step": 9789 + }, + { + "epoch": 0.56, + "grad_norm": 0.2700529166944222, + "learning_rate": 8.468931990917613e-06, + "loss": 0.1869, + "step": 9790 + }, + { + "epoch": 0.56, + "grad_norm": 0.38726787939089174, + "learning_rate": 8.467093045652736e-06, + "loss": 0.2809, + "step": 9791 + }, + { + "epoch": 0.56, + "grad_norm": 0.3210957803126272, + "learning_rate": 8.465254153472362e-06, + "loss": 0.2913, + "step": 9792 + }, + { + "epoch": 0.56, + "grad_norm": 0.3114223586414909, + "learning_rate": 8.463415314440172e-06, + "loss": 0.2626, + "step": 9793 + }, + { + "epoch": 0.56, + "grad_norm": 0.9172670409997502, + "learning_rate": 8.46157652861984e-06, + "loss": 0.6035, + "step": 9794 + }, + { + "epoch": 0.56, + "grad_norm": 0.7608940459805851, + "learning_rate": 8.45973779607505e-06, + "loss": 0.4936, + "step": 9795 + }, + { + "epoch": 0.56, + "grad_norm": 0.2258225225155695, + "learning_rate": 8.457899116869469e-06, + "loss": 0.2257, + "step": 9796 + }, + { + "epoch": 0.56, + "grad_norm": 0.2627940231472568, + "learning_rate": 8.45606049106678e-06, + "loss": 0.1897, + "step": 9797 + }, + { + "epoch": 0.56, + "grad_norm": 0.5623646815898292, + "learning_rate": 8.454221918730646e-06, + "loss": 0.3763, + "step": 9798 + }, + { + "epoch": 0.56, + "grad_norm": 0.3464750026436955, + "learning_rate": 8.452383399924743e-06, + "loss": 0.2948, + "step": 9799 + }, + { + "epoch": 0.56, + "grad_norm": 0.3381683606193286, + "learning_rate": 8.450544934712736e-06, + "loss": 0.2535, + "step": 9800 + }, + { + "epoch": 0.56, + "grad_norm": 0.6811041565579721, + "learning_rate": 8.44870652315829e-06, + "loss": 0.4924, + "step": 9801 + }, + { + "epoch": 0.56, + "grad_norm": 0.2585541466426452, + "learning_rate": 8.446868165325073e-06, + "loss": 0.2242, + "step": 9802 + }, + { + "epoch": 0.56, + "grad_norm": 0.4408860747267185, + "learning_rate": 8.445029861276742e-06, + "loss": 0.1794, + "step": 9803 + }, + { + "epoch": 0.56, + "grad_norm": 0.30415946757221135, + "learning_rate": 8.443191611076962e-06, + "loss": 0.2894, + "step": 9804 + }, + { + "epoch": 0.56, + "grad_norm": 0.3788792995472617, + "learning_rate": 8.441353414789386e-06, + "loss": 0.2975, + "step": 9805 + }, + { + "epoch": 0.56, + "grad_norm": 1.3016544580089682, + "learning_rate": 8.439515272477679e-06, + "loss": 0.6672, + "step": 9806 + }, + { + "epoch": 0.56, + "grad_norm": 0.4190179098500511, + "learning_rate": 8.437677184205488e-06, + "loss": 0.2239, + "step": 9807 + }, + { + "epoch": 0.56, + "grad_norm": 0.3089991674098943, + "learning_rate": 8.43583915003647e-06, + "loss": 0.2844, + "step": 9808 + }, + { + "epoch": 0.56, + "grad_norm": 0.2639149079109884, + "learning_rate": 8.434001170034273e-06, + "loss": 0.2243, + "step": 9809 + }, + { + "epoch": 0.56, + "grad_norm": 0.556166511583863, + "learning_rate": 8.432163244262551e-06, + "loss": 0.3236, + "step": 9810 + }, + { + "epoch": 0.56, + "grad_norm": 0.32418630784592684, + "learning_rate": 8.430325372784946e-06, + "loss": 0.2629, + "step": 9811 + }, + { + "epoch": 0.56, + "grad_norm": 0.3574565696512591, + "learning_rate": 8.428487555665108e-06, + "loss": 0.3277, + "step": 9812 + }, + { + "epoch": 0.56, + "grad_norm": 0.43378655974248065, + "learning_rate": 8.426649792966679e-06, + "loss": 0.2564, + "step": 9813 + }, + { + "epoch": 0.56, + "grad_norm": 0.3671253402319884, + "learning_rate": 8.424812084753297e-06, + "loss": 0.2683, + "step": 9814 + }, + { + "epoch": 0.56, + "grad_norm": 0.6749395956942922, + "learning_rate": 8.422974431088607e-06, + "loss": 0.3903, + "step": 9815 + }, + { + "epoch": 0.56, + "grad_norm": 0.2000129236440116, + "learning_rate": 8.421136832036242e-06, + "loss": 0.1704, + "step": 9816 + }, + { + "epoch": 0.56, + "grad_norm": 0.3516750238446244, + "learning_rate": 8.419299287659844e-06, + "loss": 0.2996, + "step": 9817 + }, + { + "epoch": 0.56, + "grad_norm": 1.055794532902847, + "learning_rate": 8.417461798023042e-06, + "loss": 0.6188, + "step": 9818 + }, + { + "epoch": 0.56, + "grad_norm": 0.4547799461366989, + "learning_rate": 8.41562436318947e-06, + "loss": 0.3617, + "step": 9819 + }, + { + "epoch": 0.56, + "grad_norm": 0.28796199443351, + "learning_rate": 8.413786983222758e-06, + "loss": 0.2079, + "step": 9820 + }, + { + "epoch": 0.56, + "grad_norm": 0.828814734820839, + "learning_rate": 8.411949658186536e-06, + "loss": 0.3814, + "step": 9821 + }, + { + "epoch": 0.56, + "grad_norm": 0.29849571473835407, + "learning_rate": 8.410112388144426e-06, + "loss": 0.2056, + "step": 9822 + }, + { + "epoch": 0.56, + "grad_norm": 0.3716093461533886, + "learning_rate": 8.408275173160059e-06, + "loss": 0.207, + "step": 9823 + }, + { + "epoch": 0.56, + "grad_norm": 0.3931711629255428, + "learning_rate": 8.406438013297052e-06, + "loss": 0.3198, + "step": 9824 + }, + { + "epoch": 0.56, + "grad_norm": 0.8020438520926569, + "learning_rate": 8.404600908619033e-06, + "loss": 0.5594, + "step": 9825 + }, + { + "epoch": 0.56, + "grad_norm": 0.39734590436170697, + "learning_rate": 8.40276385918961e-06, + "loss": 0.2346, + "step": 9826 + }, + { + "epoch": 0.56, + "grad_norm": 0.8506087767154734, + "learning_rate": 8.40092686507241e-06, + "loss": 0.4187, + "step": 9827 + }, + { + "epoch": 0.56, + "grad_norm": 0.255074633086988, + "learning_rate": 8.39908992633104e-06, + "loss": 0.2158, + "step": 9828 + }, + { + "epoch": 0.56, + "grad_norm": 0.31639947374690586, + "learning_rate": 8.39725304302912e-06, + "loss": 0.1998, + "step": 9829 + }, + { + "epoch": 0.56, + "grad_norm": 1.2054591386012041, + "learning_rate": 8.395416215230255e-06, + "loss": 0.7528, + "step": 9830 + }, + { + "epoch": 0.56, + "grad_norm": 0.4489425221494689, + "learning_rate": 8.39357944299806e-06, + "loss": 0.3473, + "step": 9831 + }, + { + "epoch": 0.56, + "grad_norm": 0.3019501530801851, + "learning_rate": 8.391742726396138e-06, + "loss": 0.2471, + "step": 9832 + }, + { + "epoch": 0.56, + "grad_norm": 0.6973811792398468, + "learning_rate": 8.389906065488099e-06, + "loss": 0.3255, + "step": 9833 + }, + { + "epoch": 0.57, + "grad_norm": 0.28463618303054367, + "learning_rate": 8.38806946033754e-06, + "loss": 0.1894, + "step": 9834 + }, + { + "epoch": 0.57, + "grad_norm": 0.35151178079415824, + "learning_rate": 8.386232911008069e-06, + "loss": 0.2611, + "step": 9835 + }, + { + "epoch": 0.57, + "grad_norm": 0.40041939562222845, + "learning_rate": 8.38439641756328e-06, + "loss": 0.2496, + "step": 9836 + }, + { + "epoch": 0.57, + "grad_norm": 0.876402219608769, + "learning_rate": 8.382559980066778e-06, + "loss": 0.4565, + "step": 9837 + }, + { + "epoch": 0.57, + "grad_norm": 0.31643695032595764, + "learning_rate": 8.380723598582152e-06, + "loss": 0.2451, + "step": 9838 + }, + { + "epoch": 0.57, + "grad_norm": 0.9533380440004469, + "learning_rate": 8.378887273172997e-06, + "loss": 0.3012, + "step": 9839 + }, + { + "epoch": 0.57, + "grad_norm": 0.20603783465622966, + "learning_rate": 8.37705100390291e-06, + "loss": 0.2236, + "step": 9840 + }, + { + "epoch": 0.57, + "grad_norm": 0.37732886480516004, + "learning_rate": 8.375214790835471e-06, + "loss": 0.2641, + "step": 9841 + }, + { + "epoch": 0.57, + "grad_norm": 0.48964154358706147, + "learning_rate": 8.37337863403428e-06, + "loss": 0.2918, + "step": 9842 + }, + { + "epoch": 0.57, + "grad_norm": 0.4250539613089976, + "learning_rate": 8.371542533562912e-06, + "loss": 0.3089, + "step": 9843 + }, + { + "epoch": 0.57, + "grad_norm": 0.35552109627010425, + "learning_rate": 8.369706489484958e-06, + "loss": 0.2745, + "step": 9844 + }, + { + "epoch": 0.57, + "grad_norm": 0.9900011631599458, + "learning_rate": 8.367870501863999e-06, + "loss": 0.4337, + "step": 9845 + }, + { + "epoch": 0.57, + "grad_norm": 0.18633542192754032, + "learning_rate": 8.366034570763614e-06, + "loss": 0.0689, + "step": 9846 + }, + { + "epoch": 0.57, + "grad_norm": 0.3414535184442479, + "learning_rate": 8.36419869624738e-06, + "loss": 0.2813, + "step": 9847 + }, + { + "epoch": 0.57, + "grad_norm": 0.37358328219071146, + "learning_rate": 8.362362878378876e-06, + "loss": 0.329, + "step": 9848 + }, + { + "epoch": 0.57, + "grad_norm": 0.8248357384042281, + "learning_rate": 8.360527117221675e-06, + "loss": 0.3291, + "step": 9849 + }, + { + "epoch": 0.57, + "grad_norm": 0.36147984520054255, + "learning_rate": 8.358691412839351e-06, + "loss": 0.3034, + "step": 9850 + }, + { + "epoch": 0.57, + "grad_norm": 0.5104748583363516, + "learning_rate": 8.35685576529547e-06, + "loss": 0.3727, + "step": 9851 + }, + { + "epoch": 0.57, + "grad_norm": 0.40198951064850447, + "learning_rate": 8.355020174653605e-06, + "loss": 0.2165, + "step": 9852 + }, + { + "epoch": 0.57, + "grad_norm": 0.24647184833766747, + "learning_rate": 8.35318464097732e-06, + "loss": 0.2172, + "step": 9853 + }, + { + "epoch": 0.57, + "grad_norm": 0.3810295500607417, + "learning_rate": 8.35134916433018e-06, + "loss": 0.2782, + "step": 9854 + }, + { + "epoch": 0.57, + "grad_norm": 0.3857340949697676, + "learning_rate": 8.349513744775748e-06, + "loss": 0.289, + "step": 9855 + }, + { + "epoch": 0.57, + "grad_norm": 0.34685648534680974, + "learning_rate": 8.347678382377584e-06, + "loss": 0.2889, + "step": 9856 + }, + { + "epoch": 0.57, + "grad_norm": 1.0213706600766048, + "learning_rate": 8.345843077199247e-06, + "loss": 0.604, + "step": 9857 + }, + { + "epoch": 0.57, + "grad_norm": 0.43460712379136257, + "learning_rate": 8.344007829304291e-06, + "loss": 0.2663, + "step": 9858 + }, + { + "epoch": 0.57, + "grad_norm": 0.2396752936639507, + "learning_rate": 8.342172638756276e-06, + "loss": 0.1397, + "step": 9859 + }, + { + "epoch": 0.57, + "grad_norm": 0.3169017785415661, + "learning_rate": 8.34033750561875e-06, + "loss": 0.29, + "step": 9860 + }, + { + "epoch": 0.57, + "grad_norm": 0.9931863567969424, + "learning_rate": 8.338502429955264e-06, + "loss": 0.4568, + "step": 9861 + }, + { + "epoch": 0.57, + "grad_norm": 0.3480195681104575, + "learning_rate": 8.33666741182937e-06, + "loss": 0.1948, + "step": 9862 + }, + { + "epoch": 0.57, + "grad_norm": 0.37565505477248806, + "learning_rate": 8.334832451304607e-06, + "loss": 0.3106, + "step": 9863 + }, + { + "epoch": 0.57, + "grad_norm": 0.45130905555980977, + "learning_rate": 8.332997548444528e-06, + "loss": 0.3433, + "step": 9864 + }, + { + "epoch": 0.57, + "grad_norm": 0.3033398640505855, + "learning_rate": 8.331162703312671e-06, + "loss": 0.1934, + "step": 9865 + }, + { + "epoch": 0.57, + "grad_norm": 0.5363775633765079, + "learning_rate": 8.329327915972578e-06, + "loss": 0.2919, + "step": 9866 + }, + { + "epoch": 0.57, + "grad_norm": 0.747351620662827, + "learning_rate": 8.32749318648779e-06, + "loss": 0.3933, + "step": 9867 + }, + { + "epoch": 0.57, + "grad_norm": 0.26553280669644175, + "learning_rate": 8.325658514921838e-06, + "loss": 0.211, + "step": 9868 + }, + { + "epoch": 0.57, + "grad_norm": 0.4145040507342814, + "learning_rate": 8.32382390133826e-06, + "loss": 0.227, + "step": 9869 + }, + { + "epoch": 0.57, + "grad_norm": 1.1950814437419346, + "learning_rate": 8.321989345800587e-06, + "loss": 0.6061, + "step": 9870 + }, + { + "epoch": 0.57, + "grad_norm": 0.3002830485967934, + "learning_rate": 8.320154848372353e-06, + "loss": 0.2718, + "step": 9871 + }, + { + "epoch": 0.57, + "grad_norm": 0.3226666488160495, + "learning_rate": 8.318320409117082e-06, + "loss": 0.2688, + "step": 9872 + }, + { + "epoch": 0.57, + "grad_norm": 0.37089315357089786, + "learning_rate": 8.316486028098306e-06, + "loss": 0.2992, + "step": 9873 + }, + { + "epoch": 0.57, + "grad_norm": 0.29002209727779943, + "learning_rate": 8.314651705379544e-06, + "loss": 0.2231, + "step": 9874 + }, + { + "epoch": 0.57, + "grad_norm": 0.3455087689241484, + "learning_rate": 8.312817441024324e-06, + "loss": 0.2336, + "step": 9875 + }, + { + "epoch": 0.57, + "grad_norm": 0.5069761719351095, + "learning_rate": 8.31098323509616e-06, + "loss": 0.3668, + "step": 9876 + }, + { + "epoch": 0.57, + "grad_norm": 0.38171523506523025, + "learning_rate": 8.309149087658576e-06, + "loss": 0.278, + "step": 9877 + }, + { + "epoch": 0.57, + "grad_norm": 0.37069169080183434, + "learning_rate": 8.307314998775087e-06, + "loss": 0.2917, + "step": 9878 + }, + { + "epoch": 0.57, + "grad_norm": 0.3392341659426202, + "learning_rate": 8.305480968509204e-06, + "loss": 0.3168, + "step": 9879 + }, + { + "epoch": 0.57, + "grad_norm": 0.5341042702280032, + "learning_rate": 8.303646996924445e-06, + "loss": 0.3165, + "step": 9880 + }, + { + "epoch": 0.57, + "grad_norm": 0.2169102274872214, + "learning_rate": 8.301813084084315e-06, + "loss": 0.1475, + "step": 9881 + }, + { + "epoch": 0.57, + "grad_norm": 0.5998300732121216, + "learning_rate": 8.299979230052327e-06, + "loss": 0.4361, + "step": 9882 + }, + { + "epoch": 0.57, + "grad_norm": 0.31913217855811254, + "learning_rate": 8.298145434891983e-06, + "loss": 0.2759, + "step": 9883 + }, + { + "epoch": 0.57, + "grad_norm": 0.3384502713432209, + "learning_rate": 8.296311698666792e-06, + "loss": 0.3219, + "step": 9884 + }, + { + "epoch": 0.57, + "grad_norm": 0.7524840483991162, + "learning_rate": 8.29447802144025e-06, + "loss": 0.4665, + "step": 9885 + }, + { + "epoch": 0.57, + "grad_norm": 0.2888469947900723, + "learning_rate": 8.292644403275865e-06, + "loss": 0.2242, + "step": 9886 + }, + { + "epoch": 0.57, + "grad_norm": 0.2532786012933536, + "learning_rate": 8.290810844237128e-06, + "loss": 0.2485, + "step": 9887 + }, + { + "epoch": 0.57, + "grad_norm": 0.7263770853364647, + "learning_rate": 8.28897734438754e-06, + "loss": 0.2903, + "step": 9888 + }, + { + "epoch": 0.57, + "grad_norm": 0.3729535266775858, + "learning_rate": 8.28714390379059e-06, + "loss": 0.2678, + "step": 9889 + }, + { + "epoch": 0.57, + "grad_norm": 0.4971089032472461, + "learning_rate": 8.285310522509777e-06, + "loss": 0.3529, + "step": 9890 + }, + { + "epoch": 0.57, + "grad_norm": 0.32657011764466537, + "learning_rate": 8.283477200608585e-06, + "loss": 0.2734, + "step": 9891 + }, + { + "epoch": 0.57, + "grad_norm": 0.4085045169813695, + "learning_rate": 8.281643938150504e-06, + "loss": 0.2973, + "step": 9892 + }, + { + "epoch": 0.57, + "grad_norm": 0.25379677627879427, + "learning_rate": 8.27981073519902e-06, + "loss": 0.1814, + "step": 9893 + }, + { + "epoch": 0.57, + "grad_norm": 0.8729493209016004, + "learning_rate": 8.277977591817617e-06, + "loss": 0.3863, + "step": 9894 + }, + { + "epoch": 0.57, + "grad_norm": 0.30599910372112143, + "learning_rate": 8.276144508069775e-06, + "loss": 0.2213, + "step": 9895 + }, + { + "epoch": 0.57, + "grad_norm": 0.38495626365299385, + "learning_rate": 8.274311484018975e-06, + "loss": 0.3128, + "step": 9896 + }, + { + "epoch": 0.57, + "grad_norm": 0.9544093017904444, + "learning_rate": 8.27247851972869e-06, + "loss": 0.6904, + "step": 9897 + }, + { + "epoch": 0.57, + "grad_norm": 0.4332396370100671, + "learning_rate": 8.270645615262405e-06, + "loss": 0.1635, + "step": 9898 + }, + { + "epoch": 0.57, + "grad_norm": 0.21958238359771295, + "learning_rate": 8.268812770683583e-06, + "loss": 0.2311, + "step": 9899 + }, + { + "epoch": 0.57, + "grad_norm": 0.7528076032824318, + "learning_rate": 8.266979986055704e-06, + "loss": 0.3034, + "step": 9900 + }, + { + "epoch": 0.57, + "grad_norm": 0.2277184270609133, + "learning_rate": 8.265147261442232e-06, + "loss": 0.0727, + "step": 9901 + }, + { + "epoch": 0.57, + "grad_norm": 0.3900048604849335, + "learning_rate": 8.263314596906636e-06, + "loss": 0.3282, + "step": 9902 + }, + { + "epoch": 0.57, + "grad_norm": 0.3516000182623754, + "learning_rate": 8.261481992512382e-06, + "loss": 0.3266, + "step": 9903 + }, + { + "epoch": 0.57, + "grad_norm": 0.41670733313075314, + "learning_rate": 8.25964944832293e-06, + "loss": 0.1948, + "step": 9904 + }, + { + "epoch": 0.57, + "grad_norm": 0.309400465234977, + "learning_rate": 8.257816964401745e-06, + "loss": 0.2583, + "step": 9905 + }, + { + "epoch": 0.57, + "grad_norm": 0.4465142891914701, + "learning_rate": 8.255984540812281e-06, + "loss": 0.3008, + "step": 9906 + }, + { + "epoch": 0.57, + "grad_norm": 0.28221141190811483, + "learning_rate": 8.254152177618e-06, + "loss": 0.2664, + "step": 9907 + }, + { + "epoch": 0.57, + "grad_norm": 0.3505287455300988, + "learning_rate": 8.252319874882351e-06, + "loss": 0.2389, + "step": 9908 + }, + { + "epoch": 0.57, + "grad_norm": 1.0994942283600821, + "learning_rate": 8.250487632668793e-06, + "loss": 0.7445, + "step": 9909 + }, + { + "epoch": 0.57, + "grad_norm": 0.522218227682, + "learning_rate": 8.248655451040768e-06, + "loss": 0.2764, + "step": 9910 + }, + { + "epoch": 0.57, + "grad_norm": 0.2631576808456909, + "learning_rate": 8.246823330061734e-06, + "loss": 0.2347, + "step": 9911 + }, + { + "epoch": 0.57, + "grad_norm": 0.40714845276580147, + "learning_rate": 8.24499126979513e-06, + "loss": 0.281, + "step": 9912 + }, + { + "epoch": 0.57, + "grad_norm": 0.28989171280742454, + "learning_rate": 8.243159270304406e-06, + "loss": 0.1528, + "step": 9913 + }, + { + "epoch": 0.57, + "grad_norm": 0.3758477950586018, + "learning_rate": 8.241327331652997e-06, + "loss": 0.2369, + "step": 9914 + }, + { + "epoch": 0.57, + "grad_norm": 0.35371479814125156, + "learning_rate": 8.23949545390435e-06, + "loss": 0.3125, + "step": 9915 + }, + { + "epoch": 0.57, + "grad_norm": 0.7429248777607208, + "learning_rate": 8.237663637121897e-06, + "loss": 0.4045, + "step": 9916 + }, + { + "epoch": 0.57, + "grad_norm": 0.32476985877232545, + "learning_rate": 8.23583188136908e-06, + "loss": 0.2445, + "step": 9917 + }, + { + "epoch": 0.57, + "grad_norm": 0.2535536568991639, + "learning_rate": 8.234000186709327e-06, + "loss": 0.2115, + "step": 9918 + }, + { + "epoch": 0.57, + "grad_norm": 0.424295206181053, + "learning_rate": 8.232168553206072e-06, + "loss": 0.3238, + "step": 9919 + }, + { + "epoch": 0.57, + "grad_norm": 0.32505346775808885, + "learning_rate": 8.230336980922744e-06, + "loss": 0.2858, + "step": 9920 + }, + { + "epoch": 0.57, + "grad_norm": 0.9237249641595617, + "learning_rate": 8.228505469922769e-06, + "loss": 0.4477, + "step": 9921 + }, + { + "epoch": 0.57, + "grad_norm": 0.4630004104417319, + "learning_rate": 8.226674020269576e-06, + "loss": 0.3362, + "step": 9922 + }, + { + "epoch": 0.57, + "grad_norm": 0.2790719952889474, + "learning_rate": 8.224842632026583e-06, + "loss": 0.2751, + "step": 9923 + }, + { + "epoch": 0.57, + "grad_norm": 0.7399588708936765, + "learning_rate": 8.223011305257214e-06, + "loss": 0.2648, + "step": 9924 + }, + { + "epoch": 0.57, + "grad_norm": 0.24700257914947743, + "learning_rate": 8.221180040024887e-06, + "loss": 0.1378, + "step": 9925 + }, + { + "epoch": 0.57, + "grad_norm": 0.36109041110206347, + "learning_rate": 8.21934883639302e-06, + "loss": 0.2974, + "step": 9926 + }, + { + "epoch": 0.57, + "grad_norm": 0.36150839242761174, + "learning_rate": 8.217517694425027e-06, + "loss": 0.2756, + "step": 9927 + }, + { + "epoch": 0.57, + "grad_norm": 0.5338739079601947, + "learning_rate": 8.215686614184317e-06, + "loss": 0.3363, + "step": 9928 + }, + { + "epoch": 0.57, + "grad_norm": 0.3856577519384225, + "learning_rate": 8.213855595734306e-06, + "loss": 0.3088, + "step": 9929 + }, + { + "epoch": 0.57, + "grad_norm": 0.4388607395567644, + "learning_rate": 8.212024639138398e-06, + "loss": 0.2918, + "step": 9930 + }, + { + "epoch": 0.57, + "grad_norm": 0.22893949698281893, + "learning_rate": 8.210193744459997e-06, + "loss": 0.1762, + "step": 9931 + }, + { + "epoch": 0.57, + "grad_norm": 0.38411850632766775, + "learning_rate": 8.20836291176251e-06, + "loss": 0.3018, + "step": 9932 + }, + { + "epoch": 0.57, + "grad_norm": 0.7225551464888974, + "learning_rate": 8.20653214110934e-06, + "loss": 0.5503, + "step": 9933 + }, + { + "epoch": 0.57, + "grad_norm": 0.4470092332721713, + "learning_rate": 8.204701432563886e-06, + "loss": 0.2851, + "step": 9934 + }, + { + "epoch": 0.57, + "grad_norm": 0.28828857751457004, + "learning_rate": 8.202870786189541e-06, + "loss": 0.2635, + "step": 9935 + }, + { + "epoch": 0.57, + "grad_norm": 0.9999090217246076, + "learning_rate": 8.201040202049705e-06, + "loss": 0.6211, + "step": 9936 + }, + { + "epoch": 0.57, + "grad_norm": 0.1682762823248837, + "learning_rate": 8.199209680207768e-06, + "loss": 0.0922, + "step": 9937 + }, + { + "epoch": 0.57, + "grad_norm": 0.35653206640661295, + "learning_rate": 8.197379220727124e-06, + "loss": 0.2672, + "step": 9938 + }, + { + "epoch": 0.57, + "grad_norm": 0.34611855850166906, + "learning_rate": 8.19554882367116e-06, + "loss": 0.3211, + "step": 9939 + }, + { + "epoch": 0.57, + "grad_norm": 0.7223852385709048, + "learning_rate": 8.193718489103261e-06, + "loss": 0.3111, + "step": 9940 + }, + { + "epoch": 0.57, + "grad_norm": 0.3878265662993998, + "learning_rate": 8.191888217086813e-06, + "loss": 0.3133, + "step": 9941 + }, + { + "epoch": 0.57, + "grad_norm": 0.9687515986579172, + "learning_rate": 8.190058007685203e-06, + "loss": 0.5916, + "step": 9942 + }, + { + "epoch": 0.57, + "grad_norm": 0.21520660321755714, + "learning_rate": 8.188227860961804e-06, + "loss": 0.1676, + "step": 9943 + }, + { + "epoch": 0.57, + "grad_norm": 0.343814550581402, + "learning_rate": 8.186397776979992e-06, + "loss": 0.2711, + "step": 9944 + }, + { + "epoch": 0.57, + "grad_norm": 0.9159296933741887, + "learning_rate": 8.184567755803153e-06, + "loss": 0.4973, + "step": 9945 + }, + { + "epoch": 0.57, + "grad_norm": 0.38092338926104646, + "learning_rate": 8.18273779749465e-06, + "loss": 0.3144, + "step": 9946 + }, + { + "epoch": 0.57, + "grad_norm": 0.29189600354117673, + "learning_rate": 8.180907902117862e-06, + "loss": 0.1929, + "step": 9947 + }, + { + "epoch": 0.57, + "grad_norm": 1.1424900738622954, + "learning_rate": 8.179078069736152e-06, + "loss": 0.6561, + "step": 9948 + }, + { + "epoch": 0.57, + "grad_norm": 0.22268099279160083, + "learning_rate": 8.177248300412893e-06, + "loss": 0.1552, + "step": 9949 + }, + { + "epoch": 0.57, + "grad_norm": 0.3583785102766591, + "learning_rate": 8.175418594211445e-06, + "loss": 0.2161, + "step": 9950 + }, + { + "epoch": 0.57, + "grad_norm": 0.5628579774742724, + "learning_rate": 8.173588951195175e-06, + "loss": 0.3217, + "step": 9951 + }, + { + "epoch": 0.57, + "grad_norm": 0.9119904572848944, + "learning_rate": 8.171759371427439e-06, + "loss": 0.492, + "step": 9952 + }, + { + "epoch": 0.57, + "grad_norm": 0.2712431380235841, + "learning_rate": 8.169929854971598e-06, + "loss": 0.2144, + "step": 9953 + }, + { + "epoch": 0.57, + "grad_norm": 0.41873206115862327, + "learning_rate": 8.168100401891007e-06, + "loss": 0.3233, + "step": 9954 + }, + { + "epoch": 0.57, + "grad_norm": 0.41447916444623933, + "learning_rate": 8.166271012249022e-06, + "loss": 0.26, + "step": 9955 + }, + { + "epoch": 0.57, + "grad_norm": 0.2745882143555028, + "learning_rate": 8.164441686108991e-06, + "loss": 0.1833, + "step": 9956 + }, + { + "epoch": 0.57, + "grad_norm": 0.8934275316001319, + "learning_rate": 8.162612423534266e-06, + "loss": 0.4585, + "step": 9957 + }, + { + "epoch": 0.57, + "grad_norm": 0.37576440877617384, + "learning_rate": 8.160783224588196e-06, + "loss": 0.3174, + "step": 9958 + }, + { + "epoch": 0.57, + "grad_norm": 0.34228438562146457, + "learning_rate": 8.15895408933412e-06, + "loss": 0.2592, + "step": 9959 + }, + { + "epoch": 0.57, + "grad_norm": 1.0015700431321963, + "learning_rate": 8.157125017835389e-06, + "loss": 0.2788, + "step": 9960 + }, + { + "epoch": 0.57, + "grad_norm": 0.3365784712512407, + "learning_rate": 8.15529601015534e-06, + "loss": 0.2207, + "step": 9961 + }, + { + "epoch": 0.57, + "grad_norm": 0.3051365494564319, + "learning_rate": 8.153467066357305e-06, + "loss": 0.2439, + "step": 9962 + }, + { + "epoch": 0.57, + "grad_norm": 0.4885135862243723, + "learning_rate": 8.15163818650463e-06, + "loss": 0.3013, + "step": 9963 + }, + { + "epoch": 0.57, + "grad_norm": 0.38036239490366125, + "learning_rate": 8.149809370660643e-06, + "loss": 0.2774, + "step": 9964 + }, + { + "epoch": 0.57, + "grad_norm": 0.39821866202603395, + "learning_rate": 8.14798061888868e-06, + "loss": 0.2925, + "step": 9965 + }, + { + "epoch": 0.57, + "grad_norm": 0.3459147936489671, + "learning_rate": 8.146151931252067e-06, + "loss": 0.2577, + "step": 9966 + }, + { + "epoch": 0.57, + "grad_norm": 0.590203539009197, + "learning_rate": 8.144323307814133e-06, + "loss": 0.3359, + "step": 9967 + }, + { + "epoch": 0.57, + "grad_norm": 0.3644950630409327, + "learning_rate": 8.142494748638204e-06, + "loss": 0.2875, + "step": 9968 + }, + { + "epoch": 0.57, + "grad_norm": 0.38221492079631614, + "learning_rate": 8.140666253787602e-06, + "loss": 0.2538, + "step": 9969 + }, + { + "epoch": 0.57, + "grad_norm": 0.399983798988496, + "learning_rate": 8.138837823325647e-06, + "loss": 0.3374, + "step": 9970 + }, + { + "epoch": 0.57, + "grad_norm": 0.35302886052202936, + "learning_rate": 8.137009457315658e-06, + "loss": 0.2862, + "step": 9971 + }, + { + "epoch": 0.57, + "grad_norm": 0.40507971925164943, + "learning_rate": 8.135181155820953e-06, + "loss": 0.2151, + "step": 9972 + }, + { + "epoch": 0.57, + "grad_norm": 0.5666935537735343, + "learning_rate": 8.13335291890484e-06, + "loss": 0.3137, + "step": 9973 + }, + { + "epoch": 0.57, + "grad_norm": 0.32067561902428615, + "learning_rate": 8.13152474663064e-06, + "loss": 0.2686, + "step": 9974 + }, + { + "epoch": 0.57, + "grad_norm": 0.33814247044172163, + "learning_rate": 8.129696639061654e-06, + "loss": 0.286, + "step": 9975 + }, + { + "epoch": 0.57, + "grad_norm": 0.3246915939454487, + "learning_rate": 8.127868596261198e-06, + "loss": 0.1764, + "step": 9976 + }, + { + "epoch": 0.57, + "grad_norm": 0.26615674398341954, + "learning_rate": 8.126040618292566e-06, + "loss": 0.2069, + "step": 9977 + }, + { + "epoch": 0.57, + "grad_norm": 0.4585736080937011, + "learning_rate": 8.124212705219071e-06, + "loss": 0.3394, + "step": 9978 + }, + { + "epoch": 0.57, + "grad_norm": 0.4703515008785008, + "learning_rate": 8.122384857104006e-06, + "loss": 0.2791, + "step": 9979 + }, + { + "epoch": 0.57, + "grad_norm": 0.3081159193556577, + "learning_rate": 8.120557074010677e-06, + "loss": 0.251, + "step": 9980 + }, + { + "epoch": 0.57, + "grad_norm": 1.0902723825131568, + "learning_rate": 8.118729356002371e-06, + "loss": 0.6268, + "step": 9981 + }, + { + "epoch": 0.57, + "grad_norm": 0.3609668953923149, + "learning_rate": 8.11690170314239e-06, + "loss": 0.3442, + "step": 9982 + }, + { + "epoch": 0.57, + "grad_norm": 0.23808650024305908, + "learning_rate": 8.115074115494022e-06, + "loss": 0.1346, + "step": 9983 + }, + { + "epoch": 0.57, + "grad_norm": 0.34721727640723965, + "learning_rate": 8.113246593120554e-06, + "loss": 0.2742, + "step": 9984 + }, + { + "epoch": 0.57, + "grad_norm": 0.8939576818059674, + "learning_rate": 8.111419136085278e-06, + "loss": 0.4012, + "step": 9985 + }, + { + "epoch": 0.57, + "grad_norm": 0.37324261637531364, + "learning_rate": 8.109591744451472e-06, + "loss": 0.2016, + "step": 9986 + }, + { + "epoch": 0.57, + "grad_norm": 0.407045014028079, + "learning_rate": 8.107764418282427e-06, + "loss": 0.3083, + "step": 9987 + }, + { + "epoch": 0.57, + "grad_norm": 1.0490869448387106, + "learning_rate": 8.105937157641416e-06, + "loss": 0.6916, + "step": 9988 + }, + { + "epoch": 0.57, + "grad_norm": 0.2294748663492301, + "learning_rate": 8.104109962591722e-06, + "loss": 0.1641, + "step": 9989 + }, + { + "epoch": 0.57, + "grad_norm": 0.28437388048864876, + "learning_rate": 8.102282833196616e-06, + "loss": 0.2555, + "step": 9990 + }, + { + "epoch": 0.57, + "grad_norm": 0.774892354817154, + "learning_rate": 8.100455769519377e-06, + "loss": 0.4188, + "step": 9991 + }, + { + "epoch": 0.57, + "grad_norm": 0.29569982792830685, + "learning_rate": 8.09862877162327e-06, + "loss": 0.1961, + "step": 9992 + }, + { + "epoch": 0.57, + "grad_norm": 1.0072369091205406, + "learning_rate": 8.096801839571569e-06, + "loss": 0.6177, + "step": 9993 + }, + { + "epoch": 0.57, + "grad_norm": 0.3855864777216197, + "learning_rate": 8.094974973427541e-06, + "loss": 0.3251, + "step": 9994 + }, + { + "epoch": 0.57, + "grad_norm": 0.39142238746136504, + "learning_rate": 8.093148173254445e-06, + "loss": 0.2776, + "step": 9995 + }, + { + "epoch": 0.57, + "grad_norm": 0.23644514454316207, + "learning_rate": 8.091321439115543e-06, + "loss": 0.1532, + "step": 9996 + }, + { + "epoch": 0.57, + "grad_norm": 0.7087803679347151, + "learning_rate": 8.089494771074102e-06, + "loss": 0.4177, + "step": 9997 + }, + { + "epoch": 0.57, + "grad_norm": 0.2797651307049413, + "learning_rate": 8.08766816919337e-06, + "loss": 0.2555, + "step": 9998 + }, + { + "epoch": 0.57, + "grad_norm": 0.4959220267266461, + "learning_rate": 8.085841633536611e-06, + "loss": 0.2582, + "step": 9999 + }, + { + "epoch": 0.57, + "grad_norm": 1.0009644158738145, + "learning_rate": 8.084015164167071e-06, + "loss": 0.6752, + "step": 10000 + }, + { + "epoch": 0.57, + "grad_norm": 0.32313335118258313, + "learning_rate": 8.082188761148007e-06, + "loss": 0.1926, + "step": 10001 + }, + { + "epoch": 0.57, + "grad_norm": 0.2669066233121326, + "learning_rate": 8.08036242454266e-06, + "loss": 0.2554, + "step": 10002 + }, + { + "epoch": 0.57, + "grad_norm": 0.4574088631849031, + "learning_rate": 8.078536154414283e-06, + "loss": 0.2827, + "step": 10003 + }, + { + "epoch": 0.57, + "grad_norm": 0.507204777496856, + "learning_rate": 8.076709950826113e-06, + "loss": 0.2304, + "step": 10004 + }, + { + "epoch": 0.57, + "grad_norm": 0.38781786620956266, + "learning_rate": 8.074883813841397e-06, + "loss": 0.2686, + "step": 10005 + }, + { + "epoch": 0.57, + "grad_norm": 0.36301702077513615, + "learning_rate": 8.073057743523371e-06, + "loss": 0.3049, + "step": 10006 + }, + { + "epoch": 0.57, + "grad_norm": 0.4861354405825026, + "learning_rate": 8.071231739935272e-06, + "loss": 0.3207, + "step": 10007 + }, + { + "epoch": 0.58, + "grad_norm": 0.35373086697907874, + "learning_rate": 8.069405803140338e-06, + "loss": 0.3251, + "step": 10008 + }, + { + "epoch": 0.58, + "grad_norm": 0.2171049695260447, + "learning_rate": 8.067579933201793e-06, + "loss": 0.0853, + "step": 10009 + }, + { + "epoch": 0.58, + "grad_norm": 0.26086832974147256, + "learning_rate": 8.065754130182876e-06, + "loss": 0.2698, + "step": 10010 + }, + { + "epoch": 0.58, + "grad_norm": 0.5370388860704075, + "learning_rate": 8.063928394146806e-06, + "loss": 0.4125, + "step": 10011 + }, + { + "epoch": 0.58, + "grad_norm": 0.7740071781472809, + "learning_rate": 8.062102725156818e-06, + "loss": 0.3091, + "step": 10012 + }, + { + "epoch": 0.58, + "grad_norm": 0.36948235193836254, + "learning_rate": 8.060277123276125e-06, + "loss": 0.3018, + "step": 10013 + }, + { + "epoch": 0.58, + "grad_norm": 0.40449012511592974, + "learning_rate": 8.058451588567954e-06, + "loss": 0.3211, + "step": 10014 + }, + { + "epoch": 0.58, + "grad_norm": 0.23658106450923316, + "learning_rate": 8.05662612109552e-06, + "loss": 0.1578, + "step": 10015 + }, + { + "epoch": 0.58, + "grad_norm": 0.3824604885675158, + "learning_rate": 8.05480072092204e-06, + "loss": 0.2864, + "step": 10016 + }, + { + "epoch": 0.58, + "grad_norm": 0.5705382545976666, + "learning_rate": 8.052975388110726e-06, + "loss": 0.3922, + "step": 10017 + }, + { + "epoch": 0.58, + "grad_norm": 0.3746308523465573, + "learning_rate": 8.051150122724793e-06, + "loss": 0.2776, + "step": 10018 + }, + { + "epoch": 0.58, + "grad_norm": 0.6099067258468585, + "learning_rate": 8.049324924827447e-06, + "loss": 0.3158, + "step": 10019 + }, + { + "epoch": 0.58, + "grad_norm": 0.38826376456495426, + "learning_rate": 8.047499794481894e-06, + "loss": 0.3084, + "step": 10020 + }, + { + "epoch": 0.58, + "grad_norm": 0.24314199497277397, + "learning_rate": 8.045674731751338e-06, + "loss": 0.2092, + "step": 10021 + }, + { + "epoch": 0.58, + "grad_norm": 0.31113245462098016, + "learning_rate": 8.043849736698986e-06, + "loss": 0.1859, + "step": 10022 + }, + { + "epoch": 0.58, + "grad_norm": 0.37576660120750494, + "learning_rate": 8.04202480938803e-06, + "loss": 0.3239, + "step": 10023 + }, + { + "epoch": 0.58, + "grad_norm": 0.7344082199207784, + "learning_rate": 8.040199949881672e-06, + "loss": 0.4354, + "step": 10024 + }, + { + "epoch": 0.58, + "grad_norm": 0.45688911337935884, + "learning_rate": 8.038375158243108e-06, + "loss": 0.2071, + "step": 10025 + }, + { + "epoch": 0.58, + "grad_norm": 0.2761160102991935, + "learning_rate": 8.036550434535522e-06, + "loss": 0.2761, + "step": 10026 + }, + { + "epoch": 0.58, + "grad_norm": 1.061528670972983, + "learning_rate": 8.034725778822114e-06, + "loss": 0.5102, + "step": 10027 + }, + { + "epoch": 0.58, + "grad_norm": 0.16482060521116781, + "learning_rate": 8.032901191166071e-06, + "loss": 0.0969, + "step": 10028 + }, + { + "epoch": 0.58, + "grad_norm": 0.30696431779330907, + "learning_rate": 8.03107667163057e-06, + "loss": 0.2962, + "step": 10029 + }, + { + "epoch": 0.58, + "grad_norm": 0.5561323310489525, + "learning_rate": 8.029252220278802e-06, + "loss": 0.3496, + "step": 10030 + }, + { + "epoch": 0.58, + "grad_norm": 0.45238876227242075, + "learning_rate": 8.02742783717394e-06, + "loss": 0.2456, + "step": 10031 + }, + { + "epoch": 0.58, + "grad_norm": 0.5396238256737506, + "learning_rate": 8.025603522379172e-06, + "loss": 0.3749, + "step": 10032 + }, + { + "epoch": 0.58, + "grad_norm": 0.3195175238113092, + "learning_rate": 8.023779275957668e-06, + "loss": 0.2389, + "step": 10033 + }, + { + "epoch": 0.58, + "grad_norm": 0.24479568402304927, + "learning_rate": 8.021955097972602e-06, + "loss": 0.2131, + "step": 10034 + }, + { + "epoch": 0.58, + "grad_norm": 0.3826033703959546, + "learning_rate": 8.020130988487146e-06, + "loss": 0.2464, + "step": 10035 + }, + { + "epoch": 0.58, + "grad_norm": 0.7693896830289918, + "learning_rate": 8.01830694756447e-06, + "loss": 0.4276, + "step": 10036 + }, + { + "epoch": 0.58, + "grad_norm": 0.3963773983506641, + "learning_rate": 8.016482975267738e-06, + "loss": 0.2943, + "step": 10037 + }, + { + "epoch": 0.58, + "grad_norm": 0.28363379615809237, + "learning_rate": 8.014659071660113e-06, + "loss": 0.2321, + "step": 10038 + }, + { + "epoch": 0.58, + "grad_norm": 0.41409455127608996, + "learning_rate": 8.012835236804764e-06, + "loss": 0.2602, + "step": 10039 + }, + { + "epoch": 0.58, + "grad_norm": 0.33374553909505866, + "learning_rate": 8.01101147076484e-06, + "loss": 0.2047, + "step": 10040 + }, + { + "epoch": 0.58, + "grad_norm": 0.3514808777272365, + "learning_rate": 8.009187773603508e-06, + "loss": 0.246, + "step": 10041 + }, + { + "epoch": 0.58, + "grad_norm": 0.3632666739681491, + "learning_rate": 8.007364145383914e-06, + "loss": 0.3166, + "step": 10042 + }, + { + "epoch": 0.58, + "grad_norm": 0.809247352433276, + "learning_rate": 8.005540586169216e-06, + "loss": 0.4513, + "step": 10043 + }, + { + "epoch": 0.58, + "grad_norm": 0.31999804489355055, + "learning_rate": 8.003717096022561e-06, + "loss": 0.2042, + "step": 10044 + }, + { + "epoch": 0.58, + "grad_norm": 0.45816885307178257, + "learning_rate": 8.001893675007098e-06, + "loss": 0.3312, + "step": 10045 + }, + { + "epoch": 0.58, + "grad_norm": 0.218345571007771, + "learning_rate": 8.00007032318597e-06, + "loss": 0.1845, + "step": 10046 + }, + { + "epoch": 0.58, + "grad_norm": 0.337060392878494, + "learning_rate": 7.99824704062232e-06, + "loss": 0.2972, + "step": 10047 + }, + { + "epoch": 0.58, + "grad_norm": 0.7301838529307065, + "learning_rate": 7.996423827379292e-06, + "loss": 0.3609, + "step": 10048 + }, + { + "epoch": 0.58, + "grad_norm": 0.32799631732078843, + "learning_rate": 7.994600683520018e-06, + "loss": 0.3072, + "step": 10049 + }, + { + "epoch": 0.58, + "grad_norm": 0.4005912109187204, + "learning_rate": 7.992777609107638e-06, + "loss": 0.3005, + "step": 10050 + }, + { + "epoch": 0.58, + "grad_norm": 0.8157284221590041, + "learning_rate": 7.99095460420528e-06, + "loss": 0.3047, + "step": 10051 + }, + { + "epoch": 0.58, + "grad_norm": 0.3615354345631635, + "learning_rate": 7.989131668876081e-06, + "loss": 0.2347, + "step": 10052 + }, + { + "epoch": 0.58, + "grad_norm": 0.3831280211895221, + "learning_rate": 7.987308803183164e-06, + "loss": 0.3028, + "step": 10053 + }, + { + "epoch": 0.58, + "grad_norm": 0.3977051857632209, + "learning_rate": 7.985486007189658e-06, + "loss": 0.2917, + "step": 10054 + }, + { + "epoch": 0.58, + "grad_norm": 0.3577210765059678, + "learning_rate": 7.983663280958682e-06, + "loss": 0.2307, + "step": 10055 + }, + { + "epoch": 0.58, + "grad_norm": 0.37861640099556787, + "learning_rate": 7.981840624553364e-06, + "loss": 0.2822, + "step": 10056 + }, + { + "epoch": 0.58, + "grad_norm": 0.3719306516421558, + "learning_rate": 7.980018038036815e-06, + "loss": 0.313, + "step": 10057 + }, + { + "epoch": 0.58, + "grad_norm": 0.7053848501492725, + "learning_rate": 7.978195521472157e-06, + "loss": 0.3032, + "step": 10058 + }, + { + "epoch": 0.58, + "grad_norm": 0.3060816409264136, + "learning_rate": 7.976373074922498e-06, + "loss": 0.2589, + "step": 10059 + }, + { + "epoch": 0.58, + "grad_norm": 0.7008923801011527, + "learning_rate": 7.974550698450956e-06, + "loss": 0.5585, + "step": 10060 + }, + { + "epoch": 0.58, + "grad_norm": 0.22270529904066977, + "learning_rate": 7.972728392120634e-06, + "loss": 0.1837, + "step": 10061 + }, + { + "epoch": 0.58, + "grad_norm": 0.2558814298202647, + "learning_rate": 7.970906155994646e-06, + "loss": 0.2174, + "step": 10062 + }, + { + "epoch": 0.58, + "grad_norm": 1.0344165023559295, + "learning_rate": 7.969083990136084e-06, + "loss": 0.5085, + "step": 10063 + }, + { + "epoch": 0.58, + "grad_norm": 0.7397533513729492, + "learning_rate": 7.967261894608058e-06, + "loss": 0.3023, + "step": 10064 + }, + { + "epoch": 0.58, + "grad_norm": 0.298771462575836, + "learning_rate": 7.965439869473664e-06, + "loss": 0.2718, + "step": 10065 + }, + { + "epoch": 0.58, + "grad_norm": 0.492680151076164, + "learning_rate": 7.963617914796002e-06, + "loss": 0.3987, + "step": 10066 + }, + { + "epoch": 0.58, + "grad_norm": 0.17351851611022887, + "learning_rate": 7.961796030638162e-06, + "loss": 0.096, + "step": 10067 + }, + { + "epoch": 0.58, + "grad_norm": 0.34316673447226265, + "learning_rate": 7.95997421706324e-06, + "loss": 0.2893, + "step": 10068 + }, + { + "epoch": 0.58, + "grad_norm": 0.3809433763662325, + "learning_rate": 7.95815247413432e-06, + "loss": 0.3244, + "step": 10069 + }, + { + "epoch": 0.58, + "grad_norm": 0.8279519992417633, + "learning_rate": 7.956330801914495e-06, + "loss": 0.3853, + "step": 10070 + }, + { + "epoch": 0.58, + "grad_norm": 0.3236230187941723, + "learning_rate": 7.954509200466845e-06, + "loss": 0.2006, + "step": 10071 + }, + { + "epoch": 0.58, + "grad_norm": 0.9845789891659482, + "learning_rate": 7.952687669854453e-06, + "loss": 0.7532, + "step": 10072 + }, + { + "epoch": 0.58, + "grad_norm": 0.2719432805518241, + "learning_rate": 7.950866210140401e-06, + "loss": 0.2321, + "step": 10073 + }, + { + "epoch": 0.58, + "grad_norm": 0.25669260749844697, + "learning_rate": 7.949044821387761e-06, + "loss": 0.1774, + "step": 10074 + }, + { + "epoch": 0.58, + "grad_norm": 0.8589351191506319, + "learning_rate": 7.947223503659613e-06, + "loss": 0.4398, + "step": 10075 + }, + { + "epoch": 0.58, + "grad_norm": 0.8104627329921519, + "learning_rate": 7.945402257019026e-06, + "loss": 0.4334, + "step": 10076 + }, + { + "epoch": 0.58, + "grad_norm": 0.28954849191593923, + "learning_rate": 7.943581081529072e-06, + "loss": 0.2135, + "step": 10077 + }, + { + "epoch": 0.58, + "grad_norm": 0.5205399924008268, + "learning_rate": 7.941759977252815e-06, + "loss": 0.4581, + "step": 10078 + }, + { + "epoch": 0.58, + "grad_norm": 0.28954303976259166, + "learning_rate": 7.939938944253321e-06, + "loss": 0.1576, + "step": 10079 + }, + { + "epoch": 0.58, + "grad_norm": 0.3201876263622221, + "learning_rate": 7.938117982593653e-06, + "loss": 0.203, + "step": 10080 + }, + { + "epoch": 0.58, + "grad_norm": 0.6596627329890012, + "learning_rate": 7.936297092336872e-06, + "loss": 0.3682, + "step": 10081 + }, + { + "epoch": 0.58, + "grad_norm": 0.5906861427138822, + "learning_rate": 7.934476273546032e-06, + "loss": 0.3605, + "step": 10082 + }, + { + "epoch": 0.58, + "grad_norm": 0.30195190043595455, + "learning_rate": 7.932655526284192e-06, + "loss": 0.2666, + "step": 10083 + }, + { + "epoch": 0.58, + "grad_norm": 0.992932228624963, + "learning_rate": 7.930834850614399e-06, + "loss": 0.5499, + "step": 10084 + }, + { + "epoch": 0.58, + "grad_norm": 0.28475196579428813, + "learning_rate": 7.92901424659971e-06, + "loss": 0.2582, + "step": 10085 + }, + { + "epoch": 0.58, + "grad_norm": 0.32554733501197314, + "learning_rate": 7.927193714303166e-06, + "loss": 0.2448, + "step": 10086 + }, + { + "epoch": 0.58, + "grad_norm": 0.38443162661917063, + "learning_rate": 7.925373253787817e-06, + "loss": 0.2281, + "step": 10087 + }, + { + "epoch": 0.58, + "grad_norm": 0.4907863733563514, + "learning_rate": 7.923552865116701e-06, + "loss": 0.3353, + "step": 10088 + }, + { + "epoch": 0.58, + "grad_norm": 0.39379032559924637, + "learning_rate": 7.92173254835286e-06, + "loss": 0.2464, + "step": 10089 + }, + { + "epoch": 0.58, + "grad_norm": 0.37205057590132345, + "learning_rate": 7.919912303559334e-06, + "loss": 0.2769, + "step": 10090 + }, + { + "epoch": 0.58, + "grad_norm": 0.37416042075941736, + "learning_rate": 7.91809213079915e-06, + "loss": 0.2306, + "step": 10091 + }, + { + "epoch": 0.58, + "grad_norm": 0.36138527919034713, + "learning_rate": 7.916272030135353e-06, + "loss": 0.2915, + "step": 10092 + }, + { + "epoch": 0.58, + "grad_norm": 0.3876317520648567, + "learning_rate": 7.91445200163096e-06, + "loss": 0.2792, + "step": 10093 + }, + { + "epoch": 0.58, + "grad_norm": 0.3951992348618812, + "learning_rate": 7.912632045349008e-06, + "loss": 0.2647, + "step": 10094 + }, + { + "epoch": 0.58, + "grad_norm": 0.33494661203789694, + "learning_rate": 7.910812161352517e-06, + "loss": 0.2683, + "step": 10095 + }, + { + "epoch": 0.58, + "grad_norm": 0.9173367452830046, + "learning_rate": 7.908992349704515e-06, + "loss": 0.7132, + "step": 10096 + }, + { + "epoch": 0.58, + "grad_norm": 0.3487243075292584, + "learning_rate": 7.907172610468015e-06, + "loss": 0.2215, + "step": 10097 + }, + { + "epoch": 0.58, + "grad_norm": 0.32608359551160127, + "learning_rate": 7.905352943706035e-06, + "loss": 0.2646, + "step": 10098 + }, + { + "epoch": 0.58, + "grad_norm": 0.42778870571825733, + "learning_rate": 7.903533349481596e-06, + "loss": 0.2771, + "step": 10099 + }, + { + "epoch": 0.58, + "grad_norm": 0.29251505997739846, + "learning_rate": 7.901713827857705e-06, + "loss": 0.1692, + "step": 10100 + }, + { + "epoch": 0.58, + "grad_norm": 0.28561704547379524, + "learning_rate": 7.899894378897374e-06, + "loss": 0.275, + "step": 10101 + }, + { + "epoch": 0.58, + "grad_norm": 1.1525092467625975, + "learning_rate": 7.898075002663612e-06, + "loss": 0.7976, + "step": 10102 + }, + { + "epoch": 0.58, + "grad_norm": 0.6225456716829496, + "learning_rate": 7.89625569921942e-06, + "loss": 0.335, + "step": 10103 + }, + { + "epoch": 0.58, + "grad_norm": 0.3281355566608314, + "learning_rate": 7.894436468627804e-06, + "loss": 0.2564, + "step": 10104 + }, + { + "epoch": 0.58, + "grad_norm": 0.36381880629843233, + "learning_rate": 7.892617310951761e-06, + "loss": 0.3175, + "step": 10105 + }, + { + "epoch": 0.58, + "grad_norm": 0.2159222699318665, + "learning_rate": 7.890798226254291e-06, + "loss": 0.1054, + "step": 10106 + }, + { + "epoch": 0.58, + "grad_norm": 0.35946385596297437, + "learning_rate": 7.888979214598387e-06, + "loss": 0.2295, + "step": 10107 + }, + { + "epoch": 0.58, + "grad_norm": 0.4936241577077283, + "learning_rate": 7.887160276047045e-06, + "loss": 0.376, + "step": 10108 + }, + { + "epoch": 0.58, + "grad_norm": 0.33073585211025036, + "learning_rate": 7.885341410663248e-06, + "loss": 0.2954, + "step": 10109 + }, + { + "epoch": 0.58, + "grad_norm": 0.34079160331467057, + "learning_rate": 7.883522618509989e-06, + "loss": 0.2056, + "step": 10110 + }, + { + "epoch": 0.58, + "grad_norm": 0.351615537794108, + "learning_rate": 7.881703899650249e-06, + "loss": 0.2591, + "step": 10111 + }, + { + "epoch": 0.58, + "grad_norm": 0.39871038149815147, + "learning_rate": 7.879885254147014e-06, + "loss": 0.2566, + "step": 10112 + }, + { + "epoch": 0.58, + "grad_norm": 0.24185991194383902, + "learning_rate": 7.878066682063262e-06, + "loss": 0.2185, + "step": 10113 + }, + { + "epoch": 0.58, + "grad_norm": 1.558532170715619, + "learning_rate": 7.876248183461967e-06, + "loss": 0.7215, + "step": 10114 + }, + { + "epoch": 0.58, + "grad_norm": 0.6384019101854085, + "learning_rate": 7.874429758406108e-06, + "loss": 0.3869, + "step": 10115 + }, + { + "epoch": 0.58, + "grad_norm": 0.3028402041371424, + "learning_rate": 7.872611406958653e-06, + "loss": 0.2154, + "step": 10116 + }, + { + "epoch": 0.58, + "grad_norm": 0.3622538842476559, + "learning_rate": 7.870793129182577e-06, + "loss": 0.2996, + "step": 10117 + }, + { + "epoch": 0.58, + "grad_norm": 0.27173806317093663, + "learning_rate": 7.86897492514084e-06, + "loss": 0.1832, + "step": 10118 + }, + { + "epoch": 0.58, + "grad_norm": 0.30861485287327556, + "learning_rate": 7.86715679489641e-06, + "loss": 0.2098, + "step": 10119 + }, + { + "epoch": 0.58, + "grad_norm": 1.2068126960800758, + "learning_rate": 7.86533873851225e-06, + "loss": 0.6609, + "step": 10120 + }, + { + "epoch": 0.58, + "grad_norm": 0.36384977954229525, + "learning_rate": 7.863520756051317e-06, + "loss": 0.3154, + "step": 10121 + }, + { + "epoch": 0.58, + "grad_norm": 0.4049015882904989, + "learning_rate": 7.861702847576568e-06, + "loss": 0.293, + "step": 10122 + }, + { + "epoch": 0.58, + "grad_norm": 0.5157148425875564, + "learning_rate": 7.859885013150959e-06, + "loss": 0.2408, + "step": 10123 + }, + { + "epoch": 0.58, + "grad_norm": 0.23292577160493733, + "learning_rate": 7.858067252837437e-06, + "loss": 0.2067, + "step": 10124 + }, + { + "epoch": 0.58, + "grad_norm": 0.5576874400588665, + "learning_rate": 7.856249566698957e-06, + "loss": 0.3294, + "step": 10125 + }, + { + "epoch": 0.58, + "grad_norm": 0.331359061847375, + "learning_rate": 7.854431954798463e-06, + "loss": 0.2648, + "step": 10126 + }, + { + "epoch": 0.58, + "grad_norm": 0.5790983417813569, + "learning_rate": 7.852614417198894e-06, + "loss": 0.3896, + "step": 10127 + }, + { + "epoch": 0.58, + "grad_norm": 0.5886527390523273, + "learning_rate": 7.850796953963198e-06, + "loss": 0.3371, + "step": 10128 + }, + { + "epoch": 0.58, + "grad_norm": 0.2781699195992228, + "learning_rate": 7.848979565154314e-06, + "loss": 0.2399, + "step": 10129 + }, + { + "epoch": 0.58, + "grad_norm": 0.25233740774202973, + "learning_rate": 7.847162250835171e-06, + "loss": 0.1594, + "step": 10130 + }, + { + "epoch": 0.58, + "grad_norm": 0.5636084765264904, + "learning_rate": 7.845345011068709e-06, + "loss": 0.4195, + "step": 10131 + }, + { + "epoch": 0.58, + "grad_norm": 0.29482534737792404, + "learning_rate": 7.843527845917855e-06, + "loss": 0.2604, + "step": 10132 + }, + { + "epoch": 0.58, + "grad_norm": 0.44948437825709303, + "learning_rate": 7.84171075544554e-06, + "loss": 0.3688, + "step": 10133 + }, + { + "epoch": 0.58, + "grad_norm": 0.386973790913395, + "learning_rate": 7.839893739714686e-06, + "loss": 0.2615, + "step": 10134 + }, + { + "epoch": 0.58, + "grad_norm": 0.4697085406586142, + "learning_rate": 7.83807679878822e-06, + "loss": 0.3673, + "step": 10135 + }, + { + "epoch": 0.58, + "grad_norm": 0.2578535107262174, + "learning_rate": 7.836259932729062e-06, + "loss": 0.203, + "step": 10136 + }, + { + "epoch": 0.58, + "grad_norm": 0.3996683546043715, + "learning_rate": 7.834443141600131e-06, + "loss": 0.3033, + "step": 10137 + }, + { + "epoch": 0.58, + "grad_norm": 0.5165287449268846, + "learning_rate": 7.83262642546434e-06, + "loss": 0.3747, + "step": 10138 + }, + { + "epoch": 0.58, + "grad_norm": 0.4618223329699041, + "learning_rate": 7.830809784384602e-06, + "loss": 0.2441, + "step": 10139 + }, + { + "epoch": 0.58, + "grad_norm": 0.30623960016940566, + "learning_rate": 7.828993218423829e-06, + "loss": 0.2678, + "step": 10140 + }, + { + "epoch": 0.58, + "grad_norm": 0.35976134707155416, + "learning_rate": 7.827176727644925e-06, + "loss": 0.3079, + "step": 10141 + }, + { + "epoch": 0.58, + "grad_norm": 0.28324400036157693, + "learning_rate": 7.825360312110801e-06, + "loss": 0.1523, + "step": 10142 + }, + { + "epoch": 0.58, + "grad_norm": 0.7363848108703631, + "learning_rate": 7.823543971884353e-06, + "loss": 0.3966, + "step": 10143 + }, + { + "epoch": 0.58, + "grad_norm": 0.3260077080948101, + "learning_rate": 7.821727707028486e-06, + "loss": 0.2802, + "step": 10144 + }, + { + "epoch": 0.58, + "grad_norm": 0.27632507657679467, + "learning_rate": 7.81991151760609e-06, + "loss": 0.2405, + "step": 10145 + }, + { + "epoch": 0.58, + "grad_norm": 0.3488908220712134, + "learning_rate": 7.81809540368007e-06, + "loss": 0.1139, + "step": 10146 + }, + { + "epoch": 0.58, + "grad_norm": 0.37529873675347386, + "learning_rate": 7.81627936531331e-06, + "loss": 0.2968, + "step": 10147 + }, + { + "epoch": 0.58, + "grad_norm": 0.3596295347448901, + "learning_rate": 7.814463402568703e-06, + "loss": 0.3217, + "step": 10148 + }, + { + "epoch": 0.58, + "grad_norm": 0.6919007612501898, + "learning_rate": 7.812647515509131e-06, + "loss": 0.245, + "step": 10149 + }, + { + "epoch": 0.58, + "grad_norm": 0.36005726352566714, + "learning_rate": 7.810831704197486e-06, + "loss": 0.3193, + "step": 10150 + }, + { + "epoch": 0.58, + "grad_norm": 0.3847714209623518, + "learning_rate": 7.80901596869664e-06, + "loss": 0.2601, + "step": 10151 + }, + { + "epoch": 0.58, + "grad_norm": 0.21052130026797297, + "learning_rate": 7.807200309069482e-06, + "loss": 0.1744, + "step": 10152 + }, + { + "epoch": 0.58, + "grad_norm": 0.40305583102744774, + "learning_rate": 7.805384725378881e-06, + "loss": 0.3051, + "step": 10153 + }, + { + "epoch": 0.58, + "grad_norm": 1.0107321479306213, + "learning_rate": 7.803569217687711e-06, + "loss": 0.5662, + "step": 10154 + }, + { + "epoch": 0.58, + "grad_norm": 0.6122037063934481, + "learning_rate": 7.801753786058847e-06, + "loss": 0.2446, + "step": 10155 + }, + { + "epoch": 0.58, + "grad_norm": 0.5214675627145766, + "learning_rate": 7.799938430555152e-06, + "loss": 0.3822, + "step": 10156 + }, + { + "epoch": 0.58, + "grad_norm": 0.35482096271578345, + "learning_rate": 7.798123151239497e-06, + "loss": 0.3462, + "step": 10157 + }, + { + "epoch": 0.58, + "grad_norm": 0.24059969586075008, + "learning_rate": 7.79630794817474e-06, + "loss": 0.1636, + "step": 10158 + }, + { + "epoch": 0.58, + "grad_norm": 0.39992023315441017, + "learning_rate": 7.794492821423747e-06, + "loss": 0.2323, + "step": 10159 + }, + { + "epoch": 0.58, + "grad_norm": 0.36592149517228917, + "learning_rate": 7.79267777104937e-06, + "loss": 0.3129, + "step": 10160 + }, + { + "epoch": 0.58, + "grad_norm": 0.6464934747226485, + "learning_rate": 7.79086279711447e-06, + "loss": 0.3873, + "step": 10161 + }, + { + "epoch": 0.58, + "grad_norm": 0.32063480202839684, + "learning_rate": 7.789047899681893e-06, + "loss": 0.2247, + "step": 10162 + }, + { + "epoch": 0.58, + "grad_norm": 0.2778589463458745, + "learning_rate": 7.787233078814497e-06, + "loss": 0.2119, + "step": 10163 + }, + { + "epoch": 0.58, + "grad_norm": 0.3474062922703908, + "learning_rate": 7.785418334575122e-06, + "loss": 0.3213, + "step": 10164 + }, + { + "epoch": 0.58, + "grad_norm": 0.3227767768744291, + "learning_rate": 7.783603667026616e-06, + "loss": 0.2222, + "step": 10165 + }, + { + "epoch": 0.58, + "grad_norm": 0.7112062417756925, + "learning_rate": 7.781789076231815e-06, + "loss": 0.387, + "step": 10166 + }, + { + "epoch": 0.58, + "grad_norm": 0.5940613791092809, + "learning_rate": 7.779974562253568e-06, + "loss": 0.3781, + "step": 10167 + }, + { + "epoch": 0.58, + "grad_norm": 0.24669557310007448, + "learning_rate": 7.778160125154702e-06, + "loss": 0.2217, + "step": 10168 + }, + { + "epoch": 0.58, + "grad_norm": 1.3234491850879115, + "learning_rate": 7.776345764998059e-06, + "loss": 0.8241, + "step": 10169 + }, + { + "epoch": 0.58, + "grad_norm": 0.2135022809142091, + "learning_rate": 7.774531481846464e-06, + "loss": 0.1409, + "step": 10170 + }, + { + "epoch": 0.58, + "grad_norm": 0.33929896012752037, + "learning_rate": 7.77271727576275e-06, + "loss": 0.2906, + "step": 10171 + }, + { + "epoch": 0.58, + "grad_norm": 0.423664471917811, + "learning_rate": 7.770903146809738e-06, + "loss": 0.2725, + "step": 10172 + }, + { + "epoch": 0.58, + "grad_norm": 0.6369728062982355, + "learning_rate": 7.769089095050258e-06, + "loss": 0.3813, + "step": 10173 + }, + { + "epoch": 0.58, + "grad_norm": 0.3777810720627077, + "learning_rate": 7.767275120547123e-06, + "loss": 0.2917, + "step": 10174 + }, + { + "epoch": 0.58, + "grad_norm": 0.24151161950766775, + "learning_rate": 7.765461223363158e-06, + "loss": 0.1624, + "step": 10175 + }, + { + "epoch": 0.58, + "grad_norm": 0.26233751290851975, + "learning_rate": 7.76364740356117e-06, + "loss": 0.2287, + "step": 10176 + }, + { + "epoch": 0.58, + "grad_norm": 0.4027707555176561, + "learning_rate": 7.76183366120398e-06, + "loss": 0.333, + "step": 10177 + }, + { + "epoch": 0.58, + "grad_norm": 0.9316383400032427, + "learning_rate": 7.760019996354396e-06, + "loss": 0.3163, + "step": 10178 + }, + { + "epoch": 0.58, + "grad_norm": 0.900838231219768, + "learning_rate": 7.75820640907522e-06, + "loss": 0.6042, + "step": 10179 + }, + { + "epoch": 0.58, + "grad_norm": 0.26204651119783984, + "learning_rate": 7.75639289942926e-06, + "loss": 0.2604, + "step": 10180 + }, + { + "epoch": 0.58, + "grad_norm": 0.4479463747107403, + "learning_rate": 7.754579467479318e-06, + "loss": 0.2993, + "step": 10181 + }, + { + "epoch": 0.59, + "grad_norm": 0.33894552794765487, + "learning_rate": 7.752766113288192e-06, + "loss": 0.1901, + "step": 10182 + }, + { + "epoch": 0.59, + "grad_norm": 0.3762907960074976, + "learning_rate": 7.750952836918679e-06, + "loss": 0.2689, + "step": 10183 + }, + { + "epoch": 0.59, + "grad_norm": 0.414454215601207, + "learning_rate": 7.749139638433573e-06, + "loss": 0.3139, + "step": 10184 + }, + { + "epoch": 0.59, + "grad_norm": 0.586844077895301, + "learning_rate": 7.747326517895662e-06, + "loss": 0.135, + "step": 10185 + }, + { + "epoch": 0.59, + "grad_norm": 0.32530862088865625, + "learning_rate": 7.74551347536774e-06, + "loss": 0.2699, + "step": 10186 + }, + { + "epoch": 0.59, + "grad_norm": 1.1672584420814525, + "learning_rate": 7.743700510912588e-06, + "loss": 0.6975, + "step": 10187 + }, + { + "epoch": 0.59, + "grad_norm": 0.24495780308060752, + "learning_rate": 7.741887624592992e-06, + "loss": 0.2082, + "step": 10188 + }, + { + "epoch": 0.59, + "grad_norm": 0.36084956108999233, + "learning_rate": 7.740074816471727e-06, + "loss": 0.277, + "step": 10189 + }, + { + "epoch": 0.59, + "grad_norm": 0.40954337704045063, + "learning_rate": 7.738262086611578e-06, + "loss": 0.2866, + "step": 10190 + }, + { + "epoch": 0.59, + "grad_norm": 0.36418034114752734, + "learning_rate": 7.736449435075314e-06, + "loss": 0.2365, + "step": 10191 + }, + { + "epoch": 0.59, + "grad_norm": 0.35156018638606407, + "learning_rate": 7.734636861925706e-06, + "loss": 0.2822, + "step": 10192 + }, + { + "epoch": 0.59, + "grad_norm": 0.4751770010443096, + "learning_rate": 7.732824367225531e-06, + "loss": 0.3951, + "step": 10193 + }, + { + "epoch": 0.59, + "grad_norm": 0.6841245197795643, + "learning_rate": 7.731011951037547e-06, + "loss": 0.2865, + "step": 10194 + }, + { + "epoch": 0.59, + "grad_norm": 0.3483252023779255, + "learning_rate": 7.729199613424523e-06, + "loss": 0.2767, + "step": 10195 + }, + { + "epoch": 0.59, + "grad_norm": 0.24313842780450307, + "learning_rate": 7.727387354449217e-06, + "loss": 0.2301, + "step": 10196 + }, + { + "epoch": 0.59, + "grad_norm": 1.0930305495887416, + "learning_rate": 7.725575174174395e-06, + "loss": 0.6972, + "step": 10197 + }, + { + "epoch": 0.59, + "grad_norm": 0.28099806807697314, + "learning_rate": 7.723763072662804e-06, + "loss": 0.2012, + "step": 10198 + }, + { + "epoch": 0.59, + "grad_norm": 0.8655932878922294, + "learning_rate": 7.721951049977196e-06, + "loss": 0.4398, + "step": 10199 + }, + { + "epoch": 0.59, + "grad_norm": 0.36319675934158235, + "learning_rate": 7.72013910618033e-06, + "loss": 0.3273, + "step": 10200 + }, + { + "epoch": 0.59, + "grad_norm": 0.31370298470256497, + "learning_rate": 7.718327241334944e-06, + "loss": 0.2333, + "step": 10201 + }, + { + "epoch": 0.59, + "grad_norm": 0.2954558356084548, + "learning_rate": 7.716515455503791e-06, + "loss": 0.201, + "step": 10202 + }, + { + "epoch": 0.59, + "grad_norm": 0.4505771810527777, + "learning_rate": 7.71470374874961e-06, + "loss": 0.322, + "step": 10203 + }, + { + "epoch": 0.59, + "grad_norm": 0.27661425856954885, + "learning_rate": 7.712892121135136e-06, + "loss": 0.2182, + "step": 10204 + }, + { + "epoch": 0.59, + "grad_norm": 1.2487777153308748, + "learning_rate": 7.711080572723113e-06, + "loss": 0.7325, + "step": 10205 + }, + { + "epoch": 0.59, + "grad_norm": 0.7749108515324832, + "learning_rate": 7.709269103576269e-06, + "loss": 0.3924, + "step": 10206 + }, + { + "epoch": 0.59, + "grad_norm": 0.31900576203015873, + "learning_rate": 7.70745771375734e-06, + "loss": 0.1904, + "step": 10207 + }, + { + "epoch": 0.59, + "grad_norm": 0.3050746601489578, + "learning_rate": 7.70564640332905e-06, + "loss": 0.2411, + "step": 10208 + }, + { + "epoch": 0.59, + "grad_norm": 0.4378828524958338, + "learning_rate": 7.703835172354127e-06, + "loss": 0.251, + "step": 10209 + }, + { + "epoch": 0.59, + "grad_norm": 0.32933250069623154, + "learning_rate": 7.702024020895292e-06, + "loss": 0.241, + "step": 10210 + }, + { + "epoch": 0.59, + "grad_norm": 0.8241027953814687, + "learning_rate": 7.70021294901527e-06, + "loss": 0.3369, + "step": 10211 + }, + { + "epoch": 0.59, + "grad_norm": 0.3543263899051379, + "learning_rate": 7.69840195677677e-06, + "loss": 0.3201, + "step": 10212 + }, + { + "epoch": 0.59, + "grad_norm": 0.4099252481168575, + "learning_rate": 7.696591044242513e-06, + "loss": 0.3004, + "step": 10213 + }, + { + "epoch": 0.59, + "grad_norm": 0.2820915527848439, + "learning_rate": 7.694780211475209e-06, + "loss": 0.166, + "step": 10214 + }, + { + "epoch": 0.59, + "grad_norm": 0.3524674573849591, + "learning_rate": 7.692969458537568e-06, + "loss": 0.3248, + "step": 10215 + }, + { + "epoch": 0.59, + "grad_norm": 0.43080167749690207, + "learning_rate": 7.691158785492294e-06, + "loss": 0.3089, + "step": 10216 + }, + { + "epoch": 0.59, + "grad_norm": 0.6922394291237297, + "learning_rate": 7.689348192402095e-06, + "loss": 0.2959, + "step": 10217 + }, + { + "epoch": 0.59, + "grad_norm": 0.9169189348266817, + "learning_rate": 7.687537679329668e-06, + "loss": 0.507, + "step": 10218 + }, + { + "epoch": 0.59, + "grad_norm": 0.3090444070036527, + "learning_rate": 7.685727246337709e-06, + "loss": 0.2482, + "step": 10219 + }, + { + "epoch": 0.59, + "grad_norm": 0.26794700844517866, + "learning_rate": 7.683916893488918e-06, + "loss": 0.2302, + "step": 10220 + }, + { + "epoch": 0.59, + "grad_norm": 0.29467839115121064, + "learning_rate": 7.682106620845984e-06, + "loss": 0.1529, + "step": 10221 + }, + { + "epoch": 0.59, + "grad_norm": 0.35073917502149066, + "learning_rate": 7.6802964284716e-06, + "loss": 0.2598, + "step": 10222 + }, + { + "epoch": 0.59, + "grad_norm": 1.202371112662339, + "learning_rate": 7.678486316428449e-06, + "loss": 0.4024, + "step": 10223 + }, + { + "epoch": 0.59, + "grad_norm": 0.3608808093489747, + "learning_rate": 7.676676284779217e-06, + "loss": 0.276, + "step": 10224 + }, + { + "epoch": 0.59, + "grad_norm": 0.32842709343076865, + "learning_rate": 7.674866333586586e-06, + "loss": 0.2837, + "step": 10225 + }, + { + "epoch": 0.59, + "grad_norm": 0.9377059450455212, + "learning_rate": 7.673056462913235e-06, + "loss": 0.5578, + "step": 10226 + }, + { + "epoch": 0.59, + "grad_norm": 0.22635475620796228, + "learning_rate": 7.671246672821837e-06, + "loss": 0.1744, + "step": 10227 + }, + { + "epoch": 0.59, + "grad_norm": 0.34285947837407843, + "learning_rate": 7.669436963375067e-06, + "loss": 0.2717, + "step": 10228 + }, + { + "epoch": 0.59, + "grad_norm": 1.1313131442190563, + "learning_rate": 7.667627334635595e-06, + "loss": 0.46, + "step": 10229 + }, + { + "epoch": 0.59, + "grad_norm": 0.8438766470568757, + "learning_rate": 7.665817786666088e-06, + "loss": 0.3498, + "step": 10230 + }, + { + "epoch": 0.59, + "grad_norm": 0.4222669207775579, + "learning_rate": 7.664008319529215e-06, + "loss": 0.3328, + "step": 10231 + }, + { + "epoch": 0.59, + "grad_norm": 0.3097461985519247, + "learning_rate": 7.66219893328763e-06, + "loss": 0.2693, + "step": 10232 + }, + { + "epoch": 0.59, + "grad_norm": 0.4686433548216791, + "learning_rate": 7.660389628003993e-06, + "loss": 0.3129, + "step": 10233 + }, + { + "epoch": 0.59, + "grad_norm": 0.42214270318871294, + "learning_rate": 7.658580403740965e-06, + "loss": 0.2034, + "step": 10234 + }, + { + "epoch": 0.59, + "grad_norm": 0.5099855157917578, + "learning_rate": 7.656771260561195e-06, + "loss": 0.3126, + "step": 10235 + }, + { + "epoch": 0.59, + "grad_norm": 0.2866231632582778, + "learning_rate": 7.654962198527338e-06, + "loss": 0.2458, + "step": 10236 + }, + { + "epoch": 0.59, + "grad_norm": 0.2955888837904505, + "learning_rate": 7.653153217702036e-06, + "loss": 0.1982, + "step": 10237 + }, + { + "epoch": 0.59, + "grad_norm": 0.38594128734646455, + "learning_rate": 7.651344318147941e-06, + "loss": 0.24, + "step": 10238 + }, + { + "epoch": 0.59, + "grad_norm": 0.493756437636117, + "learning_rate": 7.649535499927688e-06, + "loss": 0.3752, + "step": 10239 + }, + { + "epoch": 0.59, + "grad_norm": 0.25207716691183124, + "learning_rate": 7.647726763103923e-06, + "loss": 0.2299, + "step": 10240 + }, + { + "epoch": 0.59, + "grad_norm": 0.7780560932689861, + "learning_rate": 7.645918107739274e-06, + "loss": 0.4704, + "step": 10241 + }, + { + "epoch": 0.59, + "grad_norm": 0.3710784879455275, + "learning_rate": 7.644109533896384e-06, + "loss": 0.2544, + "step": 10242 + }, + { + "epoch": 0.59, + "grad_norm": 0.28552083656844823, + "learning_rate": 7.642301041637879e-06, + "loss": 0.2177, + "step": 10243 + }, + { + "epoch": 0.59, + "grad_norm": 0.47267057079111247, + "learning_rate": 7.640492631026387e-06, + "loss": 0.3323, + "step": 10244 + }, + { + "epoch": 0.59, + "grad_norm": 0.7320353696301287, + "learning_rate": 7.638684302124533e-06, + "loss": 0.4111, + "step": 10245 + }, + { + "epoch": 0.59, + "grad_norm": 0.4219758171502595, + "learning_rate": 7.63687605499494e-06, + "loss": 0.2955, + "step": 10246 + }, + { + "epoch": 0.59, + "grad_norm": 0.36371007108240666, + "learning_rate": 7.635067889700228e-06, + "loss": 0.2585, + "step": 10247 + }, + { + "epoch": 0.59, + "grad_norm": 0.24237484085577543, + "learning_rate": 7.633259806303012e-06, + "loss": 0.2022, + "step": 10248 + }, + { + "epoch": 0.59, + "grad_norm": 0.5130982347891825, + "learning_rate": 7.63145180486591e-06, + "loss": 0.2591, + "step": 10249 + }, + { + "epoch": 0.59, + "grad_norm": 0.3943639466503955, + "learning_rate": 7.629643885451527e-06, + "loss": 0.212, + "step": 10250 + }, + { + "epoch": 0.59, + "grad_norm": 0.42780601563784476, + "learning_rate": 7.627836048122477e-06, + "loss": 0.3231, + "step": 10251 + }, + { + "epoch": 0.59, + "grad_norm": 0.5485089590488623, + "learning_rate": 7.626028292941361e-06, + "loss": 0.3429, + "step": 10252 + }, + { + "epoch": 0.59, + "grad_norm": 0.38528395368808604, + "learning_rate": 7.624220619970784e-06, + "loss": 0.2701, + "step": 10253 + }, + { + "epoch": 0.59, + "grad_norm": 0.27098911193826375, + "learning_rate": 7.622413029273343e-06, + "loss": 0.1638, + "step": 10254 + }, + { + "epoch": 0.59, + "grad_norm": 0.301421858486742, + "learning_rate": 7.62060552091164e-06, + "loss": 0.2811, + "step": 10255 + }, + { + "epoch": 0.59, + "grad_norm": 0.3772924937493043, + "learning_rate": 7.618798094948262e-06, + "loss": 0.2382, + "step": 10256 + }, + { + "epoch": 0.59, + "grad_norm": 0.6013924440847807, + "learning_rate": 7.616990751445806e-06, + "loss": 0.4171, + "step": 10257 + }, + { + "epoch": 0.59, + "grad_norm": 0.39066029970772426, + "learning_rate": 7.615183490466858e-06, + "loss": 0.2633, + "step": 10258 + }, + { + "epoch": 0.59, + "grad_norm": 0.36730340180357374, + "learning_rate": 7.613376312074001e-06, + "loss": 0.3297, + "step": 10259 + }, + { + "epoch": 0.59, + "grad_norm": 0.2195929862317336, + "learning_rate": 7.611569216329821e-06, + "loss": 0.1535, + "step": 10260 + }, + { + "epoch": 0.59, + "grad_norm": 0.40569141532500297, + "learning_rate": 7.609762203296896e-06, + "loss": 0.2855, + "step": 10261 + }, + { + "epoch": 0.59, + "grad_norm": 0.566404154555934, + "learning_rate": 7.607955273037804e-06, + "loss": 0.4164, + "step": 10262 + }, + { + "epoch": 0.59, + "grad_norm": 0.3857204802136643, + "learning_rate": 7.606148425615117e-06, + "loss": 0.2772, + "step": 10263 + }, + { + "epoch": 0.59, + "grad_norm": 0.608997704118891, + "learning_rate": 7.604341661091409e-06, + "loss": 0.3627, + "step": 10264 + }, + { + "epoch": 0.59, + "grad_norm": 0.4045959883896538, + "learning_rate": 7.602534979529246e-06, + "loss": 0.3597, + "step": 10265 + }, + { + "epoch": 0.59, + "grad_norm": 0.20847234205880777, + "learning_rate": 7.600728380991191e-06, + "loss": 0.1583, + "step": 10266 + }, + { + "epoch": 0.59, + "grad_norm": 0.39509917853189525, + "learning_rate": 7.598921865539811e-06, + "loss": 0.2954, + "step": 10267 + }, + { + "epoch": 0.59, + "grad_norm": 0.33854641736778623, + "learning_rate": 7.597115433237664e-06, + "loss": 0.2913, + "step": 10268 + }, + { + "epoch": 0.59, + "grad_norm": 0.5218405160890431, + "learning_rate": 7.5953090841473035e-06, + "loss": 0.3258, + "step": 10269 + }, + { + "epoch": 0.59, + "grad_norm": 0.5757741501759367, + "learning_rate": 7.593502818331289e-06, + "loss": 0.3423, + "step": 10270 + }, + { + "epoch": 0.59, + "grad_norm": 0.2601837400754058, + "learning_rate": 7.5916966358521645e-06, + "loss": 0.2893, + "step": 10271 + }, + { + "epoch": 0.59, + "grad_norm": 0.43141156338626285, + "learning_rate": 7.589890536772486e-06, + "loss": 0.2517, + "step": 10272 + }, + { + "epoch": 0.59, + "grad_norm": 0.2912064861117238, + "learning_rate": 7.588084521154791e-06, + "loss": 0.1262, + "step": 10273 + }, + { + "epoch": 0.59, + "grad_norm": 0.39090096704594307, + "learning_rate": 7.586278589061628e-06, + "loss": 0.3112, + "step": 10274 + }, + { + "epoch": 0.59, + "grad_norm": 0.50511253032244, + "learning_rate": 7.584472740555533e-06, + "loss": 0.3057, + "step": 10275 + }, + { + "epoch": 0.59, + "grad_norm": 0.3452619407210834, + "learning_rate": 7.582666975699043e-06, + "loss": 0.1574, + "step": 10276 + }, + { + "epoch": 0.59, + "grad_norm": 0.37024431661053503, + "learning_rate": 7.5808612945546915e-06, + "loss": 0.3077, + "step": 10277 + }, + { + "epoch": 0.59, + "grad_norm": 0.44955413310834985, + "learning_rate": 7.5790556971850095e-06, + "loss": 0.2907, + "step": 10278 + }, + { + "epoch": 0.59, + "grad_norm": 0.22616688185308217, + "learning_rate": 7.577250183652523e-06, + "loss": 0.1954, + "step": 10279 + }, + { + "epoch": 0.59, + "grad_norm": 0.4488134009283226, + "learning_rate": 7.575444754019762e-06, + "loss": 0.3517, + "step": 10280 + }, + { + "epoch": 0.59, + "grad_norm": 0.6806739153782752, + "learning_rate": 7.5736394083492414e-06, + "loss": 0.4589, + "step": 10281 + }, + { + "epoch": 0.59, + "grad_norm": 0.38788290607606046, + "learning_rate": 7.571834146703486e-06, + "loss": 0.1904, + "step": 10282 + }, + { + "epoch": 0.59, + "grad_norm": 0.3014316610003963, + "learning_rate": 7.57002896914501e-06, + "loss": 0.2887, + "step": 10283 + }, + { + "epoch": 0.59, + "grad_norm": 0.4034575913681455, + "learning_rate": 7.568223875736325e-06, + "loss": 0.2604, + "step": 10284 + }, + { + "epoch": 0.59, + "grad_norm": 0.6416524913164146, + "learning_rate": 7.566418866539944e-06, + "loss": 0.3879, + "step": 10285 + }, + { + "epoch": 0.59, + "grad_norm": 0.23814333645697722, + "learning_rate": 7.5646139416183705e-06, + "loss": 0.1818, + "step": 10286 + }, + { + "epoch": 0.59, + "grad_norm": 0.35517257625022275, + "learning_rate": 7.562809101034114e-06, + "loss": 0.3117, + "step": 10287 + }, + { + "epoch": 0.59, + "grad_norm": 1.2153446890355475, + "learning_rate": 7.56100434484967e-06, + "loss": 0.7712, + "step": 10288 + }, + { + "epoch": 0.59, + "grad_norm": 0.30753262429141587, + "learning_rate": 7.559199673127545e-06, + "loss": 0.1986, + "step": 10289 + }, + { + "epoch": 0.59, + "grad_norm": 0.7300295591145592, + "learning_rate": 7.557395085930227e-06, + "loss": 0.4217, + "step": 10290 + }, + { + "epoch": 0.59, + "grad_norm": 0.32301624520605615, + "learning_rate": 7.555590583320214e-06, + "loss": 0.2986, + "step": 10291 + }, + { + "epoch": 0.59, + "grad_norm": 0.255707287279116, + "learning_rate": 7.553786165359993e-06, + "loss": 0.1879, + "step": 10292 + }, + { + "epoch": 0.59, + "grad_norm": 0.41385565707671423, + "learning_rate": 7.551981832112054e-06, + "loss": 0.2826, + "step": 10293 + }, + { + "epoch": 0.59, + "grad_norm": 0.38640746910827395, + "learning_rate": 7.550177583638876e-06, + "loss": 0.3198, + "step": 10294 + }, + { + "epoch": 0.59, + "grad_norm": 0.2565067070850015, + "learning_rate": 7.548373420002945e-06, + "loss": 0.2217, + "step": 10295 + }, + { + "epoch": 0.59, + "grad_norm": 0.7864258593626743, + "learning_rate": 7.546569341266737e-06, + "loss": 0.4573, + "step": 10296 + }, + { + "epoch": 0.59, + "grad_norm": 0.5094781014286576, + "learning_rate": 7.544765347492727e-06, + "loss": 0.3626, + "step": 10297 + }, + { + "epoch": 0.59, + "grad_norm": 0.3531764204079477, + "learning_rate": 7.542961438743389e-06, + "loss": 0.2905, + "step": 10298 + }, + { + "epoch": 0.59, + "grad_norm": 0.22608108952594327, + "learning_rate": 7.54115761508119e-06, + "loss": 0.1933, + "step": 10299 + }, + { + "epoch": 0.59, + "grad_norm": 0.5899693029472266, + "learning_rate": 7.539353876568594e-06, + "loss": 0.4156, + "step": 10300 + }, + { + "epoch": 0.59, + "grad_norm": 0.3815342442871858, + "learning_rate": 7.537550223268071e-06, + "loss": 0.3133, + "step": 10301 + }, + { + "epoch": 0.59, + "grad_norm": 0.7859397443567046, + "learning_rate": 7.5357466552420745e-06, + "loss": 0.2873, + "step": 10302 + }, + { + "epoch": 0.59, + "grad_norm": 0.46196064535471065, + "learning_rate": 7.533943172553068e-06, + "loss": 0.336, + "step": 10303 + }, + { + "epoch": 0.59, + "grad_norm": 0.3143589100536943, + "learning_rate": 7.5321397752635e-06, + "loss": 0.2493, + "step": 10304 + }, + { + "epoch": 0.59, + "grad_norm": 0.1956301211002063, + "learning_rate": 7.53033646343583e-06, + "loss": 0.116, + "step": 10305 + }, + { + "epoch": 0.59, + "grad_norm": 0.33209653400560535, + "learning_rate": 7.528533237132498e-06, + "loss": 0.3106, + "step": 10306 + }, + { + "epoch": 0.59, + "grad_norm": 0.324432066107435, + "learning_rate": 7.526730096415957e-06, + "loss": 0.2727, + "step": 10307 + }, + { + "epoch": 0.59, + "grad_norm": 0.6904170847854164, + "learning_rate": 7.524927041348646e-06, + "loss": 0.4153, + "step": 10308 + }, + { + "epoch": 0.59, + "grad_norm": 0.5469584121601104, + "learning_rate": 7.523124071993004e-06, + "loss": 0.2054, + "step": 10309 + }, + { + "epoch": 0.59, + "grad_norm": 0.34453117937491873, + "learning_rate": 7.521321188411469e-06, + "loss": 0.2789, + "step": 10310 + }, + { + "epoch": 0.59, + "grad_norm": 0.2578278431274199, + "learning_rate": 7.519518390666474e-06, + "loss": 0.237, + "step": 10311 + }, + { + "epoch": 0.59, + "grad_norm": 0.22170453038599644, + "learning_rate": 7.517715678820452e-06, + "loss": 0.1511, + "step": 10312 + }, + { + "epoch": 0.59, + "grad_norm": 0.3583110798163475, + "learning_rate": 7.515913052935827e-06, + "loss": 0.2837, + "step": 10313 + }, + { + "epoch": 0.59, + "grad_norm": 0.7549106305530019, + "learning_rate": 7.514110513075028e-06, + "loss": 0.4292, + "step": 10314 + }, + { + "epoch": 0.59, + "grad_norm": 0.291936518934735, + "learning_rate": 7.512308059300474e-06, + "loss": 0.2353, + "step": 10315 + }, + { + "epoch": 0.59, + "grad_norm": 0.3369054517703914, + "learning_rate": 7.510505691674586e-06, + "loss": 0.2834, + "step": 10316 + }, + { + "epoch": 0.59, + "grad_norm": 0.2868584847338976, + "learning_rate": 7.5087034102597775e-06, + "loss": 0.1644, + "step": 10317 + }, + { + "epoch": 0.59, + "grad_norm": 0.2998619310436137, + "learning_rate": 7.506901215118465e-06, + "loss": 0.2527, + "step": 10318 + }, + { + "epoch": 0.59, + "grad_norm": 0.36454420205043037, + "learning_rate": 7.505099106313053e-06, + "loss": 0.2659, + "step": 10319 + }, + { + "epoch": 0.59, + "grad_norm": 1.0795147065212092, + "learning_rate": 7.503297083905955e-06, + "loss": 0.3916, + "step": 10320 + }, + { + "epoch": 0.59, + "grad_norm": 1.1310884841179982, + "learning_rate": 7.5014951479595684e-06, + "loss": 0.5808, + "step": 10321 + }, + { + "epoch": 0.59, + "grad_norm": 0.2958718654952075, + "learning_rate": 7.499693298536301e-06, + "loss": 0.1898, + "step": 10322 + }, + { + "epoch": 0.59, + "grad_norm": 0.28992196328142583, + "learning_rate": 7.497891535698546e-06, + "loss": 0.2596, + "step": 10323 + }, + { + "epoch": 0.59, + "grad_norm": 1.1602792649842537, + "learning_rate": 7.496089859508697e-06, + "loss": 0.5329, + "step": 10324 + }, + { + "epoch": 0.59, + "grad_norm": 0.3745377074556588, + "learning_rate": 7.494288270029152e-06, + "loss": 0.2368, + "step": 10325 + }, + { + "epoch": 0.59, + "grad_norm": 0.31340930333139033, + "learning_rate": 7.492486767322293e-06, + "loss": 0.2712, + "step": 10326 + }, + { + "epoch": 0.59, + "grad_norm": 0.46084638494659036, + "learning_rate": 7.490685351450513e-06, + "loss": 0.3427, + "step": 10327 + }, + { + "epoch": 0.59, + "grad_norm": 0.3007749876753642, + "learning_rate": 7.488884022476189e-06, + "loss": 0.1969, + "step": 10328 + }, + { + "epoch": 0.59, + "grad_norm": 1.0453565752421365, + "learning_rate": 7.487082780461704e-06, + "loss": 0.5567, + "step": 10329 + }, + { + "epoch": 0.59, + "grad_norm": 0.40266828324910653, + "learning_rate": 7.485281625469432e-06, + "loss": 0.3253, + "step": 10330 + }, + { + "epoch": 0.59, + "grad_norm": 0.33743676652775795, + "learning_rate": 7.483480557561753e-06, + "loss": 0.2332, + "step": 10331 + }, + { + "epoch": 0.59, + "grad_norm": 0.4111867303618263, + "learning_rate": 7.481679576801035e-06, + "loss": 0.2775, + "step": 10332 + }, + { + "epoch": 0.59, + "grad_norm": 0.4693763211900533, + "learning_rate": 7.479878683249642e-06, + "loss": 0.2668, + "step": 10333 + }, + { + "epoch": 0.59, + "grad_norm": 0.503106972794314, + "learning_rate": 7.478077876969943e-06, + "loss": 0.2492, + "step": 10334 + }, + { + "epoch": 0.59, + "grad_norm": 0.32620964008429154, + "learning_rate": 7.476277158024299e-06, + "loss": 0.2576, + "step": 10335 + }, + { + "epoch": 0.59, + "grad_norm": 0.6618248472899589, + "learning_rate": 7.474476526475066e-06, + "loss": 0.4087, + "step": 10336 + }, + { + "epoch": 0.59, + "grad_norm": 0.42152985701301704, + "learning_rate": 7.4726759823846054e-06, + "loss": 0.2839, + "step": 10337 + }, + { + "epoch": 0.59, + "grad_norm": 0.2510070165287194, + "learning_rate": 7.470875525815263e-06, + "loss": 0.2232, + "step": 10338 + }, + { + "epoch": 0.59, + "grad_norm": 0.42724280486977506, + "learning_rate": 7.4690751568293955e-06, + "loss": 0.2797, + "step": 10339 + }, + { + "epoch": 0.59, + "grad_norm": 0.37480903963553386, + "learning_rate": 7.467274875489345e-06, + "loss": 0.2427, + "step": 10340 + }, + { + "epoch": 0.59, + "grad_norm": 0.41897775646501684, + "learning_rate": 7.465474681857459e-06, + "loss": 0.2865, + "step": 10341 + }, + { + "epoch": 0.59, + "grad_norm": 0.3516394732341963, + "learning_rate": 7.463674575996072e-06, + "loss": 0.3245, + "step": 10342 + }, + { + "epoch": 0.59, + "grad_norm": 0.3376477205658888, + "learning_rate": 7.461874557967528e-06, + "loss": 0.2722, + "step": 10343 + }, + { + "epoch": 0.59, + "grad_norm": 0.2938787637187035, + "learning_rate": 7.4600746278341575e-06, + "loss": 0.1601, + "step": 10344 + }, + { + "epoch": 0.59, + "grad_norm": 0.4272505094482967, + "learning_rate": 7.458274785658295e-06, + "loss": 0.2518, + "step": 10345 + }, + { + "epoch": 0.59, + "grad_norm": 0.32516909890328166, + "learning_rate": 7.4564750315022645e-06, + "loss": 0.2598, + "step": 10346 + }, + { + "epoch": 0.59, + "grad_norm": 0.36114353203460814, + "learning_rate": 7.454675365428397e-06, + "loss": 0.305, + "step": 10347 + }, + { + "epoch": 0.59, + "grad_norm": 0.5091804731487372, + "learning_rate": 7.452875787499012e-06, + "loss": 0.3558, + "step": 10348 + }, + { + "epoch": 0.59, + "grad_norm": 0.3474769886170156, + "learning_rate": 7.451076297776427e-06, + "loss": 0.2804, + "step": 10349 + }, + { + "epoch": 0.59, + "grad_norm": 0.45404733187295016, + "learning_rate": 7.4492768963229635e-06, + "loss": 0.3461, + "step": 10350 + }, + { + "epoch": 0.59, + "grad_norm": 0.22564809690050838, + "learning_rate": 7.447477583200928e-06, + "loss": 0.1469, + "step": 10351 + }, + { + "epoch": 0.59, + "grad_norm": 0.41250335520063286, + "learning_rate": 7.445678358472637e-06, + "loss": 0.2752, + "step": 10352 + }, + { + "epoch": 0.59, + "grad_norm": 0.43262651395981655, + "learning_rate": 7.443879222200392e-06, + "loss": 0.3522, + "step": 10353 + }, + { + "epoch": 0.59, + "grad_norm": 0.3365479924421112, + "learning_rate": 7.442080174446502e-06, + "loss": 0.2761, + "step": 10354 + }, + { + "epoch": 0.59, + "grad_norm": 0.39423987473222083, + "learning_rate": 7.440281215273262e-06, + "loss": 0.2763, + "step": 10355 + }, + { + "epoch": 0.6, + "grad_norm": 0.4763702671153097, + "learning_rate": 7.438482344742977e-06, + "loss": 0.3736, + "step": 10356 + }, + { + "epoch": 0.6, + "grad_norm": 0.24311212151397302, + "learning_rate": 7.436683562917937e-06, + "loss": 0.1081, + "step": 10357 + }, + { + "epoch": 0.6, + "grad_norm": 0.31815626651918666, + "learning_rate": 7.4348848698604345e-06, + "loss": 0.2684, + "step": 10358 + }, + { + "epoch": 0.6, + "grad_norm": 0.3554260506520077, + "learning_rate": 7.433086265632759e-06, + "loss": 0.3245, + "step": 10359 + }, + { + "epoch": 0.6, + "grad_norm": 0.7899975431739951, + "learning_rate": 7.431287750297196e-06, + "loss": 0.5236, + "step": 10360 + }, + { + "epoch": 0.6, + "grad_norm": 0.324058350565275, + "learning_rate": 7.429489323916028e-06, + "loss": 0.2124, + "step": 10361 + }, + { + "epoch": 0.6, + "grad_norm": 0.30049014342614966, + "learning_rate": 7.427690986551534e-06, + "loss": 0.2998, + "step": 10362 + }, + { + "epoch": 0.6, + "grad_norm": 0.2735467911648441, + "learning_rate": 7.42589273826599e-06, + "loss": 0.1753, + "step": 10363 + }, + { + "epoch": 0.6, + "grad_norm": 0.3070856253154252, + "learning_rate": 7.42409457912167e-06, + "loss": 0.214, + "step": 10364 + }, + { + "epoch": 0.6, + "grad_norm": 0.7289098597123872, + "learning_rate": 7.422296509180844e-06, + "loss": 0.3994, + "step": 10365 + }, + { + "epoch": 0.6, + "grad_norm": 0.3466921745387132, + "learning_rate": 7.420498528505783e-06, + "loss": 0.3249, + "step": 10366 + }, + { + "epoch": 0.6, + "grad_norm": 0.30005732279309993, + "learning_rate": 7.418700637158742e-06, + "loss": 0.2126, + "step": 10367 + }, + { + "epoch": 0.6, + "grad_norm": 1.4382862328192985, + "learning_rate": 7.416902835201989e-06, + "loss": 0.7743, + "step": 10368 + }, + { + "epoch": 0.6, + "grad_norm": 0.3143363107928643, + "learning_rate": 7.415105122697777e-06, + "loss": 0.2053, + "step": 10369 + }, + { + "epoch": 0.6, + "grad_norm": 0.27691108069319204, + "learning_rate": 7.413307499708367e-06, + "loss": 0.2016, + "step": 10370 + }, + { + "epoch": 0.6, + "grad_norm": 0.47985595670757103, + "learning_rate": 7.411509966296004e-06, + "loss": 0.297, + "step": 10371 + }, + { + "epoch": 0.6, + "grad_norm": 0.9858514760671856, + "learning_rate": 7.409712522522942e-06, + "loss": 0.7043, + "step": 10372 + }, + { + "epoch": 0.6, + "grad_norm": 0.6016526088054295, + "learning_rate": 7.407915168451423e-06, + "loss": 0.3614, + "step": 10373 + }, + { + "epoch": 0.6, + "grad_norm": 0.29457207851404604, + "learning_rate": 7.40611790414369e-06, + "loss": 0.24, + "step": 10374 + }, + { + "epoch": 0.6, + "grad_norm": 0.5271657134892388, + "learning_rate": 7.404320729661982e-06, + "loss": 0.2722, + "step": 10375 + }, + { + "epoch": 0.6, + "grad_norm": 0.4621286015570544, + "learning_rate": 7.402523645068536e-06, + "loss": 0.3243, + "step": 10376 + }, + { + "epoch": 0.6, + "grad_norm": 0.26337030099005204, + "learning_rate": 7.400726650425585e-06, + "loss": 0.2056, + "step": 10377 + }, + { + "epoch": 0.6, + "grad_norm": 0.337834575079801, + "learning_rate": 7.3989297457953565e-06, + "loss": 0.3024, + "step": 10378 + }, + { + "epoch": 0.6, + "grad_norm": 0.5600185258598154, + "learning_rate": 7.3971329312400805e-06, + "loss": 0.4083, + "step": 10379 + }, + { + "epoch": 0.6, + "grad_norm": 0.4112617760836703, + "learning_rate": 7.395336206821979e-06, + "loss": 0.2426, + "step": 10380 + }, + { + "epoch": 0.6, + "grad_norm": 0.615001228721209, + "learning_rate": 7.393539572603274e-06, + "loss": 0.3951, + "step": 10381 + }, + { + "epoch": 0.6, + "grad_norm": 0.2863126488000933, + "learning_rate": 7.391743028646179e-06, + "loss": 0.2512, + "step": 10382 + }, + { + "epoch": 0.6, + "grad_norm": 0.31806544952047155, + "learning_rate": 7.3899465750129116e-06, + "loss": 0.2348, + "step": 10383 + }, + { + "epoch": 0.6, + "grad_norm": 0.27767063661119495, + "learning_rate": 7.388150211765682e-06, + "loss": 0.1522, + "step": 10384 + }, + { + "epoch": 0.6, + "grad_norm": 0.40574913552979075, + "learning_rate": 7.3863539389667e-06, + "loss": 0.2952, + "step": 10385 + }, + { + "epoch": 0.6, + "grad_norm": 0.3449818810488741, + "learning_rate": 7.384557756678166e-06, + "loss": 0.2971, + "step": 10386 + }, + { + "epoch": 0.6, + "grad_norm": 0.4392748651396246, + "learning_rate": 7.382761664962287e-06, + "loss": 0.2963, + "step": 10387 + }, + { + "epoch": 0.6, + "grad_norm": 0.5556219375952675, + "learning_rate": 7.380965663881259e-06, + "loss": 0.3425, + "step": 10388 + }, + { + "epoch": 0.6, + "grad_norm": 0.2330582030181791, + "learning_rate": 7.379169753497275e-06, + "loss": 0.2051, + "step": 10389 + }, + { + "epoch": 0.6, + "grad_norm": 0.3359612146899458, + "learning_rate": 7.377373933872531e-06, + "loss": 0.2831, + "step": 10390 + }, + { + "epoch": 0.6, + "grad_norm": 0.7637438385972759, + "learning_rate": 7.375578205069213e-06, + "loss": 0.3904, + "step": 10391 + }, + { + "epoch": 0.6, + "grad_norm": 0.3545649951392472, + "learning_rate": 7.373782567149514e-06, + "loss": 0.2763, + "step": 10392 + }, + { + "epoch": 0.6, + "grad_norm": 0.6164382213937144, + "learning_rate": 7.371987020175606e-06, + "loss": 0.3046, + "step": 10393 + }, + { + "epoch": 0.6, + "grad_norm": 0.3547602337809164, + "learning_rate": 7.370191564209679e-06, + "loss": 0.2542, + "step": 10394 + }, + { + "epoch": 0.6, + "grad_norm": 0.2775262885159086, + "learning_rate": 7.368396199313901e-06, + "loss": 0.2352, + "step": 10395 + }, + { + "epoch": 0.6, + "grad_norm": 0.4603446729991688, + "learning_rate": 7.3666009255504534e-06, + "loss": 0.2823, + "step": 10396 + }, + { + "epoch": 0.6, + "grad_norm": 0.41311951116192697, + "learning_rate": 7.364805742981499e-06, + "loss": 0.2084, + "step": 10397 + }, + { + "epoch": 0.6, + "grad_norm": 0.26981663565088704, + "learning_rate": 7.363010651669211e-06, + "loss": 0.2726, + "step": 10398 + }, + { + "epoch": 0.6, + "grad_norm": 0.7117455054970516, + "learning_rate": 7.361215651675753e-06, + "loss": 0.398, + "step": 10399 + }, + { + "epoch": 0.6, + "grad_norm": 0.4345400774009875, + "learning_rate": 7.359420743063282e-06, + "loss": 0.1569, + "step": 10400 + }, + { + "epoch": 0.6, + "grad_norm": 0.3219544373698642, + "learning_rate": 7.357625925893954e-06, + "loss": 0.2259, + "step": 10401 + }, + { + "epoch": 0.6, + "grad_norm": 0.2688178658293408, + "learning_rate": 7.355831200229928e-06, + "loss": 0.2649, + "step": 10402 + }, + { + "epoch": 0.6, + "grad_norm": 0.3136728201989251, + "learning_rate": 7.354036566133354e-06, + "loss": 0.2091, + "step": 10403 + }, + { + "epoch": 0.6, + "grad_norm": 0.4715407097290975, + "learning_rate": 7.3522420236663805e-06, + "loss": 0.3276, + "step": 10404 + }, + { + "epoch": 0.6, + "grad_norm": 0.4956804964246323, + "learning_rate": 7.350447572891148e-06, + "loss": 0.3307, + "step": 10405 + }, + { + "epoch": 0.6, + "grad_norm": 0.32953209301390884, + "learning_rate": 7.348653213869807e-06, + "loss": 0.1928, + "step": 10406 + }, + { + "epoch": 0.6, + "grad_norm": 0.40875599533641027, + "learning_rate": 7.346858946664488e-06, + "loss": 0.3065, + "step": 10407 + }, + { + "epoch": 0.6, + "grad_norm": 0.2591224485745581, + "learning_rate": 7.345064771337332e-06, + "loss": 0.1801, + "step": 10408 + }, + { + "epoch": 0.6, + "grad_norm": 0.4714728778705214, + "learning_rate": 7.343270687950468e-06, + "loss": 0.36, + "step": 10409 + }, + { + "epoch": 0.6, + "grad_norm": 0.28898646326817645, + "learning_rate": 7.341476696566026e-06, + "loss": 0.2228, + "step": 10410 + }, + { + "epoch": 0.6, + "grad_norm": 0.7426477752866582, + "learning_rate": 7.33968279724613e-06, + "loss": 0.4396, + "step": 10411 + }, + { + "epoch": 0.6, + "grad_norm": 1.1024879194552417, + "learning_rate": 7.337888990052906e-06, + "loss": 0.5185, + "step": 10412 + }, + { + "epoch": 0.6, + "grad_norm": 0.23906818831035612, + "learning_rate": 7.336095275048474e-06, + "loss": 0.1846, + "step": 10413 + }, + { + "epoch": 0.6, + "grad_norm": 0.3227646272577517, + "learning_rate": 7.334301652294944e-06, + "loss": 0.2678, + "step": 10414 + }, + { + "epoch": 0.6, + "grad_norm": 0.5806326984574692, + "learning_rate": 7.332508121854435e-06, + "loss": 0.4073, + "step": 10415 + }, + { + "epoch": 0.6, + "grad_norm": 0.34300519661195633, + "learning_rate": 7.330714683789053e-06, + "loss": 0.2575, + "step": 10416 + }, + { + "epoch": 0.6, + "grad_norm": 0.5015264244296673, + "learning_rate": 7.32892133816091e-06, + "loss": 0.3357, + "step": 10417 + }, + { + "epoch": 0.6, + "grad_norm": 0.42152132629199246, + "learning_rate": 7.327128085032103e-06, + "loss": 0.3183, + "step": 10418 + }, + { + "epoch": 0.6, + "grad_norm": 0.3411341644741504, + "learning_rate": 7.325334924464737e-06, + "loss": 0.2233, + "step": 10419 + }, + { + "epoch": 0.6, + "grad_norm": 0.4057527596698589, + "learning_rate": 7.323541856520908e-06, + "loss": 0.3004, + "step": 10420 + }, + { + "epoch": 0.6, + "grad_norm": 0.3473167164819613, + "learning_rate": 7.32174888126271e-06, + "loss": 0.2923, + "step": 10421 + }, + { + "epoch": 0.6, + "grad_norm": 0.36276706144912185, + "learning_rate": 7.3199559987522305e-06, + "loss": 0.288, + "step": 10422 + }, + { + "epoch": 0.6, + "grad_norm": 0.41822234452984713, + "learning_rate": 7.3181632090515635e-06, + "loss": 0.2161, + "step": 10423 + }, + { + "epoch": 0.6, + "grad_norm": 0.5414425489075652, + "learning_rate": 7.316370512222785e-06, + "loss": 0.3239, + "step": 10424 + }, + { + "epoch": 0.6, + "grad_norm": 0.378916321206655, + "learning_rate": 7.314577908327982e-06, + "loss": 0.2869, + "step": 10425 + }, + { + "epoch": 0.6, + "grad_norm": 0.33481596176071216, + "learning_rate": 7.312785397429231e-06, + "loss": 0.2687, + "step": 10426 + }, + { + "epoch": 0.6, + "grad_norm": 0.5608026074266522, + "learning_rate": 7.310992979588607e-06, + "loss": 0.3707, + "step": 10427 + }, + { + "epoch": 0.6, + "grad_norm": 0.319216032928723, + "learning_rate": 7.30920065486818e-06, + "loss": 0.2487, + "step": 10428 + }, + { + "epoch": 0.6, + "grad_norm": 0.23822935077111004, + "learning_rate": 7.307408423330016e-06, + "loss": 0.1923, + "step": 10429 + }, + { + "epoch": 0.6, + "grad_norm": 1.1734266520628216, + "learning_rate": 7.305616285036186e-06, + "loss": 0.5906, + "step": 10430 + }, + { + "epoch": 0.6, + "grad_norm": 0.306891867740223, + "learning_rate": 7.303824240048744e-06, + "loss": 0.2423, + "step": 10431 + }, + { + "epoch": 0.6, + "grad_norm": 0.6941208094628137, + "learning_rate": 7.3020322884297565e-06, + "loss": 0.3179, + "step": 10432 + }, + { + "epoch": 0.6, + "grad_norm": 0.31808162658254446, + "learning_rate": 7.300240430241278e-06, + "loss": 0.2968, + "step": 10433 + }, + { + "epoch": 0.6, + "grad_norm": 0.4021974465762369, + "learning_rate": 7.298448665545352e-06, + "loss": 0.2697, + "step": 10434 + }, + { + "epoch": 0.6, + "grad_norm": 0.2515530206184095, + "learning_rate": 7.296656994404034e-06, + "loss": 0.1669, + "step": 10435 + }, + { + "epoch": 0.6, + "grad_norm": 0.5861578574807791, + "learning_rate": 7.294865416879366e-06, + "loss": 0.1531, + "step": 10436 + }, + { + "epoch": 0.6, + "grad_norm": 0.34651374064157503, + "learning_rate": 7.293073933033394e-06, + "loss": 0.2677, + "step": 10437 + }, + { + "epoch": 0.6, + "grad_norm": 0.37290784368695534, + "learning_rate": 7.291282542928158e-06, + "loss": 0.3219, + "step": 10438 + }, + { + "epoch": 0.6, + "grad_norm": 0.8520457116127256, + "learning_rate": 7.289491246625686e-06, + "loss": 0.3167, + "step": 10439 + }, + { + "epoch": 0.6, + "grad_norm": 0.3610971456454583, + "learning_rate": 7.287700044188019e-06, + "loss": 0.2736, + "step": 10440 + }, + { + "epoch": 0.6, + "grad_norm": 0.27075648165360194, + "learning_rate": 7.28590893567718e-06, + "loss": 0.2578, + "step": 10441 + }, + { + "epoch": 0.6, + "grad_norm": 0.467437287940649, + "learning_rate": 7.2841179211552005e-06, + "loss": 0.1746, + "step": 10442 + }, + { + "epoch": 0.6, + "grad_norm": 0.3203387589486802, + "learning_rate": 7.282327000684099e-06, + "loss": 0.2521, + "step": 10443 + }, + { + "epoch": 0.6, + "grad_norm": 1.0366281064937823, + "learning_rate": 7.280536174325897e-06, + "loss": 0.4184, + "step": 10444 + }, + { + "epoch": 0.6, + "grad_norm": 0.32022139792761206, + "learning_rate": 7.27874544214261e-06, + "loss": 0.2512, + "step": 10445 + }, + { + "epoch": 0.6, + "grad_norm": 0.31124170915094695, + "learning_rate": 7.276954804196252e-06, + "loss": 0.2607, + "step": 10446 + }, + { + "epoch": 0.6, + "grad_norm": 0.27313298584475565, + "learning_rate": 7.2751642605488305e-06, + "loss": 0.1748, + "step": 10447 + }, + { + "epoch": 0.6, + "grad_norm": 1.4671493965457703, + "learning_rate": 7.273373811262356e-06, + "loss": 0.7225, + "step": 10448 + }, + { + "epoch": 0.6, + "grad_norm": 0.30145172575689483, + "learning_rate": 7.271583456398827e-06, + "loss": 0.2257, + "step": 10449 + }, + { + "epoch": 0.6, + "grad_norm": 0.7093270985040872, + "learning_rate": 7.269793196020247e-06, + "loss": 0.3663, + "step": 10450 + }, + { + "epoch": 0.6, + "grad_norm": 1.0567717494520565, + "learning_rate": 7.26800303018861e-06, + "loss": 0.7185, + "step": 10451 + }, + { + "epoch": 0.6, + "grad_norm": 0.3041414305207806, + "learning_rate": 7.266212958965912e-06, + "loss": 0.2066, + "step": 10452 + }, + { + "epoch": 0.6, + "grad_norm": 0.37389481781956213, + "learning_rate": 7.264422982414143e-06, + "loss": 0.3267, + "step": 10453 + }, + { + "epoch": 0.6, + "grad_norm": 0.25075241124657155, + "learning_rate": 7.2626331005952845e-06, + "loss": 0.1912, + "step": 10454 + }, + { + "epoch": 0.6, + "grad_norm": 0.3700868988947285, + "learning_rate": 7.260843313571328e-06, + "loss": 0.2145, + "step": 10455 + }, + { + "epoch": 0.6, + "grad_norm": 0.9142862153982831, + "learning_rate": 7.259053621404246e-06, + "loss": 0.4353, + "step": 10456 + }, + { + "epoch": 0.6, + "grad_norm": 0.3873001010445203, + "learning_rate": 7.2572640241560225e-06, + "loss": 0.3062, + "step": 10457 + }, + { + "epoch": 0.6, + "grad_norm": 0.3067948858367847, + "learning_rate": 7.255474521888624e-06, + "loss": 0.1958, + "step": 10458 + }, + { + "epoch": 0.6, + "grad_norm": 0.3374418148179951, + "learning_rate": 7.253685114664029e-06, + "loss": 0.232, + "step": 10459 + }, + { + "epoch": 0.6, + "grad_norm": 0.45806778271637055, + "learning_rate": 7.251895802544197e-06, + "loss": 0.3377, + "step": 10460 + }, + { + "epoch": 0.6, + "grad_norm": 0.4224168796644813, + "learning_rate": 7.250106585591098e-06, + "loss": 0.2808, + "step": 10461 + }, + { + "epoch": 0.6, + "grad_norm": 0.4315638186581728, + "learning_rate": 7.2483174638666876e-06, + "loss": 0.2695, + "step": 10462 + }, + { + "epoch": 0.6, + "grad_norm": 1.1670725205025345, + "learning_rate": 7.246528437432927e-06, + "loss": 0.7757, + "step": 10463 + }, + { + "epoch": 0.6, + "grad_norm": 0.3893678908800406, + "learning_rate": 7.244739506351765e-06, + "loss": 0.2822, + "step": 10464 + }, + { + "epoch": 0.6, + "grad_norm": 0.27797550100587226, + "learning_rate": 7.242950670685159e-06, + "loss": 0.222, + "step": 10465 + }, + { + "epoch": 0.6, + "grad_norm": 0.4390055988054794, + "learning_rate": 7.2411619304950535e-06, + "loss": 0.26, + "step": 10466 + }, + { + "epoch": 0.6, + "grad_norm": 0.32031797829067743, + "learning_rate": 7.239373285843392e-06, + "loss": 0.2501, + "step": 10467 + }, + { + "epoch": 0.6, + "grad_norm": 0.6696812537335045, + "learning_rate": 7.237584736792112e-06, + "loss": 0.3122, + "step": 10468 + }, + { + "epoch": 0.6, + "grad_norm": 0.34998214322910576, + "learning_rate": 7.235796283403153e-06, + "loss": 0.3351, + "step": 10469 + }, + { + "epoch": 0.6, + "grad_norm": 0.3469237876049126, + "learning_rate": 7.234007925738451e-06, + "loss": 0.2644, + "step": 10470 + }, + { + "epoch": 0.6, + "grad_norm": 1.4179928240007145, + "learning_rate": 7.2322196638599365e-06, + "loss": 0.5999, + "step": 10471 + }, + { + "epoch": 0.6, + "grad_norm": 0.4999890912597717, + "learning_rate": 7.230431497829533e-06, + "loss": 0.2942, + "step": 10472 + }, + { + "epoch": 0.6, + "grad_norm": 0.2984927447908153, + "learning_rate": 7.228643427709172e-06, + "loss": 0.2701, + "step": 10473 + }, + { + "epoch": 0.6, + "grad_norm": 0.28578890500066373, + "learning_rate": 7.226855453560766e-06, + "loss": 0.1861, + "step": 10474 + }, + { + "epoch": 0.6, + "grad_norm": 0.8567557080685262, + "learning_rate": 7.2250675754462384e-06, + "loss": 0.4243, + "step": 10475 + }, + { + "epoch": 0.6, + "grad_norm": 0.38207266662063444, + "learning_rate": 7.2232797934275e-06, + "loss": 0.2831, + "step": 10476 + }, + { + "epoch": 0.6, + "grad_norm": 0.32210840344326147, + "learning_rate": 7.221492107566466e-06, + "loss": 0.285, + "step": 10477 + }, + { + "epoch": 0.6, + "grad_norm": 0.527301928692191, + "learning_rate": 7.2197045179250395e-06, + "loss": 0.2243, + "step": 10478 + }, + { + "epoch": 0.6, + "grad_norm": 0.3561268181099593, + "learning_rate": 7.217917024565124e-06, + "loss": 0.2425, + "step": 10479 + }, + { + "epoch": 0.6, + "grad_norm": 0.38431117081131044, + "learning_rate": 7.216129627548625e-06, + "loss": 0.271, + "step": 10480 + }, + { + "epoch": 0.6, + "grad_norm": 0.35747867342515993, + "learning_rate": 7.214342326937434e-06, + "loss": 0.2597, + "step": 10481 + }, + { + "epoch": 0.6, + "grad_norm": 0.3926495922665949, + "learning_rate": 7.212555122793452e-06, + "loss": 0.2489, + "step": 10482 + }, + { + "epoch": 0.6, + "grad_norm": 0.4978920915555641, + "learning_rate": 7.210768015178563e-06, + "loss": 0.3426, + "step": 10483 + }, + { + "epoch": 0.6, + "grad_norm": 0.7036657391861827, + "learning_rate": 7.208981004154661e-06, + "loss": 0.4456, + "step": 10484 + }, + { + "epoch": 0.6, + "grad_norm": 0.26724477915832684, + "learning_rate": 7.2071940897836235e-06, + "loss": 0.2103, + "step": 10485 + }, + { + "epoch": 0.6, + "grad_norm": 0.24639680661311067, + "learning_rate": 7.205407272127336e-06, + "loss": 0.181, + "step": 10486 + }, + { + "epoch": 0.6, + "grad_norm": 1.1345625130187684, + "learning_rate": 7.203620551247675e-06, + "loss": 0.7537, + "step": 10487 + }, + { + "epoch": 0.6, + "grad_norm": 0.32662252851637746, + "learning_rate": 7.201833927206514e-06, + "loss": 0.206, + "step": 10488 + }, + { + "epoch": 0.6, + "grad_norm": 0.3779858157814377, + "learning_rate": 7.200047400065722e-06, + "loss": 0.2848, + "step": 10489 + }, + { + "epoch": 0.6, + "grad_norm": 0.7855434602278417, + "learning_rate": 7.198260969887171e-06, + "loss": 0.4231, + "step": 10490 + }, + { + "epoch": 0.6, + "grad_norm": 0.22071700621294713, + "learning_rate": 7.196474636732722e-06, + "loss": 0.1673, + "step": 10491 + }, + { + "epoch": 0.6, + "grad_norm": 0.41724723056633095, + "learning_rate": 7.194688400664232e-06, + "loss": 0.2648, + "step": 10492 + }, + { + "epoch": 0.6, + "grad_norm": 0.33254747854591676, + "learning_rate": 7.192902261743566e-06, + "loss": 0.3185, + "step": 10493 + }, + { + "epoch": 0.6, + "grad_norm": 0.23978557568988323, + "learning_rate": 7.191116220032572e-06, + "loss": 0.0635, + "step": 10494 + }, + { + "epoch": 0.6, + "grad_norm": 0.45877974911373315, + "learning_rate": 7.189330275593104e-06, + "loss": 0.2984, + "step": 10495 + }, + { + "epoch": 0.6, + "grad_norm": 0.36561107076275134, + "learning_rate": 7.187544428487006e-06, + "loss": 0.2965, + "step": 10496 + }, + { + "epoch": 0.6, + "grad_norm": 0.6271544762259579, + "learning_rate": 7.1857586787761246e-06, + "loss": 0.3096, + "step": 10497 + }, + { + "epoch": 0.6, + "grad_norm": 0.22321373342280323, + "learning_rate": 7.183973026522297e-06, + "loss": 0.1597, + "step": 10498 + }, + { + "epoch": 0.6, + "grad_norm": 1.1306791168483041, + "learning_rate": 7.182187471787365e-06, + "loss": 0.7098, + "step": 10499 + }, + { + "epoch": 0.6, + "grad_norm": 0.5794004017247366, + "learning_rate": 7.180402014633159e-06, + "loss": 0.2908, + "step": 10500 + }, + { + "epoch": 0.6, + "grad_norm": 0.27354208770268323, + "learning_rate": 7.178616655121513e-06, + "loss": 0.2454, + "step": 10501 + }, + { + "epoch": 0.6, + "grad_norm": 1.158877404681521, + "learning_rate": 7.176831393314248e-06, + "loss": 0.6185, + "step": 10502 + }, + { + "epoch": 0.6, + "grad_norm": 0.5729698953409218, + "learning_rate": 7.175046229273191e-06, + "loss": 0.3404, + "step": 10503 + }, + { + "epoch": 0.6, + "grad_norm": 0.2731478718166219, + "learning_rate": 7.17326116306016e-06, + "loss": 0.196, + "step": 10504 + }, + { + "epoch": 0.6, + "grad_norm": 0.34421103264516534, + "learning_rate": 7.171476194736975e-06, + "loss": 0.3172, + "step": 10505 + }, + { + "epoch": 0.6, + "grad_norm": 0.4021245776566278, + "learning_rate": 7.169691324365447e-06, + "loss": 0.2735, + "step": 10506 + }, + { + "epoch": 0.6, + "grad_norm": 0.4057698150407605, + "learning_rate": 7.167906552007387e-06, + "loss": 0.2176, + "step": 10507 + }, + { + "epoch": 0.6, + "grad_norm": 0.39037278889321536, + "learning_rate": 7.166121877724599e-06, + "loss": 0.3269, + "step": 10508 + }, + { + "epoch": 0.6, + "grad_norm": 0.42136169762460707, + "learning_rate": 7.164337301578892e-06, + "loss": 0.2538, + "step": 10509 + }, + { + "epoch": 0.6, + "grad_norm": 0.3061269281320481, + "learning_rate": 7.162552823632059e-06, + "loss": 0.2145, + "step": 10510 + }, + { + "epoch": 0.6, + "grad_norm": 0.6243819662075308, + "learning_rate": 7.160768443945902e-06, + "loss": 0.3622, + "step": 10511 + }, + { + "epoch": 0.6, + "grad_norm": 0.39212060454543934, + "learning_rate": 7.15898416258221e-06, + "loss": 0.3269, + "step": 10512 + }, + { + "epoch": 0.6, + "grad_norm": 0.29126800036976597, + "learning_rate": 7.157199979602777e-06, + "loss": 0.2787, + "step": 10513 + }, + { + "epoch": 0.6, + "grad_norm": 0.4578357233133374, + "learning_rate": 7.155415895069385e-06, + "loss": 0.1526, + "step": 10514 + }, + { + "epoch": 0.6, + "grad_norm": 0.7504728210353762, + "learning_rate": 7.153631909043818e-06, + "loss": 0.3463, + "step": 10515 + }, + { + "epoch": 0.6, + "grad_norm": 0.39710388219731657, + "learning_rate": 7.151848021587855e-06, + "loss": 0.2964, + "step": 10516 + }, + { + "epoch": 0.6, + "grad_norm": 0.3785129422319976, + "learning_rate": 7.150064232763274e-06, + "loss": 0.2846, + "step": 10517 + }, + { + "epoch": 0.6, + "grad_norm": 0.5141174513877448, + "learning_rate": 7.1482805426318465e-06, + "loss": 0.314, + "step": 10518 + }, + { + "epoch": 0.6, + "grad_norm": 0.2443119966552638, + "learning_rate": 7.146496951255339e-06, + "loss": 0.2207, + "step": 10519 + }, + { + "epoch": 0.6, + "grad_norm": 0.30675794437787113, + "learning_rate": 7.144713458695521e-06, + "loss": 0.2244, + "step": 10520 + }, + { + "epoch": 0.6, + "grad_norm": 0.5894451000930164, + "learning_rate": 7.1429300650141505e-06, + "loss": 0.3308, + "step": 10521 + }, + { + "epoch": 0.6, + "grad_norm": 0.3686690194975854, + "learning_rate": 7.141146770272993e-06, + "loss": 0.3187, + "step": 10522 + }, + { + "epoch": 0.6, + "grad_norm": 0.6833298869205651, + "learning_rate": 7.139363574533797e-06, + "loss": 0.3821, + "step": 10523 + }, + { + "epoch": 0.6, + "grad_norm": 0.28139648241122606, + "learning_rate": 7.137580477858319e-06, + "loss": 0.231, + "step": 10524 + }, + { + "epoch": 0.6, + "grad_norm": 0.41488090474727446, + "learning_rate": 7.1357974803083044e-06, + "loss": 0.3398, + "step": 10525 + }, + { + "epoch": 0.6, + "grad_norm": 0.30323806929317104, + "learning_rate": 7.134014581945501e-06, + "loss": 0.1871, + "step": 10526 + }, + { + "epoch": 0.6, + "grad_norm": 0.3600629089927518, + "learning_rate": 7.132231782831649e-06, + "loss": 0.1966, + "step": 10527 + }, + { + "epoch": 0.6, + "grad_norm": 0.50493454952615, + "learning_rate": 7.130449083028488e-06, + "loss": 0.3759, + "step": 10528 + }, + { + "epoch": 0.6, + "grad_norm": 0.4093641362730516, + "learning_rate": 7.1286664825977505e-06, + "loss": 0.3145, + "step": 10529 + }, + { + "epoch": 0.6, + "grad_norm": 0.6043612819812036, + "learning_rate": 7.1268839816011695e-06, + "loss": 0.2636, + "step": 10530 + }, + { + "epoch": 0.61, + "grad_norm": 0.24807553607158486, + "learning_rate": 7.125101580100474e-06, + "loss": 0.2093, + "step": 10531 + }, + { + "epoch": 0.61, + "grad_norm": 0.2676598452867699, + "learning_rate": 7.123319278157385e-06, + "loss": 0.2648, + "step": 10532 + }, + { + "epoch": 0.61, + "grad_norm": 0.5550732423895852, + "learning_rate": 7.121537075833629e-06, + "loss": 0.1338, + "step": 10533 + }, + { + "epoch": 0.61, + "grad_norm": 0.36036269402932336, + "learning_rate": 7.119754973190915e-06, + "loss": 0.2997, + "step": 10534 + }, + { + "epoch": 0.61, + "grad_norm": 0.9122378994044027, + "learning_rate": 7.11797297029097e-06, + "loss": 0.3795, + "step": 10535 + }, + { + "epoch": 0.61, + "grad_norm": 0.3485600601021822, + "learning_rate": 7.116191067195494e-06, + "loss": 0.3105, + "step": 10536 + }, + { + "epoch": 0.61, + "grad_norm": 0.30727339279065635, + "learning_rate": 7.114409263966195e-06, + "loss": 0.1943, + "step": 10537 + }, + { + "epoch": 0.61, + "grad_norm": 0.2655701703598823, + "learning_rate": 7.11262756066478e-06, + "loss": 0.1737, + "step": 10538 + }, + { + "epoch": 0.61, + "grad_norm": 0.6735221155033528, + "learning_rate": 7.110845957352948e-06, + "loss": 0.3591, + "step": 10539 + }, + { + "epoch": 0.61, + "grad_norm": 0.27521311307189994, + "learning_rate": 7.109064454092398e-06, + "loss": 0.2416, + "step": 10540 + }, + { + "epoch": 0.61, + "grad_norm": 0.854134216843187, + "learning_rate": 7.1072830509448185e-06, + "loss": 0.4508, + "step": 10541 + }, + { + "epoch": 0.61, + "grad_norm": 0.6999782759176724, + "learning_rate": 7.105501747971906e-06, + "loss": 0.4854, + "step": 10542 + }, + { + "epoch": 0.61, + "grad_norm": 0.3321506116907094, + "learning_rate": 7.103720545235342e-06, + "loss": 0.2268, + "step": 10543 + }, + { + "epoch": 0.61, + "grad_norm": 0.233533515493403, + "learning_rate": 7.10193944279681e-06, + "loss": 0.2295, + "step": 10544 + }, + { + "epoch": 0.61, + "grad_norm": 0.7011227141792372, + "learning_rate": 7.100158440717993e-06, + "loss": 0.3943, + "step": 10545 + }, + { + "epoch": 0.61, + "grad_norm": 0.3676451737884998, + "learning_rate": 7.098377539060562e-06, + "loss": 0.273, + "step": 10546 + }, + { + "epoch": 0.61, + "grad_norm": 0.7447326581606627, + "learning_rate": 7.096596737886194e-06, + "loss": 0.3583, + "step": 10547 + }, + { + "epoch": 0.61, + "grad_norm": 0.32197795833723225, + "learning_rate": 7.0948160372565534e-06, + "loss": 0.2779, + "step": 10548 + }, + { + "epoch": 0.61, + "grad_norm": 0.3593098276921128, + "learning_rate": 7.093035437233311e-06, + "loss": 0.2799, + "step": 10549 + }, + { + "epoch": 0.61, + "grad_norm": 0.1792310565543837, + "learning_rate": 7.091254937878125e-06, + "loss": 0.084, + "step": 10550 + }, + { + "epoch": 0.61, + "grad_norm": 1.0643531416224592, + "learning_rate": 7.089474539252656e-06, + "loss": 0.3914, + "step": 10551 + }, + { + "epoch": 0.61, + "grad_norm": 0.250677757331347, + "learning_rate": 7.087694241418558e-06, + "loss": 0.2396, + "step": 10552 + }, + { + "epoch": 0.61, + "grad_norm": 0.3975599441810498, + "learning_rate": 7.085914044437485e-06, + "loss": 0.2871, + "step": 10553 + }, + { + "epoch": 0.61, + "grad_norm": 0.9224701306896524, + "learning_rate": 7.084133948371081e-06, + "loss": 0.6096, + "step": 10554 + }, + { + "epoch": 0.61, + "grad_norm": 0.3374955882849087, + "learning_rate": 7.082353953280995e-06, + "loss": 0.2439, + "step": 10555 + }, + { + "epoch": 0.61, + "grad_norm": 0.21337914040021594, + "learning_rate": 7.080574059228866e-06, + "loss": 0.1718, + "step": 10556 + }, + { + "epoch": 0.61, + "grad_norm": 1.2038444194000044, + "learning_rate": 7.07879426627633e-06, + "loss": 0.3988, + "step": 10557 + }, + { + "epoch": 0.61, + "grad_norm": 0.30269484417471876, + "learning_rate": 7.077014574485025e-06, + "loss": 0.24, + "step": 10558 + }, + { + "epoch": 0.61, + "grad_norm": 1.2401651526461381, + "learning_rate": 7.075234983916577e-06, + "loss": 0.7724, + "step": 10559 + }, + { + "epoch": 0.61, + "grad_norm": 0.30485211566640963, + "learning_rate": 7.073455494632618e-06, + "loss": 0.2483, + "step": 10560 + }, + { + "epoch": 0.61, + "grad_norm": 0.46309273195641676, + "learning_rate": 7.071676106694767e-06, + "loss": 0.2658, + "step": 10561 + }, + { + "epoch": 0.61, + "grad_norm": 0.32683809943240216, + "learning_rate": 7.06989682016465e-06, + "loss": 0.2232, + "step": 10562 + }, + { + "epoch": 0.61, + "grad_norm": 0.7879418679957149, + "learning_rate": 7.068117635103877e-06, + "loss": 0.2834, + "step": 10563 + }, + { + "epoch": 0.61, + "grad_norm": 0.3973588991375235, + "learning_rate": 7.066338551574066e-06, + "loss": 0.29, + "step": 10564 + }, + { + "epoch": 0.61, + "grad_norm": 0.4464487646966575, + "learning_rate": 7.064559569636824e-06, + "loss": 0.3641, + "step": 10565 + }, + { + "epoch": 0.61, + "grad_norm": 0.885248157283508, + "learning_rate": 7.062780689353758e-06, + "loss": 0.3694, + "step": 10566 + }, + { + "epoch": 0.61, + "grad_norm": 0.3276928966908981, + "learning_rate": 7.06100191078647e-06, + "loss": 0.2578, + "step": 10567 + }, + { + "epoch": 0.61, + "grad_norm": 0.35618267701930156, + "learning_rate": 7.0592232339965664e-06, + "loss": 0.3178, + "step": 10568 + }, + { + "epoch": 0.61, + "grad_norm": 0.3768634547209685, + "learning_rate": 7.057444659045627e-06, + "loss": 0.2148, + "step": 10569 + }, + { + "epoch": 0.61, + "grad_norm": 0.25590767906812234, + "learning_rate": 7.055666185995256e-06, + "loss": 0.2129, + "step": 10570 + }, + { + "epoch": 0.61, + "grad_norm": 0.5079144727780354, + "learning_rate": 7.053887814907036e-06, + "loss": 0.3775, + "step": 10571 + }, + { + "epoch": 0.61, + "grad_norm": 0.47995334935170597, + "learning_rate": 7.0521095458425555e-06, + "loss": 0.3491, + "step": 10572 + }, + { + "epoch": 0.61, + "grad_norm": 0.3018031483350368, + "learning_rate": 7.050331378863395e-06, + "loss": 0.1862, + "step": 10573 + }, + { + "epoch": 0.61, + "grad_norm": 0.7799879504359073, + "learning_rate": 7.048553314031132e-06, + "loss": 0.4413, + "step": 10574 + }, + { + "epoch": 0.61, + "grad_norm": 0.34979493541325096, + "learning_rate": 7.04677535140734e-06, + "loss": 0.2612, + "step": 10575 + }, + { + "epoch": 0.61, + "grad_norm": 0.2285568510508859, + "learning_rate": 7.0449974910535916e-06, + "loss": 0.1851, + "step": 10576 + }, + { + "epoch": 0.61, + "grad_norm": 1.0304051087015562, + "learning_rate": 7.043219733031452e-06, + "loss": 0.627, + "step": 10577 + }, + { + "epoch": 0.61, + "grad_norm": 1.0139546657916503, + "learning_rate": 7.041442077402487e-06, + "loss": 0.6937, + "step": 10578 + }, + { + "epoch": 0.61, + "grad_norm": 0.30342367295446254, + "learning_rate": 7.0396645242282535e-06, + "loss": 0.2022, + "step": 10579 + }, + { + "epoch": 0.61, + "grad_norm": 0.3986109508000254, + "learning_rate": 7.037887073570313e-06, + "loss": 0.3156, + "step": 10580 + }, + { + "epoch": 0.61, + "grad_norm": 0.4877704840722932, + "learning_rate": 7.036109725490214e-06, + "loss": 0.2859, + "step": 10581 + }, + { + "epoch": 0.61, + "grad_norm": 0.30519393626632985, + "learning_rate": 7.03433248004951e-06, + "loss": 0.1873, + "step": 10582 + }, + { + "epoch": 0.61, + "grad_norm": 0.41057454427454104, + "learning_rate": 7.032555337309743e-06, + "loss": 0.3229, + "step": 10583 + }, + { + "epoch": 0.61, + "grad_norm": 0.3539347623622069, + "learning_rate": 7.030778297332457e-06, + "loss": 0.3001, + "step": 10584 + }, + { + "epoch": 0.61, + "grad_norm": 0.3887406352092294, + "learning_rate": 7.0290013601791905e-06, + "loss": 0.2601, + "step": 10585 + }, + { + "epoch": 0.61, + "grad_norm": 0.47380536198285, + "learning_rate": 7.027224525911479e-06, + "loss": 0.2907, + "step": 10586 + }, + { + "epoch": 0.61, + "grad_norm": 0.4561919825253446, + "learning_rate": 7.025447794590856e-06, + "loss": 0.3529, + "step": 10587 + }, + { + "epoch": 0.61, + "grad_norm": 0.24317420310642943, + "learning_rate": 7.023671166278845e-06, + "loss": 0.2204, + "step": 10588 + }, + { + "epoch": 0.61, + "grad_norm": 0.30300578946259576, + "learning_rate": 7.021894641036977e-06, + "loss": 0.1961, + "step": 10589 + }, + { + "epoch": 0.61, + "grad_norm": 0.7928562012811823, + "learning_rate": 7.020118218926767e-06, + "loss": 0.5509, + "step": 10590 + }, + { + "epoch": 0.61, + "grad_norm": 0.31281127151054094, + "learning_rate": 7.018341900009738e-06, + "loss": 0.2595, + "step": 10591 + }, + { + "epoch": 0.61, + "grad_norm": 0.378034705279621, + "learning_rate": 7.0165656843473965e-06, + "loss": 0.2803, + "step": 10592 + }, + { + "epoch": 0.61, + "grad_norm": 0.9236198996788825, + "learning_rate": 7.0147895720012596e-06, + "loss": 0.5131, + "step": 10593 + }, + { + "epoch": 0.61, + "grad_norm": 0.24966395258526514, + "learning_rate": 7.01301356303283e-06, + "loss": 0.1917, + "step": 10594 + }, + { + "epoch": 0.61, + "grad_norm": 0.9200006479669353, + "learning_rate": 7.011237657503615e-06, + "loss": 0.5364, + "step": 10595 + }, + { + "epoch": 0.61, + "grad_norm": 0.2708229012641202, + "learning_rate": 7.009461855475111e-06, + "loss": 0.2601, + "step": 10596 + }, + { + "epoch": 0.61, + "grad_norm": 0.42431345222551275, + "learning_rate": 7.00768615700881e-06, + "loss": 0.3102, + "step": 10597 + }, + { + "epoch": 0.61, + "grad_norm": 0.49933535085134173, + "learning_rate": 7.005910562166213e-06, + "loss": 0.3538, + "step": 10598 + }, + { + "epoch": 0.61, + "grad_norm": 0.32711074269410184, + "learning_rate": 7.004135071008803e-06, + "loss": 0.244, + "step": 10599 + }, + { + "epoch": 0.61, + "grad_norm": 0.3911638518785674, + "learning_rate": 7.0023596835980676e-06, + "loss": 0.2621, + "step": 10600 + }, + { + "epoch": 0.61, + "grad_norm": 0.549319709524697, + "learning_rate": 7.000584399995486e-06, + "loss": 0.4431, + "step": 10601 + }, + { + "epoch": 0.61, + "grad_norm": 0.4383392566950631, + "learning_rate": 6.998809220262541e-06, + "loss": 0.2549, + "step": 10602 + }, + { + "epoch": 0.61, + "grad_norm": 0.39079099029956776, + "learning_rate": 6.997034144460702e-06, + "loss": 0.2778, + "step": 10603 + }, + { + "epoch": 0.61, + "grad_norm": 0.24881046372954682, + "learning_rate": 6.995259172651441e-06, + "loss": 0.2452, + "step": 10604 + }, + { + "epoch": 0.61, + "grad_norm": 1.0911990822931978, + "learning_rate": 6.993484304896225e-06, + "loss": 0.2585, + "step": 10605 + }, + { + "epoch": 0.61, + "grad_norm": 0.3859135519020631, + "learning_rate": 6.991709541256517e-06, + "loss": 0.271, + "step": 10606 + }, + { + "epoch": 0.61, + "grad_norm": 0.38547240651856923, + "learning_rate": 6.98993488179378e-06, + "loss": 0.328, + "step": 10607 + }, + { + "epoch": 0.61, + "grad_norm": 0.496427375438408, + "learning_rate": 6.988160326569471e-06, + "loss": 0.2881, + "step": 10608 + }, + { + "epoch": 0.61, + "grad_norm": 0.2807034932115483, + "learning_rate": 6.986385875645036e-06, + "loss": 0.2477, + "step": 10609 + }, + { + "epoch": 0.61, + "grad_norm": 0.36666726706200375, + "learning_rate": 6.984611529081931e-06, + "loss": 0.2671, + "step": 10610 + }, + { + "epoch": 0.61, + "grad_norm": 0.3784004456555454, + "learning_rate": 6.982837286941598e-06, + "loss": 0.2992, + "step": 10611 + }, + { + "epoch": 0.61, + "grad_norm": 0.2947344402549206, + "learning_rate": 6.981063149285481e-06, + "loss": 0.1862, + "step": 10612 + }, + { + "epoch": 0.61, + "grad_norm": 1.2011332586742978, + "learning_rate": 6.979289116175014e-06, + "loss": 0.7797, + "step": 10613 + }, + { + "epoch": 0.61, + "grad_norm": 0.6269447977839719, + "learning_rate": 6.977515187671639e-06, + "loss": 0.3863, + "step": 10614 + }, + { + "epoch": 0.61, + "grad_norm": 0.28525954215061056, + "learning_rate": 6.975741363836781e-06, + "loss": 0.2034, + "step": 10615 + }, + { + "epoch": 0.61, + "grad_norm": 0.23251089488001686, + "learning_rate": 6.973967644731872e-06, + "loss": 0.2167, + "step": 10616 + }, + { + "epoch": 0.61, + "grad_norm": 1.136268898483301, + "learning_rate": 6.972194030418329e-06, + "loss": 0.4865, + "step": 10617 + }, + { + "epoch": 0.61, + "grad_norm": 0.35214424867594674, + "learning_rate": 6.97042052095758e-06, + "loss": 0.1561, + "step": 10618 + }, + { + "epoch": 0.61, + "grad_norm": 0.3186430841898647, + "learning_rate": 6.968647116411036e-06, + "loss": 0.2863, + "step": 10619 + }, + { + "epoch": 0.61, + "grad_norm": 0.5067307018719389, + "learning_rate": 6.966873816840114e-06, + "loss": 0.3595, + "step": 10620 + }, + { + "epoch": 0.61, + "grad_norm": 0.33263369625653055, + "learning_rate": 6.96510062230622e-06, + "loss": 0.1733, + "step": 10621 + }, + { + "epoch": 0.61, + "grad_norm": 0.23286758847398575, + "learning_rate": 6.963327532870763e-06, + "loss": 0.1742, + "step": 10622 + }, + { + "epoch": 0.61, + "grad_norm": 0.4730282585689458, + "learning_rate": 6.961554548595142e-06, + "loss": 0.326, + "step": 10623 + }, + { + "epoch": 0.61, + "grad_norm": 0.3619899467460046, + "learning_rate": 6.959781669540754e-06, + "loss": 0.2461, + "step": 10624 + }, + { + "epoch": 0.61, + "grad_norm": 0.3841805210107079, + "learning_rate": 6.958008895769e-06, + "loss": 0.2792, + "step": 10625 + }, + { + "epoch": 0.61, + "grad_norm": 0.7631851504994283, + "learning_rate": 6.956236227341262e-06, + "loss": 0.5202, + "step": 10626 + }, + { + "epoch": 0.61, + "grad_norm": 0.2999124611334779, + "learning_rate": 6.954463664318937e-06, + "loss": 0.2546, + "step": 10627 + }, + { + "epoch": 0.61, + "grad_norm": 0.24820162596488865, + "learning_rate": 6.952691206763402e-06, + "loss": 0.1693, + "step": 10628 + }, + { + "epoch": 0.61, + "grad_norm": 1.2347189617039587, + "learning_rate": 6.950918854736041e-06, + "loss": 0.5083, + "step": 10629 + }, + { + "epoch": 0.61, + "grad_norm": 0.5522730367103457, + "learning_rate": 6.949146608298227e-06, + "loss": 0.3568, + "step": 10630 + }, + { + "epoch": 0.61, + "grad_norm": 0.3104954162518697, + "learning_rate": 6.947374467511336e-06, + "loss": 0.2486, + "step": 10631 + }, + { + "epoch": 0.61, + "grad_norm": 0.4520113873626869, + "learning_rate": 6.945602432436736e-06, + "loss": 0.3982, + "step": 10632 + }, + { + "epoch": 0.61, + "grad_norm": 0.4093916598564017, + "learning_rate": 6.9438305031357935e-06, + "loss": 0.2927, + "step": 10633 + }, + { + "epoch": 0.61, + "grad_norm": 0.3338504922517524, + "learning_rate": 6.9420586796698655e-06, + "loss": 0.219, + "step": 10634 + }, + { + "epoch": 0.61, + "grad_norm": 0.25665018793378996, + "learning_rate": 6.940286962100318e-06, + "loss": 0.1841, + "step": 10635 + }, + { + "epoch": 0.61, + "grad_norm": 0.6598078007493531, + "learning_rate": 6.938515350488503e-06, + "loss": 0.4409, + "step": 10636 + }, + { + "epoch": 0.61, + "grad_norm": 0.3300845858986062, + "learning_rate": 6.936743844895768e-06, + "loss": 0.2797, + "step": 10637 + }, + { + "epoch": 0.61, + "grad_norm": 0.4680762857712185, + "learning_rate": 6.934972445383459e-06, + "loss": 0.2779, + "step": 10638 + }, + { + "epoch": 0.61, + "grad_norm": 0.4200145663298372, + "learning_rate": 6.933201152012925e-06, + "loss": 0.3078, + "step": 10639 + }, + { + "epoch": 0.61, + "grad_norm": 0.27962340097798727, + "learning_rate": 6.931429964845501e-06, + "loss": 0.2241, + "step": 10640 + }, + { + "epoch": 0.61, + "grad_norm": 0.53746389745485, + "learning_rate": 6.929658883942527e-06, + "loss": 0.1577, + "step": 10641 + }, + { + "epoch": 0.61, + "grad_norm": 0.4959989835213774, + "learning_rate": 6.927887909365333e-06, + "loss": 0.3364, + "step": 10642 + }, + { + "epoch": 0.61, + "grad_norm": 0.29723094294413305, + "learning_rate": 6.92611704117525e-06, + "loss": 0.2759, + "step": 10643 + }, + { + "epoch": 0.61, + "grad_norm": 0.8817731319299101, + "learning_rate": 6.924346279433599e-06, + "loss": 0.4836, + "step": 10644 + }, + { + "epoch": 0.61, + "grad_norm": 0.603693846570787, + "learning_rate": 6.922575624201706e-06, + "loss": 0.379, + "step": 10645 + }, + { + "epoch": 0.61, + "grad_norm": 0.31016181140642807, + "learning_rate": 6.920805075540886e-06, + "loss": 0.2332, + "step": 10646 + }, + { + "epoch": 0.61, + "grad_norm": 0.27221079855815977, + "learning_rate": 6.919034633512456e-06, + "loss": 0.2433, + "step": 10647 + }, + { + "epoch": 0.61, + "grad_norm": 0.6010295063712859, + "learning_rate": 6.917264298177724e-06, + "loss": 0.267, + "step": 10648 + }, + { + "epoch": 0.61, + "grad_norm": 0.4000427804407078, + "learning_rate": 6.915494069597993e-06, + "loss": 0.3225, + "step": 10649 + }, + { + "epoch": 0.61, + "grad_norm": 1.0407707069428398, + "learning_rate": 6.913723947834574e-06, + "loss": 0.6438, + "step": 10650 + }, + { + "epoch": 0.61, + "grad_norm": 0.2765627603584837, + "learning_rate": 6.9119539329487585e-06, + "loss": 0.2274, + "step": 10651 + }, + { + "epoch": 0.61, + "grad_norm": 0.5224307700746306, + "learning_rate": 6.9101840250018485e-06, + "loss": 0.3428, + "step": 10652 + }, + { + "epoch": 0.61, + "grad_norm": 0.35898777209765914, + "learning_rate": 6.908414224055129e-06, + "loss": 0.2134, + "step": 10653 + }, + { + "epoch": 0.61, + "grad_norm": 0.7316390060143079, + "learning_rate": 6.906644530169896e-06, + "loss": 0.2693, + "step": 10654 + }, + { + "epoch": 0.61, + "grad_norm": 0.2562741289721085, + "learning_rate": 6.904874943407427e-06, + "loss": 0.2569, + "step": 10655 + }, + { + "epoch": 0.61, + "grad_norm": 1.057082723143132, + "learning_rate": 6.903105463829007e-06, + "loss": 0.7312, + "step": 10656 + }, + { + "epoch": 0.61, + "grad_norm": 0.6832390986635581, + "learning_rate": 6.901336091495912e-06, + "loss": 0.159, + "step": 10657 + }, + { + "epoch": 0.61, + "grad_norm": 0.2674851553546089, + "learning_rate": 6.899566826469415e-06, + "loss": 0.2218, + "step": 10658 + }, + { + "epoch": 0.61, + "grad_norm": 0.3579968138322811, + "learning_rate": 6.897797668810784e-06, + "loss": 0.2911, + "step": 10659 + }, + { + "epoch": 0.61, + "grad_norm": 0.5464163869475679, + "learning_rate": 6.896028618581287e-06, + "loss": 0.3132, + "step": 10660 + }, + { + "epoch": 0.61, + "grad_norm": 0.3000321311735532, + "learning_rate": 6.894259675842188e-06, + "loss": 0.2106, + "step": 10661 + }, + { + "epoch": 0.61, + "grad_norm": 1.2299271449853884, + "learning_rate": 6.892490840654739e-06, + "loss": 0.8311, + "step": 10662 + }, + { + "epoch": 0.61, + "grad_norm": 0.30179860641260736, + "learning_rate": 6.890722113080201e-06, + "loss": 0.2821, + "step": 10663 + }, + { + "epoch": 0.61, + "grad_norm": 0.34845039622272467, + "learning_rate": 6.888953493179819e-06, + "loss": 0.2284, + "step": 10664 + }, + { + "epoch": 0.61, + "grad_norm": 0.5995945108100336, + "learning_rate": 6.88718498101485e-06, + "loss": 0.3831, + "step": 10665 + }, + { + "epoch": 0.61, + "grad_norm": 0.25283872062952506, + "learning_rate": 6.885416576646525e-06, + "loss": 0.1791, + "step": 10666 + }, + { + "epoch": 0.61, + "grad_norm": 0.2785774684194344, + "learning_rate": 6.883648280136094e-06, + "loss": 0.223, + "step": 10667 + }, + { + "epoch": 0.61, + "grad_norm": 1.1132804366543598, + "learning_rate": 6.881880091544786e-06, + "loss": 0.7453, + "step": 10668 + }, + { + "epoch": 0.61, + "grad_norm": 0.8623000767888052, + "learning_rate": 6.880112010933839e-06, + "loss": 0.5088, + "step": 10669 + }, + { + "epoch": 0.61, + "grad_norm": 0.3251227489998159, + "learning_rate": 6.878344038364481e-06, + "loss": 0.2274, + "step": 10670 + }, + { + "epoch": 0.61, + "grad_norm": 0.36645085483371054, + "learning_rate": 6.8765761738979305e-06, + "loss": 0.3253, + "step": 10671 + }, + { + "epoch": 0.61, + "grad_norm": 0.26849276242470155, + "learning_rate": 6.874808417595415e-06, + "loss": 0.1771, + "step": 10672 + }, + { + "epoch": 0.61, + "grad_norm": 0.32862638825590396, + "learning_rate": 6.87304076951815e-06, + "loss": 0.2591, + "step": 10673 + }, + { + "epoch": 0.61, + "grad_norm": 0.4298652986509392, + "learning_rate": 6.871273229727346e-06, + "loss": 0.2987, + "step": 10674 + }, + { + "epoch": 0.61, + "grad_norm": 0.5045099892284276, + "learning_rate": 6.869505798284217e-06, + "loss": 0.3762, + "step": 10675 + }, + { + "epoch": 0.61, + "grad_norm": 0.32738198225492715, + "learning_rate": 6.867738475249967e-06, + "loss": 0.28, + "step": 10676 + }, + { + "epoch": 0.61, + "grad_norm": 0.6053803182315074, + "learning_rate": 6.8659712606858e-06, + "loss": 0.292, + "step": 10677 + }, + { + "epoch": 0.61, + "grad_norm": 0.2784146187816311, + "learning_rate": 6.8642041546529115e-06, + "loss": 0.2397, + "step": 10678 + }, + { + "epoch": 0.61, + "grad_norm": 0.25915374026684634, + "learning_rate": 6.8624371572125e-06, + "loss": 0.1908, + "step": 10679 + }, + { + "epoch": 0.61, + "grad_norm": 1.1034569399626564, + "learning_rate": 6.860670268425754e-06, + "loss": 0.5397, + "step": 10680 + }, + { + "epoch": 0.61, + "grad_norm": 0.6580196847714953, + "learning_rate": 6.858903488353863e-06, + "loss": 0.3914, + "step": 10681 + }, + { + "epoch": 0.61, + "grad_norm": 0.3087985735861544, + "learning_rate": 6.857136817058007e-06, + "loss": 0.2731, + "step": 10682 + }, + { + "epoch": 0.61, + "grad_norm": 0.30721390171367746, + "learning_rate": 6.855370254599369e-06, + "loss": 0.2289, + "step": 10683 + }, + { + "epoch": 0.61, + "grad_norm": 0.22683232782377963, + "learning_rate": 6.853603801039124e-06, + "loss": 0.156, + "step": 10684 + }, + { + "epoch": 0.61, + "grad_norm": 0.3619861963806639, + "learning_rate": 6.8518374564384434e-06, + "loss": 0.2869, + "step": 10685 + }, + { + "epoch": 0.61, + "grad_norm": 1.2769919962248444, + "learning_rate": 6.850071220858496e-06, + "loss": 0.6362, + "step": 10686 + }, + { + "epoch": 0.61, + "grad_norm": 0.36211924335233864, + "learning_rate": 6.84830509436045e-06, + "loss": 0.2483, + "step": 10687 + }, + { + "epoch": 0.61, + "grad_norm": 0.372473918353896, + "learning_rate": 6.846539077005461e-06, + "loss": 0.2733, + "step": 10688 + }, + { + "epoch": 0.61, + "grad_norm": 0.7227237641851568, + "learning_rate": 6.844773168854686e-06, + "loss": 0.4258, + "step": 10689 + }, + { + "epoch": 0.61, + "grad_norm": 0.25478526632383197, + "learning_rate": 6.843007369969283e-06, + "loss": 0.1721, + "step": 10690 + }, + { + "epoch": 0.61, + "grad_norm": 0.2592583923802607, + "learning_rate": 6.841241680410398e-06, + "loss": 0.2386, + "step": 10691 + }, + { + "epoch": 0.61, + "grad_norm": 0.417897430422118, + "learning_rate": 6.83947610023918e-06, + "loss": 0.3193, + "step": 10692 + }, + { + "epoch": 0.61, + "grad_norm": 0.8573806813691548, + "learning_rate": 6.837710629516765e-06, + "loss": 0.3226, + "step": 10693 + }, + { + "epoch": 0.61, + "grad_norm": 0.32550275471629125, + "learning_rate": 6.835945268304298e-06, + "loss": 0.2556, + "step": 10694 + }, + { + "epoch": 0.61, + "grad_norm": 0.3535437138265614, + "learning_rate": 6.834180016662908e-06, + "loss": 0.3155, + "step": 10695 + }, + { + "epoch": 0.61, + "grad_norm": 0.9949706011291491, + "learning_rate": 6.8324148746537286e-06, + "loss": 0.3106, + "step": 10696 + }, + { + "epoch": 0.61, + "grad_norm": 0.31547765324720395, + "learning_rate": 6.830649842337885e-06, + "loss": 0.2539, + "step": 10697 + }, + { + "epoch": 0.61, + "grad_norm": 1.0524343916133838, + "learning_rate": 6.828884919776504e-06, + "loss": 0.5325, + "step": 10698 + }, + { + "epoch": 0.61, + "grad_norm": 0.44576274356069756, + "learning_rate": 6.827120107030698e-06, + "loss": 0.3259, + "step": 10699 + }, + { + "epoch": 0.61, + "grad_norm": 0.22639385024816566, + "learning_rate": 6.82535540416159e-06, + "loss": 0.1372, + "step": 10700 + }, + { + "epoch": 0.61, + "grad_norm": 1.1446799361802587, + "learning_rate": 6.823590811230287e-06, + "loss": 0.6339, + "step": 10701 + }, + { + "epoch": 0.61, + "grad_norm": 0.3551463442681128, + "learning_rate": 6.821826328297896e-06, + "loss": 0.3097, + "step": 10702 + }, + { + "epoch": 0.61, + "grad_norm": 0.34226727600361334, + "learning_rate": 6.820061955425527e-06, + "loss": 0.1847, + "step": 10703 + }, + { + "epoch": 0.61, + "grad_norm": 0.5462335453113937, + "learning_rate": 6.818297692674273e-06, + "loss": 0.3826, + "step": 10704 + }, + { + "epoch": 0.62, + "grad_norm": 0.41512218730788686, + "learning_rate": 6.81653354010523e-06, + "loss": 0.2327, + "step": 10705 + }, + { + "epoch": 0.62, + "grad_norm": 0.2849103841623909, + "learning_rate": 6.8147694977794975e-06, + "loss": 0.1901, + "step": 10706 + }, + { + "epoch": 0.62, + "grad_norm": 0.35804096430942806, + "learning_rate": 6.813005565758158e-06, + "loss": 0.3045, + "step": 10707 + }, + { + "epoch": 0.62, + "grad_norm": 0.959727916273778, + "learning_rate": 6.8112417441022995e-06, + "loss": 0.5161, + "step": 10708 + }, + { + "epoch": 0.62, + "grad_norm": 0.41141512882499015, + "learning_rate": 6.809478032873002e-06, + "loss": 0.3103, + "step": 10709 + }, + { + "epoch": 0.62, + "grad_norm": 0.28673660446727955, + "learning_rate": 6.807714432131343e-06, + "loss": 0.2599, + "step": 10710 + }, + { + "epoch": 0.62, + "grad_norm": 0.6706818173645254, + "learning_rate": 6.805950941938395e-06, + "loss": 0.4362, + "step": 10711 + }, + { + "epoch": 0.62, + "grad_norm": 0.2853954047662548, + "learning_rate": 6.804187562355231e-06, + "loss": 0.2041, + "step": 10712 + }, + { + "epoch": 0.62, + "grad_norm": 0.3040463174783864, + "learning_rate": 6.802424293442914e-06, + "loss": 0.1679, + "step": 10713 + }, + { + "epoch": 0.62, + "grad_norm": 0.3824192547178403, + "learning_rate": 6.800661135262505e-06, + "loss": 0.3014, + "step": 10714 + }, + { + "epoch": 0.62, + "grad_norm": 0.3310073163699963, + "learning_rate": 6.7988980878750636e-06, + "loss": 0.2803, + "step": 10715 + }, + { + "epoch": 0.62, + "grad_norm": 0.900982088435711, + "learning_rate": 6.797135151341643e-06, + "loss": 0.3219, + "step": 10716 + }, + { + "epoch": 0.62, + "grad_norm": 0.42156896906947466, + "learning_rate": 6.7953723257232955e-06, + "loss": 0.3128, + "step": 10717 + }, + { + "epoch": 0.62, + "grad_norm": 0.2688101831184305, + "learning_rate": 6.793609611081064e-06, + "loss": 0.2541, + "step": 10718 + }, + { + "epoch": 0.62, + "grad_norm": 0.3140732036560362, + "learning_rate": 6.791847007475998e-06, + "loss": 0.1823, + "step": 10719 + }, + { + "epoch": 0.62, + "grad_norm": 1.0008202280028524, + "learning_rate": 6.7900845149691285e-06, + "loss": 0.631, + "step": 10720 + }, + { + "epoch": 0.62, + "grad_norm": 0.42576299687385705, + "learning_rate": 6.7883221336214965e-06, + "loss": 0.2948, + "step": 10721 + }, + { + "epoch": 0.62, + "grad_norm": 0.42623039582871874, + "learning_rate": 6.7865598634941295e-06, + "loss": 0.3172, + "step": 10722 + }, + { + "epoch": 0.62, + "grad_norm": 0.45379008885941363, + "learning_rate": 6.784797704648058e-06, + "loss": 0.2807, + "step": 10723 + }, + { + "epoch": 0.62, + "grad_norm": 0.3814091792954231, + "learning_rate": 6.7830356571443016e-06, + "loss": 0.2453, + "step": 10724 + }, + { + "epoch": 0.62, + "grad_norm": 0.2705471683167067, + "learning_rate": 6.7812737210438836e-06, + "loss": 0.1876, + "step": 10725 + }, + { + "epoch": 0.62, + "grad_norm": 0.3377882881037869, + "learning_rate": 6.77951189640782e-06, + "loss": 0.2592, + "step": 10726 + }, + { + "epoch": 0.62, + "grad_norm": 0.3912153744933908, + "learning_rate": 6.777750183297117e-06, + "loss": 0.26, + "step": 10727 + }, + { + "epoch": 0.62, + "grad_norm": 0.43342392314247236, + "learning_rate": 6.77598858177279e-06, + "loss": 0.3417, + "step": 10728 + }, + { + "epoch": 0.62, + "grad_norm": 0.9386708011142606, + "learning_rate": 6.774227091895835e-06, + "loss": 0.3769, + "step": 10729 + }, + { + "epoch": 0.62, + "grad_norm": 0.3215869104097005, + "learning_rate": 6.772465713727262e-06, + "loss": 0.2572, + "step": 10730 + }, + { + "epoch": 0.62, + "grad_norm": 0.21444487048175678, + "learning_rate": 6.77070444732806e-06, + "loss": 0.1924, + "step": 10731 + }, + { + "epoch": 0.62, + "grad_norm": 0.7105654768008676, + "learning_rate": 6.768943292759226e-06, + "loss": 0.3145, + "step": 10732 + }, + { + "epoch": 0.62, + "grad_norm": 0.8871355072259255, + "learning_rate": 6.767182250081744e-06, + "loss": 0.2807, + "step": 10733 + }, + { + "epoch": 0.62, + "grad_norm": 0.36136097231280245, + "learning_rate": 6.765421319356605e-06, + "loss": 0.3122, + "step": 10734 + }, + { + "epoch": 0.62, + "grad_norm": 0.45744692480747945, + "learning_rate": 6.763660500644783e-06, + "loss": 0.3692, + "step": 10735 + }, + { + "epoch": 0.62, + "grad_norm": 0.31629793854850224, + "learning_rate": 6.761899794007262e-06, + "loss": 0.2086, + "step": 10736 + }, + { + "epoch": 0.62, + "grad_norm": 0.25178578119165373, + "learning_rate": 6.760139199505014e-06, + "loss": 0.1469, + "step": 10737 + }, + { + "epoch": 0.62, + "grad_norm": 0.3273390465451645, + "learning_rate": 6.758378717199004e-06, + "loss": 0.2983, + "step": 10738 + }, + { + "epoch": 0.62, + "grad_norm": 0.58071175528015, + "learning_rate": 6.756618347150196e-06, + "loss": 0.2238, + "step": 10739 + }, + { + "epoch": 0.62, + "grad_norm": 0.3900166059812892, + "learning_rate": 6.7548580894195585e-06, + "loss": 0.3206, + "step": 10740 + }, + { + "epoch": 0.62, + "grad_norm": 2.5014095471100632, + "learning_rate": 6.753097944068043e-06, + "loss": 0.6091, + "step": 10741 + }, + { + "epoch": 0.62, + "grad_norm": 0.27374873890951196, + "learning_rate": 6.7513379111566105e-06, + "loss": 0.2153, + "step": 10742 + }, + { + "epoch": 0.62, + "grad_norm": 0.2331727972943417, + "learning_rate": 6.749577990746202e-06, + "loss": 0.2058, + "step": 10743 + }, + { + "epoch": 0.62, + "grad_norm": 0.6106539407307946, + "learning_rate": 6.74781818289777e-06, + "loss": 0.3983, + "step": 10744 + }, + { + "epoch": 0.62, + "grad_norm": 0.3191253890892391, + "learning_rate": 6.746058487672253e-06, + "loss": 0.1622, + "step": 10745 + }, + { + "epoch": 0.62, + "grad_norm": 0.2832733787647948, + "learning_rate": 6.744298905130593e-06, + "loss": 0.2955, + "step": 10746 + }, + { + "epoch": 0.62, + "grad_norm": 1.2000591687611528, + "learning_rate": 6.74253943533372e-06, + "loss": 0.7825, + "step": 10747 + }, + { + "epoch": 0.62, + "grad_norm": 0.5827279940418081, + "learning_rate": 6.740780078342568e-06, + "loss": 0.304, + "step": 10748 + }, + { + "epoch": 0.62, + "grad_norm": 0.2421944177584202, + "learning_rate": 6.7390208342180595e-06, + "loss": 0.1781, + "step": 10749 + }, + { + "epoch": 0.62, + "grad_norm": 0.3432118231078927, + "learning_rate": 6.737261703021123e-06, + "loss": 0.3119, + "step": 10750 + }, + { + "epoch": 0.62, + "grad_norm": 0.6051458957393929, + "learning_rate": 6.735502684812669e-06, + "loss": 0.371, + "step": 10751 + }, + { + "epoch": 0.62, + "grad_norm": 0.4037052100199354, + "learning_rate": 6.73374377965362e-06, + "loss": 0.2492, + "step": 10752 + }, + { + "epoch": 0.62, + "grad_norm": 0.9394090165245456, + "learning_rate": 6.731984987604882e-06, + "loss": 0.5348, + "step": 10753 + }, + { + "epoch": 0.62, + "grad_norm": 0.29799092639937913, + "learning_rate": 6.730226308727363e-06, + "loss": 0.2664, + "step": 10754 + }, + { + "epoch": 0.62, + "grad_norm": 0.44911316464676126, + "learning_rate": 6.728467743081968e-06, + "loss": 0.2381, + "step": 10755 + }, + { + "epoch": 0.62, + "grad_norm": 0.3233394168263766, + "learning_rate": 6.726709290729592e-06, + "loss": 0.2382, + "step": 10756 + }, + { + "epoch": 0.62, + "grad_norm": 0.5916566364028347, + "learning_rate": 6.724950951731135e-06, + "loss": 0.3323, + "step": 10757 + }, + { + "epoch": 0.62, + "grad_norm": 0.2714572194543101, + "learning_rate": 6.723192726147482e-06, + "loss": 0.2394, + "step": 10758 + }, + { + "epoch": 0.62, + "grad_norm": 1.0438216384131889, + "learning_rate": 6.721434614039528e-06, + "loss": 0.6214, + "step": 10759 + }, + { + "epoch": 0.62, + "grad_norm": 0.8915962859834544, + "learning_rate": 6.719676615468149e-06, + "loss": 0.4534, + "step": 10760 + }, + { + "epoch": 0.62, + "grad_norm": 0.3335692130553307, + "learning_rate": 6.717918730494231e-06, + "loss": 0.2893, + "step": 10761 + }, + { + "epoch": 0.62, + "grad_norm": 0.23904715452470016, + "learning_rate": 6.716160959178644e-06, + "loss": 0.2044, + "step": 10762 + }, + { + "epoch": 0.62, + "grad_norm": 0.77472616741585, + "learning_rate": 6.714403301582263e-06, + "loss": 0.4212, + "step": 10763 + }, + { + "epoch": 0.62, + "grad_norm": 0.3341935232637106, + "learning_rate": 6.712645757765952e-06, + "loss": 0.272, + "step": 10764 + }, + { + "epoch": 0.62, + "grad_norm": 0.8951143831388909, + "learning_rate": 6.710888327790581e-06, + "loss": 0.3213, + "step": 10765 + }, + { + "epoch": 0.62, + "grad_norm": 0.29522917110829133, + "learning_rate": 6.709131011717005e-06, + "loss": 0.2728, + "step": 10766 + }, + { + "epoch": 0.62, + "grad_norm": 0.3995050553221213, + "learning_rate": 6.707373809606077e-06, + "loss": 0.3016, + "step": 10767 + }, + { + "epoch": 0.62, + "grad_norm": 0.33594474411352, + "learning_rate": 6.705616721518655e-06, + "loss": 0.1888, + "step": 10768 + }, + { + "epoch": 0.62, + "grad_norm": 0.2893697612282927, + "learning_rate": 6.703859747515584e-06, + "loss": 0.1952, + "step": 10769 + }, + { + "epoch": 0.62, + "grad_norm": 0.27035597075734846, + "learning_rate": 6.702102887657709e-06, + "loss": 0.2757, + "step": 10770 + }, + { + "epoch": 0.62, + "grad_norm": 0.8535729454967663, + "learning_rate": 6.7003461420058715e-06, + "loss": 0.3506, + "step": 10771 + }, + { + "epoch": 0.62, + "grad_norm": 0.5549308852351893, + "learning_rate": 6.6985895106209005e-06, + "loss": 0.3496, + "step": 10772 + }, + { + "epoch": 0.62, + "grad_norm": 0.36265593609148106, + "learning_rate": 6.696832993563636e-06, + "loss": 0.2992, + "step": 10773 + }, + { + "epoch": 0.62, + "grad_norm": 0.3523666206537711, + "learning_rate": 6.695076590894899e-06, + "loss": 0.2899, + "step": 10774 + }, + { + "epoch": 0.62, + "grad_norm": 0.2912704248423908, + "learning_rate": 6.693320302675521e-06, + "loss": 0.1405, + "step": 10775 + }, + { + "epoch": 0.62, + "grad_norm": 0.3451346608345089, + "learning_rate": 6.6915641289663154e-06, + "loss": 0.2796, + "step": 10776 + }, + { + "epoch": 0.62, + "grad_norm": 0.3192817030035515, + "learning_rate": 6.689808069828105e-06, + "loss": 0.2486, + "step": 10777 + }, + { + "epoch": 0.62, + "grad_norm": 0.38933740283548857, + "learning_rate": 6.688052125321698e-06, + "loss": 0.2439, + "step": 10778 + }, + { + "epoch": 0.62, + "grad_norm": 0.3694365237432283, + "learning_rate": 6.686296295507903e-06, + "loss": 0.2851, + "step": 10779 + }, + { + "epoch": 0.62, + "grad_norm": 0.4782734090964818, + "learning_rate": 6.684540580447525e-06, + "loss": 0.2538, + "step": 10780 + }, + { + "epoch": 0.62, + "grad_norm": 0.4149927816314323, + "learning_rate": 6.682784980201363e-06, + "loss": 0.2624, + "step": 10781 + }, + { + "epoch": 0.62, + "grad_norm": 0.2933724186560608, + "learning_rate": 6.6810294948302165e-06, + "loss": 0.2648, + "step": 10782 + }, + { + "epoch": 0.62, + "grad_norm": 0.46882515485188336, + "learning_rate": 6.679274124394874e-06, + "loss": 0.2259, + "step": 10783 + }, + { + "epoch": 0.62, + "grad_norm": 0.5844321532824087, + "learning_rate": 6.677518868956128e-06, + "loss": 0.2892, + "step": 10784 + }, + { + "epoch": 0.62, + "grad_norm": 0.29395991528266874, + "learning_rate": 6.675763728574758e-06, + "loss": 0.2712, + "step": 10785 + }, + { + "epoch": 0.62, + "grad_norm": 0.47841665979336606, + "learning_rate": 6.67400870331155e-06, + "loss": 0.3708, + "step": 10786 + }, + { + "epoch": 0.62, + "grad_norm": 0.7244194775477389, + "learning_rate": 6.672253793227273e-06, + "loss": 0.4866, + "step": 10787 + }, + { + "epoch": 0.62, + "grad_norm": 0.3039119401991573, + "learning_rate": 6.670498998382708e-06, + "loss": 0.2247, + "step": 10788 + }, + { + "epoch": 0.62, + "grad_norm": 0.35386920706771574, + "learning_rate": 6.668744318838618e-06, + "loss": 0.2584, + "step": 10789 + }, + { + "epoch": 0.62, + "grad_norm": 0.31571310808178393, + "learning_rate": 6.66698975465577e-06, + "loss": 0.2504, + "step": 10790 + }, + { + "epoch": 0.62, + "grad_norm": 0.3214123531686496, + "learning_rate": 6.665235305894925e-06, + "loss": 0.2037, + "step": 10791 + }, + { + "epoch": 0.62, + "grad_norm": 0.902925238719976, + "learning_rate": 6.663480972616835e-06, + "loss": 0.4962, + "step": 10792 + }, + { + "epoch": 0.62, + "grad_norm": 0.3389188225905232, + "learning_rate": 6.661726754882256e-06, + "loss": 0.2932, + "step": 10793 + }, + { + "epoch": 0.62, + "grad_norm": 0.3137494677922765, + "learning_rate": 6.659972652751936e-06, + "loss": 0.2132, + "step": 10794 + }, + { + "epoch": 0.62, + "grad_norm": 0.39839423898137716, + "learning_rate": 6.658218666286621e-06, + "loss": 0.3217, + "step": 10795 + }, + { + "epoch": 0.62, + "grad_norm": 0.40810260556052064, + "learning_rate": 6.656464795547048e-06, + "loss": 0.2755, + "step": 10796 + }, + { + "epoch": 0.62, + "grad_norm": 0.34635085809256516, + "learning_rate": 6.654711040593957e-06, + "loss": 0.2657, + "step": 10797 + }, + { + "epoch": 0.62, + "grad_norm": 0.3303025437976195, + "learning_rate": 6.652957401488076e-06, + "loss": 0.2489, + "step": 10798 + }, + { + "epoch": 0.62, + "grad_norm": 1.1771107826528375, + "learning_rate": 6.651203878290139e-06, + "loss": 0.7729, + "step": 10799 + }, + { + "epoch": 0.62, + "grad_norm": 0.3086966114240212, + "learning_rate": 6.649450471060865e-06, + "loss": 0.245, + "step": 10800 + }, + { + "epoch": 0.62, + "grad_norm": 0.42800613630194834, + "learning_rate": 6.64769717986098e-06, + "loss": 0.2697, + "step": 10801 + }, + { + "epoch": 0.62, + "grad_norm": 0.26776956632235843, + "learning_rate": 6.6459440047511955e-06, + "loss": 0.2168, + "step": 10802 + }, + { + "epoch": 0.62, + "grad_norm": 0.3370355453206775, + "learning_rate": 6.6441909457922286e-06, + "loss": 0.2665, + "step": 10803 + }, + { + "epoch": 0.62, + "grad_norm": 1.0228746361906365, + "learning_rate": 6.642438003044781e-06, + "loss": 0.3131, + "step": 10804 + }, + { + "epoch": 0.62, + "grad_norm": 0.35770512175069535, + "learning_rate": 6.640685176569568e-06, + "loss": 0.324, + "step": 10805 + }, + { + "epoch": 0.62, + "grad_norm": 0.31252442730763696, + "learning_rate": 6.638932466427277e-06, + "loss": 0.248, + "step": 10806 + }, + { + "epoch": 0.62, + "grad_norm": 0.7175209362515863, + "learning_rate": 6.637179872678612e-06, + "loss": 0.3296, + "step": 10807 + }, + { + "epoch": 0.62, + "grad_norm": 0.23272726866411692, + "learning_rate": 6.635427395384262e-06, + "loss": 0.1369, + "step": 10808 + }, + { + "epoch": 0.62, + "grad_norm": 0.4007091085654962, + "learning_rate": 6.633675034604918e-06, + "loss": 0.2966, + "step": 10809 + }, + { + "epoch": 0.62, + "grad_norm": 0.348395608259044, + "learning_rate": 6.6319227904012605e-06, + "loss": 0.2952, + "step": 10810 + }, + { + "epoch": 0.62, + "grad_norm": 0.7075777072812005, + "learning_rate": 6.630170662833974e-06, + "loss": 0.2966, + "step": 10811 + }, + { + "epoch": 0.62, + "grad_norm": 0.359803224019601, + "learning_rate": 6.62841865196373e-06, + "loss": 0.2858, + "step": 10812 + }, + { + "epoch": 0.62, + "grad_norm": 0.3924621489778972, + "learning_rate": 6.626666757851208e-06, + "loss": 0.3285, + "step": 10813 + }, + { + "epoch": 0.62, + "grad_norm": 0.3379307119515423, + "learning_rate": 6.624914980557067e-06, + "loss": 0.1459, + "step": 10814 + }, + { + "epoch": 0.62, + "grad_norm": 0.25824750797348944, + "learning_rate": 6.623163320141977e-06, + "loss": 0.2055, + "step": 10815 + }, + { + "epoch": 0.62, + "grad_norm": 0.9254204327897717, + "learning_rate": 6.621411776666593e-06, + "loss": 0.5132, + "step": 10816 + }, + { + "epoch": 0.62, + "grad_norm": 0.3290210750926486, + "learning_rate": 6.619660350191577e-06, + "loss": 0.2528, + "step": 10817 + }, + { + "epoch": 0.62, + "grad_norm": 0.3258960929973952, + "learning_rate": 6.617909040777578e-06, + "loss": 0.2723, + "step": 10818 + }, + { + "epoch": 0.62, + "grad_norm": 0.6898961033010748, + "learning_rate": 6.6161578484852405e-06, + "loss": 0.4741, + "step": 10819 + }, + { + "epoch": 0.62, + "grad_norm": 0.1955724790230162, + "learning_rate": 6.614406773375215e-06, + "loss": 0.0904, + "step": 10820 + }, + { + "epoch": 0.62, + "grad_norm": 0.31062886283654306, + "learning_rate": 6.612655815508135e-06, + "loss": 0.2714, + "step": 10821 + }, + { + "epoch": 0.62, + "grad_norm": 0.5003777175917457, + "learning_rate": 6.610904974944638e-06, + "loss": 0.3493, + "step": 10822 + }, + { + "epoch": 0.62, + "grad_norm": 0.6498721179269221, + "learning_rate": 6.609154251745356e-06, + "loss": 0.4102, + "step": 10823 + }, + { + "epoch": 0.62, + "grad_norm": 0.31094275540668453, + "learning_rate": 6.607403645970919e-06, + "loss": 0.2138, + "step": 10824 + }, + { + "epoch": 0.62, + "grad_norm": 0.35222845924002294, + "learning_rate": 6.605653157681945e-06, + "loss": 0.308, + "step": 10825 + }, + { + "epoch": 0.62, + "grad_norm": 0.5014401149672492, + "learning_rate": 6.603902786939058e-06, + "loss": 0.3462, + "step": 10826 + }, + { + "epoch": 0.62, + "grad_norm": 0.23552220497581033, + "learning_rate": 6.60215253380287e-06, + "loss": 0.1319, + "step": 10827 + }, + { + "epoch": 0.62, + "grad_norm": 0.7787992405401837, + "learning_rate": 6.600402398333995e-06, + "loss": 0.3407, + "step": 10828 + }, + { + "epoch": 0.62, + "grad_norm": 0.36508668218733975, + "learning_rate": 6.598652380593037e-06, + "loss": 0.3031, + "step": 10829 + }, + { + "epoch": 0.62, + "grad_norm": 0.3085726407226889, + "learning_rate": 6.596902480640603e-06, + "loss": 0.1881, + "step": 10830 + }, + { + "epoch": 0.62, + "grad_norm": 0.7405743438567102, + "learning_rate": 6.595152698537289e-06, + "loss": 0.5434, + "step": 10831 + }, + { + "epoch": 0.62, + "grad_norm": 1.4452690524635368, + "learning_rate": 6.59340303434369e-06, + "loss": 0.7422, + "step": 10832 + }, + { + "epoch": 0.62, + "grad_norm": 0.21649306649578795, + "learning_rate": 6.591653488120398e-06, + "loss": 0.1843, + "step": 10833 + }, + { + "epoch": 0.62, + "grad_norm": 0.34346378810527417, + "learning_rate": 6.589904059927998e-06, + "loss": 0.2565, + "step": 10834 + }, + { + "epoch": 0.62, + "grad_norm": 0.692637088445479, + "learning_rate": 6.588154749827076e-06, + "loss": 0.4296, + "step": 10835 + }, + { + "epoch": 0.62, + "grad_norm": 0.41281261424206633, + "learning_rate": 6.586405557878206e-06, + "loss": 0.287, + "step": 10836 + }, + { + "epoch": 0.62, + "grad_norm": 0.32134511517364006, + "learning_rate": 6.584656484141967e-06, + "loss": 0.2603, + "step": 10837 + }, + { + "epoch": 0.62, + "grad_norm": 0.80282010542682, + "learning_rate": 6.582907528678928e-06, + "loss": 0.4662, + "step": 10838 + }, + { + "epoch": 0.62, + "grad_norm": 0.33148783656373965, + "learning_rate": 6.5811586915496515e-06, + "loss": 0.2478, + "step": 10839 + }, + { + "epoch": 0.62, + "grad_norm": 0.37101099026547674, + "learning_rate": 6.579409972814703e-06, + "loss": 0.1527, + "step": 10840 + }, + { + "epoch": 0.62, + "grad_norm": 0.3358065314278554, + "learning_rate": 6.577661372534639e-06, + "loss": 0.3153, + "step": 10841 + }, + { + "epoch": 0.62, + "grad_norm": 0.3190398698999832, + "learning_rate": 6.575912890770017e-06, + "loss": 0.2436, + "step": 10842 + }, + { + "epoch": 0.62, + "grad_norm": 0.8837230043317458, + "learning_rate": 6.574164527581383e-06, + "loss": 0.4759, + "step": 10843 + }, + { + "epoch": 0.62, + "grad_norm": 0.47649999430816803, + "learning_rate": 6.5724162830292835e-06, + "loss": 0.3134, + "step": 10844 + }, + { + "epoch": 0.62, + "grad_norm": 0.3922580154206632, + "learning_rate": 6.570668157174263e-06, + "loss": 0.2656, + "step": 10845 + }, + { + "epoch": 0.62, + "grad_norm": 0.23499303324710186, + "learning_rate": 6.568920150076854e-06, + "loss": 0.1829, + "step": 10846 + }, + { + "epoch": 0.62, + "grad_norm": 0.6347531042335608, + "learning_rate": 6.567172261797594e-06, + "loss": 0.417, + "step": 10847 + }, + { + "epoch": 0.62, + "grad_norm": 0.39361758639994104, + "learning_rate": 6.5654244923970105e-06, + "loss": 0.2663, + "step": 10848 + }, + { + "epoch": 0.62, + "grad_norm": 0.3087842118794902, + "learning_rate": 6.56367684193563e-06, + "loss": 0.2918, + "step": 10849 + }, + { + "epoch": 0.62, + "grad_norm": 1.0242043905970053, + "learning_rate": 6.561929310473971e-06, + "loss": 0.2512, + "step": 10850 + }, + { + "epoch": 0.62, + "grad_norm": 0.4242460543027238, + "learning_rate": 6.560181898072554e-06, + "loss": 0.3042, + "step": 10851 + }, + { + "epoch": 0.62, + "grad_norm": 0.5240664744635831, + "learning_rate": 6.558434604791888e-06, + "loss": 0.3468, + "step": 10852 + }, + { + "epoch": 0.62, + "grad_norm": 0.22341814224458637, + "learning_rate": 6.556687430692486e-06, + "loss": 0.1698, + "step": 10853 + }, + { + "epoch": 0.62, + "grad_norm": 0.39096154916868614, + "learning_rate": 6.5549403758348485e-06, + "loss": 0.2807, + "step": 10854 + }, + { + "epoch": 0.62, + "grad_norm": 0.5309980404520158, + "learning_rate": 6.553193440279479e-06, + "loss": 0.3881, + "step": 10855 + }, + { + "epoch": 0.62, + "grad_norm": 0.4725538900448848, + "learning_rate": 6.551446624086873e-06, + "loss": 0.2522, + "step": 10856 + }, + { + "epoch": 0.62, + "grad_norm": 0.3080050015821684, + "learning_rate": 6.549699927317519e-06, + "loss": 0.2818, + "step": 10857 + }, + { + "epoch": 0.62, + "grad_norm": 0.5715657627569951, + "learning_rate": 6.5479533500319105e-06, + "loss": 0.35, + "step": 10858 + }, + { + "epoch": 0.62, + "grad_norm": 0.2846118138159484, + "learning_rate": 6.546206892290527e-06, + "loss": 0.1529, + "step": 10859 + }, + { + "epoch": 0.62, + "grad_norm": 0.3320757102260107, + "learning_rate": 6.544460554153853e-06, + "loss": 0.2813, + "step": 10860 + }, + { + "epoch": 0.62, + "grad_norm": 0.36196437928371844, + "learning_rate": 6.542714335682359e-06, + "loss": 0.2973, + "step": 10861 + }, + { + "epoch": 0.62, + "grad_norm": 1.141389722693649, + "learning_rate": 6.54096823693652e-06, + "loss": 0.4734, + "step": 10862 + }, + { + "epoch": 0.62, + "grad_norm": 0.6271265112294695, + "learning_rate": 6.5392222579768015e-06, + "loss": 0.2178, + "step": 10863 + }, + { + "epoch": 0.62, + "grad_norm": 0.3753408561849779, + "learning_rate": 6.537476398863669e-06, + "loss": 0.313, + "step": 10864 + }, + { + "epoch": 0.62, + "grad_norm": 0.33153725793470623, + "learning_rate": 6.535730659657577e-06, + "loss": 0.2618, + "step": 10865 + }, + { + "epoch": 0.62, + "grad_norm": 0.32391570737586556, + "learning_rate": 6.533985040418988e-06, + "loss": 0.1546, + "step": 10866 + }, + { + "epoch": 0.62, + "grad_norm": 0.3128620256105588, + "learning_rate": 6.532239541208343e-06, + "loss": 0.2564, + "step": 10867 + }, + { + "epoch": 0.62, + "grad_norm": 0.4533892575766774, + "learning_rate": 6.530494162086098e-06, + "loss": 0.3441, + "step": 10868 + }, + { + "epoch": 0.62, + "grad_norm": 0.28938108521272216, + "learning_rate": 6.5287489031126875e-06, + "loss": 0.2152, + "step": 10869 + }, + { + "epoch": 0.62, + "grad_norm": 0.46521166367836747, + "learning_rate": 6.527003764348555e-06, + "loss": 0.3301, + "step": 10870 + }, + { + "epoch": 0.62, + "grad_norm": 1.1228207484521606, + "learning_rate": 6.5252587458541325e-06, + "loss": 0.5712, + "step": 10871 + }, + { + "epoch": 0.62, + "grad_norm": 0.5512502762042788, + "learning_rate": 6.523513847689854e-06, + "loss": 0.3708, + "step": 10872 + }, + { + "epoch": 0.62, + "grad_norm": 0.19166979867318232, + "learning_rate": 6.521769069916136e-06, + "loss": 0.1914, + "step": 10873 + }, + { + "epoch": 0.62, + "grad_norm": 0.9560087983933109, + "learning_rate": 6.520024412593409e-06, + "loss": 0.4243, + "step": 10874 + }, + { + "epoch": 0.62, + "grad_norm": 0.5336182223969617, + "learning_rate": 6.518279875782083e-06, + "loss": 0.3538, + "step": 10875 + }, + { + "epoch": 0.62, + "grad_norm": 0.3886374618315691, + "learning_rate": 6.516535459542579e-06, + "loss": 0.2293, + "step": 10876 + }, + { + "epoch": 0.62, + "grad_norm": 0.40788284876238157, + "learning_rate": 6.514791163935299e-06, + "loss": 0.3177, + "step": 10877 + }, + { + "epoch": 0.62, + "grad_norm": 0.5755561972390177, + "learning_rate": 6.513046989020653e-06, + "loss": 0.3556, + "step": 10878 + }, + { + "epoch": 0.63, + "grad_norm": 0.23691729198980177, + "learning_rate": 6.5113029348590384e-06, + "loss": 0.152, + "step": 10879 + }, + { + "epoch": 0.63, + "grad_norm": 0.4475169294764376, + "learning_rate": 6.509559001510854e-06, + "loss": 0.2962, + "step": 10880 + }, + { + "epoch": 0.63, + "grad_norm": 0.7213839327976953, + "learning_rate": 6.5078151890364916e-06, + "loss": 0.3129, + "step": 10881 + }, + { + "epoch": 0.63, + "grad_norm": 0.3567362019743413, + "learning_rate": 6.50607149749634e-06, + "loss": 0.2346, + "step": 10882 + }, + { + "epoch": 0.63, + "grad_norm": 0.8685527444154926, + "learning_rate": 6.504327926950782e-06, + "loss": 0.5479, + "step": 10883 + }, + { + "epoch": 0.63, + "grad_norm": 0.42927802250755426, + "learning_rate": 6.502584477460195e-06, + "loss": 0.3064, + "step": 10884 + }, + { + "epoch": 0.63, + "grad_norm": 0.2919765365381944, + "learning_rate": 6.50084114908496e-06, + "loss": 0.2674, + "step": 10885 + }, + { + "epoch": 0.63, + "grad_norm": 0.5838456550127608, + "learning_rate": 6.4990979418854436e-06, + "loss": 0.2418, + "step": 10886 + }, + { + "epoch": 0.63, + "grad_norm": 0.33539401157915516, + "learning_rate": 6.497354855922016e-06, + "loss": 0.2074, + "step": 10887 + }, + { + "epoch": 0.63, + "grad_norm": 0.36912521934440096, + "learning_rate": 6.495611891255038e-06, + "loss": 0.2864, + "step": 10888 + }, + { + "epoch": 0.63, + "grad_norm": 0.332611184205583, + "learning_rate": 6.493869047944872e-06, + "loss": 0.2492, + "step": 10889 + }, + { + "epoch": 0.63, + "grad_norm": 0.5711716515617774, + "learning_rate": 6.4921263260518664e-06, + "loss": 0.413, + "step": 10890 + }, + { + "epoch": 0.63, + "grad_norm": 0.34918005553161635, + "learning_rate": 6.490383725636377e-06, + "loss": 0.2892, + "step": 10891 + }, + { + "epoch": 0.63, + "grad_norm": 0.321025479817607, + "learning_rate": 6.488641246758749e-06, + "loss": 0.2723, + "step": 10892 + }, + { + "epoch": 0.63, + "grad_norm": 0.2324885317833231, + "learning_rate": 6.486898889479323e-06, + "loss": 0.1738, + "step": 10893 + }, + { + "epoch": 0.63, + "grad_norm": 0.38333854647224463, + "learning_rate": 6.485156653858438e-06, + "loss": 0.3174, + "step": 10894 + }, + { + "epoch": 0.63, + "grad_norm": 1.008010824991491, + "learning_rate": 6.483414539956426e-06, + "loss": 0.3672, + "step": 10895 + }, + { + "epoch": 0.63, + "grad_norm": 0.27193926530077744, + "learning_rate": 6.48167254783362e-06, + "loss": 0.2538, + "step": 10896 + }, + { + "epoch": 0.63, + "grad_norm": 0.41061131285869107, + "learning_rate": 6.479930677550338e-06, + "loss": 0.3058, + "step": 10897 + }, + { + "epoch": 0.63, + "grad_norm": 0.6100862999061782, + "learning_rate": 6.478188929166909e-06, + "loss": 0.3987, + "step": 10898 + }, + { + "epoch": 0.63, + "grad_norm": 0.15737671574549414, + "learning_rate": 6.476447302743643e-06, + "loss": 0.0739, + "step": 10899 + }, + { + "epoch": 0.63, + "grad_norm": 0.3672614455951174, + "learning_rate": 6.474705798340857e-06, + "loss": 0.3108, + "step": 10900 + }, + { + "epoch": 0.63, + "grad_norm": 0.37121567073539297, + "learning_rate": 6.472964416018857e-06, + "loss": 0.3178, + "step": 10901 + }, + { + "epoch": 0.63, + "grad_norm": 0.5386380238554062, + "learning_rate": 6.471223155837949e-06, + "loss": 0.2583, + "step": 10902 + }, + { + "epoch": 0.63, + "grad_norm": 0.3702006272772363, + "learning_rate": 6.469482017858428e-06, + "loss": 0.2944, + "step": 10903 + }, + { + "epoch": 0.63, + "grad_norm": 0.3343193983572874, + "learning_rate": 6.4677410021405975e-06, + "loss": 0.3281, + "step": 10904 + }, + { + "epoch": 0.63, + "grad_norm": 0.15994451590807493, + "learning_rate": 6.46600010874474e-06, + "loss": 0.0712, + "step": 10905 + }, + { + "epoch": 0.63, + "grad_norm": 0.35321148685659404, + "learning_rate": 6.4642593377311515e-06, + "loss": 0.2867, + "step": 10906 + }, + { + "epoch": 0.63, + "grad_norm": 1.1587678178893002, + "learning_rate": 6.462518689160109e-06, + "loss": 0.568, + "step": 10907 + }, + { + "epoch": 0.63, + "grad_norm": 0.30479496232056075, + "learning_rate": 6.460778163091891e-06, + "loss": 0.2539, + "step": 10908 + }, + { + "epoch": 0.63, + "grad_norm": 0.35727687245385015, + "learning_rate": 6.45903775958677e-06, + "loss": 0.2846, + "step": 10909 + }, + { + "epoch": 0.63, + "grad_norm": 2.4649919245936944, + "learning_rate": 6.457297478705023e-06, + "loss": 0.7402, + "step": 10910 + }, + { + "epoch": 0.63, + "grad_norm": 0.2619218553658152, + "learning_rate": 6.45555732050691e-06, + "loss": 0.1705, + "step": 10911 + }, + { + "epoch": 0.63, + "grad_norm": 0.33591317094101153, + "learning_rate": 6.4538172850526955e-06, + "loss": 0.2024, + "step": 10912 + }, + { + "epoch": 0.63, + "grad_norm": 0.341833146387076, + "learning_rate": 6.452077372402634e-06, + "loss": 0.3078, + "step": 10913 + }, + { + "epoch": 0.63, + "grad_norm": 0.5891021497920335, + "learning_rate": 6.450337582616983e-06, + "loss": 0.3681, + "step": 10914 + }, + { + "epoch": 0.63, + "grad_norm": 0.34289810526304676, + "learning_rate": 6.448597915755988e-06, + "loss": 0.2137, + "step": 10915 + }, + { + "epoch": 0.63, + "grad_norm": 0.359263118619319, + "learning_rate": 6.446858371879896e-06, + "loss": 0.312, + "step": 10916 + }, + { + "epoch": 0.63, + "grad_norm": 0.28334773046172257, + "learning_rate": 6.445118951048942e-06, + "loss": 0.173, + "step": 10917 + }, + { + "epoch": 0.63, + "grad_norm": 0.30036622389977835, + "learning_rate": 6.44337965332337e-06, + "loss": 0.19, + "step": 10918 + }, + { + "epoch": 0.63, + "grad_norm": 0.749494957815434, + "learning_rate": 6.4416404787634045e-06, + "loss": 0.4418, + "step": 10919 + }, + { + "epoch": 0.63, + "grad_norm": 0.3391009390947701, + "learning_rate": 6.439901427429278e-06, + "loss": 0.3278, + "step": 10920 + }, + { + "epoch": 0.63, + "grad_norm": 0.4741224416315272, + "learning_rate": 6.438162499381212e-06, + "loss": 0.2293, + "step": 10921 + }, + { + "epoch": 0.63, + "grad_norm": 0.9750360327136708, + "learning_rate": 6.4364236946794234e-06, + "loss": 0.6323, + "step": 10922 + }, + { + "epoch": 0.63, + "grad_norm": 0.24827351464326222, + "learning_rate": 6.434685013384132e-06, + "loss": 0.1511, + "step": 10923 + }, + { + "epoch": 0.63, + "grad_norm": 0.288203382381012, + "learning_rate": 6.432946455555542e-06, + "loss": 0.2516, + "step": 10924 + }, + { + "epoch": 0.63, + "grad_norm": 0.463203852610262, + "learning_rate": 6.4312080212538665e-06, + "loss": 0.2672, + "step": 10925 + }, + { + "epoch": 0.63, + "grad_norm": 0.7318010513536188, + "learning_rate": 6.4294697105393e-06, + "loss": 0.4274, + "step": 10926 + }, + { + "epoch": 0.63, + "grad_norm": 0.3059976691995668, + "learning_rate": 6.427731523472047e-06, + "loss": 0.2783, + "step": 10927 + }, + { + "epoch": 0.63, + "grad_norm": 0.3061553802177348, + "learning_rate": 6.425993460112297e-06, + "loss": 0.2512, + "step": 10928 + }, + { + "epoch": 0.63, + "grad_norm": 0.2644528421377624, + "learning_rate": 6.424255520520239e-06, + "loss": 0.1606, + "step": 10929 + }, + { + "epoch": 0.63, + "grad_norm": 0.3216069885908651, + "learning_rate": 6.422517704756057e-06, + "loss": 0.2645, + "step": 10930 + }, + { + "epoch": 0.63, + "grad_norm": 1.0873383787909738, + "learning_rate": 6.420780012879937e-06, + "loss": 0.3225, + "step": 10931 + }, + { + "epoch": 0.63, + "grad_norm": 0.32131116306748664, + "learning_rate": 6.419042444952048e-06, + "loss": 0.2816, + "step": 10932 + }, + { + "epoch": 0.63, + "grad_norm": 0.3490888556275218, + "learning_rate": 6.417305001032567e-06, + "loss": 0.2824, + "step": 10933 + }, + { + "epoch": 0.63, + "grad_norm": 1.0053545569436766, + "learning_rate": 6.415567681181658e-06, + "loss": 0.4126, + "step": 10934 + }, + { + "epoch": 0.63, + "grad_norm": 0.3229247233746252, + "learning_rate": 6.413830485459488e-06, + "loss": 0.2473, + "step": 10935 + }, + { + "epoch": 0.63, + "grad_norm": 0.3380905219858116, + "learning_rate": 6.412093413926213e-06, + "loss": 0.2688, + "step": 10936 + }, + { + "epoch": 0.63, + "grad_norm": 0.554045753814118, + "learning_rate": 6.410356466641989e-06, + "loss": 0.298, + "step": 10937 + }, + { + "epoch": 0.63, + "grad_norm": 0.9802965063335598, + "learning_rate": 6.408619643666967e-06, + "loss": 0.3922, + "step": 10938 + }, + { + "epoch": 0.63, + "grad_norm": 0.31719321797306044, + "learning_rate": 6.40688294506129e-06, + "loss": 0.2581, + "step": 10939 + }, + { + "epoch": 0.63, + "grad_norm": 0.3305320623675954, + "learning_rate": 6.405146370885107e-06, + "loss": 0.2959, + "step": 10940 + }, + { + "epoch": 0.63, + "grad_norm": 0.2577489342671599, + "learning_rate": 6.403409921198548e-06, + "loss": 0.1013, + "step": 10941 + }, + { + "epoch": 0.63, + "grad_norm": 0.3603206692947732, + "learning_rate": 6.401673596061747e-06, + "loss": 0.2713, + "step": 10942 + }, + { + "epoch": 0.63, + "grad_norm": 1.004731951323438, + "learning_rate": 6.399937395534837e-06, + "loss": 0.4131, + "step": 10943 + }, + { + "epoch": 0.63, + "grad_norm": 0.26455269733315795, + "learning_rate": 6.398201319677937e-06, + "loss": 0.2145, + "step": 10944 + }, + { + "epoch": 0.63, + "grad_norm": 0.31962065915692844, + "learning_rate": 6.396465368551172e-06, + "loss": 0.2746, + "step": 10945 + }, + { + "epoch": 0.63, + "grad_norm": 1.1313165310387425, + "learning_rate": 6.394729542214657e-06, + "loss": 0.606, + "step": 10946 + }, + { + "epoch": 0.63, + "grad_norm": 0.33812898844419714, + "learning_rate": 6.392993840728503e-06, + "loss": 0.2592, + "step": 10947 + }, + { + "epoch": 0.63, + "grad_norm": 0.3277772485530417, + "learning_rate": 6.391258264152818e-06, + "loss": 0.2676, + "step": 10948 + }, + { + "epoch": 0.63, + "grad_norm": 0.7622917250054192, + "learning_rate": 6.389522812547701e-06, + "loss": 0.4147, + "step": 10949 + }, + { + "epoch": 0.63, + "grad_norm": 0.2837893256968949, + "learning_rate": 6.3877874859732556e-06, + "loss": 0.1997, + "step": 10950 + }, + { + "epoch": 0.63, + "grad_norm": 0.3097014346057528, + "learning_rate": 6.386052284489575e-06, + "loss": 0.1805, + "step": 10951 + }, + { + "epoch": 0.63, + "grad_norm": 0.3016138484836072, + "learning_rate": 6.3843172081567474e-06, + "loss": 0.2933, + "step": 10952 + }, + { + "epoch": 0.63, + "grad_norm": 0.7273665127231179, + "learning_rate": 6.382582257034858e-06, + "loss": 0.4313, + "step": 10953 + }, + { + "epoch": 0.63, + "grad_norm": 0.3243297248965466, + "learning_rate": 6.380847431183992e-06, + "loss": 0.188, + "step": 10954 + }, + { + "epoch": 0.63, + "grad_norm": 0.3351138381356627, + "learning_rate": 6.379112730664222e-06, + "loss": 0.2968, + "step": 10955 + }, + { + "epoch": 0.63, + "grad_norm": 0.3233328725092727, + "learning_rate": 6.377378155535625e-06, + "loss": 0.2423, + "step": 10956 + }, + { + "epoch": 0.63, + "grad_norm": 0.2918832243355452, + "learning_rate": 6.375643705858263e-06, + "loss": 0.2093, + "step": 10957 + }, + { + "epoch": 0.63, + "grad_norm": 1.0029877128585176, + "learning_rate": 6.373909381692207e-06, + "loss": 0.5905, + "step": 10958 + }, + { + "epoch": 0.63, + "grad_norm": 0.5027694083886098, + "learning_rate": 6.372175183097511e-06, + "loss": 0.3369, + "step": 10959 + }, + { + "epoch": 0.63, + "grad_norm": 0.3078451694449207, + "learning_rate": 6.370441110134233e-06, + "loss": 0.2701, + "step": 10960 + }, + { + "epoch": 0.63, + "grad_norm": 0.41308557058341894, + "learning_rate": 6.3687071628624244e-06, + "loss": 0.2483, + "step": 10961 + }, + { + "epoch": 0.63, + "grad_norm": 0.45906823035018285, + "learning_rate": 6.366973341342128e-06, + "loss": 0.3574, + "step": 10962 + }, + { + "epoch": 0.63, + "grad_norm": 0.34703098652586084, + "learning_rate": 6.365239645633392e-06, + "loss": 0.2705, + "step": 10963 + }, + { + "epoch": 0.63, + "grad_norm": 0.26359333923079414, + "learning_rate": 6.3635060757962485e-06, + "loss": 0.1914, + "step": 10964 + }, + { + "epoch": 0.63, + "grad_norm": 0.5789237996497145, + "learning_rate": 6.361772631890735e-06, + "loss": 0.3915, + "step": 10965 + }, + { + "epoch": 0.63, + "grad_norm": 0.3167200194383066, + "learning_rate": 6.360039313976875e-06, + "loss": 0.2645, + "step": 10966 + }, + { + "epoch": 0.63, + "grad_norm": 1.3052671121934587, + "learning_rate": 6.3583061221147015e-06, + "loss": 0.2044, + "step": 10967 + }, + { + "epoch": 0.63, + "grad_norm": 0.3444639163727557, + "learning_rate": 6.356573056364227e-06, + "loss": 0.3192, + "step": 10968 + }, + { + "epoch": 0.63, + "grad_norm": 0.5307240095449588, + "learning_rate": 6.354840116785473e-06, + "loss": 0.3267, + "step": 10969 + }, + { + "epoch": 0.63, + "grad_norm": 0.25389779069515805, + "learning_rate": 6.353107303438447e-06, + "loss": 0.1643, + "step": 10970 + }, + { + "epoch": 0.63, + "grad_norm": 0.4045083838798069, + "learning_rate": 6.351374616383161e-06, + "loss": 0.3156, + "step": 10971 + }, + { + "epoch": 0.63, + "grad_norm": 0.6028023578671253, + "learning_rate": 6.349642055679613e-06, + "loss": 0.3478, + "step": 10972 + }, + { + "epoch": 0.63, + "grad_norm": 0.4182115635796257, + "learning_rate": 6.347909621387809e-06, + "loss": 0.2903, + "step": 10973 + }, + { + "epoch": 0.63, + "grad_norm": 1.0543676184663318, + "learning_rate": 6.346177313567732e-06, + "loss": 0.4733, + "step": 10974 + }, + { + "epoch": 0.63, + "grad_norm": 0.2926592072070201, + "learning_rate": 6.34444513227938e-06, + "loss": 0.2497, + "step": 10975 + }, + { + "epoch": 0.63, + "grad_norm": 0.24753098131598456, + "learning_rate": 6.342713077582733e-06, + "loss": 0.2406, + "step": 10976 + }, + { + "epoch": 0.63, + "grad_norm": 0.5753767227553651, + "learning_rate": 6.340981149537777e-06, + "loss": 0.3157, + "step": 10977 + }, + { + "epoch": 0.63, + "grad_norm": 0.3953220853323611, + "learning_rate": 6.339249348204485e-06, + "loss": 0.2584, + "step": 10978 + }, + { + "epoch": 0.63, + "grad_norm": 0.5473609875755407, + "learning_rate": 6.337517673642833e-06, + "loss": 0.3299, + "step": 10979 + }, + { + "epoch": 0.63, + "grad_norm": 0.3745830109479958, + "learning_rate": 6.335786125912784e-06, + "loss": 0.2747, + "step": 10980 + }, + { + "epoch": 0.63, + "grad_norm": 0.40675845180689163, + "learning_rate": 6.3340547050743055e-06, + "loss": 0.3007, + "step": 10981 + }, + { + "epoch": 0.63, + "grad_norm": 0.690730580115114, + "learning_rate": 6.332323411187353e-06, + "loss": 0.3359, + "step": 10982 + }, + { + "epoch": 0.63, + "grad_norm": 0.2285847486188215, + "learning_rate": 6.330592244311885e-06, + "loss": 0.1904, + "step": 10983 + }, + { + "epoch": 0.63, + "grad_norm": 0.3287690816760705, + "learning_rate": 6.328861204507848e-06, + "loss": 0.2549, + "step": 10984 + }, + { + "epoch": 0.63, + "grad_norm": 1.4816162350887843, + "learning_rate": 6.327130291835192e-06, + "loss": 0.8568, + "step": 10985 + }, + { + "epoch": 0.63, + "grad_norm": 1.1107702877033343, + "learning_rate": 6.325399506353855e-06, + "loss": 0.8058, + "step": 10986 + }, + { + "epoch": 0.63, + "grad_norm": 0.32706815193793987, + "learning_rate": 6.323668848123774e-06, + "loss": 0.19, + "step": 10987 + }, + { + "epoch": 0.63, + "grad_norm": 0.3039169646645536, + "learning_rate": 6.321938317204886e-06, + "loss": 0.2826, + "step": 10988 + }, + { + "epoch": 0.63, + "grad_norm": 0.34350298686577224, + "learning_rate": 6.320207913657111e-06, + "loss": 0.223, + "step": 10989 + }, + { + "epoch": 0.63, + "grad_norm": 0.43173203039681673, + "learning_rate": 6.3184776375403814e-06, + "loss": 0.1458, + "step": 10990 + }, + { + "epoch": 0.63, + "grad_norm": 0.32354358590641585, + "learning_rate": 6.3167474889146096e-06, + "loss": 0.292, + "step": 10991 + }, + { + "epoch": 0.63, + "grad_norm": 0.4753435688955507, + "learning_rate": 6.315017467839717e-06, + "loss": 0.3936, + "step": 10992 + }, + { + "epoch": 0.63, + "grad_norm": 0.37915678416069587, + "learning_rate": 6.313287574375609e-06, + "loss": 0.2156, + "step": 10993 + }, + { + "epoch": 0.63, + "grad_norm": 0.4090258131741896, + "learning_rate": 6.311557808582196e-06, + "loss": 0.3467, + "step": 10994 + }, + { + "epoch": 0.63, + "grad_norm": 0.3414960965925519, + "learning_rate": 6.309828170519376e-06, + "loss": 0.2537, + "step": 10995 + }, + { + "epoch": 0.63, + "grad_norm": 0.2555111542785431, + "learning_rate": 6.308098660247049e-06, + "loss": 0.1574, + "step": 10996 + }, + { + "epoch": 0.63, + "grad_norm": 0.53907874280059, + "learning_rate": 6.306369277825104e-06, + "loss": 0.3552, + "step": 10997 + }, + { + "epoch": 0.63, + "grad_norm": 0.8653136044186649, + "learning_rate": 6.304640023313435e-06, + "loss": 0.5482, + "step": 10998 + }, + { + "epoch": 0.63, + "grad_norm": 0.2936106670828241, + "learning_rate": 6.302910896771921e-06, + "loss": 0.2673, + "step": 10999 + }, + { + "epoch": 0.63, + "grad_norm": 0.3666631543606978, + "learning_rate": 6.301181898260444e-06, + "loss": 0.2542, + "step": 11000 + }, + { + "epoch": 0.63, + "grad_norm": 0.42116700454476547, + "learning_rate": 6.299453027838881e-06, + "loss": 0.2626, + "step": 11001 + }, + { + "epoch": 0.63, + "grad_norm": 0.33832310295296864, + "learning_rate": 6.297724285567098e-06, + "loss": 0.2159, + "step": 11002 + }, + { + "epoch": 0.63, + "grad_norm": 0.3384603298577127, + "learning_rate": 6.295995671504965e-06, + "loss": 0.2478, + "step": 11003 + }, + { + "epoch": 0.63, + "grad_norm": 0.4787703629140393, + "learning_rate": 6.294267185712342e-06, + "loss": 0.361, + "step": 11004 + }, + { + "epoch": 0.63, + "grad_norm": 0.588450481399667, + "learning_rate": 6.292538828249087e-06, + "loss": 0.355, + "step": 11005 + }, + { + "epoch": 0.63, + "grad_norm": 0.32064689647937217, + "learning_rate": 6.290810599175052e-06, + "loss": 0.234, + "step": 11006 + }, + { + "epoch": 0.63, + "grad_norm": 0.3023253847261538, + "learning_rate": 6.289082498550091e-06, + "loss": 0.2633, + "step": 11007 + }, + { + "epoch": 0.63, + "grad_norm": 0.33301628269246675, + "learning_rate": 6.287354526434042e-06, + "loss": 0.1976, + "step": 11008 + }, + { + "epoch": 0.63, + "grad_norm": 0.3279184303807924, + "learning_rate": 6.285626682886743e-06, + "loss": 0.2236, + "step": 11009 + }, + { + "epoch": 0.63, + "grad_norm": 0.7138001020794481, + "learning_rate": 6.283898967968034e-06, + "loss": 0.4225, + "step": 11010 + }, + { + "epoch": 0.63, + "grad_norm": 0.35085811541230716, + "learning_rate": 6.282171381737742e-06, + "loss": 0.2892, + "step": 11011 + }, + { + "epoch": 0.63, + "grad_norm": 0.35646638300198263, + "learning_rate": 6.280443924255697e-06, + "loss": 0.3308, + "step": 11012 + }, + { + "epoch": 0.63, + "grad_norm": 0.9594251689271127, + "learning_rate": 6.27871659558172e-06, + "loss": 0.2799, + "step": 11013 + }, + { + "epoch": 0.63, + "grad_norm": 0.2198364833708466, + "learning_rate": 6.276989395775625e-06, + "loss": 0.1496, + "step": 11014 + }, + { + "epoch": 0.63, + "grad_norm": 0.28285740443505136, + "learning_rate": 6.275262324897229e-06, + "loss": 0.2867, + "step": 11015 + }, + { + "epoch": 0.63, + "grad_norm": 0.7428097831158089, + "learning_rate": 6.273535383006336e-06, + "loss": 0.319, + "step": 11016 + }, + { + "epoch": 0.63, + "grad_norm": 0.5076884926327849, + "learning_rate": 6.271808570162754e-06, + "loss": 0.3514, + "step": 11017 + }, + { + "epoch": 0.63, + "grad_norm": 0.3783077303287608, + "learning_rate": 6.27008188642628e-06, + "loss": 0.3001, + "step": 11018 + }, + { + "epoch": 0.63, + "grad_norm": 0.3307956499614347, + "learning_rate": 6.268355331856713e-06, + "loss": 0.2285, + "step": 11019 + }, + { + "epoch": 0.63, + "grad_norm": 0.2362548513026544, + "learning_rate": 6.266628906513836e-06, + "loss": 0.1491, + "step": 11020 + }, + { + "epoch": 0.63, + "grad_norm": 0.42279259430228205, + "learning_rate": 6.264902610457442e-06, + "loss": 0.316, + "step": 11021 + }, + { + "epoch": 0.63, + "grad_norm": 0.575956482245312, + "learning_rate": 6.263176443747309e-06, + "loss": 0.3045, + "step": 11022 + }, + { + "epoch": 0.63, + "grad_norm": 0.48818995665095016, + "learning_rate": 6.261450406443217e-06, + "loss": 0.3887, + "step": 11023 + }, + { + "epoch": 0.63, + "grad_norm": 0.3348148241071795, + "learning_rate": 6.259724498604933e-06, + "loss": 0.2703, + "step": 11024 + }, + { + "epoch": 0.63, + "grad_norm": 1.1273760167300912, + "learning_rate": 6.257998720292233e-06, + "loss": 0.5308, + "step": 11025 + }, + { + "epoch": 0.63, + "grad_norm": 0.1848169085706313, + "learning_rate": 6.256273071564874e-06, + "loss": 0.088, + "step": 11026 + }, + { + "epoch": 0.63, + "grad_norm": 0.25545609687182125, + "learning_rate": 6.254547552482617e-06, + "loss": 0.2522, + "step": 11027 + }, + { + "epoch": 0.63, + "grad_norm": 0.7623754548706887, + "learning_rate": 6.25282216310522e-06, + "loss": 0.368, + "step": 11028 + }, + { + "epoch": 0.63, + "grad_norm": 0.4726985840995557, + "learning_rate": 6.2510969034924265e-06, + "loss": 0.3078, + "step": 11029 + }, + { + "epoch": 0.63, + "grad_norm": 0.3572847810692785, + "learning_rate": 6.249371773703989e-06, + "loss": 0.276, + "step": 11030 + }, + { + "epoch": 0.63, + "grad_norm": 0.3708677390096252, + "learning_rate": 6.247646773799645e-06, + "loss": 0.3205, + "step": 11031 + }, + { + "epoch": 0.63, + "grad_norm": 0.25304006716718513, + "learning_rate": 6.245921903839132e-06, + "loss": 0.123, + "step": 11032 + }, + { + "epoch": 0.63, + "grad_norm": 0.3925028301065731, + "learning_rate": 6.24419716388218e-06, + "loss": 0.2976, + "step": 11033 + }, + { + "epoch": 0.63, + "grad_norm": 0.8941776485349109, + "learning_rate": 6.242472553988521e-06, + "loss": 0.4912, + "step": 11034 + }, + { + "epoch": 0.63, + "grad_norm": 0.3501794684964001, + "learning_rate": 6.240748074217875e-06, + "loss": 0.3189, + "step": 11035 + }, + { + "epoch": 0.63, + "grad_norm": 0.3330008477936822, + "learning_rate": 6.239023724629962e-06, + "loss": 0.217, + "step": 11036 + }, + { + "epoch": 0.63, + "grad_norm": 1.1036007794647276, + "learning_rate": 6.237299505284495e-06, + "loss": 0.616, + "step": 11037 + }, + { + "epoch": 0.63, + "grad_norm": 0.3554645171715663, + "learning_rate": 6.235575416241185e-06, + "loss": 0.3099, + "step": 11038 + }, + { + "epoch": 0.63, + "grad_norm": 0.26021400716218845, + "learning_rate": 6.233851457559736e-06, + "loss": 0.2198, + "step": 11039 + }, + { + "epoch": 0.63, + "grad_norm": 0.35160884390360947, + "learning_rate": 6.232127629299849e-06, + "loss": 0.2377, + "step": 11040 + }, + { + "epoch": 0.63, + "grad_norm": 1.0184502874359644, + "learning_rate": 6.230403931521224e-06, + "loss": 0.7252, + "step": 11041 + }, + { + "epoch": 0.63, + "grad_norm": 0.27161208909389467, + "learning_rate": 6.228680364283546e-06, + "loss": 0.1799, + "step": 11042 + }, + { + "epoch": 0.63, + "grad_norm": 0.3497995921510243, + "learning_rate": 6.226956927646504e-06, + "loss": 0.3126, + "step": 11043 + }, + { + "epoch": 0.63, + "grad_norm": 0.6248331696318262, + "learning_rate": 6.225233621669782e-06, + "loss": 0.4057, + "step": 11044 + }, + { + "epoch": 0.63, + "grad_norm": 0.3119749624889606, + "learning_rate": 6.2235104464130545e-06, + "loss": 0.2302, + "step": 11045 + }, + { + "epoch": 0.63, + "grad_norm": 0.3756060506238427, + "learning_rate": 6.221787401936002e-06, + "loss": 0.2444, + "step": 11046 + }, + { + "epoch": 0.63, + "grad_norm": 0.34670055217013945, + "learning_rate": 6.220064488298285e-06, + "loss": 0.3051, + "step": 11047 + }, + { + "epoch": 0.63, + "grad_norm": 0.249174577275586, + "learning_rate": 6.2183417055595765e-06, + "loss": 0.2063, + "step": 11048 + }, + { + "epoch": 0.63, + "grad_norm": 1.0123588001422943, + "learning_rate": 6.216619053779529e-06, + "loss": 0.2817, + "step": 11049 + }, + { + "epoch": 0.63, + "grad_norm": 0.36445402991388354, + "learning_rate": 6.214896533017803e-06, + "loss": 0.3022, + "step": 11050 + }, + { + "epoch": 0.63, + "grad_norm": 0.3092688596231202, + "learning_rate": 6.213174143334046e-06, + "loss": 0.2614, + "step": 11051 + }, + { + "epoch": 0.63, + "grad_norm": 0.25530111498148117, + "learning_rate": 6.211451884787907e-06, + "loss": 0.1012, + "step": 11052 + }, + { + "epoch": 0.64, + "grad_norm": 0.4294403584004139, + "learning_rate": 6.209729757439026e-06, + "loss": 0.3003, + "step": 11053 + }, + { + "epoch": 0.64, + "grad_norm": 0.4008704554780664, + "learning_rate": 6.208007761347039e-06, + "loss": 0.2886, + "step": 11054 + }, + { + "epoch": 0.64, + "grad_norm": 0.27196718819293497, + "learning_rate": 6.206285896571582e-06, + "loss": 0.24, + "step": 11055 + }, + { + "epoch": 0.64, + "grad_norm": 0.5460900718518568, + "learning_rate": 6.20456416317228e-06, + "loss": 0.332, + "step": 11056 + }, + { + "epoch": 0.64, + "grad_norm": 0.369685863562834, + "learning_rate": 6.202842561208759e-06, + "loss": 0.2815, + "step": 11057 + }, + { + "epoch": 0.64, + "grad_norm": 0.5100637075537648, + "learning_rate": 6.201121090740634e-06, + "loss": 0.2929, + "step": 11058 + }, + { + "epoch": 0.64, + "grad_norm": 0.316403402827174, + "learning_rate": 6.199399751827525e-06, + "loss": 0.2596, + "step": 11059 + }, + { + "epoch": 0.64, + "grad_norm": 0.3212154547207368, + "learning_rate": 6.197678544529037e-06, + "loss": 0.2387, + "step": 11060 + }, + { + "epoch": 0.64, + "grad_norm": 0.5344441277836296, + "learning_rate": 6.195957468904781e-06, + "loss": 0.3082, + "step": 11061 + }, + { + "epoch": 0.64, + "grad_norm": 0.3320258149067593, + "learning_rate": 6.19423652501435e-06, + "loss": 0.2633, + "step": 11062 + }, + { + "epoch": 0.64, + "grad_norm": 0.3367919993055683, + "learning_rate": 6.192515712917348e-06, + "loss": 0.2523, + "step": 11063 + }, + { + "epoch": 0.64, + "grad_norm": 1.0809686276937551, + "learning_rate": 6.19079503267336e-06, + "loss": 0.6319, + "step": 11064 + }, + { + "epoch": 0.64, + "grad_norm": 0.19407826999737932, + "learning_rate": 6.189074484341979e-06, + "loss": 0.098, + "step": 11065 + }, + { + "epoch": 0.64, + "grad_norm": 0.31577201724467685, + "learning_rate": 6.187354067982785e-06, + "loss": 0.2449, + "step": 11066 + }, + { + "epoch": 0.64, + "grad_norm": 0.3403924479356509, + "learning_rate": 6.185633783655354e-06, + "loss": 0.302, + "step": 11067 + }, + { + "epoch": 0.64, + "grad_norm": 0.6402462185739675, + "learning_rate": 6.183913631419263e-06, + "loss": 0.3312, + "step": 11068 + }, + { + "epoch": 0.64, + "grad_norm": 0.3382422319781781, + "learning_rate": 6.182193611334075e-06, + "loss": 0.258, + "step": 11069 + }, + { + "epoch": 0.64, + "grad_norm": 1.2736662713881572, + "learning_rate": 6.180473723459361e-06, + "loss": 0.7406, + "step": 11070 + }, + { + "epoch": 0.64, + "grad_norm": 0.25041480842921693, + "learning_rate": 6.178753967854677e-06, + "loss": 0.2126, + "step": 11071 + }, + { + "epoch": 0.64, + "grad_norm": 0.4063472874123882, + "learning_rate": 6.17703434457958e-06, + "loss": 0.2927, + "step": 11072 + }, + { + "epoch": 0.64, + "grad_norm": 0.586628542560072, + "learning_rate": 6.175314853693617e-06, + "loss": 0.3441, + "step": 11073 + }, + { + "epoch": 0.64, + "grad_norm": 0.27702603918539737, + "learning_rate": 6.173595495256338e-06, + "loss": 0.2467, + "step": 11074 + }, + { + "epoch": 0.64, + "grad_norm": 0.35689733543792085, + "learning_rate": 6.1718762693272846e-06, + "loss": 0.1613, + "step": 11075 + }, + { + "epoch": 0.64, + "grad_norm": 0.46985107766144624, + "learning_rate": 6.170157175965988e-06, + "loss": 0.3795, + "step": 11076 + }, + { + "epoch": 0.64, + "grad_norm": 0.7798022987856814, + "learning_rate": 6.168438215231984e-06, + "loss": 0.499, + "step": 11077 + }, + { + "epoch": 0.64, + "grad_norm": 0.2888764256268748, + "learning_rate": 6.166719387184802e-06, + "loss": 0.1906, + "step": 11078 + }, + { + "epoch": 0.64, + "grad_norm": 0.32604639861053947, + "learning_rate": 6.16500069188396e-06, + "loss": 0.3296, + "step": 11079 + }, + { + "epoch": 0.64, + "grad_norm": 0.2507700628659636, + "learning_rate": 6.163282129388981e-06, + "loss": 0.1362, + "step": 11080 + }, + { + "epoch": 0.64, + "grad_norm": 0.31590120501365204, + "learning_rate": 6.1615636997593745e-06, + "loss": 0.2005, + "step": 11081 + }, + { + "epoch": 0.64, + "grad_norm": 0.3610564542876327, + "learning_rate": 6.159845403054654e-06, + "loss": 0.332, + "step": 11082 + }, + { + "epoch": 0.64, + "grad_norm": 1.2608533543659348, + "learning_rate": 6.15812723933432e-06, + "loss": 0.7569, + "step": 11083 + }, + { + "epoch": 0.64, + "grad_norm": 0.32773053299135346, + "learning_rate": 6.1564092086578765e-06, + "loss": 0.2223, + "step": 11084 + }, + { + "epoch": 0.64, + "grad_norm": 0.8451695808115459, + "learning_rate": 6.154691311084816e-06, + "loss": 0.4394, + "step": 11085 + }, + { + "epoch": 0.64, + "grad_norm": 0.21285824327119043, + "learning_rate": 6.152973546674631e-06, + "loss": 0.2111, + "step": 11086 + }, + { + "epoch": 0.64, + "grad_norm": 0.3162598405639108, + "learning_rate": 6.151255915486804e-06, + "loss": 0.2425, + "step": 11087 + }, + { + "epoch": 0.64, + "grad_norm": 0.9304264514798442, + "learning_rate": 6.1495384175808224e-06, + "loss": 0.3778, + "step": 11088 + }, + { + "epoch": 0.64, + "grad_norm": 0.7140149435298279, + "learning_rate": 6.147821053016159e-06, + "loss": 0.4178, + "step": 11089 + }, + { + "epoch": 0.64, + "grad_norm": 0.3320040093447702, + "learning_rate": 6.146103821852286e-06, + "loss": 0.245, + "step": 11090 + }, + { + "epoch": 0.64, + "grad_norm": 0.35658129823456214, + "learning_rate": 6.144386724148674e-06, + "loss": 0.2434, + "step": 11091 + }, + { + "epoch": 0.64, + "grad_norm": 0.27249855103507575, + "learning_rate": 6.142669759964781e-06, + "loss": 0.1735, + "step": 11092 + }, + { + "epoch": 0.64, + "grad_norm": 0.5892126845666474, + "learning_rate": 6.140952929360071e-06, + "loss": 0.3012, + "step": 11093 + }, + { + "epoch": 0.64, + "grad_norm": 0.24465733364449624, + "learning_rate": 6.139236232393993e-06, + "loss": 0.2379, + "step": 11094 + }, + { + "epoch": 0.64, + "grad_norm": 0.9972780676501973, + "learning_rate": 6.137519669126e-06, + "loss": 0.4295, + "step": 11095 + }, + { + "epoch": 0.64, + "grad_norm": 0.4816808218850385, + "learning_rate": 6.135803239615532e-06, + "loss": 0.337, + "step": 11096 + }, + { + "epoch": 0.64, + "grad_norm": 0.2635477301210972, + "learning_rate": 6.134086943922034e-06, + "loss": 0.183, + "step": 11097 + }, + { + "epoch": 0.64, + "grad_norm": 0.2615090099141512, + "learning_rate": 6.132370782104937e-06, + "loss": 0.2292, + "step": 11098 + }, + { + "epoch": 0.64, + "grad_norm": 0.38708784016928843, + "learning_rate": 6.130654754223676e-06, + "loss": 0.2688, + "step": 11099 + }, + { + "epoch": 0.64, + "grad_norm": 0.5112058574827448, + "learning_rate": 6.128938860337672e-06, + "loss": 0.3646, + "step": 11100 + }, + { + "epoch": 0.64, + "grad_norm": 0.7461043466785471, + "learning_rate": 6.127223100506351e-06, + "loss": 0.3124, + "step": 11101 + }, + { + "epoch": 0.64, + "grad_norm": 0.27170610461361683, + "learning_rate": 6.125507474789125e-06, + "loss": 0.2624, + "step": 11102 + }, + { + "epoch": 0.64, + "grad_norm": 0.5161349562940573, + "learning_rate": 6.123791983245411e-06, + "loss": 0.3863, + "step": 11103 + }, + { + "epoch": 0.64, + "grad_norm": 0.21589302078484102, + "learning_rate": 6.122076625934612e-06, + "loss": 0.1027, + "step": 11104 + }, + { + "epoch": 0.64, + "grad_norm": 0.3764534243966187, + "learning_rate": 6.120361402916135e-06, + "loss": 0.2836, + "step": 11105 + }, + { + "epoch": 0.64, + "grad_norm": 0.37975502883, + "learning_rate": 6.118646314249376e-06, + "loss": 0.3184, + "step": 11106 + }, + { + "epoch": 0.64, + "grad_norm": 0.6739943039680442, + "learning_rate": 6.116931359993725e-06, + "loss": 0.3041, + "step": 11107 + }, + { + "epoch": 0.64, + "grad_norm": 0.37209910595351725, + "learning_rate": 6.115216540208577e-06, + "loss": 0.2577, + "step": 11108 + }, + { + "epoch": 0.64, + "grad_norm": 0.5528497756841371, + "learning_rate": 6.1135018549533146e-06, + "loss": 0.3213, + "step": 11109 + }, + { + "epoch": 0.64, + "grad_norm": 0.21630491935510923, + "learning_rate": 6.111787304287312e-06, + "loss": 0.1668, + "step": 11110 + }, + { + "epoch": 0.64, + "grad_norm": 0.6091154648621666, + "learning_rate": 6.11007288826995e-06, + "loss": 0.3458, + "step": 11111 + }, + { + "epoch": 0.64, + "grad_norm": 0.35893455503756927, + "learning_rate": 6.108358606960595e-06, + "loss": 0.31, + "step": 11112 + }, + { + "epoch": 0.64, + "grad_norm": 0.7157112727794527, + "learning_rate": 6.1066444604186156e-06, + "loss": 0.4223, + "step": 11113 + }, + { + "epoch": 0.64, + "grad_norm": 0.26562757365475465, + "learning_rate": 6.104930448703369e-06, + "loss": 0.2132, + "step": 11114 + }, + { + "epoch": 0.64, + "grad_norm": 0.3765400989158581, + "learning_rate": 6.1032165718742154e-06, + "loss": 0.296, + "step": 11115 + }, + { + "epoch": 0.64, + "grad_norm": 0.4711689009780174, + "learning_rate": 6.1015028299905025e-06, + "loss": 0.2469, + "step": 11116 + }, + { + "epoch": 0.64, + "grad_norm": 0.27771652468962005, + "learning_rate": 6.0997892231115805e-06, + "loss": 0.1337, + "step": 11117 + }, + { + "epoch": 0.64, + "grad_norm": 0.2506367775544241, + "learning_rate": 6.098075751296792e-06, + "loss": 0.272, + "step": 11118 + }, + { + "epoch": 0.64, + "grad_norm": 0.7113697594382398, + "learning_rate": 6.096362414605468e-06, + "loss": 0.4545, + "step": 11119 + }, + { + "epoch": 0.64, + "grad_norm": 0.31511420097201853, + "learning_rate": 6.0946492130969494e-06, + "loss": 0.1979, + "step": 11120 + }, + { + "epoch": 0.64, + "grad_norm": 0.48430785170819585, + "learning_rate": 6.092936146830557e-06, + "loss": 0.3365, + "step": 11121 + }, + { + "epoch": 0.64, + "grad_norm": 0.34387547840728383, + "learning_rate": 6.091223215865621e-06, + "loss": 0.2893, + "step": 11122 + }, + { + "epoch": 0.64, + "grad_norm": 0.23731111105713615, + "learning_rate": 6.089510420261455e-06, + "loss": 0.1881, + "step": 11123 + }, + { + "epoch": 0.64, + "grad_norm": 0.3714432234815818, + "learning_rate": 6.087797760077376e-06, + "loss": 0.265, + "step": 11124 + }, + { + "epoch": 0.64, + "grad_norm": 0.7696874757906164, + "learning_rate": 6.086085235372692e-06, + "loss": 0.488, + "step": 11125 + }, + { + "epoch": 0.64, + "grad_norm": 0.29488211422426086, + "learning_rate": 6.084372846206709e-06, + "loss": 0.2643, + "step": 11126 + }, + { + "epoch": 0.64, + "grad_norm": 0.3843671048783373, + "learning_rate": 6.0826605926387226e-06, + "loss": 0.2295, + "step": 11127 + }, + { + "epoch": 0.64, + "grad_norm": 0.28084906133884785, + "learning_rate": 6.080948474728036e-06, + "loss": 0.1574, + "step": 11128 + }, + { + "epoch": 0.64, + "grad_norm": 0.8237786686226553, + "learning_rate": 6.079236492533931e-06, + "loss": 0.3501, + "step": 11129 + }, + { + "epoch": 0.64, + "grad_norm": 0.2653302823814511, + "learning_rate": 6.077524646115701e-06, + "loss": 0.2377, + "step": 11130 + }, + { + "epoch": 0.64, + "grad_norm": 0.790921659400872, + "learning_rate": 6.075812935532623e-06, + "loss": 0.5031, + "step": 11131 + }, + { + "epoch": 0.64, + "grad_norm": 0.732432702456682, + "learning_rate": 6.074101360843973e-06, + "loss": 0.4053, + "step": 11132 + }, + { + "epoch": 0.64, + "grad_norm": 0.3450032202174695, + "learning_rate": 6.072389922109027e-06, + "loss": 0.2106, + "step": 11133 + }, + { + "epoch": 0.64, + "grad_norm": 0.3755738306929151, + "learning_rate": 6.070678619387045e-06, + "loss": 0.2833, + "step": 11134 + }, + { + "epoch": 0.64, + "grad_norm": 0.923733611804049, + "learning_rate": 6.068967452737296e-06, + "loss": 0.3767, + "step": 11135 + }, + { + "epoch": 0.64, + "grad_norm": 0.2933452155571611, + "learning_rate": 6.067256422219034e-06, + "loss": 0.2224, + "step": 11136 + }, + { + "epoch": 0.64, + "grad_norm": 0.29408326213010355, + "learning_rate": 6.065545527891514e-06, + "loss": 0.1626, + "step": 11137 + }, + { + "epoch": 0.64, + "grad_norm": 0.3024053064991005, + "learning_rate": 6.063834769813982e-06, + "loss": 0.2829, + "step": 11138 + }, + { + "epoch": 0.64, + "grad_norm": 0.441073941280975, + "learning_rate": 6.062124148045685e-06, + "loss": 0.3102, + "step": 11139 + }, + { + "epoch": 0.64, + "grad_norm": 1.033460670183201, + "learning_rate": 6.060413662645856e-06, + "loss": 0.3068, + "step": 11140 + }, + { + "epoch": 0.64, + "grad_norm": 0.3800496285695335, + "learning_rate": 6.058703313673735e-06, + "loss": 0.3042, + "step": 11141 + }, + { + "epoch": 0.64, + "grad_norm": 0.3069885259774946, + "learning_rate": 6.0569931011885504e-06, + "loss": 0.2701, + "step": 11142 + }, + { + "epoch": 0.64, + "grad_norm": 0.261025683645621, + "learning_rate": 6.055283025249526e-06, + "loss": 0.1452, + "step": 11143 + }, + { + "epoch": 0.64, + "grad_norm": 0.30579045165578644, + "learning_rate": 6.053573085915875e-06, + "loss": 0.1788, + "step": 11144 + }, + { + "epoch": 0.64, + "grad_norm": 0.41347157047586297, + "learning_rate": 6.0518632832468215e-06, + "loss": 0.3075, + "step": 11145 + }, + { + "epoch": 0.64, + "grad_norm": 0.509072555332188, + "learning_rate": 6.050153617301571e-06, + "loss": 0.2722, + "step": 11146 + }, + { + "epoch": 0.64, + "grad_norm": 1.0137824837962006, + "learning_rate": 6.048444088139334e-06, + "loss": 0.4353, + "step": 11147 + }, + { + "epoch": 0.64, + "grad_norm": 0.32813531727045664, + "learning_rate": 6.0467346958193056e-06, + "loss": 0.253, + "step": 11148 + }, + { + "epoch": 0.64, + "grad_norm": 0.3297149666866975, + "learning_rate": 6.045025440400684e-06, + "loss": 0.2803, + "step": 11149 + }, + { + "epoch": 0.64, + "grad_norm": 0.2517094708835535, + "learning_rate": 6.043316321942663e-06, + "loss": 0.1736, + "step": 11150 + }, + { + "epoch": 0.64, + "grad_norm": 0.36579356484526454, + "learning_rate": 6.0416073405044274e-06, + "loss": 0.2721, + "step": 11151 + }, + { + "epoch": 0.64, + "grad_norm": 1.009514354771467, + "learning_rate": 6.039898496145159e-06, + "loss": 0.4483, + "step": 11152 + }, + { + "epoch": 0.64, + "grad_norm": 0.46789848161172504, + "learning_rate": 6.038189788924036e-06, + "loss": 0.2603, + "step": 11153 + }, + { + "epoch": 0.64, + "grad_norm": 0.28716040798789005, + "learning_rate": 6.03648121890023e-06, + "loss": 0.2665, + "step": 11154 + }, + { + "epoch": 0.64, + "grad_norm": 1.2217392161585139, + "learning_rate": 6.03477278613291e-06, + "loss": 0.7532, + "step": 11155 + }, + { + "epoch": 0.64, + "grad_norm": 0.20730714099122807, + "learning_rate": 6.033064490681238e-06, + "loss": 0.1146, + "step": 11156 + }, + { + "epoch": 0.64, + "grad_norm": 0.4212369317329507, + "learning_rate": 6.031356332604369e-06, + "loss": 0.2847, + "step": 11157 + }, + { + "epoch": 0.64, + "grad_norm": 0.3950415550882442, + "learning_rate": 6.029648311961462e-06, + "loss": 0.3112, + "step": 11158 + }, + { + "epoch": 0.64, + "grad_norm": 0.5599324152074959, + "learning_rate": 6.027940428811662e-06, + "loss": 0.1964, + "step": 11159 + }, + { + "epoch": 0.64, + "grad_norm": 0.3724627817967568, + "learning_rate": 6.026232683214115e-06, + "loss": 0.2767, + "step": 11160 + }, + { + "epoch": 0.64, + "grad_norm": 0.4752167622283187, + "learning_rate": 6.024525075227959e-06, + "loss": 0.3687, + "step": 11161 + }, + { + "epoch": 0.64, + "grad_norm": 0.21954940331324588, + "learning_rate": 6.02281760491233e-06, + "loss": 0.1958, + "step": 11162 + }, + { + "epoch": 0.64, + "grad_norm": 0.33151403738859675, + "learning_rate": 6.021110272326354e-06, + "loss": 0.2124, + "step": 11163 + }, + { + "epoch": 0.64, + "grad_norm": 0.515384369202005, + "learning_rate": 6.0194030775291605e-06, + "loss": 0.3731, + "step": 11164 + }, + { + "epoch": 0.64, + "grad_norm": 0.4626758506358267, + "learning_rate": 6.017696020579864e-06, + "loss": 0.3026, + "step": 11165 + }, + { + "epoch": 0.64, + "grad_norm": 0.26893646286578476, + "learning_rate": 6.015989101537586e-06, + "loss": 0.2204, + "step": 11166 + }, + { + "epoch": 0.64, + "grad_norm": 1.1058572603376675, + "learning_rate": 6.0142823204614335e-06, + "loss": 0.703, + "step": 11167 + }, + { + "epoch": 0.64, + "grad_norm": 0.3964445682878911, + "learning_rate": 6.012575677410512e-06, + "loss": 0.2333, + "step": 11168 + }, + { + "epoch": 0.64, + "grad_norm": 0.27704603202525324, + "learning_rate": 6.010869172443923e-06, + "loss": 0.2202, + "step": 11169 + }, + { + "epoch": 0.64, + "grad_norm": 0.3433573699670649, + "learning_rate": 6.0091628056207655e-06, + "loss": 0.2727, + "step": 11170 + }, + { + "epoch": 0.64, + "grad_norm": 0.9905163871846923, + "learning_rate": 6.007456577000128e-06, + "loss": 0.4821, + "step": 11171 + }, + { + "epoch": 0.64, + "grad_norm": 0.35401947805745243, + "learning_rate": 6.005750486641095e-06, + "loss": 0.2056, + "step": 11172 + }, + { + "epoch": 0.64, + "grad_norm": 0.5022193592434819, + "learning_rate": 6.004044534602753e-06, + "loss": 0.3671, + "step": 11173 + }, + { + "epoch": 0.64, + "grad_norm": 0.3889968137655978, + "learning_rate": 6.002338720944174e-06, + "loss": 0.3389, + "step": 11174 + }, + { + "epoch": 0.64, + "grad_norm": 0.32816842990029704, + "learning_rate": 6.000633045724438e-06, + "loss": 0.2712, + "step": 11175 + }, + { + "epoch": 0.64, + "grad_norm": 0.2643109087774337, + "learning_rate": 5.998927509002608e-06, + "loss": 0.136, + "step": 11176 + }, + { + "epoch": 0.64, + "grad_norm": 0.34977099868176076, + "learning_rate": 5.997222110837742e-06, + "loss": 0.2911, + "step": 11177 + }, + { + "epoch": 0.64, + "grad_norm": 0.41325953968084655, + "learning_rate": 5.995516851288904e-06, + "loss": 0.2771, + "step": 11178 + }, + { + "epoch": 0.64, + "grad_norm": 0.5048678658640868, + "learning_rate": 5.9938117304151445e-06, + "loss": 0.3181, + "step": 11179 + }, + { + "epoch": 0.64, + "grad_norm": 0.5855639790646341, + "learning_rate": 5.992106748275513e-06, + "loss": 0.3861, + "step": 11180 + }, + { + "epoch": 0.64, + "grad_norm": 0.3731060722176844, + "learning_rate": 5.990401904929051e-06, + "loss": 0.2846, + "step": 11181 + }, + { + "epoch": 0.64, + "grad_norm": 0.2450716350813472, + "learning_rate": 5.988697200434801e-06, + "loss": 0.1784, + "step": 11182 + }, + { + "epoch": 0.64, + "grad_norm": 1.588488452674224, + "learning_rate": 5.986992634851794e-06, + "loss": 0.7367, + "step": 11183 + }, + { + "epoch": 0.64, + "grad_norm": 0.33778700717578847, + "learning_rate": 5.985288208239057e-06, + "loss": 0.2592, + "step": 11184 + }, + { + "epoch": 0.64, + "grad_norm": 0.38914509602391384, + "learning_rate": 5.98358392065562e-06, + "loss": 0.2963, + "step": 11185 + }, + { + "epoch": 0.64, + "grad_norm": 0.780635173262054, + "learning_rate": 5.981879772160497e-06, + "loss": 0.3924, + "step": 11186 + }, + { + "epoch": 0.64, + "grad_norm": 0.31361656933869875, + "learning_rate": 5.980175762812705e-06, + "loss": 0.2679, + "step": 11187 + }, + { + "epoch": 0.64, + "grad_norm": 0.4583142651536605, + "learning_rate": 5.978471892671254e-06, + "loss": 0.2751, + "step": 11188 + }, + { + "epoch": 0.64, + "grad_norm": 0.2585065185246987, + "learning_rate": 5.976768161795149e-06, + "loss": 0.1919, + "step": 11189 + }, + { + "epoch": 0.64, + "grad_norm": 0.3744518103024148, + "learning_rate": 5.975064570243387e-06, + "loss": 0.2544, + "step": 11190 + }, + { + "epoch": 0.64, + "grad_norm": 1.2344220965569719, + "learning_rate": 5.973361118074969e-06, + "loss": 0.8546, + "step": 11191 + }, + { + "epoch": 0.64, + "grad_norm": 0.9938392858124002, + "learning_rate": 5.97165780534888e-06, + "loss": 0.3239, + "step": 11192 + }, + { + "epoch": 0.64, + "grad_norm": 0.2816535477742484, + "learning_rate": 5.969954632124111e-06, + "loss": 0.248, + "step": 11193 + }, + { + "epoch": 0.64, + "grad_norm": 0.5002961541221576, + "learning_rate": 5.968251598459636e-06, + "loss": 0.3412, + "step": 11194 + }, + { + "epoch": 0.64, + "grad_norm": 0.2849071332633207, + "learning_rate": 5.966548704414436e-06, + "loss": 0.1259, + "step": 11195 + }, + { + "epoch": 0.64, + "grad_norm": 0.38572866452620297, + "learning_rate": 5.964845950047484e-06, + "loss": 0.2577, + "step": 11196 + }, + { + "epoch": 0.64, + "grad_norm": 0.3144011371244218, + "learning_rate": 5.96314333541774e-06, + "loss": 0.2866, + "step": 11197 + }, + { + "epoch": 0.64, + "grad_norm": 1.0859733305344872, + "learning_rate": 5.961440860584169e-06, + "loss": 0.4253, + "step": 11198 + }, + { + "epoch": 0.64, + "grad_norm": 0.31667468432575874, + "learning_rate": 5.959738525605727e-06, + "loss": 0.1599, + "step": 11199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3265305092885602, + "learning_rate": 5.958036330541368e-06, + "loss": 0.2316, + "step": 11200 + }, + { + "epoch": 0.64, + "grad_norm": 0.2730721162752112, + "learning_rate": 5.956334275450035e-06, + "loss": 0.2342, + "step": 11201 + }, + { + "epoch": 0.64, + "grad_norm": 0.315786099286474, + "learning_rate": 5.954632360390673e-06, + "loss": 0.1946, + "step": 11202 + }, + { + "epoch": 0.64, + "grad_norm": 0.8586307279331272, + "learning_rate": 5.9529305854222185e-06, + "loss": 0.4156, + "step": 11203 + }, + { + "epoch": 0.64, + "grad_norm": 0.8222346678946091, + "learning_rate": 5.951228950603605e-06, + "loss": 0.5293, + "step": 11204 + }, + { + "epoch": 0.64, + "grad_norm": 0.26079077888105623, + "learning_rate": 5.949527455993756e-06, + "loss": 0.2104, + "step": 11205 + }, + { + "epoch": 0.64, + "grad_norm": 0.4512259533268354, + "learning_rate": 5.947826101651599e-06, + "loss": 0.3257, + "step": 11206 + }, + { + "epoch": 0.64, + "grad_norm": 0.32723592880119196, + "learning_rate": 5.946124887636049e-06, + "loss": 0.2213, + "step": 11207 + }, + { + "epoch": 0.64, + "grad_norm": 0.29865161019025654, + "learning_rate": 5.944423814006022e-06, + "loss": 0.2226, + "step": 11208 + }, + { + "epoch": 0.64, + "grad_norm": 0.35149010732275404, + "learning_rate": 5.9427228808204216e-06, + "loss": 0.3059, + "step": 11209 + }, + { + "epoch": 0.64, + "grad_norm": 0.7682225597339026, + "learning_rate": 5.941022088138158e-06, + "loss": 0.4489, + "step": 11210 + }, + { + "epoch": 0.64, + "grad_norm": 0.30994659665733854, + "learning_rate": 5.939321436018119e-06, + "loss": 0.2518, + "step": 11211 + }, + { + "epoch": 0.64, + "grad_norm": 0.5923893515591114, + "learning_rate": 5.937620924519207e-06, + "loss": 0.0215, + "step": 11212 + }, + { + "epoch": 0.64, + "grad_norm": 0.22704968985868573, + "learning_rate": 5.935920553700305e-06, + "loss": 0.216, + "step": 11213 + }, + { + "epoch": 0.64, + "grad_norm": 0.5706346113528733, + "learning_rate": 5.934220323620303e-06, + "loss": 0.3533, + "step": 11214 + }, + { + "epoch": 0.64, + "grad_norm": 0.37401496655785443, + "learning_rate": 5.932520234338073e-06, + "loss": 0.2737, + "step": 11215 + }, + { + "epoch": 0.64, + "grad_norm": 0.4734541768133188, + "learning_rate": 5.930820285912495e-06, + "loss": 0.4071, + "step": 11216 + }, + { + "epoch": 0.64, + "grad_norm": 0.3539907526938965, + "learning_rate": 5.9291204784024335e-06, + "loss": 0.2553, + "step": 11217 + }, + { + "epoch": 0.64, + "grad_norm": 0.3659412287344784, + "learning_rate": 5.9274208118667565e-06, + "loss": 0.2484, + "step": 11218 + }, + { + "epoch": 0.64, + "grad_norm": 0.3113800892023193, + "learning_rate": 5.92572128636432e-06, + "loss": 0.1939, + "step": 11219 + }, + { + "epoch": 0.64, + "grad_norm": 0.5020546282550093, + "learning_rate": 5.924021901953983e-06, + "loss": 0.2901, + "step": 11220 + }, + { + "epoch": 0.64, + "grad_norm": 0.286313888428949, + "learning_rate": 5.922322658694591e-06, + "loss": 0.2489, + "step": 11221 + }, + { + "epoch": 0.64, + "grad_norm": 1.2063295574915158, + "learning_rate": 5.920623556644987e-06, + "loss": 0.7685, + "step": 11222 + }, + { + "epoch": 0.64, + "grad_norm": 0.422377259307182, + "learning_rate": 5.918924595864017e-06, + "loss": 0.3199, + "step": 11223 + }, + { + "epoch": 0.64, + "grad_norm": 0.5829496855100268, + "learning_rate": 5.917225776410511e-06, + "loss": 0.3525, + "step": 11224 + }, + { + "epoch": 0.64, + "grad_norm": 0.2512331675050159, + "learning_rate": 5.915527098343302e-06, + "loss": 0.206, + "step": 11225 + }, + { + "epoch": 0.64, + "grad_norm": 0.3745193433663806, + "learning_rate": 5.913828561721214e-06, + "loss": 0.2792, + "step": 11226 + }, + { + "epoch": 0.65, + "grad_norm": 0.3169721307067816, + "learning_rate": 5.912130166603066e-06, + "loss": 0.2322, + "step": 11227 + }, + { + "epoch": 0.65, + "grad_norm": 0.4240458248846178, + "learning_rate": 5.910431913047674e-06, + "loss": 0.2787, + "step": 11228 + }, + { + "epoch": 0.65, + "grad_norm": 0.30928589966688147, + "learning_rate": 5.908733801113851e-06, + "loss": 0.2619, + "step": 11229 + }, + { + "epoch": 0.65, + "grad_norm": 0.48813601012960506, + "learning_rate": 5.907035830860399e-06, + "loss": 0.3456, + "step": 11230 + }, + { + "epoch": 0.65, + "grad_norm": 0.5823485197085368, + "learning_rate": 5.905338002346122e-06, + "loss": 0.314, + "step": 11231 + }, + { + "epoch": 0.65, + "grad_norm": 0.5900398729201102, + "learning_rate": 5.9036403156298125e-06, + "loss": 0.2937, + "step": 11232 + }, + { + "epoch": 0.65, + "grad_norm": 0.23746694604233817, + "learning_rate": 5.901942770770264e-06, + "loss": 0.2338, + "step": 11233 + }, + { + "epoch": 0.65, + "grad_norm": 0.30105969621280293, + "learning_rate": 5.900245367826258e-06, + "loss": 0.1899, + "step": 11234 + }, + { + "epoch": 0.65, + "grad_norm": 0.6304257335183989, + "learning_rate": 5.898548106856583e-06, + "loss": 0.3072, + "step": 11235 + }, + { + "epoch": 0.65, + "grad_norm": 0.4284540690270265, + "learning_rate": 5.896850987920009e-06, + "loss": 0.2833, + "step": 11236 + }, + { + "epoch": 0.65, + "grad_norm": 0.38906981985827127, + "learning_rate": 5.895154011075308e-06, + "loss": 0.3207, + "step": 11237 + }, + { + "epoch": 0.65, + "grad_norm": 0.5208798117829698, + "learning_rate": 5.893457176381248e-06, + "loss": 0.1669, + "step": 11238 + }, + { + "epoch": 0.65, + "grad_norm": 0.39622311730384707, + "learning_rate": 5.891760483896587e-06, + "loss": 0.3249, + "step": 11239 + }, + { + "epoch": 0.65, + "grad_norm": 0.38496094157001054, + "learning_rate": 5.890063933680087e-06, + "loss": 0.2616, + "step": 11240 + }, + { + "epoch": 0.65, + "grad_norm": 0.22651439846232713, + "learning_rate": 5.8883675257904936e-06, + "loss": 0.1797, + "step": 11241 + }, + { + "epoch": 0.65, + "grad_norm": 0.38679464880782893, + "learning_rate": 5.886671260286558e-06, + "loss": 0.3031, + "step": 11242 + }, + { + "epoch": 0.65, + "grad_norm": 0.7959092880908866, + "learning_rate": 5.884975137227018e-06, + "loss": 0.3909, + "step": 11243 + }, + { + "epoch": 0.65, + "grad_norm": 0.3346253622241024, + "learning_rate": 5.883279156670616e-06, + "loss": 0.1724, + "step": 11244 + }, + { + "epoch": 0.65, + "grad_norm": 0.30811296607418726, + "learning_rate": 5.881583318676078e-06, + "loss": 0.2709, + "step": 11245 + }, + { + "epoch": 0.65, + "grad_norm": 0.3766558287583714, + "learning_rate": 5.879887623302131e-06, + "loss": 0.3055, + "step": 11246 + }, + { + "epoch": 0.65, + "grad_norm": 0.23333225379841346, + "learning_rate": 5.8781920706075e-06, + "loss": 0.1275, + "step": 11247 + }, + { + "epoch": 0.65, + "grad_norm": 0.5988756712228822, + "learning_rate": 5.876496660650899e-06, + "loss": 0.3427, + "step": 11248 + }, + { + "epoch": 0.65, + "grad_norm": 0.37202182636931236, + "learning_rate": 5.874801393491041e-06, + "loss": 0.301, + "step": 11249 + }, + { + "epoch": 0.65, + "grad_norm": 0.705704673592431, + "learning_rate": 5.873106269186635e-06, + "loss": 0.3855, + "step": 11250 + }, + { + "epoch": 0.65, + "grad_norm": 0.3430225343190707, + "learning_rate": 5.871411287796379e-06, + "loss": 0.229, + "step": 11251 + }, + { + "epoch": 0.65, + "grad_norm": 0.3532476705395196, + "learning_rate": 5.869716449378975e-06, + "loss": 0.3249, + "step": 11252 + }, + { + "epoch": 0.65, + "grad_norm": 0.21846891869803547, + "learning_rate": 5.8680217539931106e-06, + "loss": 0.147, + "step": 11253 + }, + { + "epoch": 0.65, + "grad_norm": 0.3593853724638047, + "learning_rate": 5.866327201697477e-06, + "loss": 0.2326, + "step": 11254 + }, + { + "epoch": 0.65, + "grad_norm": 0.7948326466630459, + "learning_rate": 5.864632792550753e-06, + "loss": 0.4388, + "step": 11255 + }, + { + "epoch": 0.65, + "grad_norm": 0.4301961477693214, + "learning_rate": 5.862938526611619e-06, + "loss": 0.2876, + "step": 11256 + }, + { + "epoch": 0.65, + "grad_norm": 0.27854050842859307, + "learning_rate": 5.861244403938744e-06, + "loss": 0.232, + "step": 11257 + }, + { + "epoch": 0.65, + "grad_norm": 1.2624624061673055, + "learning_rate": 5.859550424590801e-06, + "loss": 0.7603, + "step": 11258 + }, + { + "epoch": 0.65, + "grad_norm": 0.3089760684352078, + "learning_rate": 5.857856588626445e-06, + "loss": 0.182, + "step": 11259 + }, + { + "epoch": 0.65, + "grad_norm": 0.23799079783274543, + "learning_rate": 5.856162896104339e-06, + "loss": 0.2428, + "step": 11260 + }, + { + "epoch": 0.65, + "grad_norm": 0.874747782088808, + "learning_rate": 5.854469347083134e-06, + "loss": 0.5215, + "step": 11261 + }, + { + "epoch": 0.65, + "grad_norm": 0.5987070815263701, + "learning_rate": 5.852775941621476e-06, + "loss": 0.2821, + "step": 11262 + }, + { + "epoch": 0.65, + "grad_norm": 0.3813507046247867, + "learning_rate": 5.851082679778011e-06, + "loss": 0.2997, + "step": 11263 + }, + { + "epoch": 0.65, + "grad_norm": 0.3432165598746957, + "learning_rate": 5.8493895616113714e-06, + "loss": 0.2648, + "step": 11264 + }, + { + "epoch": 0.65, + "grad_norm": 0.49326946299327873, + "learning_rate": 5.847696587180195e-06, + "loss": 0.3269, + "step": 11265 + }, + { + "epoch": 0.65, + "grad_norm": 0.3997606558563779, + "learning_rate": 5.846003756543106e-06, + "loss": 0.309, + "step": 11266 + }, + { + "epoch": 0.65, + "grad_norm": 0.2854817732641904, + "learning_rate": 5.844311069758729e-06, + "loss": 0.138, + "step": 11267 + }, + { + "epoch": 0.65, + "grad_norm": 0.359787125128394, + "learning_rate": 5.842618526885679e-06, + "loss": 0.2986, + "step": 11268 + }, + { + "epoch": 0.65, + "grad_norm": 0.34338489317305765, + "learning_rate": 5.840926127982573e-06, + "loss": 0.2825, + "step": 11269 + }, + { + "epoch": 0.65, + "grad_norm": 0.7416336040716597, + "learning_rate": 5.839233873108016e-06, + "loss": 0.3722, + "step": 11270 + }, + { + "epoch": 0.65, + "grad_norm": 0.6592265725324533, + "learning_rate": 5.837541762320609e-06, + "loss": 0.4155, + "step": 11271 + }, + { + "epoch": 0.65, + "grad_norm": 0.3190564215746574, + "learning_rate": 5.835849795678954e-06, + "loss": 0.2622, + "step": 11272 + }, + { + "epoch": 0.65, + "grad_norm": 0.19298277235341782, + "learning_rate": 5.834157973241643e-06, + "loss": 0.1614, + "step": 11273 + }, + { + "epoch": 0.65, + "grad_norm": 0.7677838234055707, + "learning_rate": 5.83246629506726e-06, + "loss": 0.4286, + "step": 11274 + }, + { + "epoch": 0.65, + "grad_norm": 0.3608993586517985, + "learning_rate": 5.830774761214392e-06, + "loss": 0.303, + "step": 11275 + }, + { + "epoch": 0.65, + "grad_norm": 0.5166381381403379, + "learning_rate": 5.829083371741609e-06, + "loss": 0.3991, + "step": 11276 + }, + { + "epoch": 0.65, + "grad_norm": 0.5044253316790134, + "learning_rate": 5.827392126707499e-06, + "loss": 0.2718, + "step": 11277 + }, + { + "epoch": 0.65, + "grad_norm": 0.3718745113889626, + "learning_rate": 5.825701026170616e-06, + "loss": 0.3091, + "step": 11278 + }, + { + "epoch": 0.65, + "grad_norm": 0.28428241777519586, + "learning_rate": 5.824010070189523e-06, + "loss": 0.2016, + "step": 11279 + }, + { + "epoch": 0.65, + "grad_norm": 0.30819648744024025, + "learning_rate": 5.8223192588227836e-06, + "loss": 0.2175, + "step": 11280 + }, + { + "epoch": 0.65, + "grad_norm": 0.3475980694959061, + "learning_rate": 5.820628592128952e-06, + "loss": 0.275, + "step": 11281 + }, + { + "epoch": 0.65, + "grad_norm": 0.711060174788503, + "learning_rate": 5.81893807016657e-06, + "loss": 0.4232, + "step": 11282 + }, + { + "epoch": 0.65, + "grad_norm": 0.5564311587922617, + "learning_rate": 5.817247692994179e-06, + "loss": 0.2324, + "step": 11283 + }, + { + "epoch": 0.65, + "grad_norm": 0.39698374618945803, + "learning_rate": 5.815557460670326e-06, + "loss": 0.2794, + "step": 11284 + }, + { + "epoch": 0.65, + "grad_norm": 0.24435255765312222, + "learning_rate": 5.813867373253537e-06, + "loss": 0.2001, + "step": 11285 + }, + { + "epoch": 0.65, + "grad_norm": 0.7686243817440529, + "learning_rate": 5.8121774308023415e-06, + "loss": 0.4895, + "step": 11286 + }, + { + "epoch": 0.65, + "grad_norm": 0.31839227350161425, + "learning_rate": 5.810487633375261e-06, + "loss": 0.2133, + "step": 11287 + }, + { + "epoch": 0.65, + "grad_norm": 0.4042563230540896, + "learning_rate": 5.80879798103081e-06, + "loss": 0.3192, + "step": 11288 + }, + { + "epoch": 0.65, + "grad_norm": 0.76019850506445, + "learning_rate": 5.807108473827508e-06, + "loss": 0.4039, + "step": 11289 + }, + { + "epoch": 0.65, + "grad_norm": 0.30882169987209035, + "learning_rate": 5.80541911182386e-06, + "loss": 0.2067, + "step": 11290 + }, + { + "epoch": 0.65, + "grad_norm": 0.2653860564848561, + "learning_rate": 5.803729895078368e-06, + "loss": 0.1633, + "step": 11291 + }, + { + "epoch": 0.65, + "grad_norm": 0.37243490114112704, + "learning_rate": 5.802040823649524e-06, + "loss": 0.3116, + "step": 11292 + }, + { + "epoch": 0.65, + "grad_norm": 0.2908543475298292, + "learning_rate": 5.800351897595832e-06, + "loss": 0.2062, + "step": 11293 + }, + { + "epoch": 0.65, + "grad_norm": 0.7543086085745012, + "learning_rate": 5.7986631169757715e-06, + "loss": 0.4082, + "step": 11294 + }, + { + "epoch": 0.65, + "grad_norm": 0.7486544921576171, + "learning_rate": 5.796974481847827e-06, + "loss": 0.4599, + "step": 11295 + }, + { + "epoch": 0.65, + "grad_norm": 0.2232167232788598, + "learning_rate": 5.795285992270472e-06, + "loss": 0.216, + "step": 11296 + }, + { + "epoch": 0.65, + "grad_norm": 0.47861387869738203, + "learning_rate": 5.793597648302185e-06, + "loss": 0.258, + "step": 11297 + }, + { + "epoch": 0.65, + "grad_norm": 0.4226253650595357, + "learning_rate": 5.791909450001432e-06, + "loss": 0.3018, + "step": 11298 + }, + { + "epoch": 0.65, + "grad_norm": 0.32740022436539834, + "learning_rate": 5.790221397426672e-06, + "loss": 0.2581, + "step": 11299 + }, + { + "epoch": 0.65, + "grad_norm": 0.37377043137715343, + "learning_rate": 5.7885334906363656e-06, + "loss": 0.2573, + "step": 11300 + }, + { + "epoch": 0.65, + "grad_norm": 1.0508988726333814, + "learning_rate": 5.786845729688958e-06, + "loss": 0.5739, + "step": 11301 + }, + { + "epoch": 0.65, + "grad_norm": 0.39097791401524284, + "learning_rate": 5.785158114642906e-06, + "loss": 0.2412, + "step": 11302 + }, + { + "epoch": 0.65, + "grad_norm": 0.2146535619035754, + "learning_rate": 5.783470645556648e-06, + "loss": 0.1288, + "step": 11303 + }, + { + "epoch": 0.65, + "grad_norm": 0.30364120884793583, + "learning_rate": 5.781783322488619e-06, + "loss": 0.2825, + "step": 11304 + }, + { + "epoch": 0.65, + "grad_norm": 0.35964780048498673, + "learning_rate": 5.78009614549725e-06, + "loss": 0.2912, + "step": 11305 + }, + { + "epoch": 0.65, + "grad_norm": 0.7982606825690333, + "learning_rate": 5.778409114640973e-06, + "loss": 0.3289, + "step": 11306 + }, + { + "epoch": 0.65, + "grad_norm": 1.2002096946727734, + "learning_rate": 5.776722229978206e-06, + "loss": 0.5363, + "step": 11307 + }, + { + "epoch": 0.65, + "grad_norm": 0.26494917021712633, + "learning_rate": 5.775035491567367e-06, + "loss": 0.2465, + "step": 11308 + }, + { + "epoch": 0.65, + "grad_norm": 0.2207873813549719, + "learning_rate": 5.773348899466864e-06, + "loss": 0.1466, + "step": 11309 + }, + { + "epoch": 0.65, + "grad_norm": 0.8187241696416505, + "learning_rate": 5.7716624537351105e-06, + "loss": 0.4156, + "step": 11310 + }, + { + "epoch": 0.65, + "grad_norm": 0.3221642458006116, + "learning_rate": 5.769976154430507e-06, + "loss": 0.2634, + "step": 11311 + }, + { + "epoch": 0.65, + "grad_norm": 0.3845684244489153, + "learning_rate": 5.768290001611446e-06, + "loss": 0.2996, + "step": 11312 + }, + { + "epoch": 0.65, + "grad_norm": 0.9866515649350825, + "learning_rate": 5.7666039953363155e-06, + "loss": 0.3581, + "step": 11313 + }, + { + "epoch": 0.65, + "grad_norm": 0.3284899038074513, + "learning_rate": 5.76491813566351e-06, + "loss": 0.2465, + "step": 11314 + }, + { + "epoch": 0.65, + "grad_norm": 1.3074311625669295, + "learning_rate": 5.763232422651407e-06, + "loss": 0.5833, + "step": 11315 + }, + { + "epoch": 0.65, + "grad_norm": 0.2768885144280665, + "learning_rate": 5.761546856358384e-06, + "loss": 0.2403, + "step": 11316 + }, + { + "epoch": 0.65, + "grad_norm": 0.34299468161640334, + "learning_rate": 5.759861436842806e-06, + "loss": 0.2699, + "step": 11317 + }, + { + "epoch": 0.65, + "grad_norm": 0.846850459365096, + "learning_rate": 5.7581761641630485e-06, + "loss": 0.5027, + "step": 11318 + }, + { + "epoch": 0.65, + "grad_norm": 0.25050625975393576, + "learning_rate": 5.756491038377469e-06, + "loss": 0.211, + "step": 11319 + }, + { + "epoch": 0.65, + "grad_norm": 0.39637983568228946, + "learning_rate": 5.754806059544421e-06, + "loss": 0.2666, + "step": 11320 + }, + { + "epoch": 0.65, + "grad_norm": 0.5585156954349844, + "learning_rate": 5.753121227722254e-06, + "loss": 0.3317, + "step": 11321 + }, + { + "epoch": 0.65, + "grad_norm": 0.4894209756295622, + "learning_rate": 5.7514365429693186e-06, + "loss": 0.2484, + "step": 11322 + }, + { + "epoch": 0.65, + "grad_norm": 0.39527617870681714, + "learning_rate": 5.749752005343954e-06, + "loss": 0.2635, + "step": 11323 + }, + { + "epoch": 0.65, + "grad_norm": 0.33485572046557144, + "learning_rate": 5.7480676149044945e-06, + "loss": 0.2856, + "step": 11324 + }, + { + "epoch": 0.65, + "grad_norm": 0.4181430086739952, + "learning_rate": 5.746383371709267e-06, + "loss": 0.2432, + "step": 11325 + }, + { + "epoch": 0.65, + "grad_norm": 0.28865695519714385, + "learning_rate": 5.7446992758166035e-06, + "loss": 0.1886, + "step": 11326 + }, + { + "epoch": 0.65, + "grad_norm": 0.43098022642011, + "learning_rate": 5.743015327284822e-06, + "loss": 0.332, + "step": 11327 + }, + { + "epoch": 0.65, + "grad_norm": 0.46720475592837557, + "learning_rate": 5.7413315261722355e-06, + "loss": 0.3363, + "step": 11328 + }, + { + "epoch": 0.65, + "grad_norm": 0.3041795818931702, + "learning_rate": 5.739647872537157e-06, + "loss": 0.2173, + "step": 11329 + }, + { + "epoch": 0.65, + "grad_norm": 1.087779922116115, + "learning_rate": 5.737964366437885e-06, + "loss": 0.8105, + "step": 11330 + }, + { + "epoch": 0.65, + "grad_norm": 0.23944258282874695, + "learning_rate": 5.736281007932727e-06, + "loss": 0.181, + "step": 11331 + }, + { + "epoch": 0.65, + "grad_norm": 0.27234929845391487, + "learning_rate": 5.734597797079974e-06, + "loss": 0.2099, + "step": 11332 + }, + { + "epoch": 0.65, + "grad_norm": 0.7745522496107576, + "learning_rate": 5.732914733937917e-06, + "loss": 0.4524, + "step": 11333 + }, + { + "epoch": 0.65, + "grad_norm": 0.7024537449114042, + "learning_rate": 5.731231818564834e-06, + "loss": 0.4207, + "step": 11334 + }, + { + "epoch": 0.65, + "grad_norm": 0.3230484251573659, + "learning_rate": 5.729549051019014e-06, + "loss": 0.2026, + "step": 11335 + }, + { + "epoch": 0.65, + "grad_norm": 0.3556886103604453, + "learning_rate": 5.7278664313587275e-06, + "loss": 0.3461, + "step": 11336 + }, + { + "epoch": 0.65, + "grad_norm": 0.30108299641494074, + "learning_rate": 5.726183959642242e-06, + "loss": 0.1966, + "step": 11337 + }, + { + "epoch": 0.65, + "grad_norm": 0.6097747958982636, + "learning_rate": 5.724501635927818e-06, + "loss": 0.3249, + "step": 11338 + }, + { + "epoch": 0.65, + "grad_norm": 0.37914027796049266, + "learning_rate": 5.722819460273723e-06, + "loss": 0.2742, + "step": 11339 + }, + { + "epoch": 0.65, + "grad_norm": 0.35501758807427714, + "learning_rate": 5.7211374327382066e-06, + "loss": 0.2916, + "step": 11340 + }, + { + "epoch": 0.65, + "grad_norm": 0.5940137811131664, + "learning_rate": 5.719455553379516e-06, + "loss": 0.3368, + "step": 11341 + }, + { + "epoch": 0.65, + "grad_norm": 0.36694725493194763, + "learning_rate": 5.717773822255896e-06, + "loss": 0.2532, + "step": 11342 + }, + { + "epoch": 0.65, + "grad_norm": 0.23800875625112516, + "learning_rate": 5.71609223942558e-06, + "loss": 0.2069, + "step": 11343 + }, + { + "epoch": 0.65, + "grad_norm": 0.37304610142960154, + "learning_rate": 5.7144108049468106e-06, + "loss": 0.2867, + "step": 11344 + }, + { + "epoch": 0.65, + "grad_norm": 0.36018737364414755, + "learning_rate": 5.712729518877813e-06, + "loss": 0.2725, + "step": 11345 + }, + { + "epoch": 0.65, + "grad_norm": 1.454786122363037, + "learning_rate": 5.711048381276801e-06, + "loss": 0.838, + "step": 11346 + }, + { + "epoch": 0.65, + "grad_norm": 0.4161241033055131, + "learning_rate": 5.709367392202003e-06, + "loss": 0.2913, + "step": 11347 + }, + { + "epoch": 0.65, + "grad_norm": 0.28370028246480244, + "learning_rate": 5.707686551711628e-06, + "loss": 0.2552, + "step": 11348 + }, + { + "epoch": 0.65, + "grad_norm": 0.2784506630346675, + "learning_rate": 5.706005859863883e-06, + "loss": 0.1635, + "step": 11349 + }, + { + "epoch": 0.65, + "grad_norm": 0.36016396685166263, + "learning_rate": 5.704325316716966e-06, + "loss": 0.3082, + "step": 11350 + }, + { + "epoch": 0.65, + "grad_norm": 0.4467132067514661, + "learning_rate": 5.702644922329083e-06, + "loss": 0.3476, + "step": 11351 + }, + { + "epoch": 0.65, + "grad_norm": 0.327846838982423, + "learning_rate": 5.70096467675842e-06, + "loss": 0.2569, + "step": 11352 + }, + { + "epoch": 0.65, + "grad_norm": 0.367416999026494, + "learning_rate": 5.699284580063167e-06, + "loss": 0.2946, + "step": 11353 + }, + { + "epoch": 0.65, + "grad_norm": 0.5459229122762558, + "learning_rate": 5.697604632301504e-06, + "loss": 0.4332, + "step": 11354 + }, + { + "epoch": 0.65, + "grad_norm": 0.25668900132390404, + "learning_rate": 5.695924833531603e-06, + "loss": 0.2039, + "step": 11355 + }, + { + "epoch": 0.65, + "grad_norm": 0.4629903377649034, + "learning_rate": 5.6942451838116445e-06, + "loss": 0.2769, + "step": 11356 + }, + { + "epoch": 0.65, + "grad_norm": 0.2880039976122195, + "learning_rate": 5.69256568319979e-06, + "loss": 0.2513, + "step": 11357 + }, + { + "epoch": 0.65, + "grad_norm": 1.0645379082384252, + "learning_rate": 5.6908863317542e-06, + "loss": 0.3649, + "step": 11358 + }, + { + "epoch": 0.65, + "grad_norm": 0.3809085828908202, + "learning_rate": 5.689207129533027e-06, + "loss": 0.3278, + "step": 11359 + }, + { + "epoch": 0.65, + "grad_norm": 0.33329868938159685, + "learning_rate": 5.687528076594432e-06, + "loss": 0.3125, + "step": 11360 + }, + { + "epoch": 0.65, + "grad_norm": 0.7300849087139614, + "learning_rate": 5.685849172996551e-06, + "loss": 0.411, + "step": 11361 + }, + { + "epoch": 0.65, + "grad_norm": 0.4960419001943459, + "learning_rate": 5.6841704187975296e-06, + "loss": 0.2599, + "step": 11362 + }, + { + "epoch": 0.65, + "grad_norm": 0.21882316289564832, + "learning_rate": 5.682491814055497e-06, + "loss": 0.2088, + "step": 11363 + }, + { + "epoch": 0.65, + "grad_norm": 0.4726847855526382, + "learning_rate": 5.680813358828592e-06, + "loss": 0.3169, + "step": 11364 + }, + { + "epoch": 0.65, + "grad_norm": 0.30338445447650797, + "learning_rate": 5.679135053174932e-06, + "loss": 0.1485, + "step": 11365 + }, + { + "epoch": 0.65, + "grad_norm": 0.4310514017451592, + "learning_rate": 5.677456897152641e-06, + "loss": 0.3493, + "step": 11366 + }, + { + "epoch": 0.65, + "grad_norm": 0.33338075483482793, + "learning_rate": 5.6757788908198316e-06, + "loss": 0.2944, + "step": 11367 + }, + { + "epoch": 0.65, + "grad_norm": 0.4127582369100299, + "learning_rate": 5.674101034234609e-06, + "loss": 0.239, + "step": 11368 + }, + { + "epoch": 0.65, + "grad_norm": 0.3351444220704729, + "learning_rate": 5.672423327455085e-06, + "loss": 0.2188, + "step": 11369 + }, + { + "epoch": 0.65, + "grad_norm": 0.4345390366631468, + "learning_rate": 5.670745770539356e-06, + "loss": 0.2443, + "step": 11370 + }, + { + "epoch": 0.65, + "grad_norm": 0.24478960709092923, + "learning_rate": 5.669068363545516e-06, + "loss": 0.2213, + "step": 11371 + }, + { + "epoch": 0.65, + "grad_norm": 0.4821758480038891, + "learning_rate": 5.667391106531647e-06, + "loss": 0.3277, + "step": 11372 + }, + { + "epoch": 0.65, + "grad_norm": 0.7428114884812085, + "learning_rate": 5.665713999555842e-06, + "loss": 0.3687, + "step": 11373 + }, + { + "epoch": 0.65, + "grad_norm": 0.7485401252645069, + "learning_rate": 5.6640370426761735e-06, + "loss": 0.4149, + "step": 11374 + }, + { + "epoch": 0.65, + "grad_norm": 0.23688874268890042, + "learning_rate": 5.662360235950717e-06, + "loss": 0.1964, + "step": 11375 + }, + { + "epoch": 0.65, + "grad_norm": 0.31576975671951585, + "learning_rate": 5.6606835794375346e-06, + "loss": 0.254, + "step": 11376 + }, + { + "epoch": 0.65, + "grad_norm": 0.7108945036865616, + "learning_rate": 5.659007073194697e-06, + "loss": 0.4271, + "step": 11377 + }, + { + "epoch": 0.65, + "grad_norm": 0.33881004573315643, + "learning_rate": 5.657330717280258e-06, + "loss": 0.2427, + "step": 11378 + }, + { + "epoch": 0.65, + "grad_norm": 0.35024537485757967, + "learning_rate": 5.655654511752274e-06, + "loss": 0.2942, + "step": 11379 + }, + { + "epoch": 0.65, + "grad_norm": 0.6233162286325228, + "learning_rate": 5.653978456668779e-06, + "loss": 0.3727, + "step": 11380 + }, + { + "epoch": 0.65, + "grad_norm": 0.23743888501996108, + "learning_rate": 5.652302552087827e-06, + "loss": 0.1682, + "step": 11381 + }, + { + "epoch": 0.65, + "grad_norm": 0.42937799121144055, + "learning_rate": 5.6506267980674515e-06, + "loss": 0.2655, + "step": 11382 + }, + { + "epoch": 0.65, + "grad_norm": 0.3103419789202479, + "learning_rate": 5.648951194665683e-06, + "loss": 0.294, + "step": 11383 + }, + { + "epoch": 0.65, + "grad_norm": 0.32653676277691335, + "learning_rate": 5.647275741940543e-06, + "loss": 0.2316, + "step": 11384 + }, + { + "epoch": 0.65, + "grad_norm": 0.6254068439076284, + "learning_rate": 5.645600439950061e-06, + "loss": 0.4076, + "step": 11385 + }, + { + "epoch": 0.65, + "grad_norm": 0.8407360463120334, + "learning_rate": 5.643925288752248e-06, + "loss": 0.4822, + "step": 11386 + }, + { + "epoch": 0.65, + "grad_norm": 0.35286540111542963, + "learning_rate": 5.642250288405116e-06, + "loss": 0.2868, + "step": 11387 + }, + { + "epoch": 0.65, + "grad_norm": 0.2163730802370492, + "learning_rate": 5.6405754389666635e-06, + "loss": 0.1894, + "step": 11388 + }, + { + "epoch": 0.65, + "grad_norm": 0.5102026088195443, + "learning_rate": 5.638900740494901e-06, + "loss": 0.3567, + "step": 11389 + }, + { + "epoch": 0.65, + "grad_norm": 0.39499275164180747, + "learning_rate": 5.637226193047818e-06, + "loss": 0.293, + "step": 11390 + }, + { + "epoch": 0.65, + "grad_norm": 0.36184693966404863, + "learning_rate": 5.635551796683405e-06, + "loss": 0.2894, + "step": 11391 + }, + { + "epoch": 0.65, + "grad_norm": 1.1693194311109523, + "learning_rate": 5.633877551459646e-06, + "loss": 0.5964, + "step": 11392 + }, + { + "epoch": 0.65, + "grad_norm": 0.3155464664914508, + "learning_rate": 5.6322034574345145e-06, + "loss": 0.2347, + "step": 11393 + }, + { + "epoch": 0.65, + "grad_norm": 0.20528795990945872, + "learning_rate": 5.630529514665993e-06, + "loss": 0.1466, + "step": 11394 + }, + { + "epoch": 0.65, + "grad_norm": 0.4960690183431173, + "learning_rate": 5.628855723212048e-06, + "loss": 0.3488, + "step": 11395 + }, + { + "epoch": 0.65, + "grad_norm": 0.32121279561481897, + "learning_rate": 5.62718208313064e-06, + "loss": 0.2645, + "step": 11396 + }, + { + "epoch": 0.65, + "grad_norm": 0.7035742236195769, + "learning_rate": 5.625508594479725e-06, + "loss": 0.3335, + "step": 11397 + }, + { + "epoch": 0.65, + "grad_norm": 0.7735193256482944, + "learning_rate": 5.6238352573172635e-06, + "loss": 0.4442, + "step": 11398 + }, + { + "epoch": 0.65, + "grad_norm": 0.26150894556463467, + "learning_rate": 5.622162071701198e-06, + "loss": 0.2663, + "step": 11399 + }, + { + "epoch": 0.65, + "grad_norm": 0.27248367312206967, + "learning_rate": 5.6204890376894735e-06, + "loss": 0.1802, + "step": 11400 + }, + { + "epoch": 0.66, + "grad_norm": 0.6700607475194417, + "learning_rate": 5.61881615534002e-06, + "loss": 0.2683, + "step": 11401 + }, + { + "epoch": 0.66, + "grad_norm": 0.3534029225730532, + "learning_rate": 5.617143424710778e-06, + "loss": 0.3028, + "step": 11402 + }, + { + "epoch": 0.66, + "grad_norm": 0.3681851782516326, + "learning_rate": 5.615470845859672e-06, + "loss": 0.2829, + "step": 11403 + }, + { + "epoch": 0.66, + "grad_norm": 0.5710242786946949, + "learning_rate": 5.613798418844623e-06, + "loss": 0.2027, + "step": 11404 + }, + { + "epoch": 0.66, + "grad_norm": 0.37252093370364825, + "learning_rate": 5.6121261437235445e-06, + "loss": 0.2886, + "step": 11405 + }, + { + "epoch": 0.66, + "grad_norm": 0.30991829143518573, + "learning_rate": 5.6104540205543445e-06, + "loss": 0.1872, + "step": 11406 + }, + { + "epoch": 0.66, + "grad_norm": 0.3134833617394962, + "learning_rate": 5.608782049394938e-06, + "loss": 0.2361, + "step": 11407 + }, + { + "epoch": 0.66, + "grad_norm": 0.3926136787003498, + "learning_rate": 5.60711023030322e-06, + "loss": 0.3061, + "step": 11408 + }, + { + "epoch": 0.66, + "grad_norm": 0.9496778327004386, + "learning_rate": 5.605438563337087e-06, + "loss": 0.7436, + "step": 11409 + }, + { + "epoch": 0.66, + "grad_norm": 0.4551020413608209, + "learning_rate": 5.6037670485544215e-06, + "loss": 0.2394, + "step": 11410 + }, + { + "epoch": 0.66, + "grad_norm": 0.2614793357489382, + "learning_rate": 5.60209568601312e-06, + "loss": 0.2574, + "step": 11411 + }, + { + "epoch": 0.66, + "grad_norm": 0.4548863557191528, + "learning_rate": 5.600424475771058e-06, + "loss": 0.2345, + "step": 11412 + }, + { + "epoch": 0.66, + "grad_norm": 0.6171749330064369, + "learning_rate": 5.5987534178861e-06, + "loss": 0.3804, + "step": 11413 + }, + { + "epoch": 0.66, + "grad_norm": 0.310695729368847, + "learning_rate": 5.5970825124161255e-06, + "loss": 0.2084, + "step": 11414 + }, + { + "epoch": 0.66, + "grad_norm": 0.2857621372988274, + "learning_rate": 5.595411759418995e-06, + "loss": 0.2615, + "step": 11415 + }, + { + "epoch": 0.66, + "grad_norm": 0.9880804735860319, + "learning_rate": 5.5937411589525655e-06, + "loss": 0.5304, + "step": 11416 + }, + { + "epoch": 0.66, + "grad_norm": 0.29446071991324213, + "learning_rate": 5.592070711074691e-06, + "loss": 0.1841, + "step": 11417 + }, + { + "epoch": 0.66, + "grad_norm": 0.7950217222504905, + "learning_rate": 5.590400415843214e-06, + "loss": 0.4106, + "step": 11418 + }, + { + "epoch": 0.66, + "grad_norm": 0.2692860474214037, + "learning_rate": 5.5887302733159835e-06, + "loss": 0.2516, + "step": 11419 + }, + { + "epoch": 0.66, + "grad_norm": 0.28812730174273965, + "learning_rate": 5.587060283550835e-06, + "loss": 0.2093, + "step": 11420 + }, + { + "epoch": 0.66, + "grad_norm": 0.4135254933127519, + "learning_rate": 5.585390446605598e-06, + "loss": 0.2756, + "step": 11421 + }, + { + "epoch": 0.66, + "grad_norm": 0.3391811932650513, + "learning_rate": 5.583720762538097e-06, + "loss": 0.302, + "step": 11422 + }, + { + "epoch": 0.66, + "grad_norm": 0.5581481167123312, + "learning_rate": 5.58205123140616e-06, + "loss": 0.1832, + "step": 11423 + }, + { + "epoch": 0.66, + "grad_norm": 0.8472758677182067, + "learning_rate": 5.5803818532676e-06, + "loss": 0.402, + "step": 11424 + }, + { + "epoch": 0.66, + "grad_norm": 0.8200728581102539, + "learning_rate": 5.578712628180225e-06, + "loss": 0.4662, + "step": 11425 + }, + { + "epoch": 0.66, + "grad_norm": 0.3875008712683409, + "learning_rate": 5.577043556201838e-06, + "loss": 0.2686, + "step": 11426 + }, + { + "epoch": 0.66, + "grad_norm": 0.2416269514785201, + "learning_rate": 5.575374637390246e-06, + "loss": 0.2104, + "step": 11427 + }, + { + "epoch": 0.66, + "grad_norm": 0.4898967331695952, + "learning_rate": 5.573705871803241e-06, + "loss": 0.2438, + "step": 11428 + }, + { + "epoch": 0.66, + "grad_norm": 0.3172534747512469, + "learning_rate": 5.57203725949861e-06, + "loss": 0.2663, + "step": 11429 + }, + { + "epoch": 0.66, + "grad_norm": 0.9321900723666333, + "learning_rate": 5.570368800534139e-06, + "loss": 0.3019, + "step": 11430 + }, + { + "epoch": 0.66, + "grad_norm": 0.4761349234786449, + "learning_rate": 5.568700494967603e-06, + "loss": 0.3314, + "step": 11431 + }, + { + "epoch": 0.66, + "grad_norm": 0.32465852015725005, + "learning_rate": 5.567032342856781e-06, + "loss": 0.2739, + "step": 11432 + }, + { + "epoch": 0.66, + "grad_norm": 0.1793227954302825, + "learning_rate": 5.565364344259438e-06, + "loss": 0.1204, + "step": 11433 + }, + { + "epoch": 0.66, + "grad_norm": 0.3672817087183113, + "learning_rate": 5.563696499233337e-06, + "loss": 0.3124, + "step": 11434 + }, + { + "epoch": 0.66, + "grad_norm": 0.33119602560147793, + "learning_rate": 5.562028807836233e-06, + "loss": 0.2328, + "step": 11435 + }, + { + "epoch": 0.66, + "grad_norm": 0.7227754858322918, + "learning_rate": 5.560361270125884e-06, + "loss": 0.3104, + "step": 11436 + }, + { + "epoch": 0.66, + "grad_norm": 1.0485837440591883, + "learning_rate": 5.558693886160032e-06, + "loss": 0.5809, + "step": 11437 + }, + { + "epoch": 0.66, + "grad_norm": 0.32761246481860695, + "learning_rate": 5.557026655996422e-06, + "loss": 0.2612, + "step": 11438 + }, + { + "epoch": 0.66, + "grad_norm": 0.37388839229218224, + "learning_rate": 5.555359579692782e-06, + "loss": 0.3131, + "step": 11439 + }, + { + "epoch": 0.66, + "grad_norm": 0.339682737117197, + "learning_rate": 5.553692657306853e-06, + "loss": 0.1579, + "step": 11440 + }, + { + "epoch": 0.66, + "grad_norm": 0.33306031343411013, + "learning_rate": 5.552025888896356e-06, + "loss": 0.2583, + "step": 11441 + }, + { + "epoch": 0.66, + "grad_norm": 0.9423663180233574, + "learning_rate": 5.550359274519012e-06, + "loss": 0.4231, + "step": 11442 + }, + { + "epoch": 0.66, + "grad_norm": 0.3801574172959991, + "learning_rate": 5.54869281423253e-06, + "loss": 0.2682, + "step": 11443 + }, + { + "epoch": 0.66, + "grad_norm": 0.37924059353551987, + "learning_rate": 5.547026508094629e-06, + "loss": 0.2664, + "step": 11444 + }, + { + "epoch": 0.66, + "grad_norm": 0.5090379498547105, + "learning_rate": 5.545360356163009e-06, + "loss": 0.3892, + "step": 11445 + }, + { + "epoch": 0.66, + "grad_norm": 0.22891711398429887, + "learning_rate": 5.54369435849537e-06, + "loss": 0.182, + "step": 11446 + }, + { + "epoch": 0.66, + "grad_norm": 0.47266754852632326, + "learning_rate": 5.5420285151493995e-06, + "loss": 0.3223, + "step": 11447 + }, + { + "epoch": 0.66, + "grad_norm": 0.6746597708337676, + "learning_rate": 5.540362826182791e-06, + "loss": 0.3292, + "step": 11448 + }, + { + "epoch": 0.66, + "grad_norm": 1.2132277880660471, + "learning_rate": 5.538697291653228e-06, + "loss": 0.7311, + "step": 11449 + }, + { + "epoch": 0.66, + "grad_norm": 0.3082460870951119, + "learning_rate": 5.537031911618385e-06, + "loss": 0.2031, + "step": 11450 + }, + { + "epoch": 0.66, + "grad_norm": 0.3597103355681424, + "learning_rate": 5.53536668613593e-06, + "loss": 0.3081, + "step": 11451 + }, + { + "epoch": 0.66, + "grad_norm": 0.41272000121983643, + "learning_rate": 5.5337016152635396e-06, + "loss": 0.2683, + "step": 11452 + }, + { + "epoch": 0.66, + "grad_norm": 0.3017020610938978, + "learning_rate": 5.53203669905887e-06, + "loss": 0.2304, + "step": 11453 + }, + { + "epoch": 0.66, + "grad_norm": 0.43437243190707603, + "learning_rate": 5.530371937579577e-06, + "loss": 0.2326, + "step": 11454 + }, + { + "epoch": 0.66, + "grad_norm": 0.3317204639922343, + "learning_rate": 5.528707330883308e-06, + "loss": 0.2883, + "step": 11455 + }, + { + "epoch": 0.66, + "grad_norm": 0.3327247393119727, + "learning_rate": 5.527042879027715e-06, + "loss": 0.2176, + "step": 11456 + }, + { + "epoch": 0.66, + "grad_norm": 1.3365246378891962, + "learning_rate": 5.525378582070438e-06, + "loss": 0.7339, + "step": 11457 + }, + { + "epoch": 0.66, + "grad_norm": 0.3413124951106821, + "learning_rate": 5.523714440069104e-06, + "loss": 0.3221, + "step": 11458 + }, + { + "epoch": 0.66, + "grad_norm": 0.3312639650942997, + "learning_rate": 5.522050453081349e-06, + "loss": 0.2081, + "step": 11459 + }, + { + "epoch": 0.66, + "grad_norm": 0.25778157585553163, + "learning_rate": 5.5203866211647904e-06, + "loss": 0.1958, + "step": 11460 + }, + { + "epoch": 0.66, + "grad_norm": 0.9463391804603914, + "learning_rate": 5.518722944377053e-06, + "loss": 0.507, + "step": 11461 + }, + { + "epoch": 0.66, + "grad_norm": 0.37823758384435996, + "learning_rate": 5.517059422775748e-06, + "loss": 0.3032, + "step": 11462 + }, + { + "epoch": 0.66, + "grad_norm": 0.2921793595846805, + "learning_rate": 5.515396056418482e-06, + "loss": 0.2408, + "step": 11463 + }, + { + "epoch": 0.66, + "grad_norm": 0.5883154221839674, + "learning_rate": 5.513732845362856e-06, + "loss": 0.4082, + "step": 11464 + }, + { + "epoch": 0.66, + "grad_norm": 0.2867425278106484, + "learning_rate": 5.51206978966647e-06, + "loss": 0.1848, + "step": 11465 + }, + { + "epoch": 0.66, + "grad_norm": 0.24205597069931772, + "learning_rate": 5.510406889386914e-06, + "loss": 0.2085, + "step": 11466 + }, + { + "epoch": 0.66, + "grad_norm": 1.2740952812449393, + "learning_rate": 5.5087441445817765e-06, + "loss": 0.7298, + "step": 11467 + }, + { + "epoch": 0.66, + "grad_norm": 0.5389920094979618, + "learning_rate": 5.507081555308631e-06, + "loss": 0.3645, + "step": 11468 + }, + { + "epoch": 0.66, + "grad_norm": 0.3339186256662309, + "learning_rate": 5.505419121625062e-06, + "loss": 0.2649, + "step": 11469 + }, + { + "epoch": 0.66, + "grad_norm": 0.3448488174392606, + "learning_rate": 5.503756843588635e-06, + "loss": 0.3087, + "step": 11470 + }, + { + "epoch": 0.66, + "grad_norm": 0.3839012951403879, + "learning_rate": 5.502094721256916e-06, + "loss": 0.2503, + "step": 11471 + }, + { + "epoch": 0.66, + "grad_norm": 0.24422334705090812, + "learning_rate": 5.500432754687464e-06, + "loss": 0.1323, + "step": 11472 + }, + { + "epoch": 0.66, + "grad_norm": 1.0341671776790258, + "learning_rate": 5.498770943937828e-06, + "loss": 0.6798, + "step": 11473 + }, + { + "epoch": 0.66, + "grad_norm": 0.2849180917522527, + "learning_rate": 5.497109289065563e-06, + "loss": 0.2485, + "step": 11474 + }, + { + "epoch": 0.66, + "grad_norm": 0.5072468134724325, + "learning_rate": 5.495447790128211e-06, + "loss": 0.3332, + "step": 11475 + }, + { + "epoch": 0.66, + "grad_norm": 0.6364321717273036, + "learning_rate": 5.493786447183308e-06, + "loss": 0.3118, + "step": 11476 + }, + { + "epoch": 0.66, + "grad_norm": 0.603457958394203, + "learning_rate": 5.4921252602883834e-06, + "loss": 0.2669, + "step": 11477 + }, + { + "epoch": 0.66, + "grad_norm": 0.24790100570896947, + "learning_rate": 5.490464229500969e-06, + "loss": 0.2388, + "step": 11478 + }, + { + "epoch": 0.66, + "grad_norm": 0.26144460223892885, + "learning_rate": 5.488803354878587e-06, + "loss": 0.2021, + "step": 11479 + }, + { + "epoch": 0.66, + "grad_norm": 0.5581526805294116, + "learning_rate": 5.487142636478749e-06, + "loss": 0.3137, + "step": 11480 + }, + { + "epoch": 0.66, + "grad_norm": 0.46154960985058663, + "learning_rate": 5.485482074358968e-06, + "loss": 0.3153, + "step": 11481 + }, + { + "epoch": 0.66, + "grad_norm": 0.34216719846377985, + "learning_rate": 5.48382166857675e-06, + "loss": 0.2507, + "step": 11482 + }, + { + "epoch": 0.66, + "grad_norm": 0.5615865278512538, + "learning_rate": 5.482161419189591e-06, + "loss": 0.2432, + "step": 11483 + }, + { + "epoch": 0.66, + "grad_norm": 0.23731768796120098, + "learning_rate": 5.4805013262549885e-06, + "loss": 0.1988, + "step": 11484 + }, + { + "epoch": 0.66, + "grad_norm": 1.286312030117045, + "learning_rate": 5.478841389830427e-06, + "loss": 0.4339, + "step": 11485 + }, + { + "epoch": 0.66, + "grad_norm": 0.30921205394689105, + "learning_rate": 5.477181609973399e-06, + "loss": 0.2737, + "step": 11486 + }, + { + "epoch": 0.66, + "grad_norm": 0.38646319079933794, + "learning_rate": 5.475521986741377e-06, + "loss": 0.2982, + "step": 11487 + }, + { + "epoch": 0.66, + "grad_norm": 1.0393640176508394, + "learning_rate": 5.4738625201918324e-06, + "loss": 0.5382, + "step": 11488 + }, + { + "epoch": 0.66, + "grad_norm": 0.2587507359924883, + "learning_rate": 5.472203210382231e-06, + "loss": 0.0732, + "step": 11489 + }, + { + "epoch": 0.66, + "grad_norm": 0.25179914104749707, + "learning_rate": 5.470544057370042e-06, + "loss": 0.2673, + "step": 11490 + }, + { + "epoch": 0.66, + "grad_norm": 0.26847000724967895, + "learning_rate": 5.468885061212716e-06, + "loss": 0.1861, + "step": 11491 + }, + { + "epoch": 0.66, + "grad_norm": 0.33831228269888364, + "learning_rate": 5.467226221967707e-06, + "loss": 0.2408, + "step": 11492 + }, + { + "epoch": 0.66, + "grad_norm": 0.49982395528851653, + "learning_rate": 5.465567539692455e-06, + "loss": 0.3434, + "step": 11493 + }, + { + "epoch": 0.66, + "grad_norm": 0.3667710672250758, + "learning_rate": 5.463909014444409e-06, + "loss": 0.316, + "step": 11494 + }, + { + "epoch": 0.66, + "grad_norm": 0.5919667616125639, + "learning_rate": 5.462250646280997e-06, + "loss": 0.174, + "step": 11495 + }, + { + "epoch": 0.66, + "grad_norm": 0.2919001787559771, + "learning_rate": 5.460592435259651e-06, + "loss": 0.2295, + "step": 11496 + }, + { + "epoch": 0.66, + "grad_norm": 0.45198607078143405, + "learning_rate": 5.458934381437793e-06, + "loss": 0.3138, + "step": 11497 + }, + { + "epoch": 0.66, + "grad_norm": 0.29738566743482925, + "learning_rate": 5.457276484872839e-06, + "loss": 0.2406, + "step": 11498 + }, + { + "epoch": 0.66, + "grad_norm": 0.34804859280565137, + "learning_rate": 5.455618745622209e-06, + "loss": 0.2933, + "step": 11499 + }, + { + "epoch": 0.66, + "grad_norm": 1.1076657921418147, + "learning_rate": 5.453961163743304e-06, + "loss": 0.6148, + "step": 11500 + }, + { + "epoch": 0.66, + "grad_norm": 0.7959871319468619, + "learning_rate": 5.452303739293532e-06, + "loss": 0.3982, + "step": 11501 + }, + { + "epoch": 0.66, + "grad_norm": 0.2500213796943707, + "learning_rate": 5.4506464723302784e-06, + "loss": 0.2233, + "step": 11502 + }, + { + "epoch": 0.66, + "grad_norm": 0.39779781556021504, + "learning_rate": 5.448989362910949e-06, + "loss": 0.2594, + "step": 11503 + }, + { + "epoch": 0.66, + "grad_norm": 0.5295136834773769, + "learning_rate": 5.447332411092921e-06, + "loss": 0.3786, + "step": 11504 + }, + { + "epoch": 0.66, + "grad_norm": 0.23936075071175678, + "learning_rate": 5.445675616933576e-06, + "loss": 0.1819, + "step": 11505 + }, + { + "epoch": 0.66, + "grad_norm": 0.34567033680043985, + "learning_rate": 5.444018980490284e-06, + "loss": 0.2925, + "step": 11506 + }, + { + "epoch": 0.66, + "grad_norm": 0.7141315652138195, + "learning_rate": 5.4423625018204226e-06, + "loss": 0.3823, + "step": 11507 + }, + { + "epoch": 0.66, + "grad_norm": 0.32578121816347494, + "learning_rate": 5.440706180981352e-06, + "loss": 0.2104, + "step": 11508 + }, + { + "epoch": 0.66, + "grad_norm": 0.633359243897076, + "learning_rate": 5.439050018030432e-06, + "loss": 0.368, + "step": 11509 + }, + { + "epoch": 0.66, + "grad_norm": 0.3036594850395294, + "learning_rate": 5.437394013025012e-06, + "loss": 0.2951, + "step": 11510 + }, + { + "epoch": 0.66, + "grad_norm": 0.3218116260567253, + "learning_rate": 5.435738166022437e-06, + "loss": 0.2107, + "step": 11511 + }, + { + "epoch": 0.66, + "grad_norm": 0.27971976076190586, + "learning_rate": 5.434082477080058e-06, + "loss": 0.1691, + "step": 11512 + }, + { + "epoch": 0.66, + "grad_norm": 0.37628575012172416, + "learning_rate": 5.432426946255206e-06, + "loss": 0.2813, + "step": 11513 + }, + { + "epoch": 0.66, + "grad_norm": 0.29629151440877577, + "learning_rate": 5.4307715736052125e-06, + "loss": 0.2711, + "step": 11514 + }, + { + "epoch": 0.66, + "grad_norm": 0.7761394695030567, + "learning_rate": 5.429116359187403e-06, + "loss": 0.3176, + "step": 11515 + }, + { + "epoch": 0.66, + "grad_norm": 1.1914877581468113, + "learning_rate": 5.427461303059096e-06, + "loss": 0.8457, + "step": 11516 + }, + { + "epoch": 0.66, + "grad_norm": 0.3173092424468873, + "learning_rate": 5.425806405277609e-06, + "loss": 0.2569, + "step": 11517 + }, + { + "epoch": 0.66, + "grad_norm": 0.24185940664168293, + "learning_rate": 5.424151665900246e-06, + "loss": 0.1908, + "step": 11518 + }, + { + "epoch": 0.66, + "grad_norm": 1.2671168707668683, + "learning_rate": 5.422497084984317e-06, + "loss": 0.5888, + "step": 11519 + }, + { + "epoch": 0.66, + "grad_norm": 0.3204691857610319, + "learning_rate": 5.420842662587118e-06, + "loss": 0.2625, + "step": 11520 + }, + { + "epoch": 0.66, + "grad_norm": 0.837128435683929, + "learning_rate": 5.41918839876594e-06, + "loss": 0.3109, + "step": 11521 + }, + { + "epoch": 0.66, + "grad_norm": 0.28996702322416396, + "learning_rate": 5.41753429357807e-06, + "loss": 0.2688, + "step": 11522 + }, + { + "epoch": 0.66, + "grad_norm": 0.3468731094237857, + "learning_rate": 5.4158803470807875e-06, + "loss": 0.2796, + "step": 11523 + }, + { + "epoch": 0.66, + "grad_norm": 0.25427661796028395, + "learning_rate": 5.414226559331375e-06, + "loss": 0.1582, + "step": 11524 + }, + { + "epoch": 0.66, + "grad_norm": 0.28924380253126614, + "learning_rate": 5.4125729303871e-06, + "loss": 0.229, + "step": 11525 + }, + { + "epoch": 0.66, + "grad_norm": 0.371977089760644, + "learning_rate": 5.410919460305226e-06, + "loss": 0.2852, + "step": 11526 + }, + { + "epoch": 0.66, + "grad_norm": 0.7166682064653169, + "learning_rate": 5.409266149143011e-06, + "loss": 0.4062, + "step": 11527 + }, + { + "epoch": 0.66, + "grad_norm": 0.5786444654582379, + "learning_rate": 5.407612996957716e-06, + "loss": 0.3038, + "step": 11528 + }, + { + "epoch": 0.66, + "grad_norm": 0.34804236614724643, + "learning_rate": 5.405960003806585e-06, + "loss": 0.2652, + "step": 11529 + }, + { + "epoch": 0.66, + "grad_norm": 0.22885662878947813, + "learning_rate": 5.4043071697468604e-06, + "loss": 0.2186, + "step": 11530 + }, + { + "epoch": 0.66, + "grad_norm": 0.6317423866420934, + "learning_rate": 5.4026544948357795e-06, + "loss": 0.2647, + "step": 11531 + }, + { + "epoch": 0.66, + "grad_norm": 0.3832186602134028, + "learning_rate": 5.401001979130578e-06, + "loss": 0.273, + "step": 11532 + }, + { + "epoch": 0.66, + "grad_norm": 0.46543784414296346, + "learning_rate": 5.399349622688479e-06, + "loss": 0.3423, + "step": 11533 + }, + { + "epoch": 0.66, + "grad_norm": 0.444348249465614, + "learning_rate": 5.397697425566707e-06, + "loss": 0.2682, + "step": 11534 + }, + { + "epoch": 0.66, + "grad_norm": 0.32841907606679893, + "learning_rate": 5.396045387822474e-06, + "loss": 0.2644, + "step": 11535 + }, + { + "epoch": 0.66, + "grad_norm": 0.29513111111681184, + "learning_rate": 5.394393509512987e-06, + "loss": 0.1942, + "step": 11536 + }, + { + "epoch": 0.66, + "grad_norm": 0.4622638865061429, + "learning_rate": 5.392741790695459e-06, + "loss": 0.346, + "step": 11537 + }, + { + "epoch": 0.66, + "grad_norm": 0.2652974325826972, + "learning_rate": 5.391090231427086e-06, + "loss": 0.2254, + "step": 11538 + }, + { + "epoch": 0.66, + "grad_norm": 0.8429674842171873, + "learning_rate": 5.389438831765059e-06, + "loss": 0.4517, + "step": 11539 + }, + { + "epoch": 0.66, + "grad_norm": 1.2820194430463543, + "learning_rate": 5.387787591766562e-06, + "loss": 0.8453, + "step": 11540 + }, + { + "epoch": 0.66, + "grad_norm": 0.25818239711603863, + "learning_rate": 5.386136511488789e-06, + "loss": 0.211, + "step": 11541 + }, + { + "epoch": 0.66, + "grad_norm": 0.4809465676464306, + "learning_rate": 5.384485590988908e-06, + "loss": 0.3567, + "step": 11542 + }, + { + "epoch": 0.66, + "grad_norm": 0.5760262213588997, + "learning_rate": 5.382834830324093e-06, + "loss": 0.2892, + "step": 11543 + }, + { + "epoch": 0.66, + "grad_norm": 0.2266235624622428, + "learning_rate": 5.381184229551506e-06, + "loss": 0.1549, + "step": 11544 + }, + { + "epoch": 0.66, + "grad_norm": 0.5227017467399918, + "learning_rate": 5.379533788728313e-06, + "loss": 0.3697, + "step": 11545 + }, + { + "epoch": 0.66, + "grad_norm": 0.48189898826149535, + "learning_rate": 5.377883507911668e-06, + "loss": 0.4144, + "step": 11546 + }, + { + "epoch": 0.66, + "grad_norm": 0.2966848658482328, + "learning_rate": 5.376233387158722e-06, + "loss": 0.1897, + "step": 11547 + }, + { + "epoch": 0.66, + "grad_norm": 0.6944979318698845, + "learning_rate": 5.3745834265266054e-06, + "loss": 0.4308, + "step": 11548 + }, + { + "epoch": 0.66, + "grad_norm": 0.2628428496235638, + "learning_rate": 5.372933626072472e-06, + "loss": 0.2565, + "step": 11549 + }, + { + "epoch": 0.66, + "grad_norm": 0.3982433001251939, + "learning_rate": 5.371283985853446e-06, + "loss": 0.2565, + "step": 11550 + }, + { + "epoch": 0.66, + "grad_norm": 0.31675267485780206, + "learning_rate": 5.369634505926658e-06, + "loss": 0.1933, + "step": 11551 + }, + { + "epoch": 0.66, + "grad_norm": 1.1224053933698808, + "learning_rate": 5.367985186349223e-06, + "loss": 0.7319, + "step": 11552 + }, + { + "epoch": 0.66, + "grad_norm": 0.3152515760305445, + "learning_rate": 5.3663360271782675e-06, + "loss": 0.2678, + "step": 11553 + }, + { + "epoch": 0.66, + "grad_norm": 0.3356411560582424, + "learning_rate": 5.364687028470894e-06, + "loss": 0.2476, + "step": 11554 + }, + { + "epoch": 0.66, + "grad_norm": 0.6719810096510042, + "learning_rate": 5.363038190284211e-06, + "loss": 0.4357, + "step": 11555 + }, + { + "epoch": 0.66, + "grad_norm": 0.2516818101597573, + "learning_rate": 5.36138951267531e-06, + "loss": 0.2007, + "step": 11556 + }, + { + "epoch": 0.66, + "grad_norm": 0.2570646063899158, + "learning_rate": 5.359740995701297e-06, + "loss": 0.1896, + "step": 11557 + }, + { + "epoch": 0.66, + "grad_norm": 1.0684404877013456, + "learning_rate": 5.358092639419252e-06, + "loss": 0.7644, + "step": 11558 + }, + { + "epoch": 0.66, + "grad_norm": 0.313824020741263, + "learning_rate": 5.356444443886262e-06, + "loss": 0.2709, + "step": 11559 + }, + { + "epoch": 0.66, + "grad_norm": 0.7877441786641255, + "learning_rate": 5.3547964091593955e-06, + "loss": 0.3024, + "step": 11560 + }, + { + "epoch": 0.66, + "grad_norm": 0.3567833071601899, + "learning_rate": 5.353148535295733e-06, + "loss": 0.3206, + "step": 11561 + }, + { + "epoch": 0.66, + "grad_norm": 0.24311132372342875, + "learning_rate": 5.351500822352338e-06, + "loss": 0.1998, + "step": 11562 + }, + { + "epoch": 0.66, + "grad_norm": 0.4527933293358373, + "learning_rate": 5.3498532703862685e-06, + "loss": 0.2622, + "step": 11563 + }, + { + "epoch": 0.66, + "grad_norm": 0.4517449777092792, + "learning_rate": 5.34820587945458e-06, + "loss": 0.2963, + "step": 11564 + }, + { + "epoch": 0.66, + "grad_norm": 0.4205026001874601, + "learning_rate": 5.34655864961432e-06, + "loss": 0.2904, + "step": 11565 + }, + { + "epoch": 0.66, + "grad_norm": 0.5018741672324546, + "learning_rate": 5.344911580922536e-06, + "loss": 0.3114, + "step": 11566 + }, + { + "epoch": 0.66, + "grad_norm": 0.3871254278340785, + "learning_rate": 5.343264673436264e-06, + "loss": 0.1549, + "step": 11567 + }, + { + "epoch": 0.66, + "grad_norm": 0.392704311459362, + "learning_rate": 5.341617927212537e-06, + "loss": 0.2629, + "step": 11568 + }, + { + "epoch": 0.66, + "grad_norm": 0.2654887678301144, + "learning_rate": 5.339971342308377e-06, + "loss": 0.2367, + "step": 11569 + }, + { + "epoch": 0.66, + "grad_norm": 0.9922960293520513, + "learning_rate": 5.33832491878081e-06, + "loss": 0.5467, + "step": 11570 + }, + { + "epoch": 0.66, + "grad_norm": 0.41226520432516683, + "learning_rate": 5.3366786566868545e-06, + "loss": 0.2878, + "step": 11571 + }, + { + "epoch": 0.66, + "grad_norm": 0.4855856005335587, + "learning_rate": 5.335032556083515e-06, + "loss": 0.3625, + "step": 11572 + }, + { + "epoch": 0.66, + "grad_norm": 0.37447574962277336, + "learning_rate": 5.333386617027793e-06, + "loss": 0.2506, + "step": 11573 + }, + { + "epoch": 0.66, + "grad_norm": 0.32366398711854405, + "learning_rate": 5.331740839576697e-06, + "loss": 0.2546, + "step": 11574 + }, + { + "epoch": 0.67, + "grad_norm": 0.2625217824484109, + "learning_rate": 5.330095223787214e-06, + "loss": 0.1951, + "step": 11575 + }, + { + "epoch": 0.67, + "grad_norm": 0.779074955761385, + "learning_rate": 5.3284497697163325e-06, + "loss": 0.5442, + "step": 11576 + }, + { + "epoch": 0.67, + "grad_norm": 0.27009579975714293, + "learning_rate": 5.326804477421035e-06, + "loss": 0.2266, + "step": 11577 + }, + { + "epoch": 0.67, + "grad_norm": 0.42870519581357974, + "learning_rate": 5.325159346958293e-06, + "loss": 0.3498, + "step": 11578 + }, + { + "epoch": 0.67, + "grad_norm": 1.0770530022845466, + "learning_rate": 5.323514378385086e-06, + "loss": 0.5653, + "step": 11579 + }, + { + "epoch": 0.67, + "grad_norm": 0.25141483196515735, + "learning_rate": 5.321869571758375e-06, + "loss": 0.1638, + "step": 11580 + }, + { + "epoch": 0.67, + "grad_norm": 0.31187570399237746, + "learning_rate": 5.32022492713512e-06, + "loss": 0.2445, + "step": 11581 + }, + { + "epoch": 0.67, + "grad_norm": 0.7460901109990338, + "learning_rate": 5.318580444572276e-06, + "loss": 0.4426, + "step": 11582 + }, + { + "epoch": 0.67, + "grad_norm": 0.43048688929882034, + "learning_rate": 5.316936124126788e-06, + "loss": 0.2262, + "step": 11583 + }, + { + "epoch": 0.67, + "grad_norm": 0.40015207220498034, + "learning_rate": 5.3152919658556e-06, + "loss": 0.3021, + "step": 11584 + }, + { + "epoch": 0.67, + "grad_norm": 0.3664291203534989, + "learning_rate": 5.313647969815647e-06, + "loss": 0.2887, + "step": 11585 + }, + { + "epoch": 0.67, + "grad_norm": 0.351670785679463, + "learning_rate": 5.312004136063866e-06, + "loss": 0.184, + "step": 11586 + }, + { + "epoch": 0.67, + "grad_norm": 0.31282841073476086, + "learning_rate": 5.310360464657183e-06, + "loss": 0.2668, + "step": 11587 + }, + { + "epoch": 0.67, + "grad_norm": 0.9626155482266147, + "learning_rate": 5.308716955652513e-06, + "loss": 0.4174, + "step": 11588 + }, + { + "epoch": 0.67, + "grad_norm": 0.2985410671209383, + "learning_rate": 5.3070736091067734e-06, + "loss": 0.267, + "step": 11589 + }, + { + "epoch": 0.67, + "grad_norm": 0.30113331654362685, + "learning_rate": 5.30543042507687e-06, + "loss": 0.2067, + "step": 11590 + }, + { + "epoch": 0.67, + "grad_norm": 1.0546116846708513, + "learning_rate": 5.303787403619711e-06, + "loss": 0.4606, + "step": 11591 + }, + { + "epoch": 0.67, + "grad_norm": 0.3379662424665475, + "learning_rate": 5.302144544792194e-06, + "loss": 0.2209, + "step": 11592 + }, + { + "epoch": 0.67, + "grad_norm": 0.2875890580534507, + "learning_rate": 5.300501848651209e-06, + "loss": 0.244, + "step": 11593 + }, + { + "epoch": 0.67, + "grad_norm": 1.0134444255063182, + "learning_rate": 5.298859315253639e-06, + "loss": 0.3835, + "step": 11594 + }, + { + "epoch": 0.67, + "grad_norm": 0.41250760331534936, + "learning_rate": 5.297216944656371e-06, + "loss": 0.3064, + "step": 11595 + }, + { + "epoch": 0.67, + "grad_norm": 0.30618938735708207, + "learning_rate": 5.29557473691628e-06, + "loss": 0.1658, + "step": 11596 + }, + { + "epoch": 0.67, + "grad_norm": 0.3628548944099711, + "learning_rate": 5.293932692090233e-06, + "loss": 0.3127, + "step": 11597 + }, + { + "epoch": 0.67, + "grad_norm": 0.3740872155732483, + "learning_rate": 5.292290810235092e-06, + "loss": 0.2613, + "step": 11598 + }, + { + "epoch": 0.67, + "grad_norm": 0.5642476827593211, + "learning_rate": 5.29064909140772e-06, + "loss": 0.2667, + "step": 11599 + }, + { + "epoch": 0.67, + "grad_norm": 0.5899163594087162, + "learning_rate": 5.289007535664967e-06, + "loss": 0.3414, + "step": 11600 + }, + { + "epoch": 0.67, + "grad_norm": 0.40825253228739095, + "learning_rate": 5.287366143063682e-06, + "loss": 0.3243, + "step": 11601 + }, + { + "epoch": 0.67, + "grad_norm": 0.2748846662378343, + "learning_rate": 5.285724913660704e-06, + "loss": 0.2326, + "step": 11602 + }, + { + "epoch": 0.67, + "grad_norm": 0.43119599601784436, + "learning_rate": 5.284083847512866e-06, + "loss": 0.1601, + "step": 11603 + }, + { + "epoch": 0.67, + "grad_norm": 0.8477040777808765, + "learning_rate": 5.282442944677005e-06, + "loss": 0.4426, + "step": 11604 + }, + { + "epoch": 0.67, + "grad_norm": 0.27256531636671766, + "learning_rate": 5.280802205209943e-06, + "loss": 0.2666, + "step": 11605 + }, + { + "epoch": 0.67, + "grad_norm": 0.5546628272888353, + "learning_rate": 5.279161629168497e-06, + "loss": 0.3214, + "step": 11606 + }, + { + "epoch": 0.67, + "grad_norm": 0.5307811121875399, + "learning_rate": 5.2775212166094755e-06, + "loss": 0.3207, + "step": 11607 + }, + { + "epoch": 0.67, + "grad_norm": 0.2547089293698558, + "learning_rate": 5.275880967589697e-06, + "loss": 0.2159, + "step": 11608 + }, + { + "epoch": 0.67, + "grad_norm": 0.3482597708001019, + "learning_rate": 5.274240882165958e-06, + "loss": 0.2253, + "step": 11609 + }, + { + "epoch": 0.67, + "grad_norm": 0.5292646057639918, + "learning_rate": 5.272600960395051e-06, + "loss": 0.283, + "step": 11610 + }, + { + "epoch": 0.67, + "grad_norm": 0.3723792760127392, + "learning_rate": 5.270961202333769e-06, + "loss": 0.292, + "step": 11611 + }, + { + "epoch": 0.67, + "grad_norm": 0.7733747358805955, + "learning_rate": 5.2693216080388984e-06, + "loss": 0.4955, + "step": 11612 + }, + { + "epoch": 0.67, + "grad_norm": 0.29065692377208907, + "learning_rate": 5.267682177567219e-06, + "loss": 0.2262, + "step": 11613 + }, + { + "epoch": 0.67, + "grad_norm": 0.37946091941589666, + "learning_rate": 5.266042910975501e-06, + "loss": 0.3012, + "step": 11614 + }, + { + "epoch": 0.67, + "grad_norm": 0.28954371908158133, + "learning_rate": 5.264403808320514e-06, + "loss": 0.1924, + "step": 11615 + }, + { + "epoch": 0.67, + "grad_norm": 0.36618728513841026, + "learning_rate": 5.26276486965902e-06, + "loss": 0.2213, + "step": 11616 + }, + { + "epoch": 0.67, + "grad_norm": 0.3013556099336767, + "learning_rate": 5.261126095047774e-06, + "loss": 0.2854, + "step": 11617 + }, + { + "epoch": 0.67, + "grad_norm": 0.6303827060308433, + "learning_rate": 5.259487484543528e-06, + "loss": 0.4274, + "step": 11618 + }, + { + "epoch": 0.67, + "grad_norm": 0.3363842194928553, + "learning_rate": 5.257849038203022e-06, + "loss": 0.1905, + "step": 11619 + }, + { + "epoch": 0.67, + "grad_norm": 0.2881946093992692, + "learning_rate": 5.256210756083004e-06, + "loss": 0.2326, + "step": 11620 + }, + { + "epoch": 0.67, + "grad_norm": 0.2732752542283715, + "learning_rate": 5.254572638240204e-06, + "loss": 0.2327, + "step": 11621 + }, + { + "epoch": 0.67, + "grad_norm": 0.6126870326894789, + "learning_rate": 5.252934684731349e-06, + "loss": 0.2707, + "step": 11622 + }, + { + "epoch": 0.67, + "grad_norm": 0.38004426464239194, + "learning_rate": 5.251296895613158e-06, + "loss": 0.2675, + "step": 11623 + }, + { + "epoch": 0.67, + "grad_norm": 0.3487591686090327, + "learning_rate": 5.249659270942355e-06, + "loss": 0.3091, + "step": 11624 + }, + { + "epoch": 0.67, + "grad_norm": 0.809507380773592, + "learning_rate": 5.248021810775647e-06, + "loss": 0.4803, + "step": 11625 + }, + { + "epoch": 0.67, + "grad_norm": 0.24443885048236402, + "learning_rate": 5.2463845151697404e-06, + "loss": 0.1772, + "step": 11626 + }, + { + "epoch": 0.67, + "grad_norm": 0.4368490630538348, + "learning_rate": 5.2447473841813335e-06, + "loss": 0.2457, + "step": 11627 + }, + { + "epoch": 0.67, + "grad_norm": 0.4995512757478243, + "learning_rate": 5.243110417867117e-06, + "loss": 0.3012, + "step": 11628 + }, + { + "epoch": 0.67, + "grad_norm": 0.29444216072671153, + "learning_rate": 5.241473616283783e-06, + "loss": 0.2354, + "step": 11629 + }, + { + "epoch": 0.67, + "grad_norm": 1.4348647745786949, + "learning_rate": 5.239836979488015e-06, + "loss": 0.7923, + "step": 11630 + }, + { + "epoch": 0.67, + "grad_norm": 0.7504570213240344, + "learning_rate": 5.238200507536488e-06, + "loss": 0.4943, + "step": 11631 + }, + { + "epoch": 0.67, + "grad_norm": 0.31611857707224705, + "learning_rate": 5.23656420048587e-06, + "loss": 0.2007, + "step": 11632 + }, + { + "epoch": 0.67, + "grad_norm": 0.2280572315656847, + "learning_rate": 5.23492805839283e-06, + "loss": 0.2107, + "step": 11633 + }, + { + "epoch": 0.67, + "grad_norm": 0.5290475222067625, + "learning_rate": 5.233292081314027e-06, + "loss": 0.3361, + "step": 11634 + }, + { + "epoch": 0.67, + "grad_norm": 0.33595680323497873, + "learning_rate": 5.231656269306116e-06, + "loss": 0.2121, + "step": 11635 + }, + { + "epoch": 0.67, + "grad_norm": 0.34149735897754147, + "learning_rate": 5.230020622425738e-06, + "loss": 0.2989, + "step": 11636 + }, + { + "epoch": 0.67, + "grad_norm": 0.5854348227990583, + "learning_rate": 5.228385140729545e-06, + "loss": 0.4182, + "step": 11637 + }, + { + "epoch": 0.67, + "grad_norm": 0.37979442690169474, + "learning_rate": 5.226749824274169e-06, + "loss": 0.2815, + "step": 11638 + }, + { + "epoch": 0.67, + "grad_norm": 0.4566569153966813, + "learning_rate": 5.225114673116243e-06, + "loss": 0.239, + "step": 11639 + }, + { + "epoch": 0.67, + "grad_norm": 0.2744528609459148, + "learning_rate": 5.223479687312388e-06, + "loss": 0.2327, + "step": 11640 + }, + { + "epoch": 0.67, + "grad_norm": 0.36138822589171504, + "learning_rate": 5.2218448669192235e-06, + "loss": 0.2638, + "step": 11641 + }, + { + "epoch": 0.67, + "grad_norm": 0.8235685157243273, + "learning_rate": 5.220210211993371e-06, + "loss": 0.457, + "step": 11642 + }, + { + "epoch": 0.67, + "grad_norm": 1.145506050031028, + "learning_rate": 5.21857572259143e-06, + "loss": 0.6852, + "step": 11643 + }, + { + "epoch": 0.67, + "grad_norm": 0.32843776235279604, + "learning_rate": 5.216941398770009e-06, + "loss": 0.2588, + "step": 11644 + }, + { + "epoch": 0.67, + "grad_norm": 0.33328103317540847, + "learning_rate": 5.215307240585696e-06, + "loss": 0.2655, + "step": 11645 + }, + { + "epoch": 0.67, + "grad_norm": 0.34161222318740736, + "learning_rate": 5.213673248095092e-06, + "loss": 0.2008, + "step": 11646 + }, + { + "epoch": 0.67, + "grad_norm": 0.34405362854914645, + "learning_rate": 5.212039421354779e-06, + "loss": 0.2812, + "step": 11647 + }, + { + "epoch": 0.67, + "grad_norm": 0.3367695864863821, + "learning_rate": 5.2104057604213335e-06, + "loss": 0.2562, + "step": 11648 + }, + { + "epoch": 0.67, + "grad_norm": 0.7079692534150634, + "learning_rate": 5.208772265351332e-06, + "loss": 0.426, + "step": 11649 + }, + { + "epoch": 0.67, + "grad_norm": 0.35211947295926666, + "learning_rate": 5.207138936201339e-06, + "loss": 0.2587, + "step": 11650 + }, + { + "epoch": 0.67, + "grad_norm": 0.6453440156091795, + "learning_rate": 5.205505773027919e-06, + "loss": 0.3904, + "step": 11651 + }, + { + "epoch": 0.67, + "grad_norm": 0.2491559057765381, + "learning_rate": 5.203872775887628e-06, + "loss": 0.1939, + "step": 11652 + }, + { + "epoch": 0.67, + "grad_norm": 0.362328629160573, + "learning_rate": 5.202239944837013e-06, + "loss": 0.283, + "step": 11653 + }, + { + "epoch": 0.67, + "grad_norm": 0.4889478459733575, + "learning_rate": 5.200607279932626e-06, + "loss": 0.3449, + "step": 11654 + }, + { + "epoch": 0.67, + "grad_norm": 0.6228592787338032, + "learning_rate": 5.198974781231003e-06, + "loss": 0.308, + "step": 11655 + }, + { + "epoch": 0.67, + "grad_norm": 0.30679920540472044, + "learning_rate": 5.197342448788676e-06, + "loss": 0.2397, + "step": 11656 + }, + { + "epoch": 0.67, + "grad_norm": 0.3359198585706779, + "learning_rate": 5.19571028266217e-06, + "loss": 0.3035, + "step": 11657 + }, + { + "epoch": 0.67, + "grad_norm": 0.22988331423007832, + "learning_rate": 5.194078282908015e-06, + "loss": 0.0879, + "step": 11658 + }, + { + "epoch": 0.67, + "grad_norm": 0.32083574318893576, + "learning_rate": 5.192446449582722e-06, + "loss": 0.2683, + "step": 11659 + }, + { + "epoch": 0.67, + "grad_norm": 0.37311945503442323, + "learning_rate": 5.190814782742801e-06, + "loss": 0.3295, + "step": 11660 + }, + { + "epoch": 0.67, + "grad_norm": 0.9174082847992806, + "learning_rate": 5.1891832824447545e-06, + "loss": 0.3407, + "step": 11661 + }, + { + "epoch": 0.67, + "grad_norm": 0.311522227954505, + "learning_rate": 5.1875519487450865e-06, + "loss": 0.2712, + "step": 11662 + }, + { + "epoch": 0.67, + "grad_norm": 0.7068267725620065, + "learning_rate": 5.185920781700288e-06, + "loss": 0.4194, + "step": 11663 + }, + { + "epoch": 0.67, + "grad_norm": 0.3690138293865834, + "learning_rate": 5.184289781366847e-06, + "loss": 0.2892, + "step": 11664 + }, + { + "epoch": 0.67, + "grad_norm": 0.2054627742441291, + "learning_rate": 5.182658947801242e-06, + "loss": 0.1386, + "step": 11665 + }, + { + "epoch": 0.67, + "grad_norm": 1.2891895203008483, + "learning_rate": 5.1810282810599475e-06, + "loss": 0.6783, + "step": 11666 + }, + { + "epoch": 0.67, + "grad_norm": 0.8395194551653319, + "learning_rate": 5.17939778119944e-06, + "loss": 0.412, + "step": 11667 + }, + { + "epoch": 0.67, + "grad_norm": 0.2798409401034943, + "learning_rate": 5.1777674482761805e-06, + "loss": 0.2069, + "step": 11668 + }, + { + "epoch": 0.67, + "grad_norm": 0.44920481661776707, + "learning_rate": 5.176137282346627e-06, + "loss": 0.3641, + "step": 11669 + }, + { + "epoch": 0.67, + "grad_norm": 0.29918314203834345, + "learning_rate": 5.174507283467228e-06, + "loss": 0.1879, + "step": 11670 + }, + { + "epoch": 0.67, + "grad_norm": 0.32543040157224057, + "learning_rate": 5.172877451694438e-06, + "loss": 0.197, + "step": 11671 + }, + { + "epoch": 0.67, + "grad_norm": 0.36804171214608106, + "learning_rate": 5.171247787084694e-06, + "loss": 0.3286, + "step": 11672 + }, + { + "epoch": 0.67, + "grad_norm": 0.9862885552626823, + "learning_rate": 5.169618289694432e-06, + "loss": 0.3905, + "step": 11673 + }, + { + "epoch": 0.67, + "grad_norm": 0.2984786136741482, + "learning_rate": 5.167988959580077e-06, + "loss": 0.1931, + "step": 11674 + }, + { + "epoch": 0.67, + "grad_norm": 1.0670030322829498, + "learning_rate": 5.16635979679806e-06, + "loss": 0.5376, + "step": 11675 + }, + { + "epoch": 0.67, + "grad_norm": 0.34866352029633485, + "learning_rate": 5.1647308014047955e-06, + "loss": 0.2992, + "step": 11676 + }, + { + "epoch": 0.67, + "grad_norm": 0.33441485196590653, + "learning_rate": 5.163101973456696e-06, + "loss": 0.2794, + "step": 11677 + }, + { + "epoch": 0.67, + "grad_norm": 0.2906375868270081, + "learning_rate": 5.161473313010162e-06, + "loss": 0.15, + "step": 11678 + }, + { + "epoch": 0.67, + "grad_norm": 0.9677833792997231, + "learning_rate": 5.159844820121605e-06, + "loss": 0.4, + "step": 11679 + }, + { + "epoch": 0.67, + "grad_norm": 0.2778530544704572, + "learning_rate": 5.158216494847412e-06, + "loss": 0.2421, + "step": 11680 + }, + { + "epoch": 0.67, + "grad_norm": 0.44108767628976453, + "learning_rate": 5.156588337243974e-06, + "loss": 0.2371, + "step": 11681 + }, + { + "epoch": 0.67, + "grad_norm": 0.9779485327404348, + "learning_rate": 5.154960347367675e-06, + "loss": 0.4578, + "step": 11682 + }, + { + "epoch": 0.67, + "grad_norm": 0.27024184345027874, + "learning_rate": 5.153332525274888e-06, + "loss": 0.225, + "step": 11683 + }, + { + "epoch": 0.67, + "grad_norm": 0.3015393644257497, + "learning_rate": 5.1517048710219895e-06, + "loss": 0.2489, + "step": 11684 + }, + { + "epoch": 0.67, + "grad_norm": 0.8142713116682633, + "learning_rate": 5.150077384665342e-06, + "loss": 0.397, + "step": 11685 + }, + { + "epoch": 0.67, + "grad_norm": 0.333100189552544, + "learning_rate": 5.148450066261303e-06, + "loss": 0.2676, + "step": 11686 + }, + { + "epoch": 0.67, + "grad_norm": 0.4376687450485571, + "learning_rate": 5.146822915866232e-06, + "loss": 0.24, + "step": 11687 + }, + { + "epoch": 0.67, + "grad_norm": 0.3483112994923384, + "learning_rate": 5.145195933536476e-06, + "loss": 0.2511, + "step": 11688 + }, + { + "epoch": 0.67, + "grad_norm": 0.4132548196316583, + "learning_rate": 5.143569119328376e-06, + "loss": 0.2765, + "step": 11689 + }, + { + "epoch": 0.67, + "grad_norm": 0.7860273994083556, + "learning_rate": 5.141942473298264e-06, + "loss": 0.2614, + "step": 11690 + }, + { + "epoch": 0.67, + "grad_norm": 0.3396154895768527, + "learning_rate": 5.140315995502478e-06, + "loss": 0.2806, + "step": 11691 + }, + { + "epoch": 0.67, + "grad_norm": 0.4005484919825302, + "learning_rate": 5.1386896859973425e-06, + "loss": 0.2488, + "step": 11692 + }, + { + "epoch": 0.67, + "grad_norm": 0.33537309338762605, + "learning_rate": 5.1370635448391736e-06, + "loss": 0.2494, + "step": 11693 + }, + { + "epoch": 0.67, + "grad_norm": 1.2551963101728314, + "learning_rate": 5.135437572084284e-06, + "loss": 0.2223, + "step": 11694 + }, + { + "epoch": 0.67, + "grad_norm": 0.39248745343059227, + "learning_rate": 5.133811767788979e-06, + "loss": 0.2952, + "step": 11695 + }, + { + "epoch": 0.67, + "grad_norm": 0.29648525882978316, + "learning_rate": 5.132186132009567e-06, + "loss": 0.2726, + "step": 11696 + }, + { + "epoch": 0.67, + "grad_norm": 0.9013794775813733, + "learning_rate": 5.13056066480234e-06, + "loss": 0.357, + "step": 11697 + }, + { + "epoch": 0.67, + "grad_norm": 0.34343934566226364, + "learning_rate": 5.128935366223588e-06, + "loss": 0.2625, + "step": 11698 + }, + { + "epoch": 0.67, + "grad_norm": 0.2714716915290179, + "learning_rate": 5.12731023632959e-06, + "loss": 0.1737, + "step": 11699 + }, + { + "epoch": 0.67, + "grad_norm": 0.3910270132526932, + "learning_rate": 5.125685275176633e-06, + "loss": 0.3165, + "step": 11700 + }, + { + "epoch": 0.67, + "grad_norm": 0.35071460165870216, + "learning_rate": 5.124060482820986e-06, + "loss": 0.1882, + "step": 11701 + }, + { + "epoch": 0.67, + "grad_norm": 0.9215729327737139, + "learning_rate": 5.122435859318915e-06, + "loss": 0.4102, + "step": 11702 + }, + { + "epoch": 0.67, + "grad_norm": 0.45771396649945395, + "learning_rate": 5.120811404726675e-06, + "loss": 0.3709, + "step": 11703 + }, + { + "epoch": 0.67, + "grad_norm": 0.27670028601173474, + "learning_rate": 5.119187119100533e-06, + "loss": 0.2096, + "step": 11704 + }, + { + "epoch": 0.67, + "grad_norm": 0.26230396682039764, + "learning_rate": 5.117563002496728e-06, + "loss": 0.168, + "step": 11705 + }, + { + "epoch": 0.67, + "grad_norm": 1.0274803578898541, + "learning_rate": 5.115939054971508e-06, + "loss": 0.4535, + "step": 11706 + }, + { + "epoch": 0.67, + "grad_norm": 0.6142974067442903, + "learning_rate": 5.114315276581108e-06, + "loss": 0.2165, + "step": 11707 + }, + { + "epoch": 0.67, + "grad_norm": 0.2637060378536606, + "learning_rate": 5.1126916673817575e-06, + "loss": 0.2753, + "step": 11708 + }, + { + "epoch": 0.67, + "grad_norm": 1.119474613305846, + "learning_rate": 5.111068227429686e-06, + "loss": 0.6991, + "step": 11709 + }, + { + "epoch": 0.67, + "grad_norm": 0.3746375472716624, + "learning_rate": 5.109444956781113e-06, + "loss": 0.1687, + "step": 11710 + }, + { + "epoch": 0.67, + "grad_norm": 0.26177824937585215, + "learning_rate": 5.10782185549225e-06, + "loss": 0.2256, + "step": 11711 + }, + { + "epoch": 0.67, + "grad_norm": 0.48526546688989247, + "learning_rate": 5.106198923619302e-06, + "loss": 0.3127, + "step": 11712 + }, + { + "epoch": 0.67, + "grad_norm": 0.6618222141302967, + "learning_rate": 5.10457616121848e-06, + "loss": 0.2913, + "step": 11713 + }, + { + "epoch": 0.67, + "grad_norm": 0.45600963586374926, + "learning_rate": 5.102953568345973e-06, + "loss": 0.2749, + "step": 11714 + }, + { + "epoch": 0.67, + "grad_norm": 0.4655809092972486, + "learning_rate": 5.101331145057975e-06, + "loss": 0.3785, + "step": 11715 + }, + { + "epoch": 0.67, + "grad_norm": 0.36798216280033647, + "learning_rate": 5.0997088914106685e-06, + "loss": 0.2713, + "step": 11716 + }, + { + "epoch": 0.67, + "grad_norm": 0.21140015509665294, + "learning_rate": 5.098086807460232e-06, + "loss": 0.1602, + "step": 11717 + }, + { + "epoch": 0.67, + "grad_norm": 0.7371500660216942, + "learning_rate": 5.096464893262838e-06, + "loss": 0.3948, + "step": 11718 + }, + { + "epoch": 0.67, + "grad_norm": 0.3635716936363465, + "learning_rate": 5.094843148874654e-06, + "loss": 0.2994, + "step": 11719 + }, + { + "epoch": 0.67, + "grad_norm": 0.3033816677570989, + "learning_rate": 5.0932215743518375e-06, + "loss": 0.2358, + "step": 11720 + }, + { + "epoch": 0.67, + "grad_norm": 1.0560147653471157, + "learning_rate": 5.0916001697505506e-06, + "loss": 0.6911, + "step": 11721 + }, + { + "epoch": 0.67, + "grad_norm": 0.3864890302897584, + "learning_rate": 5.089978935126939e-06, + "loss": 0.2657, + "step": 11722 + }, + { + "epoch": 0.67, + "grad_norm": 0.22419003265245574, + "learning_rate": 5.088357870537146e-06, + "loss": 0.1348, + "step": 11723 + }, + { + "epoch": 0.67, + "grad_norm": 0.3287953682638831, + "learning_rate": 5.086736976037304e-06, + "loss": 0.3103, + "step": 11724 + }, + { + "epoch": 0.67, + "grad_norm": 0.7126230773256188, + "learning_rate": 5.085116251683554e-06, + "loss": 0.3805, + "step": 11725 + }, + { + "epoch": 0.67, + "grad_norm": 0.34080945340313146, + "learning_rate": 5.083495697532016e-06, + "loss": 0.2898, + "step": 11726 + }, + { + "epoch": 0.67, + "grad_norm": 0.32289359281894736, + "learning_rate": 5.081875313638811e-06, + "loss": 0.2766, + "step": 11727 + }, + { + "epoch": 0.67, + "grad_norm": 0.7495934803198708, + "learning_rate": 5.080255100060048e-06, + "loss": 0.3571, + "step": 11728 + }, + { + "epoch": 0.67, + "grad_norm": 0.2572948506856289, + "learning_rate": 5.078635056851844e-06, + "loss": 0.2007, + "step": 11729 + }, + { + "epoch": 0.67, + "grad_norm": 0.5697537663129776, + "learning_rate": 5.077015184070296e-06, + "loss": 0.2481, + "step": 11730 + }, + { + "epoch": 0.67, + "grad_norm": 0.33024815771221205, + "learning_rate": 5.075395481771501e-06, + "loss": 0.2487, + "step": 11731 + }, + { + "epoch": 0.67, + "grad_norm": 0.29132435493008624, + "learning_rate": 5.073775950011548e-06, + "loss": 0.2737, + "step": 11732 + }, + { + "epoch": 0.67, + "grad_norm": 1.1874298610861767, + "learning_rate": 5.072156588846519e-06, + "loss": 0.5421, + "step": 11733 + }, + { + "epoch": 0.67, + "grad_norm": 0.5883251543734425, + "learning_rate": 5.070537398332498e-06, + "loss": 0.3345, + "step": 11734 + }, + { + "epoch": 0.67, + "grad_norm": 0.27343202258408916, + "learning_rate": 5.068918378525555e-06, + "loss": 0.2571, + "step": 11735 + }, + { + "epoch": 0.67, + "grad_norm": 0.6437468881803801, + "learning_rate": 5.067299529481758e-06, + "loss": 0.2749, + "step": 11736 + }, + { + "epoch": 0.67, + "grad_norm": 0.34159851863097196, + "learning_rate": 5.065680851257162e-06, + "loss": 0.1786, + "step": 11737 + }, + { + "epoch": 0.67, + "grad_norm": 0.37613981223297227, + "learning_rate": 5.0640623439078285e-06, + "loss": 0.2851, + "step": 11738 + }, + { + "epoch": 0.67, + "grad_norm": 0.365571260834238, + "learning_rate": 5.062444007489804e-06, + "loss": 0.2933, + "step": 11739 + }, + { + "epoch": 0.67, + "grad_norm": 0.346912717482984, + "learning_rate": 5.060825842059132e-06, + "loss": 0.1916, + "step": 11740 + }, + { + "epoch": 0.67, + "grad_norm": 0.47141640995945183, + "learning_rate": 5.059207847671845e-06, + "loss": 0.3235, + "step": 11741 + }, + { + "epoch": 0.67, + "grad_norm": 0.45849898304823045, + "learning_rate": 5.05759002438398e-06, + "loss": 0.2979, + "step": 11742 + }, + { + "epoch": 0.67, + "grad_norm": 0.25376962264594627, + "learning_rate": 5.055972372251562e-06, + "loss": 0.1864, + "step": 11743 + }, + { + "epoch": 0.67, + "grad_norm": 0.3635867761463181, + "learning_rate": 5.054354891330607e-06, + "loss": 0.2763, + "step": 11744 + }, + { + "epoch": 0.67, + "grad_norm": 1.209100047476115, + "learning_rate": 5.05273758167713e-06, + "loss": 0.7096, + "step": 11745 + }, + { + "epoch": 0.67, + "grad_norm": 0.5569931214330035, + "learning_rate": 5.051120443347134e-06, + "loss": 0.2018, + "step": 11746 + }, + { + "epoch": 0.67, + "grad_norm": 0.2892336497399014, + "learning_rate": 5.049503476396627e-06, + "loss": 0.2789, + "step": 11747 + }, + { + "epoch": 0.67, + "grad_norm": 0.43334647063447024, + "learning_rate": 5.047886680881603e-06, + "loss": 0.3457, + "step": 11748 + }, + { + "epoch": 0.68, + "grad_norm": 0.165429228208656, + "learning_rate": 5.0462700568580495e-06, + "loss": 0.0847, + "step": 11749 + }, + { + "epoch": 0.68, + "grad_norm": 0.4192615834692852, + "learning_rate": 5.044653604381952e-06, + "loss": 0.3126, + "step": 11750 + }, + { + "epoch": 0.68, + "grad_norm": 0.3438659632511523, + "learning_rate": 5.043037323509285e-06, + "loss": 0.3159, + "step": 11751 + }, + { + "epoch": 0.68, + "grad_norm": 0.6009103614653322, + "learning_rate": 5.041421214296025e-06, + "loss": 0.375, + "step": 11752 + }, + { + "epoch": 0.68, + "grad_norm": 0.32373127606272445, + "learning_rate": 5.039805276798128e-06, + "loss": 0.2451, + "step": 11753 + }, + { + "epoch": 0.68, + "grad_norm": 1.0671526753829008, + "learning_rate": 5.0381895110715676e-06, + "loss": 0.5263, + "step": 11754 + }, + { + "epoch": 0.68, + "grad_norm": 0.23259453773066574, + "learning_rate": 5.03657391717229e-06, + "loss": 0.1934, + "step": 11755 + }, + { + "epoch": 0.68, + "grad_norm": 0.3365964910715124, + "learning_rate": 5.0349584951562445e-06, + "loss": 0.2178, + "step": 11756 + }, + { + "epoch": 0.68, + "grad_norm": 1.0692943385938543, + "learning_rate": 5.033343245079373e-06, + "loss": 0.7361, + "step": 11757 + }, + { + "epoch": 0.68, + "grad_norm": 0.5576953322811902, + "learning_rate": 5.031728166997607e-06, + "loss": 0.3471, + "step": 11758 + }, + { + "epoch": 0.68, + "grad_norm": 0.2808777177692286, + "learning_rate": 5.0301132609668845e-06, + "loss": 0.2165, + "step": 11759 + }, + { + "epoch": 0.68, + "grad_norm": 0.5072273706254729, + "learning_rate": 5.028498527043126e-06, + "loss": 0.3641, + "step": 11760 + }, + { + "epoch": 0.68, + "grad_norm": 0.2782055835137195, + "learning_rate": 5.026883965282252e-06, + "loss": 0.1837, + "step": 11761 + }, + { + "epoch": 0.68, + "grad_norm": 0.322179200138884, + "learning_rate": 5.025269575740166e-06, + "loss": 0.222, + "step": 11762 + }, + { + "epoch": 0.68, + "grad_norm": 0.3592214488272502, + "learning_rate": 5.023655358472786e-06, + "loss": 0.3299, + "step": 11763 + }, + { + "epoch": 0.68, + "grad_norm": 0.6839481155341293, + "learning_rate": 5.022041313536006e-06, + "loss": 0.4145, + "step": 11764 + }, + { + "epoch": 0.68, + "grad_norm": 0.302911265362752, + "learning_rate": 5.020427440985721e-06, + "loss": 0.2343, + "step": 11765 + }, + { + "epoch": 0.68, + "grad_norm": 1.2681543614082758, + "learning_rate": 5.018813740877817e-06, + "loss": 0.234, + "step": 11766 + }, + { + "epoch": 0.68, + "grad_norm": 0.23444856503788336, + "learning_rate": 5.0172002132681815e-06, + "loss": 0.2076, + "step": 11767 + }, + { + "epoch": 0.68, + "grad_norm": 0.32344719208450956, + "learning_rate": 5.0155868582126886e-06, + "loss": 0.262, + "step": 11768 + }, + { + "epoch": 0.68, + "grad_norm": 0.8788977231247462, + "learning_rate": 5.0139736757672095e-06, + "loss": 0.3568, + "step": 11769 + }, + { + "epoch": 0.68, + "grad_norm": 0.5589684828125013, + "learning_rate": 5.012360665987607e-06, + "loss": 0.3362, + "step": 11770 + }, + { + "epoch": 0.68, + "grad_norm": 0.2517561774348396, + "learning_rate": 5.010747828929736e-06, + "loss": 0.2556, + "step": 11771 + }, + { + "epoch": 0.68, + "grad_norm": 1.1552259575630777, + "learning_rate": 5.009135164649457e-06, + "loss": 0.2947, + "step": 11772 + }, + { + "epoch": 0.68, + "grad_norm": 0.35356777969622866, + "learning_rate": 5.007522673202613e-06, + "loss": 0.2245, + "step": 11773 + }, + { + "epoch": 0.68, + "grad_norm": 0.33910329035869946, + "learning_rate": 5.005910354645043e-06, + "loss": 0.2669, + "step": 11774 + }, + { + "epoch": 0.68, + "grad_norm": 0.3495175522409411, + "learning_rate": 5.0042982090325805e-06, + "loss": 0.3104, + "step": 11775 + }, + { + "epoch": 0.68, + "grad_norm": 0.622377722237779, + "learning_rate": 5.002686236421059e-06, + "loss": 0.2137, + "step": 11776 + }, + { + "epoch": 0.68, + "grad_norm": 0.34094064564462573, + "learning_rate": 5.0010744368663e-06, + "loss": 0.2648, + "step": 11777 + }, + { + "epoch": 0.68, + "grad_norm": 1.1026202721985212, + "learning_rate": 4.999462810424116e-06, + "loss": 0.4879, + "step": 11778 + }, + { + "epoch": 0.68, + "grad_norm": 0.3465454930026479, + "learning_rate": 4.9978513571503175e-06, + "loss": 0.2476, + "step": 11779 + }, + { + "epoch": 0.68, + "grad_norm": 0.295672774198805, + "learning_rate": 4.996240077100713e-06, + "loss": 0.242, + "step": 11780 + }, + { + "epoch": 0.68, + "grad_norm": 0.6049472071811616, + "learning_rate": 4.994628970331102e-06, + "loss": 0.3999, + "step": 11781 + }, + { + "epoch": 0.68, + "grad_norm": 0.29679226403336495, + "learning_rate": 4.993018036897274e-06, + "loss": 0.1961, + "step": 11782 + }, + { + "epoch": 0.68, + "grad_norm": 0.26024884929211345, + "learning_rate": 4.991407276855016e-06, + "loss": 0.2608, + "step": 11783 + }, + { + "epoch": 0.68, + "grad_norm": 0.41637954941656796, + "learning_rate": 4.989796690260108e-06, + "loss": 0.2302, + "step": 11784 + }, + { + "epoch": 0.68, + "grad_norm": 1.0573213732138746, + "learning_rate": 4.988186277168325e-06, + "loss": 0.3642, + "step": 11785 + }, + { + "epoch": 0.68, + "grad_norm": 0.32321682893392756, + "learning_rate": 4.9865760376354365e-06, + "loss": 0.2565, + "step": 11786 + }, + { + "epoch": 0.68, + "grad_norm": 0.3490489850595347, + "learning_rate": 4.9849659717172e-06, + "loss": 0.2924, + "step": 11787 + }, + { + "epoch": 0.68, + "grad_norm": 0.4404451474161239, + "learning_rate": 4.98335607946938e-06, + "loss": 0.3545, + "step": 11788 + }, + { + "epoch": 0.68, + "grad_norm": 0.23778656116027025, + "learning_rate": 4.981746360947724e-06, + "loss": 0.1595, + "step": 11789 + }, + { + "epoch": 0.68, + "grad_norm": 1.3534561202280206, + "learning_rate": 4.980136816207974e-06, + "loss": 0.6599, + "step": 11790 + }, + { + "epoch": 0.68, + "grad_norm": 0.3459827769144183, + "learning_rate": 4.978527445305869e-06, + "loss": 0.3084, + "step": 11791 + }, + { + "epoch": 0.68, + "grad_norm": 0.2924247907379295, + "learning_rate": 4.976918248297145e-06, + "loss": 0.1902, + "step": 11792 + }, + { + "epoch": 0.68, + "grad_norm": 0.524691825002447, + "learning_rate": 4.9753092252375245e-06, + "loss": 0.396, + "step": 11793 + }, + { + "epoch": 0.68, + "grad_norm": 0.36268677838139596, + "learning_rate": 4.973700376182732e-06, + "loss": 0.3425, + "step": 11794 + }, + { + "epoch": 0.68, + "grad_norm": 0.2732204910654022, + "learning_rate": 4.972091701188478e-06, + "loss": 0.1814, + "step": 11795 + }, + { + "epoch": 0.68, + "grad_norm": 0.2662254490724855, + "learning_rate": 4.970483200310468e-06, + "loss": 0.1567, + "step": 11796 + }, + { + "epoch": 0.68, + "grad_norm": 0.6628178672031443, + "learning_rate": 4.968874873604414e-06, + "loss": 0.4117, + "step": 11797 + }, + { + "epoch": 0.68, + "grad_norm": 0.31714829171919484, + "learning_rate": 4.967266721126005e-06, + "loss": 0.1911, + "step": 11798 + }, + { + "epoch": 0.68, + "grad_norm": 0.3405285791476989, + "learning_rate": 4.965658742930934e-06, + "loss": 0.3089, + "step": 11799 + }, + { + "epoch": 0.68, + "grad_norm": 1.0963642128915825, + "learning_rate": 4.964050939074881e-06, + "loss": 0.6414, + "step": 11800 + }, + { + "epoch": 0.68, + "grad_norm": 0.2131071516902079, + "learning_rate": 4.962443309613529e-06, + "loss": 0.1658, + "step": 11801 + }, + { + "epoch": 0.68, + "grad_norm": 0.3574377659432091, + "learning_rate": 4.96083585460255e-06, + "loss": 0.2421, + "step": 11802 + }, + { + "epoch": 0.68, + "grad_norm": 0.4570436088730508, + "learning_rate": 4.95922857409761e-06, + "loss": 0.3248, + "step": 11803 + }, + { + "epoch": 0.68, + "grad_norm": 0.3235139120968666, + "learning_rate": 4.9576214681543626e-06, + "loss": 0.2579, + "step": 11804 + }, + { + "epoch": 0.68, + "grad_norm": 0.8548199042981579, + "learning_rate": 4.956014536828471e-06, + "loss": 0.3425, + "step": 11805 + }, + { + "epoch": 0.68, + "grad_norm": 0.3368131841542381, + "learning_rate": 4.954407780175578e-06, + "loss": 0.3074, + "step": 11806 + }, + { + "epoch": 0.68, + "grad_norm": 0.3103485250750534, + "learning_rate": 4.952801198251328e-06, + "loss": 0.2388, + "step": 11807 + }, + { + "epoch": 0.68, + "grad_norm": 0.2644081194182972, + "learning_rate": 4.95119479111135e-06, + "loss": 0.091, + "step": 11808 + }, + { + "epoch": 0.68, + "grad_norm": 0.811557495156051, + "learning_rate": 4.949588558811285e-06, + "loss": 0.3806, + "step": 11809 + }, + { + "epoch": 0.68, + "grad_norm": 0.4076560589755643, + "learning_rate": 4.947982501406749e-06, + "loss": 0.2817, + "step": 11810 + }, + { + "epoch": 0.68, + "grad_norm": 0.29270911302935926, + "learning_rate": 4.946376618953364e-06, + "loss": 0.2591, + "step": 11811 + }, + { + "epoch": 0.68, + "grad_norm": 0.9784414407077597, + "learning_rate": 4.944770911506739e-06, + "loss": 0.536, + "step": 11812 + }, + { + "epoch": 0.68, + "grad_norm": 0.3713203487743198, + "learning_rate": 4.9431653791224744e-06, + "loss": 0.2449, + "step": 11813 + }, + { + "epoch": 0.68, + "grad_norm": 0.26768673340053445, + "learning_rate": 4.941560021856181e-06, + "loss": 0.2029, + "step": 11814 + }, + { + "epoch": 0.68, + "grad_norm": 0.4008652783655156, + "learning_rate": 4.9399548397634455e-06, + "loss": 0.2715, + "step": 11815 + }, + { + "epoch": 0.68, + "grad_norm": 0.3935508821109515, + "learning_rate": 4.938349832899856e-06, + "loss": 0.2394, + "step": 11816 + }, + { + "epoch": 0.68, + "grad_norm": 0.49750616813772636, + "learning_rate": 4.9367450013209905e-06, + "loss": 0.3878, + "step": 11817 + }, + { + "epoch": 0.68, + "grad_norm": 0.3181744075806371, + "learning_rate": 4.935140345082436e-06, + "loss": 0.2385, + "step": 11818 + }, + { + "epoch": 0.68, + "grad_norm": 0.42534101653733136, + "learning_rate": 4.93353586423975e-06, + "loss": 0.3122, + "step": 11819 + }, + { + "epoch": 0.68, + "grad_norm": 0.27271478591220527, + "learning_rate": 4.9319315588484954e-06, + "loss": 0.2234, + "step": 11820 + }, + { + "epoch": 0.68, + "grad_norm": 0.6360729424545637, + "learning_rate": 4.930327428964235e-06, + "loss": 0.3097, + "step": 11821 + }, + { + "epoch": 0.68, + "grad_norm": 0.32253078555839093, + "learning_rate": 4.9287234746425195e-06, + "loss": 0.2473, + "step": 11822 + }, + { + "epoch": 0.68, + "grad_norm": 0.34936474152769675, + "learning_rate": 4.927119695938891e-06, + "loss": 0.3168, + "step": 11823 + }, + { + "epoch": 0.68, + "grad_norm": 0.9379810581274167, + "learning_rate": 4.925516092908891e-06, + "loss": 0.3485, + "step": 11824 + }, + { + "epoch": 0.68, + "grad_norm": 0.31432849488872344, + "learning_rate": 4.923912665608045e-06, + "loss": 0.2603, + "step": 11825 + }, + { + "epoch": 0.68, + "grad_norm": 0.4927456068221821, + "learning_rate": 4.9223094140918894e-06, + "loss": 0.2566, + "step": 11826 + }, + { + "epoch": 0.68, + "grad_norm": 0.31740940794607303, + "learning_rate": 4.920706338415941e-06, + "loss": 0.2725, + "step": 11827 + }, + { + "epoch": 0.68, + "grad_norm": 0.35754575878697037, + "learning_rate": 4.919103438635713e-06, + "loss": 0.1741, + "step": 11828 + }, + { + "epoch": 0.68, + "grad_norm": 0.4985427045345408, + "learning_rate": 4.91750071480671e-06, + "loss": 0.3825, + "step": 11829 + }, + { + "epoch": 0.68, + "grad_norm": 0.33708352874029796, + "learning_rate": 4.915898166984443e-06, + "loss": 0.2834, + "step": 11830 + }, + { + "epoch": 0.68, + "grad_norm": 0.5475661290159132, + "learning_rate": 4.914295795224404e-06, + "loss": 0.1531, + "step": 11831 + }, + { + "epoch": 0.68, + "grad_norm": 0.38665851593133643, + "learning_rate": 4.912693599582083e-06, + "loss": 0.3186, + "step": 11832 + }, + { + "epoch": 0.68, + "grad_norm": 0.2821804515532111, + "learning_rate": 4.91109158011296e-06, + "loss": 0.191, + "step": 11833 + }, + { + "epoch": 0.68, + "grad_norm": 0.2936069143811596, + "learning_rate": 4.909489736872521e-06, + "loss": 0.1984, + "step": 11834 + }, + { + "epoch": 0.68, + "grad_norm": 0.5737718239366725, + "learning_rate": 4.907888069916234e-06, + "loss": 0.317, + "step": 11835 + }, + { + "epoch": 0.68, + "grad_norm": 1.2137512039405558, + "learning_rate": 4.906286579299563e-06, + "loss": 0.6646, + "step": 11836 + }, + { + "epoch": 0.68, + "grad_norm": 0.36728729290760725, + "learning_rate": 4.904685265077969e-06, + "loss": 0.19, + "step": 11837 + }, + { + "epoch": 0.68, + "grad_norm": 0.30145925210114205, + "learning_rate": 4.903084127306901e-06, + "loss": 0.2932, + "step": 11838 + }, + { + "epoch": 0.68, + "grad_norm": 0.44813104620717403, + "learning_rate": 4.901483166041815e-06, + "loss": 0.272, + "step": 11839 + }, + { + "epoch": 0.68, + "grad_norm": 0.3188875124250454, + "learning_rate": 4.899882381338147e-06, + "loss": 0.2182, + "step": 11840 + }, + { + "epoch": 0.68, + "grad_norm": 0.3501168527999765, + "learning_rate": 4.898281773251333e-06, + "loss": 0.254, + "step": 11841 + }, + { + "epoch": 0.68, + "grad_norm": 0.3725052385902562, + "learning_rate": 4.896681341836798e-06, + "loss": 0.3149, + "step": 11842 + }, + { + "epoch": 0.68, + "grad_norm": 0.3919431187346392, + "learning_rate": 4.895081087149974e-06, + "loss": 0.2764, + "step": 11843 + }, + { + "epoch": 0.68, + "grad_norm": 0.447679970754077, + "learning_rate": 4.8934810092462705e-06, + "loss": 0.283, + "step": 11844 + }, + { + "epoch": 0.68, + "grad_norm": 0.27528776173796865, + "learning_rate": 4.891881108181101e-06, + "loss": 0.1694, + "step": 11845 + }, + { + "epoch": 0.68, + "grad_norm": 0.2515033888633789, + "learning_rate": 4.890281384009865e-06, + "loss": 0.2506, + "step": 11846 + }, + { + "epoch": 0.68, + "grad_norm": 0.4647181444523978, + "learning_rate": 4.8886818367879686e-06, + "loss": 0.2796, + "step": 11847 + }, + { + "epoch": 0.68, + "grad_norm": 0.6205120870967442, + "learning_rate": 4.8870824665708e-06, + "loss": 0.4248, + "step": 11848 + }, + { + "epoch": 0.68, + "grad_norm": 0.6572729053267969, + "learning_rate": 4.885483273413747e-06, + "loss": 0.3524, + "step": 11849 + }, + { + "epoch": 0.68, + "grad_norm": 0.26634984300357595, + "learning_rate": 4.883884257372188e-06, + "loss": 0.2635, + "step": 11850 + }, + { + "epoch": 0.68, + "grad_norm": 0.21217690977582984, + "learning_rate": 4.882285418501497e-06, + "loss": 0.1465, + "step": 11851 + }, + { + "epoch": 0.68, + "grad_norm": 0.823238938356525, + "learning_rate": 4.88068675685704e-06, + "loss": 0.5252, + "step": 11852 + }, + { + "epoch": 0.68, + "grad_norm": 0.346625383613797, + "learning_rate": 4.879088272494184e-06, + "loss": 0.2784, + "step": 11853 + }, + { + "epoch": 0.68, + "grad_norm": 0.3474056837835713, + "learning_rate": 4.877489965468274e-06, + "loss": 0.2554, + "step": 11854 + }, + { + "epoch": 0.68, + "grad_norm": 0.6079999000177458, + "learning_rate": 4.875891835834672e-06, + "loss": 0.3113, + "step": 11855 + }, + { + "epoch": 0.68, + "grad_norm": 0.37799226612572256, + "learning_rate": 4.874293883648714e-06, + "loss": 0.3038, + "step": 11856 + }, + { + "epoch": 0.68, + "grad_norm": 0.43480049777111607, + "learning_rate": 4.8726961089657385e-06, + "loss": 0.1556, + "step": 11857 + }, + { + "epoch": 0.68, + "grad_norm": 0.255060595881029, + "learning_rate": 4.871098511841073e-06, + "loss": 0.2094, + "step": 11858 + }, + { + "epoch": 0.68, + "grad_norm": 0.37794696995047256, + "learning_rate": 4.8695010923300505e-06, + "loss": 0.2933, + "step": 11859 + }, + { + "epoch": 0.68, + "grad_norm": 0.703634639952293, + "learning_rate": 4.867903850487983e-06, + "loss": 0.3383, + "step": 11860 + }, + { + "epoch": 0.68, + "grad_norm": 0.5711648018980142, + "learning_rate": 4.866306786370184e-06, + "loss": 0.3568, + "step": 11861 + }, + { + "epoch": 0.68, + "grad_norm": 0.3153893572774383, + "learning_rate": 4.864709900031961e-06, + "loss": 0.3094, + "step": 11862 + }, + { + "epoch": 0.68, + "grad_norm": 0.46738359154640297, + "learning_rate": 4.86311319152861e-06, + "loss": 0.3345, + "step": 11863 + }, + { + "epoch": 0.68, + "grad_norm": 0.174118788912665, + "learning_rate": 4.8615166609154315e-06, + "loss": 0.1225, + "step": 11864 + }, + { + "epoch": 0.68, + "grad_norm": 0.5155715979717324, + "learning_rate": 4.85992030824771e-06, + "loss": 0.3517, + "step": 11865 + }, + { + "epoch": 0.68, + "grad_norm": 0.39699707131238365, + "learning_rate": 4.858324133580727e-06, + "loss": 0.3108, + "step": 11866 + }, + { + "epoch": 0.68, + "grad_norm": 0.723771118865917, + "learning_rate": 4.856728136969755e-06, + "loss": 0.265, + "step": 11867 + }, + { + "epoch": 0.68, + "grad_norm": 0.34812703384012267, + "learning_rate": 4.85513231847007e-06, + "loss": 0.258, + "step": 11868 + }, + { + "epoch": 0.68, + "grad_norm": 0.46778392750169046, + "learning_rate": 4.853536678136932e-06, + "loss": 0.3343, + "step": 11869 + }, + { + "epoch": 0.68, + "grad_norm": 0.26524386390299387, + "learning_rate": 4.851941216025597e-06, + "loss": 0.1666, + "step": 11870 + }, + { + "epoch": 0.68, + "grad_norm": 0.3529335727394295, + "learning_rate": 4.850345932191313e-06, + "loss": 0.2873, + "step": 11871 + }, + { + "epoch": 0.68, + "grad_norm": 0.6736113996854561, + "learning_rate": 4.848750826689332e-06, + "loss": 0.373, + "step": 11872 + }, + { + "epoch": 0.68, + "grad_norm": 0.2454207027025233, + "learning_rate": 4.8471558995748865e-06, + "loss": 0.1576, + "step": 11873 + }, + { + "epoch": 0.68, + "grad_norm": 0.29709806190197335, + "learning_rate": 4.845561150903212e-06, + "loss": 0.2612, + "step": 11874 + }, + { + "epoch": 0.68, + "grad_norm": 1.1809594524008666, + "learning_rate": 4.843966580729533e-06, + "loss": 0.6127, + "step": 11875 + }, + { + "epoch": 0.68, + "grad_norm": 0.6862481533740115, + "learning_rate": 4.842372189109066e-06, + "loss": 0.4077, + "step": 11876 + }, + { + "epoch": 0.68, + "grad_norm": 0.6072253682769295, + "learning_rate": 4.840777976097032e-06, + "loss": 0.2227, + "step": 11877 + }, + { + "epoch": 0.68, + "grad_norm": 0.35301852333241995, + "learning_rate": 4.839183941748635e-06, + "loss": 0.3078, + "step": 11878 + }, + { + "epoch": 0.68, + "grad_norm": 0.2620722298074846, + "learning_rate": 4.837590086119076e-06, + "loss": 0.1697, + "step": 11879 + }, + { + "epoch": 0.68, + "grad_norm": 0.32669040181183934, + "learning_rate": 4.835996409263546e-06, + "loss": 0.2134, + "step": 11880 + }, + { + "epoch": 0.68, + "grad_norm": 1.089646351240924, + "learning_rate": 4.834402911237243e-06, + "loss": 0.4845, + "step": 11881 + }, + { + "epoch": 0.68, + "grad_norm": 0.29818673852673244, + "learning_rate": 4.832809592095344e-06, + "loss": 0.2822, + "step": 11882 + }, + { + "epoch": 0.68, + "grad_norm": 0.329051440364525, + "learning_rate": 4.831216451893027e-06, + "loss": 0.2178, + "step": 11883 + }, + { + "epoch": 0.68, + "grad_norm": 0.6968295293375228, + "learning_rate": 4.829623490685459e-06, + "loss": 0.4373, + "step": 11884 + }, + { + "epoch": 0.68, + "grad_norm": 0.23540721441096973, + "learning_rate": 4.828030708527814e-06, + "loss": 0.1897, + "step": 11885 + }, + { + "epoch": 0.68, + "grad_norm": 0.34736558596523004, + "learning_rate": 4.826438105475239e-06, + "loss": 0.2225, + "step": 11886 + }, + { + "epoch": 0.68, + "grad_norm": 1.0944632104089678, + "learning_rate": 4.824845681582892e-06, + "loss": 0.5581, + "step": 11887 + }, + { + "epoch": 0.68, + "grad_norm": 0.8324240460651077, + "learning_rate": 4.82325343690591e-06, + "loss": 0.414, + "step": 11888 + }, + { + "epoch": 0.68, + "grad_norm": 0.32586291593223726, + "learning_rate": 4.821661371499444e-06, + "loss": 0.2691, + "step": 11889 + }, + { + "epoch": 0.68, + "grad_norm": 0.33201111450208354, + "learning_rate": 4.820069485418622e-06, + "loss": 0.259, + "step": 11890 + }, + { + "epoch": 0.68, + "grad_norm": 0.2894218031621524, + "learning_rate": 4.818477778718571e-06, + "loss": 0.1965, + "step": 11891 + }, + { + "epoch": 0.68, + "grad_norm": 0.34646058218222653, + "learning_rate": 4.8168862514544075e-06, + "loss": 0.2612, + "step": 11892 + }, + { + "epoch": 0.68, + "grad_norm": 0.9011059324528964, + "learning_rate": 4.815294903681254e-06, + "loss": 0.0465, + "step": 11893 + }, + { + "epoch": 0.68, + "grad_norm": 0.38189133211873966, + "learning_rate": 4.813703735454216e-06, + "loss": 0.2818, + "step": 11894 + }, + { + "epoch": 0.68, + "grad_norm": 0.37774095648872624, + "learning_rate": 4.812112746828394e-06, + "loss": 0.2809, + "step": 11895 + }, + { + "epoch": 0.68, + "grad_norm": 0.9196950014411767, + "learning_rate": 4.810521937858881e-06, + "loss": 0.5367, + "step": 11896 + }, + { + "epoch": 0.68, + "grad_norm": 0.25821957577363874, + "learning_rate": 4.808931308600774e-06, + "loss": 0.2529, + "step": 11897 + }, + { + "epoch": 0.68, + "grad_norm": 0.33995182083802594, + "learning_rate": 4.807340859109152e-06, + "loss": 0.2719, + "step": 11898 + }, + { + "epoch": 0.68, + "grad_norm": 0.408344759125669, + "learning_rate": 4.805750589439092e-06, + "loss": 0.1797, + "step": 11899 + }, + { + "epoch": 0.68, + "grad_norm": 0.577276240726866, + "learning_rate": 4.804160499645667e-06, + "loss": 0.3866, + "step": 11900 + }, + { + "epoch": 0.68, + "grad_norm": 0.33678995159423364, + "learning_rate": 4.802570589783937e-06, + "loss": 0.2425, + "step": 11901 + }, + { + "epoch": 0.68, + "grad_norm": 0.3749231267150535, + "learning_rate": 4.800980859908967e-06, + "loss": 0.3353, + "step": 11902 + }, + { + "epoch": 0.68, + "grad_norm": 0.39713015624552944, + "learning_rate": 4.799391310075806e-06, + "loss": 0.1311, + "step": 11903 + }, + { + "epoch": 0.68, + "grad_norm": 0.2695977078762357, + "learning_rate": 4.7978019403395e-06, + "loss": 0.2272, + "step": 11904 + }, + { + "epoch": 0.68, + "grad_norm": 0.5527853027413822, + "learning_rate": 4.796212750755087e-06, + "loss": 0.3386, + "step": 11905 + }, + { + "epoch": 0.68, + "grad_norm": 0.5264787695999982, + "learning_rate": 4.794623741377605e-06, + "loss": 0.2782, + "step": 11906 + }, + { + "epoch": 0.68, + "grad_norm": 0.3201557404278092, + "learning_rate": 4.79303491226208e-06, + "loss": 0.2416, + "step": 11907 + }, + { + "epoch": 0.68, + "grad_norm": 1.1192785579323365, + "learning_rate": 4.791446263463531e-06, + "loss": 0.7994, + "step": 11908 + }, + { + "epoch": 0.68, + "grad_norm": 0.26545596616417333, + "learning_rate": 4.7898577950369704e-06, + "loss": 0.2056, + "step": 11909 + }, + { + "epoch": 0.68, + "grad_norm": 0.24110527748871047, + "learning_rate": 4.788269507037415e-06, + "loss": 0.1904, + "step": 11910 + }, + { + "epoch": 0.68, + "grad_norm": 0.8106310137327224, + "learning_rate": 4.786681399519862e-06, + "loss": 0.416, + "step": 11911 + }, + { + "epoch": 0.68, + "grad_norm": 0.7495867803826378, + "learning_rate": 4.785093472539307e-06, + "loss": 0.3306, + "step": 11912 + }, + { + "epoch": 0.68, + "grad_norm": 0.2882924469905718, + "learning_rate": 4.783505726150738e-06, + "loss": 0.2652, + "step": 11913 + }, + { + "epoch": 0.68, + "grad_norm": 0.4408666504381776, + "learning_rate": 4.781918160409145e-06, + "loss": 0.3201, + "step": 11914 + }, + { + "epoch": 0.68, + "grad_norm": 0.4572909264973247, + "learning_rate": 4.780330775369501e-06, + "loss": 0.2594, + "step": 11915 + }, + { + "epoch": 0.68, + "grad_norm": 0.2702213205694998, + "learning_rate": 4.778743571086779e-06, + "loss": 0.1873, + "step": 11916 + }, + { + "epoch": 0.68, + "grad_norm": 0.5390198271391285, + "learning_rate": 4.777156547615942e-06, + "loss": 0.2586, + "step": 11917 + }, + { + "epoch": 0.68, + "grad_norm": 0.5337637998050275, + "learning_rate": 4.775569705011945e-06, + "loss": 0.361, + "step": 11918 + }, + { + "epoch": 0.68, + "grad_norm": 0.30304924911307957, + "learning_rate": 4.773983043329753e-06, + "loss": 0.1909, + "step": 11919 + }, + { + "epoch": 0.68, + "grad_norm": 1.1874428884612134, + "learning_rate": 4.7723965626243e-06, + "loss": 0.7611, + "step": 11920 + }, + { + "epoch": 0.68, + "grad_norm": 0.33172829141311116, + "learning_rate": 4.770810262950524e-06, + "loss": 0.2931, + "step": 11921 + }, + { + "epoch": 0.68, + "grad_norm": 0.31447195769036773, + "learning_rate": 4.769224144363368e-06, + "loss": 0.2002, + "step": 11922 + }, + { + "epoch": 0.69, + "grad_norm": 0.49881291519374765, + "learning_rate": 4.767638206917755e-06, + "loss": 0.2958, + "step": 11923 + }, + { + "epoch": 0.69, + "grad_norm": 0.3918596183860913, + "learning_rate": 4.766052450668606e-06, + "loss": 0.2039, + "step": 11924 + }, + { + "epoch": 0.69, + "grad_norm": 0.27808781938656596, + "learning_rate": 4.764466875670836e-06, + "loss": 0.2249, + "step": 11925 + }, + { + "epoch": 0.69, + "grad_norm": 0.4956137686581436, + "learning_rate": 4.762881481979349e-06, + "loss": 0.3887, + "step": 11926 + }, + { + "epoch": 0.69, + "grad_norm": 1.111755849842336, + "learning_rate": 4.761296269649054e-06, + "loss": 0.4972, + "step": 11927 + }, + { + "epoch": 0.69, + "grad_norm": 0.30684184917618834, + "learning_rate": 4.759711238734844e-06, + "loss": 0.2521, + "step": 11928 + }, + { + "epoch": 0.69, + "grad_norm": 0.4735692273389909, + "learning_rate": 4.75812638929161e-06, + "loss": 0.2749, + "step": 11929 + }, + { + "epoch": 0.69, + "grad_norm": 0.2613868405056514, + "learning_rate": 4.756541721374228e-06, + "loss": 0.2068, + "step": 11930 + }, + { + "epoch": 0.69, + "grad_norm": 0.33203601499476443, + "learning_rate": 4.7549572350375864e-06, + "loss": 0.248, + "step": 11931 + }, + { + "epoch": 0.69, + "grad_norm": 0.9132904905924238, + "learning_rate": 4.753372930336548e-06, + "loss": 0.4699, + "step": 11932 + }, + { + "epoch": 0.69, + "grad_norm": 0.32872137130223583, + "learning_rate": 4.751788807325981e-06, + "loss": 0.2959, + "step": 11933 + }, + { + "epoch": 0.69, + "grad_norm": 0.396983128779614, + "learning_rate": 4.750204866060738e-06, + "loss": 0.2726, + "step": 11934 + }, + { + "epoch": 0.69, + "grad_norm": 0.5185433454178232, + "learning_rate": 4.748621106595679e-06, + "loss": 0.2699, + "step": 11935 + }, + { + "epoch": 0.69, + "grad_norm": 0.2455213486640146, + "learning_rate": 4.747037528985644e-06, + "loss": 0.1981, + "step": 11936 + }, + { + "epoch": 0.69, + "grad_norm": 0.4123647936906814, + "learning_rate": 4.745454133285474e-06, + "loss": 0.2777, + "step": 11937 + }, + { + "epoch": 0.69, + "grad_norm": 0.3633484612973121, + "learning_rate": 4.743870919549998e-06, + "loss": 0.3307, + "step": 11938 + }, + { + "epoch": 0.69, + "grad_norm": 0.6143787094869488, + "learning_rate": 4.74228788783405e-06, + "loss": 0.3097, + "step": 11939 + }, + { + "epoch": 0.69, + "grad_norm": 0.3848915895137807, + "learning_rate": 4.740705038192444e-06, + "loss": 0.2994, + "step": 11940 + }, + { + "epoch": 0.69, + "grad_norm": 0.303569188063324, + "learning_rate": 4.7391223706799994e-06, + "loss": 0.2996, + "step": 11941 + }, + { + "epoch": 0.69, + "grad_norm": 0.22025983968170426, + "learning_rate": 4.73753988535152e-06, + "loss": 0.0816, + "step": 11942 + }, + { + "epoch": 0.69, + "grad_norm": 0.3947879607288585, + "learning_rate": 4.735957582261803e-06, + "loss": 0.2905, + "step": 11943 + }, + { + "epoch": 0.69, + "grad_norm": 0.5295208092043296, + "learning_rate": 4.7343754614656536e-06, + "loss": 0.3748, + "step": 11944 + }, + { + "epoch": 0.69, + "grad_norm": 0.3434557916596737, + "learning_rate": 4.732793523017856e-06, + "loss": 0.2484, + "step": 11945 + }, + { + "epoch": 0.69, + "grad_norm": 0.4718786351336028, + "learning_rate": 4.73121176697319e-06, + "loss": 0.3152, + "step": 11946 + }, + { + "epoch": 0.69, + "grad_norm": 0.5218283957356089, + "learning_rate": 4.729630193386433e-06, + "loss": 0.3238, + "step": 11947 + }, + { + "epoch": 0.69, + "grad_norm": 0.158404783519397, + "learning_rate": 4.728048802312358e-06, + "loss": 0.0912, + "step": 11948 + }, + { + "epoch": 0.69, + "grad_norm": 0.27999339984469956, + "learning_rate": 4.726467593805726e-06, + "loss": 0.2589, + "step": 11949 + }, + { + "epoch": 0.69, + "grad_norm": 0.5390317862693036, + "learning_rate": 4.724886567921295e-06, + "loss": 0.4137, + "step": 11950 + }, + { + "epoch": 0.69, + "grad_norm": 0.5653285205882527, + "learning_rate": 4.723305724713812e-06, + "loss": 0.3685, + "step": 11951 + }, + { + "epoch": 0.69, + "grad_norm": 0.4366394060085917, + "learning_rate": 4.721725064238028e-06, + "loss": 0.1037, + "step": 11952 + }, + { + "epoch": 0.69, + "grad_norm": 0.28421369940574165, + "learning_rate": 4.720144586548681e-06, + "loss": 0.2764, + "step": 11953 + }, + { + "epoch": 0.69, + "grad_norm": 0.3944676448912269, + "learning_rate": 4.718564291700497e-06, + "loss": 0.2295, + "step": 11954 + }, + { + "epoch": 0.69, + "grad_norm": 0.34010771117451527, + "learning_rate": 4.7169841797482005e-06, + "loss": 0.1541, + "step": 11955 + }, + { + "epoch": 0.69, + "grad_norm": 0.3757324807553204, + "learning_rate": 4.7154042507465195e-06, + "loss": 0.2925, + "step": 11956 + }, + { + "epoch": 0.69, + "grad_norm": 0.2737937072179737, + "learning_rate": 4.713824504750161e-06, + "loss": 0.2567, + "step": 11957 + }, + { + "epoch": 0.69, + "grad_norm": 0.33085688299480903, + "learning_rate": 4.7122449418138325e-06, + "loss": 0.1568, + "step": 11958 + }, + { + "epoch": 0.69, + "grad_norm": 0.5549992341677822, + "learning_rate": 4.710665561992232e-06, + "loss": 0.3311, + "step": 11959 + }, + { + "epoch": 0.69, + "grad_norm": 0.43458610126481007, + "learning_rate": 4.709086365340057e-06, + "loss": 0.2455, + "step": 11960 + }, + { + "epoch": 0.69, + "grad_norm": 0.24458519823897504, + "learning_rate": 4.707507351911995e-06, + "loss": 0.2202, + "step": 11961 + }, + { + "epoch": 0.69, + "grad_norm": 0.4809860354275576, + "learning_rate": 4.705928521762726e-06, + "loss": 0.3509, + "step": 11962 + }, + { + "epoch": 0.69, + "grad_norm": 0.6523562611627876, + "learning_rate": 4.7043498749469204e-06, + "loss": 0.2662, + "step": 11963 + }, + { + "epoch": 0.69, + "grad_norm": 0.6412526764903204, + "learning_rate": 4.702771411519256e-06, + "loss": 0.3287, + "step": 11964 + }, + { + "epoch": 0.69, + "grad_norm": 0.2694814585384461, + "learning_rate": 4.701193131534389e-06, + "loss": 0.2403, + "step": 11965 + }, + { + "epoch": 0.69, + "grad_norm": 0.9540802506457705, + "learning_rate": 4.699615035046975e-06, + "loss": 0.5154, + "step": 11966 + }, + { + "epoch": 0.69, + "grad_norm": 0.3832799734593423, + "learning_rate": 4.698037122111665e-06, + "loss": 0.3095, + "step": 11967 + }, + { + "epoch": 0.69, + "grad_norm": 0.5151002559842178, + "learning_rate": 4.696459392783098e-06, + "loss": 0.2608, + "step": 11968 + }, + { + "epoch": 0.69, + "grad_norm": 0.2970906609548814, + "learning_rate": 4.694881847115918e-06, + "loss": 0.2769, + "step": 11969 + }, + { + "epoch": 0.69, + "grad_norm": 0.3673952699013813, + "learning_rate": 4.69330448516475e-06, + "loss": 0.2208, + "step": 11970 + }, + { + "epoch": 0.69, + "grad_norm": 0.3823198217572113, + "learning_rate": 4.691727306984222e-06, + "loss": 0.2333, + "step": 11971 + }, + { + "epoch": 0.69, + "grad_norm": 0.4797977748585798, + "learning_rate": 4.690150312628944e-06, + "loss": 0.3882, + "step": 11972 + }, + { + "epoch": 0.69, + "grad_norm": 0.42047099420085404, + "learning_rate": 4.688573502153536e-06, + "loss": 0.3205, + "step": 11973 + }, + { + "epoch": 0.69, + "grad_norm": 0.35808423918623256, + "learning_rate": 4.6869968756126e-06, + "loss": 0.2417, + "step": 11974 + }, + { + "epoch": 0.69, + "grad_norm": 0.2642813721183281, + "learning_rate": 4.685420433060732e-06, + "loss": 0.1715, + "step": 11975 + }, + { + "epoch": 0.69, + "grad_norm": 0.42438116021427397, + "learning_rate": 4.683844174552523e-06, + "loss": 0.2682, + "step": 11976 + }, + { + "epoch": 0.69, + "grad_norm": 0.2952126454257907, + "learning_rate": 4.682268100142567e-06, + "loss": 0.2697, + "step": 11977 + }, + { + "epoch": 0.69, + "grad_norm": 1.3042987725864317, + "learning_rate": 4.680692209885436e-06, + "loss": 0.3308, + "step": 11978 + }, + { + "epoch": 0.69, + "grad_norm": 0.50954991915773, + "learning_rate": 4.679116503835706e-06, + "loss": 0.3273, + "step": 11979 + }, + { + "epoch": 0.69, + "grad_norm": 0.35979928673775013, + "learning_rate": 4.6775409820479415e-06, + "loss": 0.311, + "step": 11980 + }, + { + "epoch": 0.69, + "grad_norm": 0.34865340009068274, + "learning_rate": 4.675965644576701e-06, + "loss": 0.2457, + "step": 11981 + }, + { + "epoch": 0.69, + "grad_norm": 0.2343614065070297, + "learning_rate": 4.674390491476545e-06, + "loss": 0.161, + "step": 11982 + }, + { + "epoch": 0.69, + "grad_norm": 0.39514869929001734, + "learning_rate": 4.672815522802018e-06, + "loss": 0.3022, + "step": 11983 + }, + { + "epoch": 0.69, + "grad_norm": 0.42193017601516436, + "learning_rate": 4.671240738607659e-06, + "loss": 0.2649, + "step": 11984 + }, + { + "epoch": 0.69, + "grad_norm": 0.3575772895676162, + "learning_rate": 4.669666138948001e-06, + "loss": 0.3064, + "step": 11985 + }, + { + "epoch": 0.69, + "grad_norm": 0.37733439670751967, + "learning_rate": 4.668091723877584e-06, + "loss": 0.2944, + "step": 11986 + }, + { + "epoch": 0.69, + "grad_norm": 0.363482865984895, + "learning_rate": 4.666517493450916e-06, + "loss": 0.1889, + "step": 11987 + }, + { + "epoch": 0.69, + "grad_norm": 0.2920184473283872, + "learning_rate": 4.664943447722514e-06, + "loss": 0.1943, + "step": 11988 + }, + { + "epoch": 0.69, + "grad_norm": 0.268324034496146, + "learning_rate": 4.6633695867468955e-06, + "loss": 0.2651, + "step": 11989 + }, + { + "epoch": 0.69, + "grad_norm": 0.6601723005250368, + "learning_rate": 4.661795910578558e-06, + "loss": 0.382, + "step": 11990 + }, + { + "epoch": 0.69, + "grad_norm": 0.6064327864030062, + "learning_rate": 4.660222419271999e-06, + "loss": 0.2746, + "step": 11991 + }, + { + "epoch": 0.69, + "grad_norm": 0.3675959255251272, + "learning_rate": 4.658649112881709e-06, + "loss": 0.3073, + "step": 11992 + }, + { + "epoch": 0.69, + "grad_norm": 0.36307823733203226, + "learning_rate": 4.657075991462165e-06, + "loss": 0.3006, + "step": 11993 + }, + { + "epoch": 0.69, + "grad_norm": 0.2183623643397165, + "learning_rate": 4.6555030550678544e-06, + "loss": 0.0842, + "step": 11994 + }, + { + "epoch": 0.69, + "grad_norm": 0.35620400135270036, + "learning_rate": 4.6539303037532435e-06, + "loss": 0.2911, + "step": 11995 + }, + { + "epoch": 0.69, + "grad_norm": 0.9075387144091024, + "learning_rate": 4.652357737572796e-06, + "loss": 0.392, + "step": 11996 + }, + { + "epoch": 0.69, + "grad_norm": 0.30833975615137077, + "learning_rate": 4.650785356580967e-06, + "loss": 0.2674, + "step": 11997 + }, + { + "epoch": 0.69, + "grad_norm": 0.36252082852686, + "learning_rate": 4.649213160832213e-06, + "loss": 0.3089, + "step": 11998 + }, + { + "epoch": 0.69, + "grad_norm": 1.0088006244221261, + "learning_rate": 4.647641150380978e-06, + "loss": 0.562, + "step": 11999 + }, + { + "epoch": 0.69, + "grad_norm": 0.17551431395096048, + "learning_rate": 4.6460693252817e-06, + "loss": 0.1311, + "step": 12000 + }, + { + "epoch": 0.69, + "grad_norm": 0.32796221667677156, + "learning_rate": 4.644497685588808e-06, + "loss": 0.2919, + "step": 12001 + }, + { + "epoch": 0.69, + "grad_norm": 1.211749796355846, + "learning_rate": 4.642926231356734e-06, + "loss": 0.3921, + "step": 12002 + }, + { + "epoch": 0.69, + "grad_norm": 0.6840428587949849, + "learning_rate": 4.641354962639894e-06, + "loss": 0.4082, + "step": 12003 + }, + { + "epoch": 0.69, + "grad_norm": 0.3276160490754549, + "learning_rate": 4.639783879492701e-06, + "loss": 0.2112, + "step": 12004 + }, + { + "epoch": 0.69, + "grad_norm": 0.3450703208904191, + "learning_rate": 4.638212981969562e-06, + "loss": 0.2976, + "step": 12005 + }, + { + "epoch": 0.69, + "grad_norm": 0.22299481634481944, + "learning_rate": 4.636642270124874e-06, + "loss": 0.1401, + "step": 12006 + }, + { + "epoch": 0.69, + "grad_norm": 0.4736605076802758, + "learning_rate": 4.6350717440130366e-06, + "loss": 0.2279, + "step": 12007 + }, + { + "epoch": 0.69, + "grad_norm": 0.6337811756742268, + "learning_rate": 4.633501403688434e-06, + "loss": 0.3114, + "step": 12008 + }, + { + "epoch": 0.69, + "grad_norm": 1.0475717388652215, + "learning_rate": 4.631931249205447e-06, + "loss": 0.5844, + "step": 12009 + }, + { + "epoch": 0.69, + "grad_norm": 0.29755432704987433, + "learning_rate": 4.630361280618446e-06, + "loss": 0.2081, + "step": 12010 + }, + { + "epoch": 0.69, + "grad_norm": 1.1889278166919943, + "learning_rate": 4.628791497981807e-06, + "loss": 0.6442, + "step": 12011 + }, + { + "epoch": 0.69, + "grad_norm": 0.2764738843094473, + "learning_rate": 4.627221901349887e-06, + "loss": 0.2578, + "step": 12012 + }, + { + "epoch": 0.69, + "grad_norm": 0.24776928880783564, + "learning_rate": 4.625652490777042e-06, + "loss": 0.1999, + "step": 12013 + }, + { + "epoch": 0.69, + "grad_norm": 0.7327164556440673, + "learning_rate": 4.624083266317616e-06, + "loss": 0.3085, + "step": 12014 + }, + { + "epoch": 0.69, + "grad_norm": 1.1927932070668756, + "learning_rate": 4.62251422802596e-06, + "loss": 0.7523, + "step": 12015 + }, + { + "epoch": 0.69, + "grad_norm": 0.3228198567523802, + "learning_rate": 4.620945375956404e-06, + "loss": 0.2437, + "step": 12016 + }, + { + "epoch": 0.69, + "grad_norm": 0.5191991671756184, + "learning_rate": 4.619376710163279e-06, + "loss": 0.2589, + "step": 12017 + }, + { + "epoch": 0.69, + "grad_norm": 0.7583758410617182, + "learning_rate": 4.617808230700907e-06, + "loss": 0.42, + "step": 12018 + }, + { + "epoch": 0.69, + "grad_norm": 0.33941374151587894, + "learning_rate": 4.6162399376236e-06, + "loss": 0.2747, + "step": 12019 + }, + { + "epoch": 0.69, + "grad_norm": 0.2758703659931883, + "learning_rate": 4.614671830985681e-06, + "loss": 0.2094, + "step": 12020 + }, + { + "epoch": 0.69, + "grad_norm": 0.354847088036182, + "learning_rate": 4.613103910841441e-06, + "loss": 0.21, + "step": 12021 + }, + { + "epoch": 0.69, + "grad_norm": 0.3544508126717817, + "learning_rate": 4.611536177245176e-06, + "loss": 0.2592, + "step": 12022 + }, + { + "epoch": 0.69, + "grad_norm": 1.0195307111162017, + "learning_rate": 4.609968630251187e-06, + "loss": 0.3661, + "step": 12023 + }, + { + "epoch": 0.69, + "grad_norm": 0.3860554430872647, + "learning_rate": 4.608401269913751e-06, + "loss": 0.2795, + "step": 12024 + }, + { + "epoch": 0.69, + "grad_norm": 0.39062012651266825, + "learning_rate": 4.606834096287148e-06, + "loss": 0.2738, + "step": 12025 + }, + { + "epoch": 0.69, + "grad_norm": 0.3576834430909374, + "learning_rate": 4.605267109425645e-06, + "loss": 0.2074, + "step": 12026 + }, + { + "epoch": 0.69, + "grad_norm": 1.050741505415178, + "learning_rate": 4.6037003093835135e-06, + "loss": 0.4651, + "step": 12027 + }, + { + "epoch": 0.69, + "grad_norm": 0.3340140311716859, + "learning_rate": 4.602133696215007e-06, + "loss": 0.2685, + "step": 12028 + }, + { + "epoch": 0.69, + "grad_norm": 0.3868998595979161, + "learning_rate": 4.6005672699743795e-06, + "loss": 0.3129, + "step": 12029 + }, + { + "epoch": 0.69, + "grad_norm": 0.6958039333913612, + "learning_rate": 4.599001030715876e-06, + "loss": 0.2917, + "step": 12030 + }, + { + "epoch": 0.69, + "grad_norm": 0.3372161814167908, + "learning_rate": 4.59743497849373e-06, + "loss": 0.274, + "step": 12031 + }, + { + "epoch": 0.69, + "grad_norm": 0.3879938736639436, + "learning_rate": 4.5958691133621815e-06, + "loss": 0.3127, + "step": 12032 + }, + { + "epoch": 0.69, + "grad_norm": 0.14895293830728926, + "learning_rate": 4.594303435375454e-06, + "loss": 0.1142, + "step": 12033 + }, + { + "epoch": 0.69, + "grad_norm": 0.322727915954783, + "learning_rate": 4.592737944587766e-06, + "loss": 0.2661, + "step": 12034 + }, + { + "epoch": 0.69, + "grad_norm": 1.1372554738979654, + "learning_rate": 4.591172641053326e-06, + "loss": 0.5845, + "step": 12035 + }, + { + "epoch": 0.69, + "grad_norm": 0.42201688444677404, + "learning_rate": 4.589607524826351e-06, + "loss": 0.2639, + "step": 12036 + }, + { + "epoch": 0.69, + "grad_norm": 0.3165929336205737, + "learning_rate": 4.588042595961032e-06, + "loss": 0.2747, + "step": 12037 + }, + { + "epoch": 0.69, + "grad_norm": 0.40563172992912694, + "learning_rate": 4.586477854511566e-06, + "loss": 0.2555, + "step": 12038 + }, + { + "epoch": 0.69, + "grad_norm": 0.3947859210351605, + "learning_rate": 4.584913300532135e-06, + "loss": 0.2764, + "step": 12039 + }, + { + "epoch": 0.69, + "grad_norm": 0.28751873344914913, + "learning_rate": 4.583348934076929e-06, + "loss": 0.1917, + "step": 12040 + }, + { + "epoch": 0.69, + "grad_norm": 0.3649164575475577, + "learning_rate": 4.581784755200115e-06, + "loss": 0.2989, + "step": 12041 + }, + { + "epoch": 0.69, + "grad_norm": 0.7260664375548328, + "learning_rate": 4.580220763955863e-06, + "loss": 0.3636, + "step": 12042 + }, + { + "epoch": 0.69, + "grad_norm": 0.29090161691547395, + "learning_rate": 4.578656960398328e-06, + "loss": 0.1897, + "step": 12043 + }, + { + "epoch": 0.69, + "grad_norm": 0.3837560896918546, + "learning_rate": 4.577093344581674e-06, + "loss": 0.316, + "step": 12044 + }, + { + "epoch": 0.69, + "grad_norm": 0.3944502241833138, + "learning_rate": 4.575529916560043e-06, + "loss": 0.2643, + "step": 12045 + }, + { + "epoch": 0.69, + "grad_norm": 0.28381455429202385, + "learning_rate": 4.573966676387579e-06, + "loss": 0.2047, + "step": 12046 + }, + { + "epoch": 0.69, + "grad_norm": 0.5857048817229604, + "learning_rate": 4.5724036241184144e-06, + "loss": 0.2819, + "step": 12047 + }, + { + "epoch": 0.69, + "grad_norm": 0.3373748374080497, + "learning_rate": 4.5708407598066766e-06, + "loss": 0.2939, + "step": 12048 + }, + { + "epoch": 0.69, + "grad_norm": 0.285983173917611, + "learning_rate": 4.569278083506492e-06, + "loss": 0.1824, + "step": 12049 + }, + { + "epoch": 0.69, + "grad_norm": 1.1284028289918933, + "learning_rate": 4.567715595271976e-06, + "loss": 0.5352, + "step": 12050 + }, + { + "epoch": 0.69, + "grad_norm": 1.1099867271284871, + "learning_rate": 4.566153295157233e-06, + "loss": 0.7988, + "step": 12051 + }, + { + "epoch": 0.69, + "grad_norm": 0.2915399836846607, + "learning_rate": 4.5645911832163654e-06, + "loss": 0.2437, + "step": 12052 + }, + { + "epoch": 0.69, + "grad_norm": 0.35039432313772084, + "learning_rate": 4.563029259503474e-06, + "loss": 0.2095, + "step": 12053 + }, + { + "epoch": 0.69, + "grad_norm": 0.41625909045888376, + "learning_rate": 4.561467524072651e-06, + "loss": 0.2736, + "step": 12054 + }, + { + "epoch": 0.69, + "grad_norm": 0.308228128875875, + "learning_rate": 4.5599059769779654e-06, + "loss": 0.2516, + "step": 12055 + }, + { + "epoch": 0.69, + "grad_norm": 0.34633389816855337, + "learning_rate": 4.558344618273506e-06, + "loss": 0.2532, + "step": 12056 + }, + { + "epoch": 0.69, + "grad_norm": 1.1119503132303514, + "learning_rate": 4.556783448013338e-06, + "loss": 0.7745, + "step": 12057 + }, + { + "epoch": 0.69, + "grad_norm": 0.505912833656344, + "learning_rate": 4.555222466251525e-06, + "loss": 0.3091, + "step": 12058 + }, + { + "epoch": 0.69, + "grad_norm": 0.30427640882311047, + "learning_rate": 4.553661673042123e-06, + "loss": 0.2245, + "step": 12059 + }, + { + "epoch": 0.69, + "grad_norm": 0.2822146065378598, + "learning_rate": 4.552101068439181e-06, + "loss": 0.2482, + "step": 12060 + }, + { + "epoch": 0.69, + "grad_norm": 0.3780466110606745, + "learning_rate": 4.550540652496748e-06, + "loss": 0.2474, + "step": 12061 + }, + { + "epoch": 0.69, + "grad_norm": 0.5094716111081256, + "learning_rate": 4.548980425268857e-06, + "loss": 0.2451, + "step": 12062 + }, + { + "epoch": 0.69, + "grad_norm": 1.2888871968821543, + "learning_rate": 4.5474203868095415e-06, + "loss": 0.4701, + "step": 12063 + }, + { + "epoch": 0.69, + "grad_norm": 0.2587492497087311, + "learning_rate": 4.545860537172818e-06, + "loss": 0.2568, + "step": 12064 + }, + { + "epoch": 0.69, + "grad_norm": 0.46174224389574064, + "learning_rate": 4.5443008764127135e-06, + "loss": 0.3415, + "step": 12065 + }, + { + "epoch": 0.69, + "grad_norm": 0.24079285479608747, + "learning_rate": 4.542741404583235e-06, + "loss": 0.1051, + "step": 12066 + }, + { + "epoch": 0.69, + "grad_norm": 0.42779305694074776, + "learning_rate": 4.541182121738388e-06, + "loss": 0.2387, + "step": 12067 + }, + { + "epoch": 0.69, + "grad_norm": 0.28243846220049185, + "learning_rate": 4.539623027932165e-06, + "loss": 0.2727, + "step": 12068 + }, + { + "epoch": 0.69, + "grad_norm": 0.7645806896247914, + "learning_rate": 4.538064123218565e-06, + "loss": 0.34, + "step": 12069 + }, + { + "epoch": 0.69, + "grad_norm": 0.34980810066617957, + "learning_rate": 4.53650540765157e-06, + "loss": 0.2661, + "step": 12070 + }, + { + "epoch": 0.69, + "grad_norm": 0.5862188882028873, + "learning_rate": 4.534946881285158e-06, + "loss": 0.3972, + "step": 12071 + }, + { + "epoch": 0.69, + "grad_norm": 0.21256587958188933, + "learning_rate": 4.533388544173301e-06, + "loss": 0.169, + "step": 12072 + }, + { + "epoch": 0.69, + "grad_norm": 0.3133687014658715, + "learning_rate": 4.531830396369959e-06, + "loss": 0.2401, + "step": 12073 + }, + { + "epoch": 0.69, + "grad_norm": 1.1903642275444457, + "learning_rate": 4.530272437929099e-06, + "loss": 0.6457, + "step": 12074 + }, + { + "epoch": 0.69, + "grad_norm": 0.3660557403373204, + "learning_rate": 4.528714668904669e-06, + "loss": 0.2687, + "step": 12075 + }, + { + "epoch": 0.69, + "grad_norm": 0.5739283505931541, + "learning_rate": 4.527157089350616e-06, + "loss": 0.3292, + "step": 12076 + }, + { + "epoch": 0.69, + "grad_norm": 0.4039211294031608, + "learning_rate": 4.525599699320873e-06, + "loss": 0.3117, + "step": 12077 + }, + { + "epoch": 0.69, + "grad_norm": 0.30685207485541593, + "learning_rate": 4.52404249886938e-06, + "loss": 0.1924, + "step": 12078 + }, + { + "epoch": 0.69, + "grad_norm": 0.287329467106963, + "learning_rate": 4.5224854880500615e-06, + "loss": 0.0706, + "step": 12079 + }, + { + "epoch": 0.69, + "grad_norm": 0.27929117817412435, + "learning_rate": 4.520928666916834e-06, + "loss": 0.2665, + "step": 12080 + }, + { + "epoch": 0.69, + "grad_norm": 0.7506304080374441, + "learning_rate": 4.519372035523607e-06, + "loss": 0.3991, + "step": 12081 + }, + { + "epoch": 0.69, + "grad_norm": 0.4259104980903786, + "learning_rate": 4.517815593924295e-06, + "loss": 0.2043, + "step": 12082 + }, + { + "epoch": 0.69, + "grad_norm": 0.3686775706552464, + "learning_rate": 4.5162593421727926e-06, + "loss": 0.3217, + "step": 12083 + }, + { + "epoch": 0.69, + "grad_norm": 0.4937075304767455, + "learning_rate": 4.514703280322995e-06, + "loss": 0.3287, + "step": 12084 + }, + { + "epoch": 0.69, + "grad_norm": 0.2030857822006487, + "learning_rate": 4.513147408428786e-06, + "loss": 0.1249, + "step": 12085 + }, + { + "epoch": 0.69, + "grad_norm": 0.5124655651149372, + "learning_rate": 4.5115917265440425e-06, + "loss": 0.3465, + "step": 12086 + }, + { + "epoch": 0.69, + "grad_norm": 0.4821076032344685, + "learning_rate": 4.510036234722645e-06, + "loss": 0.3284, + "step": 12087 + }, + { + "epoch": 0.69, + "grad_norm": 0.2927910612808501, + "learning_rate": 4.5084809330184605e-06, + "loss": 0.2206, + "step": 12088 + }, + { + "epoch": 0.69, + "grad_norm": 0.5136650655164572, + "learning_rate": 4.506925821485338e-06, + "loss": 0.3624, + "step": 12089 + }, + { + "epoch": 0.69, + "grad_norm": 0.2936563962334066, + "learning_rate": 4.505370900177142e-06, + "loss": 0.1928, + "step": 12090 + }, + { + "epoch": 0.69, + "grad_norm": 0.3934100319552002, + "learning_rate": 4.503816169147715e-06, + "loss": 0.2727, + "step": 12091 + }, + { + "epoch": 0.69, + "grad_norm": 0.28160661063648673, + "learning_rate": 4.502261628450898e-06, + "loss": 0.2303, + "step": 12092 + }, + { + "epoch": 0.69, + "grad_norm": 0.5788519867248023, + "learning_rate": 4.5007072781405205e-06, + "loss": 0.3512, + "step": 12093 + }, + { + "epoch": 0.69, + "grad_norm": 0.6096906100436412, + "learning_rate": 4.4991531182704166e-06, + "loss": 0.3835, + "step": 12094 + }, + { + "epoch": 0.69, + "grad_norm": 0.28201262052309894, + "learning_rate": 4.497599148894404e-06, + "loss": 0.2467, + "step": 12095 + }, + { + "epoch": 0.69, + "grad_norm": 0.33592745821692854, + "learning_rate": 4.496045370066296e-06, + "loss": 0.2624, + "step": 12096 + }, + { + "epoch": 0.7, + "grad_norm": 0.3753798828666515, + "learning_rate": 4.494491781839901e-06, + "loss": 0.2396, + "step": 12097 + }, + { + "epoch": 0.7, + "grad_norm": 0.3387213970720124, + "learning_rate": 4.492938384269015e-06, + "loss": 0.249, + "step": 12098 + }, + { + "epoch": 0.7, + "grad_norm": 0.33277551615459394, + "learning_rate": 4.491385177407439e-06, + "loss": 0.293, + "step": 12099 + }, + { + "epoch": 0.7, + "grad_norm": 0.7391401050418194, + "learning_rate": 4.489832161308958e-06, + "loss": 0.4181, + "step": 12100 + }, + { + "epoch": 0.7, + "grad_norm": 0.3603244564137188, + "learning_rate": 4.488279336027353e-06, + "loss": 0.3054, + "step": 12101 + }, + { + "epoch": 0.7, + "grad_norm": 0.3353997447238762, + "learning_rate": 4.486726701616393e-06, + "loss": 0.0981, + "step": 12102 + }, + { + "epoch": 0.7, + "grad_norm": 0.2523472651585536, + "learning_rate": 4.485174258129854e-06, + "loss": 0.2128, + "step": 12103 + }, + { + "epoch": 0.7, + "grad_norm": 0.29579376641255173, + "learning_rate": 4.483622005621493e-06, + "loss": 0.2761, + "step": 12104 + }, + { + "epoch": 0.7, + "grad_norm": 0.67653447878386, + "learning_rate": 4.4820699441450655e-06, + "loss": 0.3091, + "step": 12105 + }, + { + "epoch": 0.7, + "grad_norm": 0.6401876821418024, + "learning_rate": 4.4805180737543145e-06, + "loss": 0.4376, + "step": 12106 + }, + { + "epoch": 0.7, + "grad_norm": 0.3921815615134285, + "learning_rate": 4.478966394502988e-06, + "loss": 0.3295, + "step": 12107 + }, + { + "epoch": 0.7, + "grad_norm": 0.3475348263674097, + "learning_rate": 4.4774149064448195e-06, + "loss": 0.2545, + "step": 12108 + }, + { + "epoch": 0.7, + "grad_norm": 0.36534583192677034, + "learning_rate": 4.475863609633534e-06, + "loss": 0.2564, + "step": 12109 + }, + { + "epoch": 0.7, + "grad_norm": 0.3668623445667178, + "learning_rate": 4.474312504122854e-06, + "loss": 0.2889, + "step": 12110 + }, + { + "epoch": 0.7, + "grad_norm": 0.25610872600300666, + "learning_rate": 4.472761589966493e-06, + "loss": 0.2137, + "step": 12111 + }, + { + "epoch": 0.7, + "grad_norm": 0.561497219037444, + "learning_rate": 4.471210867218161e-06, + "loss": 0.3434, + "step": 12112 + }, + { + "epoch": 0.7, + "grad_norm": 0.37026942810244867, + "learning_rate": 4.4696603359315604e-06, + "loss": 0.3116, + "step": 12113 + }, + { + "epoch": 0.7, + "grad_norm": 0.7497924302268585, + "learning_rate": 4.468109996160385e-06, + "loss": 0.4706, + "step": 12114 + }, + { + "epoch": 0.7, + "grad_norm": 0.3076550596172886, + "learning_rate": 4.466559847958318e-06, + "loss": 0.2392, + "step": 12115 + }, + { + "epoch": 0.7, + "grad_norm": 0.3388503851948695, + "learning_rate": 4.46500989137905e-06, + "loss": 0.2912, + "step": 12116 + }, + { + "epoch": 0.7, + "grad_norm": 0.3243942487719419, + "learning_rate": 4.463460126476251e-06, + "loss": 0.2058, + "step": 12117 + }, + { + "epoch": 0.7, + "grad_norm": 0.46445889232908727, + "learning_rate": 4.46191055330359e-06, + "loss": 0.2573, + "step": 12118 + }, + { + "epoch": 0.7, + "grad_norm": 0.26405584070347937, + "learning_rate": 4.460361171914724e-06, + "loss": 0.2552, + "step": 12119 + }, + { + "epoch": 0.7, + "grad_norm": 0.48086867532454386, + "learning_rate": 4.458811982363317e-06, + "loss": 0.2995, + "step": 12120 + }, + { + "epoch": 0.7, + "grad_norm": 0.6352621011962262, + "learning_rate": 4.457262984703015e-06, + "loss": 0.2686, + "step": 12121 + }, + { + "epoch": 0.7, + "grad_norm": 0.2669809765909583, + "learning_rate": 4.455714178987456e-06, + "loss": 0.2236, + "step": 12122 + }, + { + "epoch": 0.7, + "grad_norm": 0.26373342863482124, + "learning_rate": 4.454165565270272e-06, + "loss": 0.2267, + "step": 12123 + }, + { + "epoch": 0.7, + "grad_norm": 0.47415110258209775, + "learning_rate": 4.452617143605099e-06, + "loss": 0.2042, + "step": 12124 + }, + { + "epoch": 0.7, + "grad_norm": 0.37038910052018265, + "learning_rate": 4.451068914045556e-06, + "loss": 0.3001, + "step": 12125 + }, + { + "epoch": 0.7, + "grad_norm": 0.7943156231991123, + "learning_rate": 4.449520876645258e-06, + "loss": 0.372, + "step": 12126 + }, + { + "epoch": 0.7, + "grad_norm": 0.30751059252070245, + "learning_rate": 4.447973031457809e-06, + "loss": 0.2835, + "step": 12127 + }, + { + "epoch": 0.7, + "grad_norm": 0.31206151442044566, + "learning_rate": 4.4464253785368205e-06, + "loss": 0.1877, + "step": 12128 + }, + { + "epoch": 0.7, + "grad_norm": 0.2591005552248067, + "learning_rate": 4.4448779179358815e-06, + "loss": 0.1728, + "step": 12129 + }, + { + "epoch": 0.7, + "grad_norm": 1.248204418833787, + "learning_rate": 4.443330649708581e-06, + "loss": 0.7986, + "step": 12130 + }, + { + "epoch": 0.7, + "grad_norm": 0.2736973391515441, + "learning_rate": 4.441783573908498e-06, + "loss": 0.2117, + "step": 12131 + }, + { + "epoch": 0.7, + "grad_norm": 0.5426384659039937, + "learning_rate": 4.440236690589215e-06, + "loss": 0.3368, + "step": 12132 + }, + { + "epoch": 0.7, + "grad_norm": 0.9915066429385122, + "learning_rate": 4.438689999804295e-06, + "loss": 0.4082, + "step": 12133 + }, + { + "epoch": 0.7, + "grad_norm": 0.2815498231479696, + "learning_rate": 4.437143501607302e-06, + "loss": 0.1841, + "step": 12134 + }, + { + "epoch": 0.7, + "grad_norm": 0.21873970118674596, + "learning_rate": 4.435597196051789e-06, + "loss": 0.2132, + "step": 12135 + }, + { + "epoch": 0.7, + "grad_norm": 1.2154872539731227, + "learning_rate": 4.434051083191304e-06, + "loss": 0.742, + "step": 12136 + }, + { + "epoch": 0.7, + "grad_norm": 0.3401125230433799, + "learning_rate": 4.432505163079394e-06, + "loss": 0.1953, + "step": 12137 + }, + { + "epoch": 0.7, + "grad_norm": 1.0219315897519718, + "learning_rate": 4.4309594357695895e-06, + "loss": 0.4175, + "step": 12138 + }, + { + "epoch": 0.7, + "grad_norm": 0.37693401492937234, + "learning_rate": 4.429413901315421e-06, + "loss": 0.3112, + "step": 12139 + }, + { + "epoch": 0.7, + "grad_norm": 0.3591687661022238, + "learning_rate": 4.4278685597704065e-06, + "loss": 0.2951, + "step": 12140 + }, + { + "epoch": 0.7, + "grad_norm": 0.434384821341122, + "learning_rate": 4.426323411188067e-06, + "loss": 0.1519, + "step": 12141 + }, + { + "epoch": 0.7, + "grad_norm": 0.2933368604620994, + "learning_rate": 4.424778455621908e-06, + "loss": 0.25, + "step": 12142 + }, + { + "epoch": 0.7, + "grad_norm": 0.32729484470318493, + "learning_rate": 4.4232336931254324e-06, + "loss": 0.2727, + "step": 12143 + }, + { + "epoch": 0.7, + "grad_norm": 0.7742472208938666, + "learning_rate": 4.42168912375213e-06, + "loss": 0.2935, + "step": 12144 + }, + { + "epoch": 0.7, + "grad_norm": 0.790094629215079, + "learning_rate": 4.420144747555497e-06, + "loss": 0.3164, + "step": 12145 + }, + { + "epoch": 0.7, + "grad_norm": 0.355497378756577, + "learning_rate": 4.418600564589012e-06, + "loss": 0.273, + "step": 12146 + }, + { + "epoch": 0.7, + "grad_norm": 0.33562205802520717, + "learning_rate": 4.417056574906148e-06, + "loss": 0.2513, + "step": 12147 + }, + { + "epoch": 0.7, + "grad_norm": 0.9439148205438117, + "learning_rate": 4.415512778560376e-06, + "loss": 0.5202, + "step": 12148 + }, + { + "epoch": 0.7, + "grad_norm": 0.31573271535824526, + "learning_rate": 4.413969175605152e-06, + "loss": 0.2504, + "step": 12149 + }, + { + "epoch": 0.7, + "grad_norm": 0.30110495517556174, + "learning_rate": 4.412425766093939e-06, + "loss": 0.1944, + "step": 12150 + }, + { + "epoch": 0.7, + "grad_norm": 0.5043638552372725, + "learning_rate": 4.410882550080182e-06, + "loss": 0.3179, + "step": 12151 + }, + { + "epoch": 0.7, + "grad_norm": 0.33032605242595947, + "learning_rate": 4.409339527617321e-06, + "loss": 0.2619, + "step": 12152 + }, + { + "epoch": 0.7, + "grad_norm": 1.2303501312225467, + "learning_rate": 4.407796698758788e-06, + "loss": 0.6729, + "step": 12153 + }, + { + "epoch": 0.7, + "grad_norm": 0.46335850324152866, + "learning_rate": 4.40625406355802e-06, + "loss": 0.2766, + "step": 12154 + }, + { + "epoch": 0.7, + "grad_norm": 0.2846449360546112, + "learning_rate": 4.404711622068436e-06, + "loss": 0.2599, + "step": 12155 + }, + { + "epoch": 0.7, + "grad_norm": 0.7503421917007584, + "learning_rate": 4.40316937434344e-06, + "loss": 0.4376, + "step": 12156 + }, + { + "epoch": 0.7, + "grad_norm": 0.2820811862011241, + "learning_rate": 4.401627320436453e-06, + "loss": 0.1031, + "step": 12157 + }, + { + "epoch": 0.7, + "grad_norm": 0.33872988149067856, + "learning_rate": 4.40008546040087e-06, + "loss": 0.2569, + "step": 12158 + }, + { + "epoch": 0.7, + "grad_norm": 0.3510877902653597, + "learning_rate": 4.3985437942900865e-06, + "loss": 0.3007, + "step": 12159 + }, + { + "epoch": 0.7, + "grad_norm": 0.8283159971853021, + "learning_rate": 4.397002322157492e-06, + "loss": 0.3384, + "step": 12160 + }, + { + "epoch": 0.7, + "grad_norm": 0.3278628379074249, + "learning_rate": 4.395461044056462e-06, + "loss": 0.2431, + "step": 12161 + }, + { + "epoch": 0.7, + "grad_norm": 0.4197960471554202, + "learning_rate": 4.393919960040377e-06, + "loss": 0.2814, + "step": 12162 + }, + { + "epoch": 0.7, + "grad_norm": 0.2417208244068282, + "learning_rate": 4.392379070162604e-06, + "loss": 0.1892, + "step": 12163 + }, + { + "epoch": 0.7, + "grad_norm": 0.3290819278571764, + "learning_rate": 4.390838374476503e-06, + "loss": 0.2373, + "step": 12164 + }, + { + "epoch": 0.7, + "grad_norm": 0.9639347889069623, + "learning_rate": 4.3892978730354245e-06, + "loss": 0.5774, + "step": 12165 + }, + { + "epoch": 0.7, + "grad_norm": 0.32514156426728147, + "learning_rate": 4.387757565892722e-06, + "loss": 0.3137, + "step": 12166 + }, + { + "epoch": 0.7, + "grad_norm": 0.2997269568806797, + "learning_rate": 4.386217453101735e-06, + "loss": 0.1819, + "step": 12167 + }, + { + "epoch": 0.7, + "grad_norm": 0.44106553840301743, + "learning_rate": 4.384677534715794e-06, + "loss": 0.2737, + "step": 12168 + }, + { + "epoch": 0.7, + "grad_norm": 0.40065246533447957, + "learning_rate": 4.383137810788226e-06, + "loss": 0.2289, + "step": 12169 + }, + { + "epoch": 0.7, + "grad_norm": 0.3319847681859942, + "learning_rate": 4.381598281372358e-06, + "loss": 0.1977, + "step": 12170 + }, + { + "epoch": 0.7, + "grad_norm": 0.3532188786608427, + "learning_rate": 4.3800589465215e-06, + "loss": 0.3221, + "step": 12171 + }, + { + "epoch": 0.7, + "grad_norm": 0.8397267713928125, + "learning_rate": 4.378519806288959e-06, + "loss": 0.3961, + "step": 12172 + }, + { + "epoch": 0.7, + "grad_norm": 0.32192933397243806, + "learning_rate": 4.376980860728031e-06, + "loss": 0.2236, + "step": 12173 + }, + { + "epoch": 0.7, + "grad_norm": 1.0490275553766555, + "learning_rate": 4.375442109892019e-06, + "loss": 0.527, + "step": 12174 + }, + { + "epoch": 0.7, + "grad_norm": 0.22301783924245172, + "learning_rate": 4.373903553834203e-06, + "loss": 0.2092, + "step": 12175 + }, + { + "epoch": 0.7, + "grad_norm": 0.3305834602905949, + "learning_rate": 4.372365192607866e-06, + "loss": 0.2539, + "step": 12176 + }, + { + "epoch": 0.7, + "grad_norm": 1.0340970094704132, + "learning_rate": 4.370827026266281e-06, + "loss": 0.2975, + "step": 12177 + }, + { + "epoch": 0.7, + "grad_norm": 0.37116083450859394, + "learning_rate": 4.36928905486271e-06, + "loss": 0.299, + "step": 12178 + }, + { + "epoch": 0.7, + "grad_norm": 0.29651541573485524, + "learning_rate": 4.3677512784504195e-06, + "loss": 0.2427, + "step": 12179 + }, + { + "epoch": 0.7, + "grad_norm": 1.0371072230107874, + "learning_rate": 4.366213697082661e-06, + "loss": 0.3959, + "step": 12180 + }, + { + "epoch": 0.7, + "grad_norm": 0.3476298224017206, + "learning_rate": 4.3646763108126796e-06, + "loss": 0.1951, + "step": 12181 + }, + { + "epoch": 0.7, + "grad_norm": 0.651854837110252, + "learning_rate": 4.363139119693712e-06, + "loss": 0.3743, + "step": 12182 + }, + { + "epoch": 0.7, + "grad_norm": 0.3424674523302693, + "learning_rate": 4.361602123778998e-06, + "loss": 0.2393, + "step": 12183 + }, + { + "epoch": 0.7, + "grad_norm": 1.009983692526136, + "learning_rate": 4.360065323121759e-06, + "loss": 0.4591, + "step": 12184 + }, + { + "epoch": 0.7, + "grad_norm": 0.4010361679084553, + "learning_rate": 4.358528717775217e-06, + "loss": 0.2785, + "step": 12185 + }, + { + "epoch": 0.7, + "grad_norm": 0.35234124130496747, + "learning_rate": 4.356992307792578e-06, + "loss": 0.2476, + "step": 12186 + }, + { + "epoch": 0.7, + "grad_norm": 0.23746666293127477, + "learning_rate": 4.355456093227056e-06, + "loss": 0.1907, + "step": 12187 + }, + { + "epoch": 0.7, + "grad_norm": 0.39902282031332437, + "learning_rate": 4.353920074131848e-06, + "loss": 0.2616, + "step": 12188 + }, + { + "epoch": 0.7, + "grad_norm": 0.437743857169606, + "learning_rate": 4.352384250560147e-06, + "loss": 0.3208, + "step": 12189 + }, + { + "epoch": 0.7, + "grad_norm": 0.3195775949375206, + "learning_rate": 4.350848622565131e-06, + "loss": 0.239, + "step": 12190 + }, + { + "epoch": 0.7, + "grad_norm": 0.3694916234759852, + "learning_rate": 4.349313190199988e-06, + "loss": 0.28, + "step": 12191 + }, + { + "epoch": 0.7, + "grad_norm": 0.5313708496767161, + "learning_rate": 4.347777953517885e-06, + "loss": 0.3363, + "step": 12192 + }, + { + "epoch": 0.7, + "grad_norm": 0.19723128611524043, + "learning_rate": 4.3462429125719884e-06, + "loss": 0.0851, + "step": 12193 + }, + { + "epoch": 0.7, + "grad_norm": 0.3500548154240149, + "learning_rate": 4.344708067415454e-06, + "loss": 0.263, + "step": 12194 + }, + { + "epoch": 0.7, + "grad_norm": 0.38467142893601863, + "learning_rate": 4.34317341810144e-06, + "loss": 0.2988, + "step": 12195 + }, + { + "epoch": 0.7, + "grad_norm": 0.6932149267264256, + "learning_rate": 4.341638964683086e-06, + "loss": 0.3442, + "step": 12196 + }, + { + "epoch": 0.7, + "grad_norm": 0.3163438844619936, + "learning_rate": 4.3401047072135315e-06, + "loss": 0.2652, + "step": 12197 + }, + { + "epoch": 0.7, + "grad_norm": 1.2479930111845223, + "learning_rate": 4.338570645745904e-06, + "loss": 0.5147, + "step": 12198 + }, + { + "epoch": 0.7, + "grad_norm": 0.21571739388095768, + "learning_rate": 4.337036780333336e-06, + "loss": 0.1895, + "step": 12199 + }, + { + "epoch": 0.7, + "grad_norm": 0.5681859271955713, + "learning_rate": 4.33550311102894e-06, + "loss": 0.3091, + "step": 12200 + }, + { + "epoch": 0.7, + "grad_norm": 0.4052305180015126, + "learning_rate": 4.333969637885827e-06, + "loss": 0.3169, + "step": 12201 + }, + { + "epoch": 0.7, + "grad_norm": 0.33747729303466445, + "learning_rate": 4.332436360957104e-06, + "loss": 0.3031, + "step": 12202 + }, + { + "epoch": 0.7, + "grad_norm": 0.34873869056387957, + "learning_rate": 4.3309032802958605e-06, + "loss": 0.1527, + "step": 12203 + }, + { + "epoch": 0.7, + "grad_norm": 0.4009960812786949, + "learning_rate": 4.329370395955198e-06, + "loss": 0.301, + "step": 12204 + }, + { + "epoch": 0.7, + "grad_norm": 0.6642800794115589, + "learning_rate": 4.3278377079881935e-06, + "loss": 0.3961, + "step": 12205 + }, + { + "epoch": 0.7, + "grad_norm": 0.30282343673125794, + "learning_rate": 4.326305216447926e-06, + "loss": 0.2192, + "step": 12206 + }, + { + "epoch": 0.7, + "grad_norm": 0.23522905895128113, + "learning_rate": 4.32477292138746e-06, + "loss": 0.2117, + "step": 12207 + }, + { + "epoch": 0.7, + "grad_norm": 1.2941628507913063, + "learning_rate": 4.3232408228598685e-06, + "loss": 0.6561, + "step": 12208 + }, + { + "epoch": 0.7, + "grad_norm": 0.35384968435253333, + "learning_rate": 4.321708920918203e-06, + "loss": 0.2012, + "step": 12209 + }, + { + "epoch": 0.7, + "grad_norm": 0.3530687951144725, + "learning_rate": 4.320177215615513e-06, + "loss": 0.2766, + "step": 12210 + }, + { + "epoch": 0.7, + "grad_norm": 0.5241546712916147, + "learning_rate": 4.318645707004839e-06, + "loss": 0.3251, + "step": 12211 + }, + { + "epoch": 0.7, + "grad_norm": 0.36823411666661837, + "learning_rate": 4.317114395139222e-06, + "loss": 0.2413, + "step": 12212 + }, + { + "epoch": 0.7, + "grad_norm": 0.2259296402410964, + "learning_rate": 4.3155832800716905e-06, + "loss": 0.1746, + "step": 12213 + }, + { + "epoch": 0.7, + "grad_norm": 0.35860128935561664, + "learning_rate": 4.314052361855265e-06, + "loss": 0.3109, + "step": 12214 + }, + { + "epoch": 0.7, + "grad_norm": 0.5859399504943746, + "learning_rate": 4.312521640542961e-06, + "loss": 0.4153, + "step": 12215 + }, + { + "epoch": 0.7, + "grad_norm": 0.35181741535289196, + "learning_rate": 4.310991116187786e-06, + "loss": 0.2138, + "step": 12216 + }, + { + "epoch": 0.7, + "grad_norm": 0.5705186817412079, + "learning_rate": 4.309460788842747e-06, + "loss": 0.3709, + "step": 12217 + }, + { + "epoch": 0.7, + "grad_norm": 0.3280362062848473, + "learning_rate": 4.307930658560836e-06, + "loss": 0.2838, + "step": 12218 + }, + { + "epoch": 0.7, + "grad_norm": 0.25938678514674396, + "learning_rate": 4.306400725395041e-06, + "loss": 0.1739, + "step": 12219 + }, + { + "epoch": 0.7, + "grad_norm": 0.3705889257477343, + "learning_rate": 4.304870989398341e-06, + "loss": 0.2753, + "step": 12220 + }, + { + "epoch": 0.7, + "grad_norm": 0.7826792297259539, + "learning_rate": 4.303341450623717e-06, + "loss": 0.3297, + "step": 12221 + }, + { + "epoch": 0.7, + "grad_norm": 0.25656625473960787, + "learning_rate": 4.301812109124134e-06, + "loss": 0.2215, + "step": 12222 + }, + { + "epoch": 0.7, + "grad_norm": 0.7552748960134755, + "learning_rate": 4.300282964952553e-06, + "loss": 0.3884, + "step": 12223 + }, + { + "epoch": 0.7, + "grad_norm": 0.6690995984223479, + "learning_rate": 4.2987540181619265e-06, + "loss": 0.3853, + "step": 12224 + }, + { + "epoch": 0.7, + "grad_norm": 0.2465990758728293, + "learning_rate": 4.2972252688052055e-06, + "loss": 0.1742, + "step": 12225 + }, + { + "epoch": 0.7, + "grad_norm": 0.3394607609597814, + "learning_rate": 4.295696716935326e-06, + "loss": 0.2924, + "step": 12226 + }, + { + "epoch": 0.7, + "grad_norm": 0.3112479870105338, + "learning_rate": 4.294168362605224e-06, + "loss": 0.2147, + "step": 12227 + }, + { + "epoch": 0.7, + "grad_norm": 0.3997767864323401, + "learning_rate": 4.292640205867824e-06, + "loss": 0.2892, + "step": 12228 + }, + { + "epoch": 0.7, + "grad_norm": 1.072222696374089, + "learning_rate": 4.291112246776052e-06, + "loss": 0.2978, + "step": 12229 + }, + { + "epoch": 0.7, + "grad_norm": 0.2996227549076089, + "learning_rate": 4.2895844853828165e-06, + "loss": 0.2741, + "step": 12230 + }, + { + "epoch": 0.7, + "grad_norm": 0.40168932524681566, + "learning_rate": 4.288056921741024e-06, + "loss": 0.3001, + "step": 12231 + }, + { + "epoch": 0.7, + "grad_norm": 0.2768315986265087, + "learning_rate": 4.286529555903572e-06, + "loss": 0.1643, + "step": 12232 + }, + { + "epoch": 0.7, + "grad_norm": 0.41305074420474364, + "learning_rate": 4.285002387923359e-06, + "loss": 0.2157, + "step": 12233 + }, + { + "epoch": 0.7, + "grad_norm": 0.3232120840888913, + "learning_rate": 4.283475417853268e-06, + "loss": 0.2766, + "step": 12234 + }, + { + "epoch": 0.7, + "grad_norm": 0.7148956900870683, + "learning_rate": 4.2819486457461765e-06, + "loss": 0.2733, + "step": 12235 + }, + { + "epoch": 0.7, + "grad_norm": 0.8019061600739739, + "learning_rate": 4.280422071654955e-06, + "loss": 0.4053, + "step": 12236 + }, + { + "epoch": 0.7, + "grad_norm": 0.3394041675400556, + "learning_rate": 4.278895695632474e-06, + "loss": 0.2784, + "step": 12237 + }, + { + "epoch": 0.7, + "grad_norm": 0.3341011586741274, + "learning_rate": 4.27736951773159e-06, + "loss": 0.2744, + "step": 12238 + }, + { + "epoch": 0.7, + "grad_norm": 0.35152470282886145, + "learning_rate": 4.275843538005153e-06, + "loss": 0.2208, + "step": 12239 + }, + { + "epoch": 0.7, + "grad_norm": 0.370400992897311, + "learning_rate": 4.274317756506008e-06, + "loss": 0.2785, + "step": 12240 + }, + { + "epoch": 0.7, + "grad_norm": 0.4693231268558691, + "learning_rate": 4.2727921732869894e-06, + "loss": 0.2495, + "step": 12241 + }, + { + "epoch": 0.7, + "grad_norm": 0.3241862213090096, + "learning_rate": 4.271266788400935e-06, + "loss": 0.2379, + "step": 12242 + }, + { + "epoch": 0.7, + "grad_norm": 0.3449514730274378, + "learning_rate": 4.269741601900667e-06, + "loss": 0.2681, + "step": 12243 + }, + { + "epoch": 0.7, + "grad_norm": 1.1807650540650891, + "learning_rate": 4.268216613838998e-06, + "loss": 0.6882, + "step": 12244 + }, + { + "epoch": 0.7, + "grad_norm": 0.36468184086103056, + "learning_rate": 4.266691824268739e-06, + "loss": 0.2161, + "step": 12245 + }, + { + "epoch": 0.7, + "grad_norm": 0.29744735921142973, + "learning_rate": 4.2651672332427e-06, + "loss": 0.264, + "step": 12246 + }, + { + "epoch": 0.7, + "grad_norm": 0.476773513987374, + "learning_rate": 4.263642840813672e-06, + "loss": 0.2558, + "step": 12247 + }, + { + "epoch": 0.7, + "grad_norm": 0.5281422302262536, + "learning_rate": 4.262118647034447e-06, + "loss": 0.132, + "step": 12248 + }, + { + "epoch": 0.7, + "grad_norm": 0.3918244540090564, + "learning_rate": 4.260594651957801e-06, + "loss": 0.3159, + "step": 12249 + }, + { + "epoch": 0.7, + "grad_norm": 0.3831510721886654, + "learning_rate": 4.25907085563652e-06, + "loss": 0.3368, + "step": 12250 + }, + { + "epoch": 0.7, + "grad_norm": 0.518415226887272, + "learning_rate": 4.257547258123369e-06, + "loss": 0.2439, + "step": 12251 + }, + { + "epoch": 0.7, + "grad_norm": 0.3912430449227991, + "learning_rate": 4.256023859471109e-06, + "loss": 0.2949, + "step": 12252 + }, + { + "epoch": 0.7, + "grad_norm": 0.31137797694610614, + "learning_rate": 4.254500659732496e-06, + "loss": 0.1849, + "step": 12253 + }, + { + "epoch": 0.7, + "grad_norm": 0.36367056958104427, + "learning_rate": 4.2529776589602735e-06, + "loss": 0.3097, + "step": 12254 + }, + { + "epoch": 0.7, + "grad_norm": 0.28821543574715613, + "learning_rate": 4.251454857207193e-06, + "loss": 0.1862, + "step": 12255 + }, + { + "epoch": 0.7, + "grad_norm": 0.7304920210259426, + "learning_rate": 4.249932254525985e-06, + "loss": 0.4968, + "step": 12256 + }, + { + "epoch": 0.7, + "grad_norm": 0.4805174118924019, + "learning_rate": 4.24840985096937e-06, + "loss": 0.3333, + "step": 12257 + }, + { + "epoch": 0.7, + "grad_norm": 0.24509749223680857, + "learning_rate": 4.246887646590077e-06, + "loss": 0.2109, + "step": 12258 + }, + { + "epoch": 0.7, + "grad_norm": 0.30668412572681175, + "learning_rate": 4.245365641440818e-06, + "loss": 0.2188, + "step": 12259 + }, + { + "epoch": 0.7, + "grad_norm": 1.0420482711700545, + "learning_rate": 4.243843835574299e-06, + "loss": 0.4492, + "step": 12260 + }, + { + "epoch": 0.7, + "grad_norm": 0.29947922607947813, + "learning_rate": 4.242322229043218e-06, + "loss": 0.2126, + "step": 12261 + }, + { + "epoch": 0.7, + "grad_norm": 0.34042241000669604, + "learning_rate": 4.240800821900274e-06, + "loss": 0.2878, + "step": 12262 + }, + { + "epoch": 0.7, + "grad_norm": 0.8679278842448671, + "learning_rate": 4.23927961419815e-06, + "loss": 0.4032, + "step": 12263 + }, + { + "epoch": 0.7, + "grad_norm": 0.3370249737144881, + "learning_rate": 4.237758605989523e-06, + "loss": 0.247, + "step": 12264 + }, + { + "epoch": 0.7, + "grad_norm": 0.15104224638072863, + "learning_rate": 4.236237797327071e-06, + "loss": 0.071, + "step": 12265 + }, + { + "epoch": 0.7, + "grad_norm": 0.37245443877274914, + "learning_rate": 4.2347171882634505e-06, + "loss": 0.3221, + "step": 12266 + }, + { + "epoch": 0.7, + "grad_norm": 0.3957859482382533, + "learning_rate": 4.2331967788513295e-06, + "loss": 0.2676, + "step": 12267 + }, + { + "epoch": 0.7, + "grad_norm": 0.4817458267951734, + "learning_rate": 4.231676569143357e-06, + "loss": 0.2875, + "step": 12268 + }, + { + "epoch": 0.7, + "grad_norm": 0.33385576628286906, + "learning_rate": 4.230156559192177e-06, + "loss": 0.3, + "step": 12269 + }, + { + "epoch": 0.7, + "grad_norm": 0.3606609119285642, + "learning_rate": 4.228636749050422e-06, + "loss": 0.2815, + "step": 12270 + }, + { + "epoch": 0.71, + "grad_norm": 0.28662475275203025, + "learning_rate": 4.227117138770733e-06, + "loss": 0.1133, + "step": 12271 + }, + { + "epoch": 0.71, + "grad_norm": 1.1924756559482446, + "learning_rate": 4.225597728405729e-06, + "loss": 0.5055, + "step": 12272 + }, + { + "epoch": 0.71, + "grad_norm": 0.33127274692223163, + "learning_rate": 4.224078518008028e-06, + "loss": 0.2566, + "step": 12273 + }, + { + "epoch": 0.71, + "grad_norm": 0.39603641748968726, + "learning_rate": 4.222559507630235e-06, + "loss": 0.2764, + "step": 12274 + }, + { + "epoch": 0.71, + "grad_norm": 1.1371302801557202, + "learning_rate": 4.221040697324962e-06, + "loss": 0.8109, + "step": 12275 + }, + { + "epoch": 0.71, + "grad_norm": 0.35190049784212624, + "learning_rate": 4.2195220871448005e-06, + "loss": 0.2424, + "step": 12276 + }, + { + "epoch": 0.71, + "grad_norm": 0.27023187100195273, + "learning_rate": 4.218003677142342e-06, + "loss": 0.2132, + "step": 12277 + }, + { + "epoch": 0.71, + "grad_norm": 0.48459025177133447, + "learning_rate": 4.216485467370163e-06, + "loss": 0.2421, + "step": 12278 + }, + { + "epoch": 0.71, + "grad_norm": 0.30829697404074013, + "learning_rate": 4.214967457880846e-06, + "loss": 0.2431, + "step": 12279 + }, + { + "epoch": 0.71, + "grad_norm": 0.6722535491136014, + "learning_rate": 4.213449648726958e-06, + "loss": 0.4095, + "step": 12280 + }, + { + "epoch": 0.71, + "grad_norm": 0.30788458676053476, + "learning_rate": 4.211932039961061e-06, + "loss": 0.2747, + "step": 12281 + }, + { + "epoch": 0.71, + "grad_norm": 0.3495341382200821, + "learning_rate": 4.210414631635707e-06, + "loss": 0.2409, + "step": 12282 + }, + { + "epoch": 0.71, + "grad_norm": 1.1231503038430486, + "learning_rate": 4.208897423803443e-06, + "loss": 0.5617, + "step": 12283 + }, + { + "epoch": 0.71, + "grad_norm": 0.3074058591745467, + "learning_rate": 4.207380416516815e-06, + "loss": 0.162, + "step": 12284 + }, + { + "epoch": 0.71, + "grad_norm": 0.4244766221986541, + "learning_rate": 4.2058636098283545e-06, + "loss": 0.294, + "step": 12285 + }, + { + "epoch": 0.71, + "grad_norm": 0.31602488202105283, + "learning_rate": 4.204347003790588e-06, + "loss": 0.3086, + "step": 12286 + }, + { + "epoch": 0.71, + "grad_norm": 0.9753506360078198, + "learning_rate": 4.202830598456032e-06, + "loss": 0.5152, + "step": 12287 + }, + { + "epoch": 0.71, + "grad_norm": 0.4134773260026491, + "learning_rate": 4.201314393877206e-06, + "loss": 0.2844, + "step": 12288 + }, + { + "epoch": 0.71, + "grad_norm": 0.3680992182220188, + "learning_rate": 4.199798390106613e-06, + "loss": 0.2835, + "step": 12289 + }, + { + "epoch": 0.71, + "grad_norm": 0.29708444907868753, + "learning_rate": 4.198282587196757e-06, + "loss": 0.2302, + "step": 12290 + }, + { + "epoch": 0.71, + "grad_norm": 0.2937281945675394, + "learning_rate": 4.196766985200118e-06, + "loss": 0.1866, + "step": 12291 + }, + { + "epoch": 0.71, + "grad_norm": 0.6844793590759544, + "learning_rate": 4.195251584169192e-06, + "loss": 0.4549, + "step": 12292 + }, + { + "epoch": 0.71, + "grad_norm": 0.5098854869023844, + "learning_rate": 4.193736384156455e-06, + "loss": 0.3973, + "step": 12293 + }, + { + "epoch": 0.71, + "grad_norm": 0.2564448270712793, + "learning_rate": 4.192221385214377e-06, + "loss": 0.2072, + "step": 12294 + }, + { + "epoch": 0.71, + "grad_norm": 0.5101193979062211, + "learning_rate": 4.190706587395418e-06, + "loss": 0.2752, + "step": 12295 + }, + { + "epoch": 0.71, + "grad_norm": 0.4351332109330603, + "learning_rate": 4.189191990752044e-06, + "loss": 0.271, + "step": 12296 + }, + { + "epoch": 0.71, + "grad_norm": 0.26640761033236676, + "learning_rate": 4.187677595336702e-06, + "loss": 0.2164, + "step": 12297 + }, + { + "epoch": 0.71, + "grad_norm": 0.7739642089022813, + "learning_rate": 4.186163401201835e-06, + "loss": 0.3333, + "step": 12298 + }, + { + "epoch": 0.71, + "grad_norm": 0.9586181627625986, + "learning_rate": 4.184649408399876e-06, + "loss": 0.6132, + "step": 12299 + }, + { + "epoch": 0.71, + "grad_norm": 0.3301622040533963, + "learning_rate": 4.183135616983261e-06, + "loss": 0.1892, + "step": 12300 + }, + { + "epoch": 0.71, + "grad_norm": 0.4567133495984465, + "learning_rate": 4.181622027004409e-06, + "loss": 0.3046, + "step": 12301 + }, + { + "epoch": 0.71, + "grad_norm": 0.46370032678492656, + "learning_rate": 4.1801086385157366e-06, + "loss": 0.3539, + "step": 12302 + }, + { + "epoch": 0.71, + "grad_norm": 0.4066621177522798, + "learning_rate": 4.178595451569648e-06, + "loss": 0.2692, + "step": 12303 + }, + { + "epoch": 0.71, + "grad_norm": 0.20560129040918335, + "learning_rate": 4.177082466218553e-06, + "loss": 0.1409, + "step": 12304 + }, + { + "epoch": 0.71, + "grad_norm": 0.35498442687133175, + "learning_rate": 4.17556968251484e-06, + "loss": 0.3217, + "step": 12305 + }, + { + "epoch": 0.71, + "grad_norm": 0.6429202460112974, + "learning_rate": 4.1740571005109e-06, + "loss": 0.3263, + "step": 12306 + }, + { + "epoch": 0.71, + "grad_norm": 0.35829501157032695, + "learning_rate": 4.1725447202591115e-06, + "loss": 0.2361, + "step": 12307 + }, + { + "epoch": 0.71, + "grad_norm": 0.6702766382148628, + "learning_rate": 4.171032541811846e-06, + "loss": 0.3579, + "step": 12308 + }, + { + "epoch": 0.71, + "grad_norm": 0.302468040201284, + "learning_rate": 4.169520565221476e-06, + "loss": 0.2649, + "step": 12309 + }, + { + "epoch": 0.71, + "grad_norm": 0.19513857944378782, + "learning_rate": 4.1680087905403575e-06, + "loss": 0.1667, + "step": 12310 + }, + { + "epoch": 0.71, + "grad_norm": 1.3169755916645531, + "learning_rate": 4.166497217820844e-06, + "loss": 0.7212, + "step": 12311 + }, + { + "epoch": 0.71, + "grad_norm": 0.3903916545542141, + "learning_rate": 4.164985847115279e-06, + "loss": 0.2538, + "step": 12312 + }, + { + "epoch": 0.71, + "grad_norm": 0.3307900193032403, + "learning_rate": 4.163474678476004e-06, + "loss": 0.2559, + "step": 12313 + }, + { + "epoch": 0.71, + "grad_norm": 0.7210475091920264, + "learning_rate": 4.161963711955351e-06, + "loss": 0.3969, + "step": 12314 + }, + { + "epoch": 0.71, + "grad_norm": 0.4130701912582865, + "learning_rate": 4.1604529476056446e-06, + "loss": 0.2791, + "step": 12315 + }, + { + "epoch": 0.71, + "grad_norm": 0.33158894769484887, + "learning_rate": 4.158942385479198e-06, + "loss": 0.2499, + "step": 12316 + }, + { + "epoch": 0.71, + "grad_norm": 0.26527730099168056, + "learning_rate": 4.157432025628327e-06, + "loss": 0.2226, + "step": 12317 + }, + { + "epoch": 0.71, + "grad_norm": 0.4024841563367093, + "learning_rate": 4.155921868105336e-06, + "loss": 0.2536, + "step": 12318 + }, + { + "epoch": 0.71, + "grad_norm": 0.4715939185767074, + "learning_rate": 4.154411912962518e-06, + "loss": 0.3245, + "step": 12319 + }, + { + "epoch": 0.71, + "grad_norm": 0.49784484653712924, + "learning_rate": 4.152902160252165e-06, + "loss": 0.2754, + "step": 12320 + }, + { + "epoch": 0.71, + "grad_norm": 0.3167715593401071, + "learning_rate": 4.151392610026554e-06, + "loss": 0.2556, + "step": 12321 + }, + { + "epoch": 0.71, + "grad_norm": 0.5362943498624293, + "learning_rate": 4.149883262337969e-06, + "loss": 0.361, + "step": 12322 + }, + { + "epoch": 0.71, + "grad_norm": 0.2270767119225587, + "learning_rate": 4.148374117238676e-06, + "loss": 0.1251, + "step": 12323 + }, + { + "epoch": 0.71, + "grad_norm": 0.5482866300378484, + "learning_rate": 4.1468651747809366e-06, + "loss": 0.2773, + "step": 12324 + }, + { + "epoch": 0.71, + "grad_norm": 0.26991337180134006, + "learning_rate": 4.145356435017003e-06, + "loss": 0.262, + "step": 12325 + }, + { + "epoch": 0.71, + "grad_norm": 0.8129780685542745, + "learning_rate": 4.143847897999124e-06, + "loss": 0.3138, + "step": 12326 + }, + { + "epoch": 0.71, + "grad_norm": 0.6230381356562327, + "learning_rate": 4.142339563779542e-06, + "loss": 0.3043, + "step": 12327 + }, + { + "epoch": 0.71, + "grad_norm": 0.4264045585072286, + "learning_rate": 4.140831432410484e-06, + "loss": 0.3266, + "step": 12328 + }, + { + "epoch": 0.71, + "grad_norm": 0.3738143640469887, + "learning_rate": 4.139323503944186e-06, + "loss": 0.3328, + "step": 12329 + }, + { + "epoch": 0.71, + "grad_norm": 0.2431647354371285, + "learning_rate": 4.1378157784328625e-06, + "loss": 0.1254, + "step": 12330 + }, + { + "epoch": 0.71, + "grad_norm": 0.5894238839413188, + "learning_rate": 4.136308255928726e-06, + "loss": 0.3434, + "step": 12331 + }, + { + "epoch": 0.71, + "grad_norm": 1.2000110180565327, + "learning_rate": 4.134800936483983e-06, + "loss": 0.6542, + "step": 12332 + }, + { + "epoch": 0.71, + "grad_norm": 0.267646159596712, + "learning_rate": 4.1332938201508285e-06, + "loss": 0.2183, + "step": 12333 + }, + { + "epoch": 0.71, + "grad_norm": 0.5490669982217542, + "learning_rate": 4.13178690698146e-06, + "loss": 0.3078, + "step": 12334 + }, + { + "epoch": 0.71, + "grad_norm": 0.40224921477044767, + "learning_rate": 4.130280197028058e-06, + "loss": 0.2917, + "step": 12335 + }, + { + "epoch": 0.71, + "grad_norm": 0.398436666474072, + "learning_rate": 4.128773690342801e-06, + "loss": 0.2357, + "step": 12336 + }, + { + "epoch": 0.71, + "grad_norm": 0.3085871901913578, + "learning_rate": 4.127267386977854e-06, + "loss": 0.2875, + "step": 12337 + }, + { + "epoch": 0.71, + "grad_norm": 0.41757727862480615, + "learning_rate": 4.125761286985389e-06, + "loss": 0.2504, + "step": 12338 + }, + { + "epoch": 0.71, + "grad_norm": 0.5953337494643126, + "learning_rate": 4.124255390417558e-06, + "loss": 0.2831, + "step": 12339 + }, + { + "epoch": 0.71, + "grad_norm": 0.3973615565133707, + "learning_rate": 4.122749697326511e-06, + "loss": 0.2354, + "step": 12340 + }, + { + "epoch": 0.71, + "grad_norm": 0.3576969019418588, + "learning_rate": 4.121244207764384e-06, + "loss": 0.3025, + "step": 12341 + }, + { + "epoch": 0.71, + "grad_norm": 0.6161027575033992, + "learning_rate": 4.119738921783323e-06, + "loss": 0.3604, + "step": 12342 + }, + { + "epoch": 0.71, + "grad_norm": 0.22322602570107805, + "learning_rate": 4.118233839435449e-06, + "loss": 0.1689, + "step": 12343 + }, + { + "epoch": 0.71, + "grad_norm": 0.5065158568778143, + "learning_rate": 4.1167289607728845e-06, + "loss": 0.3368, + "step": 12344 + }, + { + "epoch": 0.71, + "grad_norm": 0.4057898445010308, + "learning_rate": 4.1152242858477435e-06, + "loss": 0.2837, + "step": 12345 + }, + { + "epoch": 0.71, + "grad_norm": 0.31512309786509607, + "learning_rate": 4.113719814712127e-06, + "loss": 0.2346, + "step": 12346 + }, + { + "epoch": 0.71, + "grad_norm": 0.6702265380981722, + "learning_rate": 4.112215547418145e-06, + "loss": 0.4688, + "step": 12347 + }, + { + "epoch": 0.71, + "grad_norm": 0.38723962767239795, + "learning_rate": 4.110711484017886e-06, + "loss": 0.3277, + "step": 12348 + }, + { + "epoch": 0.71, + "grad_norm": 0.238084010943153, + "learning_rate": 4.1092076245634346e-06, + "loss": 0.1993, + "step": 12349 + }, + { + "epoch": 0.71, + "grad_norm": 0.4857992938940668, + "learning_rate": 4.107703969106867e-06, + "loss": 0.2588, + "step": 12350 + }, + { + "epoch": 0.71, + "grad_norm": 0.617699439344278, + "learning_rate": 4.10620051770026e-06, + "loss": 0.3235, + "step": 12351 + }, + { + "epoch": 0.71, + "grad_norm": 0.4087980308933262, + "learning_rate": 4.104697270395676e-06, + "loss": 0.3254, + "step": 12352 + }, + { + "epoch": 0.71, + "grad_norm": 0.34914826637736407, + "learning_rate": 4.103194227245172e-06, + "loss": 0.2729, + "step": 12353 + }, + { + "epoch": 0.71, + "grad_norm": 0.5393994201005855, + "learning_rate": 4.101691388300795e-06, + "loss": 0.3091, + "step": 12354 + }, + { + "epoch": 0.71, + "grad_norm": 0.42117791947230276, + "learning_rate": 4.100188753614595e-06, + "loss": 0.2889, + "step": 12355 + }, + { + "epoch": 0.71, + "grad_norm": 0.2679960829980229, + "learning_rate": 4.098686323238604e-06, + "loss": 0.1591, + "step": 12356 + }, + { + "epoch": 0.71, + "grad_norm": 0.3767455945961782, + "learning_rate": 4.097184097224853e-06, + "loss": 0.263, + "step": 12357 + }, + { + "epoch": 0.71, + "grad_norm": 0.3979800959449096, + "learning_rate": 4.095682075625363e-06, + "loss": 0.2972, + "step": 12358 + }, + { + "epoch": 0.71, + "grad_norm": 0.7153844837257711, + "learning_rate": 4.094180258492147e-06, + "loss": 0.3261, + "step": 12359 + }, + { + "epoch": 0.71, + "grad_norm": 0.33698757282514463, + "learning_rate": 4.092678645877217e-06, + "loss": 0.3312, + "step": 12360 + }, + { + "epoch": 0.71, + "grad_norm": 0.32736161830074756, + "learning_rate": 4.09117723783257e-06, + "loss": 0.2544, + "step": 12361 + }, + { + "epoch": 0.71, + "grad_norm": 0.29749907521333085, + "learning_rate": 4.089676034410198e-06, + "loss": 0.1178, + "step": 12362 + }, + { + "epoch": 0.71, + "grad_norm": 0.7603494708438945, + "learning_rate": 4.088175035662095e-06, + "loss": 0.4403, + "step": 12363 + }, + { + "epoch": 0.71, + "grad_norm": 0.3803334125043377, + "learning_rate": 4.086674241640235e-06, + "loss": 0.2635, + "step": 12364 + }, + { + "epoch": 0.71, + "grad_norm": 0.3682851850725568, + "learning_rate": 4.085173652396593e-06, + "loss": 0.3203, + "step": 12365 + }, + { + "epoch": 0.71, + "grad_norm": 0.5811058200485942, + "learning_rate": 4.083673267983128e-06, + "loss": 0.2581, + "step": 12366 + }, + { + "epoch": 0.71, + "grad_norm": 0.34836499493050266, + "learning_rate": 4.0821730884518085e-06, + "loss": 0.2758, + "step": 12367 + }, + { + "epoch": 0.71, + "grad_norm": 0.2767104843102272, + "learning_rate": 4.08067311385458e-06, + "loss": 0.1553, + "step": 12368 + }, + { + "epoch": 0.71, + "grad_norm": 0.30352089800280146, + "learning_rate": 4.079173344243387e-06, + "loss": 0.2286, + "step": 12369 + }, + { + "epoch": 0.71, + "grad_norm": 0.32423430858686303, + "learning_rate": 4.077673779670166e-06, + "loss": 0.2643, + "step": 12370 + }, + { + "epoch": 0.71, + "grad_norm": 0.6190946400648354, + "learning_rate": 4.076174420186844e-06, + "loss": 0.4023, + "step": 12371 + }, + { + "epoch": 0.71, + "grad_norm": 0.3159074215735569, + "learning_rate": 4.07467526584535e-06, + "loss": 0.2145, + "step": 12372 + }, + { + "epoch": 0.71, + "grad_norm": 0.30664118077870395, + "learning_rate": 4.073176316697598e-06, + "loss": 0.2746, + "step": 12373 + }, + { + "epoch": 0.71, + "grad_norm": 0.3017502296489269, + "learning_rate": 4.071677572795495e-06, + "loss": 0.1723, + "step": 12374 + }, + { + "epoch": 0.71, + "grad_norm": 0.4615780722507267, + "learning_rate": 4.0701790341909386e-06, + "loss": 0.2075, + "step": 12375 + }, + { + "epoch": 0.71, + "grad_norm": 0.3855982088594073, + "learning_rate": 4.068680700935831e-06, + "loss": 0.3036, + "step": 12376 + }, + { + "epoch": 0.71, + "grad_norm": 0.36671811376397273, + "learning_rate": 4.0671825730820555e-06, + "loss": 0.3023, + "step": 12377 + }, + { + "epoch": 0.71, + "grad_norm": 1.2575854114457314, + "learning_rate": 4.065684650681493e-06, + "loss": 0.75, + "step": 12378 + }, + { + "epoch": 0.71, + "grad_norm": 0.34502650981498256, + "learning_rate": 4.064186933786012e-06, + "loss": 0.2024, + "step": 12379 + }, + { + "epoch": 0.71, + "grad_norm": 0.22705977337178385, + "learning_rate": 4.062689422447487e-06, + "loss": 0.2112, + "step": 12380 + }, + { + "epoch": 0.71, + "grad_norm": 0.640804257150291, + "learning_rate": 4.061192116717771e-06, + "loss": 0.4038, + "step": 12381 + }, + { + "epoch": 0.71, + "grad_norm": 0.33188078306411595, + "learning_rate": 4.0596950166487146e-06, + "loss": 0.2323, + "step": 12382 + }, + { + "epoch": 0.71, + "grad_norm": 1.2416745783711352, + "learning_rate": 4.058198122292167e-06, + "loss": 0.6851, + "step": 12383 + }, + { + "epoch": 0.71, + "grad_norm": 0.33509544879795256, + "learning_rate": 4.0567014336999584e-06, + "loss": 0.2977, + "step": 12384 + }, + { + "epoch": 0.71, + "grad_norm": 0.35947747921768164, + "learning_rate": 4.055204950923927e-06, + "loss": 0.205, + "step": 12385 + }, + { + "epoch": 0.71, + "grad_norm": 0.32272880351486805, + "learning_rate": 4.053708674015893e-06, + "loss": 0.1713, + "step": 12386 + }, + { + "epoch": 0.71, + "grad_norm": 0.5147264850139122, + "learning_rate": 4.052212603027672e-06, + "loss": 0.3099, + "step": 12387 + }, + { + "epoch": 0.71, + "grad_norm": 0.2537057854401847, + "learning_rate": 4.050716738011068e-06, + "loss": 0.1977, + "step": 12388 + }, + { + "epoch": 0.71, + "grad_norm": 0.5015595237780548, + "learning_rate": 4.049221079017892e-06, + "loss": 0.3651, + "step": 12389 + }, + { + "epoch": 0.71, + "grad_norm": 1.2806204433781172, + "learning_rate": 4.0477256260999344e-06, + "loss": 0.6888, + "step": 12390 + }, + { + "epoch": 0.71, + "grad_norm": 0.3961070936444837, + "learning_rate": 4.046230379308982e-06, + "loss": 0.2544, + "step": 12391 + }, + { + "epoch": 0.71, + "grad_norm": 0.24351026173987872, + "learning_rate": 4.0447353386968155e-06, + "loss": 0.2007, + "step": 12392 + }, + { + "epoch": 0.71, + "grad_norm": 0.5317384655177135, + "learning_rate": 4.043240504315209e-06, + "loss": 0.3628, + "step": 12393 + }, + { + "epoch": 0.71, + "grad_norm": 0.36782911635286475, + "learning_rate": 4.041745876215927e-06, + "loss": 0.3094, + "step": 12394 + }, + { + "epoch": 0.71, + "grad_norm": 0.26601632760932586, + "learning_rate": 4.040251454450729e-06, + "loss": 0.1666, + "step": 12395 + }, + { + "epoch": 0.71, + "grad_norm": 0.36052225817436, + "learning_rate": 4.038757239071364e-06, + "loss": 0.3037, + "step": 12396 + }, + { + "epoch": 0.71, + "grad_norm": 0.3159879747015827, + "learning_rate": 4.037263230129583e-06, + "loss": 0.2501, + "step": 12397 + }, + { + "epoch": 0.71, + "grad_norm": 0.6144153528165102, + "learning_rate": 4.035769427677118e-06, + "loss": 0.2974, + "step": 12398 + }, + { + "epoch": 0.71, + "grad_norm": 0.4335431228327503, + "learning_rate": 4.034275831765702e-06, + "loss": 0.2698, + "step": 12399 + }, + { + "epoch": 0.71, + "grad_norm": 0.2705356897806853, + "learning_rate": 4.032782442447055e-06, + "loss": 0.249, + "step": 12400 + }, + { + "epoch": 0.71, + "grad_norm": 0.27198063920594956, + "learning_rate": 4.031289259772898e-06, + "loss": 0.1899, + "step": 12401 + }, + { + "epoch": 0.71, + "grad_norm": 0.9726204979369423, + "learning_rate": 4.029796283794938e-06, + "loss": 0.5632, + "step": 12402 + }, + { + "epoch": 0.71, + "grad_norm": 0.3511151019796598, + "learning_rate": 4.028303514564876e-06, + "loss": 0.2668, + "step": 12403 + }, + { + "epoch": 0.71, + "grad_norm": 0.46440585552108493, + "learning_rate": 4.026810952134402e-06, + "loss": 0.2985, + "step": 12404 + }, + { + "epoch": 0.71, + "grad_norm": 0.3997705620102345, + "learning_rate": 4.025318596555212e-06, + "loss": 0.2284, + "step": 12405 + }, + { + "epoch": 0.71, + "grad_norm": 0.24602939683462624, + "learning_rate": 4.023826447878982e-06, + "loss": 0.2192, + "step": 12406 + }, + { + "epoch": 0.71, + "grad_norm": 1.1948345929606112, + "learning_rate": 4.022334506157386e-06, + "loss": 0.8185, + "step": 12407 + }, + { + "epoch": 0.71, + "grad_norm": 0.33598971886925083, + "learning_rate": 4.020842771442085e-06, + "loss": 0.2542, + "step": 12408 + }, + { + "epoch": 0.71, + "grad_norm": 0.42240133665361956, + "learning_rate": 4.019351243784745e-06, + "loss": 0.2873, + "step": 12409 + }, + { + "epoch": 0.71, + "grad_norm": 0.5062951214860231, + "learning_rate": 4.017859923237014e-06, + "loss": 0.3348, + "step": 12410 + }, + { + "epoch": 0.71, + "grad_norm": 0.34173636455748163, + "learning_rate": 4.016368809850537e-06, + "loss": 0.2076, + "step": 12411 + }, + { + "epoch": 0.71, + "grad_norm": 0.3865772364660087, + "learning_rate": 4.01487790367695e-06, + "loss": 0.2696, + "step": 12412 + }, + { + "epoch": 0.71, + "grad_norm": 0.3557344917335959, + "learning_rate": 4.013387204767881e-06, + "loss": 0.3212, + "step": 12413 + }, + { + "epoch": 0.71, + "grad_norm": 0.23541794800684201, + "learning_rate": 4.01189671317496e-06, + "loss": 0.0956, + "step": 12414 + }, + { + "epoch": 0.71, + "grad_norm": 0.2951339678252151, + "learning_rate": 4.0104064289497965e-06, + "loss": 0.2571, + "step": 12415 + }, + { + "epoch": 0.71, + "grad_norm": 0.33535513867717087, + "learning_rate": 4.008916352144002e-06, + "loss": 0.302, + "step": 12416 + }, + { + "epoch": 0.71, + "grad_norm": 1.12245964652745, + "learning_rate": 4.007426482809172e-06, + "loss": 0.5287, + "step": 12417 + }, + { + "epoch": 0.71, + "grad_norm": 0.29781734362720846, + "learning_rate": 4.0059368209969106e-06, + "loss": 0.186, + "step": 12418 + }, + { + "epoch": 0.71, + "grad_norm": 1.0779840391036732, + "learning_rate": 4.004447366758798e-06, + "loss": 0.7548, + "step": 12419 + }, + { + "epoch": 0.71, + "grad_norm": 0.22319618891727672, + "learning_rate": 4.002958120146415e-06, + "loss": 0.2061, + "step": 12420 + }, + { + "epoch": 0.71, + "grad_norm": 0.3003774906221277, + "learning_rate": 4.001469081211332e-06, + "loss": 0.2077, + "step": 12421 + }, + { + "epoch": 0.71, + "grad_norm": 0.6334985632628138, + "learning_rate": 3.99998025000512e-06, + "loss": 0.3586, + "step": 12422 + }, + { + "epoch": 0.71, + "grad_norm": 1.0323028337477798, + "learning_rate": 3.998491626579334e-06, + "loss": 0.4385, + "step": 12423 + }, + { + "epoch": 0.71, + "grad_norm": 0.22590105407607644, + "learning_rate": 3.997003210985524e-06, + "loss": 0.2142, + "step": 12424 + }, + { + "epoch": 0.71, + "grad_norm": 1.2186487143817915, + "learning_rate": 3.995515003275235e-06, + "loss": 0.7572, + "step": 12425 + }, + { + "epoch": 0.71, + "grad_norm": 0.320271494595398, + "learning_rate": 3.9940270035000036e-06, + "loss": 0.2219, + "step": 12426 + }, + { + "epoch": 0.71, + "grad_norm": 0.5789444426466166, + "learning_rate": 3.992539211711359e-06, + "loss": 0.3055, + "step": 12427 + }, + { + "epoch": 0.71, + "grad_norm": 0.2750210705197789, + "learning_rate": 3.991051627960822e-06, + "loss": 0.2382, + "step": 12428 + }, + { + "epoch": 0.71, + "grad_norm": 1.1626005918199633, + "learning_rate": 3.989564252299907e-06, + "loss": 0.6162, + "step": 12429 + }, + { + "epoch": 0.71, + "grad_norm": 0.6137501022840672, + "learning_rate": 3.988077084780126e-06, + "loss": 0.3585, + "step": 12430 + }, + { + "epoch": 0.71, + "grad_norm": 0.3564797669004418, + "learning_rate": 3.986590125452977e-06, + "loss": 0.2574, + "step": 12431 + }, + { + "epoch": 0.71, + "grad_norm": 0.3595173618942061, + "learning_rate": 3.985103374369954e-06, + "loss": 0.3004, + "step": 12432 + }, + { + "epoch": 0.71, + "grad_norm": 0.5610988564310685, + "learning_rate": 3.983616831582538e-06, + "loss": 0.3092, + "step": 12433 + }, + { + "epoch": 0.71, + "grad_norm": 0.22865917764490962, + "learning_rate": 3.9821304971422155e-06, + "loss": 0.153, + "step": 12434 + }, + { + "epoch": 0.71, + "grad_norm": 1.0396124580607748, + "learning_rate": 3.980644371100457e-06, + "loss": 0.4856, + "step": 12435 + }, + { + "epoch": 0.71, + "grad_norm": 0.2712024160349323, + "learning_rate": 3.979158453508724e-06, + "loss": 0.2664, + "step": 12436 + }, + { + "epoch": 0.71, + "grad_norm": 0.45818436095554166, + "learning_rate": 3.977672744418475e-06, + "loss": 0.2934, + "step": 12437 + }, + { + "epoch": 0.71, + "grad_norm": 0.7060226833636093, + "learning_rate": 3.976187243881156e-06, + "loss": 0.3856, + "step": 12438 + }, + { + "epoch": 0.71, + "grad_norm": 0.28119753881737464, + "learning_rate": 3.974701951948218e-06, + "loss": 0.207, + "step": 12439 + }, + { + "epoch": 0.71, + "grad_norm": 0.3156999954213079, + "learning_rate": 3.973216868671092e-06, + "loss": 0.2725, + "step": 12440 + }, + { + "epoch": 0.71, + "grad_norm": 0.3275821325716865, + "learning_rate": 3.9717319941012054e-06, + "loss": 0.0955, + "step": 12441 + }, + { + "epoch": 0.71, + "grad_norm": 0.4230111041588665, + "learning_rate": 3.970247328289979e-06, + "loss": 0.3222, + "step": 12442 + }, + { + "epoch": 0.71, + "grad_norm": 0.5330638370371196, + "learning_rate": 3.96876287128883e-06, + "loss": 0.3889, + "step": 12443 + }, + { + "epoch": 0.71, + "grad_norm": 0.48238254940743125, + "learning_rate": 3.967278623149165e-06, + "loss": 0.2652, + "step": 12444 + }, + { + "epoch": 0.72, + "grad_norm": 0.6377281456624062, + "learning_rate": 3.965794583922382e-06, + "loss": 0.27, + "step": 12445 + }, + { + "epoch": 0.72, + "grad_norm": 0.24998486209393883, + "learning_rate": 3.964310753659869e-06, + "loss": 0.1951, + "step": 12446 + }, + { + "epoch": 0.72, + "grad_norm": 0.3267863704261433, + "learning_rate": 3.9628271324130185e-06, + "loss": 0.2329, + "step": 12447 + }, + { + "epoch": 0.72, + "grad_norm": 0.5376672392151259, + "learning_rate": 3.961343720233204e-06, + "loss": 0.2847, + "step": 12448 + }, + { + "epoch": 0.72, + "grad_norm": 0.41943787136242416, + "learning_rate": 3.9598605171717976e-06, + "loss": 0.3178, + "step": 12449 + }, + { + "epoch": 0.72, + "grad_norm": 0.6504493113239271, + "learning_rate": 3.958377523280162e-06, + "loss": 0.3003, + "step": 12450 + }, + { + "epoch": 0.72, + "grad_norm": 0.6739410311061353, + "learning_rate": 3.956894738609649e-06, + "loss": 0.3241, + "step": 12451 + }, + { + "epoch": 0.72, + "grad_norm": 0.25881186952173607, + "learning_rate": 3.955412163211615e-06, + "loss": 0.2511, + "step": 12452 + }, + { + "epoch": 0.72, + "grad_norm": 0.4730776211188233, + "learning_rate": 3.953929797137398e-06, + "loss": 0.2741, + "step": 12453 + }, + { + "epoch": 0.72, + "grad_norm": 0.32294979003618474, + "learning_rate": 3.9524476404383324e-06, + "loss": 0.1982, + "step": 12454 + }, + { + "epoch": 0.72, + "grad_norm": 0.4427233885442675, + "learning_rate": 3.9509656931657405e-06, + "loss": 0.3236, + "step": 12455 + }, + { + "epoch": 0.72, + "grad_norm": 0.5624680637454126, + "learning_rate": 3.949483955370951e-06, + "loss": 0.3349, + "step": 12456 + }, + { + "epoch": 0.72, + "grad_norm": 0.3926441797137969, + "learning_rate": 3.9480024271052715e-06, + "loss": 0.1955, + "step": 12457 + }, + { + "epoch": 0.72, + "grad_norm": 0.33987897951718266, + "learning_rate": 3.946521108420008e-06, + "loss": 0.2365, + "step": 12458 + }, + { + "epoch": 0.72, + "grad_norm": 0.3372391079565309, + "learning_rate": 3.945039999366458e-06, + "loss": 0.2494, + "step": 12459 + }, + { + "epoch": 0.72, + "grad_norm": 0.3119762747576032, + "learning_rate": 3.9435590999959115e-06, + "loss": 0.2094, + "step": 12460 + }, + { + "epoch": 0.72, + "grad_norm": 0.5354947724202691, + "learning_rate": 3.942078410359655e-06, + "loss": 0.3259, + "step": 12461 + }, + { + "epoch": 0.72, + "grad_norm": 0.7620520172406171, + "learning_rate": 3.940597930508962e-06, + "loss": 0.4483, + "step": 12462 + }, + { + "epoch": 0.72, + "grad_norm": 0.3951157539623328, + "learning_rate": 3.939117660495098e-06, + "loss": 0.2072, + "step": 12463 + }, + { + "epoch": 0.72, + "grad_norm": 0.30130478030699875, + "learning_rate": 3.937637600369332e-06, + "loss": 0.2685, + "step": 12464 + }, + { + "epoch": 0.72, + "grad_norm": 0.317880848548765, + "learning_rate": 3.936157750182915e-06, + "loss": 0.1775, + "step": 12465 + }, + { + "epoch": 0.72, + "grad_norm": 0.7654264548014176, + "learning_rate": 3.934678109987096e-06, + "loss": 0.3159, + "step": 12466 + }, + { + "epoch": 0.72, + "grad_norm": 0.35211292954046547, + "learning_rate": 3.933198679833108e-06, + "loss": 0.2436, + "step": 12467 + }, + { + "epoch": 0.72, + "grad_norm": 0.48058464826750036, + "learning_rate": 3.931719459772193e-06, + "loss": 0.4169, + "step": 12468 + }, + { + "epoch": 0.72, + "grad_norm": 0.7896326662596463, + "learning_rate": 3.9302404498555725e-06, + "loss": 0.3491, + "step": 12469 + }, + { + "epoch": 0.72, + "grad_norm": 0.1967202253818277, + "learning_rate": 3.928761650134464e-06, + "loss": 0.149, + "step": 12470 + }, + { + "epoch": 0.72, + "grad_norm": 0.5161953619328937, + "learning_rate": 3.927283060660075e-06, + "loss": 0.326, + "step": 12471 + }, + { + "epoch": 0.72, + "grad_norm": 0.4444720336390856, + "learning_rate": 3.925804681483614e-06, + "loss": 0.3244, + "step": 12472 + }, + { + "epoch": 0.72, + "grad_norm": 0.36218587264417024, + "learning_rate": 3.924326512656279e-06, + "loss": 0.243, + "step": 12473 + }, + { + "epoch": 0.72, + "grad_norm": 1.308785521351059, + "learning_rate": 3.922848554229254e-06, + "loss": 0.7986, + "step": 12474 + }, + { + "epoch": 0.72, + "grad_norm": 0.37676328938957093, + "learning_rate": 3.921370806253722e-06, + "loss": 0.2643, + "step": 12475 + }, + { + "epoch": 0.72, + "grad_norm": 0.29437300765355345, + "learning_rate": 3.919893268780854e-06, + "loss": 0.2193, + "step": 12476 + }, + { + "epoch": 0.72, + "grad_norm": 0.3144016115644878, + "learning_rate": 3.918415941861825e-06, + "loss": 0.2123, + "step": 12477 + }, + { + "epoch": 0.72, + "grad_norm": 0.49162002682484635, + "learning_rate": 3.91693882554779e-06, + "loss": 0.2872, + "step": 12478 + }, + { + "epoch": 0.72, + "grad_norm": 0.36777889700838373, + "learning_rate": 3.915461919889903e-06, + "loss": 0.3064, + "step": 12479 + }, + { + "epoch": 0.72, + "grad_norm": 0.3299043967705451, + "learning_rate": 3.913985224939303e-06, + "loss": 0.2594, + "step": 12480 + }, + { + "epoch": 0.72, + "grad_norm": 0.790198056679378, + "learning_rate": 3.912508740747137e-06, + "loss": 0.3766, + "step": 12481 + }, + { + "epoch": 0.72, + "grad_norm": 0.34173548287831607, + "learning_rate": 3.911032467364531e-06, + "loss": 0.2535, + "step": 12482 + }, + { + "epoch": 0.72, + "grad_norm": 0.2538296150592013, + "learning_rate": 3.909556404842609e-06, + "loss": 0.1983, + "step": 12483 + }, + { + "epoch": 0.72, + "grad_norm": 0.8236400542230081, + "learning_rate": 3.908080553232484e-06, + "loss": 0.4423, + "step": 12484 + }, + { + "epoch": 0.72, + "grad_norm": 0.3028413178151663, + "learning_rate": 3.906604912585271e-06, + "loss": 0.2541, + "step": 12485 + }, + { + "epoch": 0.72, + "grad_norm": 0.951138237415222, + "learning_rate": 3.905129482952067e-06, + "loss": 0.4526, + "step": 12486 + }, + { + "epoch": 0.72, + "grad_norm": 0.3129589276357312, + "learning_rate": 3.903654264383967e-06, + "loss": 0.2508, + "step": 12487 + }, + { + "epoch": 0.72, + "grad_norm": 0.3663928277836569, + "learning_rate": 3.902179256932058e-06, + "loss": 0.2743, + "step": 12488 + }, + { + "epoch": 0.72, + "grad_norm": 0.776248988057081, + "learning_rate": 3.900704460647416e-06, + "loss": 0.29, + "step": 12489 + }, + { + "epoch": 0.72, + "grad_norm": 0.32241816415937624, + "learning_rate": 3.89922987558112e-06, + "loss": 0.1965, + "step": 12490 + }, + { + "epoch": 0.72, + "grad_norm": 0.2956955229384876, + "learning_rate": 3.897755501784231e-06, + "loss": 0.2823, + "step": 12491 + }, + { + "epoch": 0.72, + "grad_norm": 0.49019798055890323, + "learning_rate": 3.896281339307805e-06, + "loss": 0.3925, + "step": 12492 + }, + { + "epoch": 0.72, + "grad_norm": 0.8539817933659437, + "learning_rate": 3.8948073882028945e-06, + "loss": 0.1781, + "step": 12493 + }, + { + "epoch": 0.72, + "grad_norm": 0.3545876480577167, + "learning_rate": 3.893333648520542e-06, + "loss": 0.2658, + "step": 12494 + }, + { + "epoch": 0.72, + "grad_norm": 0.3817268627937381, + "learning_rate": 3.891860120311784e-06, + "loss": 0.3108, + "step": 12495 + }, + { + "epoch": 0.72, + "grad_norm": 0.4087311870618779, + "learning_rate": 3.890386803627642e-06, + "loss": 0.1709, + "step": 12496 + }, + { + "epoch": 0.72, + "grad_norm": 0.4015110688746216, + "learning_rate": 3.888913698519145e-06, + "loss": 0.3142, + "step": 12497 + }, + { + "epoch": 0.72, + "grad_norm": 0.4151975450528388, + "learning_rate": 3.887440805037306e-06, + "loss": 0.2699, + "step": 12498 + }, + { + "epoch": 0.72, + "grad_norm": 0.3269623807011317, + "learning_rate": 3.885968123233128e-06, + "loss": 0.233, + "step": 12499 + }, + { + "epoch": 0.72, + "grad_norm": 0.35990831564539366, + "learning_rate": 3.884495653157611e-06, + "loss": 0.292, + "step": 12500 + }, + { + "epoch": 0.72, + "grad_norm": 0.67394425063061, + "learning_rate": 3.883023394861742e-06, + "loss": 0.3572, + "step": 12501 + }, + { + "epoch": 0.72, + "grad_norm": 0.33215648058639724, + "learning_rate": 3.881551348396515e-06, + "loss": 0.1395, + "step": 12502 + }, + { + "epoch": 0.72, + "grad_norm": 0.2825531167285477, + "learning_rate": 3.880079513812901e-06, + "loss": 0.2577, + "step": 12503 + }, + { + "epoch": 0.72, + "grad_norm": 0.3434631773890578, + "learning_rate": 3.878607891161871e-06, + "loss": 0.2822, + "step": 12504 + }, + { + "epoch": 0.72, + "grad_norm": 0.8360101546679122, + "learning_rate": 3.8771364804943825e-06, + "loss": 0.4662, + "step": 12505 + }, + { + "epoch": 0.72, + "grad_norm": 0.31128367410087177, + "learning_rate": 3.8756652818613975e-06, + "loss": 0.2256, + "step": 12506 + }, + { + "epoch": 0.72, + "grad_norm": 0.3889226384732306, + "learning_rate": 3.8741942953138616e-06, + "loss": 0.3229, + "step": 12507 + }, + { + "epoch": 0.72, + "grad_norm": 1.4681783448097931, + "learning_rate": 3.872723520902713e-06, + "loss": 0.7554, + "step": 12508 + }, + { + "epoch": 0.72, + "grad_norm": 0.2748888158937971, + "learning_rate": 3.87125295867888e-06, + "loss": 0.2006, + "step": 12509 + }, + { + "epoch": 0.72, + "grad_norm": 0.27882312944220194, + "learning_rate": 3.8697826086933e-06, + "loss": 0.1742, + "step": 12510 + }, + { + "epoch": 0.72, + "grad_norm": 0.3533427566146823, + "learning_rate": 3.868312470996884e-06, + "loss": 0.3008, + "step": 12511 + }, + { + "epoch": 0.72, + "grad_norm": 0.3111571689105808, + "learning_rate": 3.866842545640542e-06, + "loss": 0.1881, + "step": 12512 + }, + { + "epoch": 0.72, + "grad_norm": 0.6455308980072993, + "learning_rate": 3.86537283267518e-06, + "loss": 0.3791, + "step": 12513 + }, + { + "epoch": 0.72, + "grad_norm": 0.4809467316313205, + "learning_rate": 3.863903332151689e-06, + "loss": 0.3647, + "step": 12514 + }, + { + "epoch": 0.72, + "grad_norm": 0.40863296174877306, + "learning_rate": 3.862434044120966e-06, + "loss": 0.2974, + "step": 12515 + }, + { + "epoch": 0.72, + "grad_norm": 0.20722922763128526, + "learning_rate": 3.860964968633888e-06, + "loss": 0.1662, + "step": 12516 + }, + { + "epoch": 0.72, + "grad_norm": 0.5038681834454446, + "learning_rate": 3.859496105741328e-06, + "loss": 0.3576, + "step": 12517 + }, + { + "epoch": 0.72, + "grad_norm": 0.3730892496342433, + "learning_rate": 3.858027455494152e-06, + "loss": 0.271, + "step": 12518 + }, + { + "epoch": 0.72, + "grad_norm": 0.37789847744452215, + "learning_rate": 3.856559017943223e-06, + "loss": 0.2646, + "step": 12519 + }, + { + "epoch": 0.72, + "grad_norm": 1.2120655893657206, + "learning_rate": 3.8550907931393925e-06, + "loss": 0.5642, + "step": 12520 + }, + { + "epoch": 0.72, + "grad_norm": 0.32447720363858357, + "learning_rate": 3.853622781133503e-06, + "loss": 0.2848, + "step": 12521 + }, + { + "epoch": 0.72, + "grad_norm": 0.3132306682340447, + "learning_rate": 3.852154981976388e-06, + "loss": 0.2106, + "step": 12522 + }, + { + "epoch": 0.72, + "grad_norm": 0.43512513969447275, + "learning_rate": 3.8506873957188865e-06, + "loss": 0.3212, + "step": 12523 + }, + { + "epoch": 0.72, + "grad_norm": 0.32854106565529423, + "learning_rate": 3.849220022411815e-06, + "loss": 0.261, + "step": 12524 + }, + { + "epoch": 0.72, + "grad_norm": 0.4033956109292242, + "learning_rate": 3.84775286210599e-06, + "loss": 0.1076, + "step": 12525 + }, + { + "epoch": 0.72, + "grad_norm": 0.4484468926497038, + "learning_rate": 3.846285914852216e-06, + "loss": 0.3562, + "step": 12526 + }, + { + "epoch": 0.72, + "grad_norm": 0.2682409012322284, + "learning_rate": 3.844819180701302e-06, + "loss": 0.2414, + "step": 12527 + }, + { + "epoch": 0.72, + "grad_norm": 1.2722056250509972, + "learning_rate": 3.843352659704032e-06, + "loss": 0.6141, + "step": 12528 + }, + { + "epoch": 0.72, + "grad_norm": 0.4828470045708207, + "learning_rate": 3.841886351911195e-06, + "loss": 0.227, + "step": 12529 + }, + { + "epoch": 0.72, + "grad_norm": 0.30225508543561413, + "learning_rate": 3.840420257373565e-06, + "loss": 0.245, + "step": 12530 + }, + { + "epoch": 0.72, + "grad_norm": 0.29599291538165057, + "learning_rate": 3.83895437614192e-06, + "loss": 0.246, + "step": 12531 + }, + { + "epoch": 0.72, + "grad_norm": 1.1752706306155982, + "learning_rate": 3.837488708267021e-06, + "loss": 0.3813, + "step": 12532 + }, + { + "epoch": 0.72, + "grad_norm": 0.43554275734913983, + "learning_rate": 3.836023253799621e-06, + "loss": 0.2608, + "step": 12533 + }, + { + "epoch": 0.72, + "grad_norm": 0.4608820204490836, + "learning_rate": 3.834558012790469e-06, + "loss": 0.347, + "step": 12534 + }, + { + "epoch": 0.72, + "grad_norm": 0.2756554502113018, + "learning_rate": 3.833092985290311e-06, + "loss": 0.2264, + "step": 12535 + }, + { + "epoch": 0.72, + "grad_norm": 0.39145909647593713, + "learning_rate": 3.831628171349877e-06, + "loss": 0.2618, + "step": 12536 + }, + { + "epoch": 0.72, + "grad_norm": 0.49976557663593313, + "learning_rate": 3.8301635710198946e-06, + "loss": 0.2346, + "step": 12537 + }, + { + "epoch": 0.72, + "grad_norm": 0.4174397744132169, + "learning_rate": 3.828699184351079e-06, + "loss": 0.2437, + "step": 12538 + }, + { + "epoch": 0.72, + "grad_norm": 0.276496262939729, + "learning_rate": 3.8272350113941494e-06, + "loss": 0.2565, + "step": 12539 + }, + { + "epoch": 0.72, + "grad_norm": 0.5865133389721945, + "learning_rate": 3.825771052199805e-06, + "loss": 0.4208, + "step": 12540 + }, + { + "epoch": 0.72, + "grad_norm": 0.8880953147875467, + "learning_rate": 3.824307306818745e-06, + "loss": 0.4872, + "step": 12541 + }, + { + "epoch": 0.72, + "grad_norm": 0.2268528697619566, + "learning_rate": 3.822843775301656e-06, + "loss": 0.1535, + "step": 12542 + }, + { + "epoch": 0.72, + "grad_norm": 0.29010451586633673, + "learning_rate": 3.821380457699217e-06, + "loss": 0.2439, + "step": 12543 + }, + { + "epoch": 0.72, + "grad_norm": 1.2260624615809157, + "learning_rate": 3.819917354062113e-06, + "loss": 0.6806, + "step": 12544 + }, + { + "epoch": 0.72, + "grad_norm": 0.2824308471763668, + "learning_rate": 3.8184544644410026e-06, + "loss": 0.2042, + "step": 12545 + }, + { + "epoch": 0.72, + "grad_norm": 0.776619086277665, + "learning_rate": 3.816991788886551e-06, + "loss": 0.4011, + "step": 12546 + }, + { + "epoch": 0.72, + "grad_norm": 0.3530785405930903, + "learning_rate": 3.815529327449402e-06, + "loss": 0.312, + "step": 12547 + }, + { + "epoch": 0.72, + "grad_norm": 0.32265673340370904, + "learning_rate": 3.8140670801802114e-06, + "loss": 0.1984, + "step": 12548 + }, + { + "epoch": 0.72, + "grad_norm": 0.2522603406858445, + "learning_rate": 3.8126050471296116e-06, + "loss": 0.1534, + "step": 12549 + }, + { + "epoch": 0.72, + "grad_norm": 0.3592744062620737, + "learning_rate": 3.811143228348233e-06, + "loss": 0.3229, + "step": 12550 + }, + { + "epoch": 0.72, + "grad_norm": 0.33175709985865126, + "learning_rate": 3.809681623886694e-06, + "loss": 0.1751, + "step": 12551 + }, + { + "epoch": 0.72, + "grad_norm": 0.5046438803543758, + "learning_rate": 3.8082202337956187e-06, + "loss": 0.3209, + "step": 12552 + }, + { + "epoch": 0.72, + "grad_norm": 1.0417354951774573, + "learning_rate": 3.80675905812561e-06, + "loss": 0.524, + "step": 12553 + }, + { + "epoch": 0.72, + "grad_norm": 0.3264891220161526, + "learning_rate": 3.805298096927269e-06, + "loss": 0.2089, + "step": 12554 + }, + { + "epoch": 0.72, + "grad_norm": 0.2294114627721482, + "learning_rate": 3.803837350251188e-06, + "loss": 0.2131, + "step": 12555 + }, + { + "epoch": 0.72, + "grad_norm": 1.2952230032564946, + "learning_rate": 3.8023768181479493e-06, + "loss": 0.8189, + "step": 12556 + }, + { + "epoch": 0.72, + "grad_norm": 0.48200138614488336, + "learning_rate": 3.800916500668139e-06, + "loss": 0.3078, + "step": 12557 + }, + { + "epoch": 0.72, + "grad_norm": 0.2545052188435359, + "learning_rate": 3.7994563978623243e-06, + "loss": 0.2414, + "step": 12558 + }, + { + "epoch": 0.72, + "grad_norm": 1.1399172653767695, + "learning_rate": 3.7979965097810667e-06, + "loss": 0.5805, + "step": 12559 + }, + { + "epoch": 0.72, + "grad_norm": 0.4024592476572722, + "learning_rate": 3.7965368364749244e-06, + "loss": 0.2655, + "step": 12560 + }, + { + "epoch": 0.72, + "grad_norm": 0.2189980848773631, + "learning_rate": 3.7950773779944437e-06, + "loss": 0.1199, + "step": 12561 + }, + { + "epoch": 0.72, + "grad_norm": 0.31760219818640645, + "learning_rate": 3.793618134390168e-06, + "loss": 0.3045, + "step": 12562 + }, + { + "epoch": 0.72, + "grad_norm": 0.3990174933506131, + "learning_rate": 3.792159105712625e-06, + "loss": 0.2764, + "step": 12563 + }, + { + "epoch": 0.72, + "grad_norm": 0.49668825548514706, + "learning_rate": 3.7907002920123482e-06, + "loss": 0.2711, + "step": 12564 + }, + { + "epoch": 0.72, + "grad_norm": 1.2066436895640726, + "learning_rate": 3.7892416933398534e-06, + "loss": 0.5866, + "step": 12565 + }, + { + "epoch": 0.72, + "grad_norm": 0.325699463496659, + "learning_rate": 3.7877833097456527e-06, + "loss": 0.259, + "step": 12566 + }, + { + "epoch": 0.72, + "grad_norm": 0.2975927184507548, + "learning_rate": 3.786325141280248e-06, + "loss": 0.2501, + "step": 12567 + }, + { + "epoch": 0.72, + "grad_norm": 0.43994328199667015, + "learning_rate": 3.7848671879941334e-06, + "loss": 0.2036, + "step": 12568 + }, + { + "epoch": 0.72, + "grad_norm": 0.6533205368181108, + "learning_rate": 3.783409449937804e-06, + "loss": 0.3164, + "step": 12569 + }, + { + "epoch": 0.72, + "grad_norm": 0.3752405058155906, + "learning_rate": 3.7819519271617377e-06, + "loss": 0.3023, + "step": 12570 + }, + { + "epoch": 0.72, + "grad_norm": 0.3617519561068833, + "learning_rate": 3.7804946197164096e-06, + "loss": 0.2507, + "step": 12571 + }, + { + "epoch": 0.72, + "grad_norm": 0.594735705756224, + "learning_rate": 3.779037527652282e-06, + "loss": 0.3228, + "step": 12572 + }, + { + "epoch": 0.72, + "grad_norm": 0.2359376307305643, + "learning_rate": 3.77758065101982e-06, + "loss": 0.1857, + "step": 12573 + }, + { + "epoch": 0.72, + "grad_norm": 0.35542427665950116, + "learning_rate": 3.7761239898694724e-06, + "loss": 0.2582, + "step": 12574 + }, + { + "epoch": 0.72, + "grad_norm": 0.5222486191552719, + "learning_rate": 3.774667544251683e-06, + "loss": 0.309, + "step": 12575 + }, + { + "epoch": 0.72, + "grad_norm": 0.41862420835222075, + "learning_rate": 3.773211314216887e-06, + "loss": 0.3407, + "step": 12576 + }, + { + "epoch": 0.72, + "grad_norm": 1.3031113013277529, + "learning_rate": 3.7717552998155184e-06, + "loss": 0.3212, + "step": 12577 + }, + { + "epoch": 0.72, + "grad_norm": 0.3062819204901965, + "learning_rate": 3.770299501097995e-06, + "loss": 0.2475, + "step": 12578 + }, + { + "epoch": 0.72, + "grad_norm": 0.29069925258267515, + "learning_rate": 3.768843918114733e-06, + "loss": 0.2404, + "step": 12579 + }, + { + "epoch": 0.72, + "grad_norm": 0.4692780214015884, + "learning_rate": 3.767388550916138e-06, + "loss": 0.2927, + "step": 12580 + }, + { + "epoch": 0.72, + "grad_norm": 0.3419993754544002, + "learning_rate": 3.7659333995526047e-06, + "loss": 0.2067, + "step": 12581 + }, + { + "epoch": 0.72, + "grad_norm": 0.5372210817123204, + "learning_rate": 3.7644784640745346e-06, + "loss": 0.3642, + "step": 12582 + }, + { + "epoch": 0.72, + "grad_norm": 0.3720534268613179, + "learning_rate": 3.763023744532307e-06, + "loss": 0.3042, + "step": 12583 + }, + { + "epoch": 0.72, + "grad_norm": 0.3256481791308857, + "learning_rate": 3.761569240976298e-06, + "loss": 0.1504, + "step": 12584 + }, + { + "epoch": 0.72, + "grad_norm": 0.4828728938480592, + "learning_rate": 3.7601149534568757e-06, + "loss": 0.3779, + "step": 12585 + }, + { + "epoch": 0.72, + "grad_norm": 0.313460913744006, + "learning_rate": 3.7586608820244076e-06, + "loss": 0.3181, + "step": 12586 + }, + { + "epoch": 0.72, + "grad_norm": 0.18315300828709036, + "learning_rate": 3.7572070267292438e-06, + "loss": 0.0854, + "step": 12587 + }, + { + "epoch": 0.72, + "grad_norm": 0.33954649548166543, + "learning_rate": 3.7557533876217325e-06, + "loss": 0.2789, + "step": 12588 + }, + { + "epoch": 0.72, + "grad_norm": 1.1214543359616003, + "learning_rate": 3.7542999647522094e-06, + "loss": 0.5101, + "step": 12589 + }, + { + "epoch": 0.72, + "grad_norm": 0.3266141732037362, + "learning_rate": 3.7528467581710137e-06, + "loss": 0.2917, + "step": 12590 + }, + { + "epoch": 0.72, + "grad_norm": 0.3392692953249234, + "learning_rate": 3.7513937679284664e-06, + "loss": 0.2326, + "step": 12591 + }, + { + "epoch": 0.72, + "grad_norm": 0.704023761433763, + "learning_rate": 3.749940994074884e-06, + "loss": 0.4103, + "step": 12592 + }, + { + "epoch": 0.72, + "grad_norm": 0.3830258897079524, + "learning_rate": 3.7484884366605758e-06, + "loss": 0.2555, + "step": 12593 + }, + { + "epoch": 0.72, + "grad_norm": 0.24571652256923193, + "learning_rate": 3.7470360957358442e-06, + "loss": 0.2247, + "step": 12594 + }, + { + "epoch": 0.72, + "grad_norm": 0.4350436646267506, + "learning_rate": 3.7455839713509844e-06, + "loss": 0.2822, + "step": 12595 + }, + { + "epoch": 0.72, + "grad_norm": 0.5916263926081792, + "learning_rate": 3.7441320635562828e-06, + "loss": 0.3318, + "step": 12596 + }, + { + "epoch": 0.72, + "grad_norm": 0.3253913954279248, + "learning_rate": 3.7426803724020143e-06, + "loss": 0.2366, + "step": 12597 + }, + { + "epoch": 0.72, + "grad_norm": 0.3462154083645354, + "learning_rate": 3.7412288979384604e-06, + "loss": 0.2957, + "step": 12598 + }, + { + "epoch": 0.72, + "grad_norm": 0.36298865887781684, + "learning_rate": 3.739777640215879e-06, + "loss": 0.2289, + "step": 12599 + }, + { + "epoch": 0.72, + "grad_norm": 0.37829378006780723, + "learning_rate": 3.7383265992845297e-06, + "loss": 0.2398, + "step": 12600 + }, + { + "epoch": 0.72, + "grad_norm": 0.4908548396772708, + "learning_rate": 3.736875775194657e-06, + "loss": 0.2657, + "step": 12601 + }, + { + "epoch": 0.72, + "grad_norm": 0.34330907741617966, + "learning_rate": 3.7354251679965103e-06, + "loss": 0.2757, + "step": 12602 + }, + { + "epoch": 0.72, + "grad_norm": 0.3923379452795223, + "learning_rate": 3.7339747777403212e-06, + "loss": 0.2949, + "step": 12603 + }, + { + "epoch": 0.72, + "grad_norm": 0.8516521788185919, + "learning_rate": 3.7325246044763164e-06, + "loss": 0.2947, + "step": 12604 + }, + { + "epoch": 0.72, + "grad_norm": 1.2060849155640518, + "learning_rate": 3.7310746482547143e-06, + "loss": 0.7756, + "step": 12605 + }, + { + "epoch": 0.72, + "grad_norm": 0.2613915751889735, + "learning_rate": 3.729624909125724e-06, + "loss": 0.2592, + "step": 12606 + }, + { + "epoch": 0.72, + "grad_norm": 0.25492878460525664, + "learning_rate": 3.7281753871395575e-06, + "loss": 0.1748, + "step": 12607 + }, + { + "epoch": 0.72, + "grad_norm": 0.6764847935708693, + "learning_rate": 3.726726082346408e-06, + "loss": 0.3877, + "step": 12608 + }, + { + "epoch": 0.72, + "grad_norm": 0.3204321858680907, + "learning_rate": 3.725276994796463e-06, + "loss": 0.2423, + "step": 12609 + }, + { + "epoch": 0.72, + "grad_norm": 0.3596000242651779, + "learning_rate": 3.7238281245399032e-06, + "loss": 0.2481, + "step": 12610 + }, + { + "epoch": 0.72, + "grad_norm": 0.7745774779469131, + "learning_rate": 3.72237947162691e-06, + "loss": 0.5167, + "step": 12611 + }, + { + "epoch": 0.72, + "grad_norm": 0.35232144352690364, + "learning_rate": 3.7209310361076445e-06, + "loss": 0.2963, + "step": 12612 + }, + { + "epoch": 0.72, + "grad_norm": 0.24440360054586446, + "learning_rate": 3.719482818032267e-06, + "loss": 0.1043, + "step": 12613 + }, + { + "epoch": 0.72, + "grad_norm": 0.3260683902895594, + "learning_rate": 3.7180348174509275e-06, + "loss": 0.2785, + "step": 12614 + }, + { + "epoch": 0.72, + "grad_norm": 0.34981957635207345, + "learning_rate": 3.7165870344137746e-06, + "loss": 0.2824, + "step": 12615 + }, + { + "epoch": 0.72, + "grad_norm": 1.22822645248553, + "learning_rate": 3.715139468970942e-06, + "loss": 0.6816, + "step": 12616 + }, + { + "epoch": 0.72, + "grad_norm": 0.3974630087585372, + "learning_rate": 3.7136921211725595e-06, + "loss": 0.2532, + "step": 12617 + }, + { + "epoch": 0.72, + "grad_norm": 0.34005021932049256, + "learning_rate": 3.7122449910687495e-06, + "loss": 0.2688, + "step": 12618 + }, + { + "epoch": 0.73, + "grad_norm": 0.3135211846099563, + "learning_rate": 3.710798078709621e-06, + "loss": 0.2613, + "step": 12619 + }, + { + "epoch": 0.73, + "grad_norm": 0.3350723208691652, + "learning_rate": 3.7093513841452876e-06, + "loss": 0.1817, + "step": 12620 + }, + { + "epoch": 0.73, + "grad_norm": 0.3935505259433406, + "learning_rate": 3.7079049074258465e-06, + "loss": 0.3123, + "step": 12621 + }, + { + "epoch": 0.73, + "grad_norm": 0.36173769233262987, + "learning_rate": 3.7064586486013865e-06, + "loss": 0.3023, + "step": 12622 + }, + { + "epoch": 0.73, + "grad_norm": 1.1886936765534302, + "learning_rate": 3.7050126077219908e-06, + "loss": 0.4164, + "step": 12623 + }, + { + "epoch": 0.73, + "grad_norm": 0.31195240262411766, + "learning_rate": 3.70356678483774e-06, + "loss": 0.2534, + "step": 12624 + }, + { + "epoch": 0.73, + "grad_norm": 0.43851398106112804, + "learning_rate": 3.702121179998701e-06, + "loss": 0.3313, + "step": 12625 + }, + { + "epoch": 0.73, + "grad_norm": 0.3608270575366872, + "learning_rate": 3.7006757932549355e-06, + "loss": 0.2095, + "step": 12626 + }, + { + "epoch": 0.73, + "grad_norm": 0.2572942190550343, + "learning_rate": 3.6992306246564923e-06, + "loss": 0.2211, + "step": 12627 + }, + { + "epoch": 0.73, + "grad_norm": 1.2467478374867251, + "learning_rate": 3.697785674253428e-06, + "loss": 0.6807, + "step": 12628 + }, + { + "epoch": 0.73, + "grad_norm": 0.4778900262033747, + "learning_rate": 3.696340942095772e-06, + "loss": 0.3873, + "step": 12629 + }, + { + "epoch": 0.73, + "grad_norm": 0.2611211344507252, + "learning_rate": 3.6948964282335576e-06, + "loss": 0.2184, + "step": 12630 + }, + { + "epoch": 0.73, + "grad_norm": 0.6519226464029577, + "learning_rate": 3.693452132716806e-06, + "loss": 0.3749, + "step": 12631 + }, + { + "epoch": 0.73, + "grad_norm": 0.37288735997878936, + "learning_rate": 3.6920080555955396e-06, + "loss": 0.2098, + "step": 12632 + }, + { + "epoch": 0.73, + "grad_norm": 0.2948673783502911, + "learning_rate": 3.6905641969197626e-06, + "loss": 0.1943, + "step": 12633 + }, + { + "epoch": 0.73, + "grad_norm": 0.34946669359400256, + "learning_rate": 3.689120556739475e-06, + "loss": 0.3015, + "step": 12634 + }, + { + "epoch": 0.73, + "grad_norm": 1.262967952928276, + "learning_rate": 3.687677135104669e-06, + "loss": 0.7467, + "step": 12635 + }, + { + "epoch": 0.73, + "grad_norm": 0.2994023778760764, + "learning_rate": 3.6862339320653353e-06, + "loss": 0.1952, + "step": 12636 + }, + { + "epoch": 0.73, + "grad_norm": 0.6864311781365865, + "learning_rate": 3.6847909476714495e-06, + "loss": 0.3597, + "step": 12637 + }, + { + "epoch": 0.73, + "grad_norm": 0.2598362738797895, + "learning_rate": 3.683348181972981e-06, + "loss": 0.2505, + "step": 12638 + }, + { + "epoch": 0.73, + "grad_norm": 0.3127138752691672, + "learning_rate": 3.68190563501989e-06, + "loss": 0.1789, + "step": 12639 + }, + { + "epoch": 0.73, + "grad_norm": 0.47588630014745503, + "learning_rate": 3.6804633068621388e-06, + "loss": 0.3032, + "step": 12640 + }, + { + "epoch": 0.73, + "grad_norm": 0.34987437967800145, + "learning_rate": 3.6790211975496714e-06, + "loss": 0.3093, + "step": 12641 + }, + { + "epoch": 0.73, + "grad_norm": 0.3070022736126839, + "learning_rate": 3.6775793071324283e-06, + "loss": 0.2453, + "step": 12642 + }, + { + "epoch": 0.73, + "grad_norm": 0.7542534218968645, + "learning_rate": 3.6761376356603385e-06, + "loss": 0.3064, + "step": 12643 + }, + { + "epoch": 0.73, + "grad_norm": 0.39024087041958594, + "learning_rate": 3.674696183183334e-06, + "loss": 0.225, + "step": 12644 + }, + { + "epoch": 0.73, + "grad_norm": 0.2456900281354834, + "learning_rate": 3.6732549497513292e-06, + "loss": 0.2106, + "step": 12645 + }, + { + "epoch": 0.73, + "grad_norm": 0.34175475930678456, + "learning_rate": 3.6718139354142326e-06, + "loss": 0.2592, + "step": 12646 + }, + { + "epoch": 0.73, + "grad_norm": 0.8023321846699596, + "learning_rate": 3.670373140221947e-06, + "loss": 0.452, + "step": 12647 + }, + { + "epoch": 0.73, + "grad_norm": 0.3495489226786334, + "learning_rate": 3.6689325642243643e-06, + "loss": 0.2793, + "step": 12648 + }, + { + "epoch": 0.73, + "grad_norm": 0.7430728352602514, + "learning_rate": 3.6674922074713783e-06, + "loss": 0.3452, + "step": 12649 + }, + { + "epoch": 0.73, + "grad_norm": 0.3347746450775112, + "learning_rate": 3.6660520700128642e-06, + "loss": 0.3015, + "step": 12650 + }, + { + "epoch": 0.73, + "grad_norm": 0.24771906724986534, + "learning_rate": 3.6646121518986954e-06, + "loss": 0.2161, + "step": 12651 + }, + { + "epoch": 0.73, + "grad_norm": 0.3430615488225278, + "learning_rate": 3.6631724531787314e-06, + "loss": 0.113, + "step": 12652 + }, + { + "epoch": 0.73, + "grad_norm": 0.38006058780149277, + "learning_rate": 3.6617329739028373e-06, + "loss": 0.2922, + "step": 12653 + }, + { + "epoch": 0.73, + "grad_norm": 0.3295860270105801, + "learning_rate": 3.660293714120856e-06, + "loss": 0.2493, + "step": 12654 + }, + { + "epoch": 0.73, + "grad_norm": 0.8069415336157884, + "learning_rate": 3.6588546738826325e-06, + "loss": 0.4577, + "step": 12655 + }, + { + "epoch": 0.73, + "grad_norm": 0.44498482845603343, + "learning_rate": 3.6574158532379944e-06, + "loss": 0.1469, + "step": 12656 + }, + { + "epoch": 0.73, + "grad_norm": 0.40957331326092905, + "learning_rate": 3.6559772522367765e-06, + "loss": 0.3075, + "step": 12657 + }, + { + "epoch": 0.73, + "grad_norm": 0.24643548781053554, + "learning_rate": 3.6545388709287933e-06, + "loss": 0.2624, + "step": 12658 + }, + { + "epoch": 0.73, + "grad_norm": 0.7845058080505076, + "learning_rate": 3.653100709363856e-06, + "loss": 0.3092, + "step": 12659 + }, + { + "epoch": 0.73, + "grad_norm": 0.38306157889681686, + "learning_rate": 3.651662767591768e-06, + "loss": 0.2797, + "step": 12660 + }, + { + "epoch": 0.73, + "grad_norm": 0.39024767404061234, + "learning_rate": 3.650225045662322e-06, + "loss": 0.3401, + "step": 12661 + }, + { + "epoch": 0.73, + "grad_norm": 0.46207148468081677, + "learning_rate": 3.6487875436253173e-06, + "loss": 0.2373, + "step": 12662 + }, + { + "epoch": 0.73, + "grad_norm": 0.3593077845975231, + "learning_rate": 3.6473502615305233e-06, + "loss": 0.2678, + "step": 12663 + }, + { + "epoch": 0.73, + "grad_norm": 0.3862979083597893, + "learning_rate": 3.645913199427713e-06, + "loss": 0.2191, + "step": 12664 + }, + { + "epoch": 0.73, + "grad_norm": 0.34427342537308164, + "learning_rate": 3.6444763573666586e-06, + "loss": 0.2569, + "step": 12665 + }, + { + "epoch": 0.73, + "grad_norm": 0.3287239171784687, + "learning_rate": 3.643039735397115e-06, + "loss": 0.2551, + "step": 12666 + }, + { + "epoch": 0.73, + "grad_norm": 1.2073091689438713, + "learning_rate": 3.6416033335688306e-06, + "loss": 0.8047, + "step": 12667 + }, + { + "epoch": 0.73, + "grad_norm": 1.14833529116643, + "learning_rate": 3.640167151931547e-06, + "loss": 0.4932, + "step": 12668 + }, + { + "epoch": 0.73, + "grad_norm": 0.2683170080436344, + "learning_rate": 3.6387311905350053e-06, + "loss": 0.2103, + "step": 12669 + }, + { + "epoch": 0.73, + "grad_norm": 0.27189369694643967, + "learning_rate": 3.637295449428928e-06, + "loss": 0.2041, + "step": 12670 + }, + { + "epoch": 0.73, + "grad_norm": 0.5783715207547905, + "learning_rate": 3.6358599286630367e-06, + "loss": 0.3914, + "step": 12671 + }, + { + "epoch": 0.73, + "grad_norm": 0.2876284884270302, + "learning_rate": 3.634424628287041e-06, + "loss": 0.191, + "step": 12672 + }, + { + "epoch": 0.73, + "grad_norm": 0.47055640385816155, + "learning_rate": 3.632989548350645e-06, + "loss": 0.3705, + "step": 12673 + }, + { + "epoch": 0.73, + "grad_norm": 0.526309013570079, + "learning_rate": 3.631554688903549e-06, + "loss": 0.3535, + "step": 12674 + }, + { + "epoch": 0.73, + "grad_norm": 0.3751698398541402, + "learning_rate": 3.6301200499954416e-06, + "loss": 0.1871, + "step": 12675 + }, + { + "epoch": 0.73, + "grad_norm": 0.3542442085533244, + "learning_rate": 3.6286856316760023e-06, + "loss": 0.2631, + "step": 12676 + }, + { + "epoch": 0.73, + "grad_norm": 0.273470471007027, + "learning_rate": 3.6272514339949015e-06, + "loss": 0.2747, + "step": 12677 + }, + { + "epoch": 0.73, + "grad_norm": 0.5734636947485215, + "learning_rate": 3.6258174570018133e-06, + "loss": 0.3421, + "step": 12678 + }, + { + "epoch": 0.73, + "grad_norm": 0.3687365959759761, + "learning_rate": 3.6243837007463933e-06, + "loss": 0.262, + "step": 12679 + }, + { + "epoch": 0.73, + "grad_norm": 1.2296519963082793, + "learning_rate": 3.6229501652782904e-06, + "loss": 0.5555, + "step": 12680 + }, + { + "epoch": 0.73, + "grad_norm": 0.31969153444544296, + "learning_rate": 3.6215168506471466e-06, + "loss": 0.2874, + "step": 12681 + }, + { + "epoch": 0.73, + "grad_norm": 0.4076483909163558, + "learning_rate": 3.6200837569026036e-06, + "loss": 0.2688, + "step": 12682 + }, + { + "epoch": 0.73, + "grad_norm": 0.41300092423729834, + "learning_rate": 3.618650884094285e-06, + "loss": 0.2668, + "step": 12683 + }, + { + "epoch": 0.73, + "grad_norm": 0.3944044972280825, + "learning_rate": 3.617218232271812e-06, + "loss": 0.2958, + "step": 12684 + }, + { + "epoch": 0.73, + "grad_norm": 0.25016697584075415, + "learning_rate": 3.615785801484797e-06, + "loss": 0.1869, + "step": 12685 + }, + { + "epoch": 0.73, + "grad_norm": 0.4950885923845611, + "learning_rate": 3.6143535917828422e-06, + "loss": 0.3181, + "step": 12686 + }, + { + "epoch": 0.73, + "grad_norm": 0.369686886187919, + "learning_rate": 3.612921603215551e-06, + "loss": 0.263, + "step": 12687 + }, + { + "epoch": 0.73, + "grad_norm": 0.5134965413780258, + "learning_rate": 3.6114898358325103e-06, + "loss": 0.2762, + "step": 12688 + }, + { + "epoch": 0.73, + "grad_norm": 0.3598094168898819, + "learning_rate": 3.6100582896833012e-06, + "loss": 0.3195, + "step": 12689 + }, + { + "epoch": 0.73, + "grad_norm": 0.5903895067372618, + "learning_rate": 3.6086269648174965e-06, + "loss": 0.3883, + "step": 12690 + }, + { + "epoch": 0.73, + "grad_norm": 0.285502437384372, + "learning_rate": 3.607195861284668e-06, + "loss": 0.2375, + "step": 12691 + }, + { + "epoch": 0.73, + "grad_norm": 0.29724734450311907, + "learning_rate": 3.605764979134372e-06, + "loss": 0.1829, + "step": 12692 + }, + { + "epoch": 0.73, + "grad_norm": 0.38014274107318985, + "learning_rate": 3.6043343184161593e-06, + "loss": 0.2531, + "step": 12693 + }, + { + "epoch": 0.73, + "grad_norm": 0.3741144539894074, + "learning_rate": 3.602903879179571e-06, + "loss": 0.3121, + "step": 12694 + }, + { + "epoch": 0.73, + "grad_norm": 0.9836418738839882, + "learning_rate": 3.601473661474154e-06, + "loss": 0.3773, + "step": 12695 + }, + { + "epoch": 0.73, + "grad_norm": 0.5388543502057055, + "learning_rate": 3.600043665349424e-06, + "loss": 0.33, + "step": 12696 + }, + { + "epoch": 0.73, + "grad_norm": 0.2299485082398649, + "learning_rate": 3.5986138908549073e-06, + "loss": 0.2319, + "step": 12697 + }, + { + "epoch": 0.73, + "grad_norm": 0.4261332556361552, + "learning_rate": 3.597184338040114e-06, + "loss": 0.166, + "step": 12698 + }, + { + "epoch": 0.73, + "grad_norm": 0.4785686820285072, + "learning_rate": 3.595755006954553e-06, + "loss": 0.3013, + "step": 12699 + }, + { + "epoch": 0.73, + "grad_norm": 0.40973404733494684, + "learning_rate": 3.5943258976477226e-06, + "loss": 0.2931, + "step": 12700 + }, + { + "epoch": 0.73, + "grad_norm": 0.32627133475863995, + "learning_rate": 3.5928970101691096e-06, + "loss": 0.2481, + "step": 12701 + }, + { + "epoch": 0.73, + "grad_norm": 0.5170098162809887, + "learning_rate": 3.5914683445681954e-06, + "loss": 0.3466, + "step": 12702 + }, + { + "epoch": 0.73, + "grad_norm": 0.287299691816571, + "learning_rate": 3.59003990089446e-06, + "loss": 0.2333, + "step": 12703 + }, + { + "epoch": 0.73, + "grad_norm": 0.3970243877751687, + "learning_rate": 3.588611679197366e-06, + "loss": 0.2657, + "step": 12704 + }, + { + "epoch": 0.73, + "grad_norm": 0.2731068096527432, + "learning_rate": 3.587183679526375e-06, + "loss": 0.2241, + "step": 12705 + }, + { + "epoch": 0.73, + "grad_norm": 0.5270425973716926, + "learning_rate": 3.585755901930934e-06, + "loss": 0.3451, + "step": 12706 + }, + { + "epoch": 0.73, + "grad_norm": 1.2638374003320183, + "learning_rate": 3.5843283464604927e-06, + "loss": 0.7912, + "step": 12707 + }, + { + "epoch": 0.73, + "grad_norm": 0.4955313922411486, + "learning_rate": 3.582901013164486e-06, + "loss": 0.1773, + "step": 12708 + }, + { + "epoch": 0.73, + "grad_norm": 0.25806863051136353, + "learning_rate": 3.5814739020923405e-06, + "loss": 0.2594, + "step": 12709 + }, + { + "epoch": 0.73, + "grad_norm": 0.2920727776909075, + "learning_rate": 3.5800470132934785e-06, + "loss": 0.2262, + "step": 12710 + }, + { + "epoch": 0.73, + "grad_norm": 0.7188461489974537, + "learning_rate": 3.5786203468173087e-06, + "loss": 0.2492, + "step": 12711 + }, + { + "epoch": 0.73, + "grad_norm": 0.3632276005566369, + "learning_rate": 3.5771939027132428e-06, + "loss": 0.2854, + "step": 12712 + }, + { + "epoch": 0.73, + "grad_norm": 0.3989870298151256, + "learning_rate": 3.5757676810306775e-06, + "loss": 0.3184, + "step": 12713 + }, + { + "epoch": 0.73, + "grad_norm": 0.3194280116657528, + "learning_rate": 3.5743416818189993e-06, + "loss": 0.1514, + "step": 12714 + }, + { + "epoch": 0.73, + "grad_norm": 0.353739585397453, + "learning_rate": 3.5729159051275895e-06, + "loss": 0.2931, + "step": 12715 + }, + { + "epoch": 0.73, + "grad_norm": 0.6406462606108349, + "learning_rate": 3.5714903510058296e-06, + "loss": 0.3994, + "step": 12716 + }, + { + "epoch": 0.73, + "grad_norm": 0.2038712219236113, + "learning_rate": 3.570065019503082e-06, + "loss": 0.1946, + "step": 12717 + }, + { + "epoch": 0.73, + "grad_norm": 0.3350444004903638, + "learning_rate": 3.5686399106687064e-06, + "loss": 0.2076, + "step": 12718 + }, + { + "epoch": 0.73, + "grad_norm": 1.097246714693747, + "learning_rate": 3.567215024552051e-06, + "loss": 0.6239, + "step": 12719 + }, + { + "epoch": 0.73, + "grad_norm": 0.46965475632734666, + "learning_rate": 3.5657903612024658e-06, + "loss": 0.3749, + "step": 12720 + }, + { + "epoch": 0.73, + "grad_norm": 0.263875280635042, + "learning_rate": 3.5643659206692837e-06, + "loss": 0.2184, + "step": 12721 + }, + { + "epoch": 0.73, + "grad_norm": 0.5866450994990379, + "learning_rate": 3.562941703001832e-06, + "loss": 0.3771, + "step": 12722 + }, + { + "epoch": 0.73, + "grad_norm": 0.19861196590825053, + "learning_rate": 3.5615177082494334e-06, + "loss": 0.1384, + "step": 12723 + }, + { + "epoch": 0.73, + "grad_norm": 0.3572080133883536, + "learning_rate": 3.5600939364613963e-06, + "loss": 0.217, + "step": 12724 + }, + { + "epoch": 0.73, + "grad_norm": 0.41415889915049825, + "learning_rate": 3.5586703876870333e-06, + "loss": 0.3052, + "step": 12725 + }, + { + "epoch": 0.73, + "grad_norm": 0.5154925408780823, + "learning_rate": 3.557247061975636e-06, + "loss": 0.3153, + "step": 12726 + }, + { + "epoch": 0.73, + "grad_norm": 0.34276841548401615, + "learning_rate": 3.5558239593764978e-06, + "loss": 0.2438, + "step": 12727 + }, + { + "epoch": 0.73, + "grad_norm": 0.42795797835213084, + "learning_rate": 3.554401079938894e-06, + "loss": 0.329, + "step": 12728 + }, + { + "epoch": 0.73, + "grad_norm": 0.234860424746158, + "learning_rate": 3.552978423712111e-06, + "loss": 0.1943, + "step": 12729 + }, + { + "epoch": 0.73, + "grad_norm": 0.36464014173959514, + "learning_rate": 3.5515559907454045e-06, + "loss": 0.2748, + "step": 12730 + }, + { + "epoch": 0.73, + "grad_norm": 0.9060753945623665, + "learning_rate": 3.550133781088033e-06, + "loss": 0.5051, + "step": 12731 + }, + { + "epoch": 0.73, + "grad_norm": 0.3529980523946801, + "learning_rate": 3.5487117947892558e-06, + "loss": 0.2867, + "step": 12732 + }, + { + "epoch": 0.73, + "grad_norm": 0.3674842048475259, + "learning_rate": 3.5472900318983105e-06, + "loss": 0.2548, + "step": 12733 + }, + { + "epoch": 0.73, + "grad_norm": 0.8401740489172301, + "learning_rate": 3.545868492464435e-06, + "loss": 0.3047, + "step": 12734 + }, + { + "epoch": 0.73, + "grad_norm": 0.27141746267388867, + "learning_rate": 3.544447176536855e-06, + "loss": 0.1553, + "step": 12735 + }, + { + "epoch": 0.73, + "grad_norm": 0.32057223006156993, + "learning_rate": 3.543026084164789e-06, + "loss": 0.2634, + "step": 12736 + }, + { + "epoch": 0.73, + "grad_norm": 0.3573243078189275, + "learning_rate": 3.5416052153974546e-06, + "loss": 0.2645, + "step": 12737 + }, + { + "epoch": 0.73, + "grad_norm": 0.6295162887590943, + "learning_rate": 3.5401845702840543e-06, + "loss": 0.4241, + "step": 12738 + }, + { + "epoch": 0.73, + "grad_norm": 0.38271116549580053, + "learning_rate": 3.5387641488737855e-06, + "loss": 0.2994, + "step": 12739 + }, + { + "epoch": 0.73, + "grad_norm": 0.44089101642573486, + "learning_rate": 3.5373439512158315e-06, + "loss": 0.2251, + "step": 12740 + }, + { + "epoch": 0.73, + "grad_norm": 0.2504442391142626, + "learning_rate": 3.5359239773593833e-06, + "loss": 0.2017, + "step": 12741 + }, + { + "epoch": 0.73, + "grad_norm": 0.3315826635141327, + "learning_rate": 3.534504227353609e-06, + "loss": 0.2718, + "step": 12742 + }, + { + "epoch": 0.73, + "grad_norm": 1.0141255217066432, + "learning_rate": 3.5330847012476754e-06, + "loss": 0.6677, + "step": 12743 + }, + { + "epoch": 0.73, + "grad_norm": 0.2908904563347626, + "learning_rate": 3.5316653990907367e-06, + "loss": 0.2313, + "step": 12744 + }, + { + "epoch": 0.73, + "grad_norm": 0.35460691259874044, + "learning_rate": 3.5302463209319514e-06, + "loss": 0.317, + "step": 12745 + }, + { + "epoch": 0.73, + "grad_norm": 0.6951267542723861, + "learning_rate": 3.5288274668204568e-06, + "loss": 0.4621, + "step": 12746 + }, + { + "epoch": 0.73, + "grad_norm": 0.15127146330290486, + "learning_rate": 3.527408836805389e-06, + "loss": 0.0716, + "step": 12747 + }, + { + "epoch": 0.73, + "grad_norm": 0.297568465211192, + "learning_rate": 3.525990430935876e-06, + "loss": 0.2372, + "step": 12748 + }, + { + "epoch": 0.73, + "grad_norm": 0.321758839001798, + "learning_rate": 3.524572249261031e-06, + "loss": 0.3338, + "step": 12749 + }, + { + "epoch": 0.73, + "grad_norm": 0.5864810260206559, + "learning_rate": 3.5231542918299753e-06, + "loss": 0.3112, + "step": 12750 + }, + { + "epoch": 0.73, + "grad_norm": 0.2883308570588384, + "learning_rate": 3.5217365586918073e-06, + "loss": 0.2406, + "step": 12751 + }, + { + "epoch": 0.73, + "grad_norm": 0.47306256806256525, + "learning_rate": 3.5203190498956242e-06, + "loss": 0.3095, + "step": 12752 + }, + { + "epoch": 0.73, + "grad_norm": 0.49735107729836175, + "learning_rate": 3.518901765490509e-06, + "loss": 0.347, + "step": 12753 + }, + { + "epoch": 0.73, + "grad_norm": 0.17686563294193186, + "learning_rate": 3.517484705525551e-06, + "loss": 0.1394, + "step": 12754 + }, + { + "epoch": 0.73, + "grad_norm": 0.8933579186696481, + "learning_rate": 3.5160678700498197e-06, + "loss": 0.5273, + "step": 12755 + }, + { + "epoch": 0.73, + "grad_norm": 0.3495559430890266, + "learning_rate": 3.5146512591123783e-06, + "loss": 0.3032, + "step": 12756 + }, + { + "epoch": 0.73, + "grad_norm": 0.28892578502445293, + "learning_rate": 3.513234872762282e-06, + "loss": 0.1972, + "step": 12757 + }, + { + "epoch": 0.73, + "grad_norm": 1.0657258271815264, + "learning_rate": 3.511818711048587e-06, + "loss": 0.5283, + "step": 12758 + }, + { + "epoch": 0.73, + "grad_norm": 0.2885840837702499, + "learning_rate": 3.5104027740203305e-06, + "loss": 0.1883, + "step": 12759 + }, + { + "epoch": 0.73, + "grad_norm": 0.2869873138895025, + "learning_rate": 3.5089870617265465e-06, + "loss": 0.1858, + "step": 12760 + }, + { + "epoch": 0.73, + "grad_norm": 0.352194227970117, + "learning_rate": 3.5075715742162586e-06, + "loss": 0.3088, + "step": 12761 + }, + { + "epoch": 0.73, + "grad_norm": 0.7226569774131096, + "learning_rate": 3.506156311538491e-06, + "loss": 0.4724, + "step": 12762 + }, + { + "epoch": 0.73, + "grad_norm": 0.29774972113115467, + "learning_rate": 3.504741273742254e-06, + "loss": 0.194, + "step": 12763 + }, + { + "epoch": 0.73, + "grad_norm": 0.33947635583751484, + "learning_rate": 3.503326460876545e-06, + "loss": 0.2951, + "step": 12764 + }, + { + "epoch": 0.73, + "grad_norm": 1.1678877911120926, + "learning_rate": 3.5019118729903566e-06, + "loss": 0.4789, + "step": 12765 + }, + { + "epoch": 0.73, + "grad_norm": 0.3045109971875889, + "learning_rate": 3.5004975101326854e-06, + "loss": 0.2444, + "step": 12766 + }, + { + "epoch": 0.73, + "grad_norm": 0.5454583862261513, + "learning_rate": 3.4990833723525054e-06, + "loss": 0.2266, + "step": 12767 + }, + { + "epoch": 0.73, + "grad_norm": 0.3329777623579333, + "learning_rate": 3.497669459698788e-06, + "loss": 0.3025, + "step": 12768 + }, + { + "epoch": 0.73, + "grad_norm": 0.3145139009440519, + "learning_rate": 3.496255772220495e-06, + "loss": 0.2587, + "step": 12769 + }, + { + "epoch": 0.73, + "grad_norm": 0.23984063415861187, + "learning_rate": 3.4948423099665883e-06, + "loss": 0.1022, + "step": 12770 + }, + { + "epoch": 0.73, + "grad_norm": 0.8220955217574739, + "learning_rate": 3.493429072986013e-06, + "loss": 0.402, + "step": 12771 + }, + { + "epoch": 0.73, + "grad_norm": 0.2833979664828243, + "learning_rate": 3.492016061327709e-06, + "loss": 0.249, + "step": 12772 + }, + { + "epoch": 0.73, + "grad_norm": 0.4655948518759838, + "learning_rate": 3.490603275040605e-06, + "loss": 0.2649, + "step": 12773 + }, + { + "epoch": 0.73, + "grad_norm": 0.39936457950351845, + "learning_rate": 3.4891907141736324e-06, + "loss": 0.289, + "step": 12774 + }, + { + "epoch": 0.73, + "grad_norm": 0.2320835341969602, + "learning_rate": 3.487778378775707e-06, + "loss": 0.1958, + "step": 12775 + }, + { + "epoch": 0.73, + "grad_norm": 0.3366992622728923, + "learning_rate": 3.4863662688957355e-06, + "loss": 0.2385, + "step": 12776 + }, + { + "epoch": 0.73, + "grad_norm": 0.8735030497179362, + "learning_rate": 3.4849543845826195e-06, + "loss": 0.373, + "step": 12777 + }, + { + "epoch": 0.73, + "grad_norm": 0.3820901509586576, + "learning_rate": 3.4835427258852507e-06, + "loss": 0.2542, + "step": 12778 + }, + { + "epoch": 0.73, + "grad_norm": 0.4889596251920278, + "learning_rate": 3.4821312928525197e-06, + "loss": 0.3594, + "step": 12779 + }, + { + "epoch": 0.73, + "grad_norm": 0.3395884327853301, + "learning_rate": 3.4807200855333024e-06, + "loss": 0.2753, + "step": 12780 + }, + { + "epoch": 0.73, + "grad_norm": 0.41027269658708754, + "learning_rate": 3.479309103976467e-06, + "loss": 0.2878, + "step": 12781 + }, + { + "epoch": 0.73, + "grad_norm": 0.2492148324473185, + "learning_rate": 3.4778983482308746e-06, + "loss": 0.1902, + "step": 12782 + }, + { + "epoch": 0.73, + "grad_norm": 0.7729463052691775, + "learning_rate": 3.4764878183453855e-06, + "loss": 0.2883, + "step": 12783 + }, + { + "epoch": 0.73, + "grad_norm": 0.2636022559064381, + "learning_rate": 3.475077514368842e-06, + "loss": 0.2694, + "step": 12784 + }, + { + "epoch": 0.73, + "grad_norm": 0.4722767331960266, + "learning_rate": 3.4736674363500846e-06, + "loss": 0.3413, + "step": 12785 + }, + { + "epoch": 0.73, + "grad_norm": 0.9849108722772745, + "learning_rate": 3.472257584337939e-06, + "loss": 0.527, + "step": 12786 + }, + { + "epoch": 0.73, + "grad_norm": 0.2514472326522464, + "learning_rate": 3.470847958381236e-06, + "loss": 0.2084, + "step": 12787 + }, + { + "epoch": 0.73, + "grad_norm": 0.2962379312373279, + "learning_rate": 3.469438558528787e-06, + "loss": 0.2507, + "step": 12788 + }, + { + "epoch": 0.73, + "grad_norm": 0.7197590638637732, + "learning_rate": 3.468029384829401e-06, + "loss": 0.2857, + "step": 12789 + }, + { + "epoch": 0.73, + "grad_norm": 0.34947417036129547, + "learning_rate": 3.466620437331876e-06, + "loss": 0.2543, + "step": 12790 + }, + { + "epoch": 0.73, + "grad_norm": 1.3024804970661898, + "learning_rate": 3.4652117160850006e-06, + "loss": 0.5634, + "step": 12791 + }, + { + "epoch": 0.73, + "grad_norm": 0.35195261144446016, + "learning_rate": 3.463803221137566e-06, + "loss": 0.2948, + "step": 12792 + }, + { + "epoch": 0.74, + "grad_norm": 0.3084578488343683, + "learning_rate": 3.462394952538345e-06, + "loss": 0.1994, + "step": 12793 + }, + { + "epoch": 0.74, + "grad_norm": 0.22815859921668444, + "learning_rate": 3.460986910336106e-06, + "loss": 0.1535, + "step": 12794 + }, + { + "epoch": 0.74, + "grad_norm": 0.7811335529012565, + "learning_rate": 3.459579094579605e-06, + "loss": 0.3796, + "step": 12795 + }, + { + "epoch": 0.74, + "grad_norm": 0.27942275007513806, + "learning_rate": 3.4581715053176023e-06, + "loss": 0.1953, + "step": 12796 + }, + { + "epoch": 0.74, + "grad_norm": 0.3920486415648585, + "learning_rate": 3.456764142598843e-06, + "loss": 0.3051, + "step": 12797 + }, + { + "epoch": 0.74, + "grad_norm": 1.016852076671105, + "learning_rate": 3.455357006472052e-06, + "loss": 0.657, + "step": 12798 + }, + { + "epoch": 0.74, + "grad_norm": 0.33966130409641193, + "learning_rate": 3.4539500969859706e-06, + "loss": 0.1705, + "step": 12799 + }, + { + "epoch": 0.74, + "grad_norm": 0.22172259172292844, + "learning_rate": 3.4525434141893166e-06, + "loss": 0.2172, + "step": 12800 + }, + { + "epoch": 0.74, + "grad_norm": 0.7058709583737947, + "learning_rate": 3.4511369581308017e-06, + "loss": 0.3947, + "step": 12801 + }, + { + "epoch": 0.74, + "grad_norm": 0.3354166197148681, + "learning_rate": 3.449730728859132e-06, + "loss": 0.1666, + "step": 12802 + }, + { + "epoch": 0.74, + "grad_norm": 0.5337470482155633, + "learning_rate": 3.4483247264230034e-06, + "loss": 0.3789, + "step": 12803 + }, + { + "epoch": 0.74, + "grad_norm": 0.33592213852392117, + "learning_rate": 3.4469189508711098e-06, + "loss": 0.3046, + "step": 12804 + }, + { + "epoch": 0.74, + "grad_norm": 0.6246678196271466, + "learning_rate": 3.445513402252132e-06, + "loss": 0.3147, + "step": 12805 + }, + { + "epoch": 0.74, + "grad_norm": 0.31427524782070476, + "learning_rate": 3.444108080614743e-06, + "loss": 0.204, + "step": 12806 + }, + { + "epoch": 0.74, + "grad_norm": 0.48310103519188197, + "learning_rate": 3.4427029860076056e-06, + "loss": 0.2649, + "step": 12807 + }, + { + "epoch": 0.74, + "grad_norm": 0.26865324112297956, + "learning_rate": 3.441298118479386e-06, + "loss": 0.2608, + "step": 12808 + }, + { + "epoch": 0.74, + "grad_norm": 0.5907160387384897, + "learning_rate": 3.4398934780787297e-06, + "loss": 0.2304, + "step": 12809 + }, + { + "epoch": 0.74, + "grad_norm": 1.1524510092382603, + "learning_rate": 3.43848906485428e-06, + "loss": 0.7557, + "step": 12810 + }, + { + "epoch": 0.74, + "grad_norm": 0.38459895702479086, + "learning_rate": 3.4370848788546695e-06, + "loss": 0.2584, + "step": 12811 + }, + { + "epoch": 0.74, + "grad_norm": 0.2760382853690984, + "learning_rate": 3.4356809201285303e-06, + "loss": 0.241, + "step": 12812 + }, + { + "epoch": 0.74, + "grad_norm": 0.32014199369104235, + "learning_rate": 3.4342771887244784e-06, + "loss": 0.2341, + "step": 12813 + }, + { + "epoch": 0.74, + "grad_norm": 0.5749750430673263, + "learning_rate": 3.4328736846911247e-06, + "loss": 0.3045, + "step": 12814 + }, + { + "epoch": 0.74, + "grad_norm": 0.38232768236464837, + "learning_rate": 3.4314704080770744e-06, + "loss": 0.2291, + "step": 12815 + }, + { + "epoch": 0.74, + "grad_norm": 0.3446190128141687, + "learning_rate": 3.4300673589309163e-06, + "loss": 0.3249, + "step": 12816 + }, + { + "epoch": 0.74, + "grad_norm": 0.5823714362679814, + "learning_rate": 3.428664537301247e-06, + "loss": 0.2962, + "step": 12817 + }, + { + "epoch": 0.74, + "grad_norm": 0.3685845728880524, + "learning_rate": 3.4272619432366427e-06, + "loss": 0.3151, + "step": 12818 + }, + { + "epoch": 0.74, + "grad_norm": 0.2341879964186778, + "learning_rate": 3.425859576785674e-06, + "loss": 0.1659, + "step": 12819 + }, + { + "epoch": 0.74, + "grad_norm": 0.4030323576618036, + "learning_rate": 3.4244574379969032e-06, + "loss": 0.2683, + "step": 12820 + }, + { + "epoch": 0.74, + "grad_norm": 0.4474160431672498, + "learning_rate": 3.4230555269188903e-06, + "loss": 0.2913, + "step": 12821 + }, + { + "epoch": 0.74, + "grad_norm": 1.0297083176155641, + "learning_rate": 3.4216538436001836e-06, + "loss": 0.5263, + "step": 12822 + }, + { + "epoch": 0.74, + "grad_norm": 0.38588698705698504, + "learning_rate": 3.4202523880893202e-06, + "loss": 0.2781, + "step": 12823 + }, + { + "epoch": 0.74, + "grad_norm": 0.30214966289449274, + "learning_rate": 3.4188511604348297e-06, + "loss": 0.2844, + "step": 12824 + }, + { + "epoch": 0.74, + "grad_norm": 0.21761652521172148, + "learning_rate": 3.417450160685245e-06, + "loss": 0.0645, + "step": 12825 + }, + { + "epoch": 0.74, + "grad_norm": 0.32300800479962954, + "learning_rate": 3.416049388889078e-06, + "loss": 0.1809, + "step": 12826 + }, + { + "epoch": 0.74, + "grad_norm": 0.3752187308252932, + "learning_rate": 3.4146488450948367e-06, + "loss": 0.2808, + "step": 12827 + }, + { + "epoch": 0.74, + "grad_norm": 0.3238753458144825, + "learning_rate": 3.413248529351023e-06, + "loss": 0.2652, + "step": 12828 + }, + { + "epoch": 0.74, + "grad_norm": 0.4298149619699671, + "learning_rate": 3.411848441706127e-06, + "loss": 0.2901, + "step": 12829 + }, + { + "epoch": 0.74, + "grad_norm": 0.4810941246561356, + "learning_rate": 3.410448582208642e-06, + "loss": 0.3489, + "step": 12830 + }, + { + "epoch": 0.74, + "grad_norm": 0.5252502309396361, + "learning_rate": 3.409048950907037e-06, + "loss": 0.3361, + "step": 12831 + }, + { + "epoch": 0.74, + "grad_norm": 0.1994109208173046, + "learning_rate": 3.4076495478497795e-06, + "loss": 0.1346, + "step": 12832 + }, + { + "epoch": 0.74, + "grad_norm": 0.4101863021105691, + "learning_rate": 3.406250373085337e-06, + "loss": 0.3074, + "step": 12833 + }, + { + "epoch": 0.74, + "grad_norm": 0.6969992266110001, + "learning_rate": 3.4048514266621612e-06, + "loss": 0.4448, + "step": 12834 + }, + { + "epoch": 0.74, + "grad_norm": 0.4233808236326501, + "learning_rate": 3.403452708628697e-06, + "loss": 0.238, + "step": 12835 + }, + { + "epoch": 0.74, + "grad_norm": 0.29572459303480436, + "learning_rate": 3.4020542190333795e-06, + "loss": 0.2596, + "step": 12836 + }, + { + "epoch": 0.74, + "grad_norm": 0.49815984479914355, + "learning_rate": 3.4006559579246425e-06, + "loss": 0.2174, + "step": 12837 + }, + { + "epoch": 0.74, + "grad_norm": 0.2761033725607673, + "learning_rate": 3.3992579253509062e-06, + "loss": 0.0935, + "step": 12838 + }, + { + "epoch": 0.74, + "grad_norm": 0.32754610459515865, + "learning_rate": 3.3978601213605842e-06, + "loss": 0.2857, + "step": 12839 + }, + { + "epoch": 0.74, + "grad_norm": 0.4294008648810787, + "learning_rate": 3.3964625460020827e-06, + "loss": 0.311, + "step": 12840 + }, + { + "epoch": 0.74, + "grad_norm": 0.6594774691330908, + "learning_rate": 3.395065199323796e-06, + "loss": 0.3494, + "step": 12841 + }, + { + "epoch": 0.74, + "grad_norm": 0.32425053227691586, + "learning_rate": 3.393668081374121e-06, + "loss": 0.2053, + "step": 12842 + }, + { + "epoch": 0.74, + "grad_norm": 0.5089151987838674, + "learning_rate": 3.3922711922014352e-06, + "loss": 0.3626, + "step": 12843 + }, + { + "epoch": 0.74, + "grad_norm": 0.21503853351438224, + "learning_rate": 3.3908745318541146e-06, + "loss": 0.1865, + "step": 12844 + }, + { + "epoch": 0.74, + "grad_norm": 0.31100684636950715, + "learning_rate": 3.389478100380521e-06, + "loss": 0.2, + "step": 12845 + }, + { + "epoch": 0.74, + "grad_norm": 0.7679876511139743, + "learning_rate": 3.3880818978290196e-06, + "loss": 0.3937, + "step": 12846 + }, + { + "epoch": 0.74, + "grad_norm": 0.32560271064088603, + "learning_rate": 3.386685924247959e-06, + "loss": 0.2838, + "step": 12847 + }, + { + "epoch": 0.74, + "grad_norm": 0.32516907110621057, + "learning_rate": 3.3852901796856796e-06, + "loss": 0.21, + "step": 12848 + }, + { + "epoch": 0.74, + "grad_norm": 1.2717231476692796, + "learning_rate": 3.3838946641905134e-06, + "loss": 0.4662, + "step": 12849 + }, + { + "epoch": 0.74, + "grad_norm": 0.2335607078138597, + "learning_rate": 3.382499377810794e-06, + "loss": 0.1513, + "step": 12850 + }, + { + "epoch": 0.74, + "grad_norm": 0.2814240890847466, + "learning_rate": 3.3811043205948366e-06, + "loss": 0.2342, + "step": 12851 + }, + { + "epoch": 0.74, + "grad_norm": 0.5990969852514557, + "learning_rate": 3.3797094925909526e-06, + "loss": 0.3364, + "step": 12852 + }, + { + "epoch": 0.74, + "grad_norm": 1.1668272839736702, + "learning_rate": 3.378314893847443e-06, + "loss": 0.7382, + "step": 12853 + }, + { + "epoch": 0.74, + "grad_norm": 0.3361253693355029, + "learning_rate": 3.3769205244126013e-06, + "loss": 0.2619, + "step": 12854 + }, + { + "epoch": 0.74, + "grad_norm": 0.357664507868556, + "learning_rate": 3.3755263843347196e-06, + "loss": 0.2275, + "step": 12855 + }, + { + "epoch": 0.74, + "grad_norm": 0.26691235584553286, + "learning_rate": 3.3741324736620752e-06, + "loss": 0.1768, + "step": 12856 + }, + { + "epoch": 0.74, + "grad_norm": 0.3316452448222894, + "learning_rate": 3.3727387924429377e-06, + "loss": 0.2648, + "step": 12857 + }, + { + "epoch": 0.74, + "grad_norm": 1.095159106391374, + "learning_rate": 3.371345340725568e-06, + "loss": 0.2964, + "step": 12858 + }, + { + "epoch": 0.74, + "grad_norm": 0.29582495886406085, + "learning_rate": 3.3699521185582274e-06, + "loss": 0.2747, + "step": 12859 + }, + { + "epoch": 0.74, + "grad_norm": 0.34966377054219355, + "learning_rate": 3.3685591259891592e-06, + "loss": 0.2708, + "step": 12860 + }, + { + "epoch": 0.74, + "grad_norm": 1.3699008142557696, + "learning_rate": 3.367166363066604e-06, + "loss": 0.2283, + "step": 12861 + }, + { + "epoch": 0.74, + "grad_norm": 0.37940865289293624, + "learning_rate": 3.3657738298387886e-06, + "loss": 0.2389, + "step": 12862 + }, + { + "epoch": 0.74, + "grad_norm": 0.29870327609057534, + "learning_rate": 3.3643815263539438e-06, + "loss": 0.2815, + "step": 12863 + }, + { + "epoch": 0.74, + "grad_norm": 0.6121185536613202, + "learning_rate": 3.3629894526602847e-06, + "loss": 0.2784, + "step": 12864 + }, + { + "epoch": 0.74, + "grad_norm": 0.39370032195369115, + "learning_rate": 3.361597608806012e-06, + "loss": 0.2998, + "step": 12865 + }, + { + "epoch": 0.74, + "grad_norm": 0.3425273598020543, + "learning_rate": 3.360205994839326e-06, + "loss": 0.2651, + "step": 12866 + }, + { + "epoch": 0.74, + "grad_norm": 0.38555273129152684, + "learning_rate": 3.358814610808424e-06, + "loss": 0.3122, + "step": 12867 + }, + { + "epoch": 0.74, + "grad_norm": 0.6651039322509703, + "learning_rate": 3.3574234567614862e-06, + "loss": 0.2088, + "step": 12868 + }, + { + "epoch": 0.74, + "grad_norm": 0.3512050670033464, + "learning_rate": 3.356032532746688e-06, + "loss": 0.2761, + "step": 12869 + }, + { + "epoch": 0.74, + "grad_norm": 0.5160563391559385, + "learning_rate": 3.354641838812195e-06, + "loss": 0.3093, + "step": 12870 + }, + { + "epoch": 0.74, + "grad_norm": 0.2944111295460084, + "learning_rate": 3.353251375006171e-06, + "loss": 0.2444, + "step": 12871 + }, + { + "epoch": 0.74, + "grad_norm": 0.23733746897925087, + "learning_rate": 3.3518611413767675e-06, + "loss": 0.2007, + "step": 12872 + }, + { + "epoch": 0.74, + "grad_norm": 1.2606823606756283, + "learning_rate": 3.3504711379721267e-06, + "loss": 0.4898, + "step": 12873 + }, + { + "epoch": 0.74, + "grad_norm": 0.8636552913098463, + "learning_rate": 3.3490813648403808e-06, + "loss": 0.3021, + "step": 12874 + }, + { + "epoch": 0.74, + "grad_norm": 0.2671159928686584, + "learning_rate": 3.347691822029665e-06, + "loss": 0.2538, + "step": 12875 + }, + { + "epoch": 0.74, + "grad_norm": 0.5173905617386279, + "learning_rate": 3.346302509588095e-06, + "loss": 0.3694, + "step": 12876 + }, + { + "epoch": 0.74, + "grad_norm": 0.17909842675389065, + "learning_rate": 3.344913427563784e-06, + "loss": 0.1285, + "step": 12877 + }, + { + "epoch": 0.74, + "grad_norm": 0.30187736747915783, + "learning_rate": 3.343524576004833e-06, + "loss": 0.2529, + "step": 12878 + }, + { + "epoch": 0.74, + "grad_norm": 0.34252991641349273, + "learning_rate": 3.342135954959338e-06, + "loss": 0.2783, + "step": 12879 + }, + { + "epoch": 0.74, + "grad_norm": 0.9488231871291415, + "learning_rate": 3.3407475644753907e-06, + "loss": 0.3745, + "step": 12880 + }, + { + "epoch": 0.74, + "grad_norm": 0.29818740197209903, + "learning_rate": 3.3393594046010693e-06, + "loss": 0.208, + "step": 12881 + }, + { + "epoch": 0.74, + "grad_norm": 1.210855898826414, + "learning_rate": 3.3379714753844463e-06, + "loss": 0.633, + "step": 12882 + }, + { + "epoch": 0.74, + "grad_norm": 0.25512255050159116, + "learning_rate": 3.3365837768735798e-06, + "loss": 0.2485, + "step": 12883 + }, + { + "epoch": 0.74, + "grad_norm": 0.29905505580538666, + "learning_rate": 3.335196309116534e-06, + "loss": 0.1868, + "step": 12884 + }, + { + "epoch": 0.74, + "grad_norm": 0.5038307443785937, + "learning_rate": 3.3338090721613547e-06, + "loss": 0.2664, + "step": 12885 + }, + { + "epoch": 0.74, + "grad_norm": 0.5952356862239482, + "learning_rate": 3.332422066056079e-06, + "loss": 0.3309, + "step": 12886 + }, + { + "epoch": 0.74, + "grad_norm": 0.2904352494406116, + "learning_rate": 3.3310352908487387e-06, + "loss": 0.1837, + "step": 12887 + }, + { + "epoch": 0.74, + "grad_norm": 0.5566544726291275, + "learning_rate": 3.3296487465873617e-06, + "loss": 0.3727, + "step": 12888 + }, + { + "epoch": 0.74, + "grad_norm": 1.338582329143749, + "learning_rate": 3.328262433319962e-06, + "loss": 0.7545, + "step": 12889 + }, + { + "epoch": 0.74, + "grad_norm": 0.18720255509199032, + "learning_rate": 3.3268763510945477e-06, + "loss": 0.1371, + "step": 12890 + }, + { + "epoch": 0.74, + "grad_norm": 0.3551963786547063, + "learning_rate": 3.325490499959114e-06, + "loss": 0.2981, + "step": 12891 + }, + { + "epoch": 0.74, + "grad_norm": 0.6958333900137992, + "learning_rate": 3.3241048799616616e-06, + "loss": 0.4138, + "step": 12892 + }, + { + "epoch": 0.74, + "grad_norm": 0.3327651923692982, + "learning_rate": 3.3227194911501705e-06, + "loss": 0.267, + "step": 12893 + }, + { + "epoch": 0.74, + "grad_norm": 1.2262784695854978, + "learning_rate": 3.3213343335726157e-06, + "loss": 0.3285, + "step": 12894 + }, + { + "epoch": 0.74, + "grad_norm": 0.35721902129772304, + "learning_rate": 3.3199494072769657e-06, + "loss": 0.3184, + "step": 12895 + }, + { + "epoch": 0.74, + "grad_norm": 0.34515026641036906, + "learning_rate": 3.3185647123111776e-06, + "loss": 0.2565, + "step": 12896 + }, + { + "epoch": 0.74, + "grad_norm": 0.4102070107736664, + "learning_rate": 3.3171802487232087e-06, + "loss": 0.1409, + "step": 12897 + }, + { + "epoch": 0.74, + "grad_norm": 0.6009426131881008, + "learning_rate": 3.3157960165610035e-06, + "loss": 0.333, + "step": 12898 + }, + { + "epoch": 0.74, + "grad_norm": 0.26019221615739213, + "learning_rate": 3.314412015872489e-06, + "loss": 0.253, + "step": 12899 + }, + { + "epoch": 0.74, + "grad_norm": 1.239600152449063, + "learning_rate": 3.313028246705603e-06, + "loss": 0.3466, + "step": 12900 + }, + { + "epoch": 0.74, + "grad_norm": 0.4657976696878919, + "learning_rate": 3.3116447091082593e-06, + "loss": 0.3098, + "step": 12901 + }, + { + "epoch": 0.74, + "grad_norm": 0.3989409195000362, + "learning_rate": 3.310261403128373e-06, + "loss": 0.2568, + "step": 12902 + }, + { + "epoch": 0.74, + "grad_norm": 0.28897844817397594, + "learning_rate": 3.3088783288138436e-06, + "loss": 0.2548, + "step": 12903 + }, + { + "epoch": 0.74, + "grad_norm": 0.4652505440048004, + "learning_rate": 3.307495486212572e-06, + "loss": 0.2714, + "step": 12904 + }, + { + "epoch": 0.74, + "grad_norm": 0.388391044884601, + "learning_rate": 3.306112875372445e-06, + "loss": 0.2605, + "step": 12905 + }, + { + "epoch": 0.74, + "grad_norm": 0.5730762996066526, + "learning_rate": 3.3047304963413407e-06, + "loss": 0.3871, + "step": 12906 + }, + { + "epoch": 0.74, + "grad_norm": 0.3801978509249738, + "learning_rate": 3.3033483491671316e-06, + "loss": 0.2487, + "step": 12907 + }, + { + "epoch": 0.74, + "grad_norm": 0.4299270873716904, + "learning_rate": 3.3019664338976787e-06, + "loss": 0.2827, + "step": 12908 + }, + { + "epoch": 0.74, + "grad_norm": 0.463795814167745, + "learning_rate": 3.300584750580842e-06, + "loss": 0.3093, + "step": 12909 + }, + { + "epoch": 0.74, + "grad_norm": 0.2269061260272887, + "learning_rate": 3.2992032992644686e-06, + "loss": 0.1444, + "step": 12910 + }, + { + "epoch": 0.74, + "grad_norm": 0.2822473309634225, + "learning_rate": 3.2978220799963955e-06, + "loss": 0.2462, + "step": 12911 + }, + { + "epoch": 0.74, + "grad_norm": 1.558699342627396, + "learning_rate": 3.2964410928244526e-06, + "loss": 0.739, + "step": 12912 + }, + { + "epoch": 0.74, + "grad_norm": 0.8887185637630161, + "learning_rate": 3.2950603377964706e-06, + "loss": 0.3034, + "step": 12913 + }, + { + "epoch": 0.74, + "grad_norm": 0.3173946544253534, + "learning_rate": 3.29367981496026e-06, + "loss": 0.2621, + "step": 12914 + }, + { + "epoch": 0.74, + "grad_norm": 0.3141577245945441, + "learning_rate": 3.29229952436363e-06, + "loss": 0.2978, + "step": 12915 + }, + { + "epoch": 0.74, + "grad_norm": 0.3278313577965663, + "learning_rate": 3.2909194660543742e-06, + "loss": 0.1864, + "step": 12916 + }, + { + "epoch": 0.74, + "grad_norm": 0.37208100190566185, + "learning_rate": 3.289539640080294e-06, + "loss": 0.1933, + "step": 12917 + }, + { + "epoch": 0.74, + "grad_norm": 1.3163936504011386, + "learning_rate": 3.288160046489166e-06, + "loss": 0.8189, + "step": 12918 + }, + { + "epoch": 0.74, + "grad_norm": 0.3683715557013782, + "learning_rate": 3.2867806853287675e-06, + "loss": 0.2944, + "step": 12919 + }, + { + "epoch": 0.74, + "grad_norm": 0.37802142040575226, + "learning_rate": 3.2854015566468643e-06, + "loss": 0.1896, + "step": 12920 + }, + { + "epoch": 0.74, + "grad_norm": 0.6399011890671984, + "learning_rate": 3.284022660491214e-06, + "loss": 0.3545, + "step": 12921 + }, + { + "epoch": 0.74, + "grad_norm": 0.22873817154729156, + "learning_rate": 3.2826439969095737e-06, + "loss": 0.2049, + "step": 12922 + }, + { + "epoch": 0.74, + "grad_norm": 0.3336483088508467, + "learning_rate": 3.281265565949683e-06, + "loss": 0.1561, + "step": 12923 + }, + { + "epoch": 0.74, + "grad_norm": 0.5578981476572488, + "learning_rate": 3.2798873676592755e-06, + "loss": 0.4109, + "step": 12924 + }, + { + "epoch": 0.74, + "grad_norm": 0.6593018831054235, + "learning_rate": 3.2785094020860777e-06, + "loss": 0.3806, + "step": 12925 + }, + { + "epoch": 0.74, + "grad_norm": 0.36905433775237373, + "learning_rate": 3.277131669277813e-06, + "loss": 0.2017, + "step": 12926 + }, + { + "epoch": 0.74, + "grad_norm": 0.3000027094105889, + "learning_rate": 3.275754169282189e-06, + "loss": 0.2773, + "step": 12927 + }, + { + "epoch": 0.74, + "grad_norm": 0.42583234394958763, + "learning_rate": 3.2743769021469096e-06, + "loss": 0.2012, + "step": 12928 + }, + { + "epoch": 0.74, + "grad_norm": 0.2963137515632942, + "learning_rate": 3.2729998679196663e-06, + "loss": 0.1866, + "step": 12929 + }, + { + "epoch": 0.74, + "grad_norm": 0.2859908008194674, + "learning_rate": 3.2716230666481506e-06, + "loss": 0.2344, + "step": 12930 + }, + { + "epoch": 0.74, + "grad_norm": 0.458580153617439, + "learning_rate": 3.2702464983800386e-06, + "loss": 0.3144, + "step": 12931 + }, + { + "epoch": 0.74, + "grad_norm": 0.3739316025400197, + "learning_rate": 3.2688701631630047e-06, + "loss": 0.2944, + "step": 12932 + }, + { + "epoch": 0.74, + "grad_norm": 0.5003894526988127, + "learning_rate": 3.2674940610447005e-06, + "loss": 0.2209, + "step": 12933 + }, + { + "epoch": 0.74, + "grad_norm": 0.2815996714466963, + "learning_rate": 3.2661181920727913e-06, + "loss": 0.2559, + "step": 12934 + }, + { + "epoch": 0.74, + "grad_norm": 0.31949144720216227, + "learning_rate": 3.2647425562949196e-06, + "loss": 0.2264, + "step": 12935 + }, + { + "epoch": 0.74, + "grad_norm": 0.43227858764097016, + "learning_rate": 3.263367153758723e-06, + "loss": 0.2649, + "step": 12936 + }, + { + "epoch": 0.74, + "grad_norm": 0.6654739405314112, + "learning_rate": 3.26199198451183e-06, + "loss": 0.3732, + "step": 12937 + }, + { + "epoch": 0.74, + "grad_norm": 0.4519785519202604, + "learning_rate": 3.2606170486018662e-06, + "loss": 0.312, + "step": 12938 + }, + { + "epoch": 0.74, + "grad_norm": 0.29017049811434276, + "learning_rate": 3.2592423460764457e-06, + "loss": 0.2275, + "step": 12939 + }, + { + "epoch": 0.74, + "grad_norm": 0.36334147109219417, + "learning_rate": 3.257867876983173e-06, + "loss": 0.1923, + "step": 12940 + }, + { + "epoch": 0.74, + "grad_norm": 0.5450956167249345, + "learning_rate": 3.256493641369641e-06, + "loss": 0.315, + "step": 12941 + }, + { + "epoch": 0.74, + "grad_norm": 0.4020490717875685, + "learning_rate": 3.2551196392834496e-06, + "loss": 0.3194, + "step": 12942 + }, + { + "epoch": 0.74, + "grad_norm": 0.32817468224538004, + "learning_rate": 3.2537458707721735e-06, + "loss": 0.2755, + "step": 12943 + }, + { + "epoch": 0.74, + "grad_norm": 0.5816663916046452, + "learning_rate": 3.252372335883388e-06, + "loss": 0.3265, + "step": 12944 + }, + { + "epoch": 0.74, + "grad_norm": 0.3815486034011904, + "learning_rate": 3.250999034664659e-06, + "loss": 0.2811, + "step": 12945 + }, + { + "epoch": 0.74, + "grad_norm": 0.3526554552274751, + "learning_rate": 3.24962596716354e-06, + "loss": 0.2452, + "step": 12946 + }, + { + "epoch": 0.74, + "grad_norm": 0.24509700046609045, + "learning_rate": 3.2482531334275856e-06, + "loss": 0.1689, + "step": 12947 + }, + { + "epoch": 0.74, + "grad_norm": 0.3338020441543314, + "learning_rate": 3.2468805335043363e-06, + "loss": 0.3091, + "step": 12948 + }, + { + "epoch": 0.74, + "grad_norm": 0.9020955309045062, + "learning_rate": 3.2455081674413226e-06, + "loss": 0.3683, + "step": 12949 + }, + { + "epoch": 0.74, + "grad_norm": 0.3050120934170287, + "learning_rate": 3.2441360352860675e-06, + "loss": 0.2703, + "step": 12950 + }, + { + "epoch": 0.74, + "grad_norm": 0.41887257623271296, + "learning_rate": 3.2427641370860953e-06, + "loss": 0.273, + "step": 12951 + }, + { + "epoch": 0.74, + "grad_norm": 0.42530719140404794, + "learning_rate": 3.241392472888909e-06, + "loss": 0.1314, + "step": 12952 + }, + { + "epoch": 0.74, + "grad_norm": 0.38376438245705596, + "learning_rate": 3.240021042742012e-06, + "loss": 0.2803, + "step": 12953 + }, + { + "epoch": 0.74, + "grad_norm": 0.47996045275196836, + "learning_rate": 3.2386498466928916e-06, + "loss": 0.3495, + "step": 12954 + }, + { + "epoch": 0.74, + "grad_norm": 0.3492844356549713, + "learning_rate": 3.237278884789039e-06, + "loss": 0.3225, + "step": 12955 + }, + { + "epoch": 0.74, + "grad_norm": 0.21621202245976492, + "learning_rate": 3.235908157077929e-06, + "loss": 0.0953, + "step": 12956 + }, + { + "epoch": 0.74, + "grad_norm": 0.37093987547274837, + "learning_rate": 3.234537663607028e-06, + "loss": 0.2762, + "step": 12957 + }, + { + "epoch": 0.74, + "grad_norm": 0.33780033787498953, + "learning_rate": 3.233167404423797e-06, + "loss": 0.2939, + "step": 12958 + }, + { + "epoch": 0.74, + "grad_norm": 0.8582542909743728, + "learning_rate": 3.231797379575684e-06, + "loss": 0.2335, + "step": 12959 + }, + { + "epoch": 0.74, + "grad_norm": 0.34765139094691233, + "learning_rate": 3.230427589110141e-06, + "loss": 0.2791, + "step": 12960 + }, + { + "epoch": 0.74, + "grad_norm": 1.1022929147980127, + "learning_rate": 3.229058033074599e-06, + "loss": 0.7693, + "step": 12961 + }, + { + "epoch": 0.74, + "grad_norm": 0.2048980960713727, + "learning_rate": 3.227688711516486e-06, + "loss": 0.18, + "step": 12962 + }, + { + "epoch": 0.74, + "grad_norm": 0.3108323466882076, + "learning_rate": 3.2263196244832183e-06, + "loss": 0.243, + "step": 12963 + }, + { + "epoch": 0.74, + "grad_norm": 1.351882544006569, + "learning_rate": 3.224950772022214e-06, + "loss": 0.5611, + "step": 12964 + }, + { + "epoch": 0.74, + "grad_norm": 0.633610003120019, + "learning_rate": 3.223582154180873e-06, + "loss": 0.2477, + "step": 12965 + }, + { + "epoch": 0.74, + "grad_norm": 0.293079591213699, + "learning_rate": 3.2222137710065915e-06, + "loss": 0.2608, + "step": 12966 + }, + { + "epoch": 0.75, + "grad_norm": 1.2513694259705406, + "learning_rate": 3.2208456225467554e-06, + "loss": 0.748, + "step": 12967 + }, + { + "epoch": 0.75, + "grad_norm": 0.2295037376326925, + "learning_rate": 3.219477708848743e-06, + "loss": 0.1619, + "step": 12968 + }, + { + "epoch": 0.75, + "grad_norm": 0.3458317104854778, + "learning_rate": 3.2181100299599268e-06, + "loss": 0.2019, + "step": 12969 + }, + { + "epoch": 0.75, + "grad_norm": 0.3743554598909583, + "learning_rate": 3.2167425859276678e-06, + "loss": 0.3023, + "step": 12970 + }, + { + "epoch": 0.75, + "grad_norm": 0.5944735307244252, + "learning_rate": 3.215375376799319e-06, + "loss": 0.3297, + "step": 12971 + }, + { + "epoch": 0.75, + "grad_norm": 0.3758967024239241, + "learning_rate": 3.214008402622232e-06, + "loss": 0.2157, + "step": 12972 + }, + { + "epoch": 0.75, + "grad_norm": 1.0402206032842936, + "learning_rate": 3.2126416634437428e-06, + "loss": 0.6525, + "step": 12973 + }, + { + "epoch": 0.75, + "grad_norm": 0.2605416145760141, + "learning_rate": 3.2112751593111803e-06, + "loss": 0.2228, + "step": 12974 + }, + { + "epoch": 0.75, + "grad_norm": 0.2663291579048332, + "learning_rate": 3.2099088902718635e-06, + "loss": 0.1584, + "step": 12975 + }, + { + "epoch": 0.75, + "grad_norm": 0.7818741848797203, + "learning_rate": 3.2085428563731137e-06, + "loss": 0.4055, + "step": 12976 + }, + { + "epoch": 0.75, + "grad_norm": 1.0304535099556142, + "learning_rate": 3.207177057662233e-06, + "loss": 0.4526, + "step": 12977 + }, + { + "epoch": 0.75, + "grad_norm": 0.23643939698926889, + "learning_rate": 3.205811494186518e-06, + "loss": 0.2101, + "step": 12978 + }, + { + "epoch": 0.75, + "grad_norm": 1.1088209673773897, + "learning_rate": 3.2044461659932557e-06, + "loss": 0.7036, + "step": 12979 + }, + { + "epoch": 0.75, + "grad_norm": 0.286646221853249, + "learning_rate": 3.2030810731297334e-06, + "loss": 0.1906, + "step": 12980 + }, + { + "epoch": 0.75, + "grad_norm": 0.3675670697198052, + "learning_rate": 3.2017162156432222e-06, + "loss": 0.2737, + "step": 12981 + }, + { + "epoch": 0.75, + "grad_norm": 0.42808874432795885, + "learning_rate": 3.2003515935809858e-06, + "loss": 0.248, + "step": 12982 + }, + { + "epoch": 0.75, + "grad_norm": 0.9420101336635036, + "learning_rate": 3.1989872069902804e-06, + "loss": 0.3599, + "step": 12983 + }, + { + "epoch": 0.75, + "grad_norm": 0.3487176167214536, + "learning_rate": 3.197623055918354e-06, + "loss": 0.2668, + "step": 12984 + }, + { + "epoch": 0.75, + "grad_norm": 1.4368984598467063, + "learning_rate": 3.196259140412451e-06, + "loss": 0.4066, + "step": 12985 + }, + { + "epoch": 0.75, + "grad_norm": 0.2809539051773061, + "learning_rate": 3.1948954605198014e-06, + "loss": 0.2199, + "step": 12986 + }, + { + "epoch": 0.75, + "grad_norm": 0.3451613359950105, + "learning_rate": 3.193532016287629e-06, + "loss": 0.2908, + "step": 12987 + }, + { + "epoch": 0.75, + "grad_norm": 0.5062091570808256, + "learning_rate": 3.1921688077631476e-06, + "loss": 0.2377, + "step": 12988 + }, + { + "epoch": 0.75, + "grad_norm": 0.39386024451333884, + "learning_rate": 3.19080583499357e-06, + "loss": 0.2658, + "step": 12989 + }, + { + "epoch": 0.75, + "grad_norm": 0.3344590966805647, + "learning_rate": 3.189443098026094e-06, + "loss": 0.2519, + "step": 12990 + }, + { + "epoch": 0.75, + "grad_norm": 0.41114489266247534, + "learning_rate": 3.188080596907911e-06, + "loss": 0.2871, + "step": 12991 + }, + { + "epoch": 0.75, + "grad_norm": 0.3871947212487157, + "learning_rate": 3.1867183316862005e-06, + "loss": 0.2061, + "step": 12992 + }, + { + "epoch": 0.75, + "grad_norm": 0.3329933325144057, + "learning_rate": 3.1853563024081446e-06, + "loss": 0.2933, + "step": 12993 + }, + { + "epoch": 0.75, + "grad_norm": 0.3416280510762696, + "learning_rate": 3.183994509120907e-06, + "loss": 0.3046, + "step": 12994 + }, + { + "epoch": 0.75, + "grad_norm": 0.5561745393411215, + "learning_rate": 3.182632951871646e-06, + "loss": 0.1458, + "step": 12995 + }, + { + "epoch": 0.75, + "grad_norm": 0.32559993723873837, + "learning_rate": 3.18127163070751e-06, + "loss": 0.2581, + "step": 12996 + }, + { + "epoch": 0.75, + "grad_norm": 0.5157714793821341, + "learning_rate": 3.1799105456756463e-06, + "loss": 0.3919, + "step": 12997 + }, + { + "epoch": 0.75, + "grad_norm": 0.45047672562360686, + "learning_rate": 3.1785496968231877e-06, + "loss": 0.2458, + "step": 12998 + }, + { + "epoch": 0.75, + "grad_norm": 0.3044374430749099, + "learning_rate": 3.1771890841972643e-06, + "loss": 0.2535, + "step": 12999 + }, + { + "epoch": 0.75, + "grad_norm": 0.4077212584271979, + "learning_rate": 3.1758287078449812e-06, + "loss": 0.2976, + "step": 13000 + }, + { + "epoch": 0.75, + "grad_norm": 0.2829686186815421, + "learning_rate": 3.174468567813461e-06, + "loss": 0.1982, + "step": 13001 + }, + { + "epoch": 0.75, + "grad_norm": 0.3156669845287359, + "learning_rate": 3.1731086641497997e-06, + "loss": 0.2602, + "step": 13002 + }, + { + "epoch": 0.75, + "grad_norm": 1.0682277454930609, + "learning_rate": 3.171748996901093e-06, + "loss": 0.7463, + "step": 13003 + }, + { + "epoch": 0.75, + "grad_norm": 0.9602481865968809, + "learning_rate": 3.1703895661144213e-06, + "loss": 0.3494, + "step": 13004 + }, + { + "epoch": 0.75, + "grad_norm": 0.2875938503824787, + "learning_rate": 3.1690303718368675e-06, + "loss": 0.1851, + "step": 13005 + }, + { + "epoch": 0.75, + "grad_norm": 0.3280084081282049, + "learning_rate": 3.1676714141154998e-06, + "loss": 0.2983, + "step": 13006 + }, + { + "epoch": 0.75, + "grad_norm": 0.28378818890057067, + "learning_rate": 3.1663126929973766e-06, + "loss": 0.1949, + "step": 13007 + }, + { + "epoch": 0.75, + "grad_norm": 0.29700206228395926, + "learning_rate": 3.1649542085295503e-06, + "loss": 0.1941, + "step": 13008 + }, + { + "epoch": 0.75, + "grad_norm": 0.4696627068472638, + "learning_rate": 3.163595960759063e-06, + "loss": 0.3623, + "step": 13009 + }, + { + "epoch": 0.75, + "grad_norm": 0.47099670577215524, + "learning_rate": 3.162237949732957e-06, + "loss": 0.3376, + "step": 13010 + }, + { + "epoch": 0.75, + "grad_norm": 0.2863904511902697, + "learning_rate": 3.1608801754982564e-06, + "loss": 0.1883, + "step": 13011 + }, + { + "epoch": 0.75, + "grad_norm": 0.33279925478274053, + "learning_rate": 3.1595226381019817e-06, + "loss": 0.1941, + "step": 13012 + }, + { + "epoch": 0.75, + "grad_norm": 0.5530973813912018, + "learning_rate": 3.15816533759114e-06, + "loss": 0.3521, + "step": 13013 + }, + { + "epoch": 0.75, + "grad_norm": 0.23414052147715458, + "learning_rate": 3.1568082740127425e-06, + "loss": 0.2081, + "step": 13014 + }, + { + "epoch": 0.75, + "grad_norm": 0.9721095642376917, + "learning_rate": 3.1554514474137797e-06, + "loss": 0.5654, + "step": 13015 + }, + { + "epoch": 0.75, + "grad_norm": 0.5991458451958498, + "learning_rate": 3.154094857841239e-06, + "loss": 0.397, + "step": 13016 + }, + { + "epoch": 0.75, + "grad_norm": 0.31615058172984073, + "learning_rate": 3.152738505342097e-06, + "loss": 0.2359, + "step": 13017 + }, + { + "epoch": 0.75, + "grad_norm": 0.36932228495972225, + "learning_rate": 3.1513823899633276e-06, + "loss": 0.2451, + "step": 13018 + }, + { + "epoch": 0.75, + "grad_norm": 0.2985834736662441, + "learning_rate": 3.1500265117518926e-06, + "loss": 0.2106, + "step": 13019 + }, + { + "epoch": 0.75, + "grad_norm": 0.31142451629486045, + "learning_rate": 3.148670870754744e-06, + "loss": 0.2453, + "step": 13020 + }, + { + "epoch": 0.75, + "grad_norm": 0.7538480855399459, + "learning_rate": 3.1473154670188255e-06, + "loss": 0.3306, + "step": 13021 + }, + { + "epoch": 0.75, + "grad_norm": 0.3409916931889605, + "learning_rate": 3.145960300591081e-06, + "loss": 0.2929, + "step": 13022 + }, + { + "epoch": 0.75, + "grad_norm": 0.3780237167598833, + "learning_rate": 3.1446053715184367e-06, + "loss": 0.2681, + "step": 13023 + }, + { + "epoch": 0.75, + "grad_norm": 0.5302115363553981, + "learning_rate": 3.1432506798478134e-06, + "loss": 0.2266, + "step": 13024 + }, + { + "epoch": 0.75, + "grad_norm": 0.20758940749083343, + "learning_rate": 3.1418962256261256e-06, + "loss": 0.2004, + "step": 13025 + }, + { + "epoch": 0.75, + "grad_norm": 0.3761950098958392, + "learning_rate": 3.1405420089002713e-06, + "loss": 0.2534, + "step": 13026 + }, + { + "epoch": 0.75, + "grad_norm": 0.6177190474844302, + "learning_rate": 3.1391880297171574e-06, + "loss": 0.2827, + "step": 13027 + }, + { + "epoch": 0.75, + "grad_norm": 0.686885988108865, + "learning_rate": 3.1378342881236657e-06, + "loss": 0.3982, + "step": 13028 + }, + { + "epoch": 0.75, + "grad_norm": 0.42586124579859064, + "learning_rate": 3.1364807841666776e-06, + "loss": 0.2896, + "step": 13029 + }, + { + "epoch": 0.75, + "grad_norm": 0.2834899923338788, + "learning_rate": 3.1351275178930616e-06, + "loss": 0.2551, + "step": 13030 + }, + { + "epoch": 0.75, + "grad_norm": 0.17052564170743853, + "learning_rate": 3.133774489349688e-06, + "loss": 0.0897, + "step": 13031 + }, + { + "epoch": 0.75, + "grad_norm": 0.3672592846418563, + "learning_rate": 3.1324216985834088e-06, + "loss": 0.2728, + "step": 13032 + }, + { + "epoch": 0.75, + "grad_norm": 0.41687149656453454, + "learning_rate": 3.1310691456410703e-06, + "loss": 0.3083, + "step": 13033 + }, + { + "epoch": 0.75, + "grad_norm": 0.5042667901150454, + "learning_rate": 3.1297168305695125e-06, + "loss": 0.2768, + "step": 13034 + }, + { + "epoch": 0.75, + "grad_norm": 0.3323112138388973, + "learning_rate": 3.128364753415565e-06, + "loss": 0.2576, + "step": 13035 + }, + { + "epoch": 0.75, + "grad_norm": 1.2650359396451536, + "learning_rate": 3.127012914226051e-06, + "loss": 0.4992, + "step": 13036 + }, + { + "epoch": 0.75, + "grad_norm": 0.256286588856045, + "learning_rate": 3.125661313047783e-06, + "loss": 0.2135, + "step": 13037 + }, + { + "epoch": 0.75, + "grad_norm": 0.30430831676033854, + "learning_rate": 3.1243099499275666e-06, + "loss": 0.2453, + "step": 13038 + }, + { + "epoch": 0.75, + "grad_norm": 0.7948641796041248, + "learning_rate": 3.1229588249122034e-06, + "loss": 0.4419, + "step": 13039 + }, + { + "epoch": 0.75, + "grad_norm": 0.46946317228487927, + "learning_rate": 3.12160793804848e-06, + "loss": 0.2173, + "step": 13040 + }, + { + "epoch": 0.75, + "grad_norm": 0.34864930416461026, + "learning_rate": 3.120257289383178e-06, + "loss": 0.2764, + "step": 13041 + }, + { + "epoch": 0.75, + "grad_norm": 0.4017789633656141, + "learning_rate": 3.1189068789630672e-06, + "loss": 0.2943, + "step": 13042 + }, + { + "epoch": 0.75, + "grad_norm": 0.4361244313994006, + "learning_rate": 3.117556706834919e-06, + "loss": 0.2576, + "step": 13043 + }, + { + "epoch": 0.75, + "grad_norm": 0.727950689186441, + "learning_rate": 3.116206773045486e-06, + "loss": 0.212, + "step": 13044 + }, + { + "epoch": 0.75, + "grad_norm": 0.33404541531367393, + "learning_rate": 3.1148570776415153e-06, + "loss": 0.2929, + "step": 13045 + }, + { + "epoch": 0.75, + "grad_norm": 0.3258378879850681, + "learning_rate": 3.1135076206697456e-06, + "loss": 0.2331, + "step": 13046 + }, + { + "epoch": 0.75, + "grad_norm": 0.3613632019699203, + "learning_rate": 3.112158402176915e-06, + "loss": 0.1676, + "step": 13047 + }, + { + "epoch": 0.75, + "grad_norm": 0.5140961520189742, + "learning_rate": 3.110809422209742e-06, + "loss": 0.353, + "step": 13048 + }, + { + "epoch": 0.75, + "grad_norm": 0.34242223527610316, + "learning_rate": 3.109460680814942e-06, + "loss": 0.2875, + "step": 13049 + }, + { + "epoch": 0.75, + "grad_norm": 0.4790141871496615, + "learning_rate": 3.108112178039222e-06, + "loss": 0.2166, + "step": 13050 + }, + { + "epoch": 0.75, + "grad_norm": 0.5128694160336205, + "learning_rate": 3.106763913929278e-06, + "loss": 0.3545, + "step": 13051 + }, + { + "epoch": 0.75, + "grad_norm": 0.25136946231934676, + "learning_rate": 3.1054158885318075e-06, + "loss": 0.1828, + "step": 13052 + }, + { + "epoch": 0.75, + "grad_norm": 0.27467417356526297, + "learning_rate": 3.104068101893487e-06, + "loss": 0.232, + "step": 13053 + }, + { + "epoch": 0.75, + "grad_norm": 0.5452312702861496, + "learning_rate": 3.102720554060993e-06, + "loss": 0.3419, + "step": 13054 + }, + { + "epoch": 0.75, + "grad_norm": 0.770701859262707, + "learning_rate": 3.101373245080985e-06, + "loss": 0.4232, + "step": 13055 + }, + { + "epoch": 0.75, + "grad_norm": 0.3705818865576376, + "learning_rate": 3.100026175000128e-06, + "loss": 0.2777, + "step": 13056 + }, + { + "epoch": 0.75, + "grad_norm": 0.3255903926001107, + "learning_rate": 3.0986793438650686e-06, + "loss": 0.2491, + "step": 13057 + }, + { + "epoch": 0.75, + "grad_norm": 0.29833688651425433, + "learning_rate": 3.097332751722447e-06, + "loss": 0.265, + "step": 13058 + }, + { + "epoch": 0.75, + "grad_norm": 0.280230038707804, + "learning_rate": 3.095986398618892e-06, + "loss": 0.1964, + "step": 13059 + }, + { + "epoch": 0.75, + "grad_norm": 0.5305521770532992, + "learning_rate": 3.094640284601034e-06, + "loss": 0.2259, + "step": 13060 + }, + { + "epoch": 0.75, + "grad_norm": 0.3592098971698116, + "learning_rate": 3.093294409715486e-06, + "loss": 0.3027, + "step": 13061 + }, + { + "epoch": 0.75, + "grad_norm": 0.7451073577020624, + "learning_rate": 3.0919487740088563e-06, + "loss": 0.3797, + "step": 13062 + }, + { + "epoch": 0.75, + "grad_norm": 0.3298793993154702, + "learning_rate": 3.090603377527742e-06, + "loss": 0.2196, + "step": 13063 + }, + { + "epoch": 0.75, + "grad_norm": 0.23678147433076321, + "learning_rate": 3.0892582203187337e-06, + "loss": 0.184, + "step": 13064 + }, + { + "epoch": 0.75, + "grad_norm": 0.36824332059382947, + "learning_rate": 3.087913302428419e-06, + "loss": 0.2789, + "step": 13065 + }, + { + "epoch": 0.75, + "grad_norm": 0.3972684606203196, + "learning_rate": 3.0865686239033687e-06, + "loss": 0.2218, + "step": 13066 + }, + { + "epoch": 0.75, + "grad_norm": 0.7531669824152212, + "learning_rate": 3.085224184790151e-06, + "loss": 0.3647, + "step": 13067 + }, + { + "epoch": 0.75, + "grad_norm": 0.5677699683736633, + "learning_rate": 3.083879985135322e-06, + "loss": 0.3255, + "step": 13068 + }, + { + "epoch": 0.75, + "grad_norm": 0.25851571654009503, + "learning_rate": 3.082536024985431e-06, + "loss": 0.2738, + "step": 13069 + }, + { + "epoch": 0.75, + "grad_norm": 0.9987692197544271, + "learning_rate": 3.0811923043870206e-06, + "loss": 0.4699, + "step": 13070 + }, + { + "epoch": 0.75, + "grad_norm": 0.2454516431873957, + "learning_rate": 3.0798488233866196e-06, + "loss": 0.1554, + "step": 13071 + }, + { + "epoch": 0.75, + "grad_norm": 0.3892847958002378, + "learning_rate": 3.0785055820307595e-06, + "loss": 0.2817, + "step": 13072 + }, + { + "epoch": 0.75, + "grad_norm": 0.3326192433773473, + "learning_rate": 3.077162580365953e-06, + "loss": 0.2553, + "step": 13073 + }, + { + "epoch": 0.75, + "grad_norm": 0.6536009982343787, + "learning_rate": 3.07581981843871e-06, + "loss": 0.3113, + "step": 13074 + }, + { + "epoch": 0.75, + "grad_norm": 0.36056431727318683, + "learning_rate": 3.0744772962955283e-06, + "loss": 0.2842, + "step": 13075 + }, + { + "epoch": 0.75, + "grad_norm": 0.26550009542834724, + "learning_rate": 3.0731350139828963e-06, + "loss": 0.2026, + "step": 13076 + }, + { + "epoch": 0.75, + "grad_norm": 0.23690984887180597, + "learning_rate": 3.071792971547305e-06, + "loss": 0.2057, + "step": 13077 + }, + { + "epoch": 0.75, + "grad_norm": 0.5352459519460655, + "learning_rate": 3.0704511690352246e-06, + "loss": 0.3448, + "step": 13078 + }, + { + "epoch": 0.75, + "grad_norm": 0.7568358868361343, + "learning_rate": 3.0691096064931226e-06, + "loss": 0.3506, + "step": 13079 + }, + { + "epoch": 0.75, + "grad_norm": 0.8462214742386458, + "learning_rate": 3.0677682839674526e-06, + "loss": 0.2057, + "step": 13080 + }, + { + "epoch": 0.75, + "grad_norm": 0.2447690890806674, + "learning_rate": 3.0664272015046735e-06, + "loss": 0.2524, + "step": 13081 + }, + { + "epoch": 0.75, + "grad_norm": 1.2331150403392583, + "learning_rate": 3.0650863591512215e-06, + "loss": 0.6398, + "step": 13082 + }, + { + "epoch": 0.75, + "grad_norm": 0.32002625401700546, + "learning_rate": 3.063745756953531e-06, + "loss": 0.1126, + "step": 13083 + }, + { + "epoch": 0.75, + "grad_norm": 0.38562212812440144, + "learning_rate": 3.062405394958022e-06, + "loss": 0.2825, + "step": 13084 + }, + { + "epoch": 0.75, + "grad_norm": 0.4755195083621667, + "learning_rate": 3.061065273211121e-06, + "loss": 0.2858, + "step": 13085 + }, + { + "epoch": 0.75, + "grad_norm": 0.5813518421794341, + "learning_rate": 3.0597253917592308e-06, + "loss": 0.1095, + "step": 13086 + }, + { + "epoch": 0.75, + "grad_norm": 0.38714185646623833, + "learning_rate": 3.0583857506487514e-06, + "loss": 0.33, + "step": 13087 + }, + { + "epoch": 0.75, + "grad_norm": 1.2848101672311991, + "learning_rate": 3.057046349926075e-06, + "loss": 0.7524, + "step": 13088 + }, + { + "epoch": 0.75, + "grad_norm": 0.2332084154602774, + "learning_rate": 3.0557071896375824e-06, + "loss": 0.1926, + "step": 13089 + }, + { + "epoch": 0.75, + "grad_norm": 0.3755476958237447, + "learning_rate": 3.054368269829654e-06, + "loss": 0.2989, + "step": 13090 + }, + { + "epoch": 0.75, + "grad_norm": 0.46469134330943285, + "learning_rate": 3.0530295905486527e-06, + "loss": 0.2978, + "step": 13091 + }, + { + "epoch": 0.75, + "grad_norm": 0.4148719623586969, + "learning_rate": 3.0516911518409387e-06, + "loss": 0.3245, + "step": 13092 + }, + { + "epoch": 0.75, + "grad_norm": 0.28103354751520443, + "learning_rate": 3.0503529537528585e-06, + "loss": 0.2178, + "step": 13093 + }, + { + "epoch": 0.75, + "grad_norm": 1.1477225314022288, + "learning_rate": 3.04901499633076e-06, + "loss": 0.7313, + "step": 13094 + }, + { + "epoch": 0.75, + "grad_norm": 0.6204403347702722, + "learning_rate": 3.047677279620973e-06, + "loss": 0.3122, + "step": 13095 + }, + { + "epoch": 0.75, + "grad_norm": 0.3277311313482158, + "learning_rate": 3.0463398036698222e-06, + "loss": 0.2242, + "step": 13096 + }, + { + "epoch": 0.75, + "grad_norm": 0.24065037007139628, + "learning_rate": 3.0450025685236227e-06, + "loss": 0.2127, + "step": 13097 + }, + { + "epoch": 0.75, + "grad_norm": 1.3160438437281266, + "learning_rate": 3.043665574228688e-06, + "loss": 0.677, + "step": 13098 + }, + { + "epoch": 0.75, + "grad_norm": 0.305915681969301, + "learning_rate": 3.042328820831315e-06, + "loss": 0.2111, + "step": 13099 + }, + { + "epoch": 0.75, + "grad_norm": 0.47529902211212993, + "learning_rate": 3.040992308377796e-06, + "loss": 0.3485, + "step": 13100 + }, + { + "epoch": 0.75, + "grad_norm": 0.4365889963384517, + "learning_rate": 3.0396560369144145e-06, + "loss": 0.3292, + "step": 13101 + }, + { + "epoch": 0.75, + "grad_norm": 0.3014203717649501, + "learning_rate": 3.038320006487445e-06, + "loss": 0.2112, + "step": 13102 + }, + { + "epoch": 0.75, + "grad_norm": 0.2850257890850647, + "learning_rate": 3.036984217143154e-06, + "loss": 0.1755, + "step": 13103 + }, + { + "epoch": 0.75, + "grad_norm": 0.3429237306537405, + "learning_rate": 3.0356486689278e-06, + "loss": 0.3056, + "step": 13104 + }, + { + "epoch": 0.75, + "grad_norm": 0.3024864803605085, + "learning_rate": 3.034313361887631e-06, + "loss": 0.242, + "step": 13105 + }, + { + "epoch": 0.75, + "grad_norm": 0.8906428296674906, + "learning_rate": 3.0329782960688926e-06, + "loss": 0.3281, + "step": 13106 + }, + { + "epoch": 0.75, + "grad_norm": 0.5836982031317877, + "learning_rate": 3.031643471517817e-06, + "loss": 0.3389, + "step": 13107 + }, + { + "epoch": 0.75, + "grad_norm": 0.3551032461668801, + "learning_rate": 3.0303088882806276e-06, + "loss": 0.2684, + "step": 13108 + }, + { + "epoch": 0.75, + "grad_norm": 0.2158504718862656, + "learning_rate": 3.028974546403539e-06, + "loss": 0.1674, + "step": 13109 + }, + { + "epoch": 0.75, + "grad_norm": 0.7293506132465545, + "learning_rate": 3.027640445932766e-06, + "loss": 0.4317, + "step": 13110 + }, + { + "epoch": 0.75, + "grad_norm": 0.36427266642415257, + "learning_rate": 3.0263065869145035e-06, + "loss": 0.26, + "step": 13111 + }, + { + "epoch": 0.75, + "grad_norm": 0.5653448581871906, + "learning_rate": 3.024972969394944e-06, + "loss": 0.2791, + "step": 13112 + }, + { + "epoch": 0.75, + "grad_norm": 0.4739343898820716, + "learning_rate": 3.023639593420271e-06, + "loss": 0.3312, + "step": 13113 + }, + { + "epoch": 0.75, + "grad_norm": 0.36681901328564676, + "learning_rate": 3.022306459036656e-06, + "loss": 0.2701, + "step": 13114 + }, + { + "epoch": 0.75, + "grad_norm": 0.29844337218234673, + "learning_rate": 3.0209735662902706e-06, + "loss": 0.1129, + "step": 13115 + }, + { + "epoch": 0.75, + "grad_norm": 0.35840107821791856, + "learning_rate": 3.019640915227271e-06, + "loss": 0.2941, + "step": 13116 + }, + { + "epoch": 0.75, + "grad_norm": 0.3529763050954908, + "learning_rate": 3.0183085058938068e-06, + "loss": 0.2667, + "step": 13117 + }, + { + "epoch": 0.75, + "grad_norm": 0.8960601500229003, + "learning_rate": 3.016976338336015e-06, + "loss": 0.3917, + "step": 13118 + }, + { + "epoch": 0.75, + "grad_norm": 0.8699740633932406, + "learning_rate": 3.015644412600036e-06, + "loss": 0.2787, + "step": 13119 + }, + { + "epoch": 0.75, + "grad_norm": 0.316723083079564, + "learning_rate": 3.0143127287319895e-06, + "loss": 0.2392, + "step": 13120 + }, + { + "epoch": 0.75, + "grad_norm": 0.2449187149952753, + "learning_rate": 3.012981286777994e-06, + "loss": 0.2233, + "step": 13121 + }, + { + "epoch": 0.75, + "grad_norm": 1.1979911990400887, + "learning_rate": 3.0116500867841525e-06, + "loss": 0.4074, + "step": 13122 + }, + { + "epoch": 0.75, + "grad_norm": 0.3560474829287622, + "learning_rate": 3.0103191287965715e-06, + "loss": 0.2798, + "step": 13123 + }, + { + "epoch": 0.75, + "grad_norm": 0.9490243159614561, + "learning_rate": 3.008988412861338e-06, + "loss": 0.3888, + "step": 13124 + }, + { + "epoch": 0.75, + "grad_norm": 0.34330007122672573, + "learning_rate": 3.007657939024535e-06, + "loss": 0.2458, + "step": 13125 + }, + { + "epoch": 0.75, + "grad_norm": 0.31736090452794763, + "learning_rate": 3.006327707332235e-06, + "loss": 0.2582, + "step": 13126 + }, + { + "epoch": 0.75, + "grad_norm": 0.4724601701873738, + "learning_rate": 3.004997717830508e-06, + "loss": 0.2203, + "step": 13127 + }, + { + "epoch": 0.75, + "grad_norm": 0.2326895772207429, + "learning_rate": 3.003667970565409e-06, + "loss": 0.1987, + "step": 13128 + }, + { + "epoch": 0.75, + "grad_norm": 0.3605528897187082, + "learning_rate": 3.002338465582988e-06, + "loss": 0.2826, + "step": 13129 + }, + { + "epoch": 0.75, + "grad_norm": 0.9734472280455081, + "learning_rate": 3.0010092029292835e-06, + "loss": 0.3781, + "step": 13130 + }, + { + "epoch": 0.75, + "grad_norm": 1.0805909522215216, + "learning_rate": 2.9996801826503275e-06, + "loss": 0.5344, + "step": 13131 + }, + { + "epoch": 0.75, + "grad_norm": 0.2773513817656313, + "learning_rate": 2.9983514047921493e-06, + "loss": 0.1889, + "step": 13132 + }, + { + "epoch": 0.75, + "grad_norm": 0.35829546695635645, + "learning_rate": 2.9970228694007598e-06, + "loss": 0.3274, + "step": 13133 + }, + { + "epoch": 0.75, + "grad_norm": 0.4805023126371464, + "learning_rate": 2.995694576522168e-06, + "loss": 0.2938, + "step": 13134 + }, + { + "epoch": 0.75, + "grad_norm": 0.32261178589923567, + "learning_rate": 2.9943665262023714e-06, + "loss": 0.2114, + "step": 13135 + }, + { + "epoch": 0.75, + "grad_norm": 0.36506769087255786, + "learning_rate": 2.993038718487361e-06, + "loss": 0.3249, + "step": 13136 + }, + { + "epoch": 0.75, + "grad_norm": 0.3643438925093802, + "learning_rate": 2.991711153423118e-06, + "loss": 0.215, + "step": 13137 + }, + { + "epoch": 0.75, + "grad_norm": 0.33487426259480946, + "learning_rate": 2.9903838310556133e-06, + "loss": 0.1942, + "step": 13138 + }, + { + "epoch": 0.75, + "grad_norm": 0.4488417837759212, + "learning_rate": 2.989056751430819e-06, + "loss": 0.2472, + "step": 13139 + }, + { + "epoch": 0.75, + "grad_norm": 0.3362232531745725, + "learning_rate": 2.987729914594687e-06, + "loss": 0.3241, + "step": 13140 + }, + { + "epoch": 0.76, + "grad_norm": 0.29966928249023517, + "learning_rate": 2.9864033205931675e-06, + "loss": 0.2217, + "step": 13141 + }, + { + "epoch": 0.76, + "grad_norm": 0.863128083275095, + "learning_rate": 2.9850769694721982e-06, + "loss": 0.5081, + "step": 13142 + }, + { + "epoch": 0.76, + "grad_norm": 0.39978155517146674, + "learning_rate": 2.9837508612777087e-06, + "loss": 0.2601, + "step": 13143 + }, + { + "epoch": 0.76, + "grad_norm": 0.2719235906065331, + "learning_rate": 2.9824249960556294e-06, + "loss": 0.2627, + "step": 13144 + }, + { + "epoch": 0.76, + "grad_norm": 0.5339971689860459, + "learning_rate": 2.9810993738518702e-06, + "loss": 0.2528, + "step": 13145 + }, + { + "epoch": 0.76, + "grad_norm": 0.7843067034359767, + "learning_rate": 2.9797739947123383e-06, + "loss": 0.4102, + "step": 13146 + }, + { + "epoch": 0.76, + "grad_norm": 0.3894676336923343, + "learning_rate": 2.9784488586829272e-06, + "loss": 0.2814, + "step": 13147 + }, + { + "epoch": 0.76, + "grad_norm": 0.2722933405316551, + "learning_rate": 2.9771239658095342e-06, + "loss": 0.249, + "step": 13148 + }, + { + "epoch": 0.76, + "grad_norm": 0.2812989717087355, + "learning_rate": 2.975799316138035e-06, + "loss": 0.187, + "step": 13149 + }, + { + "epoch": 0.76, + "grad_norm": 0.39908579003212574, + "learning_rate": 2.9744749097143046e-06, + "loss": 0.2796, + "step": 13150 + }, + { + "epoch": 0.76, + "grad_norm": 0.6090914983354809, + "learning_rate": 2.9731507465842025e-06, + "loss": 0.2293, + "step": 13151 + }, + { + "epoch": 0.76, + "grad_norm": 0.4057610007364994, + "learning_rate": 2.97182682679359e-06, + "loss": 0.304, + "step": 13152 + }, + { + "epoch": 0.76, + "grad_norm": 0.4026742727902014, + "learning_rate": 2.970503150388313e-06, + "loss": 0.3015, + "step": 13153 + }, + { + "epoch": 0.76, + "grad_norm": 0.29655660277141555, + "learning_rate": 2.96917971741421e-06, + "loss": 0.1956, + "step": 13154 + }, + { + "epoch": 0.76, + "grad_norm": 0.4737388090525767, + "learning_rate": 2.9678565279171113e-06, + "loss": 0.2305, + "step": 13155 + }, + { + "epoch": 0.76, + "grad_norm": 0.25693414461923525, + "learning_rate": 2.9665335819428354e-06, + "loss": 0.2463, + "step": 13156 + }, + { + "epoch": 0.76, + "grad_norm": 0.5539166843080455, + "learning_rate": 2.9652108795372016e-06, + "loss": 0.323, + "step": 13157 + }, + { + "epoch": 0.76, + "grad_norm": 0.6583760802800553, + "learning_rate": 2.963888420746013e-06, + "loss": 0.2997, + "step": 13158 + }, + { + "epoch": 0.76, + "grad_norm": 0.3184635760317434, + "learning_rate": 2.962566205615065e-06, + "loss": 0.2428, + "step": 13159 + }, + { + "epoch": 0.76, + "grad_norm": 0.45352708893623567, + "learning_rate": 2.9612442341901448e-06, + "loss": 0.3603, + "step": 13160 + }, + { + "epoch": 0.76, + "grad_norm": 0.22507968518561244, + "learning_rate": 2.9599225065170356e-06, + "loss": 0.1509, + "step": 13161 + }, + { + "epoch": 0.76, + "grad_norm": 0.32301562997476935, + "learning_rate": 2.9586010226415085e-06, + "loss": 0.2435, + "step": 13162 + }, + { + "epoch": 0.76, + "grad_norm": 0.9494394947854288, + "learning_rate": 2.9572797826093256e-06, + "loss": 0.4594, + "step": 13163 + }, + { + "epoch": 0.76, + "grad_norm": 0.36294253708825114, + "learning_rate": 2.9559587864662365e-06, + "loss": 0.2539, + "step": 13164 + }, + { + "epoch": 0.76, + "grad_norm": 0.6077925906601935, + "learning_rate": 2.9546380342579962e-06, + "loss": 0.3771, + "step": 13165 + }, + { + "epoch": 0.76, + "grad_norm": 0.3826961493987988, + "learning_rate": 2.953317526030337e-06, + "loss": 0.3284, + "step": 13166 + }, + { + "epoch": 0.76, + "grad_norm": 0.26520872395312056, + "learning_rate": 2.9519972618289894e-06, + "loss": 0.2096, + "step": 13167 + }, + { + "epoch": 0.76, + "grad_norm": 0.3666621982903667, + "learning_rate": 2.9506772416996732e-06, + "loss": 0.1855, + "step": 13168 + }, + { + "epoch": 0.76, + "grad_norm": 0.3930965638463575, + "learning_rate": 2.9493574656881006e-06, + "loss": 0.29, + "step": 13169 + }, + { + "epoch": 0.76, + "grad_norm": 0.8084312110357765, + "learning_rate": 2.9480379338399757e-06, + "loss": 0.3901, + "step": 13170 + }, + { + "epoch": 0.76, + "grad_norm": 0.3213576415786909, + "learning_rate": 2.9467186462009943e-06, + "loss": 0.1748, + "step": 13171 + }, + { + "epoch": 0.76, + "grad_norm": 0.2886036323658359, + "learning_rate": 2.94539960281684e-06, + "loss": 0.2953, + "step": 13172 + }, + { + "epoch": 0.76, + "grad_norm": 0.44952697496270905, + "learning_rate": 2.944080803733197e-06, + "loss": 0.2489, + "step": 13173 + }, + { + "epoch": 0.76, + "grad_norm": 0.24913717295057006, + "learning_rate": 2.942762248995733e-06, + "loss": 0.1602, + "step": 13174 + }, + { + "epoch": 0.76, + "grad_norm": 0.5314385966365966, + "learning_rate": 2.9414439386501082e-06, + "loss": 0.3329, + "step": 13175 + }, + { + "epoch": 0.76, + "grad_norm": 0.3573835288969693, + "learning_rate": 2.9401258727419723e-06, + "loss": 0.3176, + "step": 13176 + }, + { + "epoch": 0.76, + "grad_norm": 0.31895803358139446, + "learning_rate": 2.938808051316978e-06, + "loss": 0.1936, + "step": 13177 + }, + { + "epoch": 0.76, + "grad_norm": 0.5156004074194172, + "learning_rate": 2.937490474420758e-06, + "loss": 0.3777, + "step": 13178 + }, + { + "epoch": 0.76, + "grad_norm": 0.3266568107674821, + "learning_rate": 2.9361731420989382e-06, + "loss": 0.1728, + "step": 13179 + }, + { + "epoch": 0.76, + "grad_norm": 0.2562266580469743, + "learning_rate": 2.9348560543971383e-06, + "loss": 0.2493, + "step": 13180 + }, + { + "epoch": 0.76, + "grad_norm": 0.4862114642170611, + "learning_rate": 2.933539211360966e-06, + "loss": 0.2606, + "step": 13181 + }, + { + "epoch": 0.76, + "grad_norm": 0.8436076426893645, + "learning_rate": 2.932222613036032e-06, + "loss": 0.4518, + "step": 13182 + }, + { + "epoch": 0.76, + "grad_norm": 0.8093127081004269, + "learning_rate": 2.930906259467924e-06, + "loss": 0.3676, + "step": 13183 + }, + { + "epoch": 0.76, + "grad_norm": 0.24389401278509923, + "learning_rate": 2.9295901507022275e-06, + "loss": 0.2305, + "step": 13184 + }, + { + "epoch": 0.76, + "grad_norm": 0.5109376582936659, + "learning_rate": 2.928274286784517e-06, + "loss": 0.2928, + "step": 13185 + }, + { + "epoch": 0.76, + "grad_norm": 0.6477468458115577, + "learning_rate": 2.9269586677603677e-06, + "loss": 0.3776, + "step": 13186 + }, + { + "epoch": 0.76, + "grad_norm": 0.28910265146819936, + "learning_rate": 2.9256432936753354e-06, + "loss": 0.1995, + "step": 13187 + }, + { + "epoch": 0.76, + "grad_norm": 0.350700075545137, + "learning_rate": 2.924328164574972e-06, + "loss": 0.3169, + "step": 13188 + }, + { + "epoch": 0.76, + "grad_norm": 0.7686568143004432, + "learning_rate": 2.923013280504816e-06, + "loss": 0.4884, + "step": 13189 + }, + { + "epoch": 0.76, + "grad_norm": 0.3587136303027198, + "learning_rate": 2.9216986415104097e-06, + "loss": 0.2156, + "step": 13190 + }, + { + "epoch": 0.76, + "grad_norm": 0.7374740757701493, + "learning_rate": 2.9203842476372747e-06, + "loss": 0.3926, + "step": 13191 + }, + { + "epoch": 0.76, + "grad_norm": 0.2838512447883044, + "learning_rate": 2.9190700989309285e-06, + "loss": 0.2682, + "step": 13192 + }, + { + "epoch": 0.76, + "grad_norm": 0.290622311729339, + "learning_rate": 2.9177561954368804e-06, + "loss": 0.2525, + "step": 13193 + }, + { + "epoch": 0.76, + "grad_norm": 0.5359851867679772, + "learning_rate": 2.916442537200629e-06, + "loss": 0.1526, + "step": 13194 + }, + { + "epoch": 0.76, + "grad_norm": 0.37080513662796255, + "learning_rate": 2.9151291242676692e-06, + "loss": 0.2884, + "step": 13195 + }, + { + "epoch": 0.76, + "grad_norm": 0.30793946557758933, + "learning_rate": 2.9138159566834834e-06, + "loss": 0.2773, + "step": 13196 + }, + { + "epoch": 0.76, + "grad_norm": 0.8039012607707117, + "learning_rate": 2.912503034493547e-06, + "loss": 0.2705, + "step": 13197 + }, + { + "epoch": 0.76, + "grad_norm": 0.5254037076250225, + "learning_rate": 2.911190357743322e-06, + "loss": 0.3449, + "step": 13198 + }, + { + "epoch": 0.76, + "grad_norm": 0.387990735879651, + "learning_rate": 2.909877926478274e-06, + "loss": 0.3074, + "step": 13199 + }, + { + "epoch": 0.76, + "grad_norm": 0.22161277090349127, + "learning_rate": 2.9085657407438485e-06, + "loss": 0.1872, + "step": 13200 + }, + { + "epoch": 0.76, + "grad_norm": 0.5582834510824618, + "learning_rate": 2.9072538005854855e-06, + "loss": 0.3282, + "step": 13201 + }, + { + "epoch": 0.76, + "grad_norm": 0.40851544103697307, + "learning_rate": 2.9059421060486193e-06, + "loss": 0.3156, + "step": 13202 + }, + { + "epoch": 0.76, + "grad_norm": 0.35263245779401653, + "learning_rate": 2.904630657178672e-06, + "loss": 0.2656, + "step": 13203 + }, + { + "epoch": 0.76, + "grad_norm": 0.6917076508307758, + "learning_rate": 2.903319454021061e-06, + "loss": 0.3521, + "step": 13204 + }, + { + "epoch": 0.76, + "grad_norm": 0.34619152479873244, + "learning_rate": 2.9020084966211913e-06, + "loss": 0.2813, + "step": 13205 + }, + { + "epoch": 0.76, + "grad_norm": 0.2818608501668898, + "learning_rate": 2.900697785024459e-06, + "loss": 0.1854, + "step": 13206 + }, + { + "epoch": 0.76, + "grad_norm": 0.2772904606806348, + "learning_rate": 2.89938731927626e-06, + "loss": 0.2237, + "step": 13207 + }, + { + "epoch": 0.76, + "grad_norm": 0.3456570326485675, + "learning_rate": 2.8980770994219743e-06, + "loss": 0.2759, + "step": 13208 + }, + { + "epoch": 0.76, + "grad_norm": 0.6722228995159082, + "learning_rate": 2.8967671255069717e-06, + "loss": 0.3808, + "step": 13209 + }, + { + "epoch": 0.76, + "grad_norm": 0.5544282846657669, + "learning_rate": 2.8954573975766156e-06, + "loss": 0.1777, + "step": 13210 + }, + { + "epoch": 0.76, + "grad_norm": 0.33520447957453975, + "learning_rate": 2.8941479156762675e-06, + "loss": 0.2917, + "step": 13211 + }, + { + "epoch": 0.76, + "grad_norm": 0.23424922457156772, + "learning_rate": 2.892838679851272e-06, + "loss": 0.1917, + "step": 13212 + }, + { + "epoch": 0.76, + "grad_norm": 0.3888829363306553, + "learning_rate": 2.891529690146966e-06, + "loss": 0.2193, + "step": 13213 + }, + { + "epoch": 0.76, + "grad_norm": 0.39275318593597286, + "learning_rate": 2.8902209466086794e-06, + "loss": 0.3079, + "step": 13214 + }, + { + "epoch": 0.76, + "grad_norm": 0.6074206260942908, + "learning_rate": 2.8889124492817377e-06, + "loss": 0.3461, + "step": 13215 + }, + { + "epoch": 0.76, + "grad_norm": 0.4121332734309223, + "learning_rate": 2.887604198211453e-06, + "loss": 0.2094, + "step": 13216 + }, + { + "epoch": 0.76, + "grad_norm": 0.4086837165721845, + "learning_rate": 2.886296193443129e-06, + "loss": 0.2966, + "step": 13217 + }, + { + "epoch": 0.76, + "grad_norm": 0.31241744718481923, + "learning_rate": 2.8849884350220614e-06, + "loss": 0.2266, + "step": 13218 + }, + { + "epoch": 0.76, + "grad_norm": 0.3312463483694761, + "learning_rate": 2.883680922993536e-06, + "loss": 0.2859, + "step": 13219 + }, + { + "epoch": 0.76, + "grad_norm": 0.3602296698578734, + "learning_rate": 2.882373657402836e-06, + "loss": 0.2236, + "step": 13220 + }, + { + "epoch": 0.76, + "grad_norm": 0.6188669949771659, + "learning_rate": 2.8810666382952314e-06, + "loss": 0.3728, + "step": 13221 + }, + { + "epoch": 0.76, + "grad_norm": 1.3765443199588423, + "learning_rate": 2.879759865715982e-06, + "loss": 0.5777, + "step": 13222 + }, + { + "epoch": 0.76, + "grad_norm": 0.24491184844570574, + "learning_rate": 2.87845333971034e-06, + "loss": 0.2082, + "step": 13223 + }, + { + "epoch": 0.76, + "grad_norm": 0.31949829603425767, + "learning_rate": 2.877147060323555e-06, + "loss": 0.2706, + "step": 13224 + }, + { + "epoch": 0.76, + "grad_norm": 0.8517481211049055, + "learning_rate": 2.875841027600862e-06, + "loss": 0.4386, + "step": 13225 + }, + { + "epoch": 0.76, + "grad_norm": 0.2866011692470818, + "learning_rate": 2.8745352415874872e-06, + "loss": 0.2287, + "step": 13226 + }, + { + "epoch": 0.76, + "grad_norm": 0.3154714117522481, + "learning_rate": 2.873229702328647e-06, + "loss": 0.251, + "step": 13227 + }, + { + "epoch": 0.76, + "grad_norm": 0.47557481422269005, + "learning_rate": 2.8719244098695597e-06, + "loss": 0.3048, + "step": 13228 + }, + { + "epoch": 0.76, + "grad_norm": 0.28793328291945003, + "learning_rate": 2.8706193642554237e-06, + "loss": 0.1866, + "step": 13229 + }, + { + "epoch": 0.76, + "grad_norm": 1.1838012367861193, + "learning_rate": 2.8693145655314327e-06, + "loss": 0.6212, + "step": 13230 + }, + { + "epoch": 0.76, + "grad_norm": 0.3389412390749291, + "learning_rate": 2.86801001374277e-06, + "loss": 0.284, + "step": 13231 + }, + { + "epoch": 0.76, + "grad_norm": 0.3654452826305494, + "learning_rate": 2.8667057089346127e-06, + "loss": 0.2889, + "step": 13232 + }, + { + "epoch": 0.76, + "grad_norm": 0.14945398474158922, + "learning_rate": 2.865401651152132e-06, + "loss": 0.0971, + "step": 13233 + }, + { + "epoch": 0.76, + "grad_norm": 0.8662144879164575, + "learning_rate": 2.864097840440485e-06, + "loss": 0.3408, + "step": 13234 + }, + { + "epoch": 0.76, + "grad_norm": 0.33595790732355874, + "learning_rate": 2.8627942768448234e-06, + "loss": 0.2453, + "step": 13235 + }, + { + "epoch": 0.76, + "grad_norm": 0.34996311712384964, + "learning_rate": 2.861490960410289e-06, + "loss": 0.2498, + "step": 13236 + }, + { + "epoch": 0.76, + "grad_norm": 0.4919206950943628, + "learning_rate": 2.8601878911820168e-06, + "loss": 0.3234, + "step": 13237 + }, + { + "epoch": 0.76, + "grad_norm": 0.3451700720073584, + "learning_rate": 2.8588850692051296e-06, + "loss": 0.2718, + "step": 13238 + }, + { + "epoch": 0.76, + "grad_norm": 0.1936044629355527, + "learning_rate": 2.857582494524742e-06, + "loss": 0.1738, + "step": 13239 + }, + { + "epoch": 0.76, + "grad_norm": 1.196637382273675, + "learning_rate": 2.8562801671859697e-06, + "loss": 0.5177, + "step": 13240 + }, + { + "epoch": 0.76, + "grad_norm": 0.3138689667270411, + "learning_rate": 2.8549780872339073e-06, + "loss": 0.2528, + "step": 13241 + }, + { + "epoch": 0.76, + "grad_norm": 0.7183731018697147, + "learning_rate": 2.8536762547136464e-06, + "loss": 0.3813, + "step": 13242 + }, + { + "epoch": 0.76, + "grad_norm": 0.3149057673584101, + "learning_rate": 2.85237466967027e-06, + "loss": 0.2506, + "step": 13243 + }, + { + "epoch": 0.76, + "grad_norm": 0.31630528529321505, + "learning_rate": 2.851073332148848e-06, + "loss": 0.2517, + "step": 13244 + }, + { + "epoch": 0.76, + "grad_norm": 0.4393966081472135, + "learning_rate": 2.849772242194453e-06, + "loss": 0.2605, + "step": 13245 + }, + { + "epoch": 0.76, + "grad_norm": 0.3954421837785276, + "learning_rate": 2.8484713998521364e-06, + "loss": 0.1002, + "step": 13246 + }, + { + "epoch": 0.76, + "grad_norm": 0.25876846616548765, + "learning_rate": 2.847170805166949e-06, + "loss": 0.2416, + "step": 13247 + }, + { + "epoch": 0.76, + "grad_norm": 0.47757103299646225, + "learning_rate": 2.8458704581839247e-06, + "loss": 0.3262, + "step": 13248 + }, + { + "epoch": 0.76, + "grad_norm": 0.9076141858611277, + "learning_rate": 2.844570358948103e-06, + "loss": 0.3371, + "step": 13249 + }, + { + "epoch": 0.76, + "grad_norm": 0.3070964861074088, + "learning_rate": 2.843270507504502e-06, + "loss": 0.2474, + "step": 13250 + }, + { + "epoch": 0.76, + "grad_norm": 0.37473028523472246, + "learning_rate": 2.8419709038981345e-06, + "loss": 0.2904, + "step": 13251 + }, + { + "epoch": 0.76, + "grad_norm": 0.1318432605062941, + "learning_rate": 2.840671548174004e-06, + "loss": 0.0704, + "step": 13252 + }, + { + "epoch": 0.76, + "grad_norm": 0.39967919119913864, + "learning_rate": 2.8393724403771137e-06, + "loss": 0.2531, + "step": 13253 + }, + { + "epoch": 0.76, + "grad_norm": 1.0046598432534393, + "learning_rate": 2.8380735805524475e-06, + "loss": 0.3925, + "step": 13254 + }, + { + "epoch": 0.76, + "grad_norm": 0.3777098827872954, + "learning_rate": 2.8367749687449853e-06, + "loss": 0.334, + "step": 13255 + }, + { + "epoch": 0.76, + "grad_norm": 0.3147497604198872, + "learning_rate": 2.835476604999695e-06, + "loss": 0.1906, + "step": 13256 + }, + { + "epoch": 0.76, + "grad_norm": 0.4294056542690755, + "learning_rate": 2.8341784893615443e-06, + "loss": 0.2689, + "step": 13257 + }, + { + "epoch": 0.76, + "grad_norm": 0.43842450172880576, + "learning_rate": 2.8328806218754855e-06, + "loss": 0.2174, + "step": 13258 + }, + { + "epoch": 0.76, + "grad_norm": 0.25153714414036416, + "learning_rate": 2.831583002586461e-06, + "loss": 0.2005, + "step": 13259 + }, + { + "epoch": 0.76, + "grad_norm": 0.5546683602382102, + "learning_rate": 2.83028563153941e-06, + "loss": 0.3272, + "step": 13260 + }, + { + "epoch": 0.76, + "grad_norm": 1.3431270553899364, + "learning_rate": 2.8289885087792557e-06, + "loss": 0.6881, + "step": 13261 + }, + { + "epoch": 0.76, + "grad_norm": 0.2682221801490365, + "learning_rate": 2.827691634350924e-06, + "loss": 0.2002, + "step": 13262 + }, + { + "epoch": 0.76, + "grad_norm": 0.3618717416925984, + "learning_rate": 2.826395008299323e-06, + "loss": 0.29, + "step": 13263 + }, + { + "epoch": 0.76, + "grad_norm": 0.3130034952304038, + "learning_rate": 2.8250986306693553e-06, + "loss": 0.1539, + "step": 13264 + }, + { + "epoch": 0.76, + "grad_norm": 0.3121843996017311, + "learning_rate": 2.823802501505909e-06, + "loss": 0.2223, + "step": 13265 + }, + { + "epoch": 0.76, + "grad_norm": 0.7111069282689343, + "learning_rate": 2.8225066208538765e-06, + "loss": 0.4252, + "step": 13266 + }, + { + "epoch": 0.76, + "grad_norm": 0.3587339369972459, + "learning_rate": 2.821210988758132e-06, + "loss": 0.3243, + "step": 13267 + }, + { + "epoch": 0.76, + "grad_norm": 0.31053986252393684, + "learning_rate": 2.8199156052635412e-06, + "loss": 0.2829, + "step": 13268 + }, + { + "epoch": 0.76, + "grad_norm": 1.3607437392178636, + "learning_rate": 2.8186204704149643e-06, + "loss": 0.2799, + "step": 13269 + }, + { + "epoch": 0.76, + "grad_norm": 0.259545723920443, + "learning_rate": 2.817325584257252e-06, + "loss": 0.2053, + "step": 13270 + }, + { + "epoch": 0.76, + "grad_norm": 0.36841259735356224, + "learning_rate": 2.8160309468352465e-06, + "loss": 0.3017, + "step": 13271 + }, + { + "epoch": 0.76, + "grad_norm": 0.38905719201812056, + "learning_rate": 2.81473655819378e-06, + "loss": 0.2515, + "step": 13272 + }, + { + "epoch": 0.76, + "grad_norm": 1.0396985241949528, + "learning_rate": 2.813442418377674e-06, + "loss": 0.7514, + "step": 13273 + }, + { + "epoch": 0.76, + "grad_norm": 0.39836820205825924, + "learning_rate": 2.812148527431752e-06, + "loss": 0.2428, + "step": 13274 + }, + { + "epoch": 0.76, + "grad_norm": 0.305449725807176, + "learning_rate": 2.8108548854008166e-06, + "loss": 0.242, + "step": 13275 + }, + { + "epoch": 0.76, + "grad_norm": 0.44681122919529676, + "learning_rate": 2.8095614923296676e-06, + "loss": 0.2259, + "step": 13276 + }, + { + "epoch": 0.76, + "grad_norm": 0.3816614658218707, + "learning_rate": 2.8082683482630912e-06, + "loss": 0.2823, + "step": 13277 + }, + { + "epoch": 0.76, + "grad_norm": 0.2946987437084205, + "learning_rate": 2.806975453245877e-06, + "loss": 0.2036, + "step": 13278 + }, + { + "epoch": 0.76, + "grad_norm": 0.49942919812183445, + "learning_rate": 2.8056828073227925e-06, + "loss": 0.3453, + "step": 13279 + }, + { + "epoch": 0.76, + "grad_norm": 0.40850999837166446, + "learning_rate": 2.804390410538603e-06, + "loss": 0.3256, + "step": 13280 + }, + { + "epoch": 0.76, + "grad_norm": 0.5696317559539408, + "learning_rate": 2.803098262938062e-06, + "loss": 0.3323, + "step": 13281 + }, + { + "epoch": 0.76, + "grad_norm": 0.4290707314341114, + "learning_rate": 2.801806364565921e-06, + "loss": 0.2762, + "step": 13282 + }, + { + "epoch": 0.76, + "grad_norm": 0.26581716980266745, + "learning_rate": 2.8005147154669166e-06, + "loss": 0.2421, + "step": 13283 + }, + { + "epoch": 0.76, + "grad_norm": 0.299496740652704, + "learning_rate": 2.7992233156857784e-06, + "loss": 0.1897, + "step": 13284 + }, + { + "epoch": 0.76, + "grad_norm": 1.0457263308117586, + "learning_rate": 2.7979321652672266e-06, + "loss": 0.5193, + "step": 13285 + }, + { + "epoch": 0.76, + "grad_norm": 0.3313512064073587, + "learning_rate": 2.79664126425597e-06, + "loss": 0.2552, + "step": 13286 + }, + { + "epoch": 0.76, + "grad_norm": 0.3630065685033784, + "learning_rate": 2.795350612696721e-06, + "loss": 0.276, + "step": 13287 + }, + { + "epoch": 0.76, + "grad_norm": 0.6448201348910892, + "learning_rate": 2.794060210634171e-06, + "loss": 0.2862, + "step": 13288 + }, + { + "epoch": 0.76, + "grad_norm": 0.6742071686259051, + "learning_rate": 2.7927700581130046e-06, + "loss": 0.3216, + "step": 13289 + }, + { + "epoch": 0.76, + "grad_norm": 0.2330554908362521, + "learning_rate": 2.7914801551778994e-06, + "loss": 0.2058, + "step": 13290 + }, + { + "epoch": 0.76, + "grad_norm": 0.31746453612708525, + "learning_rate": 2.7901905018735287e-06, + "loss": 0.2666, + "step": 13291 + }, + { + "epoch": 0.76, + "grad_norm": 0.6076098034746343, + "learning_rate": 2.7889010982445508e-06, + "loss": 0.3607, + "step": 13292 + }, + { + "epoch": 0.76, + "grad_norm": 0.3652652449404285, + "learning_rate": 2.7876119443356177e-06, + "loss": 0.3044, + "step": 13293 + }, + { + "epoch": 0.76, + "grad_norm": 0.441685084264975, + "learning_rate": 2.7863230401913698e-06, + "loss": 0.3271, + "step": 13294 + }, + { + "epoch": 0.76, + "grad_norm": 0.28157585778291394, + "learning_rate": 2.7850343858564487e-06, + "loss": 0.1958, + "step": 13295 + }, + { + "epoch": 0.76, + "grad_norm": 0.2538193604013508, + "learning_rate": 2.7837459813754765e-06, + "loss": 0.1991, + "step": 13296 + }, + { + "epoch": 0.76, + "grad_norm": 1.1707178137167724, + "learning_rate": 2.782457826793069e-06, + "loss": 0.6914, + "step": 13297 + }, + { + "epoch": 0.76, + "grad_norm": 0.4149696440496968, + "learning_rate": 2.781169922153838e-06, + "loss": 0.1949, + "step": 13298 + }, + { + "epoch": 0.76, + "grad_norm": 0.2861185610671488, + "learning_rate": 2.7798822675023795e-06, + "loss": 0.2823, + "step": 13299 + }, + { + "epoch": 0.76, + "grad_norm": 0.7178983508096588, + "learning_rate": 2.7785948628832904e-06, + "loss": 0.3858, + "step": 13300 + }, + { + "epoch": 0.76, + "grad_norm": 0.4103924325196549, + "learning_rate": 2.7773077083411502e-06, + "loss": 0.1833, + "step": 13301 + }, + { + "epoch": 0.76, + "grad_norm": 0.244686535238677, + "learning_rate": 2.776020803920533e-06, + "loss": 0.1998, + "step": 13302 + }, + { + "epoch": 0.76, + "grad_norm": 0.355293710352619, + "learning_rate": 2.774734149666005e-06, + "loss": 0.3177, + "step": 13303 + }, + { + "epoch": 0.76, + "grad_norm": 0.2862677955158507, + "learning_rate": 2.773447745622123e-06, + "loss": 0.2068, + "step": 13304 + }, + { + "epoch": 0.76, + "grad_norm": 0.6852767845171588, + "learning_rate": 2.7721615918334355e-06, + "loss": 0.4025, + "step": 13305 + }, + { + "epoch": 0.76, + "grad_norm": 0.36885958549727094, + "learning_rate": 2.7708756883444776e-06, + "loss": 0.297, + "step": 13306 + }, + { + "epoch": 0.76, + "grad_norm": 0.5868284645412355, + "learning_rate": 2.7695900351997864e-06, + "loss": 0.3458, + "step": 13307 + }, + { + "epoch": 0.76, + "grad_norm": 0.27339180895296544, + "learning_rate": 2.7683046324438822e-06, + "loss": 0.1766, + "step": 13308 + }, + { + "epoch": 0.76, + "grad_norm": 0.36055577891955803, + "learning_rate": 2.7670194801212768e-06, + "loss": 0.2466, + "step": 13309 + }, + { + "epoch": 0.76, + "grad_norm": 0.41817920256675367, + "learning_rate": 2.7657345782764765e-06, + "loss": 0.3027, + "step": 13310 + }, + { + "epoch": 0.76, + "grad_norm": 0.2847934263341108, + "learning_rate": 2.7644499269539728e-06, + "loss": 0.242, + "step": 13311 + }, + { + "epoch": 0.76, + "grad_norm": 0.9291208397012233, + "learning_rate": 2.7631655261982605e-06, + "loss": 0.4407, + "step": 13312 + }, + { + "epoch": 0.76, + "grad_norm": 0.7871185563291784, + "learning_rate": 2.7618813760538145e-06, + "loss": 0.3596, + "step": 13313 + }, + { + "epoch": 0.76, + "grad_norm": 0.2395530050943581, + "learning_rate": 2.7605974765651057e-06, + "loss": 0.2066, + "step": 13314 + }, + { + "epoch": 0.77, + "grad_norm": 0.43243853087168727, + "learning_rate": 2.759313827776592e-06, + "loss": 0.2994, + "step": 13315 + }, + { + "epoch": 0.77, + "grad_norm": 0.499861531513214, + "learning_rate": 2.758030429732732e-06, + "loss": 0.3264, + "step": 13316 + }, + { + "epoch": 0.77, + "grad_norm": 0.4019932945242539, + "learning_rate": 2.7567472824779663e-06, + "loss": 0.2627, + "step": 13317 + }, + { + "epoch": 0.77, + "grad_norm": 0.35582583706274945, + "learning_rate": 2.7554643860567308e-06, + "loss": 0.2845, + "step": 13318 + }, + { + "epoch": 0.77, + "grad_norm": 0.5481213981526601, + "learning_rate": 2.75418174051345e-06, + "loss": 0.2395, + "step": 13319 + }, + { + "epoch": 0.77, + "grad_norm": 0.41611532347043945, + "learning_rate": 2.7528993458925457e-06, + "loss": 0.3025, + "step": 13320 + }, + { + "epoch": 0.77, + "grad_norm": 0.3799576843178077, + "learning_rate": 2.751617202238427e-06, + "loss": 0.1868, + "step": 13321 + }, + { + "epoch": 0.77, + "grad_norm": 0.2908916015181389, + "learning_rate": 2.750335309595491e-06, + "loss": 0.2636, + "step": 13322 + }, + { + "epoch": 0.77, + "grad_norm": 0.43567171245253916, + "learning_rate": 2.7490536680081325e-06, + "loss": 0.2941, + "step": 13323 + }, + { + "epoch": 0.77, + "grad_norm": 0.4234412711982623, + "learning_rate": 2.7477722775207303e-06, + "loss": 0.1738, + "step": 13324 + }, + { + "epoch": 0.77, + "grad_norm": 0.5576484292752899, + "learning_rate": 2.746491138177666e-06, + "loss": 0.2826, + "step": 13325 + }, + { + "epoch": 0.77, + "grad_norm": 0.3903393434473755, + "learning_rate": 2.745210250023301e-06, + "loss": 0.3149, + "step": 13326 + }, + { + "epoch": 0.77, + "grad_norm": 0.34900301912911175, + "learning_rate": 2.743929613101993e-06, + "loss": 0.2765, + "step": 13327 + }, + { + "epoch": 0.77, + "grad_norm": 0.8648744675226305, + "learning_rate": 2.7426492274580883e-06, + "loss": 0.3671, + "step": 13328 + }, + { + "epoch": 0.77, + "grad_norm": 0.3467725051148224, + "learning_rate": 2.7413690931359316e-06, + "loss": 0.3054, + "step": 13329 + }, + { + "epoch": 0.77, + "grad_norm": 0.22494972847791722, + "learning_rate": 2.7400892101798504e-06, + "loss": 0.2063, + "step": 13330 + }, + { + "epoch": 0.77, + "grad_norm": 0.7541721363633993, + "learning_rate": 2.7388095786341682e-06, + "loss": 0.1194, + "step": 13331 + }, + { + "epoch": 0.77, + "grad_norm": 0.36934648699727984, + "learning_rate": 2.7375301985431947e-06, + "loss": 0.2673, + "step": 13332 + }, + { + "epoch": 0.77, + "grad_norm": 0.8084421696547506, + "learning_rate": 2.736251069951241e-06, + "loss": 0.4386, + "step": 13333 + }, + { + "epoch": 0.77, + "grad_norm": 0.29545160567203277, + "learning_rate": 2.734972192902601e-06, + "loss": 0.2284, + "step": 13334 + }, + { + "epoch": 0.77, + "grad_norm": 0.36633179660452714, + "learning_rate": 2.733693567441561e-06, + "loss": 0.3288, + "step": 13335 + }, + { + "epoch": 0.77, + "grad_norm": 0.24642775837098382, + "learning_rate": 2.732415193612401e-06, + "loss": 0.1621, + "step": 13336 + }, + { + "epoch": 0.77, + "grad_norm": 0.5111131791764394, + "learning_rate": 2.73113707145939e-06, + "loss": 0.1199, + "step": 13337 + }, + { + "epoch": 0.77, + "grad_norm": 0.37770395293264974, + "learning_rate": 2.7298592010267887e-06, + "loss": 0.3027, + "step": 13338 + }, + { + "epoch": 0.77, + "grad_norm": 0.3905628714529268, + "learning_rate": 2.7285815823588513e-06, + "loss": 0.2994, + "step": 13339 + }, + { + "epoch": 0.77, + "grad_norm": 0.5113595561516364, + "learning_rate": 2.7273042154998188e-06, + "loss": 0.2487, + "step": 13340 + }, + { + "epoch": 0.77, + "grad_norm": 0.40329210177460667, + "learning_rate": 2.726027100493931e-06, + "loss": 0.3088, + "step": 13341 + }, + { + "epoch": 0.77, + "grad_norm": 0.2326471863617773, + "learning_rate": 2.724750237385412e-06, + "loss": 0.2229, + "step": 13342 + }, + { + "epoch": 0.77, + "grad_norm": 1.0041471856553938, + "learning_rate": 2.723473626218479e-06, + "loss": 0.4367, + "step": 13343 + }, + { + "epoch": 0.77, + "grad_norm": 0.3045613574682275, + "learning_rate": 2.722197267037339e-06, + "loss": 0.1968, + "step": 13344 + }, + { + "epoch": 0.77, + "grad_norm": 0.5701953085284389, + "learning_rate": 2.7209211598861975e-06, + "loss": 0.3772, + "step": 13345 + }, + { + "epoch": 0.77, + "grad_norm": 0.3510620371677265, + "learning_rate": 2.719645304809242e-06, + "loss": 0.294, + "step": 13346 + }, + { + "epoch": 0.77, + "grad_norm": 0.2921036385897743, + "learning_rate": 2.7183697018506584e-06, + "loss": 0.2072, + "step": 13347 + }, + { + "epoch": 0.77, + "grad_norm": 0.2440216161862196, + "learning_rate": 2.7170943510546177e-06, + "loss": 0.1587, + "step": 13348 + }, + { + "epoch": 0.77, + "grad_norm": 0.822130467598231, + "learning_rate": 2.715819252465284e-06, + "loss": 0.5094, + "step": 13349 + }, + { + "epoch": 0.77, + "grad_norm": 0.24005749395345125, + "learning_rate": 2.714544406126819e-06, + "loss": 0.2119, + "step": 13350 + }, + { + "epoch": 0.77, + "grad_norm": 0.6209963089409438, + "learning_rate": 2.713269812083369e-06, + "loss": 0.3862, + "step": 13351 + }, + { + "epoch": 0.77, + "grad_norm": 1.0539666138783272, + "learning_rate": 2.711995470379071e-06, + "loss": 0.6506, + "step": 13352 + }, + { + "epoch": 0.77, + "grad_norm": 0.33989522147319307, + "learning_rate": 2.7107213810580536e-06, + "loss": 0.1925, + "step": 13353 + }, + { + "epoch": 0.77, + "grad_norm": 0.2547793738056594, + "learning_rate": 2.709447544164444e-06, + "loss": 0.2418, + "step": 13354 + }, + { + "epoch": 0.77, + "grad_norm": 0.3889272161112719, + "learning_rate": 2.708173959742353e-06, + "loss": 0.2009, + "step": 13355 + }, + { + "epoch": 0.77, + "grad_norm": 0.36124339525567295, + "learning_rate": 2.7069006278358844e-06, + "loss": 0.2852, + "step": 13356 + }, + { + "epoch": 0.77, + "grad_norm": 0.866977479010928, + "learning_rate": 2.70562754848913e-06, + "loss": 0.29, + "step": 13357 + }, + { + "epoch": 0.77, + "grad_norm": 0.32296997474230627, + "learning_rate": 2.704354721746183e-06, + "loss": 0.2969, + "step": 13358 + }, + { + "epoch": 0.77, + "grad_norm": 0.34896980548098877, + "learning_rate": 2.703082147651118e-06, + "loss": 0.2693, + "step": 13359 + }, + { + "epoch": 0.77, + "grad_norm": 0.2370600806689802, + "learning_rate": 2.7018098262480053e-06, + "loss": 0.103, + "step": 13360 + }, + { + "epoch": 0.77, + "grad_norm": 0.5435084819332936, + "learning_rate": 2.700537757580901e-06, + "loss": 0.3418, + "step": 13361 + }, + { + "epoch": 0.77, + "grad_norm": 0.31758347380995683, + "learning_rate": 2.699265941693863e-06, + "loss": 0.2566, + "step": 13362 + }, + { + "epoch": 0.77, + "grad_norm": 0.4792746559962163, + "learning_rate": 2.6979943786309315e-06, + "loss": 0.2685, + "step": 13363 + }, + { + "epoch": 0.77, + "grad_norm": 1.2601019771202395, + "learning_rate": 2.6967230684361413e-06, + "loss": 0.7461, + "step": 13364 + }, + { + "epoch": 0.77, + "grad_norm": 0.3264807957462211, + "learning_rate": 2.6954520111535166e-06, + "loss": 0.251, + "step": 13365 + }, + { + "epoch": 0.77, + "grad_norm": 0.23864470808248076, + "learning_rate": 2.694181206827071e-06, + "loss": 0.1862, + "step": 13366 + }, + { + "epoch": 0.77, + "grad_norm": 0.6936911184017519, + "learning_rate": 2.69291065550082e-06, + "loss": 0.3615, + "step": 13367 + }, + { + "epoch": 0.77, + "grad_norm": 0.311294629785046, + "learning_rate": 2.691640357218759e-06, + "loss": 0.2626, + "step": 13368 + }, + { + "epoch": 0.77, + "grad_norm": 1.1184392357841415, + "learning_rate": 2.690370312024878e-06, + "loss": 0.4929, + "step": 13369 + }, + { + "epoch": 0.77, + "grad_norm": 0.32713334536191796, + "learning_rate": 2.6891005199631558e-06, + "loss": 0.2735, + "step": 13370 + }, + { + "epoch": 0.77, + "grad_norm": 0.3248615650269381, + "learning_rate": 2.6878309810775738e-06, + "loss": 0.2617, + "step": 13371 + }, + { + "epoch": 0.77, + "grad_norm": 1.4369216717341033, + "learning_rate": 2.6865616954120878e-06, + "loss": 0.5723, + "step": 13372 + }, + { + "epoch": 0.77, + "grad_norm": 0.5156635728031603, + "learning_rate": 2.6852926630106558e-06, + "loss": 0.2641, + "step": 13373 + }, + { + "epoch": 0.77, + "grad_norm": 0.3373075279153563, + "learning_rate": 2.6840238839172206e-06, + "loss": 0.2585, + "step": 13374 + }, + { + "epoch": 0.77, + "grad_norm": 0.36128908717495617, + "learning_rate": 2.682755358175728e-06, + "loss": 0.1984, + "step": 13375 + }, + { + "epoch": 0.77, + "grad_norm": 0.71767892274232, + "learning_rate": 2.6814870858301013e-06, + "loss": 0.2892, + "step": 13376 + }, + { + "epoch": 0.77, + "grad_norm": 0.36937216939202855, + "learning_rate": 2.6802190669242634e-06, + "loss": 0.2787, + "step": 13377 + }, + { + "epoch": 0.77, + "grad_norm": 0.3642005381982544, + "learning_rate": 2.6789513015021207e-06, + "loss": 0.2875, + "step": 13378 + }, + { + "epoch": 0.77, + "grad_norm": 0.44540040503421996, + "learning_rate": 2.6776837896075824e-06, + "loss": 0.2292, + "step": 13379 + }, + { + "epoch": 0.77, + "grad_norm": 0.33180184811006774, + "learning_rate": 2.6764165312845402e-06, + "loss": 0.2614, + "step": 13380 + }, + { + "epoch": 0.77, + "grad_norm": 0.32777961922851084, + "learning_rate": 2.675149526576879e-06, + "loss": 0.2594, + "step": 13381 + }, + { + "epoch": 0.77, + "grad_norm": 0.4898636512120215, + "learning_rate": 2.67388277552847e-06, + "loss": 0.3877, + "step": 13382 + }, + { + "epoch": 0.77, + "grad_norm": 0.2721793650080163, + "learning_rate": 2.67261627818319e-06, + "loss": 0.197, + "step": 13383 + }, + { + "epoch": 0.77, + "grad_norm": 0.7355116149856278, + "learning_rate": 2.671350034584893e-06, + "loss": 0.3658, + "step": 13384 + }, + { + "epoch": 0.77, + "grad_norm": 0.47438055021689923, + "learning_rate": 2.670084044777429e-06, + "loss": 0.3412, + "step": 13385 + }, + { + "epoch": 0.77, + "grad_norm": 0.2641302345028114, + "learning_rate": 2.668818308804636e-06, + "loss": 0.2146, + "step": 13386 + }, + { + "epoch": 0.77, + "grad_norm": 0.25613544160514196, + "learning_rate": 2.6675528267103534e-06, + "loss": 0.1642, + "step": 13387 + }, + { + "epoch": 0.77, + "grad_norm": 1.011992324523561, + "learning_rate": 2.6662875985384007e-06, + "loss": 0.6767, + "step": 13388 + }, + { + "epoch": 0.77, + "grad_norm": 0.3127942387817402, + "learning_rate": 2.665022624332593e-06, + "loss": 0.2063, + "step": 13389 + }, + { + "epoch": 0.77, + "grad_norm": 0.3537706267658949, + "learning_rate": 2.6637579041367357e-06, + "loss": 0.2924, + "step": 13390 + }, + { + "epoch": 0.77, + "grad_norm": 0.71475916044793, + "learning_rate": 2.6624934379946243e-06, + "loss": 0.3884, + "step": 13391 + }, + { + "epoch": 0.77, + "grad_norm": 0.21694765840296173, + "learning_rate": 2.661229225950054e-06, + "loss": 0.1554, + "step": 13392 + }, + { + "epoch": 0.77, + "grad_norm": 0.35457329355787714, + "learning_rate": 2.659965268046798e-06, + "loss": 0.2834, + "step": 13393 + }, + { + "epoch": 0.77, + "grad_norm": 0.4947678076948455, + "learning_rate": 2.6587015643286295e-06, + "loss": 0.3978, + "step": 13394 + }, + { + "epoch": 0.77, + "grad_norm": 0.5838803034932754, + "learning_rate": 2.657438114839308e-06, + "loss": 0.3296, + "step": 13395 + }, + { + "epoch": 0.77, + "grad_norm": 0.3963018840650698, + "learning_rate": 2.6561749196225915e-06, + "loss": 0.2608, + "step": 13396 + }, + { + "epoch": 0.77, + "grad_norm": 0.3411749326825493, + "learning_rate": 2.654911978722222e-06, + "loss": 0.2992, + "step": 13397 + }, + { + "epoch": 0.77, + "grad_norm": 0.4121763965343542, + "learning_rate": 2.6536492921819346e-06, + "loss": 0.2847, + "step": 13398 + }, + { + "epoch": 0.77, + "grad_norm": 0.23315859992533955, + "learning_rate": 2.6523868600454526e-06, + "loss": 0.143, + "step": 13399 + }, + { + "epoch": 0.77, + "grad_norm": 1.2290226217364666, + "learning_rate": 2.6511246823565016e-06, + "loss": 0.6549, + "step": 13400 + }, + { + "epoch": 0.77, + "grad_norm": 0.48684221439801073, + "learning_rate": 2.649862759158787e-06, + "loss": 0.2985, + "step": 13401 + }, + { + "epoch": 0.77, + "grad_norm": 0.27189307366780724, + "learning_rate": 2.648601090496008e-06, + "loss": 0.2455, + "step": 13402 + }, + { + "epoch": 0.77, + "grad_norm": 1.2851418916503219, + "learning_rate": 2.6473396764118575e-06, + "loss": 0.5728, + "step": 13403 + }, + { + "epoch": 0.77, + "grad_norm": 0.3704241967181858, + "learning_rate": 2.646078516950018e-06, + "loss": 0.2355, + "step": 13404 + }, + { + "epoch": 0.77, + "grad_norm": 0.32679798644361935, + "learning_rate": 2.6448176121541634e-06, + "loss": 0.2539, + "step": 13405 + }, + { + "epoch": 0.77, + "grad_norm": 0.42009343086991197, + "learning_rate": 2.643556962067958e-06, + "loss": 0.2625, + "step": 13406 + }, + { + "epoch": 0.77, + "grad_norm": 0.3364463651284445, + "learning_rate": 2.6422965667350566e-06, + "loss": 0.2599, + "step": 13407 + }, + { + "epoch": 0.77, + "grad_norm": 0.4603678526160474, + "learning_rate": 2.6410364261991108e-06, + "loss": 0.2815, + "step": 13408 + }, + { + "epoch": 0.77, + "grad_norm": 0.34026034801807636, + "learning_rate": 2.6397765405037577e-06, + "loss": 0.2485, + "step": 13409 + }, + { + "epoch": 0.77, + "grad_norm": 0.32456868640721753, + "learning_rate": 2.6385169096926265e-06, + "loss": 0.2346, + "step": 13410 + }, + { + "epoch": 0.77, + "grad_norm": 0.49966563733385355, + "learning_rate": 2.637257533809334e-06, + "loss": 0.2526, + "step": 13411 + }, + { + "epoch": 0.77, + "grad_norm": 0.6283074877544407, + "learning_rate": 2.6359984128975013e-06, + "loss": 0.33, + "step": 13412 + }, + { + "epoch": 0.77, + "grad_norm": 0.36892761734499885, + "learning_rate": 2.6347395470007254e-06, + "loss": 0.2784, + "step": 13413 + }, + { + "epoch": 0.77, + "grad_norm": 0.27568567981994324, + "learning_rate": 2.6334809361626034e-06, + "loss": 0.2485, + "step": 13414 + }, + { + "epoch": 0.77, + "grad_norm": 1.3477396319559145, + "learning_rate": 2.632222580426719e-06, + "loss": 0.2197, + "step": 13415 + }, + { + "epoch": 0.77, + "grad_norm": 0.5419492347153053, + "learning_rate": 2.6309644798366474e-06, + "loss": 0.2762, + "step": 13416 + }, + { + "epoch": 0.77, + "grad_norm": 0.3174658239757126, + "learning_rate": 2.6297066344359612e-06, + "loss": 0.2882, + "step": 13417 + }, + { + "epoch": 0.77, + "grad_norm": 0.47016789004810144, + "learning_rate": 2.6284490442682186e-06, + "loss": 0.3389, + "step": 13418 + }, + { + "epoch": 0.77, + "grad_norm": 0.3946797301594869, + "learning_rate": 2.6271917093769673e-06, + "loss": 0.2183, + "step": 13419 + }, + { + "epoch": 0.77, + "grad_norm": 0.23602412255899952, + "learning_rate": 2.6259346298057476e-06, + "loss": 0.1842, + "step": 13420 + }, + { + "epoch": 0.77, + "grad_norm": 0.34069892961807774, + "learning_rate": 2.6246778055980983e-06, + "loss": 0.2678, + "step": 13421 + }, + { + "epoch": 0.77, + "grad_norm": 0.5221130511938119, + "learning_rate": 2.6234212367975375e-06, + "loss": 0.1991, + "step": 13422 + }, + { + "epoch": 0.77, + "grad_norm": 0.6077899342263603, + "learning_rate": 2.6221649234475845e-06, + "loss": 0.4096, + "step": 13423 + }, + { + "epoch": 0.77, + "grad_norm": 0.6059573421941735, + "learning_rate": 2.620908865591738e-06, + "loss": 0.3912, + "step": 13424 + }, + { + "epoch": 0.77, + "grad_norm": 0.25779870510643993, + "learning_rate": 2.619653063273504e-06, + "loss": 0.2187, + "step": 13425 + }, + { + "epoch": 0.77, + "grad_norm": 0.3211535151394236, + "learning_rate": 2.618397516536367e-06, + "loss": 0.2455, + "step": 13426 + }, + { + "epoch": 0.77, + "grad_norm": 0.42875680879462036, + "learning_rate": 2.6171422254238067e-06, + "loss": 0.2418, + "step": 13427 + }, + { + "epoch": 0.77, + "grad_norm": 0.4063859681100399, + "learning_rate": 2.6158871899792927e-06, + "loss": 0.1835, + "step": 13428 + }, + { + "epoch": 0.77, + "grad_norm": 0.3628029769796745, + "learning_rate": 2.6146324102462862e-06, + "loss": 0.2992, + "step": 13429 + }, + { + "epoch": 0.77, + "grad_norm": 0.5550301775668914, + "learning_rate": 2.6133778862682433e-06, + "loss": 0.3215, + "step": 13430 + }, + { + "epoch": 0.77, + "grad_norm": 0.7979129683971643, + "learning_rate": 2.612123618088608e-06, + "loss": 0.4975, + "step": 13431 + }, + { + "epoch": 0.77, + "grad_norm": 0.1949401131221945, + "learning_rate": 2.610869605750813e-06, + "loss": 0.138, + "step": 13432 + }, + { + "epoch": 0.77, + "grad_norm": 0.35522383806764823, + "learning_rate": 2.6096158492982837e-06, + "loss": 0.2854, + "step": 13433 + }, + { + "epoch": 0.77, + "grad_norm": 0.8215836187570505, + "learning_rate": 2.6083623487744423e-06, + "loss": 0.4089, + "step": 13434 + }, + { + "epoch": 0.77, + "grad_norm": 0.32566860474390014, + "learning_rate": 2.6071091042226947e-06, + "loss": 0.2306, + "step": 13435 + }, + { + "epoch": 0.77, + "grad_norm": 0.7199078363127668, + "learning_rate": 2.6058561156864415e-06, + "loss": 0.4138, + "step": 13436 + }, + { + "epoch": 0.77, + "grad_norm": 0.3209376148836106, + "learning_rate": 2.60460338320907e-06, + "loss": 0.3103, + "step": 13437 + }, + { + "epoch": 0.77, + "grad_norm": 0.25042851717939346, + "learning_rate": 2.603350906833971e-06, + "loss": 0.1709, + "step": 13438 + }, + { + "epoch": 0.77, + "grad_norm": 0.482862170243484, + "learning_rate": 2.6020986866045085e-06, + "loss": 0.2197, + "step": 13439 + }, + { + "epoch": 0.77, + "grad_norm": 0.5706303570276031, + "learning_rate": 2.600846722564051e-06, + "loss": 0.2956, + "step": 13440 + }, + { + "epoch": 0.77, + "grad_norm": 0.2703381171096475, + "learning_rate": 2.59959501475595e-06, + "loss": 0.233, + "step": 13441 + }, + { + "epoch": 0.77, + "grad_norm": 0.8217067922733584, + "learning_rate": 2.5983435632235586e-06, + "loss": 0.4331, + "step": 13442 + }, + { + "epoch": 0.77, + "grad_norm": 0.4817791637134085, + "learning_rate": 2.597092368010212e-06, + "loss": 0.2811, + "step": 13443 + }, + { + "epoch": 0.77, + "grad_norm": 0.42074169351000873, + "learning_rate": 2.5958414291592384e-06, + "loss": 0.3039, + "step": 13444 + }, + { + "epoch": 0.77, + "grad_norm": 0.22653893104331904, + "learning_rate": 2.594590746713953e-06, + "loss": 0.1724, + "step": 13445 + }, + { + "epoch": 0.77, + "grad_norm": 0.5405885473781558, + "learning_rate": 2.5933403207176766e-06, + "loss": 0.2775, + "step": 13446 + }, + { + "epoch": 0.77, + "grad_norm": 0.3693178605084256, + "learning_rate": 2.5920901512137052e-06, + "loss": 0.296, + "step": 13447 + }, + { + "epoch": 0.77, + "grad_norm": 0.5107901097900251, + "learning_rate": 2.5908402382453337e-06, + "loss": 0.291, + "step": 13448 + }, + { + "epoch": 0.77, + "grad_norm": 0.39069716983926167, + "learning_rate": 2.589590581855843e-06, + "loss": 0.2842, + "step": 13449 + }, + { + "epoch": 0.77, + "grad_norm": 0.3709867346737115, + "learning_rate": 2.588341182088514e-06, + "loss": 0.2768, + "step": 13450 + }, + { + "epoch": 0.77, + "grad_norm": 0.30386942428806196, + "learning_rate": 2.587092038986613e-06, + "loss": 0.1273, + "step": 13451 + }, + { + "epoch": 0.77, + "grad_norm": 0.5989646935983985, + "learning_rate": 2.5858431525933946e-06, + "loss": 0.3982, + "step": 13452 + }, + { + "epoch": 0.77, + "grad_norm": 0.24380382171509266, + "learning_rate": 2.5845945229521095e-06, + "loss": 0.2705, + "step": 13453 + }, + { + "epoch": 0.77, + "grad_norm": 0.7642863991718087, + "learning_rate": 2.5833461501059933e-06, + "loss": 0.3247, + "step": 13454 + }, + { + "epoch": 0.77, + "grad_norm": 0.7580810104910045, + "learning_rate": 2.5820980340982847e-06, + "loss": 0.4048, + "step": 13455 + }, + { + "epoch": 0.77, + "grad_norm": 0.30818453796959994, + "learning_rate": 2.5808501749722024e-06, + "loss": 0.2299, + "step": 13456 + }, + { + "epoch": 0.77, + "grad_norm": 0.27604039325913704, + "learning_rate": 2.5796025727709595e-06, + "loss": 0.2518, + "step": 13457 + }, + { + "epoch": 0.77, + "grad_norm": 0.8081791607239174, + "learning_rate": 2.5783552275377567e-06, + "loss": 0.2606, + "step": 13458 + }, + { + "epoch": 0.77, + "grad_norm": 0.374970126576345, + "learning_rate": 2.577108139315797e-06, + "loss": 0.2767, + "step": 13459 + }, + { + "epoch": 0.77, + "grad_norm": 1.1719009812907486, + "learning_rate": 2.575861308148263e-06, + "loss": 0.748, + "step": 13460 + }, + { + "epoch": 0.77, + "grad_norm": 0.28830064194617916, + "learning_rate": 2.574614734078332e-06, + "loss": 0.2245, + "step": 13461 + }, + { + "epoch": 0.77, + "grad_norm": 0.40624340668557385, + "learning_rate": 2.5733684171491713e-06, + "loss": 0.3191, + "step": 13462 + }, + { + "epoch": 0.77, + "grad_norm": 0.31625190451738877, + "learning_rate": 2.5721223574039466e-06, + "loss": 0.1969, + "step": 13463 + }, + { + "epoch": 0.77, + "grad_norm": 0.42735286500023417, + "learning_rate": 2.570876554885804e-06, + "loss": 0.2341, + "step": 13464 + }, + { + "epoch": 0.77, + "grad_norm": 0.3077642913701503, + "learning_rate": 2.5696310096378875e-06, + "loss": 0.2794, + "step": 13465 + }, + { + "epoch": 0.77, + "grad_norm": 1.1827315444704343, + "learning_rate": 2.568385721703329e-06, + "loss": 0.7581, + "step": 13466 + }, + { + "epoch": 0.77, + "grad_norm": 1.4823498393056806, + "learning_rate": 2.5671406911252506e-06, + "loss": 0.2698, + "step": 13467 + }, + { + "epoch": 0.77, + "grad_norm": 0.3486648926153057, + "learning_rate": 2.5658959179467734e-06, + "loss": 0.2548, + "step": 13468 + }, + { + "epoch": 0.77, + "grad_norm": 0.3394923711719963, + "learning_rate": 2.5646514022110013e-06, + "loss": 0.2921, + "step": 13469 + }, + { + "epoch": 0.77, + "grad_norm": 0.4458713311099046, + "learning_rate": 2.563407143961032e-06, + "loss": 0.2497, + "step": 13470 + }, + { + "epoch": 0.77, + "grad_norm": 0.23033084641571222, + "learning_rate": 2.5621631432399496e-06, + "loss": 0.1549, + "step": 13471 + }, + { + "epoch": 0.77, + "grad_norm": 0.470600873287133, + "learning_rate": 2.5609194000908434e-06, + "loss": 0.3824, + "step": 13472 + }, + { + "epoch": 0.77, + "grad_norm": 0.5083980754662374, + "learning_rate": 2.5596759145567763e-06, + "loss": 0.3503, + "step": 13473 + }, + { + "epoch": 0.77, + "grad_norm": 0.3457918738633052, + "learning_rate": 2.5584326866808084e-06, + "loss": 0.1994, + "step": 13474 + }, + { + "epoch": 0.77, + "grad_norm": 0.8561374677835701, + "learning_rate": 2.557189716506e-06, + "loss": 0.3989, + "step": 13475 + }, + { + "epoch": 0.77, + "grad_norm": 0.3040416433764094, + "learning_rate": 2.555947004075392e-06, + "loss": 0.2597, + "step": 13476 + }, + { + "epoch": 0.77, + "grad_norm": 0.22530902436018296, + "learning_rate": 2.5547045494320187e-06, + "loss": 0.1685, + "step": 13477 + }, + { + "epoch": 0.77, + "grad_norm": 1.4681940576762507, + "learning_rate": 2.5534623526189075e-06, + "loss": 0.7422, + "step": 13478 + }, + { + "epoch": 0.77, + "grad_norm": 0.8041062456322301, + "learning_rate": 2.5522204136790707e-06, + "loss": 0.4247, + "step": 13479 + }, + { + "epoch": 0.77, + "grad_norm": 0.3134178270242085, + "learning_rate": 2.5509787326555245e-06, + "loss": 0.2055, + "step": 13480 + }, + { + "epoch": 0.77, + "grad_norm": 0.37965284021285, + "learning_rate": 2.5497373095912638e-06, + "loss": 0.3178, + "step": 13481 + }, + { + "epoch": 0.77, + "grad_norm": 0.3664967026991713, + "learning_rate": 2.5484961445292798e-06, + "loss": 0.2107, + "step": 13482 + }, + { + "epoch": 0.77, + "grad_norm": 0.348010313926068, + "learning_rate": 2.5472552375125514e-06, + "loss": 0.2591, + "step": 13483 + }, + { + "epoch": 0.77, + "grad_norm": 0.27596022773032486, + "learning_rate": 2.546014588584057e-06, + "loss": 0.2083, + "step": 13484 + }, + { + "epoch": 0.77, + "grad_norm": 0.5443468371920548, + "learning_rate": 2.5447741977867556e-06, + "loss": 0.3054, + "step": 13485 + }, + { + "epoch": 0.77, + "grad_norm": 0.32076582873013293, + "learning_rate": 2.543534065163604e-06, + "loss": 0.2684, + "step": 13486 + }, + { + "epoch": 0.77, + "grad_norm": 0.7357780117610295, + "learning_rate": 2.542294190757544e-06, + "loss": 0.3032, + "step": 13487 + }, + { + "epoch": 0.77, + "grad_norm": 0.33348082391285816, + "learning_rate": 2.541054574611518e-06, + "loss": 0.2992, + "step": 13488 + }, + { + "epoch": 0.78, + "grad_norm": 0.24713867181114496, + "learning_rate": 2.539815216768452e-06, + "loss": 0.2075, + "step": 13489 + }, + { + "epoch": 0.78, + "grad_norm": 0.2824735073527296, + "learning_rate": 2.5385761172712642e-06, + "loss": 0.1629, + "step": 13490 + }, + { + "epoch": 0.78, + "grad_norm": 0.7765884467550676, + "learning_rate": 2.537337276162861e-06, + "loss": 0.4139, + "step": 13491 + }, + { + "epoch": 0.78, + "grad_norm": 0.31663661173967594, + "learning_rate": 2.5360986934861507e-06, + "loss": 0.2695, + "step": 13492 + }, + { + "epoch": 0.78, + "grad_norm": 0.3654158248056001, + "learning_rate": 2.5348603692840214e-06, + "loss": 0.3131, + "step": 13493 + }, + { + "epoch": 0.78, + "grad_norm": 0.5894688790078348, + "learning_rate": 2.5336223035993566e-06, + "loss": 0.1376, + "step": 13494 + }, + { + "epoch": 0.78, + "grad_norm": 0.3125708863235182, + "learning_rate": 2.53238449647503e-06, + "loss": 0.2578, + "step": 13495 + }, + { + "epoch": 0.78, + "grad_norm": 1.1793852459395195, + "learning_rate": 2.5311469479539043e-06, + "loss": 0.7588, + "step": 13496 + }, + { + "epoch": 0.78, + "grad_norm": 0.2854328646753722, + "learning_rate": 2.5299096580788416e-06, + "loss": 0.2246, + "step": 13497 + }, + { + "epoch": 0.78, + "grad_norm": 0.411565998534182, + "learning_rate": 2.5286726268926864e-06, + "loss": 0.299, + "step": 13498 + }, + { + "epoch": 0.78, + "grad_norm": 0.47853799166222705, + "learning_rate": 2.5274358544382773e-06, + "loss": 0.3148, + "step": 13499 + }, + { + "epoch": 0.78, + "grad_norm": 0.32126409083539537, + "learning_rate": 2.5261993407584394e-06, + "loss": 0.2218, + "step": 13500 + }, + { + "epoch": 0.78, + "grad_norm": 0.35565643253633095, + "learning_rate": 2.5249630858960006e-06, + "loss": 0.2473, + "step": 13501 + }, + { + "epoch": 0.78, + "grad_norm": 1.0739827262463106, + "learning_rate": 2.5237270898937684e-06, + "loss": 0.7007, + "step": 13502 + }, + { + "epoch": 0.78, + "grad_norm": 0.4259404404044619, + "learning_rate": 2.522491352794545e-06, + "loss": 0.2166, + "step": 13503 + }, + { + "epoch": 0.78, + "grad_norm": 0.32529796843916775, + "learning_rate": 2.521255874641122e-06, + "loss": 0.2448, + "step": 13504 + }, + { + "epoch": 0.78, + "grad_norm": 0.24437696676073684, + "learning_rate": 2.5200206554762897e-06, + "loss": 0.2097, + "step": 13505 + }, + { + "epoch": 0.78, + "grad_norm": 1.2143331452246588, + "learning_rate": 2.5187856953428237e-06, + "loss": 0.4804, + "step": 13506 + }, + { + "epoch": 0.78, + "grad_norm": 0.31524982654847683, + "learning_rate": 2.5175509942834843e-06, + "loss": 0.2062, + "step": 13507 + }, + { + "epoch": 0.78, + "grad_norm": 0.3559843324635449, + "learning_rate": 2.516316552341028e-06, + "loss": 0.3224, + "step": 13508 + }, + { + "epoch": 0.78, + "grad_norm": 0.5991992837613603, + "learning_rate": 2.515082369558212e-06, + "loss": 0.3749, + "step": 13509 + }, + { + "epoch": 0.78, + "grad_norm": 0.24645117400291955, + "learning_rate": 2.513848445977771e-06, + "loss": 0.182, + "step": 13510 + }, + { + "epoch": 0.78, + "grad_norm": 0.48245437149625137, + "learning_rate": 2.5126147816424364e-06, + "loss": 0.2498, + "step": 13511 + }, + { + "epoch": 0.78, + "grad_norm": 0.3648211240975149, + "learning_rate": 2.5113813765949267e-06, + "loss": 0.3079, + "step": 13512 + }, + { + "epoch": 0.78, + "grad_norm": 0.28731932512581504, + "learning_rate": 2.5101482308779625e-06, + "loss": 0.1939, + "step": 13513 + }, + { + "epoch": 0.78, + "grad_norm": 0.9704475832943743, + "learning_rate": 2.508915344534242e-06, + "loss": 0.4582, + "step": 13514 + }, + { + "epoch": 0.78, + "grad_norm": 0.6266296632397085, + "learning_rate": 2.50768271760646e-06, + "loss": 0.3931, + "step": 13515 + }, + { + "epoch": 0.78, + "grad_norm": 0.27616660652349634, + "learning_rate": 2.5064503501373017e-06, + "loss": 0.2008, + "step": 13516 + }, + { + "epoch": 0.78, + "grad_norm": 0.2513559465711763, + "learning_rate": 2.505218242169448e-06, + "loss": 0.2077, + "step": 13517 + }, + { + "epoch": 0.78, + "grad_norm": 1.3205543433577367, + "learning_rate": 2.5039863937455645e-06, + "loss": 0.484, + "step": 13518 + }, + { + "epoch": 0.78, + "grad_norm": 0.6244415397855417, + "learning_rate": 2.5027548049083094e-06, + "loss": 0.3689, + "step": 13519 + }, + { + "epoch": 0.78, + "grad_norm": 0.3004803209648444, + "learning_rate": 2.5015234757003326e-06, + "loss": 0.2546, + "step": 13520 + }, + { + "epoch": 0.78, + "grad_norm": 0.7853617262016618, + "learning_rate": 2.500292406164273e-06, + "loss": 0.3941, + "step": 13521 + }, + { + "epoch": 0.78, + "grad_norm": 0.3676844253504336, + "learning_rate": 2.4990615963427688e-06, + "loss": 0.2752, + "step": 13522 + }, + { + "epoch": 0.78, + "grad_norm": 0.22875365405484152, + "learning_rate": 2.4978310462784373e-06, + "loss": 0.1272, + "step": 13523 + }, + { + "epoch": 0.78, + "grad_norm": 0.35103757648372264, + "learning_rate": 2.496600756013895e-06, + "loss": 0.3124, + "step": 13524 + }, + { + "epoch": 0.78, + "grad_norm": 0.3743025090473022, + "learning_rate": 2.4953707255917426e-06, + "loss": 0.2631, + "step": 13525 + }, + { + "epoch": 0.78, + "grad_norm": 0.4822393785473432, + "learning_rate": 2.4941409550545824e-06, + "loss": 0.2804, + "step": 13526 + }, + { + "epoch": 0.78, + "grad_norm": 1.1309185431941364, + "learning_rate": 2.492911444444999e-06, + "loss": 0.5591, + "step": 13527 + }, + { + "epoch": 0.78, + "grad_norm": 0.25582945570068455, + "learning_rate": 2.491682193805568e-06, + "loss": 0.2465, + "step": 13528 + }, + { + "epoch": 0.78, + "grad_norm": 0.24118639331888805, + "learning_rate": 2.4904532031788577e-06, + "loss": 0.1293, + "step": 13529 + }, + { + "epoch": 0.78, + "grad_norm": 1.2057509069942007, + "learning_rate": 2.489224472607432e-06, + "loss": 0.5991, + "step": 13530 + }, + { + "epoch": 0.78, + "grad_norm": 0.4075751183842601, + "learning_rate": 2.487996002133841e-06, + "loss": 0.2751, + "step": 13531 + }, + { + "epoch": 0.78, + "grad_norm": 0.3821138342755005, + "learning_rate": 2.486767791800625e-06, + "loss": 0.2959, + "step": 13532 + }, + { + "epoch": 0.78, + "grad_norm": 0.4553999305831225, + "learning_rate": 2.4855398416503173e-06, + "loss": 0.3063, + "step": 13533 + }, + { + "epoch": 0.78, + "grad_norm": 0.39885559025142264, + "learning_rate": 2.4843121517254386e-06, + "loss": 0.2575, + "step": 13534 + }, + { + "epoch": 0.78, + "grad_norm": 0.2631468133379456, + "learning_rate": 2.4830847220685096e-06, + "loss": 0.2009, + "step": 13535 + }, + { + "epoch": 0.78, + "grad_norm": 0.33484657861419903, + "learning_rate": 2.4818575527220347e-06, + "loss": 0.2423, + "step": 13536 + }, + { + "epoch": 0.78, + "grad_norm": 0.5743804627272125, + "learning_rate": 2.4806306437285075e-06, + "loss": 0.3096, + "step": 13537 + }, + { + "epoch": 0.78, + "grad_norm": 0.36102512843751833, + "learning_rate": 2.479403995130416e-06, + "loss": 0.2962, + "step": 13538 + }, + { + "epoch": 0.78, + "grad_norm": 0.4681753138150729, + "learning_rate": 2.4781776069702446e-06, + "loss": 0.2943, + "step": 13539 + }, + { + "epoch": 0.78, + "grad_norm": 0.40481623216969953, + "learning_rate": 2.4769514792904603e-06, + "loss": 0.2884, + "step": 13540 + }, + { + "epoch": 0.78, + "grad_norm": 0.2316043899089237, + "learning_rate": 2.4757256121335182e-06, + "loss": 0.195, + "step": 13541 + }, + { + "epoch": 0.78, + "grad_norm": 0.8906998454080058, + "learning_rate": 2.4745000055418767e-06, + "loss": 0.2673, + "step": 13542 + }, + { + "epoch": 0.78, + "grad_norm": 0.3756343888756985, + "learning_rate": 2.4732746595579772e-06, + "loss": 0.2692, + "step": 13543 + }, + { + "epoch": 0.78, + "grad_norm": 0.289010965803477, + "learning_rate": 2.4720495742242522e-06, + "loss": 0.2824, + "step": 13544 + }, + { + "epoch": 0.78, + "grad_norm": 1.2453049542327923, + "learning_rate": 2.4708247495831263e-06, + "loss": 0.7404, + "step": 13545 + }, + { + "epoch": 0.78, + "grad_norm": 0.36664623886660264, + "learning_rate": 2.4696001856770137e-06, + "loss": 0.1901, + "step": 13546 + }, + { + "epoch": 0.78, + "grad_norm": 0.32350893720100893, + "learning_rate": 2.468375882548325e-06, + "loss": 0.2296, + "step": 13547 + }, + { + "epoch": 0.78, + "grad_norm": 0.28448727451578076, + "learning_rate": 2.4671518402394554e-06, + "loss": 0.2343, + "step": 13548 + }, + { + "epoch": 0.78, + "grad_norm": 0.38941415911483, + "learning_rate": 2.4659280587927935e-06, + "loss": 0.2319, + "step": 13549 + }, + { + "epoch": 0.78, + "grad_norm": 0.540433968384648, + "learning_rate": 2.464704538250717e-06, + "loss": 0.2962, + "step": 13550 + }, + { + "epoch": 0.78, + "grad_norm": 1.0027508666664908, + "learning_rate": 2.463481278655601e-06, + "loss": 0.6688, + "step": 13551 + }, + { + "epoch": 0.78, + "grad_norm": 0.2635421406849195, + "learning_rate": 2.4622582800498042e-06, + "loss": 0.2202, + "step": 13552 + }, + { + "epoch": 0.78, + "grad_norm": 0.3419175327603542, + "learning_rate": 2.4610355424756782e-06, + "loss": 0.2265, + "step": 13553 + }, + { + "epoch": 0.78, + "grad_norm": 0.43706452382046834, + "learning_rate": 2.4598130659755647e-06, + "loss": 0.2644, + "step": 13554 + }, + { + "epoch": 0.78, + "grad_norm": 0.47010643053265033, + "learning_rate": 2.4585908505918034e-06, + "loss": 0.1736, + "step": 13555 + }, + { + "epoch": 0.78, + "grad_norm": 0.2701575469928234, + "learning_rate": 2.4573688963667176e-06, + "loss": 0.2637, + "step": 13556 + }, + { + "epoch": 0.78, + "grad_norm": 1.1200006981818171, + "learning_rate": 2.4561472033426213e-06, + "loss": 0.8067, + "step": 13557 + }, + { + "epoch": 0.78, + "grad_norm": 0.8991589933463543, + "learning_rate": 2.4549257715618234e-06, + "loss": 0.3886, + "step": 13558 + }, + { + "epoch": 0.78, + "grad_norm": 0.2317283658849086, + "learning_rate": 2.4537046010666187e-06, + "loss": 0.1782, + "step": 13559 + }, + { + "epoch": 0.78, + "grad_norm": 0.3356415177168732, + "learning_rate": 2.452483691899302e-06, + "loss": 0.2841, + "step": 13560 + }, + { + "epoch": 0.78, + "grad_norm": 0.4554604862081215, + "learning_rate": 2.45126304410215e-06, + "loss": 0.2736, + "step": 13561 + }, + { + "epoch": 0.78, + "grad_norm": 0.33245473228630973, + "learning_rate": 2.450042657717435e-06, + "loss": 0.2231, + "step": 13562 + }, + { + "epoch": 0.78, + "grad_norm": 1.1404646769499847, + "learning_rate": 2.4488225327874147e-06, + "loss": 0.657, + "step": 13563 + }, + { + "epoch": 0.78, + "grad_norm": 0.33060524305523453, + "learning_rate": 2.4476026693543485e-06, + "loss": 0.2643, + "step": 13564 + }, + { + "epoch": 0.78, + "grad_norm": 0.4029416650173851, + "learning_rate": 2.4463830674604773e-06, + "loss": 0.2251, + "step": 13565 + }, + { + "epoch": 0.78, + "grad_norm": 0.532161244083854, + "learning_rate": 2.4451637271480357e-06, + "loss": 0.3595, + "step": 13566 + }, + { + "epoch": 0.78, + "grad_norm": 0.22270028050530902, + "learning_rate": 2.4439446484592466e-06, + "loss": 0.1721, + "step": 13567 + }, + { + "epoch": 0.78, + "grad_norm": 0.5652419091857782, + "learning_rate": 2.442725831436331e-06, + "loss": 0.2662, + "step": 13568 + }, + { + "epoch": 0.78, + "grad_norm": 0.8543186545019308, + "learning_rate": 2.4415072761214963e-06, + "loss": 0.5066, + "step": 13569 + }, + { + "epoch": 0.78, + "grad_norm": 0.5957280739948273, + "learning_rate": 2.4402889825569396e-06, + "loss": 0.3107, + "step": 13570 + }, + { + "epoch": 0.78, + "grad_norm": 0.4228437651054475, + "learning_rate": 2.4390709507848497e-06, + "loss": 0.2888, + "step": 13571 + }, + { + "epoch": 0.78, + "grad_norm": 0.3197597496407284, + "learning_rate": 2.4378531808474048e-06, + "loss": 0.2583, + "step": 13572 + }, + { + "epoch": 0.78, + "grad_norm": 0.2274135399939389, + "learning_rate": 2.4366356727867847e-06, + "loss": 0.156, + "step": 13573 + }, + { + "epoch": 0.78, + "grad_norm": 0.45599695397853623, + "learning_rate": 2.435418426645144e-06, + "loss": 0.2762, + "step": 13574 + }, + { + "epoch": 0.78, + "grad_norm": 0.3557905396710193, + "learning_rate": 2.4342014424646343e-06, + "loss": 0.2661, + "step": 13575 + }, + { + "epoch": 0.78, + "grad_norm": 0.7923026226601533, + "learning_rate": 2.4329847202874058e-06, + "loss": 0.3719, + "step": 13576 + }, + { + "epoch": 0.78, + "grad_norm": 0.7790919251786718, + "learning_rate": 2.4317682601555913e-06, + "loss": 0.2882, + "step": 13577 + }, + { + "epoch": 0.78, + "grad_norm": 0.9627289488428679, + "learning_rate": 2.4305520621113175e-06, + "loss": 0.2932, + "step": 13578 + }, + { + "epoch": 0.78, + "grad_norm": 0.2430678228878389, + "learning_rate": 2.4293361261966965e-06, + "loss": 0.2217, + "step": 13579 + }, + { + "epoch": 0.78, + "grad_norm": 0.24952073494447208, + "learning_rate": 2.4281204524538425e-06, + "loss": 0.2056, + "step": 13580 + }, + { + "epoch": 0.78, + "grad_norm": 1.1887519774137303, + "learning_rate": 2.426905040924853e-06, + "loss": 0.7538, + "step": 13581 + }, + { + "epoch": 0.78, + "grad_norm": 0.6848642705531716, + "learning_rate": 2.4256898916518145e-06, + "loss": 0.2607, + "step": 13582 + }, + { + "epoch": 0.78, + "grad_norm": 0.34377998627378237, + "learning_rate": 2.4244750046768105e-06, + "loss": 0.2902, + "step": 13583 + }, + { + "epoch": 0.78, + "grad_norm": 0.35628235811761977, + "learning_rate": 2.4232603800419087e-06, + "loss": 0.3136, + "step": 13584 + }, + { + "epoch": 0.78, + "grad_norm": 0.1818921796340036, + "learning_rate": 2.4220460177891757e-06, + "loss": 0.0816, + "step": 13585 + }, + { + "epoch": 0.78, + "grad_norm": 0.3849502874315825, + "learning_rate": 2.4208319179606643e-06, + "loss": 0.3033, + "step": 13586 + }, + { + "epoch": 0.78, + "grad_norm": 0.47780436818363103, + "learning_rate": 2.419618080598417e-06, + "loss": 0.3461, + "step": 13587 + }, + { + "epoch": 0.78, + "grad_norm": 0.4695138835770297, + "learning_rate": 2.418404505744467e-06, + "loss": 0.222, + "step": 13588 + }, + { + "epoch": 0.78, + "grad_norm": 0.5030258966165988, + "learning_rate": 2.4171911934408464e-06, + "loss": 0.2913, + "step": 13589 + }, + { + "epoch": 0.78, + "grad_norm": 0.9470056441285403, + "learning_rate": 2.4159781437295684e-06, + "loss": 0.4908, + "step": 13590 + }, + { + "epoch": 0.78, + "grad_norm": 0.2590739492221378, + "learning_rate": 2.414765356652641e-06, + "loss": 0.1915, + "step": 13591 + }, + { + "epoch": 0.78, + "grad_norm": 0.2734733858017858, + "learning_rate": 2.4135528322520597e-06, + "loss": 0.2396, + "step": 13592 + }, + { + "epoch": 0.78, + "grad_norm": 0.47194836624512676, + "learning_rate": 2.4123405705698213e-06, + "loss": 0.2789, + "step": 13593 + }, + { + "epoch": 0.78, + "grad_norm": 0.7394128282774476, + "learning_rate": 2.4111285716479015e-06, + "loss": 0.3719, + "step": 13594 + }, + { + "epoch": 0.78, + "grad_norm": 0.2604512556637114, + "learning_rate": 2.4099168355282743e-06, + "loss": 0.1993, + "step": 13595 + }, + { + "epoch": 0.78, + "grad_norm": 0.49077072597667926, + "learning_rate": 2.4087053622529e-06, + "loss": 0.351, + "step": 13596 + }, + { + "epoch": 0.78, + "grad_norm": 1.1799394846564877, + "learning_rate": 2.4074941518637295e-06, + "loss": 0.5699, + "step": 13597 + }, + { + "epoch": 0.78, + "grad_norm": 0.3030824419466015, + "learning_rate": 2.406283204402714e-06, + "loss": 0.2124, + "step": 13598 + }, + { + "epoch": 0.78, + "grad_norm": 0.46405446573905923, + "learning_rate": 2.405072519911783e-06, + "loss": 0.31, + "step": 13599 + }, + { + "epoch": 0.78, + "grad_norm": 0.32419622170740764, + "learning_rate": 2.4038620984328655e-06, + "loss": 0.2435, + "step": 13600 + }, + { + "epoch": 0.78, + "grad_norm": 0.21380776837026694, + "learning_rate": 2.4026519400078728e-06, + "loss": 0.1533, + "step": 13601 + }, + { + "epoch": 0.78, + "grad_norm": 1.3300209163280692, + "learning_rate": 2.401442044678721e-06, + "loss": 0.5964, + "step": 13602 + }, + { + "epoch": 0.78, + "grad_norm": 0.35887510821296326, + "learning_rate": 2.4002324124873033e-06, + "loss": 0.2981, + "step": 13603 + }, + { + "epoch": 0.78, + "grad_norm": 0.2783468021005403, + "learning_rate": 2.3990230434755112e-06, + "loss": 0.1819, + "step": 13604 + }, + { + "epoch": 0.78, + "grad_norm": 0.7379956763914063, + "learning_rate": 2.3978139376852206e-06, + "loss": 0.4326, + "step": 13605 + }, + { + "epoch": 0.78, + "grad_norm": 0.3247249377272244, + "learning_rate": 2.3966050951583096e-06, + "loss": 0.1835, + "step": 13606 + }, + { + "epoch": 0.78, + "grad_norm": 0.3669611320334294, + "learning_rate": 2.39539651593664e-06, + "loss": 0.2728, + "step": 13607 + }, + { + "epoch": 0.78, + "grad_norm": 0.35511621181556124, + "learning_rate": 2.3941882000620586e-06, + "loss": 0.2297, + "step": 13608 + }, + { + "epoch": 0.78, + "grad_norm": 1.2103759075595983, + "learning_rate": 2.3929801475764113e-06, + "loss": 0.498, + "step": 13609 + }, + { + "epoch": 0.78, + "grad_norm": 0.33080955912933513, + "learning_rate": 2.391772358521536e-06, + "loss": 0.2461, + "step": 13610 + }, + { + "epoch": 0.78, + "grad_norm": 0.37747048144770934, + "learning_rate": 2.3905648329392574e-06, + "loss": 0.2751, + "step": 13611 + }, + { + "epoch": 0.78, + "grad_norm": 0.8894374852257119, + "learning_rate": 2.389357570871391e-06, + "loss": 0.4819, + "step": 13612 + }, + { + "epoch": 0.78, + "grad_norm": 0.24421514760385282, + "learning_rate": 2.3881505723597422e-06, + "loss": 0.202, + "step": 13613 + }, + { + "epoch": 0.78, + "grad_norm": 0.40684410356900697, + "learning_rate": 2.386943837446114e-06, + "loss": 0.1062, + "step": 13614 + }, + { + "epoch": 0.78, + "grad_norm": 0.3562939479119407, + "learning_rate": 2.385737366172294e-06, + "loss": 0.3223, + "step": 13615 + }, + { + "epoch": 0.78, + "grad_norm": 0.34011491819599793, + "learning_rate": 2.3845311585800612e-06, + "loss": 0.2739, + "step": 13616 + }, + { + "epoch": 0.78, + "grad_norm": 0.7845879219744151, + "learning_rate": 2.3833252147111853e-06, + "loss": 0.2916, + "step": 13617 + }, + { + "epoch": 0.78, + "grad_norm": 0.343820832697006, + "learning_rate": 2.382119534607431e-06, + "loss": 0.2475, + "step": 13618 + }, + { + "epoch": 0.78, + "grad_norm": 0.25729081529380954, + "learning_rate": 2.38091411831055e-06, + "loss": 0.2516, + "step": 13619 + }, + { + "epoch": 0.78, + "grad_norm": 0.5159852122593639, + "learning_rate": 2.379708965862285e-06, + "loss": 0.2322, + "step": 13620 + }, + { + "epoch": 0.78, + "grad_norm": 0.8836787867400838, + "learning_rate": 2.3785040773043686e-06, + "loss": 0.3155, + "step": 13621 + }, + { + "epoch": 0.78, + "grad_norm": 0.4244576302501707, + "learning_rate": 2.3772994526785308e-06, + "loss": 0.2892, + "step": 13622 + }, + { + "epoch": 0.78, + "grad_norm": 0.29146153581979123, + "learning_rate": 2.376095092026486e-06, + "loss": 0.289, + "step": 13623 + }, + { + "epoch": 0.78, + "grad_norm": 0.6333592123809318, + "learning_rate": 2.37489099538994e-06, + "loss": 0.2585, + "step": 13624 + }, + { + "epoch": 0.78, + "grad_norm": 0.3367141969825039, + "learning_rate": 2.3736871628105907e-06, + "loss": 0.2642, + "step": 13625 + }, + { + "epoch": 0.78, + "grad_norm": 0.2573235517922094, + "learning_rate": 2.372483594330124e-06, + "loss": 0.1519, + "step": 13626 + }, + { + "epoch": 0.78, + "grad_norm": 0.3441524091740002, + "learning_rate": 2.3712802899902256e-06, + "loss": 0.258, + "step": 13627 + }, + { + "epoch": 0.78, + "grad_norm": 0.31942754964550046, + "learning_rate": 2.3700772498325617e-06, + "loss": 0.2458, + "step": 13628 + }, + { + "epoch": 0.78, + "grad_norm": 0.7120882141052624, + "learning_rate": 2.3688744738987955e-06, + "loss": 0.3966, + "step": 13629 + }, + { + "epoch": 0.78, + "grad_norm": 1.3333702259355196, + "learning_rate": 2.3676719622305754e-06, + "loss": 0.2666, + "step": 13630 + }, + { + "epoch": 0.78, + "grad_norm": 0.2738749937750857, + "learning_rate": 2.3664697148695494e-06, + "loss": 0.2535, + "step": 13631 + }, + { + "epoch": 0.78, + "grad_norm": 0.24507540728163543, + "learning_rate": 2.365267731857349e-06, + "loss": 0.1795, + "step": 13632 + }, + { + "epoch": 0.78, + "grad_norm": 0.7427338888923851, + "learning_rate": 2.3640660132356e-06, + "loss": 0.3918, + "step": 13633 + }, + { + "epoch": 0.78, + "grad_norm": 0.3615965217499706, + "learning_rate": 2.362864559045912e-06, + "loss": 0.2184, + "step": 13634 + }, + { + "epoch": 0.78, + "grad_norm": 0.34559986606325616, + "learning_rate": 2.3616633693298996e-06, + "loss": 0.2857, + "step": 13635 + }, + { + "epoch": 0.78, + "grad_norm": 1.076346574627362, + "learning_rate": 2.360462444129156e-06, + "loss": 0.6005, + "step": 13636 + }, + { + "epoch": 0.78, + "grad_norm": 0.2748717407836768, + "learning_rate": 2.3592617834852694e-06, + "loss": 0.1638, + "step": 13637 + }, + { + "epoch": 0.78, + "grad_norm": 0.4605339516609135, + "learning_rate": 2.358061387439818e-06, + "loss": 0.3061, + "step": 13638 + }, + { + "epoch": 0.78, + "grad_norm": 0.36174894634119437, + "learning_rate": 2.356861256034371e-06, + "loss": 0.3071, + "step": 13639 + }, + { + "epoch": 0.78, + "grad_norm": 0.40535929812470445, + "learning_rate": 2.355661389310492e-06, + "loss": 0.2054, + "step": 13640 + }, + { + "epoch": 0.78, + "grad_norm": 0.5112923460526705, + "learning_rate": 2.354461787309733e-06, + "loss": 0.3737, + "step": 13641 + }, + { + "epoch": 0.78, + "grad_norm": 0.5087292017954329, + "learning_rate": 2.353262450073628e-06, + "loss": 0.3251, + "step": 13642 + }, + { + "epoch": 0.78, + "grad_norm": 0.29548431419215865, + "learning_rate": 2.3520633776437187e-06, + "loss": 0.199, + "step": 13643 + }, + { + "epoch": 0.78, + "grad_norm": 0.26742981963554274, + "learning_rate": 2.3508645700615253e-06, + "loss": 0.1937, + "step": 13644 + }, + { + "epoch": 0.78, + "grad_norm": 0.7458493819978979, + "learning_rate": 2.3496660273685633e-06, + "loss": 0.3586, + "step": 13645 + }, + { + "epoch": 0.78, + "grad_norm": 0.398854225716011, + "learning_rate": 2.348467749606335e-06, + "loss": 0.2928, + "step": 13646 + }, + { + "epoch": 0.78, + "grad_norm": 0.2920827396093392, + "learning_rate": 2.347269736816341e-06, + "loss": 0.2593, + "step": 13647 + }, + { + "epoch": 0.78, + "grad_norm": 1.4477857127683085, + "learning_rate": 2.3460719890400687e-06, + "loss": 0.748, + "step": 13648 + }, + { + "epoch": 0.78, + "grad_norm": 0.45883127272812063, + "learning_rate": 2.3448745063189937e-06, + "loss": 0.2472, + "step": 13649 + }, + { + "epoch": 0.78, + "grad_norm": 0.2593031866666469, + "learning_rate": 2.3436772886945847e-06, + "loss": 0.1717, + "step": 13650 + }, + { + "epoch": 0.78, + "grad_norm": 0.3450260325923007, + "learning_rate": 2.3424803362083005e-06, + "loss": 0.3068, + "step": 13651 + }, + { + "epoch": 0.78, + "grad_norm": 0.4138606948993418, + "learning_rate": 2.3412836489015945e-06, + "loss": 0.2737, + "step": 13652 + }, + { + "epoch": 0.78, + "grad_norm": 0.46342922046762425, + "learning_rate": 2.340087226815907e-06, + "loss": 0.2678, + "step": 13653 + }, + { + "epoch": 0.78, + "grad_norm": 0.5116239699745199, + "learning_rate": 2.338891069992669e-06, + "loss": 0.3142, + "step": 13654 + }, + { + "epoch": 0.78, + "grad_norm": 0.3018463186424405, + "learning_rate": 2.3376951784733014e-06, + "loss": 0.2575, + "step": 13655 + }, + { + "epoch": 0.78, + "grad_norm": 0.5042604633777217, + "learning_rate": 2.336499552299223e-06, + "loss": 0.3364, + "step": 13656 + }, + { + "epoch": 0.78, + "grad_norm": 0.3450572288678373, + "learning_rate": 2.3353041915118357e-06, + "loss": 0.1711, + "step": 13657 + }, + { + "epoch": 0.78, + "grad_norm": 0.5783242085477613, + "learning_rate": 2.3341090961525347e-06, + "loss": 0.3407, + "step": 13658 + }, + { + "epoch": 0.78, + "grad_norm": 0.26315711604683223, + "learning_rate": 2.3329142662627026e-06, + "loss": 0.2855, + "step": 13659 + }, + { + "epoch": 0.78, + "grad_norm": 1.3037750222352267, + "learning_rate": 2.3317197018837233e-06, + "loss": 0.3251, + "step": 13660 + }, + { + "epoch": 0.78, + "grad_norm": 0.6003607719198905, + "learning_rate": 2.330525403056961e-06, + "loss": 0.3302, + "step": 13661 + }, + { + "epoch": 0.78, + "grad_norm": 0.397572728143814, + "learning_rate": 2.329331369823774e-06, + "loss": 0.3095, + "step": 13662 + }, + { + "epoch": 0.79, + "grad_norm": 0.23107023596230547, + "learning_rate": 2.3281376022255107e-06, + "loss": 0.1822, + "step": 13663 + }, + { + "epoch": 0.79, + "grad_norm": 0.5463080263376949, + "learning_rate": 2.326944100303511e-06, + "loss": 0.323, + "step": 13664 + }, + { + "epoch": 0.79, + "grad_norm": 0.389194949108078, + "learning_rate": 2.32575086409911e-06, + "loss": 0.3304, + "step": 13665 + }, + { + "epoch": 0.79, + "grad_norm": 0.5020850413515092, + "learning_rate": 2.3245578936536263e-06, + "loss": 0.2403, + "step": 13666 + }, + { + "epoch": 0.79, + "grad_norm": 0.3099713566620966, + "learning_rate": 2.323365189008372e-06, + "loss": 0.2437, + "step": 13667 + }, + { + "epoch": 0.79, + "grad_norm": 0.4927324531266989, + "learning_rate": 2.3221727502046487e-06, + "loss": 0.3289, + "step": 13668 + }, + { + "epoch": 0.79, + "grad_norm": 0.5103081683200503, + "learning_rate": 2.3209805772837557e-06, + "loss": 0.2563, + "step": 13669 + }, + { + "epoch": 0.79, + "grad_norm": 0.23731099735038388, + "learning_rate": 2.3197886702869756e-06, + "loss": 0.1511, + "step": 13670 + }, + { + "epoch": 0.79, + "grad_norm": 0.2998413324216504, + "learning_rate": 2.3185970292555827e-06, + "loss": 0.294, + "step": 13671 + }, + { + "epoch": 0.79, + "grad_norm": 1.1268300194743812, + "learning_rate": 2.317405654230842e-06, + "loss": 0.5717, + "step": 13672 + }, + { + "epoch": 0.79, + "grad_norm": 0.42392272265792735, + "learning_rate": 2.3162145452540164e-06, + "loss": 0.2239, + "step": 13673 + }, + { + "epoch": 0.79, + "grad_norm": 0.5320072799016502, + "learning_rate": 2.3150237023663503e-06, + "loss": 0.3257, + "step": 13674 + }, + { + "epoch": 0.79, + "grad_norm": 0.3519616895462257, + "learning_rate": 2.3138331256090853e-06, + "loss": 0.2953, + "step": 13675 + }, + { + "epoch": 0.79, + "grad_norm": 0.2006587422306295, + "learning_rate": 2.312642815023444e-06, + "loss": 0.1078, + "step": 13676 + }, + { + "epoch": 0.79, + "grad_norm": 0.3494649945683863, + "learning_rate": 2.311452770650653e-06, + "loss": 0.2756, + "step": 13677 + }, + { + "epoch": 0.79, + "grad_norm": 0.27624763479241404, + "learning_rate": 2.3102629925319233e-06, + "loss": 0.2441, + "step": 13678 + }, + { + "epoch": 0.79, + "grad_norm": 0.5364171797371735, + "learning_rate": 2.3090734807084545e-06, + "loss": 0.2007, + "step": 13679 + }, + { + "epoch": 0.79, + "grad_norm": 0.3835854642103055, + "learning_rate": 2.307884235221438e-06, + "loss": 0.2958, + "step": 13680 + }, + { + "epoch": 0.79, + "grad_norm": 0.5006981875240444, + "learning_rate": 2.3066952561120616e-06, + "loss": 0.2308, + "step": 13681 + }, + { + "epoch": 0.79, + "grad_norm": 0.3935079231579874, + "learning_rate": 2.3055065434214983e-06, + "loss": 0.2848, + "step": 13682 + }, + { + "epoch": 0.79, + "grad_norm": 0.2531872170122483, + "learning_rate": 2.3043180971909128e-06, + "loss": 0.2199, + "step": 13683 + }, + { + "epoch": 0.79, + "grad_norm": 0.42624480063631087, + "learning_rate": 2.3031299174614572e-06, + "loss": 0.2557, + "step": 13684 + }, + { + "epoch": 0.79, + "grad_norm": 0.6061150551108329, + "learning_rate": 2.3019420042742856e-06, + "loss": 0.3503, + "step": 13685 + }, + { + "epoch": 0.79, + "grad_norm": 0.24096818349359286, + "learning_rate": 2.3007543576705303e-06, + "loss": 0.2254, + "step": 13686 + }, + { + "epoch": 0.79, + "grad_norm": 1.5103883995679313, + "learning_rate": 2.299566977691321e-06, + "loss": 0.508, + "step": 13687 + }, + { + "epoch": 0.79, + "grad_norm": 0.5508991805415692, + "learning_rate": 2.2983798643777755e-06, + "loss": 0.3538, + "step": 13688 + }, + { + "epoch": 0.79, + "grad_norm": 0.3684502029643531, + "learning_rate": 2.297193017771002e-06, + "loss": 0.2364, + "step": 13689 + }, + { + "epoch": 0.79, + "grad_norm": 0.28819135601703405, + "learning_rate": 2.296006437912106e-06, + "loss": 0.2523, + "step": 13690 + }, + { + "epoch": 0.79, + "grad_norm": 0.37804061739241884, + "learning_rate": 2.2948201248421754e-06, + "loss": 0.2241, + "step": 13691 + }, + { + "epoch": 0.79, + "grad_norm": 0.40514152634974704, + "learning_rate": 2.2936340786022926e-06, + "loss": 0.2199, + "step": 13692 + }, + { + "epoch": 0.79, + "grad_norm": 1.195907334850774, + "learning_rate": 2.2924482992335272e-06, + "loss": 0.4781, + "step": 13693 + }, + { + "epoch": 0.79, + "grad_norm": 0.3126316257768985, + "learning_rate": 2.291262786776949e-06, + "loss": 0.2628, + "step": 13694 + }, + { + "epoch": 0.79, + "grad_norm": 0.3317162176271463, + "learning_rate": 2.2900775412736086e-06, + "loss": 0.2689, + "step": 13695 + }, + { + "epoch": 0.79, + "grad_norm": 0.502522546503961, + "learning_rate": 2.288892562764552e-06, + "loss": 0.2287, + "step": 13696 + }, + { + "epoch": 0.79, + "grad_norm": 0.4025773704640404, + "learning_rate": 2.28770785129081e-06, + "loss": 0.2352, + "step": 13697 + }, + { + "epoch": 0.79, + "grad_norm": 0.3119324736608787, + "learning_rate": 2.286523406893418e-06, + "loss": 0.2736, + "step": 13698 + }, + { + "epoch": 0.79, + "grad_norm": 0.46774707777608343, + "learning_rate": 2.285339229613388e-06, + "loss": 0.2402, + "step": 13699 + }, + { + "epoch": 0.79, + "grad_norm": 0.7445728385464858, + "learning_rate": 2.2841553194917288e-06, + "loss": 0.5074, + "step": 13700 + }, + { + "epoch": 0.79, + "grad_norm": 0.3503867920780646, + "learning_rate": 2.2829716765694397e-06, + "loss": 0.2881, + "step": 13701 + }, + { + "epoch": 0.79, + "grad_norm": 0.31801113224197763, + "learning_rate": 2.2817883008875065e-06, + "loss": 0.2597, + "step": 13702 + }, + { + "epoch": 0.79, + "grad_norm": 0.2962235175090054, + "learning_rate": 2.2806051924869144e-06, + "loss": 0.1875, + "step": 13703 + }, + { + "epoch": 0.79, + "grad_norm": 0.3308268216739318, + "learning_rate": 2.2794223514086333e-06, + "loss": 0.25, + "step": 13704 + }, + { + "epoch": 0.79, + "grad_norm": 1.2976921395533858, + "learning_rate": 2.2782397776936237e-06, + "loss": 0.2965, + "step": 13705 + }, + { + "epoch": 0.79, + "grad_norm": 0.3459148821492143, + "learning_rate": 2.277057471382836e-06, + "loss": 0.3103, + "step": 13706 + }, + { + "epoch": 0.79, + "grad_norm": 0.3780665332879215, + "learning_rate": 2.2758754325172194e-06, + "loss": 0.2764, + "step": 13707 + }, + { + "epoch": 0.79, + "grad_norm": 0.7272877027788468, + "learning_rate": 2.274693661137707e-06, + "loss": 0.3907, + "step": 13708 + }, + { + "epoch": 0.79, + "grad_norm": 0.16950972666281172, + "learning_rate": 2.273512157285215e-06, + "loss": 0.0706, + "step": 13709 + }, + { + "epoch": 0.79, + "grad_norm": 0.40555071835046586, + "learning_rate": 2.272330921000667e-06, + "loss": 0.2703, + "step": 13710 + }, + { + "epoch": 0.79, + "grad_norm": 0.3774955734684031, + "learning_rate": 2.271149952324968e-06, + "loss": 0.3077, + "step": 13711 + }, + { + "epoch": 0.79, + "grad_norm": 0.6892753385861591, + "learning_rate": 2.2699692512990135e-06, + "loss": 0.2721, + "step": 13712 + }, + { + "epoch": 0.79, + "grad_norm": 0.341879768930826, + "learning_rate": 2.268788817963692e-06, + "loss": 0.2822, + "step": 13713 + }, + { + "epoch": 0.79, + "grad_norm": 0.36570922717372284, + "learning_rate": 2.2676086523598773e-06, + "loss": 0.3289, + "step": 13714 + }, + { + "epoch": 0.79, + "grad_norm": 0.3003448965263176, + "learning_rate": 2.266428754528446e-06, + "loss": 0.0964, + "step": 13715 + }, + { + "epoch": 0.79, + "grad_norm": 0.33720536672084483, + "learning_rate": 2.2652491245102537e-06, + "loss": 0.2491, + "step": 13716 + }, + { + "epoch": 0.79, + "grad_norm": 1.161787269663438, + "learning_rate": 2.2640697623461517e-06, + "loss": 0.5815, + "step": 13717 + }, + { + "epoch": 0.79, + "grad_norm": 0.29078226584492334, + "learning_rate": 2.262890668076979e-06, + "loss": 0.2155, + "step": 13718 + }, + { + "epoch": 0.79, + "grad_norm": 0.36384414626106976, + "learning_rate": 2.2617118417435725e-06, + "loss": 0.2685, + "step": 13719 + }, + { + "epoch": 0.79, + "grad_norm": 0.9616109497695068, + "learning_rate": 2.260533283386751e-06, + "loss": 0.4991, + "step": 13720 + }, + { + "epoch": 0.79, + "grad_norm": 0.30110611270374804, + "learning_rate": 2.25935499304733e-06, + "loss": 0.1999, + "step": 13721 + }, + { + "epoch": 0.79, + "grad_norm": 0.23948729147878228, + "learning_rate": 2.2581769707661107e-06, + "loss": 0.2069, + "step": 13722 + }, + { + "epoch": 0.79, + "grad_norm": 1.4181780988657122, + "learning_rate": 2.256999216583892e-06, + "loss": 0.4937, + "step": 13723 + }, + { + "epoch": 0.79, + "grad_norm": 0.7846008419436977, + "learning_rate": 2.2558217305414564e-06, + "loss": 0.4049, + "step": 13724 + }, + { + "epoch": 0.79, + "grad_norm": 0.2869992147760968, + "learning_rate": 2.2546445126795822e-06, + "loss": 0.1819, + "step": 13725 + }, + { + "epoch": 0.79, + "grad_norm": 0.34869844428817903, + "learning_rate": 2.2534675630390366e-06, + "loss": 0.3234, + "step": 13726 + }, + { + "epoch": 0.79, + "grad_norm": 0.3008860845713572, + "learning_rate": 2.2522908816605716e-06, + "loss": 0.1825, + "step": 13727 + }, + { + "epoch": 0.79, + "grad_norm": 0.28978102239520515, + "learning_rate": 2.251114468584944e-06, + "loss": 0.1858, + "step": 13728 + }, + { + "epoch": 0.79, + "grad_norm": 0.8083556054139469, + "learning_rate": 2.2499383238528894e-06, + "loss": 0.4195, + "step": 13729 + }, + { + "epoch": 0.79, + "grad_norm": 0.3769533565114214, + "learning_rate": 2.2487624475051364e-06, + "loss": 0.2837, + "step": 13730 + }, + { + "epoch": 0.79, + "grad_norm": 0.39109306811626965, + "learning_rate": 2.2475868395824043e-06, + "loss": 0.2734, + "step": 13731 + }, + { + "epoch": 0.79, + "grad_norm": 0.7981003418145786, + "learning_rate": 2.2464115001254096e-06, + "loss": 0.3579, + "step": 13732 + }, + { + "epoch": 0.79, + "grad_norm": 0.4237085330933973, + "learning_rate": 2.245236429174851e-06, + "loss": 0.2282, + "step": 13733 + }, + { + "epoch": 0.79, + "grad_norm": 0.26938040168964505, + "learning_rate": 2.244061626771421e-06, + "loss": 0.2416, + "step": 13734 + }, + { + "epoch": 0.79, + "grad_norm": 0.33512199491603817, + "learning_rate": 2.2428870929558012e-06, + "loss": 0.2038, + "step": 13735 + }, + { + "epoch": 0.79, + "grad_norm": 0.7622171998110745, + "learning_rate": 2.2417128277686694e-06, + "loss": 0.4113, + "step": 13736 + }, + { + "epoch": 0.79, + "grad_norm": 0.34080168091289903, + "learning_rate": 2.2405388312506903e-06, + "loss": 0.2377, + "step": 13737 + }, + { + "epoch": 0.79, + "grad_norm": 0.34730323144893105, + "learning_rate": 2.239365103442517e-06, + "loss": 0.2583, + "step": 13738 + }, + { + "epoch": 0.79, + "grad_norm": 1.0072696237450594, + "learning_rate": 2.238191644384794e-06, + "loss": 0.5047, + "step": 13739 + }, + { + "epoch": 0.79, + "grad_norm": 0.24308109364594496, + "learning_rate": 2.237018454118163e-06, + "loss": 0.2012, + "step": 13740 + }, + { + "epoch": 0.79, + "grad_norm": 0.5470089584947886, + "learning_rate": 2.2358455326832496e-06, + "loss": 0.2059, + "step": 13741 + }, + { + "epoch": 0.79, + "grad_norm": 0.34034621021093414, + "learning_rate": 2.234672880120674e-06, + "loss": 0.2917, + "step": 13742 + }, + { + "epoch": 0.79, + "grad_norm": 0.3159765482745945, + "learning_rate": 2.233500496471037e-06, + "loss": 0.2669, + "step": 13743 + }, + { + "epoch": 0.79, + "grad_norm": 1.1613176017519513, + "learning_rate": 2.2323283817749463e-06, + "loss": 0.7208, + "step": 13744 + }, + { + "epoch": 0.79, + "grad_norm": 0.338617416056063, + "learning_rate": 2.2311565360729903e-06, + "loss": 0.2283, + "step": 13745 + }, + { + "epoch": 0.79, + "grad_norm": 0.3873334398545606, + "learning_rate": 2.2299849594057487e-06, + "loss": 0.2524, + "step": 13746 + }, + { + "epoch": 0.79, + "grad_norm": 0.2770440136651813, + "learning_rate": 2.2288136518137914e-06, + "loss": 0.2116, + "step": 13747 + }, + { + "epoch": 0.79, + "grad_norm": 0.678257433651019, + "learning_rate": 2.227642613337686e-06, + "loss": 0.2842, + "step": 13748 + }, + { + "epoch": 0.79, + "grad_norm": 0.34415897142683244, + "learning_rate": 2.2264718440179835e-06, + "loss": 0.2728, + "step": 13749 + }, + { + "epoch": 0.79, + "grad_norm": 0.3417732021973484, + "learning_rate": 2.2253013438952253e-06, + "loss": 0.2948, + "step": 13750 + }, + { + "epoch": 0.79, + "grad_norm": 1.7331090075356483, + "learning_rate": 2.224131113009945e-06, + "loss": 0.1448, + "step": 13751 + }, + { + "epoch": 0.79, + "grad_norm": 0.35091864102953746, + "learning_rate": 2.222961151402674e-06, + "loss": 0.2522, + "step": 13752 + }, + { + "epoch": 0.79, + "grad_norm": 0.44152654129417956, + "learning_rate": 2.2217914591139222e-06, + "loss": 0.3178, + "step": 13753 + }, + { + "epoch": 0.79, + "grad_norm": 0.2511167710167237, + "learning_rate": 2.2206220361841978e-06, + "loss": 0.14, + "step": 13754 + }, + { + "epoch": 0.79, + "grad_norm": 0.31288446501141626, + "learning_rate": 2.2194528826539984e-06, + "loss": 0.2369, + "step": 13755 + }, + { + "epoch": 0.79, + "grad_norm": 1.312231502877169, + "learning_rate": 2.218283998563808e-06, + "loss": 0.6555, + "step": 13756 + }, + { + "epoch": 0.79, + "grad_norm": 0.5150153647059441, + "learning_rate": 2.2171153839541114e-06, + "loss": 0.3475, + "step": 13757 + }, + { + "epoch": 0.79, + "grad_norm": 0.27675543088755755, + "learning_rate": 2.2159470388653737e-06, + "loss": 0.2179, + "step": 13758 + }, + { + "epoch": 0.79, + "grad_norm": 0.8129461695250663, + "learning_rate": 2.2147789633380555e-06, + "loss": 0.401, + "step": 13759 + }, + { + "epoch": 0.79, + "grad_norm": 0.3186014787104289, + "learning_rate": 2.213611157412605e-06, + "loss": 0.1735, + "step": 13760 + }, + { + "epoch": 0.79, + "grad_norm": 0.2907238321322088, + "learning_rate": 2.2124436211294676e-06, + "loss": 0.1941, + "step": 13761 + }, + { + "epoch": 0.79, + "grad_norm": 0.4140887679030627, + "learning_rate": 2.2112763545290728e-06, + "loss": 0.3082, + "step": 13762 + }, + { + "epoch": 0.79, + "grad_norm": 1.1476560055281388, + "learning_rate": 2.2101093576518416e-06, + "loss": 0.3793, + "step": 13763 + }, + { + "epoch": 0.79, + "grad_norm": 0.3858913981085996, + "learning_rate": 2.2089426305381865e-06, + "loss": 0.2011, + "step": 13764 + }, + { + "epoch": 0.79, + "grad_norm": 0.39061485494929044, + "learning_rate": 2.2077761732285165e-06, + "loss": 0.3134, + "step": 13765 + }, + { + "epoch": 0.79, + "grad_norm": 0.31571158619901163, + "learning_rate": 2.206609985763222e-06, + "loss": 0.2483, + "step": 13766 + }, + { + "epoch": 0.79, + "grad_norm": 0.3321054971659981, + "learning_rate": 2.2054440681826896e-06, + "loss": 0.1801, + "step": 13767 + }, + { + "epoch": 0.79, + "grad_norm": 0.34644794016271147, + "learning_rate": 2.2042784205272927e-06, + "loss": 0.2443, + "step": 13768 + }, + { + "epoch": 0.79, + "grad_norm": 0.48747689057722515, + "learning_rate": 2.203113042837396e-06, + "loss": 0.324, + "step": 13769 + }, + { + "epoch": 0.79, + "grad_norm": 0.3253743126407448, + "learning_rate": 2.2019479351533625e-06, + "loss": 0.2552, + "step": 13770 + }, + { + "epoch": 0.79, + "grad_norm": 0.5069718524012903, + "learning_rate": 2.2007830975155366e-06, + "loss": 0.2568, + "step": 13771 + }, + { + "epoch": 0.79, + "grad_norm": 1.1996810800114002, + "learning_rate": 2.199618529964257e-06, + "loss": 0.6718, + "step": 13772 + }, + { + "epoch": 0.79, + "grad_norm": 0.3826603271787008, + "learning_rate": 2.198454232539848e-06, + "loss": 0.2895, + "step": 13773 + }, + { + "epoch": 0.79, + "grad_norm": 0.19718297829960185, + "learning_rate": 2.1972902052826384e-06, + "loss": 0.1763, + "step": 13774 + }, + { + "epoch": 0.79, + "grad_norm": 0.6762841450323746, + "learning_rate": 2.1961264482329326e-06, + "loss": 0.3604, + "step": 13775 + }, + { + "epoch": 0.79, + "grad_norm": 0.523526720693821, + "learning_rate": 2.194962961431032e-06, + "loss": 0.2805, + "step": 13776 + }, + { + "epoch": 0.79, + "grad_norm": 0.348195317352207, + "learning_rate": 2.1937997449172287e-06, + "loss": 0.2318, + "step": 13777 + }, + { + "epoch": 0.79, + "grad_norm": 0.49968051927791735, + "learning_rate": 2.192636798731804e-06, + "loss": 0.3517, + "step": 13778 + }, + { + "epoch": 0.79, + "grad_norm": 0.3862758209154229, + "learning_rate": 2.1914741229150315e-06, + "loss": 0.2706, + "step": 13779 + }, + { + "epoch": 0.79, + "grad_norm": 0.20524174445611812, + "learning_rate": 2.1903117175071754e-06, + "loss": 0.1168, + "step": 13780 + }, + { + "epoch": 0.79, + "grad_norm": 0.33607769190284076, + "learning_rate": 2.1891495825484856e-06, + "loss": 0.2942, + "step": 13781 + }, + { + "epoch": 0.79, + "grad_norm": 0.681396311351655, + "learning_rate": 2.1879877180792117e-06, + "loss": 0.3332, + "step": 13782 + }, + { + "epoch": 0.79, + "grad_norm": 0.3897404025541323, + "learning_rate": 2.186826124139587e-06, + "loss": 0.2887, + "step": 13783 + }, + { + "epoch": 0.79, + "grad_norm": 1.199733904627748, + "learning_rate": 2.185664800769839e-06, + "loss": 0.3483, + "step": 13784 + }, + { + "epoch": 0.79, + "grad_norm": 0.3624521652254083, + "learning_rate": 2.1845037480101793e-06, + "loss": 0.2673, + "step": 13785 + }, + { + "epoch": 0.79, + "grad_norm": 0.23852031573910368, + "learning_rate": 2.183342965900821e-06, + "loss": 0.2303, + "step": 13786 + }, + { + "epoch": 0.79, + "grad_norm": 0.6022579071701457, + "learning_rate": 2.18218245448196e-06, + "loss": 0.2888, + "step": 13787 + }, + { + "epoch": 0.79, + "grad_norm": 0.29140171805652, + "learning_rate": 2.1810222137937855e-06, + "loss": 0.1838, + "step": 13788 + }, + { + "epoch": 0.79, + "grad_norm": 0.3210911846698981, + "learning_rate": 2.1798622438764716e-06, + "loss": 0.2784, + "step": 13789 + }, + { + "epoch": 0.79, + "grad_norm": 0.4872387173728299, + "learning_rate": 2.1787025447701947e-06, + "loss": 0.2583, + "step": 13790 + }, + { + "epoch": 0.79, + "grad_norm": 0.3987621580560564, + "learning_rate": 2.177543116515113e-06, + "loss": 0.286, + "step": 13791 + }, + { + "epoch": 0.79, + "grad_norm": 0.4894773404094698, + "learning_rate": 2.176383959151377e-06, + "loss": 0.3393, + "step": 13792 + }, + { + "epoch": 0.79, + "grad_norm": 0.2736323672900733, + "learning_rate": 2.175225072719127e-06, + "loss": 0.2155, + "step": 13793 + }, + { + "epoch": 0.79, + "grad_norm": 0.2835026201314597, + "learning_rate": 2.174066457258495e-06, + "loss": 0.1832, + "step": 13794 + }, + { + "epoch": 0.79, + "grad_norm": 0.5998237761061286, + "learning_rate": 2.172908112809606e-06, + "loss": 0.3561, + "step": 13795 + }, + { + "epoch": 0.79, + "grad_norm": 1.1528927782639422, + "learning_rate": 2.1717500394125735e-06, + "loss": 0.6232, + "step": 13796 + }, + { + "epoch": 0.79, + "grad_norm": 0.2608272389082933, + "learning_rate": 2.1705922371075005e-06, + "loss": 0.2213, + "step": 13797 + }, + { + "epoch": 0.79, + "grad_norm": 0.5306546366421715, + "learning_rate": 2.169434705934479e-06, + "loss": 0.308, + "step": 13798 + }, + { + "epoch": 0.79, + "grad_norm": 0.4938087384739634, + "learning_rate": 2.1682774459335987e-06, + "loss": 0.2987, + "step": 13799 + }, + { + "epoch": 0.79, + "grad_norm": 0.1622853869015634, + "learning_rate": 2.1671204571449345e-06, + "loss": 0.0694, + "step": 13800 + }, + { + "epoch": 0.79, + "grad_norm": 0.3014907139564795, + "learning_rate": 2.165963739608552e-06, + "loss": 0.274, + "step": 13801 + }, + { + "epoch": 0.79, + "grad_norm": 0.4934739195565253, + "learning_rate": 2.164807293364506e-06, + "loss": 0.3363, + "step": 13802 + }, + { + "epoch": 0.79, + "grad_norm": 0.4667111156007383, + "learning_rate": 2.1636511184528484e-06, + "loss": 0.1946, + "step": 13803 + }, + { + "epoch": 0.79, + "grad_norm": 0.4077960646746423, + "learning_rate": 2.162495214913616e-06, + "loss": 0.3125, + "step": 13804 + }, + { + "epoch": 0.79, + "grad_norm": 0.32122695724294886, + "learning_rate": 2.1613395827868366e-06, + "loss": 0.2702, + "step": 13805 + }, + { + "epoch": 0.79, + "grad_norm": 0.14965512931372155, + "learning_rate": 2.160184222112531e-06, + "loss": 0.0691, + "step": 13806 + }, + { + "epoch": 0.79, + "grad_norm": 0.4639504102647323, + "learning_rate": 2.159029132930707e-06, + "loss": 0.2847, + "step": 13807 + }, + { + "epoch": 0.79, + "grad_norm": 1.364104155468768, + "learning_rate": 2.1578743152813676e-06, + "loss": 0.7156, + "step": 13808 + }, + { + "epoch": 0.79, + "grad_norm": 0.3139451351224941, + "learning_rate": 2.156719769204505e-06, + "loss": 0.2972, + "step": 13809 + }, + { + "epoch": 0.79, + "grad_norm": 0.3411874232698844, + "learning_rate": 2.155565494740098e-06, + "loss": 0.2292, + "step": 13810 + }, + { + "epoch": 0.79, + "grad_norm": 1.5518338776407492, + "learning_rate": 2.1544114919281223e-06, + "loss": 0.616, + "step": 13811 + }, + { + "epoch": 0.79, + "grad_norm": 0.20569821108771164, + "learning_rate": 2.153257760808538e-06, + "loss": 0.146, + "step": 13812 + }, + { + "epoch": 0.79, + "grad_norm": 0.2839430285999692, + "learning_rate": 2.152104301421302e-06, + "loss": 0.2202, + "step": 13813 + }, + { + "epoch": 0.79, + "grad_norm": 0.4522044299979987, + "learning_rate": 2.150951113806351e-06, + "loss": 0.3257, + "step": 13814 + }, + { + "epoch": 0.79, + "grad_norm": 0.4669588799542236, + "learning_rate": 2.1497981980036297e-06, + "loss": 0.3026, + "step": 13815 + }, + { + "epoch": 0.79, + "grad_norm": 0.37379373753426265, + "learning_rate": 2.1486455540530593e-06, + "loss": 0.2319, + "step": 13816 + }, + { + "epoch": 0.79, + "grad_norm": 0.3565690261654594, + "learning_rate": 2.1474931819945555e-06, + "loss": 0.3044, + "step": 13817 + }, + { + "epoch": 0.79, + "grad_norm": 0.27704374870348236, + "learning_rate": 2.1463410818680253e-06, + "loss": 0.1727, + "step": 13818 + }, + { + "epoch": 0.79, + "grad_norm": 0.3302566985170461, + "learning_rate": 2.1451892537133624e-06, + "loss": 0.2454, + "step": 13819 + }, + { + "epoch": 0.79, + "grad_norm": 0.41198364612002364, + "learning_rate": 2.1440376975704614e-06, + "loss": 0.2576, + "step": 13820 + }, + { + "epoch": 0.79, + "grad_norm": 0.42590625607150334, + "learning_rate": 2.142886413479197e-06, + "loss": 0.3075, + "step": 13821 + }, + { + "epoch": 0.79, + "grad_norm": 0.3323355752642059, + "learning_rate": 2.1417354014794378e-06, + "loss": 0.2635, + "step": 13822 + }, + { + "epoch": 0.79, + "grad_norm": 1.287156112327123, + "learning_rate": 2.1405846616110416e-06, + "loss": 0.3389, + "step": 13823 + }, + { + "epoch": 0.79, + "grad_norm": 0.27619945652710726, + "learning_rate": 2.1394341939138618e-06, + "loss": 0.149, + "step": 13824 + }, + { + "epoch": 0.79, + "grad_norm": 0.2555642275552039, + "learning_rate": 2.1382839984277395e-06, + "loss": 0.2574, + "step": 13825 + }, + { + "epoch": 0.79, + "grad_norm": 0.687890090018683, + "learning_rate": 2.137134075192504e-06, + "loss": 0.2607, + "step": 13826 + }, + { + "epoch": 0.79, + "grad_norm": 0.6251334392979514, + "learning_rate": 2.135984424247974e-06, + "loss": 0.3902, + "step": 13827 + }, + { + "epoch": 0.79, + "grad_norm": 0.35576024720872323, + "learning_rate": 2.1348350456339684e-06, + "loss": 0.2901, + "step": 13828 + }, + { + "epoch": 0.79, + "grad_norm": 0.33393058087833427, + "learning_rate": 2.1336859393902864e-06, + "loss": 0.2403, + "step": 13829 + }, + { + "epoch": 0.79, + "grad_norm": 0.3272136365997171, + "learning_rate": 2.1325371055567236e-06, + "loss": 0.16, + "step": 13830 + }, + { + "epoch": 0.79, + "grad_norm": 0.38288933342849923, + "learning_rate": 2.1313885441730607e-06, + "loss": 0.2884, + "step": 13831 + }, + { + "epoch": 0.79, + "grad_norm": 0.6989908981679257, + "learning_rate": 2.1302402552790723e-06, + "loss": 0.3487, + "step": 13832 + }, + { + "epoch": 0.79, + "grad_norm": 0.26166129345641775, + "learning_rate": 2.1290922389145284e-06, + "loss": 0.2073, + "step": 13833 + }, + { + "epoch": 0.79, + "grad_norm": 0.38623162282324663, + "learning_rate": 2.1279444951191806e-06, + "loss": 0.3036, + "step": 13834 + }, + { + "epoch": 0.79, + "grad_norm": 1.1632051177823823, + "learning_rate": 2.1267970239327773e-06, + "loss": 0.5774, + "step": 13835 + }, + { + "epoch": 0.79, + "grad_norm": 0.27563291221288966, + "learning_rate": 2.1256498253950518e-06, + "loss": 0.1642, + "step": 13836 + }, + { + "epoch": 0.8, + "grad_norm": 0.27018901283686697, + "learning_rate": 2.124502899545737e-06, + "loss": 0.2479, + "step": 13837 + }, + { + "epoch": 0.8, + "grad_norm": 0.5005766607922495, + "learning_rate": 2.1233562464245483e-06, + "loss": 0.2709, + "step": 13838 + }, + { + "epoch": 0.8, + "grad_norm": 1.167362155705901, + "learning_rate": 2.122209866071194e-06, + "loss": 0.4289, + "step": 13839 + }, + { + "epoch": 0.8, + "grad_norm": 0.3193880909220907, + "learning_rate": 2.12106375852537e-06, + "loss": 0.2474, + "step": 13840 + }, + { + "epoch": 0.8, + "grad_norm": 0.35005318057106716, + "learning_rate": 2.119917923826773e-06, + "loss": 0.2889, + "step": 13841 + }, + { + "epoch": 0.8, + "grad_norm": 0.33854876303697296, + "learning_rate": 2.118772362015078e-06, + "loss": 0.0908, + "step": 13842 + }, + { + "epoch": 0.8, + "grad_norm": 0.3633576105580375, + "learning_rate": 2.117627073129961e-06, + "loss": 0.2801, + "step": 13843 + }, + { + "epoch": 0.8, + "grad_norm": 0.42219924579575663, + "learning_rate": 2.1164820572110734e-06, + "loss": 0.2845, + "step": 13844 + }, + { + "epoch": 0.8, + "grad_norm": 0.35881211061754714, + "learning_rate": 2.115337314298077e-06, + "loss": 0.3175, + "step": 13845 + }, + { + "epoch": 0.8, + "grad_norm": 0.2868125822886127, + "learning_rate": 2.1141928444306094e-06, + "loss": 0.2033, + "step": 13846 + }, + { + "epoch": 0.8, + "grad_norm": 1.1661974844904361, + "learning_rate": 2.113048647648305e-06, + "loss": 0.4777, + "step": 13847 + }, + { + "epoch": 0.8, + "grad_norm": 0.3302764354075652, + "learning_rate": 2.1119047239907833e-06, + "loss": 0.2756, + "step": 13848 + }, + { + "epoch": 0.8, + "grad_norm": 0.29027735567796065, + "learning_rate": 2.110761073497665e-06, + "loss": 0.1903, + "step": 13849 + }, + { + "epoch": 0.8, + "grad_norm": 0.48117388677328654, + "learning_rate": 2.1096176962085513e-06, + "loss": 0.262, + "step": 13850 + }, + { + "epoch": 0.8, + "grad_norm": 0.4337568500734956, + "learning_rate": 2.1084745921630377e-06, + "loss": 0.3096, + "step": 13851 + }, + { + "epoch": 0.8, + "grad_norm": 0.284048549328376, + "learning_rate": 2.107331761400707e-06, + "loss": 0.1813, + "step": 13852 + }, + { + "epoch": 0.8, + "grad_norm": 0.3678509849823864, + "learning_rate": 2.1061892039611407e-06, + "loss": 0.2978, + "step": 13853 + }, + { + "epoch": 0.8, + "grad_norm": 0.9895870132408121, + "learning_rate": 2.105046919883903e-06, + "loss": 0.4123, + "step": 13854 + }, + { + "epoch": 0.8, + "grad_norm": 0.30690631485293857, + "learning_rate": 2.1039049092085507e-06, + "loss": 0.2116, + "step": 13855 + }, + { + "epoch": 0.8, + "grad_norm": 0.2885275065069769, + "learning_rate": 2.102763171974629e-06, + "loss": 0.232, + "step": 13856 + }, + { + "epoch": 0.8, + "grad_norm": 0.39902371245566787, + "learning_rate": 2.1016217082216815e-06, + "loss": 0.2799, + "step": 13857 + }, + { + "epoch": 0.8, + "grad_norm": 0.31971621759090113, + "learning_rate": 2.100480517989235e-06, + "loss": 0.259, + "step": 13858 + }, + { + "epoch": 0.8, + "grad_norm": 0.8751263325898722, + "learning_rate": 2.099339601316809e-06, + "loss": 0.2952, + "step": 13859 + }, + { + "epoch": 0.8, + "grad_norm": 0.34455456165955256, + "learning_rate": 2.098198958243911e-06, + "loss": 0.303, + "step": 13860 + }, + { + "epoch": 0.8, + "grad_norm": 0.3229178337704223, + "learning_rate": 2.0970585888100425e-06, + "loss": 0.266, + "step": 13861 + }, + { + "epoch": 0.8, + "grad_norm": 0.8776049310548688, + "learning_rate": 2.0959184930546973e-06, + "loss": 0.2564, + "step": 13862 + }, + { + "epoch": 0.8, + "grad_norm": 0.47569729397496585, + "learning_rate": 2.0947786710173545e-06, + "loss": 0.2821, + "step": 13863 + }, + { + "epoch": 0.8, + "grad_norm": 0.2504288310318657, + "learning_rate": 2.0936391227374874e-06, + "loss": 0.2158, + "step": 13864 + }, + { + "epoch": 0.8, + "grad_norm": 0.35032213602144774, + "learning_rate": 2.0924998482545535e-06, + "loss": 0.2503, + "step": 13865 + }, + { + "epoch": 0.8, + "grad_norm": 0.6552856340397468, + "learning_rate": 2.0913608476080138e-06, + "loss": 0.3592, + "step": 13866 + }, + { + "epoch": 0.8, + "grad_norm": 0.3133790814738462, + "learning_rate": 2.090222120837306e-06, + "loss": 0.2509, + "step": 13867 + }, + { + "epoch": 0.8, + "grad_norm": 0.47362331697258986, + "learning_rate": 2.089083667981868e-06, + "loss": 0.2264, + "step": 13868 + }, + { + "epoch": 0.8, + "grad_norm": 0.4945426270079345, + "learning_rate": 2.087945489081119e-06, + "loss": 0.4079, + "step": 13869 + }, + { + "epoch": 0.8, + "grad_norm": 0.3993652479527239, + "learning_rate": 2.0868075841744795e-06, + "loss": 0.2696, + "step": 13870 + }, + { + "epoch": 0.8, + "grad_norm": 0.26067501072855986, + "learning_rate": 2.0856699533013535e-06, + "loss": 0.1919, + "step": 13871 + }, + { + "epoch": 0.8, + "grad_norm": 0.3616882633023731, + "learning_rate": 2.0845325965011375e-06, + "loss": 0.264, + "step": 13872 + }, + { + "epoch": 0.8, + "grad_norm": 0.3935744589887008, + "learning_rate": 2.083395513813217e-06, + "loss": 0.277, + "step": 13873 + }, + { + "epoch": 0.8, + "grad_norm": 0.662955242143687, + "learning_rate": 2.082258705276966e-06, + "loss": 0.3611, + "step": 13874 + }, + { + "epoch": 0.8, + "grad_norm": 1.0458268635013401, + "learning_rate": 2.0811221709317587e-06, + "loss": 0.5303, + "step": 13875 + }, + { + "epoch": 0.8, + "grad_norm": 0.23084717455090384, + "learning_rate": 2.0799859108169496e-06, + "loss": 0.2034, + "step": 13876 + }, + { + "epoch": 0.8, + "grad_norm": 0.3093465003844103, + "learning_rate": 2.0788499249718887e-06, + "loss": 0.2502, + "step": 13877 + }, + { + "epoch": 0.8, + "grad_norm": 1.0381983387677298, + "learning_rate": 2.077714213435914e-06, + "loss": 0.2885, + "step": 13878 + }, + { + "epoch": 0.8, + "grad_norm": 0.32579748606769765, + "learning_rate": 2.0765787762483545e-06, + "loss": 0.2467, + "step": 13879 + }, + { + "epoch": 0.8, + "grad_norm": 1.1957787010002299, + "learning_rate": 2.075443613448532e-06, + "loss": 0.4765, + "step": 13880 + }, + { + "epoch": 0.8, + "grad_norm": 0.33500602148308517, + "learning_rate": 2.0743087250757544e-06, + "loss": 0.2632, + "step": 13881 + }, + { + "epoch": 0.8, + "grad_norm": 0.3294045154993524, + "learning_rate": 2.073174111169327e-06, + "loss": 0.2471, + "step": 13882 + }, + { + "epoch": 0.8, + "grad_norm": 0.8524795933817103, + "learning_rate": 2.072039771768539e-06, + "loss": 0.3734, + "step": 13883 + }, + { + "epoch": 0.8, + "grad_norm": 0.21219881196196438, + "learning_rate": 2.0709057069126726e-06, + "loss": 0.2046, + "step": 13884 + }, + { + "epoch": 0.8, + "grad_norm": 0.3405533089230494, + "learning_rate": 2.0697719166410013e-06, + "loss": 0.2002, + "step": 13885 + }, + { + "epoch": 0.8, + "grad_norm": 1.5915861638020246, + "learning_rate": 2.068638400992784e-06, + "loss": 0.757, + "step": 13886 + }, + { + "epoch": 0.8, + "grad_norm": 1.2417936476128963, + "learning_rate": 2.0675051600072817e-06, + "loss": 0.7115, + "step": 13887 + }, + { + "epoch": 0.8, + "grad_norm": 0.2680697096757186, + "learning_rate": 2.0663721937237334e-06, + "loss": 0.1963, + "step": 13888 + }, + { + "epoch": 0.8, + "grad_norm": 0.3929872162313532, + "learning_rate": 2.0652395021813752e-06, + "loss": 0.3059, + "step": 13889 + }, + { + "epoch": 0.8, + "grad_norm": 0.36593534101852726, + "learning_rate": 2.064107085419429e-06, + "loss": 0.1969, + "step": 13890 + }, + { + "epoch": 0.8, + "grad_norm": 0.32655675523124694, + "learning_rate": 2.062974943477116e-06, + "loss": 0.1665, + "step": 13891 + }, + { + "epoch": 0.8, + "grad_norm": 0.29384082638874254, + "learning_rate": 2.0618430763936402e-06, + "loss": 0.2684, + "step": 13892 + }, + { + "epoch": 0.8, + "grad_norm": 1.209531727244767, + "learning_rate": 2.0607114842081966e-06, + "loss": 0.7586, + "step": 13893 + }, + { + "epoch": 0.8, + "grad_norm": 0.3941791338393727, + "learning_rate": 2.0595801669599704e-06, + "loss": 0.2967, + "step": 13894 + }, + { + "epoch": 0.8, + "grad_norm": 0.5237764192291912, + "learning_rate": 2.0584491246881443e-06, + "loss": 0.2825, + "step": 13895 + }, + { + "epoch": 0.8, + "grad_norm": 0.2794762521852401, + "learning_rate": 2.0573183574318832e-06, + "loss": 0.249, + "step": 13896 + }, + { + "epoch": 0.8, + "grad_norm": 0.2774111684414447, + "learning_rate": 2.0561878652303458e-06, + "loss": 0.1959, + "step": 13897 + }, + { + "epoch": 0.8, + "grad_norm": 0.5438540602781272, + "learning_rate": 2.0550576481226814e-06, + "loss": 0.212, + "step": 13898 + }, + { + "epoch": 0.8, + "grad_norm": 0.6511150660244753, + "learning_rate": 2.0539277061480256e-06, + "loss": 0.4511, + "step": 13899 + }, + { + "epoch": 0.8, + "grad_norm": 0.2423417364066042, + "learning_rate": 2.0527980393455147e-06, + "loss": 0.2466, + "step": 13900 + }, + { + "epoch": 0.8, + "grad_norm": 0.47122913695901136, + "learning_rate": 2.051668647754267e-06, + "loss": 0.2733, + "step": 13901 + }, + { + "epoch": 0.8, + "grad_norm": 0.27193278941468646, + "learning_rate": 2.0505395314133915e-06, + "loss": 0.156, + "step": 13902 + }, + { + "epoch": 0.8, + "grad_norm": 0.5962094572756668, + "learning_rate": 2.049410690361987e-06, + "loss": 0.304, + "step": 13903 + }, + { + "epoch": 0.8, + "grad_norm": 0.26728260818607263, + "learning_rate": 2.0482821246391515e-06, + "loss": 0.2332, + "step": 13904 + }, + { + "epoch": 0.8, + "grad_norm": 0.7884511286921686, + "learning_rate": 2.0471538342839637e-06, + "loss": 0.4379, + "step": 13905 + }, + { + "epoch": 0.8, + "grad_norm": 0.5466734579354183, + "learning_rate": 2.0460258193354963e-06, + "loss": 0.3473, + "step": 13906 + }, + { + "epoch": 0.8, + "grad_norm": 0.39502877345893916, + "learning_rate": 2.0448980798328113e-06, + "loss": 0.3107, + "step": 13907 + }, + { + "epoch": 0.8, + "grad_norm": 0.19179045423602922, + "learning_rate": 2.043770615814966e-06, + "loss": 0.1632, + "step": 13908 + }, + { + "epoch": 0.8, + "grad_norm": 0.5438731088748233, + "learning_rate": 2.0426434273210016e-06, + "loss": 0.2428, + "step": 13909 + }, + { + "epoch": 0.8, + "grad_norm": 0.3807065412872983, + "learning_rate": 2.041516514389954e-06, + "loss": 0.2902, + "step": 13910 + }, + { + "epoch": 0.8, + "grad_norm": 0.7299876932540644, + "learning_rate": 2.0403898770608466e-06, + "loss": 0.3083, + "step": 13911 + }, + { + "epoch": 0.8, + "grad_norm": 0.3073229067569145, + "learning_rate": 2.0392635153726958e-06, + "loss": 0.2584, + "step": 13912 + }, + { + "epoch": 0.8, + "grad_norm": 0.38588900994842523, + "learning_rate": 2.0381374293645072e-06, + "loss": 0.2999, + "step": 13913 + }, + { + "epoch": 0.8, + "grad_norm": 0.47918109962924327, + "learning_rate": 2.0370116190752763e-06, + "loss": 0.1199, + "step": 13914 + }, + { + "epoch": 0.8, + "grad_norm": 0.3032568846110088, + "learning_rate": 2.035886084543989e-06, + "loss": 0.19, + "step": 13915 + }, + { + "epoch": 0.8, + "grad_norm": 0.3048366839804276, + "learning_rate": 2.0347608258096263e-06, + "loss": 0.2932, + "step": 13916 + }, + { + "epoch": 0.8, + "grad_norm": 0.860092659438108, + "learning_rate": 2.0336358429111534e-06, + "loss": 0.3054, + "step": 13917 + }, + { + "epoch": 0.8, + "grad_norm": 0.410237488915634, + "learning_rate": 2.0325111358875295e-06, + "loss": 0.2969, + "step": 13918 + }, + { + "epoch": 0.8, + "grad_norm": 0.5844699302701855, + "learning_rate": 2.031386704777698e-06, + "loss": 0.3557, + "step": 13919 + }, + { + "epoch": 0.8, + "grad_norm": 0.3807335069363862, + "learning_rate": 2.0302625496206065e-06, + "loss": 0.2834, + "step": 13920 + }, + { + "epoch": 0.8, + "grad_norm": 0.19674851030297955, + "learning_rate": 2.0291386704551795e-06, + "loss": 0.0822, + "step": 13921 + }, + { + "epoch": 0.8, + "grad_norm": 0.4132502470459073, + "learning_rate": 2.028015067320338e-06, + "loss": 0.3011, + "step": 13922 + }, + { + "epoch": 0.8, + "grad_norm": 0.6473087562464866, + "learning_rate": 2.0268917402549914e-06, + "loss": 0.3178, + "step": 13923 + }, + { + "epoch": 0.8, + "grad_norm": 0.36673530265989734, + "learning_rate": 2.0257686892980387e-06, + "loss": 0.2243, + "step": 13924 + }, + { + "epoch": 0.8, + "grad_norm": 0.36203099449274834, + "learning_rate": 2.0246459144883767e-06, + "loss": 0.2867, + "step": 13925 + }, + { + "epoch": 0.8, + "grad_norm": 0.5054331029168437, + "learning_rate": 2.023523415864883e-06, + "loss": 0.2783, + "step": 13926 + }, + { + "epoch": 0.8, + "grad_norm": 0.28828519889572235, + "learning_rate": 2.02240119346643e-06, + "loss": 0.0938, + "step": 13927 + }, + { + "epoch": 0.8, + "grad_norm": 0.3016104369032031, + "learning_rate": 2.0212792473318788e-06, + "loss": 0.2868, + "step": 13928 + }, + { + "epoch": 0.8, + "grad_norm": 0.6227814296163309, + "learning_rate": 2.020157577500086e-06, + "loss": 0.379, + "step": 13929 + }, + { + "epoch": 0.8, + "grad_norm": 0.36615003349825465, + "learning_rate": 2.019036184009894e-06, + "loss": 0.2155, + "step": 13930 + }, + { + "epoch": 0.8, + "grad_norm": 0.40065400364964354, + "learning_rate": 2.0179150669001347e-06, + "loss": 0.3056, + "step": 13931 + }, + { + "epoch": 0.8, + "grad_norm": 0.359059617026297, + "learning_rate": 2.0167942262096317e-06, + "loss": 0.3046, + "step": 13932 + }, + { + "epoch": 0.8, + "grad_norm": 0.3025116844483224, + "learning_rate": 2.0156736619772034e-06, + "loss": 0.1751, + "step": 13933 + }, + { + "epoch": 0.8, + "grad_norm": 0.3910522596201756, + "learning_rate": 2.0145533742416536e-06, + "loss": 0.2408, + "step": 13934 + }, + { + "epoch": 0.8, + "grad_norm": 0.8924655555675018, + "learning_rate": 2.013433363041777e-06, + "loss": 0.4662, + "step": 13935 + }, + { + "epoch": 0.8, + "grad_norm": 0.3086428880041467, + "learning_rate": 2.012313628416359e-06, + "loss": 0.2813, + "step": 13936 + }, + { + "epoch": 0.8, + "grad_norm": 0.3740806939413502, + "learning_rate": 2.0111941704041738e-06, + "loss": 0.2224, + "step": 13937 + }, + { + "epoch": 0.8, + "grad_norm": 1.312160171802165, + "learning_rate": 2.0100749890439943e-06, + "loss": 0.5937, + "step": 13938 + }, + { + "epoch": 0.8, + "grad_norm": 0.39192858923914753, + "learning_rate": 2.0089560843745737e-06, + "loss": 0.2937, + "step": 13939 + }, + { + "epoch": 0.8, + "grad_norm": 0.28806615219896364, + "learning_rate": 2.0078374564346605e-06, + "loss": 0.2294, + "step": 13940 + }, + { + "epoch": 0.8, + "grad_norm": 0.28823469322612705, + "learning_rate": 2.0067191052629897e-06, + "loss": 0.1929, + "step": 13941 + }, + { + "epoch": 0.8, + "grad_norm": 1.1471591644672199, + "learning_rate": 2.0056010308982954e-06, + "loss": 0.6341, + "step": 13942 + }, + { + "epoch": 0.8, + "grad_norm": 0.30332531713716976, + "learning_rate": 2.0044832333792942e-06, + "loss": 0.1903, + "step": 13943 + }, + { + "epoch": 0.8, + "grad_norm": 0.37927445804426185, + "learning_rate": 2.003365712744694e-06, + "loss": 0.286, + "step": 13944 + }, + { + "epoch": 0.8, + "grad_norm": 0.8720692601753824, + "learning_rate": 2.0022484690331957e-06, + "loss": 0.3939, + "step": 13945 + }, + { + "epoch": 0.8, + "grad_norm": 0.326152282045429, + "learning_rate": 2.0011315022834887e-06, + "loss": 0.2417, + "step": 13946 + }, + { + "epoch": 0.8, + "grad_norm": 0.25737955441315835, + "learning_rate": 2.000014812534253e-06, + "loss": 0.121, + "step": 13947 + }, + { + "epoch": 0.8, + "grad_norm": 0.3269429544155623, + "learning_rate": 1.9988983998241616e-06, + "loss": 0.2851, + "step": 13948 + }, + { + "epoch": 0.8, + "grad_norm": 0.262006983013765, + "learning_rate": 1.9977822641918722e-06, + "loss": 0.2173, + "step": 13949 + }, + { + "epoch": 0.8, + "grad_norm": 1.0942238387697292, + "learning_rate": 1.996666405676041e-06, + "loss": 0.2634, + "step": 13950 + }, + { + "epoch": 0.8, + "grad_norm": 0.31107848659817255, + "learning_rate": 1.9955508243153075e-06, + "loss": 0.2901, + "step": 13951 + }, + { + "epoch": 0.8, + "grad_norm": 0.36351275425339735, + "learning_rate": 1.9944355201483057e-06, + "loss": 0.2643, + "step": 13952 + }, + { + "epoch": 0.8, + "grad_norm": 0.27098007640215643, + "learning_rate": 1.993320493213654e-06, + "loss": 0.1226, + "step": 13953 + }, + { + "epoch": 0.8, + "grad_norm": 0.3990952691061257, + "learning_rate": 1.992205743549972e-06, + "loss": 0.2757, + "step": 13954 + }, + { + "epoch": 0.8, + "grad_norm": 0.3607559119535387, + "learning_rate": 1.991091271195862e-06, + "loss": 0.2886, + "step": 13955 + }, + { + "epoch": 0.8, + "grad_norm": 0.32360319442150737, + "learning_rate": 1.989977076189916e-06, + "loss": 0.254, + "step": 13956 + }, + { + "epoch": 0.8, + "grad_norm": 0.5376372309933372, + "learning_rate": 1.9888631585707165e-06, + "loss": 0.3056, + "step": 13957 + }, + { + "epoch": 0.8, + "grad_norm": 0.4290201609712683, + "learning_rate": 1.987749518376845e-06, + "loss": 0.2751, + "step": 13958 + }, + { + "epoch": 0.8, + "grad_norm": 0.49871685728533977, + "learning_rate": 1.986636155646862e-06, + "loss": 0.3373, + "step": 13959 + }, + { + "epoch": 0.8, + "grad_norm": 0.27146044669137814, + "learning_rate": 1.985523070419324e-06, + "loss": 0.1775, + "step": 13960 + }, + { + "epoch": 0.8, + "grad_norm": 0.2442071282951535, + "learning_rate": 1.984410262732779e-06, + "loss": 0.1945, + "step": 13961 + }, + { + "epoch": 0.8, + "grad_norm": 0.6881344071763977, + "learning_rate": 1.9832977326257587e-06, + "loss": 0.3537, + "step": 13962 + }, + { + "epoch": 0.8, + "grad_norm": 0.32311184584433045, + "learning_rate": 1.9821854801367947e-06, + "loss": 0.2321, + "step": 13963 + }, + { + "epoch": 0.8, + "grad_norm": 0.29921551388686407, + "learning_rate": 1.981073505304404e-06, + "loss": 0.2423, + "step": 13964 + }, + { + "epoch": 0.8, + "grad_norm": 1.1952034277542511, + "learning_rate": 1.9799618081670925e-06, + "loss": 0.6672, + "step": 13965 + }, + { + "epoch": 0.8, + "grad_norm": 0.16565970362132512, + "learning_rate": 1.978850388763356e-06, + "loss": 0.0987, + "step": 13966 + }, + { + "epoch": 0.8, + "grad_norm": 0.28583421178478075, + "learning_rate": 1.977739247131688e-06, + "loss": 0.2643, + "step": 13967 + }, + { + "epoch": 0.8, + "grad_norm": 0.45246208200159804, + "learning_rate": 1.976628383310566e-06, + "loss": 0.3104, + "step": 13968 + }, + { + "epoch": 0.8, + "grad_norm": 0.7345956913222442, + "learning_rate": 1.9755177973384575e-06, + "loss": 0.2461, + "step": 13969 + }, + { + "epoch": 0.8, + "grad_norm": 0.3468748142933442, + "learning_rate": 1.9744074892538203e-06, + "loss": 0.2622, + "step": 13970 + }, + { + "epoch": 0.8, + "grad_norm": 0.4893550654394881, + "learning_rate": 1.9732974590951083e-06, + "loss": 0.3401, + "step": 13971 + }, + { + "epoch": 0.8, + "grad_norm": 0.313095481399203, + "learning_rate": 1.972187706900761e-06, + "loss": 0.256, + "step": 13972 + }, + { + "epoch": 0.8, + "grad_norm": 0.28891551172507257, + "learning_rate": 1.9710782327092083e-06, + "loss": 0.1845, + "step": 13973 + }, + { + "epoch": 0.8, + "grad_norm": 0.45100541255152954, + "learning_rate": 1.9699690365588674e-06, + "loss": 0.3064, + "step": 13974 + }, + { + "epoch": 0.8, + "grad_norm": 0.329722050488096, + "learning_rate": 1.9688601184881572e-06, + "loss": 0.2946, + "step": 13975 + }, + { + "epoch": 0.8, + "grad_norm": 0.28484365336909273, + "learning_rate": 1.9677514785354747e-06, + "loss": 0.1848, + "step": 13976 + }, + { + "epoch": 0.8, + "grad_norm": 1.0415921858447894, + "learning_rate": 1.966643116739214e-06, + "loss": 0.5841, + "step": 13977 + }, + { + "epoch": 0.8, + "grad_norm": 0.945207146977298, + "learning_rate": 1.9655350331377563e-06, + "loss": 0.4686, + "step": 13978 + }, + { + "epoch": 0.8, + "grad_norm": 0.32279578400528663, + "learning_rate": 1.964427227769475e-06, + "loss": 0.1987, + "step": 13979 + }, + { + "epoch": 0.8, + "grad_norm": 0.3187970274152468, + "learning_rate": 1.9633197006727333e-06, + "loss": 0.2472, + "step": 13980 + }, + { + "epoch": 0.8, + "grad_norm": 0.4706050178267718, + "learning_rate": 1.9622124518858855e-06, + "loss": 0.215, + "step": 13981 + }, + { + "epoch": 0.8, + "grad_norm": 0.32904291628440924, + "learning_rate": 1.9611054814472707e-06, + "loss": 0.2531, + "step": 13982 + }, + { + "epoch": 0.8, + "grad_norm": 0.3076825890196617, + "learning_rate": 1.959998789395231e-06, + "loss": 0.2521, + "step": 13983 + }, + { + "epoch": 0.8, + "grad_norm": 0.808660652729481, + "learning_rate": 1.9588923757680878e-06, + "loss": 0.496, + "step": 13984 + }, + { + "epoch": 0.8, + "grad_norm": 0.3336398442274031, + "learning_rate": 1.9577862406041558e-06, + "loss": 0.2611, + "step": 13985 + }, + { + "epoch": 0.8, + "grad_norm": 0.741654787071884, + "learning_rate": 1.956680383941737e-06, + "loss": 0.1866, + "step": 13986 + }, + { + "epoch": 0.8, + "grad_norm": 0.2751490630762147, + "learning_rate": 1.9555748058191337e-06, + "loss": 0.2453, + "step": 13987 + }, + { + "epoch": 0.8, + "grad_norm": 0.3371685877572128, + "learning_rate": 1.9544695062746286e-06, + "loss": 0.2371, + "step": 13988 + }, + { + "epoch": 0.8, + "grad_norm": 0.8680164276785397, + "learning_rate": 1.9533644853464996e-06, + "loss": 0.4721, + "step": 13989 + }, + { + "epoch": 0.8, + "grad_norm": 0.47686024734591553, + "learning_rate": 1.952259743073012e-06, + "loss": 0.3312, + "step": 13990 + }, + { + "epoch": 0.8, + "grad_norm": 0.39545690708722075, + "learning_rate": 1.9511552794924194e-06, + "loss": 0.2362, + "step": 13991 + }, + { + "epoch": 0.8, + "grad_norm": 0.36269709669257977, + "learning_rate": 1.9500510946429772e-06, + "loss": 0.2391, + "step": 13992 + }, + { + "epoch": 0.8, + "grad_norm": 0.31630341720542904, + "learning_rate": 1.9489471885629196e-06, + "loss": 0.1847, + "step": 13993 + }, + { + "epoch": 0.8, + "grad_norm": 0.40712626311337746, + "learning_rate": 1.9478435612904744e-06, + "loss": 0.2752, + "step": 13994 + }, + { + "epoch": 0.8, + "grad_norm": 0.2942320985895319, + "learning_rate": 1.946740212863858e-06, + "loss": 0.288, + "step": 13995 + }, + { + "epoch": 0.8, + "grad_norm": 0.7675447935881212, + "learning_rate": 1.945637143321284e-06, + "loss": 0.3028, + "step": 13996 + }, + { + "epoch": 0.8, + "grad_norm": 0.400006777628331, + "learning_rate": 1.9445343527009497e-06, + "loss": 0.2919, + "step": 13997 + }, + { + "epoch": 0.8, + "grad_norm": 0.3556966067366299, + "learning_rate": 1.9434318410410435e-06, + "loss": 0.247, + "step": 13998 + }, + { + "epoch": 0.8, + "grad_norm": 0.25185255386753236, + "learning_rate": 1.942329608379745e-06, + "loss": 0.183, + "step": 13999 + }, + { + "epoch": 0.8, + "grad_norm": 0.31969515942588655, + "learning_rate": 1.9412276547552276e-06, + "loss": 0.2485, + "step": 14000 + }, + { + "epoch": 0.8, + "grad_norm": 0.9991830838318048, + "learning_rate": 1.9401259802056495e-06, + "loss": 0.5334, + "step": 14001 + }, + { + "epoch": 0.8, + "grad_norm": 0.42432293133392635, + "learning_rate": 1.9390245847691625e-06, + "loss": 0.2674, + "step": 14002 + }, + { + "epoch": 0.8, + "grad_norm": 0.257524155935746, + "learning_rate": 1.9379234684839075e-06, + "loss": 0.244, + "step": 14003 + }, + { + "epoch": 0.8, + "grad_norm": 1.2512902112125146, + "learning_rate": 1.9368226313880134e-06, + "loss": 0.5929, + "step": 14004 + }, + { + "epoch": 0.8, + "grad_norm": 0.2558079446835865, + "learning_rate": 1.935722073519608e-06, + "loss": 0.0865, + "step": 14005 + }, + { + "epoch": 0.8, + "grad_norm": 0.30514259612396377, + "learning_rate": 1.9346217949168e-06, + "loss": 0.2563, + "step": 14006 + }, + { + "epoch": 0.8, + "grad_norm": 0.32676000041131353, + "learning_rate": 1.933521795617692e-06, + "loss": 0.2903, + "step": 14007 + }, + { + "epoch": 0.8, + "grad_norm": 0.6566724718751233, + "learning_rate": 1.932422075660376e-06, + "loss": 0.3683, + "step": 14008 + }, + { + "epoch": 0.8, + "grad_norm": 0.31808490481525514, + "learning_rate": 1.931322635082938e-06, + "loss": 0.175, + "step": 14009 + }, + { + "epoch": 0.8, + "grad_norm": 0.5415586763759216, + "learning_rate": 1.9302234739234507e-06, + "loss": 0.3217, + "step": 14010 + }, + { + "epoch": 0.8, + "grad_norm": 0.2155972087042718, + "learning_rate": 1.9291245922199776e-06, + "loss": 0.2108, + "step": 14011 + }, + { + "epoch": 0.81, + "grad_norm": 0.40196080536904666, + "learning_rate": 1.9280259900105723e-06, + "loss": 0.1741, + "step": 14012 + }, + { + "epoch": 0.81, + "grad_norm": 0.5085855651773016, + "learning_rate": 1.9269276673332806e-06, + "loss": 0.3481, + "step": 14013 + }, + { + "epoch": 0.81, + "grad_norm": 0.6857508871116539, + "learning_rate": 1.9258296242261355e-06, + "loss": 0.4188, + "step": 14014 + }, + { + "epoch": 0.81, + "grad_norm": 0.24640439130406608, + "learning_rate": 1.9247318607271637e-06, + "loss": 0.2135, + "step": 14015 + }, + { + "epoch": 0.81, + "grad_norm": 0.5920314104837552, + "learning_rate": 1.923634376874378e-06, + "loss": 0.3719, + "step": 14016 + }, + { + "epoch": 0.81, + "grad_norm": 0.25891151164406034, + "learning_rate": 1.9225371727057897e-06, + "loss": 0.1558, + "step": 14017 + }, + { + "epoch": 0.81, + "grad_norm": 0.3841785028881913, + "learning_rate": 1.921440248259391e-06, + "loss": 0.2032, + "step": 14018 + }, + { + "epoch": 0.81, + "grad_norm": 0.2969167120395358, + "learning_rate": 1.9203436035731694e-06, + "loss": 0.2832, + "step": 14019 + }, + { + "epoch": 0.81, + "grad_norm": 0.8309038714782914, + "learning_rate": 1.919247238685098e-06, + "loss": 0.5118, + "step": 14020 + }, + { + "epoch": 0.81, + "grad_norm": 0.7674720192887823, + "learning_rate": 1.918151153633151e-06, + "loss": 0.2878, + "step": 14021 + }, + { + "epoch": 0.81, + "grad_norm": 0.564361057127029, + "learning_rate": 1.917055348455281e-06, + "loss": 0.2283, + "step": 14022 + }, + { + "epoch": 0.81, + "grad_norm": 0.2814577681747703, + "learning_rate": 1.9159598231894385e-06, + "loss": 0.2676, + "step": 14023 + }, + { + "epoch": 0.81, + "grad_norm": 0.2937360059194529, + "learning_rate": 1.9148645778735555e-06, + "loss": 0.2098, + "step": 14024 + }, + { + "epoch": 0.81, + "grad_norm": 0.5653575255098695, + "learning_rate": 1.9137696125455672e-06, + "loss": 0.2534, + "step": 14025 + }, + { + "epoch": 0.81, + "grad_norm": 0.5171731933248757, + "learning_rate": 1.91267492724339e-06, + "loss": 0.3388, + "step": 14026 + }, + { + "epoch": 0.81, + "grad_norm": 0.3752775546359505, + "learning_rate": 1.911580522004931e-06, + "loss": 0.2683, + "step": 14027 + }, + { + "epoch": 0.81, + "grad_norm": 0.368645831443498, + "learning_rate": 1.910486396868092e-06, + "loss": 0.2158, + "step": 14028 + }, + { + "epoch": 0.81, + "grad_norm": 0.30316861367267955, + "learning_rate": 1.909392551870759e-06, + "loss": 0.1882, + "step": 14029 + }, + { + "epoch": 0.81, + "grad_norm": 0.6934510092482077, + "learning_rate": 1.908298987050815e-06, + "loss": 0.3114, + "step": 14030 + }, + { + "epoch": 0.81, + "grad_norm": 0.24347718869590526, + "learning_rate": 1.907205702446131e-06, + "loss": 0.2422, + "step": 14031 + }, + { + "epoch": 0.81, + "grad_norm": 1.259711777802538, + "learning_rate": 1.9061126980945644e-06, + "loss": 0.657, + "step": 14032 + }, + { + "epoch": 0.81, + "grad_norm": 0.572298799907, + "learning_rate": 1.9050199740339648e-06, + "loss": 0.3185, + "step": 14033 + }, + { + "epoch": 0.81, + "grad_norm": 0.42220044917716687, + "learning_rate": 1.9039275303021775e-06, + "loss": 0.2817, + "step": 14034 + }, + { + "epoch": 0.81, + "grad_norm": 0.34749331845809894, + "learning_rate": 1.9028353669370315e-06, + "loss": 0.2275, + "step": 14035 + }, + { + "epoch": 0.81, + "grad_norm": 0.49512336442006893, + "learning_rate": 1.9017434839763493e-06, + "loss": 0.3007, + "step": 14036 + }, + { + "epoch": 0.81, + "grad_norm": 0.2254669338598251, + "learning_rate": 1.900651881457939e-06, + "loss": 0.1896, + "step": 14037 + }, + { + "epoch": 0.81, + "grad_norm": 0.4076412787846548, + "learning_rate": 1.8995605594196086e-06, + "loss": 0.2732, + "step": 14038 + }, + { + "epoch": 0.81, + "grad_norm": 0.32296030460251207, + "learning_rate": 1.8984695178991475e-06, + "loss": 0.2963, + "step": 14039 + }, + { + "epoch": 0.81, + "grad_norm": 0.5481564846950949, + "learning_rate": 1.8973787569343394e-06, + "loss": 0.3206, + "step": 14040 + }, + { + "epoch": 0.81, + "grad_norm": 0.6209723598460225, + "learning_rate": 1.8962882765629552e-06, + "loss": 0.2801, + "step": 14041 + }, + { + "epoch": 0.81, + "grad_norm": 0.3055173237798795, + "learning_rate": 1.8951980768227586e-06, + "loss": 0.2729, + "step": 14042 + }, + { + "epoch": 0.81, + "grad_norm": 0.35224208256032374, + "learning_rate": 1.8941081577515053e-06, + "loss": 0.2993, + "step": 14043 + }, + { + "epoch": 0.81, + "grad_norm": 0.28113135249311766, + "learning_rate": 1.8930185193869376e-06, + "loss": 0.1722, + "step": 14044 + }, + { + "epoch": 0.81, + "grad_norm": 0.29298152458518495, + "learning_rate": 1.8919291617667912e-06, + "loss": 0.1652, + "step": 14045 + }, + { + "epoch": 0.81, + "grad_norm": 0.45232617515848045, + "learning_rate": 1.8908400849287889e-06, + "loss": 0.2993, + "step": 14046 + }, + { + "epoch": 0.81, + "grad_norm": 0.37147528154819126, + "learning_rate": 1.8897512889106451e-06, + "loss": 0.3054, + "step": 14047 + }, + { + "epoch": 0.81, + "grad_norm": 0.5620327131313082, + "learning_rate": 1.8886627737500663e-06, + "loss": 0.2291, + "step": 14048 + }, + { + "epoch": 0.81, + "grad_norm": 0.26947274618159855, + "learning_rate": 1.8875745394847434e-06, + "loss": 0.2188, + "step": 14049 + }, + { + "epoch": 0.81, + "grad_norm": 0.3342387244403811, + "learning_rate": 1.8864865861523684e-06, + "loss": 0.3209, + "step": 14050 + }, + { + "epoch": 0.81, + "grad_norm": 0.16985618467855765, + "learning_rate": 1.8853989137906137e-06, + "loss": 0.0886, + "step": 14051 + }, + { + "epoch": 0.81, + "grad_norm": 0.38349041687568947, + "learning_rate": 1.8843115224371467e-06, + "loss": 0.2769, + "step": 14052 + }, + { + "epoch": 0.81, + "grad_norm": 0.6251738290863288, + "learning_rate": 1.8832244121296217e-06, + "loss": 0.3818, + "step": 14053 + }, + { + "epoch": 0.81, + "grad_norm": 0.3763987776386038, + "learning_rate": 1.8821375829056842e-06, + "loss": 0.2174, + "step": 14054 + }, + { + "epoch": 0.81, + "grad_norm": 0.31215780044633634, + "learning_rate": 1.8810510348029753e-06, + "loss": 0.2912, + "step": 14055 + }, + { + "epoch": 0.81, + "grad_norm": 1.225420066022846, + "learning_rate": 1.8799647678591203e-06, + "loss": 0.7848, + "step": 14056 + }, + { + "epoch": 0.81, + "grad_norm": 0.2622067946912736, + "learning_rate": 1.878878782111736e-06, + "loss": 0.1747, + "step": 14057 + }, + { + "epoch": 0.81, + "grad_norm": 0.35062197492639324, + "learning_rate": 1.8777930775984277e-06, + "loss": 0.222, + "step": 14058 + }, + { + "epoch": 0.81, + "grad_norm": 0.36092983810301527, + "learning_rate": 1.8767076543567986e-06, + "loss": 0.305, + "step": 14059 + }, + { + "epoch": 0.81, + "grad_norm": 0.5759708595809394, + "learning_rate": 1.8756225124244332e-06, + "loss": 0.3023, + "step": 14060 + }, + { + "epoch": 0.81, + "grad_norm": 0.3373531812343405, + "learning_rate": 1.8745376518389113e-06, + "loss": 0.2121, + "step": 14061 + }, + { + "epoch": 0.81, + "grad_norm": 0.3674552882376826, + "learning_rate": 1.8734530726377997e-06, + "loss": 0.3401, + "step": 14062 + }, + { + "epoch": 0.81, + "grad_norm": 0.20581830272049467, + "learning_rate": 1.8723687748586605e-06, + "loss": 0.1439, + "step": 14063 + }, + { + "epoch": 0.81, + "grad_norm": 0.3971041780692171, + "learning_rate": 1.8712847585390403e-06, + "loss": 0.2296, + "step": 14064 + }, + { + "epoch": 0.81, + "grad_norm": 0.7877507215245848, + "learning_rate": 1.8702010237164803e-06, + "loss": 0.3901, + "step": 14065 + }, + { + "epoch": 0.81, + "grad_norm": 0.3564769039000346, + "learning_rate": 1.8691175704285091e-06, + "loss": 0.3047, + "step": 14066 + }, + { + "epoch": 0.81, + "grad_norm": 0.32404927429594854, + "learning_rate": 1.8680343987126448e-06, + "loss": 0.1954, + "step": 14067 + }, + { + "epoch": 0.81, + "grad_norm": 1.125143640483104, + "learning_rate": 1.8669515086064006e-06, + "loss": 0.7126, + "step": 14068 + }, + { + "epoch": 0.81, + "grad_norm": 0.4371310503844042, + "learning_rate": 1.8658689001472775e-06, + "loss": 0.2545, + "step": 14069 + }, + { + "epoch": 0.81, + "grad_norm": 0.26684346084216576, + "learning_rate": 1.8647865733727644e-06, + "loss": 0.2519, + "step": 14070 + }, + { + "epoch": 0.81, + "grad_norm": 0.30304389288387373, + "learning_rate": 1.8637045283203391e-06, + "loss": 0.1918, + "step": 14071 + }, + { + "epoch": 0.81, + "grad_norm": 0.9331663827577262, + "learning_rate": 1.8626227650274787e-06, + "loss": 0.4022, + "step": 14072 + }, + { + "epoch": 0.81, + "grad_norm": 0.3671040035406563, + "learning_rate": 1.8615412835316426e-06, + "loss": 0.2767, + "step": 14073 + }, + { + "epoch": 0.81, + "grad_norm": 0.33313642195654125, + "learning_rate": 1.8604600838702814e-06, + "loss": 0.2546, + "step": 14074 + }, + { + "epoch": 0.81, + "grad_norm": 0.6134663014323616, + "learning_rate": 1.8593791660808357e-06, + "loss": 0.3682, + "step": 14075 + }, + { + "epoch": 0.81, + "grad_norm": 0.35912139079324334, + "learning_rate": 1.8582985302007405e-06, + "loss": 0.2771, + "step": 14076 + }, + { + "epoch": 0.81, + "grad_norm": 0.3628918133959204, + "learning_rate": 1.8572181762674192e-06, + "loss": 0.1353, + "step": 14077 + }, + { + "epoch": 0.81, + "grad_norm": 0.31315581719439217, + "learning_rate": 1.8561381043182803e-06, + "loss": 0.2639, + "step": 14078 + }, + { + "epoch": 0.81, + "grad_norm": 0.3602637118248707, + "learning_rate": 1.8550583143907274e-06, + "loss": 0.2798, + "step": 14079 + }, + { + "epoch": 0.81, + "grad_norm": 0.7198841723664006, + "learning_rate": 1.8539788065221598e-06, + "loss": 0.351, + "step": 14080 + }, + { + "epoch": 0.81, + "grad_norm": 0.6480899378091401, + "learning_rate": 1.8528995807499528e-06, + "loss": 0.3457, + "step": 14081 + }, + { + "epoch": 0.81, + "grad_norm": 0.2939543727068227, + "learning_rate": 1.8518206371114833e-06, + "loss": 0.264, + "step": 14082 + }, + { + "epoch": 0.81, + "grad_norm": 0.2582146968956923, + "learning_rate": 1.8507419756441114e-06, + "loss": 0.193, + "step": 14083 + }, + { + "epoch": 0.81, + "grad_norm": 1.4650946807988512, + "learning_rate": 1.8496635963851973e-06, + "loss": 0.1613, + "step": 14084 + }, + { + "epoch": 0.81, + "grad_norm": 0.30943678757627774, + "learning_rate": 1.8485854993720831e-06, + "loss": 0.2519, + "step": 14085 + }, + { + "epoch": 0.81, + "grad_norm": 0.33098435069101717, + "learning_rate": 1.8475076846421025e-06, + "loss": 0.303, + "step": 14086 + }, + { + "epoch": 0.81, + "grad_norm": 0.8136153140125502, + "learning_rate": 1.8464301522325767e-06, + "loss": 0.3076, + "step": 14087 + }, + { + "epoch": 0.81, + "grad_norm": 0.3524686554462998, + "learning_rate": 1.8453529021808282e-06, + "loss": 0.2519, + "step": 14088 + }, + { + "epoch": 0.81, + "grad_norm": 0.4539966783959743, + "learning_rate": 1.8442759345241567e-06, + "loss": 0.2466, + "step": 14089 + }, + { + "epoch": 0.81, + "grad_norm": 0.2479546022298384, + "learning_rate": 1.8431992492998595e-06, + "loss": 0.1885, + "step": 14090 + }, + { + "epoch": 0.81, + "grad_norm": 0.2979881583987387, + "learning_rate": 1.8421228465452213e-06, + "loss": 0.2557, + "step": 14091 + }, + { + "epoch": 0.81, + "grad_norm": 0.8722453179515083, + "learning_rate": 1.8410467262975152e-06, + "loss": 0.5817, + "step": 14092 + }, + { + "epoch": 0.81, + "grad_norm": 0.5745595604603996, + "learning_rate": 1.8399708885940136e-06, + "loss": 0.2642, + "step": 14093 + }, + { + "epoch": 0.81, + "grad_norm": 0.28733919145554143, + "learning_rate": 1.8388953334719684e-06, + "loss": 0.2482, + "step": 14094 + }, + { + "epoch": 0.81, + "grad_norm": 0.34111214746268476, + "learning_rate": 1.837820060968627e-06, + "loss": 0.2435, + "step": 14095 + }, + { + "epoch": 0.81, + "grad_norm": 0.5477066304455874, + "learning_rate": 1.8367450711212232e-06, + "loss": 0.2656, + "step": 14096 + }, + { + "epoch": 0.81, + "grad_norm": 0.2817456644619849, + "learning_rate": 1.8356703639669904e-06, + "loss": 0.1833, + "step": 14097 + }, + { + "epoch": 0.81, + "grad_norm": 0.34795741382488465, + "learning_rate": 1.8345959395431401e-06, + "loss": 0.2963, + "step": 14098 + }, + { + "epoch": 0.81, + "grad_norm": 0.6400339452738504, + "learning_rate": 1.8335217978868825e-06, + "loss": 0.4564, + "step": 14099 + }, + { + "epoch": 0.81, + "grad_norm": 0.2979103780116249, + "learning_rate": 1.832447939035411e-06, + "loss": 0.1861, + "step": 14100 + }, + { + "epoch": 0.81, + "grad_norm": 0.375418491704937, + "learning_rate": 1.8313743630259184e-06, + "loss": 0.1906, + "step": 14101 + }, + { + "epoch": 0.81, + "grad_norm": 0.3613002064565976, + "learning_rate": 1.8303010698955803e-06, + "loss": 0.2885, + "step": 14102 + }, + { + "epoch": 0.81, + "grad_norm": 0.331150447620705, + "learning_rate": 1.8292280596815649e-06, + "loss": 0.1993, + "step": 14103 + }, + { + "epoch": 0.81, + "grad_norm": 0.5396453246374567, + "learning_rate": 1.8281553324210278e-06, + "loss": 0.3703, + "step": 14104 + }, + { + "epoch": 0.81, + "grad_norm": 1.2441896449953618, + "learning_rate": 1.8270828881511238e-06, + "loss": 0.6757, + "step": 14105 + }, + { + "epoch": 0.81, + "grad_norm": 0.22987355409113447, + "learning_rate": 1.8260107269089865e-06, + "loss": 0.206, + "step": 14106 + }, + { + "epoch": 0.81, + "grad_norm": 1.4125169977730276, + "learning_rate": 1.8249388487317465e-06, + "loss": 0.6422, + "step": 14107 + }, + { + "epoch": 0.81, + "grad_norm": 0.3321933632573765, + "learning_rate": 1.823867253656524e-06, + "loss": 0.1871, + "step": 14108 + }, + { + "epoch": 0.81, + "grad_norm": 0.33090979186783415, + "learning_rate": 1.8227959417204222e-06, + "loss": 0.2569, + "step": 14109 + }, + { + "epoch": 0.81, + "grad_norm": 0.3767301347240297, + "learning_rate": 1.8217249129605496e-06, + "loss": 0.2513, + "step": 14110 + }, + { + "epoch": 0.81, + "grad_norm": 0.9509296598908838, + "learning_rate": 1.820654167413991e-06, + "loss": 0.5372, + "step": 14111 + }, + { + "epoch": 0.81, + "grad_norm": 0.3118256550146787, + "learning_rate": 1.8195837051178267e-06, + "loss": 0.2466, + "step": 14112 + }, + { + "epoch": 0.81, + "grad_norm": 0.21152041292581364, + "learning_rate": 1.8185135261091247e-06, + "loss": 0.0666, + "step": 14113 + }, + { + "epoch": 0.81, + "grad_norm": 0.2945684219244831, + "learning_rate": 1.817443630424952e-06, + "loss": 0.2494, + "step": 14114 + }, + { + "epoch": 0.81, + "grad_norm": 0.3782111624907756, + "learning_rate": 1.8163740181023526e-06, + "loss": 0.2492, + "step": 14115 + }, + { + "epoch": 0.81, + "grad_norm": 0.4657972759725943, + "learning_rate": 1.8153046891783654e-06, + "loss": 0.2856, + "step": 14116 + }, + { + "epoch": 0.81, + "grad_norm": 0.340304507867154, + "learning_rate": 1.8142356436900288e-06, + "loss": 0.3124, + "step": 14117 + }, + { + "epoch": 0.81, + "grad_norm": 0.3706279739554896, + "learning_rate": 1.8131668816743586e-06, + "loss": 0.2635, + "step": 14118 + }, + { + "epoch": 0.81, + "grad_norm": 0.6320484353599375, + "learning_rate": 1.8120984031683686e-06, + "loss": 0.2199, + "step": 14119 + }, + { + "epoch": 0.81, + "grad_norm": 0.3099710789466092, + "learning_rate": 1.811030208209058e-06, + "loss": 0.1903, + "step": 14120 + }, + { + "epoch": 0.81, + "grad_norm": 0.4168781646528722, + "learning_rate": 1.8099622968334163e-06, + "loss": 0.2884, + "step": 14121 + }, + { + "epoch": 0.81, + "grad_norm": 0.28981006334400455, + "learning_rate": 1.8088946690784314e-06, + "loss": 0.2911, + "step": 14122 + }, + { + "epoch": 0.81, + "grad_norm": 1.0230460545064057, + "learning_rate": 1.8078273249810718e-06, + "loss": 0.4709, + "step": 14123 + }, + { + "epoch": 0.81, + "grad_norm": 0.36665359776477047, + "learning_rate": 1.806760264578299e-06, + "loss": 0.2622, + "step": 14124 + }, + { + "epoch": 0.81, + "grad_norm": 1.4271593257453827, + "learning_rate": 1.8056934879070642e-06, + "loss": 0.5857, + "step": 14125 + }, + { + "epoch": 0.81, + "grad_norm": 0.2838783227056397, + "learning_rate": 1.8046269950043138e-06, + "loss": 0.208, + "step": 14126 + }, + { + "epoch": 0.81, + "grad_norm": 0.3194121737248507, + "learning_rate": 1.803560785906977e-06, + "loss": 0.2564, + "step": 14127 + }, + { + "epoch": 0.81, + "grad_norm": 0.4439798130696629, + "learning_rate": 1.8024948606519787e-06, + "loss": 0.2703, + "step": 14128 + }, + { + "epoch": 0.81, + "grad_norm": 0.301214908000095, + "learning_rate": 1.8014292192762285e-06, + "loss": 0.2513, + "step": 14129 + }, + { + "epoch": 0.81, + "grad_norm": 0.3262329360305483, + "learning_rate": 1.8003638618166342e-06, + "loss": 0.257, + "step": 14130 + }, + { + "epoch": 0.81, + "grad_norm": 1.184176185516564, + "learning_rate": 1.7992987883100877e-06, + "loss": 0.4787, + "step": 14131 + }, + { + "epoch": 0.81, + "grad_norm": 0.6445822576434146, + "learning_rate": 1.7982339987934705e-06, + "loss": 0.2847, + "step": 14132 + }, + { + "epoch": 0.81, + "grad_norm": 0.40948848681739713, + "learning_rate": 1.7971694933036576e-06, + "loss": 0.2681, + "step": 14133 + }, + { + "epoch": 0.81, + "grad_norm": 0.23710816941660431, + "learning_rate": 1.7961052718775096e-06, + "loss": 0.2382, + "step": 14134 + }, + { + "epoch": 0.81, + "grad_norm": 0.47714786473531884, + "learning_rate": 1.7950413345518858e-06, + "loss": 0.2881, + "step": 14135 + }, + { + "epoch": 0.81, + "grad_norm": 0.4062079094273239, + "learning_rate": 1.7939776813636278e-06, + "loss": 0.179, + "step": 14136 + }, + { + "epoch": 0.81, + "grad_norm": 0.3923200485078189, + "learning_rate": 1.7929143123495695e-06, + "loss": 0.2899, + "step": 14137 + }, + { + "epoch": 0.81, + "grad_norm": 0.4917783194741124, + "learning_rate": 1.7918512275465338e-06, + "loss": 0.3125, + "step": 14138 + }, + { + "epoch": 0.81, + "grad_norm": 0.3656188792929695, + "learning_rate": 1.790788426991339e-06, + "loss": 0.1853, + "step": 14139 + }, + { + "epoch": 0.81, + "grad_norm": 0.5413502468999466, + "learning_rate": 1.7897259107207888e-06, + "loss": 0.4162, + "step": 14140 + }, + { + "epoch": 0.81, + "grad_norm": 0.32025923404560214, + "learning_rate": 1.7886636787716761e-06, + "loss": 0.2454, + "step": 14141 + }, + { + "epoch": 0.81, + "grad_norm": 0.2429282363963751, + "learning_rate": 1.787601731180786e-06, + "loss": 0.173, + "step": 14142 + }, + { + "epoch": 0.81, + "grad_norm": 0.5438855371114105, + "learning_rate": 1.7865400679848953e-06, + "loss": 0.3524, + "step": 14143 + }, + { + "epoch": 0.81, + "grad_norm": 0.6026762575249064, + "learning_rate": 1.7854786892207709e-06, + "loss": 0.4291, + "step": 14144 + }, + { + "epoch": 0.81, + "grad_norm": 0.418008323429287, + "learning_rate": 1.7844175949251653e-06, + "loss": 0.2601, + "step": 14145 + }, + { + "epoch": 0.81, + "grad_norm": 0.2651924993585563, + "learning_rate": 1.7833567851348254e-06, + "loss": 0.232, + "step": 14146 + }, + { + "epoch": 0.81, + "grad_norm": 0.2537043627469472, + "learning_rate": 1.7822962598864868e-06, + "loss": 0.1781, + "step": 14147 + }, + { + "epoch": 0.81, + "grad_norm": 0.37665750496002826, + "learning_rate": 1.7812360192168742e-06, + "loss": 0.2524, + "step": 14148 + }, + { + "epoch": 0.81, + "grad_norm": 0.353511059581225, + "learning_rate": 1.7801760631627064e-06, + "loss": 0.2421, + "step": 14149 + }, + { + "epoch": 0.81, + "grad_norm": 0.7572573431074694, + "learning_rate": 1.7791163917606846e-06, + "loss": 0.3338, + "step": 14150 + }, + { + "epoch": 0.81, + "grad_norm": 0.8686696236646662, + "learning_rate": 1.7780570050475122e-06, + "loss": 0.3671, + "step": 14151 + }, + { + "epoch": 0.81, + "grad_norm": 0.3290711277167447, + "learning_rate": 1.7769979030598706e-06, + "loss": 0.231, + "step": 14152 + }, + { + "epoch": 0.81, + "grad_norm": 0.26873437490942453, + "learning_rate": 1.7759390858344395e-06, + "loss": 0.2475, + "step": 14153 + }, + { + "epoch": 0.81, + "grad_norm": 0.3209443331607646, + "learning_rate": 1.7748805534078805e-06, + "loss": 0.1887, + "step": 14154 + }, + { + "epoch": 0.81, + "grad_norm": 0.423738269074664, + "learning_rate": 1.773822305816857e-06, + "loss": 0.2495, + "step": 14155 + }, + { + "epoch": 0.81, + "grad_norm": 0.8545198353982074, + "learning_rate": 1.7727643430980135e-06, + "loss": 0.411, + "step": 14156 + }, + { + "epoch": 0.81, + "grad_norm": 0.37808128113435296, + "learning_rate": 1.7717066652879877e-06, + "loss": 0.3064, + "step": 14157 + }, + { + "epoch": 0.81, + "grad_norm": 0.2977203707383954, + "learning_rate": 1.770649272423406e-06, + "loss": 0.276, + "step": 14158 + }, + { + "epoch": 0.81, + "grad_norm": 0.2657906575491931, + "learning_rate": 1.7695921645408832e-06, + "loss": 0.1227, + "step": 14159 + }, + { + "epoch": 0.81, + "grad_norm": 0.33109296938461874, + "learning_rate": 1.7685353416770322e-06, + "loss": 0.2161, + "step": 14160 + }, + { + "epoch": 0.81, + "grad_norm": 0.24742271713944344, + "learning_rate": 1.7674788038684488e-06, + "loss": 0.258, + "step": 14161 + }, + { + "epoch": 0.81, + "grad_norm": 0.9296375061584038, + "learning_rate": 1.7664225511517196e-06, + "loss": 0.2892, + "step": 14162 + }, + { + "epoch": 0.81, + "grad_norm": 0.427928964518346, + "learning_rate": 1.7653665835634214e-06, + "loss": 0.2758, + "step": 14163 + }, + { + "epoch": 0.81, + "grad_norm": 0.5014102721249964, + "learning_rate": 1.7643109011401272e-06, + "loss": 0.3853, + "step": 14164 + }, + { + "epoch": 0.81, + "grad_norm": 0.36698527266326064, + "learning_rate": 1.7632555039183918e-06, + "loss": 0.2581, + "step": 14165 + }, + { + "epoch": 0.81, + "grad_norm": 0.40372847248005095, + "learning_rate": 1.762200391934764e-06, + "loss": 0.2874, + "step": 14166 + }, + { + "epoch": 0.81, + "grad_norm": 0.5512841267027948, + "learning_rate": 1.7611455652257802e-06, + "loss": 0.3146, + "step": 14167 + }, + { + "epoch": 0.81, + "grad_norm": 0.2772187100793543, + "learning_rate": 1.7600910238279745e-06, + "loss": 0.096, + "step": 14168 + }, + { + "epoch": 0.81, + "grad_norm": 0.32362380832747006, + "learning_rate": 1.7590367677778607e-06, + "loss": 0.2657, + "step": 14169 + }, + { + "epoch": 0.81, + "grad_norm": 0.3396696390434374, + "learning_rate": 1.7579827971119501e-06, + "loss": 0.2754, + "step": 14170 + }, + { + "epoch": 0.81, + "grad_norm": 0.9538742821374108, + "learning_rate": 1.756929111866741e-06, + "loss": 0.4811, + "step": 14171 + }, + { + "epoch": 0.81, + "grad_norm": 0.6335048928071565, + "learning_rate": 1.7558757120787196e-06, + "loss": 0.2464, + "step": 14172 + }, + { + "epoch": 0.81, + "grad_norm": 0.29133111716058585, + "learning_rate": 1.7548225977843703e-06, + "loss": 0.2619, + "step": 14173 + }, + { + "epoch": 0.81, + "grad_norm": 0.23845768488865898, + "learning_rate": 1.7537697690201604e-06, + "loss": 0.1823, + "step": 14174 + }, + { + "epoch": 0.81, + "grad_norm": 0.5880555726294913, + "learning_rate": 1.7527172258225479e-06, + "loss": 0.143, + "step": 14175 + }, + { + "epoch": 0.81, + "grad_norm": 0.3607590540833608, + "learning_rate": 1.7516649682279807e-06, + "loss": 0.2992, + "step": 14176 + }, + { + "epoch": 0.81, + "grad_norm": 0.3758488924017635, + "learning_rate": 1.7506129962729046e-06, + "loss": 0.2974, + "step": 14177 + }, + { + "epoch": 0.81, + "grad_norm": 0.6228724462522425, + "learning_rate": 1.7495613099937447e-06, + "loss": 0.2398, + "step": 14178 + }, + { + "epoch": 0.81, + "grad_norm": 0.28615182796675526, + "learning_rate": 1.748509909426922e-06, + "loss": 0.2267, + "step": 14179 + }, + { + "epoch": 0.81, + "grad_norm": 0.5160793879630342, + "learning_rate": 1.747458794608844e-06, + "loss": 0.2544, + "step": 14180 + }, + { + "epoch": 0.81, + "grad_norm": 0.28378991284710486, + "learning_rate": 1.7464079655759181e-06, + "loss": 0.2048, + "step": 14181 + }, + { + "epoch": 0.81, + "grad_norm": 0.39445167751605087, + "learning_rate": 1.7453574223645265e-06, + "loss": 0.3056, + "step": 14182 + }, + { + "epoch": 0.81, + "grad_norm": 0.7048727007429841, + "learning_rate": 1.7443071650110532e-06, + "loss": 0.4061, + "step": 14183 + }, + { + "epoch": 0.81, + "grad_norm": 0.511720675101496, + "learning_rate": 1.743257193551865e-06, + "loss": 0.2747, + "step": 14184 + }, + { + "epoch": 0.81, + "grad_norm": 0.27744333813057853, + "learning_rate": 1.742207508023327e-06, + "loss": 0.2208, + "step": 14185 + }, + { + "epoch": 0.82, + "grad_norm": 0.2413983160625698, + "learning_rate": 1.741158108461788e-06, + "loss": 0.1841, + "step": 14186 + }, + { + "epoch": 0.82, + "grad_norm": 0.9634224363895272, + "learning_rate": 1.7401089949035888e-06, + "loss": 0.4651, + "step": 14187 + }, + { + "epoch": 0.82, + "grad_norm": 0.3454464012521359, + "learning_rate": 1.7390601673850582e-06, + "loss": 0.2216, + "step": 14188 + }, + { + "epoch": 0.82, + "grad_norm": 0.36039692383388244, + "learning_rate": 1.7380116259425205e-06, + "loss": 0.2878, + "step": 14189 + }, + { + "epoch": 0.82, + "grad_norm": 1.0262897010065517, + "learning_rate": 1.7369633706122845e-06, + "loss": 0.457, + "step": 14190 + }, + { + "epoch": 0.82, + "grad_norm": 0.32525366190097454, + "learning_rate": 1.7359154014306523e-06, + "loss": 0.1925, + "step": 14191 + }, + { + "epoch": 0.82, + "grad_norm": 0.25124789227221145, + "learning_rate": 1.7348677184339114e-06, + "loss": 0.173, + "step": 14192 + }, + { + "epoch": 0.82, + "grad_norm": 0.31799897298231583, + "learning_rate": 1.7338203216583493e-06, + "loss": 0.2934, + "step": 14193 + }, + { + "epoch": 0.82, + "grad_norm": 0.3329687176740427, + "learning_rate": 1.732773211140233e-06, + "loss": 0.2172, + "step": 14194 + }, + { + "epoch": 0.82, + "grad_norm": 0.6666999484070938, + "learning_rate": 1.7317263869158252e-06, + "loss": 0.3835, + "step": 14195 + }, + { + "epoch": 0.82, + "grad_norm": 0.840595293551172, + "learning_rate": 1.7306798490213783e-06, + "loss": 0.4352, + "step": 14196 + }, + { + "epoch": 0.82, + "grad_norm": 0.24835261216243176, + "learning_rate": 1.729633597493129e-06, + "loss": 0.2628, + "step": 14197 + }, + { + "epoch": 0.82, + "grad_norm": 0.24067764811215303, + "learning_rate": 1.7285876323673144e-06, + "loss": 0.0807, + "step": 14198 + }, + { + "epoch": 0.82, + "grad_norm": 0.6601672537032992, + "learning_rate": 1.7275419536801552e-06, + "loss": 0.3598, + "step": 14199 + }, + { + "epoch": 0.82, + "grad_norm": 0.366001352720004, + "learning_rate": 1.7264965614678631e-06, + "loss": 0.2781, + "step": 14200 + }, + { + "epoch": 0.82, + "grad_norm": 0.3601613755013287, + "learning_rate": 1.7254514557666358e-06, + "loss": 0.2612, + "step": 14201 + }, + { + "epoch": 0.82, + "grad_norm": 0.847066894311364, + "learning_rate": 1.7244066366126722e-06, + "loss": 0.3791, + "step": 14202 + }, + { + "epoch": 0.82, + "grad_norm": 0.35177083832531264, + "learning_rate": 1.72336210404215e-06, + "loss": 0.2582, + "step": 14203 + }, + { + "epoch": 0.82, + "grad_norm": 0.23594743243529212, + "learning_rate": 1.7223178580912426e-06, + "loss": 0.147, + "step": 14204 + }, + { + "epoch": 0.82, + "grad_norm": 0.41703845501159037, + "learning_rate": 1.7212738987961086e-06, + "loss": 0.3409, + "step": 14205 + }, + { + "epoch": 0.82, + "grad_norm": 0.3426632852760692, + "learning_rate": 1.7202302261929071e-06, + "loss": 0.283, + "step": 14206 + }, + { + "epoch": 0.82, + "grad_norm": 0.7113318712831286, + "learning_rate": 1.7191868403177757e-06, + "loss": 0.3203, + "step": 14207 + }, + { + "epoch": 0.82, + "grad_norm": 0.5202922764450023, + "learning_rate": 1.7181437412068491e-06, + "loss": 0.3489, + "step": 14208 + }, + { + "epoch": 0.82, + "grad_norm": 0.29408898192812816, + "learning_rate": 1.717100928896246e-06, + "loss": 0.2608, + "step": 14209 + }, + { + "epoch": 0.82, + "grad_norm": 0.32304266882818133, + "learning_rate": 1.7160584034220828e-06, + "loss": 0.1571, + "step": 14210 + }, + { + "epoch": 0.82, + "grad_norm": 0.573358791143515, + "learning_rate": 1.7150161648204622e-06, + "loss": 0.2802, + "step": 14211 + }, + { + "epoch": 0.82, + "grad_norm": 0.30989601757233975, + "learning_rate": 1.713974213127475e-06, + "loss": 0.2593, + "step": 14212 + }, + { + "epoch": 0.82, + "grad_norm": 0.36938090057695033, + "learning_rate": 1.7129325483792048e-06, + "loss": 0.3223, + "step": 14213 + }, + { + "epoch": 0.82, + "grad_norm": 1.4585879963319681, + "learning_rate": 1.7118911706117213e-06, + "loss": 0.326, + "step": 14214 + }, + { + "epoch": 0.82, + "grad_norm": 0.32485245497389287, + "learning_rate": 1.710850079861095e-06, + "loss": 0.2697, + "step": 14215 + }, + { + "epoch": 0.82, + "grad_norm": 0.44404969059636795, + "learning_rate": 1.7098092761633722e-06, + "loss": 0.247, + "step": 14216 + }, + { + "epoch": 0.82, + "grad_norm": 0.32738510941378074, + "learning_rate": 1.7087687595545943e-06, + "loss": 0.2433, + "step": 14217 + }, + { + "epoch": 0.82, + "grad_norm": 0.34031209603664575, + "learning_rate": 1.7077285300708002e-06, + "loss": 0.2835, + "step": 14218 + }, + { + "epoch": 0.82, + "grad_norm": 0.8347750409587513, + "learning_rate": 1.706688587748011e-06, + "loss": 0.5498, + "step": 14219 + }, + { + "epoch": 0.82, + "grad_norm": 0.2600218973847508, + "learning_rate": 1.7056489326222392e-06, + "loss": 0.2242, + "step": 14220 + }, + { + "epoch": 0.82, + "grad_norm": 0.2922422219720576, + "learning_rate": 1.7046095647294859e-06, + "loss": 0.1806, + "step": 14221 + }, + { + "epoch": 0.82, + "grad_norm": 0.9252160238532011, + "learning_rate": 1.70357048410575e-06, + "loss": 0.3704, + "step": 14222 + }, + { + "epoch": 0.82, + "grad_norm": 0.46332376444231493, + "learning_rate": 1.7025316907870105e-06, + "loss": 0.2813, + "step": 14223 + }, + { + "epoch": 0.82, + "grad_norm": 0.28819936990494277, + "learning_rate": 1.7014931848092409e-06, + "loss": 0.1896, + "step": 14224 + }, + { + "epoch": 0.82, + "grad_norm": 0.32904385789124907, + "learning_rate": 1.700454966208407e-06, + "loss": 0.313, + "step": 14225 + }, + { + "epoch": 0.82, + "grad_norm": 0.3804473251777745, + "learning_rate": 1.6994170350204576e-06, + "loss": 0.2184, + "step": 14226 + }, + { + "epoch": 0.82, + "grad_norm": 0.31666399354697766, + "learning_rate": 1.6983793912813418e-06, + "loss": 0.1884, + "step": 14227 + }, + { + "epoch": 0.82, + "grad_norm": 0.35107911647497736, + "learning_rate": 1.6973420350269909e-06, + "loss": 0.2859, + "step": 14228 + }, + { + "epoch": 0.82, + "grad_norm": 0.689539750224186, + "learning_rate": 1.6963049662933273e-06, + "loss": 0.3485, + "step": 14229 + }, + { + "epoch": 0.82, + "grad_norm": 0.2906909140943072, + "learning_rate": 1.6952681851162644e-06, + "loss": 0.1952, + "step": 14230 + }, + { + "epoch": 0.82, + "grad_norm": 1.1242614537154207, + "learning_rate": 1.6942316915317091e-06, + "loss": 0.7466, + "step": 14231 + }, + { + "epoch": 0.82, + "grad_norm": 0.23330529085667315, + "learning_rate": 1.6931954855755527e-06, + "loss": 0.2129, + "step": 14232 + }, + { + "epoch": 0.82, + "grad_norm": 0.3182776146060527, + "learning_rate": 1.6921595672836811e-06, + "loss": 0.235, + "step": 14233 + }, + { + "epoch": 0.82, + "grad_norm": 1.1253697865393655, + "learning_rate": 1.6911239366919618e-06, + "loss": 0.2787, + "step": 14234 + }, + { + "epoch": 0.82, + "grad_norm": 0.6584813922771027, + "learning_rate": 1.6900885938362677e-06, + "loss": 0.4179, + "step": 14235 + }, + { + "epoch": 0.82, + "grad_norm": 0.31570298371256733, + "learning_rate": 1.6890535387524465e-06, + "loss": 0.2391, + "step": 14236 + }, + { + "epoch": 0.82, + "grad_norm": 0.3460198335510046, + "learning_rate": 1.6880187714763453e-06, + "loss": 0.2658, + "step": 14237 + }, + { + "epoch": 0.82, + "grad_norm": 0.3434216073340945, + "learning_rate": 1.6869842920437961e-06, + "loss": 0.1589, + "step": 14238 + }, + { + "epoch": 0.82, + "grad_norm": 0.37042829811528694, + "learning_rate": 1.6859501004906208e-06, + "loss": 0.2287, + "step": 14239 + }, + { + "epoch": 0.82, + "grad_norm": 0.3684246063480538, + "learning_rate": 1.6849161968526384e-06, + "loss": 0.258, + "step": 14240 + }, + { + "epoch": 0.82, + "grad_norm": 0.49687821321936904, + "learning_rate": 1.6838825811656512e-06, + "loss": 0.3434, + "step": 14241 + }, + { + "epoch": 0.82, + "grad_norm": 0.4044736505228071, + "learning_rate": 1.6828492534654516e-06, + "loss": 0.2857, + "step": 14242 + }, + { + "epoch": 0.82, + "grad_norm": 0.2720523490521247, + "learning_rate": 1.6818162137878224e-06, + "loss": 0.1785, + "step": 14243 + }, + { + "epoch": 0.82, + "grad_norm": 0.2924842153658913, + "learning_rate": 1.6807834621685426e-06, + "loss": 0.2572, + "step": 14244 + }, + { + "epoch": 0.82, + "grad_norm": 0.42620199736044223, + "learning_rate": 1.6797509986433746e-06, + "loss": 0.2771, + "step": 14245 + }, + { + "epoch": 0.82, + "grad_norm": 0.4717554256016822, + "learning_rate": 1.678718823248071e-06, + "loss": 0.3129, + "step": 14246 + }, + { + "epoch": 0.82, + "grad_norm": 0.8840619730677773, + "learning_rate": 1.6776869360183746e-06, + "loss": 0.2366, + "step": 14247 + }, + { + "epoch": 0.82, + "grad_norm": 0.3197993001659107, + "learning_rate": 1.6766553369900241e-06, + "loss": 0.2557, + "step": 14248 + }, + { + "epoch": 0.82, + "grad_norm": 0.32803477233922695, + "learning_rate": 1.6756240261987434e-06, + "loss": 0.3214, + "step": 14249 + }, + { + "epoch": 0.82, + "grad_norm": 0.3035876936412637, + "learning_rate": 1.6745930036802428e-06, + "loss": 0.1272, + "step": 14250 + }, + { + "epoch": 0.82, + "grad_norm": 0.32877403922823634, + "learning_rate": 1.6735622694702259e-06, + "loss": 0.2405, + "step": 14251 + }, + { + "epoch": 0.82, + "grad_norm": 1.3599618100090622, + "learning_rate": 1.6725318236043908e-06, + "loss": 0.3988, + "step": 14252 + }, + { + "epoch": 0.82, + "grad_norm": 0.3396785730073297, + "learning_rate": 1.6715016661184225e-06, + "loss": 0.2526, + "step": 14253 + }, + { + "epoch": 0.82, + "grad_norm": 0.3797148679905969, + "learning_rate": 1.6704717970479923e-06, + "loss": 0.2691, + "step": 14254 + }, + { + "epoch": 0.82, + "grad_norm": 0.5481283074189417, + "learning_rate": 1.6694422164287627e-06, + "loss": 0.3733, + "step": 14255 + }, + { + "epoch": 0.82, + "grad_norm": 0.26139269536422804, + "learning_rate": 1.6684129242963943e-06, + "loss": 0.2224, + "step": 14256 + }, + { + "epoch": 0.82, + "grad_norm": 0.40947654935266997, + "learning_rate": 1.6673839206865283e-06, + "loss": 0.2732, + "step": 14257 + }, + { + "epoch": 0.82, + "grad_norm": 0.3235516674745909, + "learning_rate": 1.6663552056347975e-06, + "loss": 0.2544, + "step": 14258 + }, + { + "epoch": 0.82, + "grad_norm": 1.1695979780325698, + "learning_rate": 1.6653267791768258e-06, + "loss": 0.5732, + "step": 14259 + }, + { + "epoch": 0.82, + "grad_norm": 0.2849039189767827, + "learning_rate": 1.6642986413482321e-06, + "loss": 0.1958, + "step": 14260 + }, + { + "epoch": 0.82, + "grad_norm": 0.38973166723076286, + "learning_rate": 1.663270792184618e-06, + "loss": 0.2984, + "step": 14261 + }, + { + "epoch": 0.82, + "grad_norm": 0.8836587032487782, + "learning_rate": 1.6622432317215776e-06, + "loss": 0.4592, + "step": 14262 + }, + { + "epoch": 0.82, + "grad_norm": 0.40255447112780424, + "learning_rate": 1.6612159599946954e-06, + "loss": 0.2373, + "step": 14263 + }, + { + "epoch": 0.82, + "grad_norm": 0.20061146179604905, + "learning_rate": 1.660188977039544e-06, + "loss": 0.2094, + "step": 14264 + }, + { + "epoch": 0.82, + "grad_norm": 1.583325395018498, + "learning_rate": 1.659162282891692e-06, + "loss": 0.7721, + "step": 14265 + }, + { + "epoch": 0.82, + "grad_norm": 0.32561047876624966, + "learning_rate": 1.6581358775866907e-06, + "loss": 0.1869, + "step": 14266 + }, + { + "epoch": 0.82, + "grad_norm": 0.5064695995116827, + "learning_rate": 1.6571097611600862e-06, + "loss": 0.3667, + "step": 14267 + }, + { + "epoch": 0.82, + "grad_norm": 0.3729173917284346, + "learning_rate": 1.6560839336474088e-06, + "loss": 0.3192, + "step": 14268 + }, + { + "epoch": 0.82, + "grad_norm": 0.3673667832560838, + "learning_rate": 1.6550583950841891e-06, + "loss": 0.2171, + "step": 14269 + }, + { + "epoch": 0.82, + "grad_norm": 0.2352059959306334, + "learning_rate": 1.6540331455059377e-06, + "loss": 0.1389, + "step": 14270 + }, + { + "epoch": 0.82, + "grad_norm": 1.2082826393706474, + "learning_rate": 1.6530081849481595e-06, + "loss": 0.5537, + "step": 14271 + }, + { + "epoch": 0.82, + "grad_norm": 0.27082498922394227, + "learning_rate": 1.6519835134463468e-06, + "loss": 0.2635, + "step": 14272 + }, + { + "epoch": 0.82, + "grad_norm": 0.4933710914894767, + "learning_rate": 1.6509591310359886e-06, + "loss": 0.2803, + "step": 14273 + }, + { + "epoch": 0.82, + "grad_norm": 0.6557255957824724, + "learning_rate": 1.649935037752557e-06, + "loss": 0.3822, + "step": 14274 + }, + { + "epoch": 0.82, + "grad_norm": 0.8281936757764693, + "learning_rate": 1.648911233631516e-06, + "loss": 0.3157, + "step": 14275 + }, + { + "epoch": 0.82, + "grad_norm": 0.20637211972769265, + "learning_rate": 1.6478877187083187e-06, + "loss": 0.1921, + "step": 14276 + }, + { + "epoch": 0.82, + "grad_norm": 0.4550300837789682, + "learning_rate": 1.6468644930184097e-06, + "loss": 0.2334, + "step": 14277 + }, + { + "epoch": 0.82, + "grad_norm": 0.5805410905455298, + "learning_rate": 1.6458415565972253e-06, + "loss": 0.3295, + "step": 14278 + }, + { + "epoch": 0.82, + "grad_norm": 0.43031247602796713, + "learning_rate": 1.6448189094801891e-06, + "loss": 0.2736, + "step": 14279 + }, + { + "epoch": 0.82, + "grad_norm": 0.36717439759245385, + "learning_rate": 1.6437965517027143e-06, + "loss": 0.2979, + "step": 14280 + }, + { + "epoch": 0.82, + "grad_norm": 0.6289420966546745, + "learning_rate": 1.6427744833002036e-06, + "loss": 0.3179, + "step": 14281 + }, + { + "epoch": 0.82, + "grad_norm": 0.24428017486398262, + "learning_rate": 1.6417527043080583e-06, + "loss": 0.1558, + "step": 14282 + }, + { + "epoch": 0.82, + "grad_norm": 1.1756826326190717, + "learning_rate": 1.6407312147616539e-06, + "loss": 0.4512, + "step": 14283 + }, + { + "epoch": 0.82, + "grad_norm": 0.3007377368607552, + "learning_rate": 1.6397100146963662e-06, + "loss": 0.2467, + "step": 14284 + }, + { + "epoch": 0.82, + "grad_norm": 0.38786311492343356, + "learning_rate": 1.6386891041475639e-06, + "loss": 0.3136, + "step": 14285 + }, + { + "epoch": 0.82, + "grad_norm": 0.719273995861604, + "learning_rate": 1.6376684831505984e-06, + "loss": 0.2903, + "step": 14286 + }, + { + "epoch": 0.82, + "grad_norm": 0.616064577176051, + "learning_rate": 1.636648151740814e-06, + "loss": 0.2994, + "step": 14287 + }, + { + "epoch": 0.82, + "grad_norm": 0.29643428083943496, + "learning_rate": 1.6356281099535432e-06, + "loss": 0.2724, + "step": 14288 + }, + { + "epoch": 0.82, + "grad_norm": 0.27973403877242464, + "learning_rate": 1.63460835782411e-06, + "loss": 0.1537, + "step": 14289 + }, + { + "epoch": 0.82, + "grad_norm": 0.3902255474494219, + "learning_rate": 1.633588895387832e-06, + "loss": 0.2725, + "step": 14290 + }, + { + "epoch": 0.82, + "grad_norm": 0.6206832393439994, + "learning_rate": 1.6325697226800109e-06, + "loss": 0.3138, + "step": 14291 + }, + { + "epoch": 0.82, + "grad_norm": 0.32000206528220054, + "learning_rate": 1.6315508397359391e-06, + "loss": 0.2513, + "step": 14292 + }, + { + "epoch": 0.82, + "grad_norm": 0.9261649174707342, + "learning_rate": 1.6305322465909012e-06, + "loss": 0.3988, + "step": 14293 + }, + { + "epoch": 0.82, + "grad_norm": 0.3895447592390385, + "learning_rate": 1.6295139432801732e-06, + "loss": 0.2633, + "step": 14294 + }, + { + "epoch": 0.82, + "grad_norm": 0.20039587976898893, + "learning_rate": 1.628495929839018e-06, + "loss": 0.1685, + "step": 14295 + }, + { + "epoch": 0.82, + "grad_norm": 0.678502925193134, + "learning_rate": 1.6274782063026883e-06, + "loss": 0.3645, + "step": 14296 + }, + { + "epoch": 0.82, + "grad_norm": 0.3716671212985712, + "learning_rate": 1.6264607727064253e-06, + "loss": 0.3074, + "step": 14297 + }, + { + "epoch": 0.82, + "grad_norm": 0.8235463841034898, + "learning_rate": 1.6254436290854691e-06, + "loss": 0.4441, + "step": 14298 + }, + { + "epoch": 0.82, + "grad_norm": 0.6843383792249415, + "learning_rate": 1.62442677547504e-06, + "loss": 0.1194, + "step": 14299 + }, + { + "epoch": 0.82, + "grad_norm": 0.25397188356352135, + "learning_rate": 1.62341021191035e-06, + "loss": 0.267, + "step": 14300 + }, + { + "epoch": 0.82, + "grad_norm": 0.32104682332962464, + "learning_rate": 1.6223939384266064e-06, + "loss": 0.1979, + "step": 14301 + }, + { + "epoch": 0.82, + "grad_norm": 0.6705088211808614, + "learning_rate": 1.6213779550589959e-06, + "loss": 0.2417, + "step": 14302 + }, + { + "epoch": 0.82, + "grad_norm": 0.41855789850269925, + "learning_rate": 1.6203622618427105e-06, + "loss": 0.2967, + "step": 14303 + }, + { + "epoch": 0.82, + "grad_norm": 0.3418585073461008, + "learning_rate": 1.6193468588129192e-06, + "loss": 0.311, + "step": 14304 + }, + { + "epoch": 0.82, + "grad_norm": 0.8013089892375115, + "learning_rate": 1.6183317460047853e-06, + "loss": 0.1193, + "step": 14305 + }, + { + "epoch": 0.82, + "grad_norm": 0.4221278782395689, + "learning_rate": 1.6173169234534602e-06, + "loss": 0.2809, + "step": 14306 + }, + { + "epoch": 0.82, + "grad_norm": 0.286743200585038, + "learning_rate": 1.6163023911940923e-06, + "loss": 0.2143, + "step": 14307 + }, + { + "epoch": 0.82, + "grad_norm": 0.4101065643986548, + "learning_rate": 1.6152881492618123e-06, + "loss": 0.2727, + "step": 14308 + }, + { + "epoch": 0.82, + "grad_norm": 0.3389018916742762, + "learning_rate": 1.614274197691743e-06, + "loss": 0.2422, + "step": 14309 + }, + { + "epoch": 0.82, + "grad_norm": 1.02805515707719, + "learning_rate": 1.6132605365189945e-06, + "loss": 0.6135, + "step": 14310 + }, + { + "epoch": 0.82, + "grad_norm": 0.46498023154122176, + "learning_rate": 1.6122471657786764e-06, + "loss": 0.336, + "step": 14311 + }, + { + "epoch": 0.82, + "grad_norm": 0.24162416824879168, + "learning_rate": 1.6112340855058784e-06, + "loss": 0.199, + "step": 14312 + }, + { + "epoch": 0.82, + "grad_norm": 0.4745324261022142, + "learning_rate": 1.6102212957356821e-06, + "loss": 0.2715, + "step": 14313 + }, + { + "epoch": 0.82, + "grad_norm": 0.8109722485644276, + "learning_rate": 1.6092087965031623e-06, + "loss": 0.4024, + "step": 14314 + }, + { + "epoch": 0.82, + "grad_norm": 0.29032677627833714, + "learning_rate": 1.6081965878433781e-06, + "loss": 0.1993, + "step": 14315 + }, + { + "epoch": 0.82, + "grad_norm": 0.2764951495219204, + "learning_rate": 1.6071846697913907e-06, + "loss": 0.2629, + "step": 14316 + }, + { + "epoch": 0.82, + "grad_norm": 1.0850913168148355, + "learning_rate": 1.6061730423822353e-06, + "loss": 0.4369, + "step": 14317 + }, + { + "epoch": 0.82, + "grad_norm": 0.3368554761044434, + "learning_rate": 1.6051617056509427e-06, + "loss": 0.1969, + "step": 14318 + }, + { + "epoch": 0.82, + "grad_norm": 0.6791350794668766, + "learning_rate": 1.604150659632543e-06, + "loss": 0.3668, + "step": 14319 + }, + { + "epoch": 0.82, + "grad_norm": 0.2824110904939739, + "learning_rate": 1.6031399043620444e-06, + "loss": 0.2334, + "step": 14320 + }, + { + "epoch": 0.82, + "grad_norm": 0.3433389634591825, + "learning_rate": 1.6021294398744491e-06, + "loss": 0.2899, + "step": 14321 + }, + { + "epoch": 0.82, + "grad_norm": 0.27915034093289076, + "learning_rate": 1.6011192662047493e-06, + "loss": 0.1297, + "step": 14322 + }, + { + "epoch": 0.82, + "grad_norm": 0.36788278715290085, + "learning_rate": 1.6001093833879288e-06, + "loss": 0.3075, + "step": 14323 + }, + { + "epoch": 0.82, + "grad_norm": 0.3496665619411987, + "learning_rate": 1.5990997914589602e-06, + "loss": 0.2855, + "step": 14324 + }, + { + "epoch": 0.82, + "grad_norm": 0.5939639419698534, + "learning_rate": 1.598090490452805e-06, + "loss": 0.2828, + "step": 14325 + }, + { + "epoch": 0.82, + "grad_norm": 0.7697198306103441, + "learning_rate": 1.5970814804044143e-06, + "loss": 0.4741, + "step": 14326 + }, + { + "epoch": 0.82, + "grad_norm": 0.3498249717359082, + "learning_rate": 1.5960727613487282e-06, + "loss": 0.2813, + "step": 14327 + }, + { + "epoch": 0.82, + "grad_norm": 0.2114390332677578, + "learning_rate": 1.5950643333206827e-06, + "loss": 0.1838, + "step": 14328 + }, + { + "epoch": 0.82, + "grad_norm": 1.312899098888681, + "learning_rate": 1.5940561963551982e-06, + "loss": 0.3905, + "step": 14329 + }, + { + "epoch": 0.82, + "grad_norm": 0.39726885707439447, + "learning_rate": 1.5930483504871863e-06, + "loss": 0.2603, + "step": 14330 + }, + { + "epoch": 0.82, + "grad_norm": 0.3441962419375129, + "learning_rate": 1.5920407957515472e-06, + "loss": 0.2621, + "step": 14331 + }, + { + "epoch": 0.82, + "grad_norm": 1.244026243921077, + "learning_rate": 1.5910335321831749e-06, + "loss": 0.7247, + "step": 14332 + }, + { + "epoch": 0.82, + "grad_norm": 0.32768146343407234, + "learning_rate": 1.5900265598169507e-06, + "loss": 0.2683, + "step": 14333 + }, + { + "epoch": 0.82, + "grad_norm": 0.2208843892022444, + "learning_rate": 1.5890198786877442e-06, + "loss": 0.1797, + "step": 14334 + }, + { + "epoch": 0.82, + "grad_norm": 0.3084248363985326, + "learning_rate": 1.5880134888304155e-06, + "loss": 0.2296, + "step": 14335 + }, + { + "epoch": 0.82, + "grad_norm": 0.3367945172727081, + "learning_rate": 1.58700739027982e-06, + "loss": 0.2464, + "step": 14336 + }, + { + "epoch": 0.82, + "grad_norm": 0.6608801653776333, + "learning_rate": 1.5860015830707976e-06, + "loss": 0.3797, + "step": 14337 + }, + { + "epoch": 0.82, + "grad_norm": 1.5594061912195747, + "learning_rate": 1.5849960672381781e-06, + "loss": 0.3649, + "step": 14338 + }, + { + "epoch": 0.82, + "grad_norm": 0.26050111086502553, + "learning_rate": 1.5839908428167806e-06, + "loss": 0.2343, + "step": 14339 + }, + { + "epoch": 0.82, + "grad_norm": 0.537863403344283, + "learning_rate": 1.5829859098414202e-06, + "loss": 0.3252, + "step": 14340 + }, + { + "epoch": 0.82, + "grad_norm": 0.3168205101844687, + "learning_rate": 1.5819812683468971e-06, + "loss": 0.1417, + "step": 14341 + }, + { + "epoch": 0.82, + "grad_norm": 0.36311874865663823, + "learning_rate": 1.5809769183680001e-06, + "loss": 0.2854, + "step": 14342 + }, + { + "epoch": 0.82, + "grad_norm": 0.44223521127969234, + "learning_rate": 1.5799728599395093e-06, + "loss": 0.3553, + "step": 14343 + }, + { + "epoch": 0.82, + "grad_norm": 0.40043078151688744, + "learning_rate": 1.5789690930961955e-06, + "loss": 0.2374, + "step": 14344 + }, + { + "epoch": 0.82, + "grad_norm": 0.3187473275956427, + "learning_rate": 1.577965617872821e-06, + "loss": 0.2568, + "step": 14345 + }, + { + "epoch": 0.82, + "grad_norm": 0.40970369506063714, + "learning_rate": 1.5769624343041356e-06, + "loss": 0.2775, + "step": 14346 + }, + { + "epoch": 0.82, + "grad_norm": 0.27328936575328555, + "learning_rate": 1.5759595424248798e-06, + "loss": 0.2324, + "step": 14347 + }, + { + "epoch": 0.82, + "grad_norm": 0.3200545522278074, + "learning_rate": 1.5749569422697786e-06, + "loss": 0.2029, + "step": 14348 + }, + { + "epoch": 0.82, + "grad_norm": 0.7813268400056336, + "learning_rate": 1.57395463387356e-06, + "loss": 0.4164, + "step": 14349 + }, + { + "epoch": 0.82, + "grad_norm": 1.362908227303668, + "learning_rate": 1.572952617270932e-06, + "loss": 0.7034, + "step": 14350 + }, + { + "epoch": 0.82, + "grad_norm": 0.2630012527604643, + "learning_rate": 1.5719508924965876e-06, + "loss": 0.2133, + "step": 14351 + }, + { + "epoch": 0.82, + "grad_norm": 0.5084932701306548, + "learning_rate": 1.5709494595852238e-06, + "loss": 0.3867, + "step": 14352 + }, + { + "epoch": 0.82, + "grad_norm": 0.4391672927929206, + "learning_rate": 1.569948318571517e-06, + "loss": 0.2645, + "step": 14353 + }, + { + "epoch": 0.82, + "grad_norm": 0.24787677119492113, + "learning_rate": 1.5689474694901386e-06, + "loss": 0.1697, + "step": 14354 + }, + { + "epoch": 0.82, + "grad_norm": 0.5045982631903566, + "learning_rate": 1.5679469123757463e-06, + "loss": 0.331, + "step": 14355 + }, + { + "epoch": 0.82, + "grad_norm": 0.5158521633631827, + "learning_rate": 1.566946647262988e-06, + "loss": 0.3973, + "step": 14356 + }, + { + "epoch": 0.82, + "grad_norm": 0.2975250660129979, + "learning_rate": 1.5659466741865059e-06, + "loss": 0.2084, + "step": 14357 + }, + { + "epoch": 0.82, + "grad_norm": 0.882686185713709, + "learning_rate": 1.5649469931809291e-06, + "loss": 0.4715, + "step": 14358 + }, + { + "epoch": 0.82, + "grad_norm": 0.4060897628478708, + "learning_rate": 1.5639476042808743e-06, + "loss": 0.3158, + "step": 14359 + }, + { + "epoch": 0.83, + "grad_norm": 0.37822526003360196, + "learning_rate": 1.5629485075209494e-06, + "loss": 0.2885, + "step": 14360 + }, + { + "epoch": 0.83, + "grad_norm": 0.26592851936967965, + "learning_rate": 1.5619497029357566e-06, + "loss": 0.1386, + "step": 14361 + }, + { + "epoch": 0.83, + "grad_norm": 0.48297293822615317, + "learning_rate": 1.5609511905598828e-06, + "loss": 0.3686, + "step": 14362 + }, + { + "epoch": 0.83, + "grad_norm": 0.3980489462284402, + "learning_rate": 1.559952970427907e-06, + "loss": 0.2771, + "step": 14363 + }, + { + "epoch": 0.83, + "grad_norm": 0.3248249460067187, + "learning_rate": 1.5589550425743938e-06, + "loss": 0.2326, + "step": 14364 + }, + { + "epoch": 0.83, + "grad_norm": 0.7106496612344982, + "learning_rate": 1.5579574070339077e-06, + "loss": 0.397, + "step": 14365 + }, + { + "epoch": 0.83, + "grad_norm": 0.25155760454843434, + "learning_rate": 1.5569600638409931e-06, + "loss": 0.179, + "step": 14366 + }, + { + "epoch": 0.83, + "grad_norm": 0.23556004098954053, + "learning_rate": 1.5559630130301885e-06, + "loss": 0.1974, + "step": 14367 + }, + { + "epoch": 0.83, + "grad_norm": 1.3610236878509954, + "learning_rate": 1.5549662546360223e-06, + "loss": 0.7535, + "step": 14368 + }, + { + "epoch": 0.83, + "grad_norm": 0.4102886902865449, + "learning_rate": 1.5539697886930082e-06, + "loss": 0.3094, + "step": 14369 + }, + { + "epoch": 0.83, + "grad_norm": 0.4806336247620259, + "learning_rate": 1.5529736152356601e-06, + "loss": 0.2572, + "step": 14370 + }, + { + "epoch": 0.83, + "grad_norm": 0.3447873590811517, + "learning_rate": 1.551977734298472e-06, + "loss": 0.3017, + "step": 14371 + }, + { + "epoch": 0.83, + "grad_norm": 0.3259821776760638, + "learning_rate": 1.5509821459159312e-06, + "loss": 0.2352, + "step": 14372 + }, + { + "epoch": 0.83, + "grad_norm": 0.35766169837779577, + "learning_rate": 1.5499868501225135e-06, + "loss": 0.2043, + "step": 14373 + }, + { + "epoch": 0.83, + "grad_norm": 0.45759062221042746, + "learning_rate": 1.548991846952691e-06, + "loss": 0.2734, + "step": 14374 + }, + { + "epoch": 0.83, + "grad_norm": 0.26891305999108633, + "learning_rate": 1.5479971364409163e-06, + "loss": 0.2624, + "step": 14375 + }, + { + "epoch": 0.83, + "grad_norm": 0.6950747504087013, + "learning_rate": 1.5470027186216386e-06, + "loss": 0.3777, + "step": 14376 + }, + { + "epoch": 0.83, + "grad_norm": 0.8274621163981223, + "learning_rate": 1.5460085935292902e-06, + "loss": 0.2935, + "step": 14377 + }, + { + "epoch": 0.83, + "grad_norm": 0.5470660206047755, + "learning_rate": 1.5450147611983024e-06, + "loss": 0.2792, + "step": 14378 + }, + { + "epoch": 0.83, + "grad_norm": 0.22596268465440378, + "learning_rate": 1.5440212216630902e-06, + "loss": 0.2247, + "step": 14379 + }, + { + "epoch": 0.83, + "grad_norm": 0.4563189462879652, + "learning_rate": 1.54302797495806e-06, + "loss": 0.2061, + "step": 14380 + }, + { + "epoch": 0.83, + "grad_norm": 0.408530641518608, + "learning_rate": 1.5420350211176072e-06, + "loss": 0.2611, + "step": 14381 + }, + { + "epoch": 0.83, + "grad_norm": 0.4614799902770901, + "learning_rate": 1.541042360176115e-06, + "loss": 0.3243, + "step": 14382 + }, + { + "epoch": 0.83, + "grad_norm": 0.3767321614513136, + "learning_rate": 1.5400499921679647e-06, + "loss": 0.2978, + "step": 14383 + }, + { + "epoch": 0.83, + "grad_norm": 0.32608810726150617, + "learning_rate": 1.5390579171275222e-06, + "loss": 0.1694, + "step": 14384 + }, + { + "epoch": 0.83, + "grad_norm": 0.2786817942246057, + "learning_rate": 1.5380661350891346e-06, + "loss": 0.1812, + "step": 14385 + }, + { + "epoch": 0.83, + "grad_norm": 1.2423112132808052, + "learning_rate": 1.5370746460871555e-06, + "loss": 0.7655, + "step": 14386 + }, + { + "epoch": 0.83, + "grad_norm": 0.24830800114573304, + "learning_rate": 1.5360834501559185e-06, + "loss": 0.2066, + "step": 14387 + }, + { + "epoch": 0.83, + "grad_norm": 0.6941033661044869, + "learning_rate": 1.5350925473297462e-06, + "loss": 0.3225, + "step": 14388 + }, + { + "epoch": 0.83, + "grad_norm": 1.3297493516790682, + "learning_rate": 1.5341019376429533e-06, + "loss": 0.4718, + "step": 14389 + }, + { + "epoch": 0.83, + "grad_norm": 0.33027397591322355, + "learning_rate": 1.5331116211298492e-06, + "loss": 0.1663, + "step": 14390 + }, + { + "epoch": 0.83, + "grad_norm": 0.3162742331726112, + "learning_rate": 1.532121597824725e-06, + "loss": 0.2882, + "step": 14391 + }, + { + "epoch": 0.83, + "grad_norm": 0.3078259870411867, + "learning_rate": 1.5311318677618658e-06, + "loss": 0.1998, + "step": 14392 + }, + { + "epoch": 0.83, + "grad_norm": 0.3203383686794931, + "learning_rate": 1.5301424309755464e-06, + "loss": 0.2119, + "step": 14393 + }, + { + "epoch": 0.83, + "grad_norm": 0.9170395837276937, + "learning_rate": 1.529153287500027e-06, + "loss": 0.5176, + "step": 14394 + }, + { + "epoch": 0.83, + "grad_norm": 0.3675141498763632, + "learning_rate": 1.5281644373695682e-06, + "loss": 0.2876, + "step": 14395 + }, + { + "epoch": 0.83, + "grad_norm": 0.6819063271267362, + "learning_rate": 1.52717588061841e-06, + "loss": 0.26, + "step": 14396 + }, + { + "epoch": 0.83, + "grad_norm": 0.18812990097663482, + "learning_rate": 1.5261876172807865e-06, + "loss": 0.1458, + "step": 14397 + }, + { + "epoch": 0.83, + "grad_norm": 0.4405755811998149, + "learning_rate": 1.5251996473909202e-06, + "loss": 0.3262, + "step": 14398 + }, + { + "epoch": 0.83, + "grad_norm": 0.4274068515417533, + "learning_rate": 1.5242119709830272e-06, + "loss": 0.3208, + "step": 14399 + }, + { + "epoch": 0.83, + "grad_norm": 0.30076883740926846, + "learning_rate": 1.5232245880913088e-06, + "loss": 0.2194, + "step": 14400 + }, + { + "epoch": 0.83, + "grad_norm": 1.2838196412791696, + "learning_rate": 1.5222374987499588e-06, + "loss": 0.536, + "step": 14401 + }, + { + "epoch": 0.83, + "grad_norm": 0.5728611563163679, + "learning_rate": 1.5212507029931578e-06, + "loss": 0.2791, + "step": 14402 + }, + { + "epoch": 0.83, + "grad_norm": 0.22643317321614864, + "learning_rate": 1.5202642008550827e-06, + "loss": 0.2097, + "step": 14403 + }, + { + "epoch": 0.83, + "grad_norm": 0.781228803059026, + "learning_rate": 1.519277992369893e-06, + "loss": 0.3939, + "step": 14404 + }, + { + "epoch": 0.83, + "grad_norm": 0.5420511061950547, + "learning_rate": 1.5182920775717425e-06, + "loss": 0.3284, + "step": 14405 + }, + { + "epoch": 0.83, + "grad_norm": 0.23424207699255722, + "learning_rate": 1.5173064564947714e-06, + "loss": 0.2109, + "step": 14406 + }, + { + "epoch": 0.83, + "grad_norm": 0.4855977165429987, + "learning_rate": 1.5163211291731116e-06, + "loss": 0.3538, + "step": 14407 + }, + { + "epoch": 0.83, + "grad_norm": 0.7778771909877253, + "learning_rate": 1.5153360956408891e-06, + "loss": 0.3002, + "step": 14408 + }, + { + "epoch": 0.83, + "grad_norm": 0.40367431465738035, + "learning_rate": 1.514351355932212e-06, + "loss": 0.2746, + "step": 14409 + }, + { + "epoch": 0.83, + "grad_norm": 0.4594177145672264, + "learning_rate": 1.513366910081182e-06, + "loss": 0.2778, + "step": 14410 + }, + { + "epoch": 0.83, + "grad_norm": 0.3612788772247959, + "learning_rate": 1.5123827581218898e-06, + "loss": 0.3111, + "step": 14411 + }, + { + "epoch": 0.83, + "grad_norm": 0.3966527721965122, + "learning_rate": 1.5113989000884189e-06, + "loss": 0.303, + "step": 14412 + }, + { + "epoch": 0.83, + "grad_norm": 0.17782252117943567, + "learning_rate": 1.51041533601484e-06, + "loss": 0.0841, + "step": 14413 + }, + { + "epoch": 0.83, + "grad_norm": 0.3769117084741131, + "learning_rate": 1.5094320659352123e-06, + "loss": 0.2748, + "step": 14414 + }, + { + "epoch": 0.83, + "grad_norm": 0.3041994326675657, + "learning_rate": 1.5084490898835857e-06, + "loss": 0.2886, + "step": 14415 + }, + { + "epoch": 0.83, + "grad_norm": 0.6275402594647362, + "learning_rate": 1.5074664078940039e-06, + "loss": 0.2981, + "step": 14416 + }, + { + "epoch": 0.83, + "grad_norm": 0.7745986505170465, + "learning_rate": 1.5064840200004972e-06, + "loss": 0.4663, + "step": 14417 + }, + { + "epoch": 0.83, + "grad_norm": 0.26455906511621774, + "learning_rate": 1.5055019262370807e-06, + "loss": 0.2277, + "step": 14418 + }, + { + "epoch": 0.83, + "grad_norm": 0.2744273616921841, + "learning_rate": 1.5045201266377662e-06, + "loss": 0.191, + "step": 14419 + }, + { + "epoch": 0.83, + "grad_norm": 0.8188194127786869, + "learning_rate": 1.5035386212365554e-06, + "loss": 0.4115, + "step": 14420 + }, + { + "epoch": 0.83, + "grad_norm": 0.3522078195909208, + "learning_rate": 1.502557410067438e-06, + "loss": 0.2702, + "step": 14421 + }, + { + "epoch": 0.83, + "grad_norm": 0.6756378638949322, + "learning_rate": 1.5015764931643916e-06, + "loss": 0.4208, + "step": 14422 + }, + { + "epoch": 0.83, + "grad_norm": 0.25982068134200115, + "learning_rate": 1.5005958705613833e-06, + "loss": 0.2264, + "step": 14423 + }, + { + "epoch": 0.83, + "grad_norm": 0.3699918270593306, + "learning_rate": 1.4996155422923764e-06, + "loss": 0.3032, + "step": 14424 + }, + { + "epoch": 0.83, + "grad_norm": 0.31773506752213887, + "learning_rate": 1.4986355083913184e-06, + "loss": 0.1678, + "step": 14425 + }, + { + "epoch": 0.83, + "grad_norm": 0.28175897232877206, + "learning_rate": 1.4976557688921478e-06, + "loss": 0.2038, + "step": 14426 + }, + { + "epoch": 0.83, + "grad_norm": 0.3882773152820447, + "learning_rate": 1.4966763238287885e-06, + "loss": 0.2799, + "step": 14427 + }, + { + "epoch": 0.83, + "grad_norm": 0.6931088691932338, + "learning_rate": 1.4956971732351655e-06, + "loss": 0.3976, + "step": 14428 + }, + { + "epoch": 0.83, + "grad_norm": 0.3691483478603107, + "learning_rate": 1.4947183171451841e-06, + "loss": 0.2152, + "step": 14429 + }, + { + "epoch": 0.83, + "grad_norm": 0.38071812733088667, + "learning_rate": 1.4937397555927413e-06, + "loss": 0.2909, + "step": 14430 + }, + { + "epoch": 0.83, + "grad_norm": 0.2094977356719134, + "learning_rate": 1.4927614886117248e-06, + "loss": 0.1925, + "step": 14431 + }, + { + "epoch": 0.83, + "grad_norm": 0.5582480502897131, + "learning_rate": 1.4917835162360107e-06, + "loss": 0.2479, + "step": 14432 + }, + { + "epoch": 0.83, + "grad_norm": 0.3547248993099505, + "learning_rate": 1.4908058384994684e-06, + "loss": 0.2633, + "step": 14433 + }, + { + "epoch": 0.83, + "grad_norm": 0.33898191546731055, + "learning_rate": 1.4898284554359555e-06, + "loss": 0.2929, + "step": 14434 + }, + { + "epoch": 0.83, + "grad_norm": 1.20134478839783, + "learning_rate": 1.4888513670793159e-06, + "loss": 0.7416, + "step": 14435 + }, + { + "epoch": 0.83, + "grad_norm": 0.31287229804617556, + "learning_rate": 1.4878745734633859e-06, + "loss": 0.1874, + "step": 14436 + }, + { + "epoch": 0.83, + "grad_norm": 0.28716920169026183, + "learning_rate": 1.4868980746219953e-06, + "loss": 0.1744, + "step": 14437 + }, + { + "epoch": 0.83, + "grad_norm": 0.3602326317426348, + "learning_rate": 1.485921870588959e-06, + "loss": 0.3254, + "step": 14438 + }, + { + "epoch": 0.83, + "grad_norm": 0.34644487846977096, + "learning_rate": 1.4849459613980821e-06, + "loss": 0.2226, + "step": 14439 + }, + { + "epoch": 0.83, + "grad_norm": 1.7099936095796247, + "learning_rate": 1.4839703470831568e-06, + "loss": 0.6051, + "step": 14440 + }, + { + "epoch": 0.83, + "grad_norm": 1.1426420559205335, + "learning_rate": 1.4829950276779759e-06, + "loss": 0.6309, + "step": 14441 + }, + { + "epoch": 0.83, + "grad_norm": 0.21521110544361674, + "learning_rate": 1.4820200032163102e-06, + "loss": 0.2127, + "step": 14442 + }, + { + "epoch": 0.83, + "grad_norm": 0.5594517974144692, + "learning_rate": 1.481045273731926e-06, + "loss": 0.2786, + "step": 14443 + }, + { + "epoch": 0.83, + "grad_norm": 0.6070929376142952, + "learning_rate": 1.480070839258575e-06, + "loss": 0.3338, + "step": 14444 + }, + { + "epoch": 0.83, + "grad_norm": 0.2462055074718749, + "learning_rate": 1.479096699830007e-06, + "loss": 0.1646, + "step": 14445 + }, + { + "epoch": 0.83, + "grad_norm": 0.3872583829909997, + "learning_rate": 1.4781228554799544e-06, + "loss": 0.3043, + "step": 14446 + }, + { + "epoch": 0.83, + "grad_norm": 0.7217473632643663, + "learning_rate": 1.4771493062421393e-06, + "loss": 0.5007, + "step": 14447 + }, + { + "epoch": 0.83, + "grad_norm": 0.3543168866164041, + "learning_rate": 1.4761760521502788e-06, + "loss": 0.272, + "step": 14448 + }, + { + "epoch": 0.83, + "grad_norm": 0.694609659735516, + "learning_rate": 1.4752030932380723e-06, + "loss": 0.2679, + "step": 14449 + }, + { + "epoch": 0.83, + "grad_norm": 0.2668551292038593, + "learning_rate": 1.4742304295392173e-06, + "loss": 0.2415, + "step": 14450 + }, + { + "epoch": 0.83, + "grad_norm": 0.348151555424367, + "learning_rate": 1.4732580610873991e-06, + "loss": 0.2819, + "step": 14451 + }, + { + "epoch": 0.83, + "grad_norm": 0.4865809063782709, + "learning_rate": 1.4722859879162831e-06, + "loss": 0.164, + "step": 14452 + }, + { + "epoch": 0.83, + "grad_norm": 1.3427781510155108, + "learning_rate": 1.471314210059539e-06, + "loss": 0.8253, + "step": 14453 + }, + { + "epoch": 0.83, + "grad_norm": 0.2627245576209838, + "learning_rate": 1.4703427275508175e-06, + "loss": 0.2502, + "step": 14454 + }, + { + "epoch": 0.83, + "grad_norm": 0.38749675227945757, + "learning_rate": 1.4693715404237595e-06, + "loss": 0.2567, + "step": 14455 + }, + { + "epoch": 0.83, + "grad_norm": 0.48887376832848684, + "learning_rate": 1.4684006487119996e-06, + "loss": 0.2749, + "step": 14456 + }, + { + "epoch": 0.83, + "grad_norm": 0.23372320232067798, + "learning_rate": 1.4674300524491548e-06, + "loss": 0.1902, + "step": 14457 + }, + { + "epoch": 0.83, + "grad_norm": 0.3504977034974888, + "learning_rate": 1.466459751668843e-06, + "loss": 0.2363, + "step": 14458 + }, + { + "epoch": 0.83, + "grad_norm": 1.1512347307415474, + "learning_rate": 1.4654897464046624e-06, + "loss": 0.7318, + "step": 14459 + }, + { + "epoch": 0.83, + "grad_norm": 0.33497458339458847, + "learning_rate": 1.4645200366902056e-06, + "loss": 0.2622, + "step": 14460 + }, + { + "epoch": 0.83, + "grad_norm": 0.6891307734834573, + "learning_rate": 1.4635506225590511e-06, + "loss": 0.3649, + "step": 14461 + }, + { + "epoch": 0.83, + "grad_norm": 0.24648296086863, + "learning_rate": 1.4625815040447733e-06, + "loss": 0.1953, + "step": 14462 + }, + { + "epoch": 0.83, + "grad_norm": 0.3198739485686858, + "learning_rate": 1.4616126811809305e-06, + "loss": 0.2548, + "step": 14463 + }, + { + "epoch": 0.83, + "grad_norm": 0.4785902437377649, + "learning_rate": 1.4606441540010742e-06, + "loss": 0.2262, + "step": 14464 + }, + { + "epoch": 0.83, + "grad_norm": 0.42064172693904334, + "learning_rate": 1.4596759225387401e-06, + "loss": 0.2391, + "step": 14465 + }, + { + "epoch": 0.83, + "grad_norm": 0.3519446079196051, + "learning_rate": 1.4587079868274644e-06, + "loss": 0.2952, + "step": 14466 + }, + { + "epoch": 0.83, + "grad_norm": 0.4042615656613845, + "learning_rate": 1.4577403469007645e-06, + "loss": 0.3237, + "step": 14467 + }, + { + "epoch": 0.83, + "grad_norm": 0.35400671879270007, + "learning_rate": 1.4567730027921489e-06, + "loss": 0.0845, + "step": 14468 + }, + { + "epoch": 0.83, + "grad_norm": 0.3576298559652004, + "learning_rate": 1.4558059545351144e-06, + "loss": 0.2974, + "step": 14469 + }, + { + "epoch": 0.83, + "grad_norm": 0.2690274370266391, + "learning_rate": 1.4548392021631541e-06, + "loss": 0.264, + "step": 14470 + }, + { + "epoch": 0.83, + "grad_norm": 1.260476076309206, + "learning_rate": 1.4538727457097447e-06, + "loss": 0.7678, + "step": 14471 + }, + { + "epoch": 0.83, + "grad_norm": 0.32094729111190423, + "learning_rate": 1.4529065852083557e-06, + "loss": 0.2062, + "step": 14472 + }, + { + "epoch": 0.83, + "grad_norm": 0.618603149339349, + "learning_rate": 1.451940720692443e-06, + "loss": 0.3733, + "step": 14473 + }, + { + "epoch": 0.83, + "grad_norm": 0.3205669864381226, + "learning_rate": 1.450975152195454e-06, + "loss": 0.2918, + "step": 14474 + }, + { + "epoch": 0.83, + "grad_norm": 0.3185380448055098, + "learning_rate": 1.4500098797508289e-06, + "loss": 0.1982, + "step": 14475 + }, + { + "epoch": 0.83, + "grad_norm": 0.24713688538065756, + "learning_rate": 1.4490449033919952e-06, + "loss": 0.1745, + "step": 14476 + }, + { + "epoch": 0.83, + "grad_norm": 0.7044439761827327, + "learning_rate": 1.4480802231523682e-06, + "loss": 0.4051, + "step": 14477 + }, + { + "epoch": 0.83, + "grad_norm": 0.22536764749533747, + "learning_rate": 1.447115839065354e-06, + "loss": 0.2154, + "step": 14478 + }, + { + "epoch": 0.83, + "grad_norm": 0.754993666762245, + "learning_rate": 1.446151751164352e-06, + "loss": 0.4199, + "step": 14479 + }, + { + "epoch": 0.83, + "grad_norm": 1.5464917011221442, + "learning_rate": 1.4451879594827467e-06, + "loss": 0.4158, + "step": 14480 + }, + { + "epoch": 0.83, + "grad_norm": 0.22848279331701107, + "learning_rate": 1.444224464053916e-06, + "loss": 0.1559, + "step": 14481 + }, + { + "epoch": 0.83, + "grad_norm": 0.26781759688718365, + "learning_rate": 1.44326126491122e-06, + "loss": 0.2418, + "step": 14482 + }, + { + "epoch": 0.83, + "grad_norm": 0.7385480231144739, + "learning_rate": 1.4422983620880215e-06, + "loss": 0.4098, + "step": 14483 + }, + { + "epoch": 0.83, + "grad_norm": 0.5600152349811515, + "learning_rate": 1.4413357556176633e-06, + "loss": 0.3144, + "step": 14484 + }, + { + "epoch": 0.83, + "grad_norm": 0.38866276568802743, + "learning_rate": 1.4403734455334816e-06, + "loss": 0.2366, + "step": 14485 + }, + { + "epoch": 0.83, + "grad_norm": 0.3608062477208995, + "learning_rate": 1.4394114318687947e-06, + "loss": 0.2916, + "step": 14486 + }, + { + "epoch": 0.83, + "grad_norm": 0.39215757913532784, + "learning_rate": 1.4384497146569242e-06, + "loss": 0.2683, + "step": 14487 + }, + { + "epoch": 0.83, + "grad_norm": 0.28871817789259424, + "learning_rate": 1.437488293931173e-06, + "loss": 0.1842, + "step": 14488 + }, + { + "epoch": 0.83, + "grad_norm": 0.7837969352840899, + "learning_rate": 1.436527169724833e-06, + "loss": 0.3826, + "step": 14489 + }, + { + "epoch": 0.83, + "grad_norm": 0.252579482719496, + "learning_rate": 1.4355663420711863e-06, + "loss": 0.2509, + "step": 14490 + }, + { + "epoch": 0.83, + "grad_norm": 0.38934719167353166, + "learning_rate": 1.434605811003511e-06, + "loss": 0.179, + "step": 14491 + }, + { + "epoch": 0.83, + "grad_norm": 1.327017593625259, + "learning_rate": 1.4336455765550684e-06, + "loss": 0.5127, + "step": 14492 + }, + { + "epoch": 0.83, + "grad_norm": 0.3263390069030087, + "learning_rate": 1.4326856387591114e-06, + "loss": 0.1991, + "step": 14493 + }, + { + "epoch": 0.83, + "grad_norm": 0.26938715579382766, + "learning_rate": 1.4317259976488806e-06, + "loss": 0.2504, + "step": 14494 + }, + { + "epoch": 0.83, + "grad_norm": 0.642656981752686, + "learning_rate": 1.4307666532576115e-06, + "loss": 0.3615, + "step": 14495 + }, + { + "epoch": 0.83, + "grad_norm": 0.31108348734056135, + "learning_rate": 1.429807605618525e-06, + "loss": 0.2628, + "step": 14496 + }, + { + "epoch": 0.83, + "grad_norm": 0.5051662006316499, + "learning_rate": 1.4288488547648328e-06, + "loss": 0.2379, + "step": 14497 + }, + { + "epoch": 0.83, + "grad_norm": 0.37356454597055017, + "learning_rate": 1.4278904007297356e-06, + "loss": 0.2442, + "step": 14498 + }, + { + "epoch": 0.83, + "grad_norm": 0.3072156818751481, + "learning_rate": 1.4269322435464229e-06, + "loss": 0.2397, + "step": 14499 + }, + { + "epoch": 0.83, + "grad_norm": 0.9057485951009321, + "learning_rate": 1.425974383248081e-06, + "loss": 0.516, + "step": 14500 + }, + { + "epoch": 0.83, + "grad_norm": 0.3275530243981576, + "learning_rate": 1.425016819867876e-06, + "loss": 0.2634, + "step": 14501 + }, + { + "epoch": 0.83, + "grad_norm": 0.5692330083855973, + "learning_rate": 1.4240595534389712e-06, + "loss": 0.2918, + "step": 14502 + }, + { + "epoch": 0.83, + "grad_norm": 0.22938802148421475, + "learning_rate": 1.4231025839945123e-06, + "loss": 0.1967, + "step": 14503 + }, + { + "epoch": 0.83, + "grad_norm": 1.3925463484862073, + "learning_rate": 1.422145911567645e-06, + "loss": 0.192, + "step": 14504 + }, + { + "epoch": 0.83, + "grad_norm": 0.661586160240331, + "learning_rate": 1.4211895361914961e-06, + "loss": 0.2866, + "step": 14505 + }, + { + "epoch": 0.83, + "grad_norm": 0.24761832146932847, + "learning_rate": 1.4202334578991838e-06, + "loss": 0.2535, + "step": 14506 + }, + { + "epoch": 0.83, + "grad_norm": 0.6307606543288709, + "learning_rate": 1.419277676723816e-06, + "loss": 0.3035, + "step": 14507 + }, + { + "epoch": 0.83, + "grad_norm": 0.3912702731263923, + "learning_rate": 1.4183221926984958e-06, + "loss": 0.294, + "step": 14508 + }, + { + "epoch": 0.83, + "grad_norm": 0.21325530333087425, + "learning_rate": 1.4173670058563082e-06, + "loss": 0.2035, + "step": 14509 + }, + { + "epoch": 0.83, + "grad_norm": 0.46726091602695835, + "learning_rate": 1.4164121162303335e-06, + "loss": 0.2975, + "step": 14510 + }, + { + "epoch": 0.83, + "grad_norm": 0.4739114950446037, + "learning_rate": 1.4154575238536373e-06, + "loss": 0.1686, + "step": 14511 + }, + { + "epoch": 0.83, + "grad_norm": 0.4402434355675197, + "learning_rate": 1.4145032287592753e-06, + "loss": 0.3343, + "step": 14512 + }, + { + "epoch": 0.83, + "grad_norm": 0.5244827607300405, + "learning_rate": 1.4135492309803e-06, + "loss": 0.3307, + "step": 14513 + }, + { + "epoch": 0.83, + "grad_norm": 0.277632422490554, + "learning_rate": 1.4125955305497453e-06, + "loss": 0.2066, + "step": 14514 + }, + { + "epoch": 0.83, + "grad_norm": 0.32009804462674135, + "learning_rate": 1.4116421275006386e-06, + "loss": 0.2483, + "step": 14515 + }, + { + "epoch": 0.83, + "grad_norm": 0.6266221865588981, + "learning_rate": 1.410689021865993e-06, + "loss": 0.2119, + "step": 14516 + }, + { + "epoch": 0.83, + "grad_norm": 0.3426461166367405, + "learning_rate": 1.4097362136788196e-06, + "loss": 0.225, + "step": 14517 + }, + { + "epoch": 0.83, + "grad_norm": 0.32656014034310793, + "learning_rate": 1.408783702972112e-06, + "loss": 0.2931, + "step": 14518 + }, + { + "epoch": 0.83, + "grad_norm": 0.8290687136751642, + "learning_rate": 1.4078314897788558e-06, + "loss": 0.4557, + "step": 14519 + }, + { + "epoch": 0.83, + "grad_norm": 0.3310944158287448, + "learning_rate": 1.4068795741320241e-06, + "loss": 0.152, + "step": 14520 + }, + { + "epoch": 0.83, + "grad_norm": 0.23025651377461948, + "learning_rate": 1.4059279560645845e-06, + "loss": 0.2156, + "step": 14521 + }, + { + "epoch": 0.83, + "grad_norm": 0.3482816224844952, + "learning_rate": 1.4049766356094897e-06, + "loss": 0.2382, + "step": 14522 + }, + { + "epoch": 0.83, + "grad_norm": 0.6896887320183515, + "learning_rate": 1.4040256127996842e-06, + "loss": 0.345, + "step": 14523 + }, + { + "epoch": 0.83, + "grad_norm": 0.376874453876573, + "learning_rate": 1.403074887668101e-06, + "loss": 0.2253, + "step": 14524 + }, + { + "epoch": 0.83, + "grad_norm": 0.34520731206085, + "learning_rate": 1.4021244602476658e-06, + "loss": 0.3182, + "step": 14525 + }, + { + "epoch": 0.83, + "grad_norm": 0.590729596317791, + "learning_rate": 1.401174330571291e-06, + "loss": 0.4018, + "step": 14526 + }, + { + "epoch": 0.83, + "grad_norm": 0.24916310579425358, + "learning_rate": 1.4002244986718793e-06, + "loss": 0.1518, + "step": 14527 + }, + { + "epoch": 0.83, + "grad_norm": 0.9781155735931566, + "learning_rate": 1.3992749645823224e-06, + "loss": 0.4291, + "step": 14528 + }, + { + "epoch": 0.83, + "grad_norm": 0.37701695438860766, + "learning_rate": 1.3983257283355044e-06, + "loss": 0.2982, + "step": 14529 + }, + { + "epoch": 0.83, + "grad_norm": 0.2968145176708719, + "learning_rate": 1.3973767899642976e-06, + "loss": 0.2351, + "step": 14530 + }, + { + "epoch": 0.83, + "grad_norm": 1.2241899528039462, + "learning_rate": 1.396428149501562e-06, + "loss": 0.8343, + "step": 14531 + }, + { + "epoch": 0.83, + "grad_norm": 0.6203633467767298, + "learning_rate": 1.3954798069801468e-06, + "loss": 0.4185, + "step": 14532 + }, + { + "epoch": 0.83, + "grad_norm": 0.2830199485054341, + "learning_rate": 1.394531762432899e-06, + "loss": 0.2174, + "step": 14533 + }, + { + "epoch": 0.84, + "grad_norm": 0.3082002175777337, + "learning_rate": 1.3935840158926461e-06, + "loss": 0.2218, + "step": 14534 + }, + { + "epoch": 0.84, + "grad_norm": 0.5911035214641597, + "learning_rate": 1.3926365673922082e-06, + "loss": 0.3011, + "step": 14535 + }, + { + "epoch": 0.84, + "grad_norm": 0.40215627658380904, + "learning_rate": 1.3916894169643969e-06, + "loss": 0.3002, + "step": 14536 + }, + { + "epoch": 0.84, + "grad_norm": 0.3721036297309638, + "learning_rate": 1.390742564642007e-06, + "loss": 0.2617, + "step": 14537 + }, + { + "epoch": 0.84, + "grad_norm": 0.6464102731862245, + "learning_rate": 1.3897960104578357e-06, + "loss": 0.3488, + "step": 14538 + }, + { + "epoch": 0.84, + "grad_norm": 0.4497840173503336, + "learning_rate": 1.3888497544446578e-06, + "loss": 0.2986, + "step": 14539 + }, + { + "epoch": 0.84, + "grad_norm": 0.3635716294730615, + "learning_rate": 1.3879037966352426e-06, + "loss": 0.192, + "step": 14540 + }, + { + "epoch": 0.84, + "grad_norm": 0.31564871045658416, + "learning_rate": 1.3869581370623464e-06, + "loss": 0.2819, + "step": 14541 + }, + { + "epoch": 0.84, + "grad_norm": 0.35301383748174164, + "learning_rate": 1.3860127757587215e-06, + "loss": 0.2734, + "step": 14542 + }, + { + "epoch": 0.84, + "grad_norm": 0.9236845077921836, + "learning_rate": 1.3850677127571033e-06, + "loss": 0.444, + "step": 14543 + }, + { + "epoch": 0.84, + "grad_norm": 0.8576242151750284, + "learning_rate": 1.3841229480902207e-06, + "loss": 0.449, + "step": 14544 + }, + { + "epoch": 0.84, + "grad_norm": 0.317992615475425, + "learning_rate": 1.3831784817907867e-06, + "loss": 0.27, + "step": 14545 + }, + { + "epoch": 0.84, + "grad_norm": 0.4192884397471493, + "learning_rate": 1.382234313891515e-06, + "loss": 0.3332, + "step": 14546 + }, + { + "epoch": 0.84, + "grad_norm": 0.281125353055018, + "learning_rate": 1.3812904444250973e-06, + "loss": 0.1339, + "step": 14547 + }, + { + "epoch": 0.84, + "grad_norm": 0.4240781429981899, + "learning_rate": 1.3803468734242208e-06, + "loss": 0.2947, + "step": 14548 + }, + { + "epoch": 0.84, + "grad_norm": 0.34789522278090906, + "learning_rate": 1.3794036009215628e-06, + "loss": 0.3019, + "step": 14549 + }, + { + "epoch": 0.84, + "grad_norm": 0.6447680621171787, + "learning_rate": 1.3784606269497835e-06, + "loss": 0.1991, + "step": 14550 + }, + { + "epoch": 0.84, + "grad_norm": 0.42031524895275557, + "learning_rate": 1.377517951541545e-06, + "loss": 0.2907, + "step": 14551 + }, + { + "epoch": 0.84, + "grad_norm": 0.5888956245708498, + "learning_rate": 1.3765755747294906e-06, + "loss": 0.357, + "step": 14552 + }, + { + "epoch": 0.84, + "grad_norm": 0.22556962698291796, + "learning_rate": 1.3756334965462502e-06, + "loss": 0.1829, + "step": 14553 + }, + { + "epoch": 0.84, + "grad_norm": 0.2882609123569876, + "learning_rate": 1.3746917170244522e-06, + "loss": 0.2108, + "step": 14554 + }, + { + "epoch": 0.84, + "grad_norm": 1.3360077461689062, + "learning_rate": 1.3737502361967092e-06, + "loss": 0.6013, + "step": 14555 + }, + { + "epoch": 0.84, + "grad_norm": 0.8700106534469142, + "learning_rate": 1.3728090540956241e-06, + "loss": 0.2877, + "step": 14556 + }, + { + "epoch": 0.84, + "grad_norm": 0.2599726917673043, + "learning_rate": 1.3718681707537895e-06, + "loss": 0.2489, + "step": 14557 + }, + { + "epoch": 0.84, + "grad_norm": 0.4713611148460899, + "learning_rate": 1.3709275862037908e-06, + "loss": 0.3234, + "step": 14558 + }, + { + "epoch": 0.84, + "grad_norm": 0.2975155229884535, + "learning_rate": 1.3699873004781983e-06, + "loss": 0.1767, + "step": 14559 + }, + { + "epoch": 0.84, + "grad_norm": 0.31430545435525203, + "learning_rate": 1.369047313609575e-06, + "loss": 0.1938, + "step": 14560 + }, + { + "epoch": 0.84, + "grad_norm": 0.3427307670134785, + "learning_rate": 1.3681076256304715e-06, + "loss": 0.3059, + "step": 14561 + }, + { + "epoch": 0.84, + "grad_norm": 0.8615199044778867, + "learning_rate": 1.3671682365734273e-06, + "loss": 0.4229, + "step": 14562 + }, + { + "epoch": 0.84, + "grad_norm": 0.31342070762916385, + "learning_rate": 1.3662291464709787e-06, + "loss": 0.2215, + "step": 14563 + }, + { + "epoch": 0.84, + "grad_norm": 0.8984623397142127, + "learning_rate": 1.365290355355644e-06, + "loss": 0.4016, + "step": 14564 + }, + { + "epoch": 0.84, + "grad_norm": 0.2759744627759715, + "learning_rate": 1.3643518632599317e-06, + "loss": 0.2307, + "step": 14565 + }, + { + "epoch": 0.84, + "grad_norm": 0.2279301524681025, + "learning_rate": 1.3634136702163415e-06, + "loss": 0.1532, + "step": 14566 + }, + { + "epoch": 0.84, + "grad_norm": 1.1406842605842138, + "learning_rate": 1.362475776257367e-06, + "loss": 0.7491, + "step": 14567 + }, + { + "epoch": 0.84, + "grad_norm": 0.586708889053408, + "learning_rate": 1.3615381814154848e-06, + "loss": 0.3272, + "step": 14568 + }, + { + "epoch": 0.84, + "grad_norm": 0.28618586482601904, + "learning_rate": 1.3606008857231634e-06, + "loss": 0.222, + "step": 14569 + }, + { + "epoch": 0.84, + "grad_norm": 0.5022053060065834, + "learning_rate": 1.3596638892128599e-06, + "loss": 0.3399, + "step": 14570 + }, + { + "epoch": 0.84, + "grad_norm": 0.2986255975167238, + "learning_rate": 1.3587271919170276e-06, + "loss": 0.1707, + "step": 14571 + }, + { + "epoch": 0.84, + "grad_norm": 0.35800026833904114, + "learning_rate": 1.3577907938681e-06, + "loss": 0.2807, + "step": 14572 + }, + { + "epoch": 0.84, + "grad_norm": 0.32434733804341365, + "learning_rate": 1.356854695098505e-06, + "loss": 0.256, + "step": 14573 + }, + { + "epoch": 0.84, + "grad_norm": 0.7751751650216921, + "learning_rate": 1.3559188956406587e-06, + "loss": 0.3757, + "step": 14574 + }, + { + "epoch": 0.84, + "grad_norm": 0.3422898662062519, + "learning_rate": 1.354983395526972e-06, + "loss": 0.2493, + "step": 14575 + }, + { + "epoch": 0.84, + "grad_norm": 0.5236379453340966, + "learning_rate": 1.3540481947898377e-06, + "loss": 0.2442, + "step": 14576 + }, + { + "epoch": 0.84, + "grad_norm": 0.4769062265048514, + "learning_rate": 1.3531132934616432e-06, + "loss": 0.3491, + "step": 14577 + }, + { + "epoch": 0.84, + "grad_norm": 0.26125303282449164, + "learning_rate": 1.3521786915747636e-06, + "loss": 0.1996, + "step": 14578 + }, + { + "epoch": 0.84, + "grad_norm": 0.4615285075486771, + "learning_rate": 1.3512443891615612e-06, + "loss": 0.2125, + "step": 14579 + }, + { + "epoch": 0.84, + "grad_norm": 0.4793211131504592, + "learning_rate": 1.3503103862543964e-06, + "loss": 0.3389, + "step": 14580 + }, + { + "epoch": 0.84, + "grad_norm": 0.27583841894197575, + "learning_rate": 1.3493766828856113e-06, + "loss": 0.2668, + "step": 14581 + }, + { + "epoch": 0.84, + "grad_norm": 1.3888028705359416, + "learning_rate": 1.348443279087539e-06, + "loss": 0.247, + "step": 14582 + }, + { + "epoch": 0.84, + "grad_norm": 0.5835514940606136, + "learning_rate": 1.3475101748925024e-06, + "loss": 0.2396, + "step": 14583 + }, + { + "epoch": 0.84, + "grad_norm": 0.3265317907950052, + "learning_rate": 1.3465773703328177e-06, + "loss": 0.2626, + "step": 14584 + }, + { + "epoch": 0.84, + "grad_norm": 0.34205468813191087, + "learning_rate": 1.3456448654407871e-06, + "loss": 0.2876, + "step": 14585 + }, + { + "epoch": 0.84, + "grad_norm": 0.7978010123803017, + "learning_rate": 1.3447126602487026e-06, + "loss": 0.3144, + "step": 14586 + }, + { + "epoch": 0.84, + "grad_norm": 0.31640964650076336, + "learning_rate": 1.343780754788847e-06, + "loss": 0.2539, + "step": 14587 + }, + { + "epoch": 0.84, + "grad_norm": 0.35450717795972697, + "learning_rate": 1.3428491490934904e-06, + "loss": 0.1691, + "step": 14588 + }, + { + "epoch": 0.84, + "grad_norm": 0.29752438649468915, + "learning_rate": 1.3419178431948964e-06, + "loss": 0.2152, + "step": 14589 + }, + { + "epoch": 0.84, + "grad_norm": 0.34828109234051496, + "learning_rate": 1.3409868371253155e-06, + "loss": 0.266, + "step": 14590 + }, + { + "epoch": 0.84, + "grad_norm": 0.400501330494556, + "learning_rate": 1.3400561309169845e-06, + "loss": 0.2795, + "step": 14591 + }, + { + "epoch": 0.84, + "grad_norm": 0.32453042151607786, + "learning_rate": 1.3391257246021404e-06, + "loss": 0.2643, + "step": 14592 + }, + { + "epoch": 0.84, + "grad_norm": 0.3316450046499373, + "learning_rate": 1.3381956182130008e-06, + "loss": 0.2635, + "step": 14593 + }, + { + "epoch": 0.84, + "grad_norm": 0.5078673764787308, + "learning_rate": 1.3372658117817738e-06, + "loss": 0.2464, + "step": 14594 + }, + { + "epoch": 0.84, + "grad_norm": 1.7956324440457225, + "learning_rate": 1.3363363053406564e-06, + "loss": 0.1765, + "step": 14595 + }, + { + "epoch": 0.84, + "grad_norm": 0.306535566421557, + "learning_rate": 1.3354070989218426e-06, + "loss": 0.2439, + "step": 14596 + }, + { + "epoch": 0.84, + "grad_norm": 0.35559449451805636, + "learning_rate": 1.334478192557509e-06, + "loss": 0.3066, + "step": 14597 + }, + { + "epoch": 0.84, + "grad_norm": 0.9039033717873965, + "learning_rate": 1.333549586279822e-06, + "loss": 0.4907, + "step": 14598 + }, + { + "epoch": 0.84, + "grad_norm": 0.23362546523475522, + "learning_rate": 1.3326212801209392e-06, + "loss": 0.1655, + "step": 14599 + }, + { + "epoch": 0.84, + "grad_norm": 0.41212155732044253, + "learning_rate": 1.3316932741130106e-06, + "loss": 0.2784, + "step": 14600 + }, + { + "epoch": 0.84, + "grad_norm": 0.37043529185291274, + "learning_rate": 1.3307655682881704e-06, + "loss": 0.2799, + "step": 14601 + }, + { + "epoch": 0.84, + "grad_norm": 0.3008023321460584, + "learning_rate": 1.3298381626785461e-06, + "loss": 0.2106, + "step": 14602 + }, + { + "epoch": 0.84, + "grad_norm": 0.6480680200258, + "learning_rate": 1.3289110573162534e-06, + "loss": 0.3836, + "step": 14603 + }, + { + "epoch": 0.84, + "grad_norm": 0.3419447209679332, + "learning_rate": 1.3279842522333964e-06, + "loss": 0.3303, + "step": 14604 + }, + { + "epoch": 0.84, + "grad_norm": 0.28926904223399996, + "learning_rate": 1.3270577474620737e-06, + "loss": 0.1862, + "step": 14605 + }, + { + "epoch": 0.84, + "grad_norm": 0.333294240317376, + "learning_rate": 1.326131543034368e-06, + "loss": 0.1797, + "step": 14606 + }, + { + "epoch": 0.84, + "grad_norm": 0.7753789409052013, + "learning_rate": 1.3252056389823542e-06, + "loss": 0.3864, + "step": 14607 + }, + { + "epoch": 0.84, + "grad_norm": 0.4255672538071689, + "learning_rate": 1.3242800353380935e-06, + "loss": 0.2057, + "step": 14608 + }, + { + "epoch": 0.84, + "grad_norm": 0.30257530347677447, + "learning_rate": 1.3233547321336449e-06, + "loss": 0.2747, + "step": 14609 + }, + { + "epoch": 0.84, + "grad_norm": 1.1949937386415568, + "learning_rate": 1.322429729401048e-06, + "loss": 0.6453, + "step": 14610 + }, + { + "epoch": 0.84, + "grad_norm": 0.39220038644195765, + "learning_rate": 1.3215050271723372e-06, + "loss": 0.2691, + "step": 14611 + }, + { + "epoch": 0.84, + "grad_norm": 0.19771103947243748, + "learning_rate": 1.3205806254795316e-06, + "loss": 0.177, + "step": 14612 + }, + { + "epoch": 0.84, + "grad_norm": 1.340854636617087, + "learning_rate": 1.3196565243546477e-06, + "loss": 0.6778, + "step": 14613 + }, + { + "epoch": 0.84, + "grad_norm": 0.40782738073281505, + "learning_rate": 1.3187327238296855e-06, + "loss": 0.2768, + "step": 14614 + }, + { + "epoch": 0.84, + "grad_norm": 0.48597633138533963, + "learning_rate": 1.3178092239366357e-06, + "loss": 0.2581, + "step": 14615 + }, + { + "epoch": 0.84, + "grad_norm": 0.35702165673584346, + "learning_rate": 1.316886024707479e-06, + "loss": 0.298, + "step": 14616 + }, + { + "epoch": 0.84, + "grad_norm": 0.43957135064561503, + "learning_rate": 1.3159631261741835e-06, + "loss": 0.2841, + "step": 14617 + }, + { + "epoch": 0.84, + "grad_norm": 0.2303763190919167, + "learning_rate": 1.315040528368714e-06, + "loss": 0.1285, + "step": 14618 + }, + { + "epoch": 0.84, + "grad_norm": 0.7590643493551357, + "learning_rate": 1.3141182313230173e-06, + "loss": 0.3696, + "step": 14619 + }, + { + "epoch": 0.84, + "grad_norm": 0.3120861870291768, + "learning_rate": 1.313196235069033e-06, + "loss": 0.2635, + "step": 14620 + }, + { + "epoch": 0.84, + "grad_norm": 0.3375885377689183, + "learning_rate": 1.3122745396386893e-06, + "loss": 0.2568, + "step": 14621 + }, + { + "epoch": 0.84, + "grad_norm": 1.0596125325492274, + "learning_rate": 1.311353145063905e-06, + "loss": 0.6659, + "step": 14622 + }, + { + "epoch": 0.84, + "grad_norm": 0.32892295818439166, + "learning_rate": 1.3104320513765867e-06, + "loss": 0.2509, + "step": 14623 + }, + { + "epoch": 0.84, + "grad_norm": 0.3057490849134694, + "learning_rate": 1.3095112586086322e-06, + "loss": 0.1777, + "step": 14624 + }, + { + "epoch": 0.84, + "grad_norm": 0.35619765249654883, + "learning_rate": 1.3085907667919295e-06, + "loss": 0.2711, + "step": 14625 + }, + { + "epoch": 0.84, + "grad_norm": 0.5930467136699953, + "learning_rate": 1.3076705759583562e-06, + "loss": 0.2686, + "step": 14626 + }, + { + "epoch": 0.84, + "grad_norm": 0.4096537171452396, + "learning_rate": 1.3067506861397771e-06, + "loss": 0.3089, + "step": 14627 + }, + { + "epoch": 0.84, + "grad_norm": 0.31363251366168626, + "learning_rate": 1.3058310973680478e-06, + "loss": 0.2472, + "step": 14628 + }, + { + "epoch": 0.84, + "grad_norm": 0.6261934366587812, + "learning_rate": 1.3049118096750102e-06, + "loss": 0.322, + "step": 14629 + }, + { + "epoch": 0.84, + "grad_norm": 0.23884814580511757, + "learning_rate": 1.3039928230925058e-06, + "loss": 0.2023, + "step": 14630 + }, + { + "epoch": 0.84, + "grad_norm": 0.7587479042041084, + "learning_rate": 1.303074137652357e-06, + "loss": 0.2866, + "step": 14631 + }, + { + "epoch": 0.84, + "grad_norm": 0.3200019958475434, + "learning_rate": 1.302155753386376e-06, + "loss": 0.2691, + "step": 14632 + }, + { + "epoch": 0.84, + "grad_norm": 0.38354019565695807, + "learning_rate": 1.3012376703263652e-06, + "loss": 0.3087, + "step": 14633 + }, + { + "epoch": 0.84, + "grad_norm": 1.2891788763038219, + "learning_rate": 1.3003198885041212e-06, + "loss": 0.757, + "step": 14634 + }, + { + "epoch": 0.84, + "grad_norm": 0.3551321237848732, + "learning_rate": 1.2994024079514257e-06, + "loss": 0.161, + "step": 14635 + }, + { + "epoch": 0.84, + "grad_norm": 0.2695399257198015, + "learning_rate": 1.2984852287000515e-06, + "loss": 0.2375, + "step": 14636 + }, + { + "epoch": 0.84, + "grad_norm": 0.4797545726288128, + "learning_rate": 1.297568350781757e-06, + "loss": 0.3093, + "step": 14637 + }, + { + "epoch": 0.84, + "grad_norm": 0.29754951174458355, + "learning_rate": 1.296651774228298e-06, + "loss": 0.1445, + "step": 14638 + }, + { + "epoch": 0.84, + "grad_norm": 0.573274589294748, + "learning_rate": 1.2957354990714145e-06, + "loss": 0.3267, + "step": 14639 + }, + { + "epoch": 0.84, + "grad_norm": 0.3397778778825856, + "learning_rate": 1.2948195253428364e-06, + "loss": 0.3433, + "step": 14640 + }, + { + "epoch": 0.84, + "grad_norm": 0.3975827403188928, + "learning_rate": 1.2939038530742832e-06, + "loss": 0.1751, + "step": 14641 + }, + { + "epoch": 0.84, + "grad_norm": 0.5512201139354378, + "learning_rate": 1.2929884822974626e-06, + "loss": 0.3175, + "step": 14642 + }, + { + "epoch": 0.84, + "grad_norm": 0.3345476677624379, + "learning_rate": 1.2920734130440793e-06, + "loss": 0.2317, + "step": 14643 + }, + { + "epoch": 0.84, + "grad_norm": 0.27178999005541327, + "learning_rate": 1.2911586453458203e-06, + "loss": 0.1999, + "step": 14644 + }, + { + "epoch": 0.84, + "grad_norm": 0.393265473755049, + "learning_rate": 1.2902441792343611e-06, + "loss": 0.2757, + "step": 14645 + }, + { + "epoch": 0.84, + "grad_norm": 1.0589891218703797, + "learning_rate": 1.2893300147413702e-06, + "loss": 0.5945, + "step": 14646 + }, + { + "epoch": 0.84, + "grad_norm": 0.7805817018750784, + "learning_rate": 1.2884161518985083e-06, + "loss": 0.356, + "step": 14647 + }, + { + "epoch": 0.84, + "grad_norm": 0.24815947867005486, + "learning_rate": 1.2875025907374206e-06, + "loss": 0.2283, + "step": 14648 + }, + { + "epoch": 0.84, + "grad_norm": 0.7997982329037202, + "learning_rate": 1.2865893312897438e-06, + "loss": 0.3963, + "step": 14649 + }, + { + "epoch": 0.84, + "grad_norm": 0.24864551993847825, + "learning_rate": 1.2856763735871003e-06, + "loss": 0.1527, + "step": 14650 + }, + { + "epoch": 0.84, + "grad_norm": 0.3838687286845385, + "learning_rate": 1.2847637176611128e-06, + "loss": 0.2545, + "step": 14651 + }, + { + "epoch": 0.84, + "grad_norm": 0.33946848678803093, + "learning_rate": 1.2838513635433824e-06, + "loss": 0.2888, + "step": 14652 + }, + { + "epoch": 0.84, + "grad_norm": 0.5313711952676399, + "learning_rate": 1.2829393112655052e-06, + "loss": 0.3075, + "step": 14653 + }, + { + "epoch": 0.84, + "grad_norm": 0.35368230488042623, + "learning_rate": 1.2820275608590638e-06, + "loss": 0.2496, + "step": 14654 + }, + { + "epoch": 0.84, + "grad_norm": 0.5093236803162801, + "learning_rate": 1.2811161123556337e-06, + "loss": 0.2507, + "step": 14655 + }, + { + "epoch": 0.84, + "grad_norm": 0.2429324623945723, + "learning_rate": 1.2802049657867777e-06, + "loss": 0.2158, + "step": 14656 + }, + { + "epoch": 0.84, + "grad_norm": 0.4075378255586554, + "learning_rate": 1.2792941211840481e-06, + "loss": 0.2347, + "step": 14657 + }, + { + "epoch": 0.84, + "grad_norm": 0.813489974515319, + "learning_rate": 1.2783835785789867e-06, + "loss": 0.4763, + "step": 14658 + }, + { + "epoch": 0.84, + "grad_norm": 0.5005999740093483, + "learning_rate": 1.277473338003129e-06, + "loss": 0.3164, + "step": 14659 + }, + { + "epoch": 0.84, + "grad_norm": 0.3206738895171215, + "learning_rate": 1.2765633994879933e-06, + "loss": 0.2816, + "step": 14660 + }, + { + "epoch": 0.84, + "grad_norm": 0.5703995448615973, + "learning_rate": 1.2756537630650934e-06, + "loss": 0.2392, + "step": 14661 + }, + { + "epoch": 0.84, + "grad_norm": 0.24137241230342263, + "learning_rate": 1.274744428765926e-06, + "loss": 0.1746, + "step": 14662 + }, + { + "epoch": 0.84, + "grad_norm": 0.3367852646989017, + "learning_rate": 1.2738353966219863e-06, + "loss": 0.2836, + "step": 14663 + }, + { + "epoch": 0.84, + "grad_norm": 0.3384807639190101, + "learning_rate": 1.2729266666647511e-06, + "loss": 0.264, + "step": 14664 + }, + { + "epoch": 0.84, + "grad_norm": 0.6314320406801325, + "learning_rate": 1.2720182389256896e-06, + "loss": 0.362, + "step": 14665 + }, + { + "epoch": 0.84, + "grad_norm": 0.3457808849956551, + "learning_rate": 1.2711101134362624e-06, + "loss": 0.2846, + "step": 14666 + }, + { + "epoch": 0.84, + "grad_norm": 1.5896610511400546, + "learning_rate": 1.2702022902279132e-06, + "loss": 0.1807, + "step": 14667 + }, + { + "epoch": 0.84, + "grad_norm": 0.20164738798995183, + "learning_rate": 1.2692947693320867e-06, + "loss": 0.1918, + "step": 14668 + }, + { + "epoch": 0.84, + "grad_norm": 0.34464558067401213, + "learning_rate": 1.2683875507802058e-06, + "loss": 0.2875, + "step": 14669 + }, + { + "epoch": 0.84, + "grad_norm": 0.7667195525112961, + "learning_rate": 1.2674806346036895e-06, + "loss": 0.3144, + "step": 14670 + }, + { + "epoch": 0.84, + "grad_norm": 0.524432798062477, + "learning_rate": 1.2665740208339406e-06, + "loss": 0.3225, + "step": 14671 + }, + { + "epoch": 0.84, + "grad_norm": 0.25184485692333597, + "learning_rate": 1.2656677095023607e-06, + "loss": 0.2584, + "step": 14672 + }, + { + "epoch": 0.84, + "grad_norm": 1.4804813262002852, + "learning_rate": 1.2647617006403312e-06, + "loss": 0.4913, + "step": 14673 + }, + { + "epoch": 0.84, + "grad_norm": 0.16779157793554556, + "learning_rate": 1.2638559942792294e-06, + "loss": 0.0875, + "step": 14674 + }, + { + "epoch": 0.84, + "grad_norm": 0.39519690266180457, + "learning_rate": 1.2629505904504158e-06, + "loss": 0.3048, + "step": 14675 + }, + { + "epoch": 0.84, + "grad_norm": 0.3542800298985227, + "learning_rate": 1.2620454891852507e-06, + "loss": 0.3001, + "step": 14676 + }, + { + "epoch": 0.84, + "grad_norm": 0.38034372533991734, + "learning_rate": 1.2611406905150736e-06, + "loss": 0.1732, + "step": 14677 + }, + { + "epoch": 0.84, + "grad_norm": 0.38478117119739746, + "learning_rate": 1.2602361944712193e-06, + "loss": 0.2879, + "step": 14678 + }, + { + "epoch": 0.84, + "grad_norm": 0.5953891231925955, + "learning_rate": 1.2593320010850096e-06, + "loss": 0.329, + "step": 14679 + }, + { + "epoch": 0.84, + "grad_norm": 0.4313422987262344, + "learning_rate": 1.258428110387754e-06, + "loss": 0.2362, + "step": 14680 + }, + { + "epoch": 0.84, + "grad_norm": 0.3322900970369777, + "learning_rate": 1.2575245224107602e-06, + "loss": 0.2568, + "step": 14681 + }, + { + "epoch": 0.84, + "grad_norm": 0.5778985999746848, + "learning_rate": 1.256621237185316e-06, + "loss": 0.388, + "step": 14682 + }, + { + "epoch": 0.84, + "grad_norm": 0.25766138829972673, + "learning_rate": 1.2557182547427016e-06, + "loss": 0.1669, + "step": 14683 + }, + { + "epoch": 0.84, + "grad_norm": 0.23468154882529396, + "learning_rate": 1.2548155751141867e-06, + "loss": 0.2158, + "step": 14684 + }, + { + "epoch": 0.84, + "grad_norm": 1.4762527134046515, + "learning_rate": 1.2539131983310349e-06, + "loss": 0.5962, + "step": 14685 + }, + { + "epoch": 0.84, + "grad_norm": 1.3406802197347614, + "learning_rate": 1.2530111244244925e-06, + "loss": 0.6108, + "step": 14686 + }, + { + "epoch": 0.84, + "grad_norm": 0.2989282297239217, + "learning_rate": 1.2521093534257977e-06, + "loss": 0.2035, + "step": 14687 + }, + { + "epoch": 0.84, + "grad_norm": 0.32967555204294186, + "learning_rate": 1.2512078853661813e-06, + "loss": 0.2942, + "step": 14688 + }, + { + "epoch": 0.84, + "grad_norm": 0.26575499268107783, + "learning_rate": 1.2503067202768592e-06, + "loss": 0.1926, + "step": 14689 + }, + { + "epoch": 0.84, + "grad_norm": 0.34793399148411513, + "learning_rate": 1.2494058581890388e-06, + "loss": 0.1929, + "step": 14690 + }, + { + "epoch": 0.84, + "grad_norm": 0.44843755470370344, + "learning_rate": 1.2485052991339174e-06, + "loss": 0.2978, + "step": 14691 + }, + { + "epoch": 0.84, + "grad_norm": 0.5016887458702256, + "learning_rate": 1.247605043142679e-06, + "loss": 0.2965, + "step": 14692 + }, + { + "epoch": 0.84, + "grad_norm": 0.35071326656103574, + "learning_rate": 1.2467050902465038e-06, + "loss": 0.2073, + "step": 14693 + }, + { + "epoch": 0.84, + "grad_norm": 0.7348697816631484, + "learning_rate": 1.2458054404765552e-06, + "loss": 0.4092, + "step": 14694 + }, + { + "epoch": 0.84, + "grad_norm": 0.3464063282503142, + "learning_rate": 1.2449060938639869e-06, + "loss": 0.3186, + "step": 14695 + }, + { + "epoch": 0.84, + "grad_norm": 0.2497142048809561, + "learning_rate": 1.2440070504399426e-06, + "loss": 0.1682, + "step": 14696 + }, + { + "epoch": 0.84, + "grad_norm": 0.5003880498698998, + "learning_rate": 1.243108310235559e-06, + "loss": 0.2321, + "step": 14697 + }, + { + "epoch": 0.84, + "grad_norm": 0.6704924239750555, + "learning_rate": 1.2422098732819587e-06, + "loss": 0.3914, + "step": 14698 + }, + { + "epoch": 0.84, + "grad_norm": 0.30569771907205934, + "learning_rate": 1.2413117396102548e-06, + "loss": 0.2449, + "step": 14699 + }, + { + "epoch": 0.84, + "grad_norm": 0.3158934485374403, + "learning_rate": 1.2404139092515455e-06, + "loss": 0.2607, + "step": 14700 + }, + { + "epoch": 0.84, + "grad_norm": 0.46902780635018154, + "learning_rate": 1.2395163822369283e-06, + "loss": 0.2458, + "step": 14701 + }, + { + "epoch": 0.84, + "grad_norm": 0.24793381285511187, + "learning_rate": 1.2386191585974815e-06, + "loss": 0.2093, + "step": 14702 + }, + { + "epoch": 0.84, + "grad_norm": 0.568270893805069, + "learning_rate": 1.2377222383642773e-06, + "loss": 0.2455, + "step": 14703 + }, + { + "epoch": 0.84, + "grad_norm": 0.4898535112631793, + "learning_rate": 1.2368256215683727e-06, + "loss": 0.3084, + "step": 14704 + }, + { + "epoch": 0.84, + "grad_norm": 0.3244437239492716, + "learning_rate": 1.235929308240822e-06, + "loss": 0.2521, + "step": 14705 + }, + { + "epoch": 0.84, + "grad_norm": 0.8715598875641944, + "learning_rate": 1.2350332984126623e-06, + "loss": 0.3682, + "step": 14706 + }, + { + "epoch": 0.84, + "grad_norm": 0.32979803200018204, + "learning_rate": 1.2341375921149224e-06, + "loss": 0.2899, + "step": 14707 + }, + { + "epoch": 0.85, + "grad_norm": 0.2545636420010435, + "learning_rate": 1.2332421893786218e-06, + "loss": 0.2019, + "step": 14708 + }, + { + "epoch": 0.85, + "grad_norm": 0.4792169662322209, + "learning_rate": 1.2323470902347645e-06, + "loss": 0.2032, + "step": 14709 + }, + { + "epoch": 0.85, + "grad_norm": 0.5781818608555402, + "learning_rate": 1.2314522947143526e-06, + "loss": 0.2895, + "step": 14710 + }, + { + "epoch": 0.85, + "grad_norm": 0.4198601434035437, + "learning_rate": 1.23055780284837e-06, + "loss": 0.27, + "step": 14711 + }, + { + "epoch": 0.85, + "grad_norm": 0.2760695555581516, + "learning_rate": 1.2296636146677942e-06, + "loss": 0.2711, + "step": 14712 + }, + { + "epoch": 0.85, + "grad_norm": 0.8359298804618515, + "learning_rate": 1.2287697302035883e-06, + "loss": 0.2242, + "step": 14713 + }, + { + "epoch": 0.85, + "grad_norm": 0.35699267197134954, + "learning_rate": 1.227876149486712e-06, + "loss": 0.2875, + "step": 14714 + }, + { + "epoch": 0.85, + "grad_norm": 0.25071982956057953, + "learning_rate": 1.226982872548107e-06, + "loss": 0.1969, + "step": 14715 + }, + { + "epoch": 0.85, + "grad_norm": 0.43126805352484987, + "learning_rate": 1.2260898994187075e-06, + "loss": 0.2655, + "step": 14716 + }, + { + "epoch": 0.85, + "grad_norm": 0.3063248769232248, + "learning_rate": 1.2251972301294358e-06, + "loss": 0.2335, + "step": 14717 + }, + { + "epoch": 0.85, + "grad_norm": 1.0832029188795427, + "learning_rate": 1.2243048647112078e-06, + "loss": 0.7687, + "step": 14718 + }, + { + "epoch": 0.85, + "grad_norm": 0.3293628692072448, + "learning_rate": 1.2234128031949266e-06, + "loss": 0.2336, + "step": 14719 + }, + { + "epoch": 0.85, + "grad_norm": 0.3376833571189543, + "learning_rate": 1.222521045611481e-06, + "loss": 0.2651, + "step": 14720 + }, + { + "epoch": 0.85, + "grad_norm": 0.37187860367678754, + "learning_rate": 1.2216295919917553e-06, + "loss": 0.2057, + "step": 14721 + }, + { + "epoch": 0.85, + "grad_norm": 0.8919618832579139, + "learning_rate": 1.220738442366619e-06, + "loss": 0.4321, + "step": 14722 + }, + { + "epoch": 0.85, + "grad_norm": 0.2582549238915909, + "learning_rate": 1.2198475967669333e-06, + "loss": 0.21, + "step": 14723 + }, + { + "epoch": 0.85, + "grad_norm": 0.4988948087103634, + "learning_rate": 1.2189570552235475e-06, + "loss": 0.3751, + "step": 14724 + }, + { + "epoch": 0.85, + "grad_norm": 1.0828966430021156, + "learning_rate": 1.2180668177672984e-06, + "loss": 0.552, + "step": 14725 + }, + { + "epoch": 0.85, + "grad_norm": 0.2535459801459226, + "learning_rate": 1.217176884429021e-06, + "loss": 0.1598, + "step": 14726 + }, + { + "epoch": 0.85, + "grad_norm": 0.4715556154628492, + "learning_rate": 1.21628725523953e-06, + "loss": 0.3407, + "step": 14727 + }, + { + "epoch": 0.85, + "grad_norm": 0.333192726383167, + "learning_rate": 1.2153979302296338e-06, + "loss": 0.2542, + "step": 14728 + }, + { + "epoch": 0.85, + "grad_norm": 0.3590585580216333, + "learning_rate": 1.2145089094301265e-06, + "loss": 0.1648, + "step": 14729 + }, + { + "epoch": 0.85, + "grad_norm": 0.4915144416030057, + "learning_rate": 1.2136201928718005e-06, + "loss": 0.3565, + "step": 14730 + }, + { + "epoch": 0.85, + "grad_norm": 0.3515123559483987, + "learning_rate": 1.21273178058543e-06, + "loss": 0.2912, + "step": 14731 + }, + { + "epoch": 0.85, + "grad_norm": 0.38349679742074594, + "learning_rate": 1.21184367260178e-06, + "loss": 0.1698, + "step": 14732 + }, + { + "epoch": 0.85, + "grad_norm": 0.40644587846781555, + "learning_rate": 1.2109558689516054e-06, + "loss": 0.3135, + "step": 14733 + }, + { + "epoch": 0.85, + "grad_norm": 0.3399566033908146, + "learning_rate": 1.210068369665649e-06, + "loss": 0.1875, + "step": 14734 + }, + { + "epoch": 0.85, + "grad_norm": 0.31943117545798166, + "learning_rate": 1.2091811747746484e-06, + "loss": 0.2578, + "step": 14735 + }, + { + "epoch": 0.85, + "grad_norm": 0.3502609198287449, + "learning_rate": 1.208294284309327e-06, + "loss": 0.2722, + "step": 14736 + }, + { + "epoch": 0.85, + "grad_norm": 1.4569221299642199, + "learning_rate": 1.2074076983003956e-06, + "loss": 0.6047, + "step": 14737 + }, + { + "epoch": 0.85, + "grad_norm": 0.40102742209080916, + "learning_rate": 1.2065214167785554e-06, + "loss": 0.2702, + "step": 14738 + }, + { + "epoch": 0.85, + "grad_norm": 0.29010025220729196, + "learning_rate": 1.2056354397745029e-06, + "loss": 0.2478, + "step": 14739 + }, + { + "epoch": 0.85, + "grad_norm": 0.3096437615449813, + "learning_rate": 1.2047497673189169e-06, + "loss": 0.1753, + "step": 14740 + }, + { + "epoch": 0.85, + "grad_norm": 0.3867388418154201, + "learning_rate": 1.2038643994424682e-06, + "loss": 0.2674, + "step": 14741 + }, + { + "epoch": 0.85, + "grad_norm": 0.4669314450298384, + "learning_rate": 1.2029793361758146e-06, + "loss": 0.2701, + "step": 14742 + }, + { + "epoch": 0.85, + "grad_norm": 0.36142721745288964, + "learning_rate": 1.2020945775496107e-06, + "loss": 0.2782, + "step": 14743 + }, + { + "epoch": 0.85, + "grad_norm": 0.2954197524558824, + "learning_rate": 1.201210123594494e-06, + "loss": 0.2351, + "step": 14744 + }, + { + "epoch": 0.85, + "grad_norm": 0.6366188960797053, + "learning_rate": 1.200325974341091e-06, + "loss": 0.3014, + "step": 14745 + }, + { + "epoch": 0.85, + "grad_norm": 0.23679746876696145, + "learning_rate": 1.199442129820022e-06, + "loss": 0.1938, + "step": 14746 + }, + { + "epoch": 0.85, + "grad_norm": 0.3110930947191409, + "learning_rate": 1.1985585900618912e-06, + "loss": 0.2664, + "step": 14747 + }, + { + "epoch": 0.85, + "grad_norm": 0.5009217757277309, + "learning_rate": 1.1976753550972998e-06, + "loss": 0.3774, + "step": 14748 + }, + { + "epoch": 0.85, + "grad_norm": 0.5908704134342048, + "learning_rate": 1.196792424956833e-06, + "loss": 0.2874, + "step": 14749 + }, + { + "epoch": 0.85, + "grad_norm": 0.527797718182077, + "learning_rate": 1.1959097996710656e-06, + "loss": 0.2624, + "step": 14750 + }, + { + "epoch": 0.85, + "grad_norm": 0.25644427630796324, + "learning_rate": 1.1950274792705618e-06, + "loss": 0.2696, + "step": 14751 + }, + { + "epoch": 0.85, + "grad_norm": 0.16068604115031754, + "learning_rate": 1.1941454637858784e-06, + "loss": 0.0712, + "step": 14752 + }, + { + "epoch": 0.85, + "grad_norm": 0.5669102028958434, + "learning_rate": 1.19326375324756e-06, + "loss": 0.3149, + "step": 14753 + }, + { + "epoch": 0.85, + "grad_norm": 0.37777598291555, + "learning_rate": 1.1923823476861395e-06, + "loss": 0.3254, + "step": 14754 + }, + { + "epoch": 0.85, + "grad_norm": 0.3227218948916543, + "learning_rate": 1.1915012471321385e-06, + "loss": 0.2587, + "step": 14755 + }, + { + "epoch": 0.85, + "grad_norm": 0.3981084157797156, + "learning_rate": 1.1906204516160713e-06, + "loss": 0.3059, + "step": 14756 + }, + { + "epoch": 0.85, + "grad_norm": 0.5493564974138063, + "learning_rate": 1.189739961168439e-06, + "loss": 0.3309, + "step": 14757 + }, + { + "epoch": 0.85, + "grad_norm": 0.31382816631129834, + "learning_rate": 1.1888597758197319e-06, + "loss": 0.1758, + "step": 14758 + }, + { + "epoch": 0.85, + "grad_norm": 0.25289083122443995, + "learning_rate": 1.1879798956004307e-06, + "loss": 0.2251, + "step": 14759 + }, + { + "epoch": 0.85, + "grad_norm": 0.5491193092701862, + "learning_rate": 1.1871003205410092e-06, + "loss": 0.4062, + "step": 14760 + }, + { + "epoch": 0.85, + "grad_norm": 0.6820129227771387, + "learning_rate": 1.186221050671924e-06, + "loss": 0.3746, + "step": 14761 + }, + { + "epoch": 0.85, + "grad_norm": 0.358774579788031, + "learning_rate": 1.1853420860236253e-06, + "loss": 0.1911, + "step": 14762 + }, + { + "epoch": 0.85, + "grad_norm": 0.30419959453244505, + "learning_rate": 1.1844634266265487e-06, + "loss": 0.266, + "step": 14763 + }, + { + "epoch": 0.85, + "grad_norm": 0.4638461851125854, + "learning_rate": 1.1835850725111264e-06, + "loss": 0.2343, + "step": 14764 + }, + { + "epoch": 0.85, + "grad_norm": 0.22267885583250036, + "learning_rate": 1.1827070237077743e-06, + "loss": 0.1363, + "step": 14765 + }, + { + "epoch": 0.85, + "grad_norm": 0.5251370761115037, + "learning_rate": 1.1818292802468989e-06, + "loss": 0.3503, + "step": 14766 + }, + { + "epoch": 0.85, + "grad_norm": 0.4038183648154914, + "learning_rate": 1.1809518421588939e-06, + "loss": 0.2937, + "step": 14767 + }, + { + "epoch": 0.85, + "grad_norm": 0.5477079411295382, + "learning_rate": 1.1800747094741493e-06, + "loss": 0.1356, + "step": 14768 + }, + { + "epoch": 0.85, + "grad_norm": 0.42781575837813424, + "learning_rate": 1.1791978822230388e-06, + "loss": 0.2874, + "step": 14769 + }, + { + "epoch": 0.85, + "grad_norm": 0.38030178099469747, + "learning_rate": 1.1783213604359268e-06, + "loss": 0.2965, + "step": 14770 + }, + { + "epoch": 0.85, + "grad_norm": 0.19456608161814554, + "learning_rate": 1.1774451441431655e-06, + "loss": 0.0866, + "step": 14771 + }, + { + "epoch": 0.85, + "grad_norm": 0.39569769749296096, + "learning_rate": 1.1765692333750977e-06, + "loss": 0.2675, + "step": 14772 + }, + { + "epoch": 0.85, + "grad_norm": 0.6547391841835668, + "learning_rate": 1.17569362816206e-06, + "loss": 0.3861, + "step": 14773 + }, + { + "epoch": 0.85, + "grad_norm": 0.2570354731252588, + "learning_rate": 1.174818328534373e-06, + "loss": 0.1988, + "step": 14774 + }, + { + "epoch": 0.85, + "grad_norm": 0.27903556542289093, + "learning_rate": 1.1739433345223482e-06, + "loss": 0.2137, + "step": 14775 + }, + { + "epoch": 0.85, + "grad_norm": 1.4667329346849802, + "learning_rate": 1.1730686461562835e-06, + "loss": 0.4638, + "step": 14776 + }, + { + "epoch": 0.85, + "grad_norm": 0.5838213979491272, + "learning_rate": 1.172194263466474e-06, + "loss": 0.3362, + "step": 14777 + }, + { + "epoch": 0.85, + "grad_norm": 0.3959474359561361, + "learning_rate": 1.1713201864831968e-06, + "loss": 0.2403, + "step": 14778 + }, + { + "epoch": 0.85, + "grad_norm": 0.33580085060755466, + "learning_rate": 1.1704464152367234e-06, + "loss": 0.2984, + "step": 14779 + }, + { + "epoch": 0.85, + "grad_norm": 0.22515204910858588, + "learning_rate": 1.1695729497573082e-06, + "loss": 0.156, + "step": 14780 + }, + { + "epoch": 0.85, + "grad_norm": 0.41575650449527546, + "learning_rate": 1.168699790075204e-06, + "loss": 0.2199, + "step": 14781 + }, + { + "epoch": 0.85, + "grad_norm": 0.5596586766328981, + "learning_rate": 1.1678269362206463e-06, + "loss": 0.3317, + "step": 14782 + }, + { + "epoch": 0.85, + "grad_norm": 0.3552412822573221, + "learning_rate": 1.166954388223862e-06, + "loss": 0.25, + "step": 14783 + }, + { + "epoch": 0.85, + "grad_norm": 0.36014971889060016, + "learning_rate": 1.1660821461150673e-06, + "loss": 0.2429, + "step": 14784 + }, + { + "epoch": 0.85, + "grad_norm": 0.42642570719810025, + "learning_rate": 1.1652102099244667e-06, + "loss": 0.2757, + "step": 14785 + }, + { + "epoch": 0.85, + "grad_norm": 0.2994411664078834, + "learning_rate": 1.1643385796822582e-06, + "loss": 0.2318, + "step": 14786 + }, + { + "epoch": 0.85, + "grad_norm": 0.29557420750535496, + "learning_rate": 1.1634672554186243e-06, + "loss": 0.259, + "step": 14787 + }, + { + "epoch": 0.85, + "grad_norm": 1.675274623281233, + "learning_rate": 1.16259623716374e-06, + "loss": 0.2517, + "step": 14788 + }, + { + "epoch": 0.85, + "grad_norm": 0.6074414344124812, + "learning_rate": 1.1617255249477677e-06, + "loss": 0.316, + "step": 14789 + }, + { + "epoch": 0.85, + "grad_norm": 0.360726252157882, + "learning_rate": 1.16085511880086e-06, + "loss": 0.2795, + "step": 14790 + }, + { + "epoch": 0.85, + "grad_norm": 0.32731888538386056, + "learning_rate": 1.1599850187531603e-06, + "loss": 0.2495, + "step": 14791 + }, + { + "epoch": 0.85, + "grad_norm": 0.2693011419688268, + "learning_rate": 1.1591152248347959e-06, + "loss": 0.159, + "step": 14792 + }, + { + "epoch": 0.85, + "grad_norm": 0.3694108954988957, + "learning_rate": 1.1582457370758948e-06, + "loss": 0.2588, + "step": 14793 + }, + { + "epoch": 0.85, + "grad_norm": 0.42252599397338797, + "learning_rate": 1.157376555506562e-06, + "loss": 0.2219, + "step": 14794 + }, + { + "epoch": 0.85, + "grad_norm": 0.3670240559105416, + "learning_rate": 1.1565076801568997e-06, + "loss": 0.2801, + "step": 14795 + }, + { + "epoch": 0.85, + "grad_norm": 0.4043612092318145, + "learning_rate": 1.1556391110569965e-06, + "loss": 0.2843, + "step": 14796 + }, + { + "epoch": 0.85, + "grad_norm": 1.1551526431797676, + "learning_rate": 1.1547708482369279e-06, + "loss": 0.719, + "step": 14797 + }, + { + "epoch": 0.85, + "grad_norm": 0.22288561492500292, + "learning_rate": 1.1539028917267668e-06, + "loss": 0.1688, + "step": 14798 + }, + { + "epoch": 0.85, + "grad_norm": 0.2946033880771682, + "learning_rate": 1.1530352415565683e-06, + "loss": 0.2451, + "step": 14799 + }, + { + "epoch": 0.85, + "grad_norm": 0.9260136414904041, + "learning_rate": 1.152167897756379e-06, + "loss": 0.3756, + "step": 14800 + }, + { + "epoch": 0.85, + "grad_norm": 0.46156669228028563, + "learning_rate": 1.1513008603562327e-06, + "loss": 0.2187, + "step": 14801 + }, + { + "epoch": 0.85, + "grad_norm": 0.4098473307632537, + "learning_rate": 1.1504341293861588e-06, + "loss": 0.3288, + "step": 14802 + }, + { + "epoch": 0.85, + "grad_norm": 0.3391465615133372, + "learning_rate": 1.149567704876171e-06, + "loss": 0.3124, + "step": 14803 + }, + { + "epoch": 0.85, + "grad_norm": 0.24197185666517396, + "learning_rate": 1.1487015868562723e-06, + "loss": 0.0848, + "step": 14804 + }, + { + "epoch": 0.85, + "grad_norm": 0.30112775030509276, + "learning_rate": 1.147835775356455e-06, + "loss": 0.2428, + "step": 14805 + }, + { + "epoch": 0.85, + "grad_norm": 0.31423519705737857, + "learning_rate": 1.1469702704067064e-06, + "loss": 0.3014, + "step": 14806 + }, + { + "epoch": 0.85, + "grad_norm": 0.7919262173209054, + "learning_rate": 1.146105072036997e-06, + "loss": 0.2429, + "step": 14807 + }, + { + "epoch": 0.85, + "grad_norm": 0.35309423242059595, + "learning_rate": 1.1452401802772884e-06, + "loss": 0.2779, + "step": 14808 + }, + { + "epoch": 0.85, + "grad_norm": 1.1765512888066414, + "learning_rate": 1.144375595157532e-06, + "loss": 0.7295, + "step": 14809 + }, + { + "epoch": 0.85, + "grad_norm": 0.26690693768113766, + "learning_rate": 1.143511316707665e-06, + "loss": 0.2349, + "step": 14810 + }, + { + "epoch": 0.85, + "grad_norm": 0.22741761840826571, + "learning_rate": 1.1426473449576225e-06, + "loss": 0.1552, + "step": 14811 + }, + { + "epoch": 0.85, + "grad_norm": 0.8487452061726126, + "learning_rate": 1.1417836799373205e-06, + "loss": 0.3599, + "step": 14812 + }, + { + "epoch": 0.85, + "grad_norm": 0.7051579592509625, + "learning_rate": 1.1409203216766706e-06, + "loss": 0.4184, + "step": 14813 + }, + { + "epoch": 0.85, + "grad_norm": 0.24162397410920866, + "learning_rate": 1.1400572702055657e-06, + "loss": 0.1935, + "step": 14814 + }, + { + "epoch": 0.85, + "grad_norm": 0.45583372486177876, + "learning_rate": 1.1391945255538994e-06, + "loss": 0.3813, + "step": 14815 + }, + { + "epoch": 0.85, + "grad_norm": 0.4161358729060462, + "learning_rate": 1.1383320877515446e-06, + "loss": 0.2391, + "step": 14816 + }, + { + "epoch": 0.85, + "grad_norm": 0.26035907038958234, + "learning_rate": 1.1374699568283698e-06, + "loss": 0.1717, + "step": 14817 + }, + { + "epoch": 0.85, + "grad_norm": 0.32519859938343304, + "learning_rate": 1.1366081328142264e-06, + "loss": 0.2887, + "step": 14818 + }, + { + "epoch": 0.85, + "grad_norm": 0.6847648618208944, + "learning_rate": 1.135746615738965e-06, + "loss": 0.3494, + "step": 14819 + }, + { + "epoch": 0.85, + "grad_norm": 0.32858251668966476, + "learning_rate": 1.1348854056324166e-06, + "loss": 0.212, + "step": 14820 + }, + { + "epoch": 0.85, + "grad_norm": 1.1507405287489314, + "learning_rate": 1.1340245025244045e-06, + "loss": 0.7616, + "step": 14821 + }, + { + "epoch": 0.85, + "grad_norm": 0.36743606071499174, + "learning_rate": 1.133163906444742e-06, + "loss": 0.3013, + "step": 14822 + }, + { + "epoch": 0.85, + "grad_norm": 0.3275912685999875, + "learning_rate": 1.132303617423236e-06, + "loss": 0.2387, + "step": 14823 + }, + { + "epoch": 0.85, + "grad_norm": 0.34411631210529814, + "learning_rate": 1.131443635489672e-06, + "loss": 0.1776, + "step": 14824 + }, + { + "epoch": 0.85, + "grad_norm": 1.4080656699672398, + "learning_rate": 1.1305839606738334e-06, + "loss": 0.717, + "step": 14825 + }, + { + "epoch": 0.85, + "grad_norm": 0.25755544714127737, + "learning_rate": 1.129724593005489e-06, + "loss": 0.2429, + "step": 14826 + }, + { + "epoch": 0.85, + "grad_norm": 0.4605922701134536, + "learning_rate": 1.1288655325144027e-06, + "loss": 0.3024, + "step": 14827 + }, + { + "epoch": 0.85, + "grad_norm": 0.8821629142995729, + "learning_rate": 1.1280067792303218e-06, + "loss": 0.4034, + "step": 14828 + }, + { + "epoch": 0.85, + "grad_norm": 0.3203927459270146, + "learning_rate": 1.1271483331829835e-06, + "loss": 0.2634, + "step": 14829 + }, + { + "epoch": 0.85, + "grad_norm": 0.33166642821629216, + "learning_rate": 1.1262901944021165e-06, + "loss": 0.245, + "step": 14830 + }, + { + "epoch": 0.85, + "grad_norm": 0.3227943214683181, + "learning_rate": 1.125432362917439e-06, + "loss": 0.1672, + "step": 14831 + }, + { + "epoch": 0.85, + "grad_norm": 0.31125204319825606, + "learning_rate": 1.1245748387586575e-06, + "loss": 0.2647, + "step": 14832 + }, + { + "epoch": 0.85, + "grad_norm": 0.8780581738774083, + "learning_rate": 1.123717621955468e-06, + "loss": 0.364, + "step": 14833 + }, + { + "epoch": 0.85, + "grad_norm": 0.3650475322050246, + "learning_rate": 1.1228607125375534e-06, + "loss": 0.2866, + "step": 14834 + }, + { + "epoch": 0.85, + "grad_norm": 0.40110708736483236, + "learning_rate": 1.1220041105345935e-06, + "loss": 0.2838, + "step": 14835 + }, + { + "epoch": 0.85, + "grad_norm": 0.560535169035649, + "learning_rate": 1.121147815976248e-06, + "loss": 0.3457, + "step": 14836 + }, + { + "epoch": 0.85, + "grad_norm": 0.20911265248523273, + "learning_rate": 1.1202918288921727e-06, + "loss": 0.143, + "step": 14837 + }, + { + "epoch": 0.85, + "grad_norm": 0.3534204337682163, + "learning_rate": 1.1194361493120099e-06, + "loss": 0.268, + "step": 14838 + }, + { + "epoch": 0.85, + "grad_norm": 0.47013886187275117, + "learning_rate": 1.118580777265388e-06, + "loss": 0.3605, + "step": 14839 + }, + { + "epoch": 0.85, + "grad_norm": 0.7626126324582352, + "learning_rate": 1.1177257127819353e-06, + "loss": 0.3003, + "step": 14840 + }, + { + "epoch": 0.85, + "grad_norm": 0.31851659287056955, + "learning_rate": 1.1168709558912583e-06, + "loss": 0.2653, + "step": 14841 + }, + { + "epoch": 0.85, + "grad_norm": 0.3898838568972243, + "learning_rate": 1.116016506622959e-06, + "loss": 0.2784, + "step": 14842 + }, + { + "epoch": 0.85, + "grad_norm": 0.15368938464097603, + "learning_rate": 1.1151623650066224e-06, + "loss": 0.0877, + "step": 14843 + }, + { + "epoch": 0.85, + "grad_norm": 0.30142584311757015, + "learning_rate": 1.114308531071835e-06, + "loss": 0.238, + "step": 14844 + }, + { + "epoch": 0.85, + "grad_norm": 1.2307746884376518, + "learning_rate": 1.1134550048481596e-06, + "loss": 0.5617, + "step": 14845 + }, + { + "epoch": 0.85, + "grad_norm": 0.3237698295002861, + "learning_rate": 1.1126017863651562e-06, + "loss": 0.2563, + "step": 14846 + }, + { + "epoch": 0.85, + "grad_norm": 0.31133573341743925, + "learning_rate": 1.1117488756523677e-06, + "loss": 0.2625, + "step": 14847 + }, + { + "epoch": 0.85, + "grad_norm": 1.3573196658449307, + "learning_rate": 1.1108962727393368e-06, + "loss": 0.5345, + "step": 14848 + }, + { + "epoch": 0.85, + "grad_norm": 0.27633793321812017, + "learning_rate": 1.110043977655585e-06, + "loss": 0.1962, + "step": 14849 + }, + { + "epoch": 0.85, + "grad_norm": 0.23377373058249695, + "learning_rate": 1.109191990430628e-06, + "loss": 0.2015, + "step": 14850 + }, + { + "epoch": 0.85, + "grad_norm": 1.2731592859606273, + "learning_rate": 1.1083403110939695e-06, + "loss": 0.7351, + "step": 14851 + }, + { + "epoch": 0.85, + "grad_norm": 0.635556152341861, + "learning_rate": 1.107488939675102e-06, + "loss": 0.374, + "step": 14852 + }, + { + "epoch": 0.85, + "grad_norm": 0.3848628568892182, + "learning_rate": 1.1066378762035125e-06, + "loss": 0.1684, + "step": 14853 + }, + { + "epoch": 0.85, + "grad_norm": 0.3072768771702361, + "learning_rate": 1.1057871207086713e-06, + "loss": 0.2661, + "step": 14854 + }, + { + "epoch": 0.85, + "grad_norm": 0.3946099595301403, + "learning_rate": 1.1049366732200383e-06, + "loss": 0.2413, + "step": 14855 + }, + { + "epoch": 0.85, + "grad_norm": 0.3380880546143608, + "learning_rate": 1.104086533767067e-06, + "loss": 0.1868, + "step": 14856 + }, + { + "epoch": 0.85, + "grad_norm": 0.4442492965914318, + "learning_rate": 1.1032367023791957e-06, + "loss": 0.3175, + "step": 14857 + }, + { + "epoch": 0.85, + "grad_norm": 0.2528655784441553, + "learning_rate": 1.1023871790858553e-06, + "loss": 0.2477, + "step": 14858 + }, + { + "epoch": 0.85, + "grad_norm": 0.28372712548736984, + "learning_rate": 1.1015379639164625e-06, + "loss": 0.1826, + "step": 14859 + }, + { + "epoch": 0.85, + "grad_norm": 1.1834437417520254, + "learning_rate": 1.100689056900429e-06, + "loss": 0.5269, + "step": 14860 + }, + { + "epoch": 0.85, + "grad_norm": 0.3866428294105834, + "learning_rate": 1.0998404580671507e-06, + "loss": 0.2489, + "step": 14861 + }, + { + "epoch": 0.85, + "grad_norm": 0.25646265297625703, + "learning_rate": 1.0989921674460146e-06, + "loss": 0.2444, + "step": 14862 + }, + { + "epoch": 0.85, + "grad_norm": 0.44426050608931333, + "learning_rate": 1.0981441850663976e-06, + "loss": 0.263, + "step": 14863 + }, + { + "epoch": 0.85, + "grad_norm": 0.43875055594403534, + "learning_rate": 1.0972965109576628e-06, + "loss": 0.25, + "step": 14864 + }, + { + "epoch": 0.85, + "grad_norm": 0.4141839426559247, + "learning_rate": 1.0964491451491677e-06, + "loss": 0.2651, + "step": 14865 + }, + { + "epoch": 0.85, + "grad_norm": 0.2750628065825932, + "learning_rate": 1.0956020876702567e-06, + "loss": 0.2278, + "step": 14866 + }, + { + "epoch": 0.85, + "grad_norm": 1.0146697997317078, + "learning_rate": 1.094755338550263e-06, + "loss": 0.5553, + "step": 14867 + }, + { + "epoch": 0.85, + "grad_norm": 0.3163524879755541, + "learning_rate": 1.0939088978185053e-06, + "loss": 0.2456, + "step": 14868 + }, + { + "epoch": 0.85, + "grad_norm": 0.6128769729330104, + "learning_rate": 1.0930627655043036e-06, + "loss": 0.2857, + "step": 14869 + }, + { + "epoch": 0.85, + "grad_norm": 0.23060783739572102, + "learning_rate": 1.0922169416369531e-06, + "loss": 0.2126, + "step": 14870 + }, + { + "epoch": 0.85, + "grad_norm": 0.6672465753746528, + "learning_rate": 1.0913714262457486e-06, + "loss": 0.3115, + "step": 14871 + }, + { + "epoch": 0.85, + "grad_norm": 0.3828226055546082, + "learning_rate": 1.0905262193599665e-06, + "loss": 0.3032, + "step": 14872 + }, + { + "epoch": 0.85, + "grad_norm": 0.30441911790214365, + "learning_rate": 1.0896813210088797e-06, + "loss": 0.2394, + "step": 14873 + }, + { + "epoch": 0.85, + "grad_norm": 0.6542472129796099, + "learning_rate": 1.0888367312217452e-06, + "loss": 0.2881, + "step": 14874 + }, + { + "epoch": 0.85, + "grad_norm": 0.37036987367157137, + "learning_rate": 1.0879924500278116e-06, + "loss": 0.2922, + "step": 14875 + }, + { + "epoch": 0.85, + "grad_norm": 0.2711333428099278, + "learning_rate": 1.087148477456317e-06, + "loss": 0.1101, + "step": 14876 + }, + { + "epoch": 0.85, + "grad_norm": 0.3898315605171779, + "learning_rate": 1.0863048135364851e-06, + "loss": 0.2556, + "step": 14877 + }, + { + "epoch": 0.85, + "grad_norm": 0.32269214206454216, + "learning_rate": 1.0854614582975353e-06, + "loss": 0.2821, + "step": 14878 + }, + { + "epoch": 0.85, + "grad_norm": 1.4122935758067834, + "learning_rate": 1.084618411768673e-06, + "loss": 0.3141, + "step": 14879 + }, + { + "epoch": 0.85, + "grad_norm": 0.40451562657458473, + "learning_rate": 1.0837756739790916e-06, + "loss": 0.269, + "step": 14880 + }, + { + "epoch": 0.85, + "grad_norm": 0.36283485462304216, + "learning_rate": 1.0829332449579732e-06, + "loss": 0.322, + "step": 14881 + }, + { + "epoch": 0.86, + "grad_norm": 0.28832612026905335, + "learning_rate": 1.0820911247344944e-06, + "loss": 0.1806, + "step": 14882 + }, + { + "epoch": 0.86, + "grad_norm": 0.26597397855279037, + "learning_rate": 1.0812493133378166e-06, + "loss": 0.1913, + "step": 14883 + }, + { + "epoch": 0.86, + "grad_norm": 0.5791713121666998, + "learning_rate": 1.0804078107970917e-06, + "loss": 0.3546, + "step": 14884 + }, + { + "epoch": 0.86, + "grad_norm": 0.5003590603484814, + "learning_rate": 1.0795666171414597e-06, + "loss": 0.3228, + "step": 14885 + }, + { + "epoch": 0.86, + "grad_norm": 0.2956405417591853, + "learning_rate": 1.0787257324000533e-06, + "loss": 0.2204, + "step": 14886 + }, + { + "epoch": 0.86, + "grad_norm": 0.5080100270148686, + "learning_rate": 1.077885156601991e-06, + "loss": 0.3585, + "step": 14887 + }, + { + "epoch": 0.86, + "grad_norm": 0.49071658954089276, + "learning_rate": 1.0770448897763818e-06, + "loss": 0.2817, + "step": 14888 + }, + { + "epoch": 0.86, + "grad_norm": 0.15267670632605457, + "learning_rate": 1.0762049319523248e-06, + "loss": 0.0704, + "step": 14889 + }, + { + "epoch": 0.86, + "grad_norm": 0.2648087586985502, + "learning_rate": 1.075365283158908e-06, + "loss": 0.2688, + "step": 14890 + }, + { + "epoch": 0.86, + "grad_norm": 0.6311471408354534, + "learning_rate": 1.0745259434252065e-06, + "loss": 0.3905, + "step": 14891 + }, + { + "epoch": 0.86, + "grad_norm": 0.671787505272611, + "learning_rate": 1.0736869127802884e-06, + "loss": 0.1974, + "step": 14892 + }, + { + "epoch": 0.86, + "grad_norm": 0.2957218483145482, + "learning_rate": 1.0728481912532062e-06, + "loss": 0.2763, + "step": 14893 + }, + { + "epoch": 0.86, + "grad_norm": 0.32442168752672784, + "learning_rate": 1.07200977887301e-06, + "loss": 0.2365, + "step": 14894 + }, + { + "epoch": 0.86, + "grad_norm": 0.3205645128811862, + "learning_rate": 1.0711716756687307e-06, + "loss": 0.0799, + "step": 14895 + }, + { + "epoch": 0.86, + "grad_norm": 0.40600771972047073, + "learning_rate": 1.070333881669392e-06, + "loss": 0.2881, + "step": 14896 + }, + { + "epoch": 0.86, + "grad_norm": 0.4958254382676697, + "learning_rate": 1.0694963969040062e-06, + "loss": 0.3264, + "step": 14897 + }, + { + "epoch": 0.86, + "grad_norm": 0.40568174234009224, + "learning_rate": 1.0686592214015766e-06, + "loss": 0.2968, + "step": 14898 + }, + { + "epoch": 0.86, + "grad_norm": 0.32870509799899306, + "learning_rate": 1.067822355191095e-06, + "loss": 0.235, + "step": 14899 + }, + { + "epoch": 0.86, + "grad_norm": 1.0628361533545403, + "learning_rate": 1.0669857983015408e-06, + "loss": 0.4543, + "step": 14900 + }, + { + "epoch": 0.86, + "grad_norm": 0.20065123792574188, + "learning_rate": 1.0661495507618845e-06, + "loss": 0.1679, + "step": 14901 + }, + { + "epoch": 0.86, + "grad_norm": 0.29083032401850334, + "learning_rate": 1.0653136126010832e-06, + "loss": 0.223, + "step": 14902 + }, + { + "epoch": 0.86, + "grad_norm": 0.5620063770534373, + "learning_rate": 1.06447798384809e-06, + "loss": 0.3486, + "step": 14903 + }, + { + "epoch": 0.86, + "grad_norm": 0.554067422855577, + "learning_rate": 1.0636426645318387e-06, + "loss": 0.3698, + "step": 14904 + }, + { + "epoch": 0.86, + "grad_norm": 0.28406674047779545, + "learning_rate": 1.0628076546812583e-06, + "loss": 0.2318, + "step": 14905 + }, + { + "epoch": 0.86, + "grad_norm": 0.5269874383643349, + "learning_rate": 1.0619729543252622e-06, + "loss": 0.3355, + "step": 14906 + }, + { + "epoch": 0.86, + "grad_norm": 0.24121729656072172, + "learning_rate": 1.0611385634927607e-06, + "loss": 0.1645, + "step": 14907 + }, + { + "epoch": 0.86, + "grad_norm": 0.35797358852460026, + "learning_rate": 1.0603044822126463e-06, + "loss": 0.2266, + "step": 14908 + }, + { + "epoch": 0.86, + "grad_norm": 0.3378453061765035, + "learning_rate": 1.0594707105138024e-06, + "loss": 0.2897, + "step": 14909 + }, + { + "epoch": 0.86, + "grad_norm": 0.8746212764884232, + "learning_rate": 1.0586372484251018e-06, + "loss": 0.391, + "step": 14910 + }, + { + "epoch": 0.86, + "grad_norm": 0.32721325459226824, + "learning_rate": 1.057804095975411e-06, + "loss": 0.2678, + "step": 14911 + }, + { + "epoch": 0.86, + "grad_norm": 1.5419081597400264, + "learning_rate": 1.0569712531935805e-06, + "loss": 0.277, + "step": 14912 + }, + { + "epoch": 0.86, + "grad_norm": 0.2279118449162286, + "learning_rate": 1.0561387201084494e-06, + "loss": 0.206, + "step": 14913 + }, + { + "epoch": 0.86, + "grad_norm": 0.288549030985663, + "learning_rate": 1.0553064967488514e-06, + "loss": 0.2256, + "step": 14914 + }, + { + "epoch": 0.86, + "grad_norm": 0.6606990633306803, + "learning_rate": 1.054474583143602e-06, + "loss": 0.2891, + "step": 14915 + }, + { + "epoch": 0.86, + "grad_norm": 0.6655593582916631, + "learning_rate": 1.0536429793215152e-06, + "loss": 0.399, + "step": 14916 + }, + { + "epoch": 0.86, + "grad_norm": 0.24780553816203488, + "learning_rate": 1.0528116853113867e-06, + "loss": 0.2537, + "step": 14917 + }, + { + "epoch": 0.86, + "grad_norm": 1.4479855245872628, + "learning_rate": 1.0519807011420057e-06, + "loss": 0.2714, + "step": 14918 + }, + { + "epoch": 0.86, + "grad_norm": 0.6947298411391986, + "learning_rate": 1.051150026842146e-06, + "loss": 0.3632, + "step": 14919 + }, + { + "epoch": 0.86, + "grad_norm": 0.3823419776922345, + "learning_rate": 1.0503196624405775e-06, + "loss": 0.259, + "step": 14920 + }, + { + "epoch": 0.86, + "grad_norm": 0.2174358692451919, + "learning_rate": 1.0494896079660554e-06, + "loss": 0.1818, + "step": 14921 + }, + { + "epoch": 0.86, + "grad_norm": 0.6206679548783893, + "learning_rate": 1.0486598634473221e-06, + "loss": 0.3312, + "step": 14922 + }, + { + "epoch": 0.86, + "grad_norm": 0.40084767726771736, + "learning_rate": 1.0478304289131115e-06, + "loss": 0.3024, + "step": 14923 + }, + { + "epoch": 0.86, + "grad_norm": 1.3130018162299788, + "learning_rate": 1.0470013043921523e-06, + "loss": 0.6565, + "step": 14924 + }, + { + "epoch": 0.86, + "grad_norm": 0.317664336895266, + "learning_rate": 1.046172489913151e-06, + "loss": 0.237, + "step": 14925 + }, + { + "epoch": 0.86, + "grad_norm": 0.3425571725675746, + "learning_rate": 1.0453439855048108e-06, + "loss": 0.2797, + "step": 14926 + }, + { + "epoch": 0.86, + "grad_norm": 0.30894367529548855, + "learning_rate": 1.0445157911958214e-06, + "loss": 0.1792, + "step": 14927 + }, + { + "epoch": 0.86, + "grad_norm": 1.014875460132905, + "learning_rate": 1.0436879070148675e-06, + "loss": 0.5139, + "step": 14928 + }, + { + "epoch": 0.86, + "grad_norm": 0.26322226528383447, + "learning_rate": 1.042860332990615e-06, + "loss": 0.2515, + "step": 14929 + }, + { + "epoch": 0.86, + "grad_norm": 0.5905553635015627, + "learning_rate": 1.0420330691517256e-06, + "loss": 0.3113, + "step": 14930 + }, + { + "epoch": 0.86, + "grad_norm": 0.5689957343239428, + "learning_rate": 1.0412061155268428e-06, + "loss": 0.2489, + "step": 14931 + }, + { + "epoch": 0.86, + "grad_norm": 0.3543218393322141, + "learning_rate": 1.0403794721446092e-06, + "loss": 0.257, + "step": 14932 + }, + { + "epoch": 0.86, + "grad_norm": 0.386805524160091, + "learning_rate": 1.03955313903365e-06, + "loss": 0.3118, + "step": 14933 + }, + { + "epoch": 0.86, + "grad_norm": 0.18668486006155857, + "learning_rate": 1.0387271162225787e-06, + "loss": 0.1076, + "step": 14934 + }, + { + "epoch": 0.86, + "grad_norm": 0.39926953655521297, + "learning_rate": 1.0379014037400014e-06, + "loss": 0.2844, + "step": 14935 + }, + { + "epoch": 0.86, + "grad_norm": 0.8440325835444368, + "learning_rate": 1.0370760016145142e-06, + "loss": 0.4694, + "step": 14936 + }, + { + "epoch": 0.86, + "grad_norm": 0.3958906252639677, + "learning_rate": 1.036250909874701e-06, + "loss": 0.3087, + "step": 14937 + }, + { + "epoch": 0.86, + "grad_norm": 0.2876575469257746, + "learning_rate": 1.0354261285491319e-06, + "loss": 0.188, + "step": 14938 + }, + { + "epoch": 0.86, + "grad_norm": 0.24698100951360044, + "learning_rate": 1.0346016576663686e-06, + "loss": 0.1631, + "step": 14939 + }, + { + "epoch": 0.86, + "grad_norm": 0.8047880203921186, + "learning_rate": 1.0337774972549675e-06, + "loss": 0.5828, + "step": 14940 + }, + { + "epoch": 0.86, + "grad_norm": 0.2618812698548808, + "learning_rate": 1.0329536473434653e-06, + "loss": 0.2086, + "step": 14941 + }, + { + "epoch": 0.86, + "grad_norm": 0.513815456683671, + "learning_rate": 1.032130107960393e-06, + "loss": 0.3363, + "step": 14942 + }, + { + "epoch": 0.86, + "grad_norm": 0.6800722467991159, + "learning_rate": 1.0313068791342683e-06, + "loss": 0.3421, + "step": 14943 + }, + { + "epoch": 0.86, + "grad_norm": 0.2965021973445377, + "learning_rate": 1.0304839608936002e-06, + "loss": 0.197, + "step": 14944 + }, + { + "epoch": 0.86, + "grad_norm": 0.28748538312306504, + "learning_rate": 1.0296613532668875e-06, + "loss": 0.2301, + "step": 14945 + }, + { + "epoch": 0.86, + "grad_norm": 1.1561225785911142, + "learning_rate": 1.0288390562826178e-06, + "loss": 0.697, + "step": 14946 + }, + { + "epoch": 0.86, + "grad_norm": 0.22153187673332958, + "learning_rate": 1.0280170699692648e-06, + "loss": 0.1684, + "step": 14947 + }, + { + "epoch": 0.86, + "grad_norm": 0.4287418102379653, + "learning_rate": 1.0271953943552938e-06, + "loss": 0.3314, + "step": 14948 + }, + { + "epoch": 0.86, + "grad_norm": 0.5123710533271378, + "learning_rate": 1.0263740294691615e-06, + "loss": 0.3298, + "step": 14949 + }, + { + "epoch": 0.86, + "grad_norm": 0.31713937270697534, + "learning_rate": 1.0255529753393112e-06, + "loss": 0.2379, + "step": 14950 + }, + { + "epoch": 0.86, + "grad_norm": 1.6366738751403234, + "learning_rate": 1.0247322319941745e-06, + "loss": 0.2046, + "step": 14951 + }, + { + "epoch": 0.86, + "grad_norm": 0.45705670046593916, + "learning_rate": 1.023911799462174e-06, + "loss": 0.3897, + "step": 14952 + }, + { + "epoch": 0.86, + "grad_norm": 0.27150260711708984, + "learning_rate": 1.0230916777717226e-06, + "loss": 0.2514, + "step": 14953 + }, + { + "epoch": 0.86, + "grad_norm": 0.5578410636590693, + "learning_rate": 1.0222718669512211e-06, + "loss": 0.2062, + "step": 14954 + }, + { + "epoch": 0.86, + "grad_norm": 0.5180872333426131, + "learning_rate": 1.0214523670290587e-06, + "loss": 0.2397, + "step": 14955 + }, + { + "epoch": 0.86, + "grad_norm": 0.33763059935600714, + "learning_rate": 1.0206331780336154e-06, + "loss": 0.255, + "step": 14956 + }, + { + "epoch": 0.86, + "grad_norm": 0.34511369249002094, + "learning_rate": 1.0198142999932559e-06, + "loss": 0.2375, + "step": 14957 + }, + { + "epoch": 0.86, + "grad_norm": 0.8756372942785727, + "learning_rate": 1.0189957329363465e-06, + "loss": 0.5402, + "step": 14958 + }, + { + "epoch": 0.86, + "grad_norm": 0.40461371137970736, + "learning_rate": 1.0181774768912255e-06, + "loss": 0.2893, + "step": 14959 + }, + { + "epoch": 0.86, + "grad_norm": 0.38107539894620857, + "learning_rate": 1.0173595318862305e-06, + "loss": 0.2672, + "step": 14960 + }, + { + "epoch": 0.86, + "grad_norm": 0.22422982508923442, + "learning_rate": 1.01654189794969e-06, + "loss": 0.1835, + "step": 14961 + }, + { + "epoch": 0.86, + "grad_norm": 0.33600638655085446, + "learning_rate": 1.0157245751099188e-06, + "loss": 0.2601, + "step": 14962 + }, + { + "epoch": 0.86, + "grad_norm": 1.4870843447136277, + "learning_rate": 1.0149075633952178e-06, + "loss": 0.5065, + "step": 14963 + }, + { + "epoch": 0.86, + "grad_norm": 0.4281829927047752, + "learning_rate": 1.0140908628338796e-06, + "loss": 0.2577, + "step": 14964 + }, + { + "epoch": 0.86, + "grad_norm": 0.2609028341284687, + "learning_rate": 1.013274473454191e-06, + "loss": 0.2483, + "step": 14965 + }, + { + "epoch": 0.86, + "grad_norm": 0.6522575513963987, + "learning_rate": 1.0124583952844214e-06, + "loss": 0.3881, + "step": 14966 + }, + { + "epoch": 0.86, + "grad_norm": 0.2859295010336732, + "learning_rate": 1.0116426283528301e-06, + "loss": 0.0838, + "step": 14967 + }, + { + "epoch": 0.86, + "grad_norm": 0.32065626081543513, + "learning_rate": 1.0108271726876684e-06, + "loss": 0.239, + "step": 14968 + }, + { + "epoch": 0.86, + "grad_norm": 0.34553309967875206, + "learning_rate": 1.0100120283171733e-06, + "loss": 0.2821, + "step": 14969 + }, + { + "epoch": 0.86, + "grad_norm": 0.7021634157856494, + "learning_rate": 1.0091971952695768e-06, + "loss": 0.3325, + "step": 14970 + }, + { + "epoch": 0.86, + "grad_norm": 0.3073095774730255, + "learning_rate": 1.008382673573095e-06, + "loss": 0.2581, + "step": 14971 + }, + { + "epoch": 0.86, + "grad_norm": 0.8590640171628728, + "learning_rate": 1.0075684632559346e-06, + "loss": 0.4479, + "step": 14972 + }, + { + "epoch": 0.86, + "grad_norm": 0.22537341360656152, + "learning_rate": 1.0067545643462895e-06, + "loss": 0.2082, + "step": 14973 + }, + { + "epoch": 0.86, + "grad_norm": 0.3298064748892064, + "learning_rate": 1.0059409768723495e-06, + "loss": 0.1858, + "step": 14974 + }, + { + "epoch": 0.86, + "grad_norm": 1.147862075159621, + "learning_rate": 1.0051277008622861e-06, + "loss": 0.5509, + "step": 14975 + }, + { + "epoch": 0.86, + "grad_norm": 0.31479894074709697, + "learning_rate": 1.004314736344264e-06, + "loss": 0.2775, + "step": 14976 + }, + { + "epoch": 0.86, + "grad_norm": 0.3955090912581276, + "learning_rate": 1.0035020833464338e-06, + "loss": 0.1986, + "step": 14977 + }, + { + "epoch": 0.86, + "grad_norm": 0.5281904876913323, + "learning_rate": 1.0026897418969417e-06, + "loss": 0.3507, + "step": 14978 + }, + { + "epoch": 0.86, + "grad_norm": 0.30591248057503395, + "learning_rate": 1.0018777120239165e-06, + "loss": 0.1526, + "step": 14979 + }, + { + "epoch": 0.86, + "grad_norm": 0.37828916984177724, + "learning_rate": 1.0010659937554789e-06, + "loss": 0.1737, + "step": 14980 + }, + { + "epoch": 0.86, + "grad_norm": 0.31146064451236316, + "learning_rate": 1.00025458711974e-06, + "loss": 0.2923, + "step": 14981 + }, + { + "epoch": 0.86, + "grad_norm": 0.7401631176839336, + "learning_rate": 9.99443492144795e-07, + "loss": 0.3723, + "step": 14982 + }, + { + "epoch": 0.86, + "grad_norm": 0.29215619453143027, + "learning_rate": 9.986327088587378e-07, + "loss": 0.2067, + "step": 14983 + }, + { + "epoch": 0.86, + "grad_norm": 0.34927543617015566, + "learning_rate": 9.978222372896417e-07, + "loss": 0.3224, + "step": 14984 + }, + { + "epoch": 0.86, + "grad_norm": 0.42741603016621244, + "learning_rate": 9.970120774655744e-07, + "loss": 0.2293, + "step": 14985 + }, + { + "epoch": 0.86, + "grad_norm": 0.31526568971659064, + "learning_rate": 9.9620222941459e-07, + "loss": 0.2038, + "step": 14986 + }, + { + "epoch": 0.86, + "grad_norm": 0.5250602102861858, + "learning_rate": 9.953926931647372e-07, + "loss": 0.2446, + "step": 14987 + }, + { + "epoch": 0.86, + "grad_norm": 0.3332569485498697, + "learning_rate": 9.945834687440491e-07, + "loss": 0.297, + "step": 14988 + }, + { + "epoch": 0.86, + "grad_norm": 0.40668710547493864, + "learning_rate": 9.937745561805478e-07, + "loss": 0.2977, + "step": 14989 + }, + { + "epoch": 0.86, + "grad_norm": 0.5698087862797053, + "learning_rate": 9.92965955502244e-07, + "loss": 0.2558, + "step": 14990 + }, + { + "epoch": 0.86, + "grad_norm": 0.252459910213438, + "learning_rate": 9.921576667371458e-07, + "loss": 0.1633, + "step": 14991 + }, + { + "epoch": 0.86, + "grad_norm": 0.3259181171702919, + "learning_rate": 9.91349689913238e-07, + "loss": 0.2698, + "step": 14992 + }, + { + "epoch": 0.86, + "grad_norm": 0.33737889420877937, + "learning_rate": 9.90542025058503e-07, + "loss": 0.2462, + "step": 14993 + }, + { + "epoch": 0.86, + "grad_norm": 0.6389429058168169, + "learning_rate": 9.897346722009095e-07, + "loss": 0.3715, + "step": 14994 + }, + { + "epoch": 0.86, + "grad_norm": 0.6251259359040213, + "learning_rate": 9.889276313684171e-07, + "loss": 0.3601, + "step": 14995 + }, + { + "epoch": 0.86, + "grad_norm": 0.2480426106504089, + "learning_rate": 9.88120902588975e-07, + "loss": 0.2357, + "step": 14996 + }, + { + "epoch": 0.86, + "grad_norm": 0.3518028245212191, + "learning_rate": 9.87314485890517e-07, + "loss": 0.1729, + "step": 14997 + }, + { + "epoch": 0.86, + "grad_norm": 0.6043858429194576, + "learning_rate": 9.8650838130097e-07, + "loss": 0.3186, + "step": 14998 + }, + { + "epoch": 0.86, + "grad_norm": 0.37360874956552786, + "learning_rate": 9.857025888482518e-07, + "loss": 0.2994, + "step": 14999 + }, + { + "epoch": 0.86, + "grad_norm": 0.33171762764892554, + "learning_rate": 9.848971085602655e-07, + "loss": 0.2454, + "step": 15000 + }, + { + "epoch": 0.86, + "grad_norm": 0.620980650429492, + "learning_rate": 9.84091940464904e-07, + "loss": 0.2839, + "step": 15001 + }, + { + "epoch": 0.86, + "grad_norm": 0.4069755655302794, + "learning_rate": 9.832870845900488e-07, + "loss": 0.3088, + "step": 15002 + }, + { + "epoch": 0.86, + "grad_norm": 0.17326883779583802, + "learning_rate": 9.824825409635763e-07, + "loss": 0.0697, + "step": 15003 + }, + { + "epoch": 0.86, + "grad_norm": 0.3107041090929514, + "learning_rate": 9.816783096133463e-07, + "loss": 0.2584, + "step": 15004 + }, + { + "epoch": 0.86, + "grad_norm": 0.354031848651155, + "learning_rate": 9.80874390567208e-07, + "loss": 0.2983, + "step": 15005 + }, + { + "epoch": 0.86, + "grad_norm": 0.8439772860687007, + "learning_rate": 9.800707838530021e-07, + "loss": 0.312, + "step": 15006 + }, + { + "epoch": 0.86, + "grad_norm": 0.41508853480396724, + "learning_rate": 9.792674894985553e-07, + "loss": 0.2951, + "step": 15007 + }, + { + "epoch": 0.86, + "grad_norm": 0.5387336155751442, + "learning_rate": 9.78464507531689e-07, + "loss": 0.3999, + "step": 15008 + }, + { + "epoch": 0.86, + "grad_norm": 0.30233725667514955, + "learning_rate": 9.776618379802093e-07, + "loss": 0.1861, + "step": 15009 + }, + { + "epoch": 0.86, + "grad_norm": 0.4230746134209038, + "learning_rate": 9.768594808719113e-07, + "loss": 0.2633, + "step": 15010 + }, + { + "epoch": 0.86, + "grad_norm": 0.48842838044318104, + "learning_rate": 9.76057436234581e-07, + "loss": 0.3434, + "step": 15011 + }, + { + "epoch": 0.86, + "grad_norm": 0.283475291759844, + "learning_rate": 9.752557040959943e-07, + "loss": 0.2384, + "step": 15012 + }, + { + "epoch": 0.86, + "grad_norm": 0.32374263109755874, + "learning_rate": 9.744542844839145e-07, + "loss": 0.1523, + "step": 15013 + }, + { + "epoch": 0.86, + "grad_norm": 0.3835245373644557, + "learning_rate": 9.736531774260948e-07, + "loss": 0.3108, + "step": 15014 + }, + { + "epoch": 0.86, + "grad_norm": 0.8657684258513281, + "learning_rate": 9.728523829502768e-07, + "loss": 0.365, + "step": 15015 + }, + { + "epoch": 0.86, + "grad_norm": 0.2924587753221655, + "learning_rate": 9.720519010841933e-07, + "loss": 0.2123, + "step": 15016 + }, + { + "epoch": 0.86, + "grad_norm": 0.3928088182505235, + "learning_rate": 9.712517318555637e-07, + "loss": 0.3112, + "step": 15017 + }, + { + "epoch": 0.86, + "grad_norm": 0.29042677444468945, + "learning_rate": 9.704518752921e-07, + "loss": 0.1606, + "step": 15018 + }, + { + "epoch": 0.86, + "grad_norm": 0.3775481237884491, + "learning_rate": 9.696523314214978e-07, + "loss": 0.1905, + "step": 15019 + }, + { + "epoch": 0.86, + "grad_norm": 0.2703635013510745, + "learning_rate": 9.688531002714464e-07, + "loss": 0.2672, + "step": 15020 + }, + { + "epoch": 0.86, + "grad_norm": 1.2847216612253773, + "learning_rate": 9.680541818696254e-07, + "loss": 0.5317, + "step": 15021 + }, + { + "epoch": 0.86, + "grad_norm": 0.5357320718859065, + "learning_rate": 9.672555762436997e-07, + "loss": 0.2516, + "step": 15022 + }, + { + "epoch": 0.86, + "grad_norm": 0.29402400287893316, + "learning_rate": 9.66457283421325e-07, + "loss": 0.2488, + "step": 15023 + }, + { + "epoch": 0.86, + "grad_norm": 0.2730938089664483, + "learning_rate": 9.656593034301432e-07, + "loss": 0.2212, + "step": 15024 + }, + { + "epoch": 0.86, + "grad_norm": 0.5329117555031294, + "learning_rate": 9.648616362977959e-07, + "loss": 0.3298, + "step": 15025 + }, + { + "epoch": 0.86, + "grad_norm": 0.3581964345211006, + "learning_rate": 9.640642820518997e-07, + "loss": 0.2328, + "step": 15026 + }, + { + "epoch": 0.86, + "grad_norm": 0.6362827691871921, + "learning_rate": 9.63267240720067e-07, + "loss": 0.3489, + "step": 15027 + }, + { + "epoch": 0.86, + "grad_norm": 0.29267465950559424, + "learning_rate": 9.62470512329904e-07, + "loss": 0.2516, + "step": 15028 + }, + { + "epoch": 0.86, + "grad_norm": 0.36347141522829296, + "learning_rate": 9.616740969089967e-07, + "loss": 0.2125, + "step": 15029 + }, + { + "epoch": 0.86, + "grad_norm": 0.2846777407910682, + "learning_rate": 9.608779944849278e-07, + "loss": 0.1992, + "step": 15030 + }, + { + "epoch": 0.86, + "grad_norm": 1.2644418677329352, + "learning_rate": 9.600822050852654e-07, + "loss": 0.7545, + "step": 15031 + }, + { + "epoch": 0.86, + "grad_norm": 0.21804919025258576, + "learning_rate": 9.592867287375652e-07, + "loss": 0.2162, + "step": 15032 + }, + { + "epoch": 0.86, + "grad_norm": 0.593650145047429, + "learning_rate": 9.584915654693782e-07, + "loss": 0.3514, + "step": 15033 + }, + { + "epoch": 0.86, + "grad_norm": 0.9895576760248582, + "learning_rate": 9.576967153082406e-07, + "loss": 0.4155, + "step": 15034 + }, + { + "epoch": 0.86, + "grad_norm": 0.23430158277619348, + "learning_rate": 9.569021782816767e-07, + "loss": 0.1612, + "step": 15035 + }, + { + "epoch": 0.86, + "grad_norm": 0.27556174932560695, + "learning_rate": 9.561079544171992e-07, + "loss": 0.2646, + "step": 15036 + }, + { + "epoch": 0.86, + "grad_norm": 0.808225331928117, + "learning_rate": 9.553140437423157e-07, + "loss": 0.5553, + "step": 15037 + }, + { + "epoch": 0.86, + "grad_norm": 0.37205200444797243, + "learning_rate": 9.545204462845192e-07, + "loss": 0.2542, + "step": 15038 + }, + { + "epoch": 0.86, + "grad_norm": 0.6746390579793102, + "learning_rate": 9.537271620712896e-07, + "loss": 0.2945, + "step": 15039 + }, + { + "epoch": 0.86, + "grad_norm": 0.3469141740881783, + "learning_rate": 9.529341911300982e-07, + "loss": 0.2678, + "step": 15040 + }, + { + "epoch": 0.86, + "grad_norm": 0.3863948928230087, + "learning_rate": 9.521415334884088e-07, + "loss": 0.2668, + "step": 15041 + }, + { + "epoch": 0.86, + "grad_norm": 0.34364358012632695, + "learning_rate": 9.513491891736681e-07, + "loss": 0.126, + "step": 15042 + }, + { + "epoch": 0.86, + "grad_norm": 0.26545776152588485, + "learning_rate": 9.505571582133166e-07, + "loss": 0.2173, + "step": 15043 + }, + { + "epoch": 0.86, + "grad_norm": 0.34794469679286133, + "learning_rate": 9.497654406347812e-07, + "loss": 0.2684, + "step": 15044 + }, + { + "epoch": 0.86, + "grad_norm": 0.4445266623529254, + "learning_rate": 9.489740364654776e-07, + "loss": 0.2681, + "step": 15045 + }, + { + "epoch": 0.86, + "grad_norm": 0.6025872062236165, + "learning_rate": 9.481829457328162e-07, + "loss": 0.3286, + "step": 15046 + }, + { + "epoch": 0.86, + "grad_norm": 0.37119355184782665, + "learning_rate": 9.473921684641896e-07, + "loss": 0.2974, + "step": 15047 + }, + { + "epoch": 0.86, + "grad_norm": 0.3547465998055965, + "learning_rate": 9.466017046869835e-07, + "loss": 0.3058, + "step": 15048 + }, + { + "epoch": 0.86, + "grad_norm": 0.8409961481560209, + "learning_rate": 9.458115544285684e-07, + "loss": 0.3469, + "step": 15049 + }, + { + "epoch": 0.86, + "grad_norm": 0.32572567903383914, + "learning_rate": 9.450217177163123e-07, + "loss": 0.2468, + "step": 15050 + }, + { + "epoch": 0.86, + "grad_norm": 0.2476762696013451, + "learning_rate": 9.442321945775646e-07, + "loss": 0.206, + "step": 15051 + }, + { + "epoch": 0.86, + "grad_norm": 0.638595287493113, + "learning_rate": 9.434429850396665e-07, + "loss": 0.2482, + "step": 15052 + }, + { + "epoch": 0.86, + "grad_norm": 0.33552988930315564, + "learning_rate": 9.426540891299463e-07, + "loss": 0.254, + "step": 15053 + }, + { + "epoch": 0.86, + "grad_norm": 1.286349510665451, + "learning_rate": 9.418655068757276e-07, + "loss": 0.6941, + "step": 15054 + }, + { + "epoch": 0.86, + "grad_norm": 0.39628036912695536, + "learning_rate": 9.410772383043176e-07, + "loss": 0.2364, + "step": 15055 + }, + { + "epoch": 0.87, + "grad_norm": 0.2941849588804919, + "learning_rate": 9.402892834430122e-07, + "loss": 0.2882, + "step": 15056 + }, + { + "epoch": 0.87, + "grad_norm": 0.42903190992151646, + "learning_rate": 9.395016423190984e-07, + "loss": 0.2431, + "step": 15057 + }, + { + "epoch": 0.87, + "grad_norm": 0.4266723252802292, + "learning_rate": 9.387143149598543e-07, + "loss": 0.0998, + "step": 15058 + }, + { + "epoch": 0.87, + "grad_norm": 0.3951244070626443, + "learning_rate": 9.379273013925449e-07, + "loss": 0.2723, + "step": 15059 + }, + { + "epoch": 0.87, + "grad_norm": 0.3569081459857391, + "learning_rate": 9.371406016444229e-07, + "loss": 0.3066, + "step": 15060 + }, + { + "epoch": 0.87, + "grad_norm": 0.7314097994167829, + "learning_rate": 9.363542157427297e-07, + "loss": 0.3921, + "step": 15061 + }, + { + "epoch": 0.87, + "grad_norm": 0.2963625225107329, + "learning_rate": 9.355681437147024e-07, + "loss": 0.1872, + "step": 15062 + }, + { + "epoch": 0.87, + "grad_norm": 0.3283347937303761, + "learning_rate": 9.347823855875604e-07, + "loss": 0.2963, + "step": 15063 + }, + { + "epoch": 0.87, + "grad_norm": 0.297389999754831, + "learning_rate": 9.339969413885142e-07, + "loss": 0.2066, + "step": 15064 + }, + { + "epoch": 0.87, + "grad_norm": 0.35212332109937267, + "learning_rate": 9.33211811144763e-07, + "loss": 0.1968, + "step": 15065 + }, + { + "epoch": 0.87, + "grad_norm": 0.8605096147432884, + "learning_rate": 9.324269948834985e-07, + "loss": 0.4698, + "step": 15066 + }, + { + "epoch": 0.87, + "grad_norm": 0.30120778723725855, + "learning_rate": 9.316424926318967e-07, + "loss": 0.2745, + "step": 15067 + }, + { + "epoch": 0.87, + "grad_norm": 0.3371975813552838, + "learning_rate": 9.30858304417126e-07, + "loss": 0.2051, + "step": 15068 + }, + { + "epoch": 0.87, + "grad_norm": 0.2826029807524462, + "learning_rate": 9.300744302663401e-07, + "loss": 0.2156, + "step": 15069 + }, + { + "epoch": 0.87, + "grad_norm": 1.4581018247895585, + "learning_rate": 9.292908702066883e-07, + "loss": 0.7467, + "step": 15070 + }, + { + "epoch": 0.87, + "grad_norm": 0.3579852541537003, + "learning_rate": 9.285076242653035e-07, + "loss": 0.1911, + "step": 15071 + }, + { + "epoch": 0.87, + "grad_norm": 0.34615695380006883, + "learning_rate": 9.277246924693106e-07, + "loss": 0.2995, + "step": 15072 + }, + { + "epoch": 0.87, + "grad_norm": 0.6649173448975993, + "learning_rate": 9.269420748458202e-07, + "loss": 0.3692, + "step": 15073 + }, + { + "epoch": 0.87, + "grad_norm": 0.3270716735112408, + "learning_rate": 9.261597714219351e-07, + "loss": 0.2589, + "step": 15074 + }, + { + "epoch": 0.87, + "grad_norm": 0.5415825274888945, + "learning_rate": 9.253777822247479e-07, + "loss": 0.2645, + "step": 15075 + }, + { + "epoch": 0.87, + "grad_norm": 0.28196842106823133, + "learning_rate": 9.24596107281338e-07, + "loss": 0.1962, + "step": 15076 + }, + { + "epoch": 0.87, + "grad_norm": 0.306635893181428, + "learning_rate": 9.238147466187742e-07, + "loss": 0.2415, + "step": 15077 + }, + { + "epoch": 0.87, + "grad_norm": 0.7446253266487983, + "learning_rate": 9.230337002641144e-07, + "loss": 0.3045, + "step": 15078 + }, + { + "epoch": 0.87, + "grad_norm": 0.3272308542677074, + "learning_rate": 9.222529682444081e-07, + "loss": 0.2904, + "step": 15079 + }, + { + "epoch": 0.87, + "grad_norm": 0.3088128697895234, + "learning_rate": 9.214725505866929e-07, + "loss": 0.2339, + "step": 15080 + }, + { + "epoch": 0.87, + "grad_norm": 0.9928672480486425, + "learning_rate": 9.206924473179913e-07, + "loss": 0.3694, + "step": 15081 + }, + { + "epoch": 0.87, + "grad_norm": 0.3445131715535893, + "learning_rate": 9.199126584653184e-07, + "loss": 0.1636, + "step": 15082 + }, + { + "epoch": 0.87, + "grad_norm": 0.3972284436044029, + "learning_rate": 9.191331840556816e-07, + "loss": 0.2828, + "step": 15083 + }, + { + "epoch": 0.87, + "grad_norm": 0.27892382887335904, + "learning_rate": 9.183540241160715e-07, + "loss": 0.2442, + "step": 15084 + }, + { + "epoch": 0.87, + "grad_norm": 1.0493720657729202, + "learning_rate": 9.175751786734722e-07, + "loss": 0.5299, + "step": 15085 + }, + { + "epoch": 0.87, + "grad_norm": 0.313880941212933, + "learning_rate": 9.167966477548529e-07, + "loss": 0.2344, + "step": 15086 + }, + { + "epoch": 0.87, + "grad_norm": 0.5243978675286038, + "learning_rate": 9.160184313871745e-07, + "loss": 0.3414, + "step": 15087 + }, + { + "epoch": 0.87, + "grad_norm": 0.23520310415924797, + "learning_rate": 9.152405295973877e-07, + "loss": 0.1365, + "step": 15088 + }, + { + "epoch": 0.87, + "grad_norm": 0.315451777087604, + "learning_rate": 9.144629424124318e-07, + "loss": 0.2393, + "step": 15089 + }, + { + "epoch": 0.87, + "grad_norm": 0.6343131932703363, + "learning_rate": 9.136856698592323e-07, + "loss": 0.3756, + "step": 15090 + }, + { + "epoch": 0.87, + "grad_norm": 0.33787984330910664, + "learning_rate": 9.129087119647062e-07, + "loss": 0.2509, + "step": 15091 + }, + { + "epoch": 0.87, + "grad_norm": 0.31430488166281095, + "learning_rate": 9.121320687557622e-07, + "loss": 0.247, + "step": 15092 + }, + { + "epoch": 0.87, + "grad_norm": 1.2083642377805037, + "learning_rate": 9.113557402592965e-07, + "loss": 0.5698, + "step": 15093 + }, + { + "epoch": 0.87, + "grad_norm": 0.272352156350627, + "learning_rate": 9.105797265021865e-07, + "loss": 0.123, + "step": 15094 + }, + { + "epoch": 0.87, + "grad_norm": 0.26727226411819544, + "learning_rate": 9.098040275113118e-07, + "loss": 0.2503, + "step": 15095 + }, + { + "epoch": 0.87, + "grad_norm": 0.4165842297184612, + "learning_rate": 9.09028643313532e-07, + "loss": 0.2908, + "step": 15096 + }, + { + "epoch": 0.87, + "grad_norm": 0.8381958437662107, + "learning_rate": 9.082535739357001e-07, + "loss": 0.3653, + "step": 15097 + }, + { + "epoch": 0.87, + "grad_norm": 0.3056167343849403, + "learning_rate": 9.074788194046557e-07, + "loss": 0.2505, + "step": 15098 + }, + { + "epoch": 0.87, + "grad_norm": 0.5242561797322017, + "learning_rate": 9.067043797472264e-07, + "loss": 0.321, + "step": 15099 + }, + { + "epoch": 0.87, + "grad_norm": 0.2684495155203418, + "learning_rate": 9.05930254990236e-07, + "loss": 0.2153, + "step": 15100 + }, + { + "epoch": 0.87, + "grad_norm": 0.4350688261100347, + "learning_rate": 9.0515644516049e-07, + "loss": 0.188, + "step": 15101 + }, + { + "epoch": 0.87, + "grad_norm": 0.5120139042621656, + "learning_rate": 9.043829502847845e-07, + "loss": 0.343, + "step": 15102 + }, + { + "epoch": 0.87, + "grad_norm": 0.35141044392890897, + "learning_rate": 9.036097703899049e-07, + "loss": 0.329, + "step": 15103 + }, + { + "epoch": 0.87, + "grad_norm": 0.3241126312677885, + "learning_rate": 9.028369055026287e-07, + "loss": 0.1678, + "step": 15104 + }, + { + "epoch": 0.87, + "grad_norm": 0.6065691927483036, + "learning_rate": 9.020643556497211e-07, + "loss": 0.3254, + "step": 15105 + }, + { + "epoch": 0.87, + "grad_norm": 0.7583614764645674, + "learning_rate": 9.012921208579317e-07, + "loss": 0.3871, + "step": 15106 + }, + { + "epoch": 0.87, + "grad_norm": 0.25091863341591597, + "learning_rate": 9.005202011540037e-07, + "loss": 0.2057, + "step": 15107 + }, + { + "epoch": 0.87, + "grad_norm": 0.290779990812648, + "learning_rate": 8.997485965646724e-07, + "loss": 0.2231, + "step": 15108 + }, + { + "epoch": 0.87, + "grad_norm": 1.2478961440750131, + "learning_rate": 8.989773071166552e-07, + "loss": 0.8038, + "step": 15109 + }, + { + "epoch": 0.87, + "grad_norm": 0.2953342710533784, + "learning_rate": 8.982063328366631e-07, + "loss": 0.1998, + "step": 15110 + }, + { + "epoch": 0.87, + "grad_norm": 0.5276949035579362, + "learning_rate": 8.974356737513934e-07, + "loss": 0.3769, + "step": 15111 + }, + { + "epoch": 0.87, + "grad_norm": 0.4520637612174389, + "learning_rate": 8.966653298875339e-07, + "loss": 0.3065, + "step": 15112 + }, + { + "epoch": 0.87, + "grad_norm": 0.3955320368936747, + "learning_rate": 8.958953012717641e-07, + "loss": 0.308, + "step": 15113 + }, + { + "epoch": 0.87, + "grad_norm": 0.25203374514690974, + "learning_rate": 8.951255879307486e-07, + "loss": 0.1288, + "step": 15114 + }, + { + "epoch": 0.87, + "grad_norm": 0.3478143118831568, + "learning_rate": 8.943561898911424e-07, + "loss": 0.3261, + "step": 15115 + }, + { + "epoch": 0.87, + "grad_norm": 0.7966981365506505, + "learning_rate": 8.935871071795876e-07, + "loss": 0.3132, + "step": 15116 + }, + { + "epoch": 0.87, + "grad_norm": 0.34943890063486505, + "learning_rate": 8.928183398227219e-07, + "loss": 0.2148, + "step": 15117 + }, + { + "epoch": 0.87, + "grad_norm": 0.42402665162699893, + "learning_rate": 8.920498878471651e-07, + "loss": 0.2931, + "step": 15118 + }, + { + "epoch": 0.87, + "grad_norm": 0.374210011293103, + "learning_rate": 8.912817512795302e-07, + "loss": 0.2996, + "step": 15119 + }, + { + "epoch": 0.87, + "grad_norm": 0.2495954819765893, + "learning_rate": 8.905139301464139e-07, + "loss": 0.1498, + "step": 15120 + }, + { + "epoch": 0.87, + "grad_norm": 1.1354208775040406, + "learning_rate": 8.897464244744103e-07, + "loss": 0.808, + "step": 15121 + }, + { + "epoch": 0.87, + "grad_norm": 0.691843031359836, + "learning_rate": 8.88979234290096e-07, + "loss": 0.347, + "step": 15122 + }, + { + "epoch": 0.87, + "grad_norm": 0.25810167310409116, + "learning_rate": 8.882123596200387e-07, + "loss": 0.2779, + "step": 15123 + }, + { + "epoch": 0.87, + "grad_norm": 0.664422180294338, + "learning_rate": 8.874458004907971e-07, + "loss": 0.2791, + "step": 15124 + }, + { + "epoch": 0.87, + "grad_norm": 0.5426279817665275, + "learning_rate": 8.866795569289122e-07, + "loss": 0.266, + "step": 15125 + }, + { + "epoch": 0.87, + "grad_norm": 0.31902793423548365, + "learning_rate": 8.859136289609272e-07, + "loss": 0.2559, + "step": 15126 + }, + { + "epoch": 0.87, + "grad_norm": 0.3112222568220345, + "learning_rate": 8.851480166133586e-07, + "loss": 0.2472, + "step": 15127 + }, + { + "epoch": 0.87, + "grad_norm": 0.2615724470027559, + "learning_rate": 8.843827199127208e-07, + "loss": 0.2101, + "step": 15128 + }, + { + "epoch": 0.87, + "grad_norm": 0.541148258436055, + "learning_rate": 8.836177388855183e-07, + "loss": 0.3172, + "step": 15129 + }, + { + "epoch": 0.87, + "grad_norm": 0.4333991198695846, + "learning_rate": 8.82853073558243e-07, + "loss": 0.2728, + "step": 15130 + }, + { + "epoch": 0.87, + "grad_norm": 0.3145467505444975, + "learning_rate": 8.820887239573728e-07, + "loss": 0.2714, + "step": 15131 + }, + { + "epoch": 0.87, + "grad_norm": 0.5162011923967506, + "learning_rate": 8.813246901093763e-07, + "loss": 0.3445, + "step": 15132 + }, + { + "epoch": 0.87, + "grad_norm": 0.1839520405237534, + "learning_rate": 8.80560972040716e-07, + "loss": 0.123, + "step": 15133 + }, + { + "epoch": 0.87, + "grad_norm": 0.5364721309908936, + "learning_rate": 8.797975697778361e-07, + "loss": 0.3405, + "step": 15134 + }, + { + "epoch": 0.87, + "grad_norm": 0.2625779164027942, + "learning_rate": 8.790344833471753e-07, + "loss": 0.2696, + "step": 15135 + }, + { + "epoch": 0.87, + "grad_norm": 0.7388355872405135, + "learning_rate": 8.782717127751572e-07, + "loss": 0.3782, + "step": 15136 + }, + { + "epoch": 0.87, + "grad_norm": 0.7507920836315224, + "learning_rate": 8.775092580881961e-07, + "loss": 0.1164, + "step": 15137 + }, + { + "epoch": 0.87, + "grad_norm": 0.3979987615357961, + "learning_rate": 8.767471193126987e-07, + "loss": 0.2978, + "step": 15138 + }, + { + "epoch": 0.87, + "grad_norm": 0.3314991768932052, + "learning_rate": 8.75985296475057e-07, + "loss": 0.2987, + "step": 15139 + }, + { + "epoch": 0.87, + "grad_norm": 0.3167246892166629, + "learning_rate": 8.752237896016513e-07, + "loss": 0.1281, + "step": 15140 + }, + { + "epoch": 0.87, + "grad_norm": 0.37153197907359864, + "learning_rate": 8.744625987188516e-07, + "loss": 0.3026, + "step": 15141 + }, + { + "epoch": 0.87, + "grad_norm": 0.50583112778751, + "learning_rate": 8.737017238530221e-07, + "loss": 0.2881, + "step": 15142 + }, + { + "epoch": 0.87, + "grad_norm": 0.30099834150968574, + "learning_rate": 8.729411650305086e-07, + "loss": 0.2031, + "step": 15143 + }, + { + "epoch": 0.87, + "grad_norm": 0.40016934367057977, + "learning_rate": 8.721809222776512e-07, + "loss": 0.3229, + "step": 15144 + }, + { + "epoch": 0.87, + "grad_norm": 1.1087364033476483, + "learning_rate": 8.71420995620773e-07, + "loss": 0.7046, + "step": 15145 + }, + { + "epoch": 0.87, + "grad_norm": 0.24438582944496376, + "learning_rate": 8.706613850861955e-07, + "loss": 0.1804, + "step": 15146 + }, + { + "epoch": 0.87, + "grad_norm": 0.2983420842661307, + "learning_rate": 8.69902090700222e-07, + "loss": 0.2841, + "step": 15147 + }, + { + "epoch": 0.87, + "grad_norm": 0.47140707285888844, + "learning_rate": 8.691431124891458e-07, + "loss": 0.259, + "step": 15148 + }, + { + "epoch": 0.87, + "grad_norm": 0.5977151144145306, + "learning_rate": 8.683844504792516e-07, + "loss": 0.2461, + "step": 15149 + }, + { + "epoch": 0.87, + "grad_norm": 0.3293070448411575, + "learning_rate": 8.676261046968082e-07, + "loss": 0.2188, + "step": 15150 + }, + { + "epoch": 0.87, + "grad_norm": 0.35148732981494435, + "learning_rate": 8.668680751680836e-07, + "loss": 0.3271, + "step": 15151 + }, + { + "epoch": 0.87, + "grad_norm": 0.5480960876299462, + "learning_rate": 8.661103619193235e-07, + "loss": 0.3355, + "step": 15152 + }, + { + "epoch": 0.87, + "grad_norm": 0.3996302396384634, + "learning_rate": 8.653529649767689e-07, + "loss": 0.2401, + "step": 15153 + }, + { + "epoch": 0.87, + "grad_norm": 0.28270246411719746, + "learning_rate": 8.64595884366648e-07, + "loss": 0.194, + "step": 15154 + }, + { + "epoch": 0.87, + "grad_norm": 0.453050144014881, + "learning_rate": 8.638391201151786e-07, + "loss": 0.2769, + "step": 15155 + }, + { + "epoch": 0.87, + "grad_norm": 0.29278379222233086, + "learning_rate": 8.630826722485686e-07, + "loss": 0.1997, + "step": 15156 + }, + { + "epoch": 0.87, + "grad_norm": 0.6741223471885249, + "learning_rate": 8.623265407930126e-07, + "loss": 0.4103, + "step": 15157 + }, + { + "epoch": 0.87, + "grad_norm": 0.46914591529300426, + "learning_rate": 8.615707257746942e-07, + "loss": 0.3219, + "step": 15158 + }, + { + "epoch": 0.87, + "grad_norm": 0.2827772048682046, + "learning_rate": 8.608152272197901e-07, + "loss": 0.2253, + "step": 15159 + }, + { + "epoch": 0.87, + "grad_norm": 0.31540560824285985, + "learning_rate": 8.600600451544638e-07, + "loss": 0.1769, + "step": 15160 + }, + { + "epoch": 0.87, + "grad_norm": 1.0518770248703995, + "learning_rate": 8.593051796048623e-07, + "loss": 0.3286, + "step": 15161 + }, + { + "epoch": 0.87, + "grad_norm": 0.3572068024893413, + "learning_rate": 8.58550630597128e-07, + "loss": 0.2964, + "step": 15162 + }, + { + "epoch": 0.87, + "grad_norm": 0.30446816276194233, + "learning_rate": 8.577963981573944e-07, + "loss": 0.238, + "step": 15163 + }, + { + "epoch": 0.87, + "grad_norm": 0.5673975171436122, + "learning_rate": 8.570424823117785e-07, + "loss": 0.3419, + "step": 15164 + }, + { + "epoch": 0.87, + "grad_norm": 0.38060387474042917, + "learning_rate": 8.56288883086388e-07, + "loss": 0.2754, + "step": 15165 + }, + { + "epoch": 0.87, + "grad_norm": 0.19743050333583234, + "learning_rate": 8.55535600507319e-07, + "loss": 0.1249, + "step": 15166 + }, + { + "epoch": 0.87, + "grad_norm": 0.4663590562791462, + "learning_rate": 8.547826346006594e-07, + "loss": 0.2865, + "step": 15167 + }, + { + "epoch": 0.87, + "grad_norm": 0.3707588567212866, + "learning_rate": 8.540299853924849e-07, + "loss": 0.3017, + "step": 15168 + }, + { + "epoch": 0.87, + "grad_norm": 0.7253472188843394, + "learning_rate": 8.532776529088582e-07, + "loss": 0.304, + "step": 15169 + }, + { + "epoch": 0.87, + "grad_norm": 0.27858164729369395, + "learning_rate": 8.525256371758317e-07, + "loss": 0.2592, + "step": 15170 + }, + { + "epoch": 0.87, + "grad_norm": 0.38438389855409005, + "learning_rate": 8.517739382194512e-07, + "loss": 0.2976, + "step": 15171 + }, + { + "epoch": 0.87, + "grad_norm": 0.35220266077902224, + "learning_rate": 8.510225560657459e-07, + "loss": 0.0923, + "step": 15172 + }, + { + "epoch": 0.87, + "grad_norm": 1.3201247021161922, + "learning_rate": 8.50271490740735e-07, + "loss": 0.5593, + "step": 15173 + }, + { + "epoch": 0.87, + "grad_norm": 0.32291369852792134, + "learning_rate": 8.495207422704299e-07, + "loss": 0.2695, + "step": 15174 + }, + { + "epoch": 0.87, + "grad_norm": 0.34051148799422554, + "learning_rate": 8.487703106808254e-07, + "loss": 0.2965, + "step": 15175 + }, + { + "epoch": 0.87, + "grad_norm": 1.5064423885781546, + "learning_rate": 8.480201959979139e-07, + "loss": 0.3937, + "step": 15176 + }, + { + "epoch": 0.87, + "grad_norm": 0.31970419618362195, + "learning_rate": 8.472703982476694e-07, + "loss": 0.2563, + "step": 15177 + }, + { + "epoch": 0.87, + "grad_norm": 0.23113554443336126, + "learning_rate": 8.465209174560574e-07, + "loss": 0.1821, + "step": 15178 + }, + { + "epoch": 0.87, + "grad_norm": 0.4125928845847924, + "learning_rate": 8.457717536490307e-07, + "loss": 0.2227, + "step": 15179 + }, + { + "epoch": 0.87, + "grad_norm": 0.33512692119389803, + "learning_rate": 8.450229068525351e-07, + "loss": 0.2999, + "step": 15180 + }, + { + "epoch": 0.87, + "grad_norm": 0.612047740136571, + "learning_rate": 8.442743770925044e-07, + "loss": 0.3659, + "step": 15181 + }, + { + "epoch": 0.87, + "grad_norm": 0.32224924038452785, + "learning_rate": 8.435261643948567e-07, + "loss": 0.2515, + "step": 15182 + }, + { + "epoch": 0.87, + "grad_norm": 0.4883487465666448, + "learning_rate": 8.427782687855035e-07, + "loss": 0.264, + "step": 15183 + }, + { + "epoch": 0.87, + "grad_norm": 0.5277231776949652, + "learning_rate": 8.420306902903464e-07, + "loss": 0.217, + "step": 15184 + }, + { + "epoch": 0.87, + "grad_norm": 0.3979449991906311, + "learning_rate": 8.412834289352734e-07, + "loss": 0.171, + "step": 15185 + }, + { + "epoch": 0.87, + "grad_norm": 0.36033874060148274, + "learning_rate": 8.405364847461606e-07, + "loss": 0.2843, + "step": 15186 + }, + { + "epoch": 0.87, + "grad_norm": 0.3147674767160009, + "learning_rate": 8.397898577488739e-07, + "loss": 0.3099, + "step": 15187 + }, + { + "epoch": 0.87, + "grad_norm": 1.3165994656358568, + "learning_rate": 8.390435479692726e-07, + "loss": 0.7248, + "step": 15188 + }, + { + "epoch": 0.87, + "grad_norm": 0.2795136011073454, + "learning_rate": 8.382975554331985e-07, + "loss": 0.1823, + "step": 15189 + }, + { + "epoch": 0.87, + "grad_norm": 0.33718281834093666, + "learning_rate": 8.375518801664873e-07, + "loss": 0.2743, + "step": 15190 + }, + { + "epoch": 0.87, + "grad_norm": 0.31695562935276533, + "learning_rate": 8.368065221949595e-07, + "loss": 0.2208, + "step": 15191 + }, + { + "epoch": 0.87, + "grad_norm": 0.3135609234920389, + "learning_rate": 8.360614815444268e-07, + "loss": 0.2054, + "step": 15192 + }, + { + "epoch": 0.87, + "grad_norm": 0.9047987658050868, + "learning_rate": 8.353167582406918e-07, + "loss": 0.4717, + "step": 15193 + }, + { + "epoch": 0.87, + "grad_norm": 0.32879063579291573, + "learning_rate": 8.345723523095462e-07, + "loss": 0.2928, + "step": 15194 + }, + { + "epoch": 0.87, + "grad_norm": 0.33331989963810826, + "learning_rate": 8.338282637767614e-07, + "loss": 0.1939, + "step": 15195 + }, + { + "epoch": 0.87, + "grad_norm": 0.4331013516301031, + "learning_rate": 8.330844926681114e-07, + "loss": 0.2289, + "step": 15196 + }, + { + "epoch": 0.87, + "grad_norm": 0.4826586936930948, + "learning_rate": 8.323410390093523e-07, + "loss": 0.2577, + "step": 15197 + }, + { + "epoch": 0.87, + "grad_norm": 0.20794647329271113, + "learning_rate": 8.315979028262277e-07, + "loss": 0.2144, + "step": 15198 + }, + { + "epoch": 0.87, + "grad_norm": 0.9167179052248758, + "learning_rate": 8.308550841444718e-07, + "loss": 0.493, + "step": 15199 + }, + { + "epoch": 0.87, + "grad_norm": 0.9684662978300949, + "learning_rate": 8.301125829898126e-07, + "loss": 0.6385, + "step": 15200 + }, + { + "epoch": 0.87, + "grad_norm": 0.31667062423994347, + "learning_rate": 8.293703993879599e-07, + "loss": 0.262, + "step": 15201 + }, + { + "epoch": 0.87, + "grad_norm": 0.3174213408244066, + "learning_rate": 8.28628533364616e-07, + "loss": 0.2297, + "step": 15202 + }, + { + "epoch": 0.87, + "grad_norm": 0.6367147811096763, + "learning_rate": 8.278869849454718e-07, + "loss": 0.3327, + "step": 15203 + }, + { + "epoch": 0.87, + "grad_norm": 0.41740650979949356, + "learning_rate": 8.271457541562045e-07, + "loss": 0.2915, + "step": 15204 + }, + { + "epoch": 0.87, + "grad_norm": 0.19492825352143867, + "learning_rate": 8.264048410224879e-07, + "loss": 0.15, + "step": 15205 + }, + { + "epoch": 0.87, + "grad_norm": 0.34968830067182716, + "learning_rate": 8.25664245569976e-07, + "loss": 0.2954, + "step": 15206 + }, + { + "epoch": 0.87, + "grad_norm": 0.38126528864990045, + "learning_rate": 8.249239678243171e-07, + "loss": 0.2521, + "step": 15207 + }, + { + "epoch": 0.87, + "grad_norm": 0.4991098352243059, + "learning_rate": 8.241840078111452e-07, + "loss": 0.2371, + "step": 15208 + }, + { + "epoch": 0.87, + "grad_norm": 0.7523629811440098, + "learning_rate": 8.234443655560886e-07, + "loss": 0.3825, + "step": 15209 + }, + { + "epoch": 0.87, + "grad_norm": 0.27570811560535324, + "learning_rate": 8.227050410847592e-07, + "loss": 0.268, + "step": 15210 + }, + { + "epoch": 0.87, + "grad_norm": 0.23902070137790352, + "learning_rate": 8.219660344227587e-07, + "loss": 0.2191, + "step": 15211 + }, + { + "epoch": 0.87, + "grad_norm": 1.005631527918455, + "learning_rate": 8.212273455956787e-07, + "loss": 0.3593, + "step": 15212 + }, + { + "epoch": 0.87, + "grad_norm": 0.31857938219013965, + "learning_rate": 8.204889746291022e-07, + "loss": 0.2688, + "step": 15213 + }, + { + "epoch": 0.87, + "grad_norm": 0.38049878062907533, + "learning_rate": 8.197509215485988e-07, + "loss": 0.2892, + "step": 15214 + }, + { + "epoch": 0.87, + "grad_norm": 0.7259267300163725, + "learning_rate": 8.190131863797246e-07, + "loss": 0.2979, + "step": 15215 + }, + { + "epoch": 0.87, + "grad_norm": 0.3431148587829527, + "learning_rate": 8.182757691480303e-07, + "loss": 0.2404, + "step": 15216 + }, + { + "epoch": 0.87, + "grad_norm": 0.4465896566952469, + "learning_rate": 8.175386698790489e-07, + "loss": 0.3118, + "step": 15217 + }, + { + "epoch": 0.87, + "grad_norm": 0.27790041083033745, + "learning_rate": 8.168018885983109e-07, + "loss": 0.2146, + "step": 15218 + }, + { + "epoch": 0.87, + "grad_norm": 0.3068585203990971, + "learning_rate": 8.160654253313282e-07, + "loss": 0.2547, + "step": 15219 + }, + { + "epoch": 0.87, + "grad_norm": 0.7012299019399189, + "learning_rate": 8.15329280103605e-07, + "loss": 0.3996, + "step": 15220 + }, + { + "epoch": 0.87, + "grad_norm": 0.3526300473721663, + "learning_rate": 8.145934529406319e-07, + "loss": 0.2639, + "step": 15221 + }, + { + "epoch": 0.87, + "grad_norm": 0.4329912494767619, + "learning_rate": 8.13857943867894e-07, + "loss": 0.275, + "step": 15222 + }, + { + "epoch": 0.87, + "grad_norm": 0.5705532148120219, + "learning_rate": 8.13122752910861e-07, + "loss": 0.3568, + "step": 15223 + }, + { + "epoch": 0.87, + "grad_norm": 0.29636431949967834, + "learning_rate": 8.123878800949914e-07, + "loss": 0.2048, + "step": 15224 + }, + { + "epoch": 0.87, + "grad_norm": 0.35704937882642657, + "learning_rate": 8.116533254457337e-07, + "loss": 0.186, + "step": 15225 + }, + { + "epoch": 0.87, + "grad_norm": 0.3023244294858326, + "learning_rate": 8.109190889885277e-07, + "loss": 0.2889, + "step": 15226 + }, + { + "epoch": 0.87, + "grad_norm": 1.1384083091604984, + "learning_rate": 8.101851707487973e-07, + "loss": 0.4531, + "step": 15227 + }, + { + "epoch": 0.87, + "grad_norm": 0.36463495630098414, + "learning_rate": 8.094515707519623e-07, + "loss": 0.1598, + "step": 15228 + }, + { + "epoch": 0.87, + "grad_norm": 0.544002008754593, + "learning_rate": 8.087182890234202e-07, + "loss": 0.3947, + "step": 15229 + }, + { + "epoch": 0.88, + "grad_norm": 0.25996577539505555, + "learning_rate": 8.079853255885705e-07, + "loss": 0.2442, + "step": 15230 + }, + { + "epoch": 0.88, + "grad_norm": 0.22928359992702024, + "learning_rate": 8.072526804727943e-07, + "loss": 0.1601, + "step": 15231 + }, + { + "epoch": 0.88, + "grad_norm": 0.548664404540612, + "learning_rate": 8.06520353701461e-07, + "loss": 0.369, + "step": 15232 + }, + { + "epoch": 0.88, + "grad_norm": 1.4342997506272013, + "learning_rate": 8.057883452999316e-07, + "loss": 0.5444, + "step": 15233 + }, + { + "epoch": 0.88, + "grad_norm": 0.22084093260537044, + "learning_rate": 8.050566552935579e-07, + "loss": 0.2115, + "step": 15234 + }, + { + "epoch": 0.88, + "grad_norm": 1.2855665998775354, + "learning_rate": 8.043252837076776e-07, + "loss": 0.6016, + "step": 15235 + }, + { + "epoch": 0.88, + "grad_norm": 0.43839777602461616, + "learning_rate": 8.03594230567617e-07, + "loss": 0.3084, + "step": 15236 + }, + { + "epoch": 0.88, + "grad_norm": 0.3919309008857394, + "learning_rate": 8.028634958986903e-07, + "loss": 0.271, + "step": 15237 + }, + { + "epoch": 0.88, + "grad_norm": 0.2905695696643427, + "learning_rate": 8.021330797262072e-07, + "loss": 0.2399, + "step": 15238 + }, + { + "epoch": 0.88, + "grad_norm": 0.5220398686554123, + "learning_rate": 8.01402982075461e-07, + "loss": 0.2565, + "step": 15239 + }, + { + "epoch": 0.88, + "grad_norm": 0.5903199356384773, + "learning_rate": 8.006732029717335e-07, + "loss": 0.2823, + "step": 15240 + }, + { + "epoch": 0.88, + "grad_norm": 0.3598694129624296, + "learning_rate": 7.999437424402967e-07, + "loss": 0.2532, + "step": 15241 + }, + { + "epoch": 0.88, + "grad_norm": 0.33400510821115537, + "learning_rate": 7.992146005064105e-07, + "loss": 0.3282, + "step": 15242 + }, + { + "epoch": 0.88, + "grad_norm": 0.5293015346627737, + "learning_rate": 7.984857771953303e-07, + "loss": 0.2665, + "step": 15243 + }, + { + "epoch": 0.88, + "grad_norm": 0.21466042777465733, + "learning_rate": 7.977572725322913e-07, + "loss": 0.1591, + "step": 15244 + }, + { + "epoch": 0.88, + "grad_norm": 0.4726424277010858, + "learning_rate": 7.970290865425212e-07, + "loss": 0.3258, + "step": 15245 + }, + { + "epoch": 0.88, + "grad_norm": 0.4202027933289129, + "learning_rate": 7.963012192512376e-07, + "loss": 0.2639, + "step": 15246 + }, + { + "epoch": 0.88, + "grad_norm": 0.3945913782257165, + "learning_rate": 7.955736706836481e-07, + "loss": 0.2548, + "step": 15247 + }, + { + "epoch": 0.88, + "grad_norm": 0.6633476242535613, + "learning_rate": 7.94846440864947e-07, + "loss": 0.3651, + "step": 15248 + }, + { + "epoch": 0.88, + "grad_norm": 0.3693773855404697, + "learning_rate": 7.941195298203175e-07, + "loss": 0.2616, + "step": 15249 + }, + { + "epoch": 0.88, + "grad_norm": 0.260791519573847, + "learning_rate": 7.933929375749317e-07, + "loss": 0.2281, + "step": 15250 + }, + { + "epoch": 0.88, + "grad_norm": 0.5155000696175669, + "learning_rate": 7.926666641539538e-07, + "loss": 0.1134, + "step": 15251 + }, + { + "epoch": 0.88, + "grad_norm": 0.3980620444030963, + "learning_rate": 7.919407095825337e-07, + "loss": 0.2641, + "step": 15252 + }, + { + "epoch": 0.88, + "grad_norm": 0.5403914162100106, + "learning_rate": 7.912150738858104e-07, + "loss": 0.3727, + "step": 15253 + }, + { + "epoch": 0.88, + "grad_norm": 0.3470276410754074, + "learning_rate": 7.904897570889136e-07, + "loss": 0.2602, + "step": 15254 + }, + { + "epoch": 0.88, + "grad_norm": 0.3700558072591282, + "learning_rate": 7.897647592169578e-07, + "loss": 0.2559, + "step": 15255 + }, + { + "epoch": 0.88, + "grad_norm": 0.531602700604024, + "learning_rate": 7.890400802950548e-07, + "loss": 0.3307, + "step": 15256 + }, + { + "epoch": 0.88, + "grad_norm": 0.25900870477851295, + "learning_rate": 7.883157203482982e-07, + "loss": 0.1727, + "step": 15257 + }, + { + "epoch": 0.88, + "grad_norm": 0.5904195157887109, + "learning_rate": 7.875916794017713e-07, + "loss": 0.2851, + "step": 15258 + }, + { + "epoch": 0.88, + "grad_norm": 0.38738009696102815, + "learning_rate": 7.868679574805472e-07, + "loss": 0.3192, + "step": 15259 + }, + { + "epoch": 0.88, + "grad_norm": 0.6042547956260029, + "learning_rate": 7.861445546096902e-07, + "loss": 0.3113, + "step": 15260 + }, + { + "epoch": 0.88, + "grad_norm": 0.29172509317629647, + "learning_rate": 7.854214708142538e-07, + "loss": 0.2586, + "step": 15261 + }, + { + "epoch": 0.88, + "grad_norm": 0.3643435241482438, + "learning_rate": 7.846987061192723e-07, + "loss": 0.2565, + "step": 15262 + }, + { + "epoch": 0.88, + "grad_norm": 0.3800601781197826, + "learning_rate": 7.839762605497791e-07, + "loss": 0.1566, + "step": 15263 + }, + { + "epoch": 0.88, + "grad_norm": 0.30248056133318446, + "learning_rate": 7.83254134130793e-07, + "loss": 0.1106, + "step": 15264 + }, + { + "epoch": 0.88, + "grad_norm": 0.3365648909541551, + "learning_rate": 7.825323268873187e-07, + "loss": 0.2817, + "step": 15265 + }, + { + "epoch": 0.88, + "grad_norm": 0.4406402840444798, + "learning_rate": 7.818108388443546e-07, + "loss": 0.3342, + "step": 15266 + }, + { + "epoch": 0.88, + "grad_norm": 0.7090457854227306, + "learning_rate": 7.810896700268822e-07, + "loss": 0.223, + "step": 15267 + }, + { + "epoch": 0.88, + "grad_norm": 0.2873213588453266, + "learning_rate": 7.803688204598803e-07, + "loss": 0.2282, + "step": 15268 + }, + { + "epoch": 0.88, + "grad_norm": 0.36227730773662065, + "learning_rate": 7.796482901683089e-07, + "loss": 0.2392, + "step": 15269 + }, + { + "epoch": 0.88, + "grad_norm": 0.3593253101252005, + "learning_rate": 7.789280791771214e-07, + "loss": 0.1979, + "step": 15270 + }, + { + "epoch": 0.88, + "grad_norm": 0.3621820947758829, + "learning_rate": 7.782081875112568e-07, + "loss": 0.2827, + "step": 15271 + }, + { + "epoch": 0.88, + "grad_norm": 0.6322813462372843, + "learning_rate": 7.774886151956473e-07, + "loss": 0.3907, + "step": 15272 + }, + { + "epoch": 0.88, + "grad_norm": 0.2985759612155513, + "learning_rate": 7.767693622552097e-07, + "loss": 0.2031, + "step": 15273 + }, + { + "epoch": 0.88, + "grad_norm": 0.40022176584749375, + "learning_rate": 7.76050428714854e-07, + "loss": 0.2846, + "step": 15274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3104592255829074, + "learning_rate": 7.753318145994727e-07, + "loss": 0.1448, + "step": 15275 + }, + { + "epoch": 0.88, + "grad_norm": 0.49120492596421095, + "learning_rate": 7.746135199339556e-07, + "loss": 0.3065, + "step": 15276 + }, + { + "epoch": 0.88, + "grad_norm": 0.28632281845942664, + "learning_rate": 7.738955447431762e-07, + "loss": 0.2348, + "step": 15277 + }, + { + "epoch": 0.88, + "grad_norm": 0.4665069876855719, + "learning_rate": 7.731778890519969e-07, + "loss": 0.3292, + "step": 15278 + }, + { + "epoch": 0.88, + "grad_norm": 1.4626098353168415, + "learning_rate": 7.724605528852702e-07, + "loss": 0.5868, + "step": 15279 + }, + { + "epoch": 0.88, + "grad_norm": 0.35047118850359527, + "learning_rate": 7.717435362678361e-07, + "loss": 0.1945, + "step": 15280 + }, + { + "epoch": 0.88, + "grad_norm": 0.23788708299943856, + "learning_rate": 7.71026839224529e-07, + "loss": 0.2045, + "step": 15281 + }, + { + "epoch": 0.88, + "grad_norm": 0.6010442606371211, + "learning_rate": 7.703104617801649e-07, + "loss": 0.3379, + "step": 15282 + }, + { + "epoch": 0.88, + "grad_norm": 0.32129184135251976, + "learning_rate": 7.695944039595526e-07, + "loss": 0.2314, + "step": 15283 + }, + { + "epoch": 0.88, + "grad_norm": 1.2358162206069345, + "learning_rate": 7.688786657874881e-07, + "loss": 0.757, + "step": 15284 + }, + { + "epoch": 0.88, + "grad_norm": 0.302450566459852, + "learning_rate": 7.681632472887601e-07, + "loss": 0.2894, + "step": 15285 + }, + { + "epoch": 0.88, + "grad_norm": 0.36525775072758804, + "learning_rate": 7.674481484881413e-07, + "loss": 0.2433, + "step": 15286 + }, + { + "epoch": 0.88, + "grad_norm": 0.12990322688717965, + "learning_rate": 7.667333694103962e-07, + "loss": 0.07, + "step": 15287 + }, + { + "epoch": 0.88, + "grad_norm": 0.530598155745847, + "learning_rate": 7.660189100802762e-07, + "loss": 0.3354, + "step": 15288 + }, + { + "epoch": 0.88, + "grad_norm": 0.26335616434420217, + "learning_rate": 7.653047705225258e-07, + "loss": 0.2555, + "step": 15289 + }, + { + "epoch": 0.88, + "grad_norm": 0.4268927274168955, + "learning_rate": 7.645909507618732e-07, + "loss": 0.2675, + "step": 15290 + }, + { + "epoch": 0.88, + "grad_norm": 0.7647099515322016, + "learning_rate": 7.638774508230395e-07, + "loss": 0.4027, + "step": 15291 + }, + { + "epoch": 0.88, + "grad_norm": 0.3398062115060478, + "learning_rate": 7.631642707307319e-07, + "loss": 0.2611, + "step": 15292 + }, + { + "epoch": 0.88, + "grad_norm": 0.2679660624561897, + "learning_rate": 7.624514105096492e-07, + "loss": 0.2, + "step": 15293 + }, + { + "epoch": 0.88, + "grad_norm": 0.6026562844344492, + "learning_rate": 7.617388701844764e-07, + "loss": 0.3465, + "step": 15294 + }, + { + "epoch": 0.88, + "grad_norm": 0.25505961167579344, + "learning_rate": 7.610266497798913e-07, + "loss": 0.2145, + "step": 15295 + }, + { + "epoch": 0.88, + "grad_norm": 1.200358480855169, + "learning_rate": 7.603147493205531e-07, + "loss": 0.3905, + "step": 15296 + }, + { + "epoch": 0.88, + "grad_norm": 0.3565559048959195, + "learning_rate": 7.59603168831119e-07, + "loss": 0.2969, + "step": 15297 + }, + { + "epoch": 0.88, + "grad_norm": 0.3149087956944596, + "learning_rate": 7.588919083362301e-07, + "loss": 0.2396, + "step": 15298 + }, + { + "epoch": 0.88, + "grad_norm": 0.6796516619216567, + "learning_rate": 7.581809678605167e-07, + "loss": 0.3689, + "step": 15299 + }, + { + "epoch": 0.88, + "grad_norm": 0.34306570273858433, + "learning_rate": 7.574703474285971e-07, + "loss": 0.2018, + "step": 15300 + }, + { + "epoch": 0.88, + "grad_norm": 0.253075691212108, + "learning_rate": 7.567600470650849e-07, + "loss": 0.2466, + "step": 15301 + }, + { + "epoch": 0.88, + "grad_norm": 0.41639118533895647, + "learning_rate": 7.560500667945736e-07, + "loss": 0.2633, + "step": 15302 + }, + { + "epoch": 0.88, + "grad_norm": 0.9729597607745787, + "learning_rate": 7.553404066416514e-07, + "loss": 0.2259, + "step": 15303 + }, + { + "epoch": 0.88, + "grad_norm": 0.36267534769957355, + "learning_rate": 7.546310666308909e-07, + "loss": 0.2609, + "step": 15304 + }, + { + "epoch": 0.88, + "grad_norm": 0.3762109035053571, + "learning_rate": 7.539220467868613e-07, + "loss": 0.2912, + "step": 15305 + }, + { + "epoch": 0.88, + "grad_norm": 0.37831387911988446, + "learning_rate": 7.532133471341141e-07, + "loss": 0.1914, + "step": 15306 + }, + { + "epoch": 0.88, + "grad_norm": 0.2679166493911752, + "learning_rate": 7.525049676971907e-07, + "loss": 0.2282, + "step": 15307 + }, + { + "epoch": 0.88, + "grad_norm": 1.1468120655160323, + "learning_rate": 7.517969085006227e-07, + "loss": 0.7836, + "step": 15308 + }, + { + "epoch": 0.88, + "grad_norm": 0.3255850537957562, + "learning_rate": 7.510891695689282e-07, + "loss": 0.2195, + "step": 15309 + }, + { + "epoch": 0.88, + "grad_norm": 0.35726564990652665, + "learning_rate": 7.503817509266198e-07, + "loss": 0.2898, + "step": 15310 + }, + { + "epoch": 0.88, + "grad_norm": 0.6503155915878299, + "learning_rate": 7.496746525981935e-07, + "loss": 0.3677, + "step": 15311 + }, + { + "epoch": 0.88, + "grad_norm": 0.37840816000222754, + "learning_rate": 7.489678746081364e-07, + "loss": 0.2404, + "step": 15312 + }, + { + "epoch": 0.88, + "grad_norm": 0.29412005921689116, + "learning_rate": 7.482614169809222e-07, + "loss": 0.1814, + "step": 15313 + }, + { + "epoch": 0.88, + "grad_norm": 0.4733546443356241, + "learning_rate": 7.475552797410191e-07, + "loss": 0.396, + "step": 15314 + }, + { + "epoch": 0.88, + "grad_norm": 0.44759966457147815, + "learning_rate": 7.468494629128786e-07, + "loss": 0.2897, + "step": 15315 + }, + { + "epoch": 0.88, + "grad_norm": 0.32367457949655865, + "learning_rate": 7.461439665209435e-07, + "loss": 0.2157, + "step": 15316 + }, + { + "epoch": 0.88, + "grad_norm": 0.4080924876089095, + "learning_rate": 7.454387905896432e-07, + "loss": 0.306, + "step": 15317 + }, + { + "epoch": 0.88, + "grad_norm": 1.2171090379852552, + "learning_rate": 7.447339351434013e-07, + "loss": 0.4721, + "step": 15318 + }, + { + "epoch": 0.88, + "grad_norm": 0.2858970690734332, + "learning_rate": 7.44029400206625e-07, + "loss": 0.1938, + "step": 15319 + }, + { + "epoch": 0.88, + "grad_norm": 1.2425084966006583, + "learning_rate": 7.433251858037127e-07, + "loss": 0.7118, + "step": 15320 + }, + { + "epoch": 0.88, + "grad_norm": 0.2196747731628239, + "learning_rate": 7.426212919590503e-07, + "loss": 0.2056, + "step": 15321 + }, + { + "epoch": 0.88, + "grad_norm": 0.29882954862740696, + "learning_rate": 7.419177186970139e-07, + "loss": 0.2113, + "step": 15322 + }, + { + "epoch": 0.88, + "grad_norm": 0.8049704432913872, + "learning_rate": 7.412144660419706e-07, + "loss": 0.4391, + "step": 15323 + }, + { + "epoch": 0.88, + "grad_norm": 0.5968983898094012, + "learning_rate": 7.405115340182723e-07, + "loss": 0.3162, + "step": 15324 + }, + { + "epoch": 0.88, + "grad_norm": 0.28930754291418537, + "learning_rate": 7.398089226502603e-07, + "loss": 0.2629, + "step": 15325 + }, + { + "epoch": 0.88, + "grad_norm": 1.1015836359844582, + "learning_rate": 7.391066319622664e-07, + "loss": 0.5367, + "step": 15326 + }, + { + "epoch": 0.88, + "grad_norm": 0.3365458212983297, + "learning_rate": 7.384046619786123e-07, + "loss": 0.2162, + "step": 15327 + }, + { + "epoch": 0.88, + "grad_norm": 0.44176905324324517, + "learning_rate": 7.377030127236073e-07, + "loss": 0.2576, + "step": 15328 + }, + { + "epoch": 0.88, + "grad_norm": 0.2811618795819501, + "learning_rate": 7.370016842215488e-07, + "loss": 0.2339, + "step": 15329 + }, + { + "epoch": 0.88, + "grad_norm": 1.281599200325273, + "learning_rate": 7.363006764967228e-07, + "loss": 0.6169, + "step": 15330 + }, + { + "epoch": 0.88, + "grad_norm": 0.3883951981613511, + "learning_rate": 7.355999895734067e-07, + "loss": 0.2458, + "step": 15331 + }, + { + "epoch": 0.88, + "grad_norm": 0.35906664183412457, + "learning_rate": 7.348996234758643e-07, + "loss": 0.2571, + "step": 15332 + }, + { + "epoch": 0.88, + "grad_norm": 0.4223644010226172, + "learning_rate": 7.341995782283506e-07, + "loss": 0.2908, + "step": 15333 + }, + { + "epoch": 0.88, + "grad_norm": 0.39088380406576523, + "learning_rate": 7.334998538551042e-07, + "loss": 0.273, + "step": 15334 + }, + { + "epoch": 0.88, + "grad_norm": 0.3214635287901621, + "learning_rate": 7.328004503803609e-07, + "loss": 0.1508, + "step": 15335 + }, + { + "epoch": 0.88, + "grad_norm": 0.5600623509613721, + "learning_rate": 7.321013678283407e-07, + "loss": 0.3081, + "step": 15336 + }, + { + "epoch": 0.88, + "grad_norm": 0.272040944032163, + "learning_rate": 7.314026062232504e-07, + "loss": 0.244, + "step": 15337 + }, + { + "epoch": 0.88, + "grad_norm": 1.168178953668496, + "learning_rate": 7.307041655892877e-07, + "loss": 0.7761, + "step": 15338 + }, + { + "epoch": 0.88, + "grad_norm": 0.5828528196274684, + "learning_rate": 7.300060459506431e-07, + "loss": 0.3033, + "step": 15339 + }, + { + "epoch": 0.88, + "grad_norm": 0.25286178191941955, + "learning_rate": 7.293082473314905e-07, + "loss": 0.2107, + "step": 15340 + }, + { + "epoch": 0.88, + "grad_norm": 0.2580447143807208, + "learning_rate": 7.286107697559952e-07, + "loss": 0.2347, + "step": 15341 + }, + { + "epoch": 0.88, + "grad_norm": 1.3155362942915954, + "learning_rate": 7.279136132483078e-07, + "loss": 0.2337, + "step": 15342 + }, + { + "epoch": 0.88, + "grad_norm": 0.3069915058726905, + "learning_rate": 7.272167778325756e-07, + "loss": 0.2612, + "step": 15343 + }, + { + "epoch": 0.88, + "grad_norm": 0.872607402488223, + "learning_rate": 7.265202635329272e-07, + "loss": 0.5209, + "step": 15344 + }, + { + "epoch": 0.88, + "grad_norm": 0.33572217596314796, + "learning_rate": 7.258240703734832e-07, + "loss": 0.2534, + "step": 15345 + }, + { + "epoch": 0.88, + "grad_norm": 0.36957113937852054, + "learning_rate": 7.251281983783532e-07, + "loss": 0.2541, + "step": 15346 + }, + { + "epoch": 0.88, + "grad_norm": 0.25658032069472986, + "learning_rate": 7.244326475716323e-07, + "loss": 0.1934, + "step": 15347 + }, + { + "epoch": 0.88, + "grad_norm": 0.36518152555370853, + "learning_rate": 7.237374179774125e-07, + "loss": 0.2461, + "step": 15348 + }, + { + "epoch": 0.88, + "grad_norm": 0.3860018083993852, + "learning_rate": 7.230425096197669e-07, + "loss": 0.2512, + "step": 15349 + }, + { + "epoch": 0.88, + "grad_norm": 0.4407144779168672, + "learning_rate": 7.223479225227603e-07, + "loss": 0.3225, + "step": 15350 + }, + { + "epoch": 0.88, + "grad_norm": 0.8099477334987211, + "learning_rate": 7.216536567104449e-07, + "loss": 0.4107, + "step": 15351 + }, + { + "epoch": 0.88, + "grad_norm": 0.3715733078958953, + "learning_rate": 7.209597122068657e-07, + "loss": 0.1808, + "step": 15352 + }, + { + "epoch": 0.88, + "grad_norm": 0.2045474274396735, + "learning_rate": 7.202660890360524e-07, + "loss": 0.2173, + "step": 15353 + }, + { + "epoch": 0.88, + "grad_norm": 1.600767535273116, + "learning_rate": 7.195727872220248e-07, + "loss": 0.7834, + "step": 15354 + }, + { + "epoch": 0.88, + "grad_norm": 0.2723621726814691, + "learning_rate": 7.188798067887926e-07, + "loss": 0.1995, + "step": 15355 + }, + { + "epoch": 0.88, + "grad_norm": 0.4313450825328893, + "learning_rate": 7.181871477603542e-07, + "loss": 0.3322, + "step": 15356 + }, + { + "epoch": 0.88, + "grad_norm": 0.43795447255344067, + "learning_rate": 7.174948101606949e-07, + "loss": 0.3239, + "step": 15357 + }, + { + "epoch": 0.88, + "grad_norm": 0.3489367764018725, + "learning_rate": 7.168027940137923e-07, + "loss": 0.1882, + "step": 15358 + }, + { + "epoch": 0.88, + "grad_norm": 0.2626421008762479, + "learning_rate": 7.161110993436093e-07, + "loss": 0.1515, + "step": 15359 + }, + { + "epoch": 0.88, + "grad_norm": 0.3860502345972539, + "learning_rate": 7.15419726174098e-07, + "loss": 0.2858, + "step": 15360 + }, + { + "epoch": 0.88, + "grad_norm": 0.36349650078670603, + "learning_rate": 7.147286745292049e-07, + "loss": 0.1971, + "step": 15361 + }, + { + "epoch": 0.88, + "grad_norm": 0.44592750277136384, + "learning_rate": 7.140379444328571e-07, + "loss": 0.3327, + "step": 15362 + }, + { + "epoch": 0.88, + "grad_norm": 1.0136525434025818, + "learning_rate": 7.13347535908977e-07, + "loss": 0.5487, + "step": 15363 + }, + { + "epoch": 0.88, + "grad_norm": 0.405578078064654, + "learning_rate": 7.126574489814719e-07, + "loss": 0.2968, + "step": 15364 + }, + { + "epoch": 0.88, + "grad_norm": 0.2241058466957011, + "learning_rate": 7.119676836742407e-07, + "loss": 0.1817, + "step": 15365 + }, + { + "epoch": 0.88, + "grad_norm": 0.5356592030749281, + "learning_rate": 7.112782400111684e-07, + "loss": 0.2669, + "step": 15366 + }, + { + "epoch": 0.88, + "grad_norm": 0.6030656386460477, + "learning_rate": 7.105891180161306e-07, + "loss": 0.3748, + "step": 15367 + }, + { + "epoch": 0.88, + "grad_norm": 0.24970782740649353, + "learning_rate": 7.099003177129926e-07, + "loss": 0.2436, + "step": 15368 + }, + { + "epoch": 0.88, + "grad_norm": 1.3865134035245326, + "learning_rate": 7.092118391256076e-07, + "loss": 0.6055, + "step": 15369 + }, + { + "epoch": 0.88, + "grad_norm": 0.7176124074822394, + "learning_rate": 7.085236822778174e-07, + "loss": 0.2688, + "step": 15370 + }, + { + "epoch": 0.88, + "grad_norm": 0.2343011862368419, + "learning_rate": 7.078358471934521e-07, + "loss": 0.1487, + "step": 15371 + }, + { + "epoch": 0.88, + "grad_norm": 0.378432979445646, + "learning_rate": 7.071483338963303e-07, + "loss": 0.3006, + "step": 15372 + }, + { + "epoch": 0.88, + "grad_norm": 0.48086836045064396, + "learning_rate": 7.064611424102641e-07, + "loss": 0.2961, + "step": 15373 + }, + { + "epoch": 0.88, + "grad_norm": 0.37437277656526946, + "learning_rate": 7.057742727590478e-07, + "loss": 0.3221, + "step": 15374 + }, + { + "epoch": 0.88, + "grad_norm": 0.9436427928262109, + "learning_rate": 7.050877249664701e-07, + "loss": 0.3674, + "step": 15375 + }, + { + "epoch": 0.88, + "grad_norm": 0.2964656910026224, + "learning_rate": 7.04401499056302e-07, + "loss": 0.2569, + "step": 15376 + }, + { + "epoch": 0.88, + "grad_norm": 0.3690822885989384, + "learning_rate": 7.037155950523123e-07, + "loss": 0.307, + "step": 15377 + }, + { + "epoch": 0.88, + "grad_norm": 0.26884006202752997, + "learning_rate": 7.030300129782519e-07, + "loss": 0.131, + "step": 15378 + }, + { + "epoch": 0.88, + "grad_norm": 0.40264916010642365, + "learning_rate": 7.023447528578631e-07, + "loss": 0.2621, + "step": 15379 + }, + { + "epoch": 0.88, + "grad_norm": 0.35794945612338336, + "learning_rate": 7.016598147148735e-07, + "loss": 0.2974, + "step": 15380 + }, + { + "epoch": 0.88, + "grad_norm": 0.5066463148507542, + "learning_rate": 7.009751985730062e-07, + "loss": 0.2751, + "step": 15381 + }, + { + "epoch": 0.88, + "grad_norm": 0.8604249835369419, + "learning_rate": 7.00290904455968e-07, + "loss": 0.3744, + "step": 15382 + }, + { + "epoch": 0.88, + "grad_norm": 0.29377213308048267, + "learning_rate": 6.996069323874555e-07, + "loss": 0.2061, + "step": 15383 + }, + { + "epoch": 0.88, + "grad_norm": 0.30323452832902037, + "learning_rate": 6.989232823911551e-07, + "loss": 0.2308, + "step": 15384 + }, + { + "epoch": 0.88, + "grad_norm": 0.7647519096570284, + "learning_rate": 6.982399544907403e-07, + "loss": 0.3747, + "step": 15385 + }, + { + "epoch": 0.88, + "grad_norm": 0.36798900089852343, + "learning_rate": 6.975569487098766e-07, + "loss": 0.3002, + "step": 15386 + }, + { + "epoch": 0.88, + "grad_norm": 1.3360407176215778, + "learning_rate": 6.968742650722172e-07, + "loss": 0.7616, + "step": 15387 + }, + { + "epoch": 0.88, + "grad_norm": 0.2787925854981777, + "learning_rate": 6.961919036014009e-07, + "loss": 0.2048, + "step": 15388 + }, + { + "epoch": 0.88, + "grad_norm": 0.4689798303850397, + "learning_rate": 6.955098643210578e-07, + "loss": 0.2941, + "step": 15389 + }, + { + "epoch": 0.88, + "grad_norm": 0.641487238466754, + "learning_rate": 6.94828147254809e-07, + "loss": 0.3726, + "step": 15390 + }, + { + "epoch": 0.88, + "grad_norm": 0.2461102831506282, + "learning_rate": 6.941467524262613e-07, + "loss": 0.1119, + "step": 15391 + }, + { + "epoch": 0.88, + "grad_norm": 0.25551098270018807, + "learning_rate": 6.934656798590122e-07, + "loss": 0.281, + "step": 15392 + }, + { + "epoch": 0.88, + "grad_norm": 1.1202496115317333, + "learning_rate": 6.927849295766442e-07, + "loss": 0.5997, + "step": 15393 + }, + { + "epoch": 0.88, + "grad_norm": 0.6445255900823864, + "learning_rate": 6.92104501602735e-07, + "loss": 0.0993, + "step": 15394 + }, + { + "epoch": 0.88, + "grad_norm": 0.40685012455812186, + "learning_rate": 6.91424395960848e-07, + "loss": 0.295, + "step": 15395 + }, + { + "epoch": 0.88, + "grad_norm": 0.3592962271783745, + "learning_rate": 6.907446126745332e-07, + "loss": 0.3047, + "step": 15396 + }, + { + "epoch": 0.88, + "grad_norm": 0.2749244052812137, + "learning_rate": 6.900651517673318e-07, + "loss": 0.1568, + "step": 15397 + }, + { + "epoch": 0.88, + "grad_norm": 0.6044148011731777, + "learning_rate": 6.893860132627739e-07, + "loss": 0.3646, + "step": 15398 + }, + { + "epoch": 0.88, + "grad_norm": 0.49796483369447875, + "learning_rate": 6.887071971843783e-07, + "loss": 0.2891, + "step": 15399 + }, + { + "epoch": 0.88, + "grad_norm": 0.33803796040103634, + "learning_rate": 6.880287035556521e-07, + "loss": 0.2728, + "step": 15400 + }, + { + "epoch": 0.88, + "grad_norm": 0.343782499945726, + "learning_rate": 6.873505324000895e-07, + "loss": 0.2097, + "step": 15401 + }, + { + "epoch": 0.88, + "grad_norm": 0.6284160615889932, + "learning_rate": 6.866726837411797e-07, + "loss": 0.3594, + "step": 15402 + }, + { + "epoch": 0.88, + "grad_norm": 0.3844903560072926, + "learning_rate": 6.859951576023937e-07, + "loss": 0.2492, + "step": 15403 + }, + { + "epoch": 0.89, + "grad_norm": 0.22190970897036372, + "learning_rate": 6.853179540071963e-07, + "loss": 0.2188, + "step": 15404 + }, + { + "epoch": 0.89, + "grad_norm": 0.46152722721730727, + "learning_rate": 6.846410729790342e-07, + "loss": 0.2762, + "step": 15405 + }, + { + "epoch": 0.89, + "grad_norm": 0.6960344758067857, + "learning_rate": 6.839645145413543e-07, + "loss": 0.3359, + "step": 15406 + }, + { + "epoch": 0.89, + "grad_norm": 0.3315547930125616, + "learning_rate": 6.832882787175809e-07, + "loss": 0.2416, + "step": 15407 + }, + { + "epoch": 0.89, + "grad_norm": 0.34418183719622625, + "learning_rate": 6.826123655311356e-07, + "loss": 0.2972, + "step": 15408 + }, + { + "epoch": 0.89, + "grad_norm": 0.7768735537156152, + "learning_rate": 6.819367750054217e-07, + "loss": 0.4253, + "step": 15409 + }, + { + "epoch": 0.89, + "grad_norm": 0.3022317412638133, + "learning_rate": 6.812615071638363e-07, + "loss": 0.2187, + "step": 15410 + }, + { + "epoch": 0.89, + "grad_norm": 0.30097106667699985, + "learning_rate": 6.805865620297659e-07, + "loss": 0.1357, + "step": 15411 + }, + { + "epoch": 0.89, + "grad_norm": 0.3699431546810101, + "learning_rate": 6.799119396265807e-07, + "loss": 0.2815, + "step": 15412 + }, + { + "epoch": 0.89, + "grad_norm": 0.36422006459287853, + "learning_rate": 6.792376399776457e-07, + "loss": 0.2794, + "step": 15413 + }, + { + "epoch": 0.89, + "grad_norm": 0.7376350234779959, + "learning_rate": 6.785636631063075e-07, + "loss": 0.2866, + "step": 15414 + }, + { + "epoch": 0.89, + "grad_norm": 0.4651567840856827, + "learning_rate": 6.778900090359119e-07, + "loss": 0.3252, + "step": 15415 + }, + { + "epoch": 0.89, + "grad_norm": 0.3479007390754298, + "learning_rate": 6.772166777897838e-07, + "loss": 0.2525, + "step": 15416 + }, + { + "epoch": 0.89, + "grad_norm": 0.20645592583378722, + "learning_rate": 6.765436693912408e-07, + "loss": 0.1553, + "step": 15417 + }, + { + "epoch": 0.89, + "grad_norm": 0.5925146855129958, + "learning_rate": 6.758709838635879e-07, + "loss": 0.3312, + "step": 15418 + }, + { + "epoch": 0.89, + "grad_norm": 0.36018604511633256, + "learning_rate": 6.751986212301242e-07, + "loss": 0.2976, + "step": 15419 + }, + { + "epoch": 0.89, + "grad_norm": 0.34434040713944425, + "learning_rate": 6.745265815141311e-07, + "loss": 0.2374, + "step": 15420 + }, + { + "epoch": 0.89, + "grad_norm": 1.2688158338463227, + "learning_rate": 6.73854864738881e-07, + "loss": 0.5905, + "step": 15421 + }, + { + "epoch": 0.89, + "grad_norm": 0.3130992444743742, + "learning_rate": 6.731834709276353e-07, + "loss": 0.2281, + "step": 15422 + }, + { + "epoch": 0.89, + "grad_norm": 0.373687076761885, + "learning_rate": 6.725124001036454e-07, + "loss": 0.2071, + "step": 15423 + }, + { + "epoch": 0.89, + "grad_norm": 0.49870489961270104, + "learning_rate": 6.718416522901506e-07, + "loss": 0.3269, + "step": 15424 + }, + { + "epoch": 0.89, + "grad_norm": 0.23765638274613635, + "learning_rate": 6.711712275103776e-07, + "loss": 0.221, + "step": 15425 + }, + { + "epoch": 0.89, + "grad_norm": 1.3561355023422061, + "learning_rate": 6.705011257875449e-07, + "loss": 0.5169, + "step": 15426 + }, + { + "epoch": 0.89, + "grad_norm": 0.3737441156518405, + "learning_rate": 6.698313471448547e-07, + "loss": 0.2337, + "step": 15427 + }, + { + "epoch": 0.89, + "grad_norm": 0.28412374842415977, + "learning_rate": 6.691618916055053e-07, + "loss": 0.2634, + "step": 15428 + }, + { + "epoch": 0.89, + "grad_norm": 1.260617821916191, + "learning_rate": 6.684927591926793e-07, + "loss": 0.6065, + "step": 15429 + }, + { + "epoch": 0.89, + "grad_norm": 0.416719995810165, + "learning_rate": 6.678239499295469e-07, + "loss": 0.1985, + "step": 15430 + }, + { + "epoch": 0.89, + "grad_norm": 0.3731138532303315, + "learning_rate": 6.671554638392696e-07, + "loss": 0.2707, + "step": 15431 + }, + { + "epoch": 0.89, + "grad_norm": 0.26428445121349753, + "learning_rate": 6.664873009449979e-07, + "loss": 0.2391, + "step": 15432 + }, + { + "epoch": 0.89, + "grad_norm": 0.4735796217441745, + "learning_rate": 6.658194612698687e-07, + "loss": 0.2467, + "step": 15433 + }, + { + "epoch": 0.89, + "grad_norm": 0.3392053344013183, + "learning_rate": 6.651519448370092e-07, + "loss": 0.2764, + "step": 15434 + }, + { + "epoch": 0.89, + "grad_norm": 0.4614700653109635, + "learning_rate": 6.644847516695385e-07, + "loss": 0.3329, + "step": 15435 + }, + { + "epoch": 0.89, + "grad_norm": 0.3974557145871179, + "learning_rate": 6.638178817905594e-07, + "loss": 0.2162, + "step": 15436 + }, + { + "epoch": 0.89, + "grad_norm": 0.31729301878064864, + "learning_rate": 6.631513352231644e-07, + "loss": 0.2351, + "step": 15437 + }, + { + "epoch": 0.89, + "grad_norm": 0.6014970938156791, + "learning_rate": 6.624851119904385e-07, + "loss": 0.2425, + "step": 15438 + }, + { + "epoch": 0.89, + "grad_norm": 0.46471253231576026, + "learning_rate": 6.618192121154488e-07, + "loss": 0.3678, + "step": 15439 + }, + { + "epoch": 0.89, + "grad_norm": 0.3236888175086646, + "learning_rate": 6.611536356212612e-07, + "loss": 0.2154, + "step": 15440 + }, + { + "epoch": 0.89, + "grad_norm": 0.5426087946165549, + "learning_rate": 6.604883825309205e-07, + "loss": 0.3937, + "step": 15441 + }, + { + "epoch": 0.89, + "grad_norm": 1.5414638027935905, + "learning_rate": 6.598234528674663e-07, + "loss": 0.6158, + "step": 15442 + }, + { + "epoch": 0.89, + "grad_norm": 0.18180585805377175, + "learning_rate": 6.591588466539222e-07, + "loss": 0.1348, + "step": 15443 + }, + { + "epoch": 0.89, + "grad_norm": 0.33920005873970366, + "learning_rate": 6.584945639133067e-07, + "loss": 0.299, + "step": 15444 + }, + { + "epoch": 0.89, + "grad_norm": 1.3235160572056892, + "learning_rate": 6.578306046686234e-07, + "loss": 0.6165, + "step": 15445 + }, + { + "epoch": 0.89, + "grad_norm": 0.31586260327256654, + "learning_rate": 6.57166968942865e-07, + "loss": 0.2081, + "step": 15446 + }, + { + "epoch": 0.89, + "grad_norm": 0.4470041807775742, + "learning_rate": 6.565036567590099e-07, + "loss": 0.3292, + "step": 15447 + }, + { + "epoch": 0.89, + "grad_norm": 0.50546493909433, + "learning_rate": 6.558406681400342e-07, + "loss": 0.3378, + "step": 15448 + }, + { + "epoch": 0.89, + "grad_norm": 0.33419202257022956, + "learning_rate": 6.55178003108894e-07, + "loss": 0.2611, + "step": 15449 + }, + { + "epoch": 0.89, + "grad_norm": 0.16134420138718436, + "learning_rate": 6.545156616885373e-07, + "loss": 0.071, + "step": 15450 + }, + { + "epoch": 0.89, + "grad_norm": 0.32891260695083896, + "learning_rate": 6.538536439019016e-07, + "loss": 0.3021, + "step": 15451 + }, + { + "epoch": 0.89, + "grad_norm": 0.37903738502174683, + "learning_rate": 6.531919497719097e-07, + "loss": 0.2468, + "step": 15452 + }, + { + "epoch": 0.89, + "grad_norm": 0.4409361141174233, + "learning_rate": 6.52530579321482e-07, + "loss": 0.2674, + "step": 15453 + }, + { + "epoch": 0.89, + "grad_norm": 0.5984776888889717, + "learning_rate": 6.51869532573517e-07, + "loss": 0.2518, + "step": 15454 + }, + { + "epoch": 0.89, + "grad_norm": 0.37448200080698674, + "learning_rate": 6.512088095509095e-07, + "loss": 0.2645, + "step": 15455 + }, + { + "epoch": 0.89, + "grad_norm": 0.24355986018680098, + "learning_rate": 6.505484102765358e-07, + "loss": 0.1973, + "step": 15456 + }, + { + "epoch": 0.89, + "grad_norm": 0.8738256093138846, + "learning_rate": 6.498883347732709e-07, + "loss": 0.4653, + "step": 15457 + }, + { + "epoch": 0.89, + "grad_norm": 0.39720267810411064, + "learning_rate": 6.492285830639711e-07, + "loss": 0.2898, + "step": 15458 + }, + { + "epoch": 0.89, + "grad_norm": 0.284470585352059, + "learning_rate": 6.485691551714835e-07, + "loss": 0.2527, + "step": 15459 + }, + { + "epoch": 0.89, + "grad_norm": 1.3712482470378675, + "learning_rate": 6.47910051118642e-07, + "loss": 0.5142, + "step": 15460 + }, + { + "epoch": 0.89, + "grad_norm": 0.3337385065605379, + "learning_rate": 6.472512709282752e-07, + "loss": 0.2579, + "step": 15461 + }, + { + "epoch": 0.89, + "grad_norm": 0.2548757339390069, + "learning_rate": 6.465928146231937e-07, + "loss": 0.1738, + "step": 15462 + }, + { + "epoch": 0.89, + "grad_norm": 0.33354702595338487, + "learning_rate": 6.459346822262014e-07, + "loss": 0.2561, + "step": 15463 + }, + { + "epoch": 0.89, + "grad_norm": 0.31942036759526615, + "learning_rate": 6.45276873760089e-07, + "loss": 0.2559, + "step": 15464 + }, + { + "epoch": 0.89, + "grad_norm": 0.8281221679177381, + "learning_rate": 6.44619389247636e-07, + "loss": 0.4434, + "step": 15465 + }, + { + "epoch": 0.89, + "grad_norm": 1.4513030484506004, + "learning_rate": 6.439622287116121e-07, + "loss": 0.2286, + "step": 15466 + }, + { + "epoch": 0.89, + "grad_norm": 0.2902777682810745, + "learning_rate": 6.433053921747734e-07, + "loss": 0.2554, + "step": 15467 + }, + { + "epoch": 0.89, + "grad_norm": 0.22307685318256087, + "learning_rate": 6.42648879659864e-07, + "loss": 0.1937, + "step": 15468 + }, + { + "epoch": 0.89, + "grad_norm": 0.6450982193385442, + "learning_rate": 6.419926911896246e-07, + "loss": 0.2968, + "step": 15469 + }, + { + "epoch": 0.89, + "grad_norm": 0.40342553770057044, + "learning_rate": 6.413368267867748e-07, + "loss": 0.2651, + "step": 15470 + }, + { + "epoch": 0.89, + "grad_norm": 0.3625069981846449, + "learning_rate": 6.406812864740286e-07, + "loss": 0.3225, + "step": 15471 + }, + { + "epoch": 0.89, + "grad_norm": 0.5406013839654554, + "learning_rate": 6.400260702740857e-07, + "loss": 0.2295, + "step": 15472 + }, + { + "epoch": 0.89, + "grad_norm": 0.3813327291140086, + "learning_rate": 6.39371178209639e-07, + "loss": 0.2555, + "step": 15473 + }, + { + "epoch": 0.89, + "grad_norm": 0.248468868214602, + "learning_rate": 6.387166103033659e-07, + "loss": 0.1838, + "step": 15474 + }, + { + "epoch": 0.89, + "grad_norm": 0.3369238767144438, + "learning_rate": 6.38062366577934e-07, + "loss": 0.2943, + "step": 15475 + }, + { + "epoch": 0.89, + "grad_norm": 0.49041949518461936, + "learning_rate": 6.374084470559993e-07, + "loss": 0.2287, + "step": 15476 + }, + { + "epoch": 0.89, + "grad_norm": 0.5345573593990525, + "learning_rate": 6.367548517602062e-07, + "loss": 0.3918, + "step": 15477 + }, + { + "epoch": 0.89, + "grad_norm": 1.267321575127367, + "learning_rate": 6.36101580713191e-07, + "loss": 0.4503, + "step": 15478 + }, + { + "epoch": 0.89, + "grad_norm": 0.24989474901885625, + "learning_rate": 6.354486339375765e-07, + "loss": 0.2039, + "step": 15479 + }, + { + "epoch": 0.89, + "grad_norm": 0.30215440765844515, + "learning_rate": 6.347960114559726e-07, + "loss": 0.2355, + "step": 15480 + }, + { + "epoch": 0.89, + "grad_norm": 0.47055963128076644, + "learning_rate": 6.341437132909778e-07, + "loss": 0.2552, + "step": 15481 + }, + { + "epoch": 0.89, + "grad_norm": 0.28025144025055243, + "learning_rate": 6.334917394651863e-07, + "loss": 0.1913, + "step": 15482 + }, + { + "epoch": 0.89, + "grad_norm": 0.47105549542319564, + "learning_rate": 6.328400900011722e-07, + "loss": 0.3661, + "step": 15483 + }, + { + "epoch": 0.89, + "grad_norm": 0.5301274686641892, + "learning_rate": 6.321887649215031e-07, + "loss": 0.3488, + "step": 15484 + }, + { + "epoch": 0.89, + "grad_norm": 0.34412718839375167, + "learning_rate": 6.31537764248733e-07, + "loss": 0.1598, + "step": 15485 + }, + { + "epoch": 0.89, + "grad_norm": 0.49793609859674237, + "learning_rate": 6.308870880054085e-07, + "loss": 0.3045, + "step": 15486 + }, + { + "epoch": 0.89, + "grad_norm": 0.2893282719469128, + "learning_rate": 6.302367362140616e-07, + "loss": 0.2616, + "step": 15487 + }, + { + "epoch": 0.89, + "grad_norm": 0.34383649815869993, + "learning_rate": 6.295867088972141e-07, + "loss": 0.2231, + "step": 15488 + }, + { + "epoch": 0.89, + "grad_norm": 0.3281453009232224, + "learning_rate": 6.289370060773748e-07, + "loss": 0.2295, + "step": 15489 + }, + { + "epoch": 0.89, + "grad_norm": 0.536206100347878, + "learning_rate": 6.282876277770433e-07, + "loss": 0.2901, + "step": 15490 + }, + { + "epoch": 0.89, + "grad_norm": 0.3532249976082039, + "learning_rate": 6.276385740187097e-07, + "loss": 0.258, + "step": 15491 + }, + { + "epoch": 0.89, + "grad_norm": 0.3603329578354436, + "learning_rate": 6.26989844824849e-07, + "loss": 0.2514, + "step": 15492 + }, + { + "epoch": 0.89, + "grad_norm": 0.9432767223998106, + "learning_rate": 6.263414402179269e-07, + "loss": 0.402, + "step": 15493 + }, + { + "epoch": 0.89, + "grad_norm": 0.30261911972964056, + "learning_rate": 6.256933602203963e-07, + "loss": 0.2115, + "step": 15494 + }, + { + "epoch": 0.89, + "grad_norm": 0.23543770198075897, + "learning_rate": 6.250456048547027e-07, + "loss": 0.2335, + "step": 15495 + }, + { + "epoch": 0.89, + "grad_norm": 0.4415768113353055, + "learning_rate": 6.243981741432769e-07, + "loss": 0.2614, + "step": 15496 + }, + { + "epoch": 0.89, + "grad_norm": 0.5812569473819925, + "learning_rate": 6.23751068108539e-07, + "loss": 0.3034, + "step": 15497 + }, + { + "epoch": 0.89, + "grad_norm": 0.36478079392776935, + "learning_rate": 6.231042867728987e-07, + "loss": 0.2275, + "step": 15498 + }, + { + "epoch": 0.89, + "grad_norm": 0.3750949965379845, + "learning_rate": 6.224578301587536e-07, + "loss": 0.29, + "step": 15499 + }, + { + "epoch": 0.89, + "grad_norm": 0.266345866146398, + "learning_rate": 6.218116982884903e-07, + "loss": 0.2056, + "step": 15500 + }, + { + "epoch": 0.89, + "grad_norm": 0.5182817369574994, + "learning_rate": 6.211658911844854e-07, + "loss": 0.3603, + "step": 15501 + }, + { + "epoch": 0.89, + "grad_norm": 0.32107800900252353, + "learning_rate": 6.205204088690997e-07, + "loss": 0.1834, + "step": 15502 + }, + { + "epoch": 0.89, + "grad_norm": 0.3540271412785568, + "learning_rate": 6.198752513646911e-07, + "loss": 0.2723, + "step": 15503 + }, + { + "epoch": 0.89, + "grad_norm": 0.5177309374569469, + "learning_rate": 6.192304186935993e-07, + "loss": 0.3293, + "step": 15504 + }, + { + "epoch": 0.89, + "grad_norm": 0.8842981293585628, + "learning_rate": 6.185859108781544e-07, + "loss": 0.2621, + "step": 15505 + }, + { + "epoch": 0.89, + "grad_norm": 0.8063127372023199, + "learning_rate": 6.179417279406752e-07, + "loss": 0.5123, + "step": 15506 + }, + { + "epoch": 0.89, + "grad_norm": 0.2330689085737917, + "learning_rate": 6.172978699034715e-07, + "loss": 0.2458, + "step": 15507 + }, + { + "epoch": 0.89, + "grad_norm": 0.2998705740226622, + "learning_rate": 6.166543367888389e-07, + "loss": 0.1521, + "step": 15508 + }, + { + "epoch": 0.89, + "grad_norm": 0.5991385238658149, + "learning_rate": 6.160111286190629e-07, + "loss": 0.3678, + "step": 15509 + }, + { + "epoch": 0.89, + "grad_norm": 0.41892626382614, + "learning_rate": 6.153682454164167e-07, + "loss": 0.2752, + "step": 15510 + }, + { + "epoch": 0.89, + "grad_norm": 0.38775365797584355, + "learning_rate": 6.14725687203167e-07, + "loss": 0.2397, + "step": 15511 + }, + { + "epoch": 0.89, + "grad_norm": 0.6237671895525037, + "learning_rate": 6.140834540015617e-07, + "loss": 0.3729, + "step": 15512 + }, + { + "epoch": 0.89, + "grad_norm": 0.29434266789911306, + "learning_rate": 6.134415458338439e-07, + "loss": 0.2485, + "step": 15513 + }, + { + "epoch": 0.89, + "grad_norm": 0.44565125934806743, + "learning_rate": 6.127999627222414e-07, + "loss": 0.238, + "step": 15514 + }, + { + "epoch": 0.89, + "grad_norm": 0.28957696302459607, + "learning_rate": 6.121587046889709e-07, + "loss": 0.2293, + "step": 15515 + }, + { + "epoch": 0.89, + "grad_norm": 0.381380545026879, + "learning_rate": 6.115177717562426e-07, + "loss": 0.3005, + "step": 15516 + }, + { + "epoch": 0.89, + "grad_norm": 1.280503866463015, + "learning_rate": 6.108771639462496e-07, + "loss": 0.4066, + "step": 15517 + }, + { + "epoch": 0.89, + "grad_norm": 0.31632331249443907, + "learning_rate": 6.102368812811776e-07, + "loss": 0.2172, + "step": 15518 + }, + { + "epoch": 0.89, + "grad_norm": 0.30631108778593535, + "learning_rate": 6.095969237831956e-07, + "loss": 0.2671, + "step": 15519 + }, + { + "epoch": 0.89, + "grad_norm": 0.4657106174751502, + "learning_rate": 6.089572914744712e-07, + "loss": 0.2943, + "step": 15520 + }, + { + "epoch": 0.89, + "grad_norm": 0.3192673232648922, + "learning_rate": 6.083179843771513e-07, + "loss": 0.1749, + "step": 15521 + }, + { + "epoch": 0.89, + "grad_norm": 0.5648607556412087, + "learning_rate": 6.076790025133761e-07, + "loss": 0.3423, + "step": 15522 + }, + { + "epoch": 0.89, + "grad_norm": 0.38998891302247396, + "learning_rate": 6.070403459052721e-07, + "loss": 0.3006, + "step": 15523 + }, + { + "epoch": 0.89, + "grad_norm": 0.47612449309706734, + "learning_rate": 6.064020145749572e-07, + "loss": 0.2506, + "step": 15524 + }, + { + "epoch": 0.89, + "grad_norm": 0.3699572894444798, + "learning_rate": 6.057640085445371e-07, + "loss": 0.2902, + "step": 15525 + }, + { + "epoch": 0.89, + "grad_norm": 0.33072600556263054, + "learning_rate": 6.051263278361064e-07, + "loss": 0.2912, + "step": 15526 + }, + { + "epoch": 0.89, + "grad_norm": 0.5229226557439318, + "learning_rate": 6.04488972471744e-07, + "loss": 0.2665, + "step": 15527 + }, + { + "epoch": 0.89, + "grad_norm": 0.23899799542644962, + "learning_rate": 6.038519424735268e-07, + "loss": 0.1533, + "step": 15528 + }, + { + "epoch": 0.89, + "grad_norm": 1.2521803906393414, + "learning_rate": 6.032152378635125e-07, + "loss": 0.71, + "step": 15529 + }, + { + "epoch": 0.89, + "grad_norm": 0.332009032357602, + "learning_rate": 6.025788586637516e-07, + "loss": 0.3173, + "step": 15530 + }, + { + "epoch": 0.89, + "grad_norm": 0.3054581468802576, + "learning_rate": 6.019428048962794e-07, + "loss": 0.2127, + "step": 15531 + }, + { + "epoch": 0.89, + "grad_norm": 0.6988792384416537, + "learning_rate": 6.013070765831242e-07, + "loss": 0.3557, + "step": 15532 + }, + { + "epoch": 0.89, + "grad_norm": 0.2612692825758358, + "learning_rate": 6.006716737463003e-07, + "loss": 0.1574, + "step": 15533 + }, + { + "epoch": 0.89, + "grad_norm": 0.35768416541218084, + "learning_rate": 6.000365964078125e-07, + "loss": 0.2071, + "step": 15534 + }, + { + "epoch": 0.89, + "grad_norm": 0.360522914540225, + "learning_rate": 5.99401844589651e-07, + "loss": 0.2844, + "step": 15535 + }, + { + "epoch": 0.89, + "grad_norm": 0.7207181920540063, + "learning_rate": 5.987674183138015e-07, + "loss": 0.4223, + "step": 15536 + }, + { + "epoch": 0.89, + "grad_norm": 0.34250474149267907, + "learning_rate": 5.98133317602233e-07, + "loss": 0.2512, + "step": 15537 + }, + { + "epoch": 0.89, + "grad_norm": 0.42762939818118384, + "learning_rate": 5.974995424769026e-07, + "loss": 0.2639, + "step": 15538 + }, + { + "epoch": 0.89, + "grad_norm": 0.28191737906201586, + "learning_rate": 5.968660929597581e-07, + "loss": 0.2076, + "step": 15539 + }, + { + "epoch": 0.89, + "grad_norm": 0.3401995555671143, + "learning_rate": 5.962329690727353e-07, + "loss": 0.2638, + "step": 15540 + }, + { + "epoch": 0.89, + "grad_norm": 0.5155883682044767, + "learning_rate": 5.956001708377623e-07, + "loss": 0.1423, + "step": 15541 + }, + { + "epoch": 0.89, + "grad_norm": 0.3712414578625411, + "learning_rate": 5.949676982767505e-07, + "loss": 0.3232, + "step": 15542 + }, + { + "epoch": 0.89, + "grad_norm": 0.32353924162903186, + "learning_rate": 5.943355514116033e-07, + "loss": 0.2585, + "step": 15543 + }, + { + "epoch": 0.89, + "grad_norm": 0.7305438124068102, + "learning_rate": 5.937037302642101e-07, + "loss": 0.3036, + "step": 15544 + }, + { + "epoch": 0.89, + "grad_norm": 0.29228028451163485, + "learning_rate": 5.930722348564533e-07, + "loss": 0.1928, + "step": 15545 + }, + { + "epoch": 0.89, + "grad_norm": 0.34765287698989866, + "learning_rate": 5.924410652102009e-07, + "loss": 0.255, + "step": 15546 + }, + { + "epoch": 0.89, + "grad_norm": 0.3763887260898906, + "learning_rate": 5.918102213473087e-07, + "loss": 0.2487, + "step": 15547 + }, + { + "epoch": 0.89, + "grad_norm": 0.6285264044725666, + "learning_rate": 5.911797032896239e-07, + "loss": 0.3656, + "step": 15548 + }, + { + "epoch": 0.89, + "grad_norm": 0.33477221937072554, + "learning_rate": 5.905495110589821e-07, + "loss": 0.2613, + "step": 15549 + }, + { + "epoch": 0.89, + "grad_norm": 1.252759326879859, + "learning_rate": 5.89919644677206e-07, + "loss": 0.8002, + "step": 15550 + }, + { + "epoch": 0.89, + "grad_norm": 0.3344029733336263, + "learning_rate": 5.892901041661092e-07, + "loss": 0.2161, + "step": 15551 + }, + { + "epoch": 0.89, + "grad_norm": 0.20909466560736703, + "learning_rate": 5.886608895474888e-07, + "loss": 0.1845, + "step": 15552 + }, + { + "epoch": 0.89, + "grad_norm": 1.1886244262668104, + "learning_rate": 5.880320008431384e-07, + "loss": 0.6513, + "step": 15553 + }, + { + "epoch": 0.89, + "grad_norm": 0.29300410427650325, + "learning_rate": 5.874034380748362e-07, + "loss": 0.2342, + "step": 15554 + }, + { + "epoch": 0.89, + "grad_norm": 0.34717227520428, + "learning_rate": 5.867752012643469e-07, + "loss": 0.285, + "step": 15555 + }, + { + "epoch": 0.89, + "grad_norm": 0.7185220879859632, + "learning_rate": 5.861472904334287e-07, + "loss": 0.3823, + "step": 15556 + }, + { + "epoch": 0.89, + "grad_norm": 0.48824641014498565, + "learning_rate": 5.855197056038231e-07, + "loss": 0.1113, + "step": 15557 + }, + { + "epoch": 0.89, + "grad_norm": 0.34978211070504545, + "learning_rate": 5.848924467972661e-07, + "loss": 0.2534, + "step": 15558 + }, + { + "epoch": 0.89, + "grad_norm": 0.257049533946427, + "learning_rate": 5.842655140354791e-07, + "loss": 0.2372, + "step": 15559 + }, + { + "epoch": 0.89, + "grad_norm": 0.596660439530161, + "learning_rate": 5.836389073401727e-07, + "loss": 0.283, + "step": 15560 + }, + { + "epoch": 0.89, + "grad_norm": 0.31315917280327593, + "learning_rate": 5.830126267330449e-07, + "loss": 0.2764, + "step": 15561 + }, + { + "epoch": 0.89, + "grad_norm": 0.35240639619956515, + "learning_rate": 5.823866722357863e-07, + "loss": 0.3343, + "step": 15562 + }, + { + "epoch": 0.89, + "grad_norm": 1.620476298451587, + "learning_rate": 5.817610438700716e-07, + "loss": 0.5078, + "step": 15563 + }, + { + "epoch": 0.89, + "grad_norm": 0.2704294455548528, + "learning_rate": 5.811357416575681e-07, + "loss": 0.1573, + "step": 15564 + }, + { + "epoch": 0.89, + "grad_norm": 0.4428142185190504, + "learning_rate": 5.805107656199272e-07, + "loss": 0.2494, + "step": 15565 + }, + { + "epoch": 0.89, + "grad_norm": 0.3292874439000769, + "learning_rate": 5.79886115778795e-07, + "loss": 0.2965, + "step": 15566 + }, + { + "epoch": 0.89, + "grad_norm": 0.30624266788213034, + "learning_rate": 5.792617921558008e-07, + "loss": 0.1981, + "step": 15567 + }, + { + "epoch": 0.89, + "grad_norm": 1.2556421070753658, + "learning_rate": 5.786377947725652e-07, + "loss": 0.736, + "step": 15568 + }, + { + "epoch": 0.89, + "grad_norm": 1.5193881640144948, + "learning_rate": 5.780141236506975e-07, + "loss": 0.567, + "step": 15569 + }, + { + "epoch": 0.89, + "grad_norm": 0.22623774423910054, + "learning_rate": 5.77390778811796e-07, + "loss": 0.2157, + "step": 15570 + }, + { + "epoch": 0.89, + "grad_norm": 0.28603043110314136, + "learning_rate": 5.767677602774469e-07, + "loss": 0.1825, + "step": 15571 + }, + { + "epoch": 0.89, + "grad_norm": 0.5968504572934171, + "learning_rate": 5.761450680692249e-07, + "loss": 0.3951, + "step": 15572 + }, + { + "epoch": 0.89, + "grad_norm": 0.2821106282963114, + "learning_rate": 5.755227022086918e-07, + "loss": 0.1834, + "step": 15573 + }, + { + "epoch": 0.89, + "grad_norm": 0.3511480956550723, + "learning_rate": 5.749006627174048e-07, + "loss": 0.3039, + "step": 15574 + }, + { + "epoch": 0.89, + "grad_norm": 1.2740202637398323, + "learning_rate": 5.742789496169021e-07, + "loss": 0.4536, + "step": 15575 + }, + { + "epoch": 0.89, + "grad_norm": 0.36967814771582713, + "learning_rate": 5.736575629287145e-07, + "loss": 0.2536, + "step": 15576 + }, + { + "epoch": 0.89, + "grad_norm": 0.3182282890886471, + "learning_rate": 5.730365026743579e-07, + "loss": 0.2127, + "step": 15577 + }, + { + "epoch": 0.9, + "grad_norm": 0.2597618110444984, + "learning_rate": 5.72415768875344e-07, + "loss": 0.2431, + "step": 15578 + }, + { + "epoch": 0.9, + "grad_norm": 0.40873608355276525, + "learning_rate": 5.717953615531668e-07, + "loss": 0.2492, + "step": 15579 + }, + { + "epoch": 0.9, + "grad_norm": 0.5488195331582982, + "learning_rate": 5.711752807293102e-07, + "loss": 0.2588, + "step": 15580 + }, + { + "epoch": 0.9, + "grad_norm": 1.40896979648835, + "learning_rate": 5.705555264252483e-07, + "loss": 0.5422, + "step": 15581 + }, + { + "epoch": 0.9, + "grad_norm": 0.2511592502937873, + "learning_rate": 5.699360986624414e-07, + "loss": 0.2355, + "step": 15582 + }, + { + "epoch": 0.9, + "grad_norm": 0.5226165719071535, + "learning_rate": 5.693169974623435e-07, + "loss": 0.2665, + "step": 15583 + }, + { + "epoch": 0.9, + "grad_norm": 0.4341487348411721, + "learning_rate": 5.686982228463933e-07, + "loss": 0.2977, + "step": 15584 + }, + { + "epoch": 0.9, + "grad_norm": 0.2937040743182103, + "learning_rate": 5.680797748360168e-07, + "loss": 0.2368, + "step": 15585 + }, + { + "epoch": 0.9, + "grad_norm": 0.24353668748816057, + "learning_rate": 5.674616534526312e-07, + "loss": 0.1934, + "step": 15586 + }, + { + "epoch": 0.9, + "grad_norm": 1.674275057185165, + "learning_rate": 5.66843858717645e-07, + "loss": 0.4936, + "step": 15587 + }, + { + "epoch": 0.9, + "grad_norm": 0.33392941409136523, + "learning_rate": 5.66226390652449e-07, + "loss": 0.2589, + "step": 15588 + }, + { + "epoch": 0.9, + "grad_norm": 0.5883240032566877, + "learning_rate": 5.656092492784282e-07, + "loss": 0.3758, + "step": 15589 + }, + { + "epoch": 0.9, + "grad_norm": 0.3373756640161482, + "learning_rate": 5.649924346169522e-07, + "loss": 0.2541, + "step": 15590 + }, + { + "epoch": 0.9, + "grad_norm": 0.5502096680976466, + "learning_rate": 5.643759466893839e-07, + "loss": 0.3176, + "step": 15591 + }, + { + "epoch": 0.9, + "grad_norm": 0.23347398396256974, + "learning_rate": 5.637597855170707e-07, + "loss": 0.1993, + "step": 15592 + }, + { + "epoch": 0.9, + "grad_norm": 0.3502673113280859, + "learning_rate": 5.631439511213499e-07, + "loss": 0.2337, + "step": 15593 + }, + { + "epoch": 0.9, + "grad_norm": 0.3869961913722, + "learning_rate": 5.625284435235478e-07, + "loss": 0.2529, + "step": 15594 + }, + { + "epoch": 0.9, + "grad_norm": 0.676089532643328, + "learning_rate": 5.619132627449797e-07, + "loss": 0.3139, + "step": 15595 + }, + { + "epoch": 0.9, + "grad_norm": 1.3271539215905461, + "learning_rate": 5.612984088069507e-07, + "loss": 0.3681, + "step": 15596 + }, + { + "epoch": 0.9, + "grad_norm": 0.41527056555619096, + "learning_rate": 5.606838817307514e-07, + "loss": 0.2618, + "step": 15597 + }, + { + "epoch": 0.9, + "grad_norm": 0.20296961040711048, + "learning_rate": 5.600696815376639e-07, + "loss": 0.207, + "step": 15598 + }, + { + "epoch": 0.9, + "grad_norm": 0.7358714300620751, + "learning_rate": 5.594558082489565e-07, + "loss": 0.2504, + "step": 15599 + }, + { + "epoch": 0.9, + "grad_norm": 0.37675397571544944, + "learning_rate": 5.58842261885889e-07, + "loss": 0.277, + "step": 15600 + }, + { + "epoch": 0.9, + "grad_norm": 0.6241152115560328, + "learning_rate": 5.582290424697078e-07, + "loss": 0.2907, + "step": 15601 + }, + { + "epoch": 0.9, + "grad_norm": 0.3432574165161779, + "learning_rate": 5.576161500216481e-07, + "loss": 0.3207, + "step": 15602 + }, + { + "epoch": 0.9, + "grad_norm": 0.32805857188412496, + "learning_rate": 5.570035845629362e-07, + "loss": 0.207, + "step": 15603 + }, + { + "epoch": 0.9, + "grad_norm": 0.3566758030118385, + "learning_rate": 5.563913461147841e-07, + "loss": 0.2272, + "step": 15604 + }, + { + "epoch": 0.9, + "grad_norm": 0.33717272175655655, + "learning_rate": 5.557794346983936e-07, + "loss": 0.2613, + "step": 15605 + }, + { + "epoch": 0.9, + "grad_norm": 0.25959042748500355, + "learning_rate": 5.551678503349545e-07, + "loss": 0.2154, + "step": 15606 + }, + { + "epoch": 0.9, + "grad_norm": 0.8584149753122465, + "learning_rate": 5.545565930456464e-07, + "loss": 0.4831, + "step": 15607 + }, + { + "epoch": 0.9, + "grad_norm": 1.2299027222034258, + "learning_rate": 5.539456628516382e-07, + "loss": 0.7181, + "step": 15608 + }, + { + "epoch": 0.9, + "grad_norm": 0.7561475327791286, + "learning_rate": 5.53335059774085e-07, + "loss": 0.1436, + "step": 15609 + }, + { + "epoch": 0.9, + "grad_norm": 0.2262751318435284, + "learning_rate": 5.527247838341332e-07, + "loss": 0.2338, + "step": 15610 + }, + { + "epoch": 0.9, + "grad_norm": 0.40729868452013945, + "learning_rate": 5.521148350529137e-07, + "loss": 0.2774, + "step": 15611 + }, + { + "epoch": 0.9, + "grad_norm": 0.8980545590794237, + "learning_rate": 5.51505213451553e-07, + "loss": 0.3379, + "step": 15612 + }, + { + "epoch": 0.9, + "grad_norm": 0.3863824149031958, + "learning_rate": 5.508959190511609e-07, + "loss": 0.2108, + "step": 15613 + }, + { + "epoch": 0.9, + "grad_norm": 0.33532397215502635, + "learning_rate": 5.502869518728359e-07, + "loss": 0.3238, + "step": 15614 + }, + { + "epoch": 0.9, + "grad_norm": 0.5569176551550571, + "learning_rate": 5.49678311937667e-07, + "loss": 0.3709, + "step": 15615 + }, + { + "epoch": 0.9, + "grad_norm": 0.36933007186294964, + "learning_rate": 5.490699992667326e-07, + "loss": 0.2373, + "step": 15616 + }, + { + "epoch": 0.9, + "grad_norm": 0.45251797358873475, + "learning_rate": 5.48462013881097e-07, + "loss": 0.2836, + "step": 15617 + }, + { + "epoch": 0.9, + "grad_norm": 0.23540557748561147, + "learning_rate": 5.478543558018167e-07, + "loss": 0.2141, + "step": 15618 + }, + { + "epoch": 0.9, + "grad_norm": 0.4132520532778912, + "learning_rate": 5.472470250499328e-07, + "loss": 0.2118, + "step": 15619 + }, + { + "epoch": 0.9, + "grad_norm": 1.2284634022109895, + "learning_rate": 5.466400216464774e-07, + "loss": 0.7019, + "step": 15620 + }, + { + "epoch": 0.9, + "grad_norm": 0.3909616326979364, + "learning_rate": 5.460333456124722e-07, + "loss": 0.2444, + "step": 15621 + }, + { + "epoch": 0.9, + "grad_norm": 0.280909579303553, + "learning_rate": 5.454269969689252e-07, + "loss": 0.2283, + "step": 15622 + }, + { + "epoch": 0.9, + "grad_norm": 0.6943885695673067, + "learning_rate": 5.448209757368361e-07, + "loss": 0.3467, + "step": 15623 + }, + { + "epoch": 0.9, + "grad_norm": 0.214468125396571, + "learning_rate": 5.442152819371882e-07, + "loss": 0.1708, + "step": 15624 + }, + { + "epoch": 0.9, + "grad_norm": 0.5495278130056973, + "learning_rate": 5.436099155909592e-07, + "loss": 0.3312, + "step": 15625 + }, + { + "epoch": 0.9, + "grad_norm": 0.314657048380129, + "learning_rate": 5.430048767191121e-07, + "loss": 0.2532, + "step": 15626 + }, + { + "epoch": 0.9, + "grad_norm": 0.5585893096454551, + "learning_rate": 5.424001653426003e-07, + "loss": 0.3362, + "step": 15627 + }, + { + "epoch": 0.9, + "grad_norm": 0.36554119112794337, + "learning_rate": 5.417957814823627e-07, + "loss": 0.2746, + "step": 15628 + }, + { + "epoch": 0.9, + "grad_norm": 0.3853663692179569, + "learning_rate": 5.411917251593313e-07, + "loss": 0.2478, + "step": 15629 + }, + { + "epoch": 0.9, + "grad_norm": 0.21621671494700923, + "learning_rate": 5.405879963944238e-07, + "loss": 0.1392, + "step": 15630 + }, + { + "epoch": 0.9, + "grad_norm": 0.4678193698155111, + "learning_rate": 5.39984595208547e-07, + "loss": 0.2854, + "step": 15631 + }, + { + "epoch": 0.9, + "grad_norm": 0.8065746449842706, + "learning_rate": 5.393815216225972e-07, + "loss": 0.3653, + "step": 15632 + }, + { + "epoch": 0.9, + "grad_norm": 0.3145087220616073, + "learning_rate": 5.387787756574592e-07, + "loss": 0.2735, + "step": 15633 + }, + { + "epoch": 0.9, + "grad_norm": 0.3790211085952469, + "learning_rate": 5.381763573340049e-07, + "loss": 0.2876, + "step": 15634 + }, + { + "epoch": 0.9, + "grad_norm": 0.9396952767327181, + "learning_rate": 5.375742666730955e-07, + "loss": 0.2666, + "step": 15635 + }, + { + "epoch": 0.9, + "grad_norm": 0.23651179482052748, + "learning_rate": 5.36972503695582e-07, + "loss": 0.1632, + "step": 15636 + }, + { + "epoch": 0.9, + "grad_norm": 0.31046373469158267, + "learning_rate": 5.363710684223045e-07, + "loss": 0.264, + "step": 15637 + }, + { + "epoch": 0.9, + "grad_norm": 0.48144032175554063, + "learning_rate": 5.357699608740907e-07, + "loss": 0.3497, + "step": 15638 + }, + { + "epoch": 0.9, + "grad_norm": 0.538184412368237, + "learning_rate": 5.351691810717552e-07, + "loss": 0.2247, + "step": 15639 + }, + { + "epoch": 0.9, + "grad_norm": 0.42659928158946236, + "learning_rate": 5.345687290361035e-07, + "loss": 0.3106, + "step": 15640 + }, + { + "epoch": 0.9, + "grad_norm": 0.6164374849438661, + "learning_rate": 5.339686047879311e-07, + "loss": 0.3437, + "step": 15641 + }, + { + "epoch": 0.9, + "grad_norm": 0.21254629178819517, + "learning_rate": 5.333688083480182e-07, + "loss": 0.1303, + "step": 15642 + }, + { + "epoch": 0.9, + "grad_norm": 0.4149495701684598, + "learning_rate": 5.327693397371369e-07, + "loss": 0.2945, + "step": 15643 + }, + { + "epoch": 0.9, + "grad_norm": 0.8395100725171276, + "learning_rate": 5.321701989760452e-07, + "loss": 0.4472, + "step": 15644 + }, + { + "epoch": 0.9, + "grad_norm": 0.2746630354052698, + "learning_rate": 5.315713860854921e-07, + "loss": 0.2244, + "step": 15645 + }, + { + "epoch": 0.9, + "grad_norm": 0.3804893461734729, + "learning_rate": 5.309729010862163e-07, + "loss": 0.3088, + "step": 15646 + }, + { + "epoch": 0.9, + "grad_norm": 1.3065235001536424, + "learning_rate": 5.303747439989415e-07, + "loss": 0.4975, + "step": 15647 + }, + { + "epoch": 0.9, + "grad_norm": 0.1564879560582931, + "learning_rate": 5.29776914844382e-07, + "loss": 0.0707, + "step": 15648 + }, + { + "epoch": 0.9, + "grad_norm": 0.285253213852128, + "learning_rate": 5.291794136432393e-07, + "loss": 0.2486, + "step": 15649 + }, + { + "epoch": 0.9, + "grad_norm": 0.5030656323905838, + "learning_rate": 5.285822404162066e-07, + "loss": 0.3373, + "step": 15650 + }, + { + "epoch": 0.9, + "grad_norm": 0.7274055892952281, + "learning_rate": 5.279853951839653e-07, + "loss": 0.4009, + "step": 15651 + }, + { + "epoch": 0.9, + "grad_norm": 0.3379741609185097, + "learning_rate": 5.27388877967181e-07, + "loss": 0.2087, + "step": 15652 + }, + { + "epoch": 0.9, + "grad_norm": 0.5153461591870591, + "learning_rate": 5.267926887865127e-07, + "loss": 0.2932, + "step": 15653 + }, + { + "epoch": 0.9, + "grad_norm": 0.33147772787988616, + "learning_rate": 5.261968276626062e-07, + "loss": 0.2518, + "step": 15654 + }, + { + "epoch": 0.9, + "grad_norm": 0.2169728639824801, + "learning_rate": 5.256012946160971e-07, + "loss": 0.1588, + "step": 15655 + }, + { + "epoch": 0.9, + "grad_norm": 0.7769396824390693, + "learning_rate": 5.250060896676068e-07, + "loss": 0.4372, + "step": 15656 + }, + { + "epoch": 0.9, + "grad_norm": 0.3540478962758659, + "learning_rate": 5.244112128377477e-07, + "loss": 0.2981, + "step": 15657 + }, + { + "epoch": 0.9, + "grad_norm": 0.31449744620930736, + "learning_rate": 5.238166641471221e-07, + "loss": 0.2103, + "step": 15658 + }, + { + "epoch": 0.9, + "grad_norm": 1.3570129545424443, + "learning_rate": 5.23222443616318e-07, + "loss": 0.4924, + "step": 15659 + }, + { + "epoch": 0.9, + "grad_norm": 0.28912954651128253, + "learning_rate": 5.226285512659123e-07, + "loss": 0.1437, + "step": 15660 + }, + { + "epoch": 0.9, + "grad_norm": 0.2773332070677685, + "learning_rate": 5.220349871164732e-07, + "loss": 0.2187, + "step": 15661 + }, + { + "epoch": 0.9, + "grad_norm": 0.4015892649555536, + "learning_rate": 5.214417511885539e-07, + "loss": 0.308, + "step": 15662 + }, + { + "epoch": 0.9, + "grad_norm": 0.6413421071522666, + "learning_rate": 5.208488435026992e-07, + "loss": 0.4395, + "step": 15663 + }, + { + "epoch": 0.9, + "grad_norm": 0.4767384537292822, + "learning_rate": 5.202562640794429e-07, + "loss": 0.2826, + "step": 15664 + }, + { + "epoch": 0.9, + "grad_norm": 0.38488084441750964, + "learning_rate": 5.196640129393038e-07, + "loss": 0.2458, + "step": 15665 + }, + { + "epoch": 0.9, + "grad_norm": 1.3259987938419826, + "learning_rate": 5.190720901027901e-07, + "loss": 0.5462, + "step": 15666 + }, + { + "epoch": 0.9, + "grad_norm": 0.2478349774975706, + "learning_rate": 5.184804955904066e-07, + "loss": 0.2062, + "step": 15667 + }, + { + "epoch": 0.9, + "grad_norm": 0.5459979152515864, + "learning_rate": 5.178892294226334e-07, + "loss": 0.3124, + "step": 15668 + }, + { + "epoch": 0.9, + "grad_norm": 0.35238957217762756, + "learning_rate": 5.172982916199465e-07, + "loss": 0.3126, + "step": 15669 + }, + { + "epoch": 0.9, + "grad_norm": 0.3040986720543595, + "learning_rate": 5.167076822028149e-07, + "loss": 0.2522, + "step": 15670 + }, + { + "epoch": 0.9, + "grad_norm": 0.23105199235660073, + "learning_rate": 5.16117401191687e-07, + "loss": 0.0663, + "step": 15671 + }, + { + "epoch": 0.9, + "grad_norm": 0.4992153585528372, + "learning_rate": 5.155274486070072e-07, + "loss": 0.3346, + "step": 15672 + }, + { + "epoch": 0.9, + "grad_norm": 0.261839200482087, + "learning_rate": 5.149378244692027e-07, + "loss": 0.2322, + "step": 15673 + }, + { + "epoch": 0.9, + "grad_norm": 0.5089769058171006, + "learning_rate": 5.143485287986927e-07, + "loss": 0.2294, + "step": 15674 + }, + { + "epoch": 0.9, + "grad_norm": 0.3794901247148647, + "learning_rate": 5.137595616158863e-07, + "loss": 0.2656, + "step": 15675 + }, + { + "epoch": 0.9, + "grad_norm": 0.33643466430036256, + "learning_rate": 5.131709229411785e-07, + "loss": 0.2438, + "step": 15676 + }, + { + "epoch": 0.9, + "grad_norm": 0.3647097642493535, + "learning_rate": 5.12582612794954e-07, + "loss": 0.282, + "step": 15677 + }, + { + "epoch": 0.9, + "grad_norm": 0.7738986859788042, + "learning_rate": 5.119946311975843e-07, + "loss": 0.2076, + "step": 15678 + }, + { + "epoch": 0.9, + "grad_norm": 0.34867075246133533, + "learning_rate": 5.114069781694331e-07, + "loss": 0.2548, + "step": 15679 + }, + { + "epoch": 0.9, + "grad_norm": 0.6216173151236968, + "learning_rate": 5.108196537308507e-07, + "loss": 0.3724, + "step": 15680 + }, + { + "epoch": 0.9, + "grad_norm": 0.2894107217697507, + "learning_rate": 5.102326579021754e-07, + "loss": 0.2367, + "step": 15681 + }, + { + "epoch": 0.9, + "grad_norm": 0.33805317461218487, + "learning_rate": 5.096459907037344e-07, + "loss": 0.2521, + "step": 15682 + }, + { + "epoch": 0.9, + "grad_norm": 0.30537643202596476, + "learning_rate": 5.090596521558455e-07, + "loss": 0.1848, + "step": 15683 + }, + { + "epoch": 0.9, + "grad_norm": 0.44721678914316987, + "learning_rate": 5.084736422788128e-07, + "loss": 0.2464, + "step": 15684 + }, + { + "epoch": 0.9, + "grad_norm": 0.25725829442115, + "learning_rate": 5.078879610929299e-07, + "loss": 0.2437, + "step": 15685 + }, + { + "epoch": 0.9, + "grad_norm": 0.6813280238885259, + "learning_rate": 5.073026086184785e-07, + "loss": 0.4465, + "step": 15686 + }, + { + "epoch": 0.9, + "grad_norm": 0.22755669708281842, + "learning_rate": 5.067175848757288e-07, + "loss": 0.1386, + "step": 15687 + }, + { + "epoch": 0.9, + "grad_norm": 0.23563842364379814, + "learning_rate": 5.061328898849416e-07, + "loss": 0.1959, + "step": 15688 + }, + { + "epoch": 0.9, + "grad_norm": 0.3534225275012728, + "learning_rate": 5.055485236663638e-07, + "loss": 0.2719, + "step": 15689 + }, + { + "epoch": 0.9, + "grad_norm": 0.6615023017392966, + "learning_rate": 5.049644862402336e-07, + "loss": 0.3489, + "step": 15690 + }, + { + "epoch": 0.9, + "grad_norm": 0.30468397890498616, + "learning_rate": 5.043807776267729e-07, + "loss": 0.203, + "step": 15691 + }, + { + "epoch": 0.9, + "grad_norm": 1.2458868096231006, + "learning_rate": 5.037973978461985e-07, + "loss": 0.6232, + "step": 15692 + }, + { + "epoch": 0.9, + "grad_norm": 0.3412161058565019, + "learning_rate": 5.032143469187123e-07, + "loss": 0.3043, + "step": 15693 + }, + { + "epoch": 0.9, + "grad_norm": 0.2687736892933335, + "learning_rate": 5.026316248645047e-07, + "loss": 0.1856, + "step": 15694 + }, + { + "epoch": 0.9, + "grad_norm": 0.2705685324835451, + "learning_rate": 5.020492317037539e-07, + "loss": 0.1572, + "step": 15695 + }, + { + "epoch": 0.9, + "grad_norm": 0.4139012116678503, + "learning_rate": 5.014671674566319e-07, + "loss": 0.3283, + "step": 15696 + }, + { + "epoch": 0.9, + "grad_norm": 0.304048179285271, + "learning_rate": 5.008854321432932e-07, + "loss": 0.1848, + "step": 15697 + }, + { + "epoch": 0.9, + "grad_norm": 0.5339104269414403, + "learning_rate": 5.003040257838831e-07, + "loss": 0.3313, + "step": 15698 + }, + { + "epoch": 0.9, + "grad_norm": 1.1617194889579199, + "learning_rate": 4.997229483985366e-07, + "loss": 0.4965, + "step": 15699 + }, + { + "epoch": 0.9, + "grad_norm": 0.4026837415595859, + "learning_rate": 4.991422000073753e-07, + "loss": 0.2646, + "step": 15700 + }, + { + "epoch": 0.9, + "grad_norm": 0.21280087447699006, + "learning_rate": 4.985617806305121e-07, + "loss": 0.1822, + "step": 15701 + }, + { + "epoch": 0.9, + "grad_norm": 0.7633811247449117, + "learning_rate": 4.979816902880441e-07, + "loss": 0.3467, + "step": 15702 + }, + { + "epoch": 0.9, + "grad_norm": 0.3855766888818858, + "learning_rate": 4.97401929000062e-07, + "loss": 0.2693, + "step": 15703 + }, + { + "epoch": 0.9, + "grad_norm": 0.34135917725265313, + "learning_rate": 4.968224967866431e-07, + "loss": 0.2474, + "step": 15704 + }, + { + "epoch": 0.9, + "grad_norm": 0.4960999651515975, + "learning_rate": 4.962433936678523e-07, + "loss": 0.3536, + "step": 15705 + }, + { + "epoch": 0.9, + "grad_norm": 0.4109252916237451, + "learning_rate": 4.956646196637438e-07, + "loss": 0.2418, + "step": 15706 + }, + { + "epoch": 0.9, + "grad_norm": 0.3962821887700003, + "learning_rate": 4.950861747943603e-07, + "loss": 0.203, + "step": 15707 + }, + { + "epoch": 0.9, + "grad_norm": 0.32311958760614523, + "learning_rate": 4.945080590797346e-07, + "loss": 0.2377, + "step": 15708 + }, + { + "epoch": 0.9, + "grad_norm": 0.2812146473652669, + "learning_rate": 4.939302725398865e-07, + "loss": 0.2372, + "step": 15709 + }, + { + "epoch": 0.9, + "grad_norm": 1.489036443344534, + "learning_rate": 4.933528151948241e-07, + "loss": 0.2296, + "step": 15710 + }, + { + "epoch": 0.9, + "grad_norm": 1.1842550860891952, + "learning_rate": 4.92775687064545e-07, + "loss": 0.8144, + "step": 15711 + }, + { + "epoch": 0.9, + "grad_norm": 0.3325210959542368, + "learning_rate": 4.921988881690332e-07, + "loss": 0.2314, + "step": 15712 + }, + { + "epoch": 0.9, + "grad_norm": 0.4057706489840598, + "learning_rate": 4.91622418528267e-07, + "loss": 0.3007, + "step": 15713 + }, + { + "epoch": 0.9, + "grad_norm": 0.33307672193546267, + "learning_rate": 4.910462781622072e-07, + "loss": 0.1754, + "step": 15714 + }, + { + "epoch": 0.9, + "grad_norm": 0.6227748994436654, + "learning_rate": 4.904704670908067e-07, + "loss": 0.3603, + "step": 15715 + }, + { + "epoch": 0.9, + "grad_norm": 0.42078372345346, + "learning_rate": 4.89894985334003e-07, + "loss": 0.2779, + "step": 15716 + }, + { + "epoch": 0.9, + "grad_norm": 0.30774036410484645, + "learning_rate": 4.893198329117277e-07, + "loss": 0.2554, + "step": 15717 + }, + { + "epoch": 0.9, + "grad_norm": 0.41423426847798006, + "learning_rate": 4.887450098438984e-07, + "loss": 0.2504, + "step": 15718 + }, + { + "epoch": 0.9, + "grad_norm": 0.4670151159835837, + "learning_rate": 4.881705161504202e-07, + "loss": 0.3101, + "step": 15719 + }, + { + "epoch": 0.9, + "grad_norm": 0.21198178399623216, + "learning_rate": 4.875963518511872e-07, + "loss": 0.1375, + "step": 15720 + }, + { + "epoch": 0.9, + "grad_norm": 0.3436783552350103, + "learning_rate": 4.870225169660836e-07, + "loss": 0.2452, + "step": 15721 + }, + { + "epoch": 0.9, + "grad_norm": 0.6392564449176125, + "learning_rate": 4.864490115149823e-07, + "loss": 0.3374, + "step": 15722 + }, + { + "epoch": 0.9, + "grad_norm": 0.781527813043474, + "learning_rate": 4.858758355177418e-07, + "loss": 0.337, + "step": 15723 + }, + { + "epoch": 0.9, + "grad_norm": 0.3130251635811382, + "learning_rate": 4.853029889942129e-07, + "loss": 0.2628, + "step": 15724 + }, + { + "epoch": 0.9, + "grad_norm": 0.35452990662259, + "learning_rate": 4.84730471964231e-07, + "loss": 0.2988, + "step": 15725 + }, + { + "epoch": 0.9, + "grad_norm": 0.41564857624579266, + "learning_rate": 4.841582844476244e-07, + "loss": 0.2544, + "step": 15726 + }, + { + "epoch": 0.9, + "grad_norm": 0.2698526571349199, + "learning_rate": 4.835864264642076e-07, + "loss": 0.1338, + "step": 15727 + }, + { + "epoch": 0.9, + "grad_norm": 0.5453059779722633, + "learning_rate": 4.830148980337834e-07, + "loss": 0.3428, + "step": 15728 + }, + { + "epoch": 0.9, + "grad_norm": 0.3471356881056745, + "learning_rate": 4.824436991761428e-07, + "loss": 0.3259, + "step": 15729 + }, + { + "epoch": 0.9, + "grad_norm": 0.34684817296127346, + "learning_rate": 4.818728299110686e-07, + "loss": 0.2164, + "step": 15730 + }, + { + "epoch": 0.9, + "grad_norm": 0.8953070877213622, + "learning_rate": 4.813022902583286e-07, + "loss": 0.4185, + "step": 15731 + }, + { + "epoch": 0.9, + "grad_norm": 0.41813356179096717, + "learning_rate": 4.807320802376824e-07, + "loss": 0.2948, + "step": 15732 + }, + { + "epoch": 0.9, + "grad_norm": 0.1447814308629961, + "learning_rate": 4.801621998688722e-07, + "loss": 0.0716, + "step": 15733 + }, + { + "epoch": 0.9, + "grad_norm": 0.3883904878065946, + "learning_rate": 4.795926491716396e-07, + "loss": 0.3139, + "step": 15734 + }, + { + "epoch": 0.9, + "grad_norm": 0.6516174961256956, + "learning_rate": 4.790234281657025e-07, + "loss": 0.3845, + "step": 15735 + }, + { + "epoch": 0.9, + "grad_norm": 0.3672484706266206, + "learning_rate": 4.784545368707738e-07, + "loss": 0.204, + "step": 15736 + }, + { + "epoch": 0.9, + "grad_norm": 0.32729802797217145, + "learning_rate": 4.778859753065545e-07, + "loss": 0.2943, + "step": 15737 + }, + { + "epoch": 0.9, + "grad_norm": 0.4014564746228084, + "learning_rate": 4.773177434927356e-07, + "loss": 0.1903, + "step": 15738 + }, + { + "epoch": 0.9, + "grad_norm": 0.42003454062639795, + "learning_rate": 4.767498414489935e-07, + "loss": 0.2329, + "step": 15739 + }, + { + "epoch": 0.9, + "grad_norm": 0.25069726353847577, + "learning_rate": 4.7618226919499465e-07, + "loss": 0.233, + "step": 15740 + }, + { + "epoch": 0.9, + "grad_norm": 0.7447684355120814, + "learning_rate": 4.756150267503934e-07, + "loss": 0.4077, + "step": 15741 + }, + { + "epoch": 0.9, + "grad_norm": 0.5399344736483198, + "learning_rate": 4.750481141348362e-07, + "loss": 0.3129, + "step": 15742 + }, + { + "epoch": 0.9, + "grad_norm": 0.34196695458758986, + "learning_rate": 4.7448153136795185e-07, + "loss": 0.2277, + "step": 15743 + }, + { + "epoch": 0.9, + "grad_norm": 0.38306387196871694, + "learning_rate": 4.739152784693635e-07, + "loss": 0.2897, + "step": 15744 + }, + { + "epoch": 0.9, + "grad_norm": 0.2161941592051655, + "learning_rate": 4.733493554586777e-07, + "loss": 0.1393, + "step": 15745 + }, + { + "epoch": 0.9, + "grad_norm": 0.359758705296901, + "learning_rate": 4.727837623554954e-07, + "loss": 0.2245, + "step": 15746 + }, + { + "epoch": 0.9, + "grad_norm": 0.6364944282856332, + "learning_rate": 4.722184991794021e-07, + "loss": 0.368, + "step": 15747 + }, + { + "epoch": 0.9, + "grad_norm": 0.29869548642001714, + "learning_rate": 4.7165356594997215e-07, + "loss": 0.2794, + "step": 15748 + }, + { + "epoch": 0.9, + "grad_norm": 0.3514622933369766, + "learning_rate": 4.710889626867687e-07, + "loss": 0.2274, + "step": 15749 + }, + { + "epoch": 0.9, + "grad_norm": 0.534507202273457, + "learning_rate": 4.7052468940934405e-07, + "loss": 0.2445, + "step": 15750 + }, + { + "epoch": 0.9, + "grad_norm": 0.40165702496329014, + "learning_rate": 4.699607461372413e-07, + "loss": 0.2209, + "step": 15751 + }, + { + "epoch": 0.91, + "grad_norm": 0.29933132482387254, + "learning_rate": 4.6939713288998824e-07, + "loss": 0.2888, + "step": 15752 + }, + { + "epoch": 0.91, + "grad_norm": 0.39195373443976395, + "learning_rate": 4.6883384968710146e-07, + "loss": 0.275, + "step": 15753 + }, + { + "epoch": 0.91, + "grad_norm": 0.804794875047898, + "learning_rate": 4.682708965480887e-07, + "loss": 0.447, + "step": 15754 + }, + { + "epoch": 0.91, + "grad_norm": 0.38284619581894447, + "learning_rate": 4.677082734924454e-07, + "loss": 0.2555, + "step": 15755 + }, + { + "epoch": 0.91, + "grad_norm": 0.34044252869584235, + "learning_rate": 4.67145980539655e-07, + "loss": 0.2351, + "step": 15756 + }, + { + "epoch": 0.91, + "grad_norm": 0.2561805020204338, + "learning_rate": 4.665840177091885e-07, + "loss": 0.1603, + "step": 15757 + }, + { + "epoch": 0.91, + "grad_norm": 0.3286363690720469, + "learning_rate": 4.66022385020507e-07, + "loss": 0.2712, + "step": 15758 + }, + { + "epoch": 0.91, + "grad_norm": 0.7834022205802545, + "learning_rate": 4.6546108249306163e-07, + "loss": 0.3245, + "step": 15759 + }, + { + "epoch": 0.91, + "grad_norm": 0.2849494240717211, + "learning_rate": 4.649001101462891e-07, + "loss": 0.2749, + "step": 15760 + }, + { + "epoch": 0.91, + "grad_norm": 0.3713613181819242, + "learning_rate": 4.6433946799961605e-07, + "loss": 0.2917, + "step": 15761 + }, + { + "epoch": 0.91, + "grad_norm": 1.3943161939139344, + "learning_rate": 4.6377915607245583e-07, + "loss": 0.218, + "step": 15762 + }, + { + "epoch": 0.91, + "grad_norm": 0.35360704239027846, + "learning_rate": 4.6321917438421294e-07, + "loss": 0.2162, + "step": 15763 + }, + { + "epoch": 0.91, + "grad_norm": 0.29520125023410765, + "learning_rate": 4.626595229542818e-07, + "loss": 0.2779, + "step": 15764 + }, + { + "epoch": 0.91, + "grad_norm": 0.40025625010104504, + "learning_rate": 4.621002018020404e-07, + "loss": 0.3336, + "step": 15765 + }, + { + "epoch": 0.91, + "grad_norm": 0.18183834726311107, + "learning_rate": 4.615412109468587e-07, + "loss": 0.1206, + "step": 15766 + }, + { + "epoch": 0.91, + "grad_norm": 0.3861098147618549, + "learning_rate": 4.6098255040809447e-07, + "loss": 0.275, + "step": 15767 + }, + { + "epoch": 0.91, + "grad_norm": 0.39267454900302395, + "learning_rate": 4.604242202050957e-07, + "loss": 0.2835, + "step": 15768 + }, + { + "epoch": 0.91, + "grad_norm": 0.8010348120564672, + "learning_rate": 4.5986622035719575e-07, + "loss": 0.1828, + "step": 15769 + }, + { + "epoch": 0.91, + "grad_norm": 0.35292055566037195, + "learning_rate": 4.59308550883717e-07, + "loss": 0.2744, + "step": 15770 + }, + { + "epoch": 0.91, + "grad_norm": 0.44243941838987, + "learning_rate": 4.5875121180397276e-07, + "loss": 0.2978, + "step": 15771 + }, + { + "epoch": 0.91, + "grad_norm": 0.29214452693872744, + "learning_rate": 4.581942031372655e-07, + "loss": 0.2196, + "step": 15772 + }, + { + "epoch": 0.91, + "grad_norm": 0.3078542986697614, + "learning_rate": 4.5763752490288194e-07, + "loss": 0.242, + "step": 15773 + }, + { + "epoch": 0.91, + "grad_norm": 0.8844199684923539, + "learning_rate": 4.570811771201e-07, + "loss": 0.4156, + "step": 15774 + }, + { + "epoch": 0.91, + "grad_norm": 0.6335765581868591, + "learning_rate": 4.5652515980818546e-07, + "loss": 0.3745, + "step": 15775 + }, + { + "epoch": 0.91, + "grad_norm": 0.22674350740370733, + "learning_rate": 4.5596947298639614e-07, + "loss": 0.212, + "step": 15776 + }, + { + "epoch": 0.91, + "grad_norm": 1.7659474533173485, + "learning_rate": 4.554141166739734e-07, + "loss": 0.5087, + "step": 15777 + }, + { + "epoch": 0.91, + "grad_norm": 0.2534645385419527, + "learning_rate": 4.548590908901496e-07, + "loss": 0.1883, + "step": 15778 + }, + { + "epoch": 0.91, + "grad_norm": 0.283779004129321, + "learning_rate": 4.5430439565414263e-07, + "loss": 0.1839, + "step": 15779 + }, + { + "epoch": 0.91, + "grad_norm": 0.3476013206398067, + "learning_rate": 4.5375003098516613e-07, + "loss": 0.2717, + "step": 15780 + }, + { + "epoch": 0.91, + "grad_norm": 0.5541988538944898, + "learning_rate": 4.5319599690241576e-07, + "loss": 0.3387, + "step": 15781 + }, + { + "epoch": 0.91, + "grad_norm": 0.34841705919786675, + "learning_rate": 4.5264229342507736e-07, + "loss": 0.2278, + "step": 15782 + }, + { + "epoch": 0.91, + "grad_norm": 1.2826263441085395, + "learning_rate": 4.5208892057232446e-07, + "loss": 0.6443, + "step": 15783 + }, + { + "epoch": 0.91, + "grad_norm": 0.25060351873355025, + "learning_rate": 4.515358783633228e-07, + "loss": 0.238, + "step": 15784 + }, + { + "epoch": 0.91, + "grad_norm": 0.2457619049739712, + "learning_rate": 4.5098316681722266e-07, + "loss": 0.1596, + "step": 15785 + }, + { + "epoch": 0.91, + "grad_norm": 0.7165815667752252, + "learning_rate": 4.5043078595316536e-07, + "loss": 0.3654, + "step": 15786 + }, + { + "epoch": 0.91, + "grad_norm": 0.45627630512144096, + "learning_rate": 4.4987873579027784e-07, + "loss": 0.3349, + "step": 15787 + }, + { + "epoch": 0.91, + "grad_norm": 0.3075011835404249, + "learning_rate": 4.493270163476804e-07, + "loss": 0.2568, + "step": 15788 + }, + { + "epoch": 0.91, + "grad_norm": 0.48719044332200573, + "learning_rate": 4.4877562764447766e-07, + "loss": 0.2597, + "step": 15789 + }, + { + "epoch": 0.91, + "grad_norm": 0.5000417018144054, + "learning_rate": 4.4822456969976444e-07, + "loss": 0.2893, + "step": 15790 + }, + { + "epoch": 0.91, + "grad_norm": 0.2556509931355065, + "learning_rate": 4.4767384253262326e-07, + "loss": 0.2022, + "step": 15791 + }, + { + "epoch": 0.91, + "grad_norm": 0.3334183800000595, + "learning_rate": 4.4712344616212433e-07, + "loss": 0.246, + "step": 15792 + }, + { + "epoch": 0.91, + "grad_norm": 0.978515196075592, + "learning_rate": 4.4657338060733246e-07, + "loss": 0.4858, + "step": 15793 + }, + { + "epoch": 0.91, + "grad_norm": 0.3399680398954725, + "learning_rate": 4.4602364588729243e-07, + "loss": 0.2275, + "step": 15794 + }, + { + "epoch": 0.91, + "grad_norm": 1.5585273086353948, + "learning_rate": 4.454742420210434e-07, + "loss": 0.3749, + "step": 15795 + }, + { + "epoch": 0.91, + "grad_norm": 0.3308699135535816, + "learning_rate": 4.44925169027608e-07, + "loss": 0.3088, + "step": 15796 + }, + { + "epoch": 0.91, + "grad_norm": 0.24890530106124995, + "learning_rate": 4.4437642692600534e-07, + "loss": 0.1998, + "step": 15797 + }, + { + "epoch": 0.91, + "grad_norm": 0.42886569435392907, + "learning_rate": 4.4382801573523595e-07, + "loss": 0.1926, + "step": 15798 + }, + { + "epoch": 0.91, + "grad_norm": 0.3308619773627193, + "learning_rate": 4.4327993547429225e-07, + "loss": 0.2951, + "step": 15799 + }, + { + "epoch": 0.91, + "grad_norm": 0.32885301500455993, + "learning_rate": 4.427321861621514e-07, + "loss": 0.2736, + "step": 15800 + }, + { + "epoch": 0.91, + "grad_norm": 1.1618546884805907, + "learning_rate": 4.4218476781778483e-07, + "loss": 0.5503, + "step": 15801 + }, + { + "epoch": 0.91, + "grad_norm": 0.35167729216980037, + "learning_rate": 4.416376804601508e-07, + "loss": 0.1287, + "step": 15802 + }, + { + "epoch": 0.91, + "grad_norm": 0.3502201067992776, + "learning_rate": 4.410909241081918e-07, + "loss": 0.2668, + "step": 15803 + }, + { + "epoch": 0.91, + "grad_norm": 0.3340832982352592, + "learning_rate": 4.405444987808405e-07, + "loss": 0.2879, + "step": 15804 + }, + { + "epoch": 0.91, + "grad_norm": 0.43546774821008033, + "learning_rate": 4.39998404497024e-07, + "loss": 0.1536, + "step": 15805 + }, + { + "epoch": 0.91, + "grad_norm": 0.3112353537779798, + "learning_rate": 4.3945264127565166e-07, + "loss": 0.2528, + "step": 15806 + }, + { + "epoch": 0.91, + "grad_norm": 0.551693833136559, + "learning_rate": 4.389072091356239e-07, + "loss": 0.3786, + "step": 15807 + }, + { + "epoch": 0.91, + "grad_norm": 0.49880144907478224, + "learning_rate": 4.383621080958267e-07, + "loss": 0.2335, + "step": 15808 + }, + { + "epoch": 0.91, + "grad_norm": 0.3142694304193792, + "learning_rate": 4.378173381751394e-07, + "loss": 0.2585, + "step": 15809 + }, + { + "epoch": 0.91, + "grad_norm": 0.40112648470346424, + "learning_rate": 4.372728993924269e-07, + "loss": 0.266, + "step": 15810 + }, + { + "epoch": 0.91, + "grad_norm": 0.2930212171494269, + "learning_rate": 4.3672879176654303e-07, + "loss": 0.1907, + "step": 15811 + }, + { + "epoch": 0.91, + "grad_norm": 0.34939185440615567, + "learning_rate": 4.3618501531632717e-07, + "loss": 0.282, + "step": 15812 + }, + { + "epoch": 0.91, + "grad_norm": 1.1777285605456114, + "learning_rate": 4.3564157006061535e-07, + "loss": 0.685, + "step": 15813 + }, + { + "epoch": 0.91, + "grad_norm": 0.7188027543719689, + "learning_rate": 4.3509845601822474e-07, + "loss": 0.4602, + "step": 15814 + }, + { + "epoch": 0.91, + "grad_norm": 0.30789053770206504, + "learning_rate": 4.3455567320796366e-07, + "loss": 0.216, + "step": 15815 + }, + { + "epoch": 0.91, + "grad_norm": 0.37512687043786214, + "learning_rate": 4.34013221648627e-07, + "loss": 0.3312, + "step": 15816 + }, + { + "epoch": 0.91, + "grad_norm": 0.3616392023467653, + "learning_rate": 4.3347110135900094e-07, + "loss": 0.1928, + "step": 15817 + }, + { + "epoch": 0.91, + "grad_norm": 0.31199271737258344, + "learning_rate": 4.329293123578604e-07, + "loss": 0.1853, + "step": 15818 + }, + { + "epoch": 0.91, + "grad_norm": 0.4895600138237852, + "learning_rate": 4.3238785466396596e-07, + "loss": 0.3757, + "step": 15819 + }, + { + "epoch": 0.91, + "grad_norm": 0.45924672686202866, + "learning_rate": 4.318467282960681e-07, + "loss": 0.3507, + "step": 15820 + }, + { + "epoch": 0.91, + "grad_norm": 0.3120934451623118, + "learning_rate": 4.3130593327290637e-07, + "loss": 0.1888, + "step": 15821 + }, + { + "epoch": 0.91, + "grad_norm": 0.770298360679297, + "learning_rate": 4.307654696132102e-07, + "loss": 0.3835, + "step": 15822 + }, + { + "epoch": 0.91, + "grad_norm": 0.21490372065099173, + "learning_rate": 4.302253373356935e-07, + "loss": 0.2062, + "step": 15823 + }, + { + "epoch": 0.91, + "grad_norm": 0.2986581235557719, + "learning_rate": 4.296855364590624e-07, + "loss": 0.1844, + "step": 15824 + }, + { + "epoch": 0.91, + "grad_norm": 1.069549265915451, + "learning_rate": 4.2914606700200755e-07, + "loss": 0.7423, + "step": 15825 + }, + { + "epoch": 0.91, + "grad_norm": 0.6599039669777897, + "learning_rate": 4.286069289832151e-07, + "loss": 0.3809, + "step": 15826 + }, + { + "epoch": 0.91, + "grad_norm": 0.419239808764103, + "learning_rate": 4.280681224213523e-07, + "loss": 0.2807, + "step": 15827 + }, + { + "epoch": 0.91, + "grad_norm": 0.31727020613520757, + "learning_rate": 4.2752964733507984e-07, + "loss": 0.2382, + "step": 15828 + }, + { + "epoch": 0.91, + "grad_norm": 0.29397819184063156, + "learning_rate": 4.2699150374304275e-07, + "loss": 0.1889, + "step": 15829 + }, + { + "epoch": 0.91, + "grad_norm": 0.4446204152271284, + "learning_rate": 4.2645369166387727e-07, + "loss": 0.2613, + "step": 15830 + }, + { + "epoch": 0.91, + "grad_norm": 0.39269983702966865, + "learning_rate": 4.259162111162107e-07, + "loss": 0.2553, + "step": 15831 + }, + { + "epoch": 0.91, + "grad_norm": 0.45515828284942017, + "learning_rate": 4.2537906211865375e-07, + "loss": 0.3144, + "step": 15832 + }, + { + "epoch": 0.91, + "grad_norm": 0.3112784755800918, + "learning_rate": 4.2484224468980815e-07, + "loss": 0.2632, + "step": 15833 + }, + { + "epoch": 0.91, + "grad_norm": 1.8587599044679906, + "learning_rate": 4.243057588482624e-07, + "loss": 0.2075, + "step": 15834 + }, + { + "epoch": 0.91, + "grad_norm": 0.23261596393289166, + "learning_rate": 4.237696046125994e-07, + "loss": 0.2007, + "step": 15835 + }, + { + "epoch": 0.91, + "grad_norm": 0.4171560765483998, + "learning_rate": 4.232337820013821e-07, + "loss": 0.2908, + "step": 15836 + }, + { + "epoch": 0.91, + "grad_norm": 0.6894450007615852, + "learning_rate": 4.226982910331656e-07, + "loss": 0.2665, + "step": 15837 + }, + { + "epoch": 0.91, + "grad_norm": 0.631586559127634, + "learning_rate": 4.2216313172649623e-07, + "loss": 0.3694, + "step": 15838 + }, + { + "epoch": 0.91, + "grad_norm": 0.3701344195302594, + "learning_rate": 4.2162830409990583e-07, + "loss": 0.2664, + "step": 15839 + }, + { + "epoch": 0.91, + "grad_norm": 0.32238862938044927, + "learning_rate": 4.2109380817191626e-07, + "loss": 0.2694, + "step": 15840 + }, + { + "epoch": 0.91, + "grad_norm": 0.36861815680269605, + "learning_rate": 4.205596439610349e-07, + "loss": 0.1159, + "step": 15841 + }, + { + "epoch": 0.91, + "grad_norm": 0.3877743415272305, + "learning_rate": 4.2002581148576136e-07, + "loss": 0.2594, + "step": 15842 + }, + { + "epoch": 0.91, + "grad_norm": 0.35220346095956107, + "learning_rate": 4.194923107645821e-07, + "loss": 0.3235, + "step": 15843 + }, + { + "epoch": 0.91, + "grad_norm": 0.5239046538366025, + "learning_rate": 4.189591418159722e-07, + "loss": 0.2708, + "step": 15844 + }, + { + "epoch": 0.91, + "grad_norm": 0.39195351825581, + "learning_rate": 4.1842630465839586e-07, + "loss": 0.2784, + "step": 15845 + }, + { + "epoch": 0.91, + "grad_norm": 0.5495615089399805, + "learning_rate": 4.178937993103027e-07, + "loss": 0.3197, + "step": 15846 + }, + { + "epoch": 0.91, + "grad_norm": 0.27379809113010045, + "learning_rate": 4.1736162579013694e-07, + "loss": 0.1906, + "step": 15847 + }, + { + "epoch": 0.91, + "grad_norm": 0.2684468644693002, + "learning_rate": 4.16829784116326e-07, + "loss": 0.2042, + "step": 15848 + }, + { + "epoch": 0.91, + "grad_norm": 0.4441215646728875, + "learning_rate": 4.1629827430728743e-07, + "loss": 0.2991, + "step": 15849 + }, + { + "epoch": 0.91, + "grad_norm": 0.7808231682802714, + "learning_rate": 4.157670963814264e-07, + "loss": 0.3301, + "step": 15850 + }, + { + "epoch": 0.91, + "grad_norm": 0.2581293927684206, + "learning_rate": 4.1523625035713943e-07, + "loss": 0.2583, + "step": 15851 + }, + { + "epoch": 0.91, + "grad_norm": 0.6004003120556647, + "learning_rate": 4.147057362528095e-07, + "loss": 0.3379, + "step": 15852 + }, + { + "epoch": 0.91, + "grad_norm": 0.5608318585976009, + "learning_rate": 4.141755540868075e-07, + "loss": 0.2723, + "step": 15853 + }, + { + "epoch": 0.91, + "grad_norm": 0.3815204124123482, + "learning_rate": 4.1364570387749324e-07, + "loss": 0.213, + "step": 15854 + }, + { + "epoch": 0.91, + "grad_norm": 0.3536760296474735, + "learning_rate": 4.1311618564321534e-07, + "loss": 0.2938, + "step": 15855 + }, + { + "epoch": 0.91, + "grad_norm": 0.4356058557524649, + "learning_rate": 4.1258699940231353e-07, + "loss": 0.355, + "step": 15856 + }, + { + "epoch": 0.91, + "grad_norm": 0.18578229436004529, + "learning_rate": 4.120581451731109e-07, + "loss": 0.0713, + "step": 15857 + }, + { + "epoch": 0.91, + "grad_norm": 0.4081226800618796, + "learning_rate": 4.1152962297392297e-07, + "loss": 0.273, + "step": 15858 + }, + { + "epoch": 0.91, + "grad_norm": 0.37698659558030323, + "learning_rate": 4.110014328230505e-07, + "loss": 0.3109, + "step": 15859 + }, + { + "epoch": 0.91, + "grad_norm": 0.5250626358401506, + "learning_rate": 4.104735747387867e-07, + "loss": 0.2035, + "step": 15860 + }, + { + "epoch": 0.91, + "grad_norm": 0.3725837020347488, + "learning_rate": 4.099460487394114e-07, + "loss": 0.3016, + "step": 15861 + }, + { + "epoch": 0.91, + "grad_norm": 1.2275656487782942, + "learning_rate": 4.09418854843191e-07, + "loss": 0.7537, + "step": 15862 + }, + { + "epoch": 0.91, + "grad_norm": 0.2383154149448102, + "learning_rate": 4.0889199306838323e-07, + "loss": 0.2219, + "step": 15863 + }, + { + "epoch": 0.91, + "grad_norm": 0.3139643851031367, + "learning_rate": 4.083654634332335e-07, + "loss": 0.1762, + "step": 15864 + }, + { + "epoch": 0.91, + "grad_norm": 0.9530704152814369, + "learning_rate": 4.07839265955976e-07, + "loss": 0.4449, + "step": 15865 + }, + { + "epoch": 0.91, + "grad_norm": 0.5492062970741735, + "learning_rate": 4.073134006548318e-07, + "loss": 0.2962, + "step": 15866 + }, + { + "epoch": 0.91, + "grad_norm": 0.25208480398575406, + "learning_rate": 4.06787867548013e-07, + "loss": 0.2353, + "step": 15867 + }, + { + "epoch": 0.91, + "grad_norm": 1.1304469249273643, + "learning_rate": 4.062626666537162e-07, + "loss": 0.8001, + "step": 15868 + }, + { + "epoch": 0.91, + "grad_norm": 0.19056843516548483, + "learning_rate": 4.0573779799013226e-07, + "loss": 0.1272, + "step": 15869 + }, + { + "epoch": 0.91, + "grad_norm": 0.3996361879336976, + "learning_rate": 4.0521326157543563e-07, + "loss": 0.2277, + "step": 15870 + }, + { + "epoch": 0.91, + "grad_norm": 0.3637325778437535, + "learning_rate": 4.046890574277895e-07, + "loss": 0.2886, + "step": 15871 + }, + { + "epoch": 0.91, + "grad_norm": 0.5237881779689043, + "learning_rate": 4.0416518556534944e-07, + "loss": 0.3128, + "step": 15872 + }, + { + "epoch": 0.91, + "grad_norm": 0.3676819772907118, + "learning_rate": 4.0364164600625753e-07, + "loss": 0.2235, + "step": 15873 + }, + { + "epoch": 0.91, + "grad_norm": 0.47406659233561166, + "learning_rate": 4.0311843876864155e-07, + "loss": 0.3483, + "step": 15874 + }, + { + "epoch": 0.91, + "grad_norm": 0.2780747777411099, + "learning_rate": 4.025955638706203e-07, + "loss": 0.2054, + "step": 15875 + }, + { + "epoch": 0.91, + "grad_norm": 0.28877513130409865, + "learning_rate": 4.020730213303037e-07, + "loss": 0.2163, + "step": 15876 + }, + { + "epoch": 0.91, + "grad_norm": 0.6012066973017609, + "learning_rate": 4.015508111657862e-07, + "loss": 0.2896, + "step": 15877 + }, + { + "epoch": 0.91, + "grad_norm": 0.77096149159783, + "learning_rate": 4.0102893339515e-07, + "loss": 0.3788, + "step": 15878 + }, + { + "epoch": 0.91, + "grad_norm": 0.24983255991293724, + "learning_rate": 4.005073880364696e-07, + "loss": 0.2577, + "step": 15879 + }, + { + "epoch": 0.91, + "grad_norm": 0.8911082860363854, + "learning_rate": 3.999861751078049e-07, + "loss": 0.5244, + "step": 15880 + }, + { + "epoch": 0.91, + "grad_norm": 0.21183230763143193, + "learning_rate": 3.994652946272071e-07, + "loss": 0.1423, + "step": 15881 + }, + { + "epoch": 0.91, + "grad_norm": 0.39131821841894676, + "learning_rate": 3.989447466127128e-07, + "loss": 0.2829, + "step": 15882 + }, + { + "epoch": 0.91, + "grad_norm": 0.32030932741388807, + "learning_rate": 3.984245310823498e-07, + "loss": 0.2383, + "step": 15883 + }, + { + "epoch": 0.91, + "grad_norm": 0.5922727533151578, + "learning_rate": 3.9790464805413044e-07, + "loss": 0.3336, + "step": 15884 + }, + { + "epoch": 0.91, + "grad_norm": 0.3930583813091873, + "learning_rate": 3.973850975460614e-07, + "loss": 0.33, + "step": 15885 + }, + { + "epoch": 0.91, + "grad_norm": 0.4788539037882134, + "learning_rate": 3.9686587957613377e-07, + "loss": 0.2603, + "step": 15886 + }, + { + "epoch": 0.91, + "grad_norm": 0.2870566855876852, + "learning_rate": 3.963469941623288e-07, + "loss": 0.2128, + "step": 15887 + }, + { + "epoch": 0.91, + "grad_norm": 0.2673197346089193, + "learning_rate": 3.958284413226121e-07, + "loss": 0.2114, + "step": 15888 + }, + { + "epoch": 0.91, + "grad_norm": 0.5354140355201578, + "learning_rate": 3.9531022107494486e-07, + "loss": 0.3267, + "step": 15889 + }, + { + "epoch": 0.91, + "grad_norm": 0.35721314864795733, + "learning_rate": 3.9479233343727165e-07, + "loss": 0.1804, + "step": 15890 + }, + { + "epoch": 0.91, + "grad_norm": 0.28916473867304665, + "learning_rate": 3.9427477842752693e-07, + "loss": 0.2645, + "step": 15891 + }, + { + "epoch": 0.91, + "grad_norm": 1.0814982506679522, + "learning_rate": 3.9375755606363306e-07, + "loss": 0.6821, + "step": 15892 + }, + { + "epoch": 0.91, + "grad_norm": 0.4496409680340063, + "learning_rate": 3.9324066636350136e-07, + "loss": 0.1017, + "step": 15893 + }, + { + "epoch": 0.91, + "grad_norm": 0.3384135104379383, + "learning_rate": 3.92724109345034e-07, + "loss": 0.2612, + "step": 15894 + }, + { + "epoch": 0.91, + "grad_norm": 0.2919069332707678, + "learning_rate": 3.922078850261168e-07, + "loss": 0.2677, + "step": 15895 + }, + { + "epoch": 0.91, + "grad_norm": 1.075665763003214, + "learning_rate": 3.9169199342462774e-07, + "loss": 0.1625, + "step": 15896 + }, + { + "epoch": 0.91, + "grad_norm": 0.3534241016716571, + "learning_rate": 3.9117643455843016e-07, + "loss": 0.2953, + "step": 15897 + }, + { + "epoch": 0.91, + "grad_norm": 0.5174200198693831, + "learning_rate": 3.906612084453809e-07, + "loss": 0.3611, + "step": 15898 + }, + { + "epoch": 0.91, + "grad_norm": 0.408525886252803, + "learning_rate": 3.9014631510332135e-07, + "loss": 0.2211, + "step": 15899 + }, + { + "epoch": 0.91, + "grad_norm": 0.396189831826831, + "learning_rate": 3.896317545500805e-07, + "loss": 0.2782, + "step": 15900 + }, + { + "epoch": 0.91, + "grad_norm": 0.29171815179801414, + "learning_rate": 3.8911752680347857e-07, + "loss": 0.1854, + "step": 15901 + }, + { + "epoch": 0.91, + "grad_norm": 0.3603406957800757, + "learning_rate": 3.8860363188132356e-07, + "loss": 0.2891, + "step": 15902 + }, + { + "epoch": 0.91, + "grad_norm": 0.2890968051915757, + "learning_rate": 3.880900698014134e-07, + "loss": 0.1957, + "step": 15903 + }, + { + "epoch": 0.91, + "grad_norm": 1.2876865594992615, + "learning_rate": 3.8757684058152947e-07, + "loss": 0.7166, + "step": 15904 + }, + { + "epoch": 0.91, + "grad_norm": 0.602658793155093, + "learning_rate": 3.8706394423944524e-07, + "loss": 0.3449, + "step": 15905 + }, + { + "epoch": 0.91, + "grad_norm": 0.34185570155092393, + "learning_rate": 3.8655138079292444e-07, + "loss": 0.2216, + "step": 15906 + }, + { + "epoch": 0.91, + "grad_norm": 0.2891611853218796, + "learning_rate": 3.8603915025971605e-07, + "loss": 0.2511, + "step": 15907 + }, + { + "epoch": 0.91, + "grad_norm": 0.3595831908817427, + "learning_rate": 3.855272526575582e-07, + "loss": 0.2175, + "step": 15908 + }, + { + "epoch": 0.91, + "grad_norm": 0.31805676469838995, + "learning_rate": 3.8501568800417663e-07, + "loss": 0.2128, + "step": 15909 + }, + { + "epoch": 0.91, + "grad_norm": 0.3700899135798883, + "learning_rate": 3.845044563172895e-07, + "loss": 0.3155, + "step": 15910 + }, + { + "epoch": 0.91, + "grad_norm": 0.6586535137821252, + "learning_rate": 3.8399355761460036e-07, + "loss": 0.3655, + "step": 15911 + }, + { + "epoch": 0.91, + "grad_norm": 0.3068097109261226, + "learning_rate": 3.8348299191380057e-07, + "loss": 0.2049, + "step": 15912 + }, + { + "epoch": 0.91, + "grad_norm": 0.288649576944417, + "learning_rate": 3.8297275923256936e-07, + "loss": 0.1484, + "step": 15913 + }, + { + "epoch": 0.91, + "grad_norm": 0.3339644155133282, + "learning_rate": 3.824628595885793e-07, + "loss": 0.2961, + "step": 15914 + }, + { + "epoch": 0.91, + "grad_norm": 0.3166963614134166, + "learning_rate": 3.8195329299948737e-07, + "loss": 0.2589, + "step": 15915 + }, + { + "epoch": 0.91, + "grad_norm": 0.8291946937549082, + "learning_rate": 3.814440594829394e-07, + "loss": 0.3325, + "step": 15916 + }, + { + "epoch": 0.91, + "grad_norm": 0.6188592393447846, + "learning_rate": 3.8093515905656797e-07, + "loss": 0.3916, + "step": 15917 + }, + { + "epoch": 0.91, + "grad_norm": 0.3410052252876215, + "learning_rate": 3.804265917380001e-07, + "loss": 0.2626, + "step": 15918 + }, + { + "epoch": 0.91, + "grad_norm": 0.3440402045586406, + "learning_rate": 3.7991835754484616e-07, + "loss": 0.224, + "step": 15919 + }, + { + "epoch": 0.91, + "grad_norm": 0.2800763142332023, + "learning_rate": 3.794104564947054e-07, + "loss": 0.1761, + "step": 15920 + }, + { + "epoch": 0.91, + "grad_norm": 0.31545824296470104, + "learning_rate": 3.789028886051671e-07, + "loss": 0.2512, + "step": 15921 + }, + { + "epoch": 0.91, + "grad_norm": 0.5130003457284722, + "learning_rate": 3.7839565389380606e-07, + "loss": 0.2595, + "step": 15922 + }, + { + "epoch": 0.91, + "grad_norm": 0.4206333487665775, + "learning_rate": 3.7788875237819156e-07, + "loss": 0.3264, + "step": 15923 + }, + { + "epoch": 0.91, + "grad_norm": 0.4075238314358624, + "learning_rate": 3.7738218407587514e-07, + "loss": 0.2339, + "step": 15924 + }, + { + "epoch": 0.91, + "grad_norm": 0.40120391979014103, + "learning_rate": 3.768759490044005e-07, + "loss": 0.1618, + "step": 15925 + }, + { + "epoch": 0.92, + "grad_norm": 0.24959491992631752, + "learning_rate": 3.76370047181297e-07, + "loss": 0.2329, + "step": 15926 + }, + { + "epoch": 0.92, + "grad_norm": 0.35098746815589077, + "learning_rate": 3.7586447862408617e-07, + "loss": 0.2625, + "step": 15927 + }, + { + "epoch": 0.92, + "grad_norm": 0.6981973004763733, + "learning_rate": 3.7535924335027396e-07, + "loss": 0.4138, + "step": 15928 + }, + { + "epoch": 0.92, + "grad_norm": 0.7501774342098396, + "learning_rate": 3.7485434137735754e-07, + "loss": 0.2783, + "step": 15929 + }, + { + "epoch": 0.92, + "grad_norm": 0.3149510966890482, + "learning_rate": 3.743497727228207e-07, + "loss": 0.2552, + "step": 15930 + }, + { + "epoch": 0.92, + "grad_norm": 0.3666851503761762, + "learning_rate": 3.738455374041372e-07, + "loss": 0.3152, + "step": 15931 + }, + { + "epoch": 0.92, + "grad_norm": 0.1810162314400088, + "learning_rate": 3.7334163543876977e-07, + "loss": 0.098, + "step": 15932 + }, + { + "epoch": 0.92, + "grad_norm": 0.304726241285453, + "learning_rate": 3.7283806684416777e-07, + "loss": 0.259, + "step": 15933 + }, + { + "epoch": 0.92, + "grad_norm": 0.3382470948180131, + "learning_rate": 3.723348316377695e-07, + "loss": 0.2932, + "step": 15934 + }, + { + "epoch": 0.92, + "grad_norm": 0.8426507956023479, + "learning_rate": 3.718319298369999e-07, + "loss": 0.3081, + "step": 15935 + }, + { + "epoch": 0.92, + "grad_norm": 0.35343634631778686, + "learning_rate": 3.7132936145927835e-07, + "loss": 0.2407, + "step": 15936 + }, + { + "epoch": 0.92, + "grad_norm": 1.545706171785692, + "learning_rate": 3.708271265220087e-07, + "loss": 0.5846, + "step": 15937 + }, + { + "epoch": 0.92, + "grad_norm": 0.2584584444218997, + "learning_rate": 3.703252250425782e-07, + "loss": 0.2462, + "step": 15938 + }, + { + "epoch": 0.92, + "grad_norm": 0.28528019760679263, + "learning_rate": 3.6982365703837286e-07, + "loss": 0.2056, + "step": 15939 + }, + { + "epoch": 0.92, + "grad_norm": 0.6252324364531351, + "learning_rate": 3.6932242252675997e-07, + "loss": 0.3615, + "step": 15940 + }, + { + "epoch": 0.92, + "grad_norm": 0.5203778132134718, + "learning_rate": 3.6882152152509674e-07, + "loss": 0.2695, + "step": 15941 + }, + { + "epoch": 0.92, + "grad_norm": 0.28609701313106567, + "learning_rate": 3.683209540507304e-07, + "loss": 0.1995, + "step": 15942 + }, + { + "epoch": 0.92, + "grad_norm": 0.4592887764684259, + "learning_rate": 3.678207201209949e-07, + "loss": 0.2873, + "step": 15943 + }, + { + "epoch": 0.92, + "grad_norm": 0.4645147472870708, + "learning_rate": 3.673208197532152e-07, + "loss": 0.2766, + "step": 15944 + }, + { + "epoch": 0.92, + "grad_norm": 0.34419848756733823, + "learning_rate": 3.6682125296469973e-07, + "loss": 0.1808, + "step": 15945 + }, + { + "epoch": 0.92, + "grad_norm": 0.3378083035715725, + "learning_rate": 3.6632201977275126e-07, + "loss": 0.3049, + "step": 15946 + }, + { + "epoch": 0.92, + "grad_norm": 0.45811410825743665, + "learning_rate": 3.658231201946549e-07, + "loss": 0.2619, + "step": 15947 + }, + { + "epoch": 0.92, + "grad_norm": 0.28931462287240073, + "learning_rate": 3.6532455424769133e-07, + "loss": 0.1857, + "step": 15948 + }, + { + "epoch": 0.92, + "grad_norm": 1.285888454753432, + "learning_rate": 3.6482632194912436e-07, + "loss": 0.4671, + "step": 15949 + }, + { + "epoch": 0.92, + "grad_norm": 0.36370191667119234, + "learning_rate": 3.64328423316207e-07, + "loss": 0.3186, + "step": 15950 + }, + { + "epoch": 0.92, + "grad_norm": 0.39727285401672247, + "learning_rate": 3.638308583661809e-07, + "loss": 0.2611, + "step": 15951 + }, + { + "epoch": 0.92, + "grad_norm": 0.42335563840042184, + "learning_rate": 3.633336271162791e-07, + "loss": 0.2897, + "step": 15952 + }, + { + "epoch": 0.92, + "grad_norm": 0.32788117129585664, + "learning_rate": 3.6283672958371987e-07, + "loss": 0.1735, + "step": 15953 + }, + { + "epoch": 0.92, + "grad_norm": 0.2670674166828153, + "learning_rate": 3.623401657857095e-07, + "loss": 0.2516, + "step": 15954 + }, + { + "epoch": 0.92, + "grad_norm": 0.5172019125312697, + "learning_rate": 3.618439357394443e-07, + "loss": 0.2023, + "step": 15955 + }, + { + "epoch": 0.92, + "grad_norm": 0.765943640524781, + "learning_rate": 3.613480394621094e-07, + "loss": 0.3744, + "step": 15956 + }, + { + "epoch": 0.92, + "grad_norm": 0.3254959834274066, + "learning_rate": 3.608524769708788e-07, + "loss": 0.2652, + "step": 15957 + }, + { + "epoch": 0.92, + "grad_norm": 0.29855816184102585, + "learning_rate": 3.6035724828291096e-07, + "loss": 0.2446, + "step": 15958 + }, + { + "epoch": 0.92, + "grad_norm": 0.28100532394478656, + "learning_rate": 3.598623534153578e-07, + "loss": 0.2126, + "step": 15959 + }, + { + "epoch": 0.92, + "grad_norm": 0.42355858312063266, + "learning_rate": 3.593677923853556e-07, + "loss": 0.3086, + "step": 15960 + }, + { + "epoch": 0.92, + "grad_norm": 0.5485094544149791, + "learning_rate": 3.5887356521003283e-07, + "loss": 0.2255, + "step": 15961 + }, + { + "epoch": 0.92, + "grad_norm": 0.3308684754469159, + "learning_rate": 3.583796719065047e-07, + "loss": 0.3017, + "step": 15962 + }, + { + "epoch": 0.92, + "grad_norm": 0.5477406568092534, + "learning_rate": 3.578861124918731e-07, + "loss": 0.3118, + "step": 15963 + }, + { + "epoch": 0.92, + "grad_norm": 0.37672975801509384, + "learning_rate": 3.5739288698323107e-07, + "loss": 0.3116, + "step": 15964 + }, + { + "epoch": 0.92, + "grad_norm": 0.20393972677598093, + "learning_rate": 3.568999953976582e-07, + "loss": 0.154, + "step": 15965 + }, + { + "epoch": 0.92, + "grad_norm": 0.3064175262555501, + "learning_rate": 3.564074377522253e-07, + "loss": 0.2573, + "step": 15966 + }, + { + "epoch": 0.92, + "grad_norm": 0.5742413490053281, + "learning_rate": 3.5591521406398654e-07, + "loss": 0.3097, + "step": 15967 + }, + { + "epoch": 0.92, + "grad_norm": 0.5365842583125046, + "learning_rate": 3.554233243499894e-07, + "loss": 0.2825, + "step": 15968 + }, + { + "epoch": 0.92, + "grad_norm": 0.43114460143022276, + "learning_rate": 3.5493176862726794e-07, + "loss": 0.253, + "step": 15969 + }, + { + "epoch": 0.92, + "grad_norm": 0.2924940242715081, + "learning_rate": 3.5444054691284535e-07, + "loss": 0.2877, + "step": 15970 + }, + { + "epoch": 0.92, + "grad_norm": 0.2374504232369525, + "learning_rate": 3.539496592237335e-07, + "loss": 0.1154, + "step": 15971 + }, + { + "epoch": 0.92, + "grad_norm": 0.2552973385212853, + "learning_rate": 3.5345910557692655e-07, + "loss": 0.1876, + "step": 15972 + }, + { + "epoch": 0.92, + "grad_norm": 0.4709639877224451, + "learning_rate": 3.529688859894176e-07, + "loss": 0.3019, + "step": 15973 + }, + { + "epoch": 0.92, + "grad_norm": 0.3310973746232077, + "learning_rate": 3.5247900047818193e-07, + "loss": 0.2604, + "step": 15974 + }, + { + "epoch": 0.92, + "grad_norm": 0.43855069686197606, + "learning_rate": 3.5198944906018273e-07, + "loss": 0.269, + "step": 15975 + }, + { + "epoch": 0.92, + "grad_norm": 0.5369183319927688, + "learning_rate": 3.5150023175237303e-07, + "loss": 0.3707, + "step": 15976 + }, + { + "epoch": 0.92, + "grad_norm": 0.3176682852295683, + "learning_rate": 3.5101134857169704e-07, + "loss": 0.2594, + "step": 15977 + }, + { + "epoch": 0.92, + "grad_norm": 0.24078063210633607, + "learning_rate": 3.505227995350824e-07, + "loss": 0.1773, + "step": 15978 + }, + { + "epoch": 0.92, + "grad_norm": 0.5265718725264137, + "learning_rate": 3.5003458465944884e-07, + "loss": 0.3234, + "step": 15979 + }, + { + "epoch": 0.92, + "grad_norm": 0.667710297637546, + "learning_rate": 3.495467039617018e-07, + "loss": 0.3737, + "step": 15980 + }, + { + "epoch": 0.92, + "grad_norm": 0.7002359606242801, + "learning_rate": 3.4905915745873763e-07, + "loss": 0.1409, + "step": 15981 + }, + { + "epoch": 0.92, + "grad_norm": 0.2570986693620255, + "learning_rate": 3.4857194516744075e-07, + "loss": 0.2817, + "step": 15982 + }, + { + "epoch": 0.92, + "grad_norm": 0.4987963183833602, + "learning_rate": 3.4808506710468204e-07, + "loss": 0.2641, + "step": 15983 + }, + { + "epoch": 0.92, + "grad_norm": 0.28035991687745376, + "learning_rate": 3.4759852328732136e-07, + "loss": 0.1183, + "step": 15984 + }, + { + "epoch": 0.92, + "grad_norm": 0.3520688668657019, + "learning_rate": 3.4711231373220854e-07, + "loss": 0.2765, + "step": 15985 + }, + { + "epoch": 0.92, + "grad_norm": 0.3869463660741132, + "learning_rate": 3.466264384561824e-07, + "loss": 0.2959, + "step": 15986 + }, + { + "epoch": 0.92, + "grad_norm": 0.41454146732657704, + "learning_rate": 3.461408974760683e-07, + "loss": 0.1616, + "step": 15987 + }, + { + "epoch": 0.92, + "grad_norm": 0.5291939781843723, + "learning_rate": 3.456556908086783e-07, + "loss": 0.4009, + "step": 15988 + }, + { + "epoch": 0.92, + "grad_norm": 0.48867694572595083, + "learning_rate": 3.4517081847081693e-07, + "loss": 0.3361, + "step": 15989 + }, + { + "epoch": 0.92, + "grad_norm": 0.30895000784527976, + "learning_rate": 3.44686280479275e-07, + "loss": 0.2123, + "step": 15990 + }, + { + "epoch": 0.92, + "grad_norm": 0.4070864437798445, + "learning_rate": 3.442020768508325e-07, + "loss": 0.2457, + "step": 15991 + }, + { + "epoch": 0.92, + "grad_norm": 0.525896819446171, + "learning_rate": 3.4371820760225606e-07, + "loss": 0.2261, + "step": 15992 + }, + { + "epoch": 0.92, + "grad_norm": 0.389551843659952, + "learning_rate": 3.432346727503033e-07, + "loss": 0.2679, + "step": 15993 + }, + { + "epoch": 0.92, + "grad_norm": 0.29140564211839626, + "learning_rate": 3.427514723117187e-07, + "loss": 0.2335, + "step": 15994 + }, + { + "epoch": 0.92, + "grad_norm": 1.1747614721852366, + "learning_rate": 3.4226860630323545e-07, + "loss": 0.604, + "step": 15995 + }, + { + "epoch": 0.92, + "grad_norm": 0.4861490590325455, + "learning_rate": 3.4178607474157464e-07, + "loss": 0.3133, + "step": 15996 + }, + { + "epoch": 0.92, + "grad_norm": 0.2647163166220459, + "learning_rate": 3.413038776434474e-07, + "loss": 0.1841, + "step": 15997 + }, + { + "epoch": 0.92, + "grad_norm": 0.30848881097191544, + "learning_rate": 3.408220150255492e-07, + "loss": 0.243, + "step": 15998 + }, + { + "epoch": 0.92, + "grad_norm": 0.7268327398882253, + "learning_rate": 3.403404869045712e-07, + "loss": 0.3952, + "step": 15999 + }, + { + "epoch": 0.92, + "grad_norm": 0.3471361648604528, + "learning_rate": 3.398592932971878e-07, + "loss": 0.2387, + "step": 16000 + }, + { + "epoch": 0.92, + "grad_norm": 0.4117894608929416, + "learning_rate": 3.393784342200601e-07, + "loss": 0.2862, + "step": 16001 + }, + { + "epoch": 0.92, + "grad_norm": 0.7838377566107934, + "learning_rate": 3.388979096898415e-07, + "loss": 0.3532, + "step": 16002 + }, + { + "epoch": 0.92, + "grad_norm": 0.34177113425113487, + "learning_rate": 3.3841771972317414e-07, + "loss": 0.269, + "step": 16003 + }, + { + "epoch": 0.92, + "grad_norm": 0.301355411444743, + "learning_rate": 3.3793786433668596e-07, + "loss": 0.0814, + "step": 16004 + }, + { + "epoch": 0.92, + "grad_norm": 0.30844575612555714, + "learning_rate": 3.3745834354699247e-07, + "loss": 0.262, + "step": 16005 + }, + { + "epoch": 0.92, + "grad_norm": 0.3747679957008508, + "learning_rate": 3.3697915737070154e-07, + "loss": 0.306, + "step": 16006 + }, + { + "epoch": 0.92, + "grad_norm": 0.7646152394209245, + "learning_rate": 3.365003058244076e-07, + "loss": 0.3003, + "step": 16007 + }, + { + "epoch": 0.92, + "grad_norm": 0.536131589767793, + "learning_rate": 3.3602178892469193e-07, + "loss": 0.3079, + "step": 16008 + }, + { + "epoch": 0.92, + "grad_norm": 0.3947217103658755, + "learning_rate": 3.355436066881268e-07, + "loss": 0.283, + "step": 16009 + }, + { + "epoch": 0.92, + "grad_norm": 0.22066852034689402, + "learning_rate": 3.3506575913127006e-07, + "loss": 0.1689, + "step": 16010 + }, + { + "epoch": 0.92, + "grad_norm": 0.5902292519591517, + "learning_rate": 3.3458824627067067e-07, + "loss": 0.3642, + "step": 16011 + }, + { + "epoch": 0.92, + "grad_norm": 0.381931762583577, + "learning_rate": 3.3411106812286544e-07, + "loss": 0.3037, + "step": 16012 + }, + { + "epoch": 0.92, + "grad_norm": 0.33341807366384135, + "learning_rate": 3.336342247043778e-07, + "loss": 0.2449, + "step": 16013 + }, + { + "epoch": 0.92, + "grad_norm": 0.5143593485060982, + "learning_rate": 3.331577160317201e-07, + "loss": 0.294, + "step": 16014 + }, + { + "epoch": 0.92, + "grad_norm": 0.4661460986029127, + "learning_rate": 3.3268154212139583e-07, + "loss": 0.2655, + "step": 16015 + }, + { + "epoch": 0.92, + "grad_norm": 0.3431806133508932, + "learning_rate": 3.3220570298989507e-07, + "loss": 0.1848, + "step": 16016 + }, + { + "epoch": 0.92, + "grad_norm": 0.30176925676400573, + "learning_rate": 3.317301986536947e-07, + "loss": 0.2456, + "step": 16017 + }, + { + "epoch": 0.92, + "grad_norm": 0.3353558657145527, + "learning_rate": 3.3125502912926044e-07, + "loss": 0.2543, + "step": 16018 + }, + { + "epoch": 0.92, + "grad_norm": 0.5655804810882348, + "learning_rate": 3.307801944330491e-07, + "loss": 0.3556, + "step": 16019 + }, + { + "epoch": 0.92, + "grad_norm": 0.33186886555261347, + "learning_rate": 3.303056945815053e-07, + "loss": 0.0592, + "step": 16020 + }, + { + "epoch": 0.92, + "grad_norm": 0.27348568532345147, + "learning_rate": 3.2983152959105924e-07, + "loss": 0.2488, + "step": 16021 + }, + { + "epoch": 0.92, + "grad_norm": 0.25027602062501897, + "learning_rate": 3.2935769947813003e-07, + "loss": 0.2053, + "step": 16022 + }, + { + "epoch": 0.92, + "grad_norm": 0.810186576406879, + "learning_rate": 3.2888420425912783e-07, + "loss": 0.2417, + "step": 16023 + }, + { + "epoch": 0.92, + "grad_norm": 0.347940790008678, + "learning_rate": 3.2841104395045174e-07, + "loss": 0.2546, + "step": 16024 + }, + { + "epoch": 0.92, + "grad_norm": 0.43409577516476583, + "learning_rate": 3.279382185684843e-07, + "loss": 0.332, + "step": 16025 + }, + { + "epoch": 0.92, + "grad_norm": 0.5257069977199804, + "learning_rate": 3.274657281296001e-07, + "loss": 0.317, + "step": 16026 + }, + { + "epoch": 0.92, + "grad_norm": 0.3115977415753507, + "learning_rate": 3.269935726501616e-07, + "loss": 0.1908, + "step": 16027 + }, + { + "epoch": 0.92, + "grad_norm": 0.31145969908220766, + "learning_rate": 3.265217521465203e-07, + "loss": 0.1627, + "step": 16028 + }, + { + "epoch": 0.92, + "grad_norm": 0.304969715955574, + "learning_rate": 3.2605026663501403e-07, + "loss": 0.271, + "step": 16029 + }, + { + "epoch": 0.92, + "grad_norm": 0.3198391503543351, + "learning_rate": 3.2557911613197213e-07, + "loss": 0.2093, + "step": 16030 + }, + { + "epoch": 0.92, + "grad_norm": 0.5495654930323556, + "learning_rate": 3.251083006537081e-07, + "loss": 0.3726, + "step": 16031 + }, + { + "epoch": 0.92, + "grad_norm": 0.9376399677814717, + "learning_rate": 3.24637820216529e-07, + "loss": 0.4465, + "step": 16032 + }, + { + "epoch": 0.92, + "grad_norm": 0.27483983067880535, + "learning_rate": 3.241676748367251e-07, + "loss": 0.2189, + "step": 16033 + }, + { + "epoch": 0.92, + "grad_norm": 0.3926888204978273, + "learning_rate": 3.2369786453057996e-07, + "loss": 0.2467, + "step": 16034 + }, + { + "epoch": 0.92, + "grad_norm": 0.8656459700472929, + "learning_rate": 3.232283893143595e-07, + "loss": 0.4894, + "step": 16035 + }, + { + "epoch": 0.92, + "grad_norm": 0.347362517901405, + "learning_rate": 3.2275924920432525e-07, + "loss": 0.2104, + "step": 16036 + }, + { + "epoch": 0.92, + "grad_norm": 0.3434981831692561, + "learning_rate": 3.2229044421672183e-07, + "loss": 0.3017, + "step": 16037 + }, + { + "epoch": 0.92, + "grad_norm": 0.6020647893512497, + "learning_rate": 3.2182197436778527e-07, + "loss": 0.212, + "step": 16038 + }, + { + "epoch": 0.92, + "grad_norm": 0.33810505101496374, + "learning_rate": 3.213538396737359e-07, + "loss": 0.2483, + "step": 16039 + }, + { + "epoch": 0.92, + "grad_norm": 0.31886277314169437, + "learning_rate": 3.2088604015078737e-07, + "loss": 0.0928, + "step": 16040 + }, + { + "epoch": 0.92, + "grad_norm": 0.3547676364178774, + "learning_rate": 3.20418575815139e-07, + "loss": 0.2903, + "step": 16041 + }, + { + "epoch": 0.92, + "grad_norm": 0.330343531602598, + "learning_rate": 3.1995144668298004e-07, + "loss": 0.2459, + "step": 16042 + }, + { + "epoch": 0.92, + "grad_norm": 0.8111748137958689, + "learning_rate": 3.1948465277048424e-07, + "loss": 0.3584, + "step": 16043 + }, + { + "epoch": 0.92, + "grad_norm": 0.35521287147768266, + "learning_rate": 3.190181940938197e-07, + "loss": 0.2285, + "step": 16044 + }, + { + "epoch": 0.92, + "grad_norm": 0.2691973622179739, + "learning_rate": 3.185520706691392e-07, + "loss": 0.2408, + "step": 16045 + }, + { + "epoch": 0.92, + "grad_norm": 1.3561969579912896, + "learning_rate": 3.1808628251258413e-07, + "loss": 0.2051, + "step": 16046 + }, + { + "epoch": 0.92, + "grad_norm": 0.6097716949414717, + "learning_rate": 3.1762082964028385e-07, + "loss": 0.3688, + "step": 16047 + }, + { + "epoch": 0.92, + "grad_norm": 0.37746575318449793, + "learning_rate": 3.171557120683588e-07, + "loss": 0.2568, + "step": 16048 + }, + { + "epoch": 0.92, + "grad_norm": 0.28599774774097053, + "learning_rate": 3.166909298129139e-07, + "loss": 0.2567, + "step": 16049 + }, + { + "epoch": 0.92, + "grad_norm": 0.2873137393901833, + "learning_rate": 3.162264828900474e-07, + "loss": 0.1658, + "step": 16050 + }, + { + "epoch": 0.92, + "grad_norm": 0.3359843271780522, + "learning_rate": 3.1576237131584084e-07, + "loss": 0.2672, + "step": 16051 + }, + { + "epoch": 0.92, + "grad_norm": 0.8905987280455674, + "learning_rate": 3.1529859510636585e-07, + "loss": 0.4539, + "step": 16052 + }, + { + "epoch": 0.92, + "grad_norm": 0.3531928411300171, + "learning_rate": 3.1483515427768506e-07, + "loss": 0.2499, + "step": 16053 + }, + { + "epoch": 0.92, + "grad_norm": 0.32954114792842953, + "learning_rate": 3.143720488458457e-07, + "loss": 0.2502, + "step": 16054 + }, + { + "epoch": 0.92, + "grad_norm": 0.3633234307925469, + "learning_rate": 3.139092788268872e-07, + "loss": 0.2719, + "step": 16055 + }, + { + "epoch": 0.92, + "grad_norm": 0.29719624462330824, + "learning_rate": 3.1344684423683214e-07, + "loss": 0.1561, + "step": 16056 + }, + { + "epoch": 0.92, + "grad_norm": 0.27943018114548945, + "learning_rate": 3.1298474509169676e-07, + "loss": 0.2495, + "step": 16057 + }, + { + "epoch": 0.92, + "grad_norm": 0.8756150438352185, + "learning_rate": 3.1252298140748374e-07, + "loss": 0.3744, + "step": 16058 + }, + { + "epoch": 0.92, + "grad_norm": 0.719734852026446, + "learning_rate": 3.120615532001836e-07, + "loss": 0.2874, + "step": 16059 + }, + { + "epoch": 0.92, + "grad_norm": 0.3029303968028634, + "learning_rate": 3.1160046048577365e-07, + "loss": 0.2542, + "step": 16060 + }, + { + "epoch": 0.92, + "grad_norm": 0.3691633990742467, + "learning_rate": 3.1113970328022433e-07, + "loss": 0.3045, + "step": 16061 + }, + { + "epoch": 0.92, + "grad_norm": 0.3109473557320388, + "learning_rate": 3.106792815994908e-07, + "loss": 0.1067, + "step": 16062 + }, + { + "epoch": 0.92, + "grad_norm": 0.332168543197011, + "learning_rate": 3.1021919545951683e-07, + "loss": 0.2444, + "step": 16063 + }, + { + "epoch": 0.92, + "grad_norm": 0.8139045767802737, + "learning_rate": 3.0975944487623534e-07, + "loss": 0.3702, + "step": 16064 + }, + { + "epoch": 0.92, + "grad_norm": 0.33745728439735845, + "learning_rate": 3.093000298655668e-07, + "loss": 0.296, + "step": 16065 + }, + { + "epoch": 0.92, + "grad_norm": 0.32467491783135444, + "learning_rate": 3.0884095044342197e-07, + "loss": 0.18, + "step": 16066 + }, + { + "epoch": 0.92, + "grad_norm": 0.5142360344956464, + "learning_rate": 3.0838220662569807e-07, + "loss": 0.3945, + "step": 16067 + }, + { + "epoch": 0.92, + "grad_norm": 0.23229234122279033, + "learning_rate": 3.0792379842828234e-07, + "loss": 0.2061, + "step": 16068 + }, + { + "epoch": 0.92, + "grad_norm": 0.4327552771597201, + "learning_rate": 3.074657258670477e-07, + "loss": 0.1711, + "step": 16069 + }, + { + "epoch": 0.92, + "grad_norm": 0.4375268155712646, + "learning_rate": 3.070079889578592e-07, + "loss": 0.3044, + "step": 16070 + }, + { + "epoch": 0.92, + "grad_norm": 0.9263858301221611, + "learning_rate": 3.0655058771656755e-07, + "loss": 0.4369, + "step": 16071 + }, + { + "epoch": 0.92, + "grad_norm": 0.2924196438960813, + "learning_rate": 3.060935221590111e-07, + "loss": 0.1915, + "step": 16072 + }, + { + "epoch": 0.92, + "grad_norm": 0.3187869889458041, + "learning_rate": 3.056367923010195e-07, + "loss": 0.2959, + "step": 16073 + }, + { + "epoch": 0.92, + "grad_norm": 0.2271257776392587, + "learning_rate": 3.0518039815841004e-07, + "loss": 0.13, + "step": 16074 + }, + { + "epoch": 0.92, + "grad_norm": 0.3284032596538808, + "learning_rate": 3.0472433974698566e-07, + "loss": 0.2056, + "step": 16075 + }, + { + "epoch": 0.92, + "grad_norm": 0.7342704365795844, + "learning_rate": 3.042686170825404e-07, + "loss": 0.3854, + "step": 16076 + }, + { + "epoch": 0.92, + "grad_norm": 0.34661328109772904, + "learning_rate": 3.0381323018085496e-07, + "loss": 0.3171, + "step": 16077 + }, + { + "epoch": 0.92, + "grad_norm": 0.38368727591985285, + "learning_rate": 3.0335817905770115e-07, + "loss": 0.2618, + "step": 16078 + }, + { + "epoch": 0.92, + "grad_norm": 0.5057502110746248, + "learning_rate": 3.029034637288375e-07, + "loss": 0.2623, + "step": 16079 + }, + { + "epoch": 0.92, + "grad_norm": 0.2947538085012821, + "learning_rate": 3.024490842100092e-07, + "loss": 0.2066, + "step": 16080 + }, + { + "epoch": 0.92, + "grad_norm": 0.26156274907930127, + "learning_rate": 3.019950405169514e-07, + "loss": 0.2431, + "step": 16081 + }, + { + "epoch": 0.92, + "grad_norm": 0.5329165928014303, + "learning_rate": 3.015413326653893e-07, + "loss": 0.2872, + "step": 16082 + }, + { + "epoch": 0.92, + "grad_norm": 1.347908220713831, + "learning_rate": 3.0108796067103376e-07, + "loss": 0.5954, + "step": 16083 + }, + { + "epoch": 0.92, + "grad_norm": 0.5714727630722087, + "learning_rate": 3.0063492454958434e-07, + "loss": 0.3317, + "step": 16084 + }, + { + "epoch": 0.92, + "grad_norm": 0.2701745129360195, + "learning_rate": 3.001822243167307e-07, + "loss": 0.2408, + "step": 16085 + }, + { + "epoch": 0.92, + "grad_norm": 0.43758200115949175, + "learning_rate": 2.997298599881493e-07, + "loss": 0.2901, + "step": 16086 + }, + { + "epoch": 0.92, + "grad_norm": 0.576522748883847, + "learning_rate": 2.992778315795064e-07, + "loss": 0.3509, + "step": 16087 + }, + { + "epoch": 0.92, + "grad_norm": 0.23680880305406177, + "learning_rate": 2.988261391064551e-07, + "loss": 0.2101, + "step": 16088 + }, + { + "epoch": 0.92, + "grad_norm": 0.4501988796850427, + "learning_rate": 2.9837478258463725e-07, + "loss": 0.324, + "step": 16089 + }, + { + "epoch": 0.92, + "grad_norm": 0.5710226158588361, + "learning_rate": 2.9792376202968263e-07, + "loss": 0.3863, + "step": 16090 + }, + { + "epoch": 0.92, + "grad_norm": 0.41143826535651795, + "learning_rate": 2.974730774572121e-07, + "loss": 0.2862, + "step": 16091 + }, + { + "epoch": 0.92, + "grad_norm": 0.45703676201711263, + "learning_rate": 2.970227288828309e-07, + "loss": 0.2569, + "step": 16092 + }, + { + "epoch": 0.92, + "grad_norm": 0.30212625877525195, + "learning_rate": 2.965727163221366e-07, + "loss": 0.2634, + "step": 16093 + }, + { + "epoch": 0.92, + "grad_norm": 0.2798986506628534, + "learning_rate": 2.9612303979071e-07, + "loss": 0.2054, + "step": 16094 + }, + { + "epoch": 0.92, + "grad_norm": 1.641861194235436, + "learning_rate": 2.9567369930412646e-07, + "loss": 0.3078, + "step": 16095 + }, + { + "epoch": 0.92, + "grad_norm": 0.30785882672148424, + "learning_rate": 2.9522469487794467e-07, + "loss": 0.2824, + "step": 16096 + }, + { + "epoch": 0.92, + "grad_norm": 0.37031077069707474, + "learning_rate": 2.9477602652771554e-07, + "loss": 0.318, + "step": 16097 + }, + { + "epoch": 0.92, + "grad_norm": 0.6730152886289593, + "learning_rate": 2.943276942689732e-07, + "loss": 0.2634, + "step": 16098 + }, + { + "epoch": 0.92, + "grad_norm": 0.4854497688120956, + "learning_rate": 2.9387969811724757e-07, + "loss": 0.2925, + "step": 16099 + }, + { + "epoch": 0.93, + "grad_norm": 0.35689041820120426, + "learning_rate": 2.9343203808804953e-07, + "loss": 0.252, + "step": 16100 + }, + { + "epoch": 0.93, + "grad_norm": 0.29101679336873626, + "learning_rate": 2.9298471419688335e-07, + "loss": 0.2409, + "step": 16101 + }, + { + "epoch": 0.93, + "grad_norm": 0.33551396449091, + "learning_rate": 2.925377264592388e-07, + "loss": 0.1526, + "step": 16102 + }, + { + "epoch": 0.93, + "grad_norm": 0.403795727148337, + "learning_rate": 2.9209107489059474e-07, + "loss": 0.2942, + "step": 16103 + }, + { + "epoch": 0.93, + "grad_norm": 0.35408713603736885, + "learning_rate": 2.916447595064198e-07, + "loss": 0.2958, + "step": 16104 + }, + { + "epoch": 0.93, + "grad_norm": 0.6365072948732012, + "learning_rate": 2.9119878032216944e-07, + "loss": 0.1797, + "step": 16105 + }, + { + "epoch": 0.93, + "grad_norm": 0.2916939704666153, + "learning_rate": 2.90753137353289e-07, + "loss": 0.2252, + "step": 16106 + }, + { + "epoch": 0.93, + "grad_norm": 0.46903057760889666, + "learning_rate": 2.903078306152085e-07, + "loss": 0.2796, + "step": 16107 + }, + { + "epoch": 0.93, + "grad_norm": 0.30356242185573823, + "learning_rate": 2.8986286012335105e-07, + "loss": 0.2131, + "step": 16108 + }, + { + "epoch": 0.93, + "grad_norm": 0.3753404509166921, + "learning_rate": 2.8941822589312443e-07, + "loss": 0.3212, + "step": 16109 + }, + { + "epoch": 0.93, + "grad_norm": 0.6360470819564403, + "learning_rate": 2.889739279399262e-07, + "loss": 0.3638, + "step": 16110 + }, + { + "epoch": 0.93, + "grad_norm": 0.33098743442000983, + "learning_rate": 2.885299662791452e-07, + "loss": 0.1529, + "step": 16111 + }, + { + "epoch": 0.93, + "grad_norm": 0.3058883589226098, + "learning_rate": 2.880863409261525e-07, + "loss": 0.28, + "step": 16112 + }, + { + "epoch": 0.93, + "grad_norm": 0.25707865836058724, + "learning_rate": 2.876430518963136e-07, + "loss": 0.1953, + "step": 16113 + }, + { + "epoch": 0.93, + "grad_norm": 0.6049082040792976, + "learning_rate": 2.872000992049773e-07, + "loss": 0.3962, + "step": 16114 + }, + { + "epoch": 0.93, + "grad_norm": 0.36179534195311464, + "learning_rate": 2.867574828674824e-07, + "loss": 0.2377, + "step": 16115 + }, + { + "epoch": 0.93, + "grad_norm": 0.35134754663657647, + "learning_rate": 2.8631520289916004e-07, + "loss": 0.305, + "step": 16116 + }, + { + "epoch": 0.93, + "grad_norm": 0.6873273078284741, + "learning_rate": 2.858732593153246e-07, + "loss": 0.2425, + "step": 16117 + }, + { + "epoch": 0.93, + "grad_norm": 0.4355865580407592, + "learning_rate": 2.8543165213128057e-07, + "loss": 0.2198, + "step": 16118 + }, + { + "epoch": 0.93, + "grad_norm": 0.2721525238168374, + "learning_rate": 2.8499038136231894e-07, + "loss": 0.2223, + "step": 16119 + }, + { + "epoch": 0.93, + "grad_norm": 0.3305990370089696, + "learning_rate": 2.845494470237242e-07, + "loss": 0.2855, + "step": 16120 + }, + { + "epoch": 0.93, + "grad_norm": 0.3257204072520625, + "learning_rate": 2.841088491307642e-07, + "loss": 0.2172, + "step": 16121 + }, + { + "epoch": 0.93, + "grad_norm": 0.7299514903666677, + "learning_rate": 2.8366858769869663e-07, + "loss": 0.4215, + "step": 16122 + }, + { + "epoch": 0.93, + "grad_norm": 1.0502896884292527, + "learning_rate": 2.8322866274276715e-07, + "loss": 0.3683, + "step": 16123 + }, + { + "epoch": 0.93, + "grad_norm": 0.2414259325709617, + "learning_rate": 2.8278907427821245e-07, + "loss": 0.223, + "step": 16124 + }, + { + "epoch": 0.93, + "grad_norm": 0.4580905634662621, + "learning_rate": 2.8234982232025365e-07, + "loss": 0.2462, + "step": 16125 + }, + { + "epoch": 0.93, + "grad_norm": 0.6134757196152566, + "learning_rate": 2.8191090688410305e-07, + "loss": 0.3354, + "step": 16126 + }, + { + "epoch": 0.93, + "grad_norm": 0.34813149354241363, + "learning_rate": 2.8147232798496074e-07, + "loss": 0.2758, + "step": 16127 + }, + { + "epoch": 0.93, + "grad_norm": 0.2445101180031528, + "learning_rate": 2.810340856380112e-07, + "loss": 0.1985, + "step": 16128 + }, + { + "epoch": 0.93, + "grad_norm": 0.9321604201835165, + "learning_rate": 2.8059617985843557e-07, + "loss": 0.3523, + "step": 16129 + }, + { + "epoch": 0.93, + "grad_norm": 0.36475217838282614, + "learning_rate": 2.801586106613963e-07, + "loss": 0.2633, + "step": 16130 + }, + { + "epoch": 0.93, + "grad_norm": 0.9028096980700738, + "learning_rate": 2.797213780620456e-07, + "loss": 0.2922, + "step": 16131 + }, + { + "epoch": 0.93, + "grad_norm": 0.2847058382844327, + "learning_rate": 2.7928448207552474e-07, + "loss": 0.2463, + "step": 16132 + }, + { + "epoch": 0.93, + "grad_norm": 0.38924969775553325, + "learning_rate": 2.7884792271696603e-07, + "loss": 0.3169, + "step": 16133 + }, + { + "epoch": 0.93, + "grad_norm": 0.24599208379016946, + "learning_rate": 2.784117000014852e-07, + "loss": 0.1124, + "step": 16134 + }, + { + "epoch": 0.93, + "grad_norm": 0.6790767935493142, + "learning_rate": 2.7797581394418907e-07, + "loss": 0.3211, + "step": 16135 + }, + { + "epoch": 0.93, + "grad_norm": 0.308392724398636, + "learning_rate": 2.7754026456017104e-07, + "loss": 0.2932, + "step": 16136 + }, + { + "epoch": 0.93, + "grad_norm": 0.4612725789887395, + "learning_rate": 2.7710505186451684e-07, + "loss": 0.2619, + "step": 16137 + }, + { + "epoch": 0.93, + "grad_norm": 0.6119165445617544, + "learning_rate": 2.766701758722967e-07, + "loss": 0.3625, + "step": 16138 + }, + { + "epoch": 0.93, + "grad_norm": 0.3401083159351843, + "learning_rate": 2.7623563659857186e-07, + "loss": 0.3047, + "step": 16139 + }, + { + "epoch": 0.93, + "grad_norm": 0.22356893956156418, + "learning_rate": 2.758014340583859e-07, + "loss": 0.216, + "step": 16140 + }, + { + "epoch": 0.93, + "grad_norm": 1.9203710855895133, + "learning_rate": 2.7536756826678e-07, + "loss": 0.199, + "step": 16141 + }, + { + "epoch": 0.93, + "grad_norm": 0.3429263728146882, + "learning_rate": 2.749340392387767e-07, + "loss": 0.2606, + "step": 16142 + }, + { + "epoch": 0.93, + "grad_norm": 0.6493811930991484, + "learning_rate": 2.745008469893884e-07, + "loss": 0.3888, + "step": 16143 + }, + { + "epoch": 0.93, + "grad_norm": 0.3245298437449268, + "learning_rate": 2.740679915336175e-07, + "loss": 0.2407, + "step": 16144 + }, + { + "epoch": 0.93, + "grad_norm": 0.3407142144619474, + "learning_rate": 2.7363547288645544e-07, + "loss": 0.3001, + "step": 16145 + }, + { + "epoch": 0.93, + "grad_norm": 0.4127223640520308, + "learning_rate": 2.732032910628779e-07, + "loss": 0.2387, + "step": 16146 + }, + { + "epoch": 0.93, + "grad_norm": 0.4410479453956271, + "learning_rate": 2.727714460778519e-07, + "loss": 0.0986, + "step": 16147 + }, + { + "epoch": 0.93, + "grad_norm": 0.23114652387431825, + "learning_rate": 2.7233993794633205e-07, + "loss": 0.2363, + "step": 16148 + }, + { + "epoch": 0.93, + "grad_norm": 0.7707876240880364, + "learning_rate": 2.7190876668326207e-07, + "loss": 0.4322, + "step": 16149 + }, + { + "epoch": 0.93, + "grad_norm": 0.5681333562696601, + "learning_rate": 2.7147793230357434e-07, + "loss": 0.3007, + "step": 16150 + }, + { + "epoch": 0.93, + "grad_norm": 0.334019256141992, + "learning_rate": 2.71047434822187e-07, + "loss": 0.2636, + "step": 16151 + }, + { + "epoch": 0.93, + "grad_norm": 0.25145171079300954, + "learning_rate": 2.7061727425400695e-07, + "loss": 0.2368, + "step": 16152 + }, + { + "epoch": 0.93, + "grad_norm": 0.37370438219164975, + "learning_rate": 2.701874506139335e-07, + "loss": 0.2354, + "step": 16153 + }, + { + "epoch": 0.93, + "grad_norm": 0.32680459189413763, + "learning_rate": 2.6975796391685125e-07, + "loss": 0.1946, + "step": 16154 + }, + { + "epoch": 0.93, + "grad_norm": 0.5853293839882767, + "learning_rate": 2.6932881417763067e-07, + "loss": 0.342, + "step": 16155 + }, + { + "epoch": 0.93, + "grad_norm": 0.3247975509729384, + "learning_rate": 2.689000014111365e-07, + "loss": 0.3094, + "step": 16156 + }, + { + "epoch": 0.93, + "grad_norm": 0.33160684037157295, + "learning_rate": 2.684715256322146e-07, + "loss": 0.2061, + "step": 16157 + }, + { + "epoch": 0.93, + "grad_norm": 0.36609071143521194, + "learning_rate": 2.680433868557064e-07, + "loss": 0.2004, + "step": 16158 + }, + { + "epoch": 0.93, + "grad_norm": 1.2305571895568488, + "learning_rate": 2.676155850964379e-07, + "loss": 0.4733, + "step": 16159 + }, + { + "epoch": 0.93, + "grad_norm": 0.22411989695587825, + "learning_rate": 2.6718812036922283e-07, + "loss": 0.2119, + "step": 16160 + }, + { + "epoch": 0.93, + "grad_norm": 0.6334435102540285, + "learning_rate": 2.6676099268886367e-07, + "loss": 0.3685, + "step": 16161 + }, + { + "epoch": 0.93, + "grad_norm": 1.2434403340381204, + "learning_rate": 2.6633420207015426e-07, + "loss": 0.8358, + "step": 16162 + }, + { + "epoch": 0.93, + "grad_norm": 0.2768553025982602, + "learning_rate": 2.659077485278716e-07, + "loss": 0.1891, + "step": 16163 + }, + { + "epoch": 0.93, + "grad_norm": 1.8087700309256254, + "learning_rate": 2.6548163207678614e-07, + "loss": 0.3039, + "step": 16164 + }, + { + "epoch": 0.93, + "grad_norm": 0.3263779764663397, + "learning_rate": 2.6505585273165156e-07, + "loss": 0.1664, + "step": 16165 + }, + { + "epoch": 0.93, + "grad_norm": 0.31180764115083154, + "learning_rate": 2.6463041050721615e-07, + "loss": 0.2553, + "step": 16166 + }, + { + "epoch": 0.93, + "grad_norm": 0.8241373179238439, + "learning_rate": 2.642053054182103e-07, + "loss": 0.3087, + "step": 16167 + }, + { + "epoch": 0.93, + "grad_norm": 0.3446713828640923, + "learning_rate": 2.637805374793556e-07, + "loss": 0.3118, + "step": 16168 + }, + { + "epoch": 0.93, + "grad_norm": 0.32013571828818826, + "learning_rate": 2.633561067053625e-07, + "loss": 0.2598, + "step": 16169 + }, + { + "epoch": 0.93, + "grad_norm": 0.4038815496361895, + "learning_rate": 2.629320131109281e-07, + "loss": 0.1137, + "step": 16170 + }, + { + "epoch": 0.93, + "grad_norm": 0.2753926358926855, + "learning_rate": 2.6250825671074065e-07, + "loss": 0.2389, + "step": 16171 + }, + { + "epoch": 0.93, + "grad_norm": 0.4119719794526924, + "learning_rate": 2.6208483751947286e-07, + "loss": 0.2821, + "step": 16172 + }, + { + "epoch": 0.93, + "grad_norm": 0.47065265482665175, + "learning_rate": 2.6166175555178864e-07, + "loss": 0.2604, + "step": 16173 + }, + { + "epoch": 0.93, + "grad_norm": 0.7827261475775145, + "learning_rate": 2.612390108223395e-07, + "loss": 0.579, + "step": 16174 + }, + { + "epoch": 0.93, + "grad_norm": 0.33742011781888565, + "learning_rate": 2.6081660334576376e-07, + "loss": 0.2539, + "step": 16175 + }, + { + "epoch": 0.93, + "grad_norm": 0.35489690817213293, + "learning_rate": 2.6039453313669084e-07, + "loss": 0.224, + "step": 16176 + }, + { + "epoch": 0.93, + "grad_norm": 0.40363854862203224, + "learning_rate": 2.599728002097346e-07, + "loss": 0.2276, + "step": 16177 + }, + { + "epoch": 0.93, + "grad_norm": 0.2998525771419882, + "learning_rate": 2.5955140457950334e-07, + "loss": 0.254, + "step": 16178 + }, + { + "epoch": 0.93, + "grad_norm": 0.3225328457185091, + "learning_rate": 2.591303462605876e-07, + "loss": 0.2568, + "step": 16179 + }, + { + "epoch": 0.93, + "grad_norm": 0.40344121357429547, + "learning_rate": 2.587096252675703e-07, + "loss": 0.2584, + "step": 16180 + }, + { + "epoch": 0.93, + "grad_norm": 0.4726299423223656, + "learning_rate": 2.5828924161501847e-07, + "loss": 0.2424, + "step": 16181 + }, + { + "epoch": 0.93, + "grad_norm": 1.2427679629599802, + "learning_rate": 2.5786919531749056e-07, + "loss": 0.5494, + "step": 16182 + }, + { + "epoch": 0.93, + "grad_norm": 0.3302371945217039, + "learning_rate": 2.5744948638953495e-07, + "loss": 0.2585, + "step": 16183 + }, + { + "epoch": 0.93, + "grad_norm": 0.2960406165691513, + "learning_rate": 2.570301148456833e-07, + "loss": 0.246, + "step": 16184 + }, + { + "epoch": 0.93, + "grad_norm": 0.22857777782773264, + "learning_rate": 2.566110807004618e-07, + "loss": 0.1342, + "step": 16185 + }, + { + "epoch": 0.93, + "grad_norm": 0.9455741587639533, + "learning_rate": 2.5619238396837665e-07, + "loss": 0.5122, + "step": 16186 + }, + { + "epoch": 0.93, + "grad_norm": 0.3025016716582683, + "learning_rate": 2.5577402466393286e-07, + "loss": 0.238, + "step": 16187 + }, + { + "epoch": 0.93, + "grad_norm": 0.39578317038127414, + "learning_rate": 2.5535600280161444e-07, + "loss": 0.3052, + "step": 16188 + }, + { + "epoch": 0.93, + "grad_norm": 0.5815913779636789, + "learning_rate": 2.549383183958998e-07, + "loss": 0.3511, + "step": 16189 + }, + { + "epoch": 0.93, + "grad_norm": 0.38518298786026156, + "learning_rate": 2.5452097146125063e-07, + "loss": 0.1683, + "step": 16190 + }, + { + "epoch": 0.93, + "grad_norm": 0.22205133621720435, + "learning_rate": 2.5410396201212105e-07, + "loss": 0.2007, + "step": 16191 + }, + { + "epoch": 0.93, + "grad_norm": 0.5057323622641279, + "learning_rate": 2.536872900629539e-07, + "loss": 0.3944, + "step": 16192 + }, + { + "epoch": 0.93, + "grad_norm": 0.3595273845184508, + "learning_rate": 2.5327095562817537e-07, + "loss": 0.1673, + "step": 16193 + }, + { + "epoch": 0.93, + "grad_norm": 0.5360787331974713, + "learning_rate": 2.528549587222051e-07, + "loss": 0.3074, + "step": 16194 + }, + { + "epoch": 0.93, + "grad_norm": 0.442058729074706, + "learning_rate": 2.524392993594482e-07, + "loss": 0.3311, + "step": 16195 + }, + { + "epoch": 0.93, + "grad_norm": 0.2855255735605113, + "learning_rate": 2.5202397755429876e-07, + "loss": 0.2012, + "step": 16196 + }, + { + "epoch": 0.93, + "grad_norm": 0.23458277362119945, + "learning_rate": 2.5160899332114075e-07, + "loss": 0.1566, + "step": 16197 + }, + { + "epoch": 0.93, + "grad_norm": 1.307433304730941, + "learning_rate": 2.5119434667434384e-07, + "loss": 0.7377, + "step": 16198 + }, + { + "epoch": 0.93, + "grad_norm": 0.3418249915562307, + "learning_rate": 2.507800376282665e-07, + "loss": 0.1928, + "step": 16199 + }, + { + "epoch": 0.93, + "grad_norm": 0.3322723765898175, + "learning_rate": 2.5036606619725847e-07, + "loss": 0.3077, + "step": 16200 + }, + { + "epoch": 0.93, + "grad_norm": 0.7679882980968665, + "learning_rate": 2.499524323956537e-07, + "loss": 0.4053, + "step": 16201 + }, + { + "epoch": 0.93, + "grad_norm": 0.5105729769391152, + "learning_rate": 2.495391362377775e-07, + "loss": 0.2702, + "step": 16202 + }, + { + "epoch": 0.93, + "grad_norm": 0.20248897495158377, + "learning_rate": 2.4912617773794057e-07, + "loss": 0.16, + "step": 16203 + }, + { + "epoch": 0.93, + "grad_norm": 0.45523324068273235, + "learning_rate": 2.4871355691044595e-07, + "loss": 0.3446, + "step": 16204 + }, + { + "epoch": 0.93, + "grad_norm": 0.31116609474106843, + "learning_rate": 2.4830127376958113e-07, + "loss": 0.2482, + "step": 16205 + }, + { + "epoch": 0.93, + "grad_norm": 0.7147870871308801, + "learning_rate": 2.4788932832962465e-07, + "loss": 0.2833, + "step": 16206 + }, + { + "epoch": 0.93, + "grad_norm": 0.36457125106980254, + "learning_rate": 2.4747772060484064e-07, + "loss": 0.2912, + "step": 16207 + }, + { + "epoch": 0.93, + "grad_norm": 0.5240461723284267, + "learning_rate": 2.470664506094844e-07, + "loss": 0.2765, + "step": 16208 + }, + { + "epoch": 0.93, + "grad_norm": 0.22721409677761747, + "learning_rate": 2.466555183577968e-07, + "loss": 0.146, + "step": 16209 + }, + { + "epoch": 0.93, + "grad_norm": 0.8567658532302702, + "learning_rate": 2.462449238640097e-07, + "loss": 0.5097, + "step": 16210 + }, + { + "epoch": 0.93, + "grad_norm": 0.37293244224659416, + "learning_rate": 2.458346671423406e-07, + "loss": 0.2645, + "step": 16211 + }, + { + "epoch": 0.93, + "grad_norm": 0.27145054299585214, + "learning_rate": 2.4542474820699823e-07, + "loss": 0.244, + "step": 16212 + }, + { + "epoch": 0.93, + "grad_norm": 1.3981516737715696, + "learning_rate": 2.450151670721768e-07, + "loss": 0.4875, + "step": 16213 + }, + { + "epoch": 0.93, + "grad_norm": 0.5986289510427129, + "learning_rate": 2.446059237520615e-07, + "loss": 0.2993, + "step": 16214 + }, + { + "epoch": 0.93, + "grad_norm": 0.22481400806917337, + "learning_rate": 2.441970182608222e-07, + "loss": 0.2347, + "step": 16215 + }, + { + "epoch": 0.93, + "grad_norm": 0.30778683391514594, + "learning_rate": 2.43788450612622e-07, + "loss": 0.1411, + "step": 16216 + }, + { + "epoch": 0.93, + "grad_norm": 0.41000123055651855, + "learning_rate": 2.433802208216085e-07, + "loss": 0.2763, + "step": 16217 + }, + { + "epoch": 0.93, + "grad_norm": 0.4744266292202893, + "learning_rate": 2.42972328901917e-07, + "loss": 0.3694, + "step": 16218 + }, + { + "epoch": 0.93, + "grad_norm": 0.3541890062856887, + "learning_rate": 2.4256477486767517e-07, + "loss": 0.2324, + "step": 16219 + }, + { + "epoch": 0.93, + "grad_norm": 0.41346592434653073, + "learning_rate": 2.4215755873299497e-07, + "loss": 0.2446, + "step": 16220 + }, + { + "epoch": 0.93, + "grad_norm": 0.5884279680128669, + "learning_rate": 2.4175068051197957e-07, + "loss": 0.3041, + "step": 16221 + }, + { + "epoch": 0.93, + "grad_norm": 0.4402183600759972, + "learning_rate": 2.413441402187178e-07, + "loss": 0.2166, + "step": 16222 + }, + { + "epoch": 0.93, + "grad_norm": 0.2384371145087381, + "learning_rate": 2.4093793786728934e-07, + "loss": 0.2403, + "step": 16223 + }, + { + "epoch": 0.93, + "grad_norm": 0.4502437190455561, + "learning_rate": 2.4053207347175976e-07, + "loss": 0.3311, + "step": 16224 + }, + { + "epoch": 0.93, + "grad_norm": 0.4684769525518414, + "learning_rate": 2.401265470461855e-07, + "loss": 0.12, + "step": 16225 + }, + { + "epoch": 0.93, + "grad_norm": 0.5547322450352692, + "learning_rate": 2.397213586046099e-07, + "loss": 0.2999, + "step": 16226 + }, + { + "epoch": 0.93, + "grad_norm": 0.2818633700967168, + "learning_rate": 2.3931650816106267e-07, + "loss": 0.2502, + "step": 16227 + }, + { + "epoch": 0.93, + "grad_norm": 0.4363782611754942, + "learning_rate": 2.3891199572956493e-07, + "loss": 0.3209, + "step": 16228 + }, + { + "epoch": 0.93, + "grad_norm": 0.47684193737555375, + "learning_rate": 2.3850782132412544e-07, + "loss": 0.2063, + "step": 16229 + }, + { + "epoch": 0.93, + "grad_norm": 0.3005978735016527, + "learning_rate": 2.3810398495874076e-07, + "loss": 0.2733, + "step": 16230 + }, + { + "epoch": 0.93, + "grad_norm": 0.2758614772557928, + "learning_rate": 2.377004866473953e-07, + "loss": 0.252, + "step": 16231 + }, + { + "epoch": 0.93, + "grad_norm": 0.5232494032312904, + "learning_rate": 2.3729732640406233e-07, + "loss": 0.0864, + "step": 16232 + }, + { + "epoch": 0.93, + "grad_norm": 0.43196860650976154, + "learning_rate": 2.3689450424270179e-07, + "loss": 0.2845, + "step": 16233 + }, + { + "epoch": 0.93, + "grad_norm": 0.6300939297391976, + "learning_rate": 2.364920201772658e-07, + "loss": 0.3533, + "step": 16234 + }, + { + "epoch": 0.93, + "grad_norm": 0.24702854532908614, + "learning_rate": 2.3608987422169106e-07, + "loss": 0.2239, + "step": 16235 + }, + { + "epoch": 0.93, + "grad_norm": 0.5147166917845664, + "learning_rate": 2.3568806638990527e-07, + "loss": 0.384, + "step": 16236 + }, + { + "epoch": 0.93, + "grad_norm": 0.34917477325603674, + "learning_rate": 2.3528659669581954e-07, + "loss": 0.1735, + "step": 16237 + }, + { + "epoch": 0.93, + "grad_norm": 0.35170576502926393, + "learning_rate": 2.3488546515334054e-07, + "loss": 0.1596, + "step": 16238 + }, + { + "epoch": 0.93, + "grad_norm": 0.35057358099260666, + "learning_rate": 2.3448467177635826e-07, + "loss": 0.291, + "step": 16239 + }, + { + "epoch": 0.93, + "grad_norm": 0.43437427364444037, + "learning_rate": 2.3408421657875158e-07, + "loss": 0.3281, + "step": 16240 + }, + { + "epoch": 0.93, + "grad_norm": 0.4994355097839041, + "learning_rate": 2.3368409957438832e-07, + "loss": 0.3334, + "step": 16241 + }, + { + "epoch": 0.93, + "grad_norm": 0.3989427545060095, + "learning_rate": 2.3328432077712516e-07, + "loss": 0.2308, + "step": 16242 + }, + { + "epoch": 0.93, + "grad_norm": 0.22736027420925461, + "learning_rate": 2.3288488020080546e-07, + "loss": 0.2094, + "step": 16243 + }, + { + "epoch": 0.93, + "grad_norm": 1.355549695082054, + "learning_rate": 2.3248577785926264e-07, + "loss": 0.3942, + "step": 16244 + }, + { + "epoch": 0.93, + "grad_norm": 0.3061093657317648, + "learning_rate": 2.3208701376631672e-07, + "loss": 0.2012, + "step": 16245 + }, + { + "epoch": 0.93, + "grad_norm": 0.4283494643377407, + "learning_rate": 2.3168858793577774e-07, + "loss": 0.3167, + "step": 16246 + }, + { + "epoch": 0.93, + "grad_norm": 0.40586491113206713, + "learning_rate": 2.3129050038144362e-07, + "loss": 0.3005, + "step": 16247 + }, + { + "epoch": 0.93, + "grad_norm": 0.30763307469813905, + "learning_rate": 2.3089275111709886e-07, + "loss": 0.2203, + "step": 16248 + }, + { + "epoch": 0.93, + "grad_norm": 0.2669467232153082, + "learning_rate": 2.3049534015651686e-07, + "loss": 0.1807, + "step": 16249 + }, + { + "epoch": 0.93, + "grad_norm": 0.7019907586813147, + "learning_rate": 2.3009826751346332e-07, + "loss": 0.3552, + "step": 16250 + }, + { + "epoch": 0.93, + "grad_norm": 0.2421896836266599, + "learning_rate": 2.2970153320168498e-07, + "loss": 0.2243, + "step": 16251 + }, + { + "epoch": 0.93, + "grad_norm": 0.8531496453021868, + "learning_rate": 2.293051372349231e-07, + "loss": 0.404, + "step": 16252 + }, + { + "epoch": 0.93, + "grad_norm": 0.7552706084359576, + "learning_rate": 2.2890907962690335e-07, + "loss": 0.548, + "step": 16253 + }, + { + "epoch": 0.93, + "grad_norm": 0.3945319567289995, + "learning_rate": 2.2851336039134363e-07, + "loss": 0.2455, + "step": 16254 + }, + { + "epoch": 0.93, + "grad_norm": 0.21129773765826465, + "learning_rate": 2.2811797954194527e-07, + "loss": 0.172, + "step": 16255 + }, + { + "epoch": 0.93, + "grad_norm": 0.5848773278732337, + "learning_rate": 2.2772293709240168e-07, + "loss": 0.3105, + "step": 16256 + }, + { + "epoch": 0.93, + "grad_norm": 0.39209953000005016, + "learning_rate": 2.2732823305639197e-07, + "loss": 0.3028, + "step": 16257 + }, + { + "epoch": 0.93, + "grad_norm": 0.45565939341834677, + "learning_rate": 2.2693386744758405e-07, + "loss": 0.2577, + "step": 16258 + }, + { + "epoch": 0.93, + "grad_norm": 0.3638561329873051, + "learning_rate": 2.2653984027963704e-07, + "loss": 0.2771, + "step": 16259 + }, + { + "epoch": 0.93, + "grad_norm": 0.3977592178558374, + "learning_rate": 2.2614615156619556e-07, + "loss": 0.3164, + "step": 16260 + }, + { + "epoch": 0.93, + "grad_norm": 0.2486224244899392, + "learning_rate": 2.2575280132089315e-07, + "loss": 0.0902, + "step": 16261 + }, + { + "epoch": 0.93, + "grad_norm": 0.41724306546277456, + "learning_rate": 2.2535978955734895e-07, + "loss": 0.2994, + "step": 16262 + }, + { + "epoch": 0.93, + "grad_norm": 0.26274641689573147, + "learning_rate": 2.2496711628917644e-07, + "loss": 0.2548, + "step": 16263 + }, + { + "epoch": 0.93, + "grad_norm": 0.7297326479778752, + "learning_rate": 2.2457478152997148e-07, + "loss": 0.4205, + "step": 16264 + }, + { + "epoch": 0.93, + "grad_norm": 1.0018169427080976, + "learning_rate": 2.2418278529332203e-07, + "loss": 0.4632, + "step": 16265 + }, + { + "epoch": 0.93, + "grad_norm": 0.3327648178772939, + "learning_rate": 2.2379112759280176e-07, + "loss": 0.2609, + "step": 16266 + }, + { + "epoch": 0.93, + "grad_norm": 0.23721807132324804, + "learning_rate": 2.2339980844197527e-07, + "loss": 0.2063, + "step": 16267 + }, + { + "epoch": 0.93, + "grad_norm": 0.5633019302848433, + "learning_rate": 2.2300882785439183e-07, + "loss": 0.2385, + "step": 16268 + }, + { + "epoch": 0.93, + "grad_norm": 0.39111215606894656, + "learning_rate": 2.2261818584359386e-07, + "loss": 0.2894, + "step": 16269 + }, + { + "epoch": 0.93, + "grad_norm": 1.4038427231112138, + "learning_rate": 2.222278824231061e-07, + "loss": 0.4999, + "step": 16270 + }, + { + "epoch": 0.93, + "grad_norm": 0.2967884736462052, + "learning_rate": 2.2183791760644668e-07, + "loss": 0.2486, + "step": 16271 + }, + { + "epoch": 0.93, + "grad_norm": 0.34349186837418483, + "learning_rate": 2.214482914071203e-07, + "loss": 0.2819, + "step": 16272 + }, + { + "epoch": 0.93, + "grad_norm": 0.8920189124308012, + "learning_rate": 2.2105900383861956e-07, + "loss": 0.4225, + "step": 16273 + }, + { + "epoch": 0.94, + "grad_norm": 0.28684107801549114, + "learning_rate": 2.2067005491442362e-07, + "loss": 0.244, + "step": 16274 + }, + { + "epoch": 0.94, + "grad_norm": 0.3494953028521881, + "learning_rate": 2.2028144464800393e-07, + "loss": 0.275, + "step": 16275 + }, + { + "epoch": 0.94, + "grad_norm": 0.35670384309614495, + "learning_rate": 2.1989317305281755e-07, + "loss": 0.179, + "step": 16276 + }, + { + "epoch": 0.94, + "grad_norm": 0.996066078136281, + "learning_rate": 2.1950524014231033e-07, + "loss": 0.4171, + "step": 16277 + }, + { + "epoch": 0.94, + "grad_norm": 0.347523126063495, + "learning_rate": 2.191176459299138e-07, + "loss": 0.1977, + "step": 16278 + }, + { + "epoch": 0.94, + "grad_norm": 0.32388881265928143, + "learning_rate": 2.1873039042905497e-07, + "loss": 0.2703, + "step": 16279 + }, + { + "epoch": 0.94, + "grad_norm": 0.39745643594948593, + "learning_rate": 2.183434736531409e-07, + "loss": 0.2683, + "step": 16280 + }, + { + "epoch": 0.94, + "grad_norm": 0.22006091226414445, + "learning_rate": 2.1795689561557308e-07, + "loss": 0.152, + "step": 16281 + }, + { + "epoch": 0.94, + "grad_norm": 0.34373104169964486, + "learning_rate": 2.175706563297353e-07, + "loss": 0.2868, + "step": 16282 + }, + { + "epoch": 0.94, + "grad_norm": 1.263619920512286, + "learning_rate": 2.1718475580900567e-07, + "loss": 0.7482, + "step": 16283 + }, + { + "epoch": 0.94, + "grad_norm": 0.29616054478209414, + "learning_rate": 2.16799194066748e-07, + "loss": 0.1906, + "step": 16284 + }, + { + "epoch": 0.94, + "grad_norm": 0.5626197863716071, + "learning_rate": 2.164139711163138e-07, + "loss": 0.3607, + "step": 16285 + }, + { + "epoch": 0.94, + "grad_norm": 0.3590447014920199, + "learning_rate": 2.160290869710424e-07, + "loss": 0.3023, + "step": 16286 + }, + { + "epoch": 0.94, + "grad_norm": 0.22648512679657767, + "learning_rate": 2.1564454164426207e-07, + "loss": 0.1571, + "step": 16287 + }, + { + "epoch": 0.94, + "grad_norm": 0.4333570135949622, + "learning_rate": 2.1526033514929213e-07, + "loss": 0.2604, + "step": 16288 + }, + { + "epoch": 0.94, + "grad_norm": 1.220905292397416, + "learning_rate": 2.1487646749943524e-07, + "loss": 0.5481, + "step": 16289 + }, + { + "epoch": 0.94, + "grad_norm": 0.32203788282666024, + "learning_rate": 2.1449293870798637e-07, + "loss": 0.2688, + "step": 16290 + }, + { + "epoch": 0.94, + "grad_norm": 0.34693965055254655, + "learning_rate": 2.1410974878822487e-07, + "loss": 0.2552, + "step": 16291 + }, + { + "epoch": 0.94, + "grad_norm": 0.551396319559656, + "learning_rate": 2.1372689775342238e-07, + "loss": 0.2369, + "step": 16292 + }, + { + "epoch": 0.94, + "grad_norm": 0.30887122673256734, + "learning_rate": 2.1334438561683713e-07, + "loss": 0.2373, + "step": 16293 + }, + { + "epoch": 0.94, + "grad_norm": 0.22820054330620598, + "learning_rate": 2.1296221239171523e-07, + "loss": 0.1915, + "step": 16294 + }, + { + "epoch": 0.94, + "grad_norm": 1.137231060487121, + "learning_rate": 2.125803780912894e-07, + "loss": 0.7172, + "step": 16295 + }, + { + "epoch": 0.94, + "grad_norm": 0.40545905296169865, + "learning_rate": 2.1219888272878575e-07, + "loss": 0.2602, + "step": 16296 + }, + { + "epoch": 0.94, + "grad_norm": 0.4730775018614647, + "learning_rate": 2.118177263174137e-07, + "loss": 0.2668, + "step": 16297 + }, + { + "epoch": 0.94, + "grad_norm": 0.34205215126746363, + "learning_rate": 2.114369088703727e-07, + "loss": 0.2772, + "step": 16298 + }, + { + "epoch": 0.94, + "grad_norm": 0.32729918038453004, + "learning_rate": 2.1105643040085112e-07, + "loss": 0.2464, + "step": 16299 + }, + { + "epoch": 0.94, + "grad_norm": 0.24639056641925142, + "learning_rate": 2.106762909220228e-07, + "loss": 0.1184, + "step": 16300 + }, + { + "epoch": 0.94, + "grad_norm": 0.99073060626058, + "learning_rate": 2.10296490447055e-07, + "loss": 0.4329, + "step": 16301 + }, + { + "epoch": 0.94, + "grad_norm": 0.32121737540817974, + "learning_rate": 2.0991702898909838e-07, + "loss": 0.2518, + "step": 16302 + }, + { + "epoch": 0.94, + "grad_norm": 0.3301117105900323, + "learning_rate": 2.0953790656129457e-07, + "loss": 0.3058, + "step": 16303 + }, + { + "epoch": 0.94, + "grad_norm": 1.7456922022694272, + "learning_rate": 2.091591231767709e-07, + "loss": 0.1374, + "step": 16304 + }, + { + "epoch": 0.94, + "grad_norm": 0.32467499870042676, + "learning_rate": 2.0878067884864683e-07, + "loss": 0.2509, + "step": 16305 + }, + { + "epoch": 0.94, + "grad_norm": 0.39956950076075687, + "learning_rate": 2.0840257359002635e-07, + "loss": 0.2391, + "step": 16306 + }, + { + "epoch": 0.94, + "grad_norm": 0.3293116288624257, + "learning_rate": 2.0802480741400456e-07, + "loss": 0.2718, + "step": 16307 + }, + { + "epoch": 0.94, + "grad_norm": 0.3131425030863296, + "learning_rate": 2.0764738033366095e-07, + "loss": 0.2571, + "step": 16308 + }, + { + "epoch": 0.94, + "grad_norm": 0.44822453408391827, + "learning_rate": 2.0727029236206953e-07, + "loss": 0.237, + "step": 16309 + }, + { + "epoch": 0.94, + "grad_norm": 0.3492819711372927, + "learning_rate": 2.068935435122854e-07, + "loss": 0.2403, + "step": 16310 + }, + { + "epoch": 0.94, + "grad_norm": 0.3019190826036198, + "learning_rate": 2.0651713379735706e-07, + "loss": 0.2593, + "step": 16311 + }, + { + "epoch": 0.94, + "grad_norm": 0.4584884405827178, + "learning_rate": 2.0614106323031846e-07, + "loss": 0.2696, + "step": 16312 + }, + { + "epoch": 0.94, + "grad_norm": 0.411254505830708, + "learning_rate": 2.0576533182419477e-07, + "loss": 0.2811, + "step": 16313 + }, + { + "epoch": 0.94, + "grad_norm": 0.3902520439694165, + "learning_rate": 2.053899395919956e-07, + "loss": 0.2648, + "step": 16314 + }, + { + "epoch": 0.94, + "grad_norm": 0.353453516685148, + "learning_rate": 2.0501488654672276e-07, + "loss": 0.2453, + "step": 16315 + }, + { + "epoch": 0.94, + "grad_norm": 1.1755006016717067, + "learning_rate": 2.0464017270136139e-07, + "loss": 0.4489, + "step": 16316 + }, + { + "epoch": 0.94, + "grad_norm": 0.36115634958306614, + "learning_rate": 2.0426579806889114e-07, + "loss": 0.1607, + "step": 16317 + }, + { + "epoch": 0.94, + "grad_norm": 0.2925206750954721, + "learning_rate": 2.0389176266227494e-07, + "loss": 0.3033, + "step": 16318 + }, + { + "epoch": 0.94, + "grad_norm": 0.7559070874858043, + "learning_rate": 2.0351806649446582e-07, + "loss": 0.4248, + "step": 16319 + }, + { + "epoch": 0.94, + "grad_norm": 0.3267369295851818, + "learning_rate": 2.0314470957840337e-07, + "loss": 0.2155, + "step": 16320 + }, + { + "epoch": 0.94, + "grad_norm": 0.27187878348807465, + "learning_rate": 2.0277169192701951e-07, + "loss": 0.1862, + "step": 16321 + }, + { + "epoch": 0.94, + "grad_norm": 0.3450218801283784, + "learning_rate": 2.0239901355323166e-07, + "loss": 0.2994, + "step": 16322 + }, + { + "epoch": 0.94, + "grad_norm": 0.33760499589787035, + "learning_rate": 2.0202667446994396e-07, + "loss": 0.1877, + "step": 16323 + }, + { + "epoch": 0.94, + "grad_norm": 1.3079162281268204, + "learning_rate": 2.016546746900505e-07, + "loss": 0.5858, + "step": 16324 + }, + { + "epoch": 0.94, + "grad_norm": 0.5508684131706696, + "learning_rate": 2.0128301422643437e-07, + "loss": 0.3595, + "step": 16325 + }, + { + "epoch": 0.94, + "grad_norm": 0.22361769791366404, + "learning_rate": 2.0091169309196635e-07, + "loss": 0.219, + "step": 16326 + }, + { + "epoch": 0.94, + "grad_norm": 0.27103442373090697, + "learning_rate": 2.0054071129950503e-07, + "loss": 0.1882, + "step": 16327 + }, + { + "epoch": 0.94, + "grad_norm": 1.4862555952033187, + "learning_rate": 2.0017006886189793e-07, + "loss": 0.5859, + "step": 16328 + }, + { + "epoch": 0.94, + "grad_norm": 0.43694139275442706, + "learning_rate": 1.9979976579197924e-07, + "loss": 0.2516, + "step": 16329 + }, + { + "epoch": 0.94, + "grad_norm": 0.2777763051503804, + "learning_rate": 1.9942980210257313e-07, + "loss": 0.2499, + "step": 16330 + }, + { + "epoch": 0.94, + "grad_norm": 0.5401113539028809, + "learning_rate": 1.9906017780649267e-07, + "loss": 0.3562, + "step": 16331 + }, + { + "epoch": 0.94, + "grad_norm": 0.5545585769788984, + "learning_rate": 1.9869089291653544e-07, + "loss": 0.3261, + "step": 16332 + }, + { + "epoch": 0.94, + "grad_norm": 0.24578812393967553, + "learning_rate": 1.9832194744549117e-07, + "loss": 0.1533, + "step": 16333 + }, + { + "epoch": 0.94, + "grad_norm": 0.3471473882109775, + "learning_rate": 1.9795334140613741e-07, + "loss": 0.2771, + "step": 16334 + }, + { + "epoch": 0.94, + "grad_norm": 0.7065238619042722, + "learning_rate": 1.9758507481123734e-07, + "loss": 0.3106, + "step": 16335 + }, + { + "epoch": 0.94, + "grad_norm": 0.37428051274745233, + "learning_rate": 1.9721714767354516e-07, + "loss": 0.2519, + "step": 16336 + }, + { + "epoch": 0.94, + "grad_norm": 0.8941617265103616, + "learning_rate": 1.968495600058018e-07, + "loss": 0.5307, + "step": 16337 + }, + { + "epoch": 0.94, + "grad_norm": 0.31408047918318077, + "learning_rate": 1.9648231182073484e-07, + "loss": 0.2839, + "step": 16338 + }, + { + "epoch": 0.94, + "grad_norm": 0.21954413848341273, + "learning_rate": 1.9611540313106526e-07, + "loss": 0.1522, + "step": 16339 + }, + { + "epoch": 0.94, + "grad_norm": 1.4172099353996386, + "learning_rate": 1.957488339494973e-07, + "loss": 0.4308, + "step": 16340 + }, + { + "epoch": 0.94, + "grad_norm": 0.4576229631078692, + "learning_rate": 1.9538260428872636e-07, + "loss": 0.2894, + "step": 16341 + }, + { + "epoch": 0.94, + "grad_norm": 0.29422998452537247, + "learning_rate": 1.9501671416143342e-07, + "loss": 0.2752, + "step": 16342 + }, + { + "epoch": 0.94, + "grad_norm": 0.7309287759530217, + "learning_rate": 1.9465116358029057e-07, + "loss": 0.3446, + "step": 16343 + }, + { + "epoch": 0.94, + "grad_norm": 0.3900915110844801, + "learning_rate": 1.9428595255795657e-07, + "loss": 0.2888, + "step": 16344 + }, + { + "epoch": 0.94, + "grad_norm": 0.37353761465583235, + "learning_rate": 1.9392108110707686e-07, + "loss": 0.2302, + "step": 16345 + }, + { + "epoch": 0.94, + "grad_norm": 0.3062970278982408, + "learning_rate": 1.935565492402891e-07, + "loss": 0.2056, + "step": 16346 + }, + { + "epoch": 0.94, + "grad_norm": 0.5981284968410989, + "learning_rate": 1.9319235697021766e-07, + "loss": 0.3264, + "step": 16347 + }, + { + "epoch": 0.94, + "grad_norm": 0.35572231889161077, + "learning_rate": 1.9282850430947242e-07, + "loss": 0.3058, + "step": 16348 + }, + { + "epoch": 0.94, + "grad_norm": 0.3250188011219933, + "learning_rate": 1.9246499127065333e-07, + "loss": 0.2664, + "step": 16349 + }, + { + "epoch": 0.94, + "grad_norm": 0.5737774291728073, + "learning_rate": 1.9210181786635028e-07, + "loss": 0.2784, + "step": 16350 + }, + { + "epoch": 0.94, + "grad_norm": 0.4012088600693107, + "learning_rate": 1.9173898410913995e-07, + "loss": 0.2982, + "step": 16351 + }, + { + "epoch": 0.94, + "grad_norm": 0.29848629529359066, + "learning_rate": 1.9137649001158665e-07, + "loss": 0.1829, + "step": 16352 + }, + { + "epoch": 0.94, + "grad_norm": 0.48598014526128647, + "learning_rate": 1.9101433558624483e-07, + "loss": 0.2018, + "step": 16353 + }, + { + "epoch": 0.94, + "grad_norm": 0.26365262037215137, + "learning_rate": 1.9065252084565222e-07, + "loss": 0.2652, + "step": 16354 + }, + { + "epoch": 0.94, + "grad_norm": 1.2940730599519867, + "learning_rate": 1.9029104580234325e-07, + "loss": 0.7013, + "step": 16355 + }, + { + "epoch": 0.94, + "grad_norm": 0.6116318818103755, + "learning_rate": 1.8992991046883236e-07, + "loss": 0.1767, + "step": 16356 + }, + { + "epoch": 0.94, + "grad_norm": 0.23131950704158802, + "learning_rate": 1.895691148576273e-07, + "loss": 0.1767, + "step": 16357 + }, + { + "epoch": 0.94, + "grad_norm": 0.3293918268569444, + "learning_rate": 1.8920865898122143e-07, + "loss": 0.2735, + "step": 16358 + }, + { + "epoch": 0.94, + "grad_norm": 0.4511372616577314, + "learning_rate": 1.888485428520992e-07, + "loss": 0.2337, + "step": 16359 + }, + { + "epoch": 0.94, + "grad_norm": 0.3753456647512689, + "learning_rate": 1.884887664827284e-07, + "loss": 0.2896, + "step": 16360 + }, + { + "epoch": 0.94, + "grad_norm": 0.44961231456689443, + "learning_rate": 1.881293298855713e-07, + "loss": 0.3533, + "step": 16361 + }, + { + "epoch": 0.94, + "grad_norm": 0.28662578208318146, + "learning_rate": 1.877702330730724e-07, + "loss": 0.1942, + "step": 16362 + }, + { + "epoch": 0.94, + "grad_norm": 0.6551363416683533, + "learning_rate": 1.874114760576684e-07, + "loss": 0.3072, + "step": 16363 + }, + { + "epoch": 0.94, + "grad_norm": 0.32419575634159736, + "learning_rate": 1.870530588517827e-07, + "loss": 0.1952, + "step": 16364 + }, + { + "epoch": 0.94, + "grad_norm": 0.3382957834583971, + "learning_rate": 1.8669498146782871e-07, + "loss": 0.2804, + "step": 16365 + }, + { + "epoch": 0.94, + "grad_norm": 0.27619581453105596, + "learning_rate": 1.863372439182054e-07, + "loss": 0.2148, + "step": 16366 + }, + { + "epoch": 0.94, + "grad_norm": 1.2113862017013661, + "learning_rate": 1.8597984621530063e-07, + "loss": 0.8154, + "step": 16367 + }, + { + "epoch": 0.94, + "grad_norm": 1.391102015454029, + "learning_rate": 1.8562278837149228e-07, + "loss": 0.4795, + "step": 16368 + }, + { + "epoch": 0.94, + "grad_norm": 0.3298554751561711, + "learning_rate": 1.852660703991438e-07, + "loss": 0.1875, + "step": 16369 + }, + { + "epoch": 0.94, + "grad_norm": 0.33732172554386936, + "learning_rate": 1.8490969231061085e-07, + "loss": 0.3013, + "step": 16370 + }, + { + "epoch": 0.94, + "grad_norm": 0.41691582029462937, + "learning_rate": 1.8455365411823134e-07, + "loss": 0.2349, + "step": 16371 + }, + { + "epoch": 0.94, + "grad_norm": 0.23903592330484194, + "learning_rate": 1.8419795583433763e-07, + "loss": 0.1662, + "step": 16372 + }, + { + "epoch": 0.94, + "grad_norm": 0.48747978730964747, + "learning_rate": 1.8384259747124766e-07, + "loss": 0.3685, + "step": 16373 + }, + { + "epoch": 0.94, + "grad_norm": 0.4383352697210021, + "learning_rate": 1.834875790412649e-07, + "loss": 0.3244, + "step": 16374 + }, + { + "epoch": 0.94, + "grad_norm": 0.34083610981257056, + "learning_rate": 1.831329005566851e-07, + "loss": 0.2185, + "step": 16375 + }, + { + "epoch": 0.94, + "grad_norm": 0.6216981447352833, + "learning_rate": 1.8277856202979282e-07, + "loss": 0.3541, + "step": 16376 + }, + { + "epoch": 0.94, + "grad_norm": 0.2519742262206242, + "learning_rate": 1.8242456347285498e-07, + "loss": 0.2207, + "step": 16377 + }, + { + "epoch": 0.94, + "grad_norm": 0.27485795073992864, + "learning_rate": 1.8207090489813284e-07, + "loss": 0.2107, + "step": 16378 + }, + { + "epoch": 0.94, + "grad_norm": 0.9112149768074835, + "learning_rate": 1.8171758631787327e-07, + "loss": 0.5171, + "step": 16379 + }, + { + "epoch": 0.94, + "grad_norm": 0.6028629951517451, + "learning_rate": 1.8136460774431097e-07, + "loss": 0.2255, + "step": 16380 + }, + { + "epoch": 0.94, + "grad_norm": 0.3830929246588872, + "learning_rate": 1.8101196918967056e-07, + "loss": 0.2811, + "step": 16381 + }, + { + "epoch": 0.94, + "grad_norm": 0.3110335698157431, + "learning_rate": 1.806596706661634e-07, + "loss": 0.2559, + "step": 16382 + }, + { + "epoch": 0.94, + "grad_norm": 0.3532742177417911, + "learning_rate": 1.8030771218598863e-07, + "loss": 0.2153, + "step": 16383 + }, + { + "epoch": 0.94, + "grad_norm": 0.27118052800532905, + "learning_rate": 1.799560937613365e-07, + "loss": 0.2216, + "step": 16384 + }, + { + "epoch": 0.94, + "grad_norm": 0.30009922060906546, + "learning_rate": 1.7960481540438278e-07, + "loss": 0.259, + "step": 16385 + }, + { + "epoch": 0.94, + "grad_norm": 1.2439511677777768, + "learning_rate": 1.7925387712729113e-07, + "loss": 0.3727, + "step": 16386 + }, + { + "epoch": 0.94, + "grad_norm": 0.3074681980467877, + "learning_rate": 1.7890327894221515e-07, + "loss": 0.2405, + "step": 16387 + }, + { + "epoch": 0.94, + "grad_norm": 0.6218933087042393, + "learning_rate": 1.7855302086129734e-07, + "loss": 0.2886, + "step": 16388 + }, + { + "epoch": 0.94, + "grad_norm": 0.25243688522148056, + "learning_rate": 1.7820310289666577e-07, + "loss": 0.2225, + "step": 16389 + }, + { + "epoch": 0.94, + "grad_norm": 0.24248323345775227, + "learning_rate": 1.7785352506043852e-07, + "loss": 0.2009, + "step": 16390 + }, + { + "epoch": 0.94, + "grad_norm": 1.140361489090267, + "learning_rate": 1.7750428736472146e-07, + "loss": 0.7097, + "step": 16391 + }, + { + "epoch": 0.94, + "grad_norm": 0.6494164060996802, + "learning_rate": 1.7715538982160717e-07, + "loss": 0.2132, + "step": 16392 + }, + { + "epoch": 0.94, + "grad_norm": 0.29027986290742447, + "learning_rate": 1.7680683244318154e-07, + "loss": 0.274, + "step": 16393 + }, + { + "epoch": 0.94, + "grad_norm": 0.4739590651850567, + "learning_rate": 1.7645861524151152e-07, + "loss": 0.3406, + "step": 16394 + }, + { + "epoch": 0.94, + "grad_norm": 0.47333802545971043, + "learning_rate": 1.7611073822865753e-07, + "loss": 0.1103, + "step": 16395 + }, + { + "epoch": 0.94, + "grad_norm": 0.34722614276277375, + "learning_rate": 1.7576320141666548e-07, + "loss": 0.297, + "step": 16396 + }, + { + "epoch": 0.94, + "grad_norm": 0.4741094317552018, + "learning_rate": 1.7541600481757238e-07, + "loss": 0.3714, + "step": 16397 + }, + { + "epoch": 0.94, + "grad_norm": 0.30253332304639474, + "learning_rate": 1.7506914844340084e-07, + "loss": 0.2009, + "step": 16398 + }, + { + "epoch": 0.94, + "grad_norm": 0.3499583597663478, + "learning_rate": 1.7472263230616126e-07, + "loss": 0.2693, + "step": 16399 + }, + { + "epoch": 0.94, + "grad_norm": 0.9313377074422858, + "learning_rate": 1.7437645641785404e-07, + "loss": 0.4164, + "step": 16400 + }, + { + "epoch": 0.94, + "grad_norm": 0.3084187512136342, + "learning_rate": 1.7403062079046851e-07, + "loss": 0.2116, + "step": 16401 + }, + { + "epoch": 0.94, + "grad_norm": 0.3905940549961325, + "learning_rate": 1.736851254359795e-07, + "loss": 0.2868, + "step": 16402 + }, + { + "epoch": 0.94, + "grad_norm": 0.45116394117225134, + "learning_rate": 1.7333997036635296e-07, + "loss": 0.3161, + "step": 16403 + }, + { + "epoch": 0.94, + "grad_norm": 0.587337210292463, + "learning_rate": 1.7299515559354052e-07, + "loss": 0.3472, + "step": 16404 + }, + { + "epoch": 0.94, + "grad_norm": 0.24303637781952572, + "learning_rate": 1.7265068112948257e-07, + "loss": 0.1978, + "step": 16405 + }, + { + "epoch": 0.94, + "grad_norm": 0.3114290454839184, + "learning_rate": 1.7230654698610848e-07, + "loss": 0.2325, + "step": 16406 + }, + { + "epoch": 0.94, + "grad_norm": 1.4056712180482611, + "learning_rate": 1.7196275317533761e-07, + "loss": 0.4493, + "step": 16407 + }, + { + "epoch": 0.94, + "grad_norm": 0.28533591224264054, + "learning_rate": 1.7161929970907266e-07, + "loss": 0.2084, + "step": 16408 + }, + { + "epoch": 0.94, + "grad_norm": 0.33232194989980035, + "learning_rate": 1.7127618659920963e-07, + "loss": 0.2976, + "step": 16409 + }, + { + "epoch": 0.94, + "grad_norm": 0.5558040330434648, + "learning_rate": 1.7093341385762907e-07, + "loss": 0.3583, + "step": 16410 + }, + { + "epoch": 0.94, + "grad_norm": 0.1846507440261721, + "learning_rate": 1.7059098149620257e-07, + "loss": 0.1403, + "step": 16411 + }, + { + "epoch": 0.94, + "grad_norm": 1.3482449752952768, + "learning_rate": 1.702488895267862e-07, + "loss": 0.5575, + "step": 16412 + }, + { + "epoch": 0.94, + "grad_norm": 0.3510275776236248, + "learning_rate": 1.6990713796122938e-07, + "loss": 0.2993, + "step": 16413 + }, + { + "epoch": 0.94, + "grad_norm": 0.2897476651472436, + "learning_rate": 1.6956572681136485e-07, + "loss": 0.1903, + "step": 16414 + }, + { + "epoch": 0.94, + "grad_norm": 0.6730125448322015, + "learning_rate": 1.692246560890176e-07, + "loss": 0.4091, + "step": 16415 + }, + { + "epoch": 0.94, + "grad_norm": 0.43855648011694554, + "learning_rate": 1.688839258059971e-07, + "loss": 0.3141, + "step": 16416 + }, + { + "epoch": 0.94, + "grad_norm": 0.4039676932669491, + "learning_rate": 1.6854353597410278e-07, + "loss": 0.2686, + "step": 16417 + }, + { + "epoch": 0.94, + "grad_norm": 0.24169214358315877, + "learning_rate": 1.68203486605123e-07, + "loss": 0.1656, + "step": 16418 + }, + { + "epoch": 0.94, + "grad_norm": 1.4027029186189741, + "learning_rate": 1.6786377771083496e-07, + "loss": 0.5126, + "step": 16419 + }, + { + "epoch": 0.94, + "grad_norm": 0.4016732571923031, + "learning_rate": 1.675244093030015e-07, + "loss": 0.272, + "step": 16420 + }, + { + "epoch": 0.94, + "grad_norm": 0.31401939566076453, + "learning_rate": 1.6718538139337325e-07, + "loss": 0.251, + "step": 16421 + }, + { + "epoch": 0.94, + "grad_norm": 0.7274054388947614, + "learning_rate": 1.6684669399369412e-07, + "loss": 0.4075, + "step": 16422 + }, + { + "epoch": 0.94, + "grad_norm": 0.3134544640656414, + "learning_rate": 1.6650834711569031e-07, + "loss": 0.2514, + "step": 16423 + }, + { + "epoch": 0.94, + "grad_norm": 0.29983200999934284, + "learning_rate": 1.661703407710802e-07, + "loss": 0.0921, + "step": 16424 + }, + { + "epoch": 0.94, + "grad_norm": 0.3718216113469689, + "learning_rate": 1.6583267497156663e-07, + "loss": 0.3093, + "step": 16425 + }, + { + "epoch": 0.94, + "grad_norm": 0.3162630195231657, + "learning_rate": 1.6549534972884584e-07, + "loss": 0.2512, + "step": 16426 + }, + { + "epoch": 0.94, + "grad_norm": 0.5407154342108497, + "learning_rate": 1.6515836505459848e-07, + "loss": 0.3433, + "step": 16427 + }, + { + "epoch": 0.94, + "grad_norm": 0.4685866469053301, + "learning_rate": 1.648217209604941e-07, + "loss": 0.2603, + "step": 16428 + }, + { + "epoch": 0.94, + "grad_norm": 0.2709135275382767, + "learning_rate": 1.6448541745819113e-07, + "loss": 0.2503, + "step": 16429 + }, + { + "epoch": 0.94, + "grad_norm": 0.3388604124213988, + "learning_rate": 1.6414945455933363e-07, + "loss": 0.1982, + "step": 16430 + }, + { + "epoch": 0.94, + "grad_norm": 0.9826337080521574, + "learning_rate": 1.638138322755578e-07, + "loss": 0.2629, + "step": 16431 + }, + { + "epoch": 0.94, + "grad_norm": 0.300363016379023, + "learning_rate": 1.634785506184866e-07, + "loss": 0.2463, + "step": 16432 + }, + { + "epoch": 0.94, + "grad_norm": 0.34111656134447776, + "learning_rate": 1.6314360959973075e-07, + "loss": 0.2959, + "step": 16433 + }, + { + "epoch": 0.94, + "grad_norm": 1.0121583179292197, + "learning_rate": 1.628090092308876e-07, + "loss": 0.3706, + "step": 16434 + }, + { + "epoch": 0.94, + "grad_norm": 0.39314249015309166, + "learning_rate": 1.6247474952354568e-07, + "loss": 0.2884, + "step": 16435 + }, + { + "epoch": 0.94, + "grad_norm": 0.2459622684578627, + "learning_rate": 1.621408304892802e-07, + "loss": 0.19, + "step": 16436 + }, + { + "epoch": 0.94, + "grad_norm": 0.3447675748059979, + "learning_rate": 1.618072521396552e-07, + "loss": 0.247, + "step": 16437 + }, + { + "epoch": 0.94, + "grad_norm": 0.41316622660519126, + "learning_rate": 1.6147401448622145e-07, + "loss": 0.2924, + "step": 16438 + }, + { + "epoch": 0.94, + "grad_norm": 0.44203133836751707, + "learning_rate": 1.6114111754051976e-07, + "loss": 0.3195, + "step": 16439 + }, + { + "epoch": 0.94, + "grad_norm": 0.48436498902588176, + "learning_rate": 1.6080856131407862e-07, + "loss": 0.3431, + "step": 16440 + }, + { + "epoch": 0.94, + "grad_norm": 0.28460937853923546, + "learning_rate": 1.6047634581841331e-07, + "loss": 0.184, + "step": 16441 + }, + { + "epoch": 0.94, + "grad_norm": 0.2880830569609324, + "learning_rate": 1.6014447106502907e-07, + "loss": 0.1768, + "step": 16442 + }, + { + "epoch": 0.94, + "grad_norm": 0.8736808553813734, + "learning_rate": 1.5981293706541888e-07, + "loss": 0.3771, + "step": 16443 + }, + { + "epoch": 0.94, + "grad_norm": 0.31317366638812105, + "learning_rate": 1.5948174383106362e-07, + "loss": 0.2167, + "step": 16444 + }, + { + "epoch": 0.94, + "grad_norm": 0.3277336661706098, + "learning_rate": 1.5915089137343186e-07, + "loss": 0.3097, + "step": 16445 + }, + { + "epoch": 0.94, + "grad_norm": 1.1496737135542834, + "learning_rate": 1.5882037970398111e-07, + "loss": 0.7183, + "step": 16446 + }, + { + "epoch": 0.94, + "grad_norm": 0.29250473166722574, + "learning_rate": 1.584902088341589e-07, + "loss": 0.1838, + "step": 16447 + }, + { + "epoch": 0.95, + "grad_norm": 0.2878214529095352, + "learning_rate": 1.5816037877539715e-07, + "loss": 0.1679, + "step": 16448 + }, + { + "epoch": 0.95, + "grad_norm": 0.3309627514911893, + "learning_rate": 1.5783088953911784e-07, + "loss": 0.3004, + "step": 16449 + }, + { + "epoch": 0.95, + "grad_norm": 0.33326136046621196, + "learning_rate": 1.5750174113673077e-07, + "loss": 0.2167, + "step": 16450 + }, + { + "epoch": 0.95, + "grad_norm": 1.179074426417154, + "learning_rate": 1.5717293357963682e-07, + "loss": 0.5454, + "step": 16451 + }, + { + "epoch": 0.95, + "grad_norm": 0.4564239880365591, + "learning_rate": 1.5684446687922017e-07, + "loss": 0.3404, + "step": 16452 + }, + { + "epoch": 0.95, + "grad_norm": 0.30674601902753773, + "learning_rate": 1.565163410468562e-07, + "loss": 0.2332, + "step": 16453 + }, + { + "epoch": 0.95, + "grad_norm": 0.2190549081107368, + "learning_rate": 1.5618855609390803e-07, + "loss": 0.1242, + "step": 16454 + }, + { + "epoch": 0.95, + "grad_norm": 0.5729376141354351, + "learning_rate": 1.558611120317266e-07, + "loss": 0.3633, + "step": 16455 + }, + { + "epoch": 0.95, + "grad_norm": 0.5545582014596254, + "learning_rate": 1.5553400887165172e-07, + "loss": 0.2793, + "step": 16456 + }, + { + "epoch": 0.95, + "grad_norm": 0.2458146065487911, + "learning_rate": 1.5520724662501207e-07, + "loss": 0.2315, + "step": 16457 + }, + { + "epoch": 0.95, + "grad_norm": 1.1168303612442845, + "learning_rate": 1.5488082530312087e-07, + "loss": 0.694, + "step": 16458 + }, + { + "epoch": 0.95, + "grad_norm": 0.6203300463305141, + "learning_rate": 1.545547449172835e-07, + "loss": 0.2942, + "step": 16459 + }, + { + "epoch": 0.95, + "grad_norm": 0.24751408093210903, + "learning_rate": 1.5422900547879206e-07, + "loss": 0.199, + "step": 16460 + }, + { + "epoch": 0.95, + "grad_norm": 0.4335318922233136, + "learning_rate": 1.5390360699892636e-07, + "loss": 0.3313, + "step": 16461 + }, + { + "epoch": 0.95, + "grad_norm": 0.3373096852746386, + "learning_rate": 1.5357854948895634e-07, + "loss": 0.2321, + "step": 16462 + }, + { + "epoch": 0.95, + "grad_norm": 0.3630705794262418, + "learning_rate": 1.532538329601363e-07, + "loss": 0.2461, + "step": 16463 + }, + { + "epoch": 0.95, + "grad_norm": 0.5126707395414409, + "learning_rate": 1.529294574237139e-07, + "loss": 0.3202, + "step": 16464 + }, + { + "epoch": 0.95, + "grad_norm": 0.2925570376583863, + "learning_rate": 1.5260542289092016e-07, + "loss": 0.2425, + "step": 16465 + }, + { + "epoch": 0.95, + "grad_norm": 0.5729303620205985, + "learning_rate": 1.5228172937297837e-07, + "loss": 0.3226, + "step": 16466 + }, + { + "epoch": 0.95, + "grad_norm": 0.4405506344298435, + "learning_rate": 1.5195837688109506e-07, + "loss": 0.2103, + "step": 16467 + }, + { + "epoch": 0.95, + "grad_norm": 0.23129493919039548, + "learning_rate": 1.5163536542647018e-07, + "loss": 0.2064, + "step": 16468 + }, + { + "epoch": 0.95, + "grad_norm": 0.34775353831476136, + "learning_rate": 1.5131269502029034e-07, + "loss": 0.3053, + "step": 16469 + }, + { + "epoch": 0.95, + "grad_norm": 1.0191758063869858, + "learning_rate": 1.509903656737277e-07, + "loss": 0.5071, + "step": 16470 + }, + { + "epoch": 0.95, + "grad_norm": 0.6469909048210354, + "learning_rate": 1.506683773979445e-07, + "loss": 0.352, + "step": 16471 + }, + { + "epoch": 0.95, + "grad_norm": 0.3790611624365381, + "learning_rate": 1.5034673020409173e-07, + "loss": 0.2984, + "step": 16472 + }, + { + "epoch": 0.95, + "grad_norm": 0.2816475223008202, + "learning_rate": 1.5002542410330946e-07, + "loss": 0.2091, + "step": 16473 + }, + { + "epoch": 0.95, + "grad_norm": 0.33164768844534576, + "learning_rate": 1.4970445910672205e-07, + "loss": 0.1769, + "step": 16474 + }, + { + "epoch": 0.95, + "grad_norm": 0.400650506425723, + "learning_rate": 1.493838352254462e-07, + "loss": 0.3036, + "step": 16475 + }, + { + "epoch": 0.95, + "grad_norm": 0.32037790280791784, + "learning_rate": 1.4906355247058412e-07, + "loss": 0.2392, + "step": 16476 + }, + { + "epoch": 0.95, + "grad_norm": 0.605568545808694, + "learning_rate": 1.487436108532292e-07, + "loss": 0.33, + "step": 16477 + }, + { + "epoch": 0.95, + "grad_norm": 0.3674500331935406, + "learning_rate": 1.4842401038445808e-07, + "loss": 0.2867, + "step": 16478 + }, + { + "epoch": 0.95, + "grad_norm": 0.9639296926238876, + "learning_rate": 1.4810475107533973e-07, + "loss": 0.3768, + "step": 16479 + }, + { + "epoch": 0.95, + "grad_norm": 0.2325175694217112, + "learning_rate": 1.4778583293692972e-07, + "loss": 0.1823, + "step": 16480 + }, + { + "epoch": 0.95, + "grad_norm": 0.30507855821640345, + "learning_rate": 1.4746725598027367e-07, + "loss": 0.257, + "step": 16481 + }, + { + "epoch": 0.95, + "grad_norm": 1.3321349423875684, + "learning_rate": 1.4714902021640277e-07, + "loss": 0.7486, + "step": 16482 + }, + { + "epoch": 0.95, + "grad_norm": 0.5040720847135953, + "learning_rate": 1.4683112565633706e-07, + "loss": 0.2263, + "step": 16483 + }, + { + "epoch": 0.95, + "grad_norm": 0.3693885098403536, + "learning_rate": 1.4651357231108555e-07, + "loss": 0.2884, + "step": 16484 + }, + { + "epoch": 0.95, + "grad_norm": 0.3716837935292826, + "learning_rate": 1.4619636019164608e-07, + "loss": 0.3051, + "step": 16485 + }, + { + "epoch": 0.95, + "grad_norm": 0.20352716304565405, + "learning_rate": 1.458794893090032e-07, + "loss": 0.1185, + "step": 16486 + }, + { + "epoch": 0.95, + "grad_norm": 0.5404248538254562, + "learning_rate": 1.4556295967412925e-07, + "loss": 0.3645, + "step": 16487 + }, + { + "epoch": 0.95, + "grad_norm": 0.35677414223681986, + "learning_rate": 1.4524677129798547e-07, + "loss": 0.2902, + "step": 16488 + }, + { + "epoch": 0.95, + "grad_norm": 0.5106964245865684, + "learning_rate": 1.449309241915231e-07, + "loss": 0.1938, + "step": 16489 + }, + { + "epoch": 0.95, + "grad_norm": 0.3935425552716978, + "learning_rate": 1.4461541836568004e-07, + "loss": 0.3145, + "step": 16490 + }, + { + "epoch": 0.95, + "grad_norm": 1.3403643170390191, + "learning_rate": 1.443002538313798e-07, + "loss": 0.5119, + "step": 16491 + }, + { + "epoch": 0.95, + "grad_norm": 0.26232504943660767, + "learning_rate": 1.4398543059953918e-07, + "loss": 0.2427, + "step": 16492 + }, + { + "epoch": 0.95, + "grad_norm": 0.30415871484094553, + "learning_rate": 1.4367094868105725e-07, + "loss": 0.2046, + "step": 16493 + }, + { + "epoch": 0.95, + "grad_norm": 0.42062338088496104, + "learning_rate": 1.433568080868286e-07, + "loss": 0.2729, + "step": 16494 + }, + { + "epoch": 0.95, + "grad_norm": 0.5711244914388044, + "learning_rate": 1.4304300882772903e-07, + "loss": 0.3911, + "step": 16495 + }, + { + "epoch": 0.95, + "grad_norm": 0.2384222357994241, + "learning_rate": 1.4272955091462648e-07, + "loss": 0.2252, + "step": 16496 + }, + { + "epoch": 0.95, + "grad_norm": 1.3215017477278463, + "learning_rate": 1.424164343583745e-07, + "loss": 0.532, + "step": 16497 + }, + { + "epoch": 0.95, + "grad_norm": 1.0583273806987223, + "learning_rate": 1.4210365916981882e-07, + "loss": 0.4026, + "step": 16498 + }, + { + "epoch": 0.95, + "grad_norm": 0.3118227358818172, + "learning_rate": 1.4179122535978862e-07, + "loss": 0.2325, + "step": 16499 + }, + { + "epoch": 0.95, + "grad_norm": 0.34214996786542395, + "learning_rate": 1.414791329391052e-07, + "loss": 0.3061, + "step": 16500 + }, + { + "epoch": 0.95, + "grad_norm": 0.3931429662229714, + "learning_rate": 1.4116738191857437e-07, + "loss": 0.3039, + "step": 16501 + }, + { + "epoch": 0.95, + "grad_norm": 0.23757841266563745, + "learning_rate": 1.4085597230899418e-07, + "loss": 0.167, + "step": 16502 + }, + { + "epoch": 0.95, + "grad_norm": 1.2952326611305722, + "learning_rate": 1.4054490412114817e-07, + "loss": 0.5202, + "step": 16503 + }, + { + "epoch": 0.95, + "grad_norm": 0.3303112645977888, + "learning_rate": 1.402341773658078e-07, + "loss": 0.2998, + "step": 16504 + }, + { + "epoch": 0.95, + "grad_norm": 0.3496116138968531, + "learning_rate": 1.3992379205373219e-07, + "loss": 0.2641, + "step": 16505 + }, + { + "epoch": 0.95, + "grad_norm": 0.6284477927092317, + "learning_rate": 1.3961374819567386e-07, + "loss": 0.3239, + "step": 16506 + }, + { + "epoch": 0.95, + "grad_norm": 0.29321421158262606, + "learning_rate": 1.3930404580236646e-07, + "loss": 0.1722, + "step": 16507 + }, + { + "epoch": 0.95, + "grad_norm": 0.27799789142741405, + "learning_rate": 1.3899468488453583e-07, + "loss": 0.2652, + "step": 16508 + }, + { + "epoch": 0.95, + "grad_norm": 0.5390190495988705, + "learning_rate": 1.3868566545289563e-07, + "loss": 0.2118, + "step": 16509 + }, + { + "epoch": 0.95, + "grad_norm": 1.2106732546022587, + "learning_rate": 1.383769875181462e-07, + "loss": 0.6369, + "step": 16510 + }, + { + "epoch": 0.95, + "grad_norm": 0.3303768375574665, + "learning_rate": 1.38068651090979e-07, + "loss": 0.2693, + "step": 16511 + }, + { + "epoch": 0.95, + "grad_norm": 0.31858858994348516, + "learning_rate": 1.377606561820699e-07, + "loss": 0.2569, + "step": 16512 + }, + { + "epoch": 0.95, + "grad_norm": 0.41073201497419076, + "learning_rate": 1.3745300280208373e-07, + "loss": 0.249, + "step": 16513 + }, + { + "epoch": 0.95, + "grad_norm": 0.25752711484513774, + "learning_rate": 1.371456909616764e-07, + "loss": 0.2332, + "step": 16514 + }, + { + "epoch": 0.95, + "grad_norm": 1.4245672657375734, + "learning_rate": 1.3683872067149052e-07, + "loss": 0.6246, + "step": 16515 + }, + { + "epoch": 0.95, + "grad_norm": 0.27505237674289085, + "learning_rate": 1.3653209194215534e-07, + "loss": 0.2159, + "step": 16516 + }, + { + "epoch": 0.95, + "grad_norm": 0.3313341959115871, + "learning_rate": 1.3622580478428903e-07, + "loss": 0.2806, + "step": 16517 + }, + { + "epoch": 0.95, + "grad_norm": 0.6390544637914571, + "learning_rate": 1.3591985920849981e-07, + "loss": 0.3746, + "step": 16518 + }, + { + "epoch": 0.95, + "grad_norm": 0.29075587750528226, + "learning_rate": 1.356142552253814e-07, + "loss": 0.1736, + "step": 16519 + }, + { + "epoch": 0.95, + "grad_norm": 0.28201683673325467, + "learning_rate": 1.3530899284551756e-07, + "loss": 0.2628, + "step": 16520 + }, + { + "epoch": 0.95, + "grad_norm": 0.5231549591416089, + "learning_rate": 1.3500407207947875e-07, + "loss": 0.2276, + "step": 16521 + }, + { + "epoch": 0.95, + "grad_norm": 0.6591201808581352, + "learning_rate": 1.3469949293782426e-07, + "loss": 0.2343, + "step": 16522 + }, + { + "epoch": 0.95, + "grad_norm": 0.35442038050860036, + "learning_rate": 1.3439525543110232e-07, + "loss": 0.285, + "step": 16523 + }, + { + "epoch": 0.95, + "grad_norm": 0.3256582404882992, + "learning_rate": 1.3409135956984897e-07, + "loss": 0.2858, + "step": 16524 + }, + { + "epoch": 0.95, + "grad_norm": 0.4867984784902485, + "learning_rate": 1.337878053645869e-07, + "loss": 0.1892, + "step": 16525 + }, + { + "epoch": 0.95, + "grad_norm": 0.2675448451742374, + "learning_rate": 1.334845928258288e-07, + "loss": 0.2195, + "step": 16526 + }, + { + "epoch": 0.95, + "grad_norm": 0.4860536461056225, + "learning_rate": 1.331817219640752e-07, + "loss": 0.2422, + "step": 16527 + }, + { + "epoch": 0.95, + "grad_norm": 0.3300567767788356, + "learning_rate": 1.3287919278981544e-07, + "loss": 0.2799, + "step": 16528 + }, + { + "epoch": 0.95, + "grad_norm": 0.33493831219224574, + "learning_rate": 1.3257700531352334e-07, + "loss": 0.2111, + "step": 16529 + }, + { + "epoch": 0.95, + "grad_norm": 0.7971942715761426, + "learning_rate": 1.3227515954566506e-07, + "loss": 0.4725, + "step": 16530 + }, + { + "epoch": 0.95, + "grad_norm": 1.2101032035959423, + "learning_rate": 1.319736554966955e-07, + "loss": 0.4656, + "step": 16531 + }, + { + "epoch": 0.95, + "grad_norm": 0.2161492687029256, + "learning_rate": 1.31672493177053e-07, + "loss": 0.2144, + "step": 16532 + }, + { + "epoch": 0.95, + "grad_norm": 0.2706126200232461, + "learning_rate": 1.3137167259716698e-07, + "loss": 0.1613, + "step": 16533 + }, + { + "epoch": 0.95, + "grad_norm": 0.7201379209477112, + "learning_rate": 1.310711937674569e-07, + "loss": 0.3627, + "step": 16534 + }, + { + "epoch": 0.95, + "grad_norm": 0.3188833536266433, + "learning_rate": 1.3077105669832556e-07, + "loss": 0.2127, + "step": 16535 + }, + { + "epoch": 0.95, + "grad_norm": 0.353800166876077, + "learning_rate": 1.3047126140016907e-07, + "loss": 0.29, + "step": 16536 + }, + { + "epoch": 0.95, + "grad_norm": 1.152362615048209, + "learning_rate": 1.3017180788336804e-07, + "loss": 0.5908, + "step": 16537 + }, + { + "epoch": 0.95, + "grad_norm": 0.2201443708188749, + "learning_rate": 1.29872696158293e-07, + "loss": 0.1572, + "step": 16538 + }, + { + "epoch": 0.95, + "grad_norm": 0.39768952646586436, + "learning_rate": 1.295739262353013e-07, + "loss": 0.2506, + "step": 16539 + }, + { + "epoch": 0.95, + "grad_norm": 0.3711468804434103, + "learning_rate": 1.2927549812474128e-07, + "loss": 0.3222, + "step": 16540 + }, + { + "epoch": 0.95, + "grad_norm": 0.3875931478557323, + "learning_rate": 1.2897741183694578e-07, + "loss": 0.2718, + "step": 16541 + }, + { + "epoch": 0.95, + "grad_norm": 0.5040208709722376, + "learning_rate": 1.286796673822388e-07, + "loss": 0.2931, + "step": 16542 + }, + { + "epoch": 0.95, + "grad_norm": 0.3644096645649208, + "learning_rate": 1.2838226477092875e-07, + "loss": 0.2835, + "step": 16543 + }, + { + "epoch": 0.95, + "grad_norm": 0.43199632651889797, + "learning_rate": 1.2808520401331737e-07, + "loss": 0.2786, + "step": 16544 + }, + { + "epoch": 0.95, + "grad_norm": 0.26276909694452727, + "learning_rate": 1.27788485119692e-07, + "loss": 0.142, + "step": 16545 + }, + { + "epoch": 0.95, + "grad_norm": 0.6481758566901482, + "learning_rate": 1.2749210810032664e-07, + "loss": 0.3816, + "step": 16546 + }, + { + "epoch": 0.95, + "grad_norm": 0.3026318614423293, + "learning_rate": 1.2719607296548309e-07, + "loss": 0.2441, + "step": 16547 + }, + { + "epoch": 0.95, + "grad_norm": 0.31330158406573605, + "learning_rate": 1.2690037972541646e-07, + "loss": 0.2622, + "step": 16548 + }, + { + "epoch": 0.95, + "grad_norm": 1.5618122703655006, + "learning_rate": 1.2660502839036526e-07, + "loss": 0.5628, + "step": 16549 + }, + { + "epoch": 0.95, + "grad_norm": 0.3455198798238632, + "learning_rate": 1.2631001897055683e-07, + "loss": 0.2721, + "step": 16550 + }, + { + "epoch": 0.95, + "grad_norm": 0.2731384236231438, + "learning_rate": 1.2601535147620746e-07, + "loss": 0.1375, + "step": 16551 + }, + { + "epoch": 0.95, + "grad_norm": 0.3394524321219831, + "learning_rate": 1.2572102591752234e-07, + "loss": 0.3059, + "step": 16552 + }, + { + "epoch": 0.95, + "grad_norm": 0.30354042132773396, + "learning_rate": 1.2542704230469326e-07, + "loss": 0.2326, + "step": 16553 + }, + { + "epoch": 0.95, + "grad_norm": 1.1255505639202197, + "learning_rate": 1.2513340064790102e-07, + "loss": 0.6941, + "step": 16554 + }, + { + "epoch": 0.95, + "grad_norm": 0.35766936306315206, + "learning_rate": 1.2484010095731414e-07, + "loss": 0.246, + "step": 16555 + }, + { + "epoch": 0.95, + "grad_norm": 0.3323040325151218, + "learning_rate": 1.2454714324309115e-07, + "loss": 0.2591, + "step": 16556 + }, + { + "epoch": 0.95, + "grad_norm": 0.7689015974335848, + "learning_rate": 1.2425452751537503e-07, + "loss": 0.3713, + "step": 16557 + }, + { + "epoch": 0.95, + "grad_norm": 0.2691954444647513, + "learning_rate": 1.2396225378430105e-07, + "loss": 0.144, + "step": 16558 + }, + { + "epoch": 0.95, + "grad_norm": 0.38664284669524407, + "learning_rate": 1.2367032205998775e-07, + "loss": 0.2492, + "step": 16559 + }, + { + "epoch": 0.95, + "grad_norm": 0.30289171963027917, + "learning_rate": 1.2337873235254704e-07, + "loss": 0.3089, + "step": 16560 + }, + { + "epoch": 0.95, + "grad_norm": 1.3377802262789582, + "learning_rate": 1.2308748467207753e-07, + "loss": 0.2227, + "step": 16561 + }, + { + "epoch": 0.95, + "grad_norm": 0.39685089389976136, + "learning_rate": 1.2279657902866226e-07, + "loss": 0.2741, + "step": 16562 + }, + { + "epoch": 0.95, + "grad_norm": 0.34991767076091934, + "learning_rate": 1.225060154323776e-07, + "loss": 0.2956, + "step": 16563 + }, + { + "epoch": 0.95, + "grad_norm": 0.2420095759472581, + "learning_rate": 1.222157938932833e-07, + "loss": 0.1584, + "step": 16564 + }, + { + "epoch": 0.95, + "grad_norm": 0.4161152844228987, + "learning_rate": 1.219259144214324e-07, + "loss": 0.2804, + "step": 16565 + }, + { + "epoch": 0.95, + "grad_norm": 0.5728617107352475, + "learning_rate": 1.216363770268625e-07, + "loss": 0.4119, + "step": 16566 + }, + { + "epoch": 0.95, + "grad_norm": 0.5569364520388921, + "learning_rate": 1.2134718171960103e-07, + "loss": 0.3633, + "step": 16567 + }, + { + "epoch": 0.95, + "grad_norm": 0.2639185558147655, + "learning_rate": 1.2105832850966004e-07, + "loss": 0.2085, + "step": 16568 + }, + { + "epoch": 0.95, + "grad_norm": 0.59735708477971, + "learning_rate": 1.2076981740704485e-07, + "loss": 0.3744, + "step": 16569 + }, + { + "epoch": 0.95, + "grad_norm": 0.34765113855029134, + "learning_rate": 1.2048164842174636e-07, + "loss": 0.1818, + "step": 16570 + }, + { + "epoch": 0.95, + "grad_norm": 0.2991838323871734, + "learning_rate": 1.2019382156374326e-07, + "loss": 0.1983, + "step": 16571 + }, + { + "epoch": 0.95, + "grad_norm": 0.33619733070672253, + "learning_rate": 1.1990633684300424e-07, + "loss": 0.2832, + "step": 16572 + }, + { + "epoch": 0.95, + "grad_norm": 0.9724457561401293, + "learning_rate": 1.1961919426948244e-07, + "loss": 0.4762, + "step": 16573 + }, + { + "epoch": 0.95, + "grad_norm": 0.3870489682298207, + "learning_rate": 1.1933239385312324e-07, + "loss": 0.2104, + "step": 16574 + }, + { + "epoch": 0.95, + "grad_norm": 0.361921769610525, + "learning_rate": 1.190459356038598e-07, + "loss": 0.2939, + "step": 16575 + }, + { + "epoch": 0.95, + "grad_norm": 0.3518873026409385, + "learning_rate": 1.1875981953160975e-07, + "loss": 0.2099, + "step": 16576 + }, + { + "epoch": 0.95, + "grad_norm": 0.3527276444123807, + "learning_rate": 1.1847404564628185e-07, + "loss": 0.1475, + "step": 16577 + }, + { + "epoch": 0.95, + "grad_norm": 0.2743531371236152, + "learning_rate": 1.181886139577737e-07, + "loss": 0.2313, + "step": 16578 + }, + { + "epoch": 0.95, + "grad_norm": 0.338837035760285, + "learning_rate": 1.1790352447596853e-07, + "loss": 0.3039, + "step": 16579 + }, + { + "epoch": 0.95, + "grad_norm": 0.5475170217274716, + "learning_rate": 1.1761877721073845e-07, + "loss": 0.3508, + "step": 16580 + }, + { + "epoch": 0.95, + "grad_norm": 0.5025763106213428, + "learning_rate": 1.1733437217194665e-07, + "loss": 0.2398, + "step": 16581 + }, + { + "epoch": 0.95, + "grad_norm": 0.535478845267447, + "learning_rate": 1.1705030936943973e-07, + "loss": 0.2167, + "step": 16582 + }, + { + "epoch": 0.95, + "grad_norm": 0.3968889128451416, + "learning_rate": 1.1676658881305647e-07, + "loss": 0.304, + "step": 16583 + }, + { + "epoch": 0.95, + "grad_norm": 0.21367273624373442, + "learning_rate": 1.1648321051262012e-07, + "loss": 0.196, + "step": 16584 + }, + { + "epoch": 0.95, + "grad_norm": 0.80451182074773, + "learning_rate": 1.1620017447794507e-07, + "loss": 0.3895, + "step": 16585 + }, + { + "epoch": 0.95, + "grad_norm": 0.5701986470439201, + "learning_rate": 1.1591748071883458e-07, + "loss": 0.2978, + "step": 16586 + }, + { + "epoch": 0.95, + "grad_norm": 0.2685745996616732, + "learning_rate": 1.1563512924507525e-07, + "loss": 0.246, + "step": 16587 + }, + { + "epoch": 0.95, + "grad_norm": 1.4199611521261792, + "learning_rate": 1.1535312006644706e-07, + "loss": 0.7062, + "step": 16588 + }, + { + "epoch": 0.95, + "grad_norm": 0.3948249665352689, + "learning_rate": 1.150714531927144e-07, + "loss": 0.2906, + "step": 16589 + }, + { + "epoch": 0.95, + "grad_norm": 0.5276002741172402, + "learning_rate": 1.1479012863363282e-07, + "loss": 0.3783, + "step": 16590 + }, + { + "epoch": 0.95, + "grad_norm": 0.24658030549592613, + "learning_rate": 1.1450914639894451e-07, + "loss": 0.2207, + "step": 16591 + }, + { + "epoch": 0.95, + "grad_norm": 0.3155271224796519, + "learning_rate": 1.1422850649837836e-07, + "loss": 0.223, + "step": 16592 + }, + { + "epoch": 0.95, + "grad_norm": 0.5925497152046855, + "learning_rate": 1.1394820894165437e-07, + "loss": 0.2979, + "step": 16593 + }, + { + "epoch": 0.95, + "grad_norm": 1.5226704293130526, + "learning_rate": 1.1366825373847923e-07, + "loss": 0.2853, + "step": 16594 + }, + { + "epoch": 0.95, + "grad_norm": 0.4706632168946928, + "learning_rate": 1.1338864089854629e-07, + "loss": 0.2925, + "step": 16595 + }, + { + "epoch": 0.95, + "grad_norm": 0.35243983811365676, + "learning_rate": 1.1310937043154113e-07, + "loss": 0.2916, + "step": 16596 + }, + { + "epoch": 0.95, + "grad_norm": 0.31547774494065683, + "learning_rate": 1.128304423471327e-07, + "loss": 0.1765, + "step": 16597 + }, + { + "epoch": 0.95, + "grad_norm": 0.640983379760982, + "learning_rate": 1.1255185665497992e-07, + "loss": 0.3632, + "step": 16598 + }, + { + "epoch": 0.95, + "grad_norm": 0.2510122641943343, + "learning_rate": 1.1227361336473175e-07, + "loss": 0.256, + "step": 16599 + }, + { + "epoch": 0.95, + "grad_norm": 1.4540579733239993, + "learning_rate": 1.119957124860238e-07, + "loss": 0.203, + "step": 16600 + }, + { + "epoch": 0.95, + "grad_norm": 0.6368166691950049, + "learning_rate": 1.1171815402847841e-07, + "loss": 0.3969, + "step": 16601 + }, + { + "epoch": 0.95, + "grad_norm": 0.351165725596696, + "learning_rate": 1.1144093800170786e-07, + "loss": 0.2781, + "step": 16602 + }, + { + "epoch": 0.95, + "grad_norm": 0.3683204475698124, + "learning_rate": 1.1116406441531335e-07, + "loss": 0.3086, + "step": 16603 + }, + { + "epoch": 0.95, + "grad_norm": 0.23192559345215039, + "learning_rate": 1.1088753327888169e-07, + "loss": 0.1139, + "step": 16604 + }, + { + "epoch": 0.95, + "grad_norm": 0.4094259485578723, + "learning_rate": 1.1061134460198964e-07, + "loss": 0.2794, + "step": 16605 + }, + { + "epoch": 0.95, + "grad_norm": 1.2833862226613364, + "learning_rate": 1.1033549839420066e-07, + "loss": 0.5491, + "step": 16606 + }, + { + "epoch": 0.95, + "grad_norm": 0.2600204249198676, + "learning_rate": 1.1005999466506822e-07, + "loss": 0.2268, + "step": 16607 + }, + { + "epoch": 0.95, + "grad_norm": 0.36192005945864036, + "learning_rate": 1.0978483342413359e-07, + "loss": 0.288, + "step": 16608 + }, + { + "epoch": 0.95, + "grad_norm": 0.5971870477119776, + "learning_rate": 1.0951001468092471e-07, + "loss": 0.3491, + "step": 16609 + }, + { + "epoch": 0.95, + "grad_norm": 0.1664000066752861, + "learning_rate": 1.0923553844495838e-07, + "loss": 0.1133, + "step": 16610 + }, + { + "epoch": 0.95, + "grad_norm": 0.3779109104685889, + "learning_rate": 1.0896140472574035e-07, + "loss": 0.3052, + "step": 16611 + }, + { + "epoch": 0.95, + "grad_norm": 0.48651148623474383, + "learning_rate": 1.0868761353276414e-07, + "loss": 0.3653, + "step": 16612 + }, + { + "epoch": 0.95, + "grad_norm": 0.6419661588127807, + "learning_rate": 1.0841416487550994e-07, + "loss": 0.2488, + "step": 16613 + }, + { + "epoch": 0.95, + "grad_norm": 0.3649414029689163, + "learning_rate": 1.0814105876344794e-07, + "loss": 0.2928, + "step": 16614 + }, + { + "epoch": 0.95, + "grad_norm": 0.3596194680368247, + "learning_rate": 1.0786829520603503e-07, + "loss": 0.3083, + "step": 16615 + }, + { + "epoch": 0.95, + "grad_norm": 0.2692613514757282, + "learning_rate": 1.0759587421271811e-07, + "loss": 0.1606, + "step": 16616 + }, + { + "epoch": 0.95, + "grad_norm": 0.3453726228128851, + "learning_rate": 1.0732379579293184e-07, + "loss": 0.1923, + "step": 16617 + }, + { + "epoch": 0.95, + "grad_norm": 1.3122016638571212, + "learning_rate": 1.0705205995609536e-07, + "loss": 0.5729, + "step": 16618 + }, + { + "epoch": 0.95, + "grad_norm": 0.3154318714815696, + "learning_rate": 1.0678066671162113e-07, + "loss": 0.2759, + "step": 16619 + }, + { + "epoch": 0.95, + "grad_norm": 0.3255968830256943, + "learning_rate": 1.0650961606890719e-07, + "loss": 0.2382, + "step": 16620 + }, + { + "epoch": 0.95, + "grad_norm": 1.2859265206571604, + "learning_rate": 1.062389080373405e-07, + "loss": 0.8102, + "step": 16621 + }, + { + "epoch": 0.96, + "grad_norm": 0.26955193137320504, + "learning_rate": 1.0596854262629352e-07, + "loss": 0.1498, + "step": 16622 + }, + { + "epoch": 0.96, + "grad_norm": 0.24722787460287485, + "learning_rate": 1.0569851984513102e-07, + "loss": 0.2281, + "step": 16623 + }, + { + "epoch": 0.96, + "grad_norm": 1.3302779187313543, + "learning_rate": 1.0542883970320328e-07, + "loss": 0.5446, + "step": 16624 + }, + { + "epoch": 0.96, + "grad_norm": 0.5332660462662445, + "learning_rate": 1.051595022098506e-07, + "loss": 0.3458, + "step": 16625 + }, + { + "epoch": 0.96, + "grad_norm": 0.30330182429363667, + "learning_rate": 1.0489050737439777e-07, + "loss": 0.2059, + "step": 16626 + }, + { + "epoch": 0.96, + "grad_norm": 0.36778833648956744, + "learning_rate": 1.0462185520616064e-07, + "loss": 0.315, + "step": 16627 + }, + { + "epoch": 0.96, + "grad_norm": 0.23797657274293577, + "learning_rate": 1.0435354571444401e-07, + "loss": 0.1594, + "step": 16628 + }, + { + "epoch": 0.96, + "grad_norm": 0.3561924831436635, + "learning_rate": 1.0408557890853821e-07, + "loss": 0.2595, + "step": 16629 + }, + { + "epoch": 0.96, + "grad_norm": 0.4752465406071092, + "learning_rate": 1.038179547977236e-07, + "loss": 0.2584, + "step": 16630 + }, + { + "epoch": 0.96, + "grad_norm": 0.39518269810907725, + "learning_rate": 1.0355067339126723e-07, + "loss": 0.2972, + "step": 16631 + }, + { + "epoch": 0.96, + "grad_norm": 0.36175612579907546, + "learning_rate": 1.0328373469842502e-07, + "loss": 0.2588, + "step": 16632 + }, + { + "epoch": 0.96, + "grad_norm": 1.571137981694321, + "learning_rate": 1.0301713872844288e-07, + "loss": 0.414, + "step": 16633 + }, + { + "epoch": 0.96, + "grad_norm": 0.3156274111668739, + "learning_rate": 1.0275088549055123e-07, + "loss": 0.226, + "step": 16634 + }, + { + "epoch": 0.96, + "grad_norm": 0.2669295187752056, + "learning_rate": 1.0248497499396936e-07, + "loss": 0.2455, + "step": 16635 + }, + { + "epoch": 0.96, + "grad_norm": 0.4475781213109659, + "learning_rate": 1.022194072479088e-07, + "loss": 0.1764, + "step": 16636 + }, + { + "epoch": 0.96, + "grad_norm": 0.7397982641900223, + "learning_rate": 1.019541822615644e-07, + "loss": 0.4622, + "step": 16637 + }, + { + "epoch": 0.96, + "grad_norm": 0.32650303175664785, + "learning_rate": 1.0168930004412103e-07, + "loss": 0.2467, + "step": 16638 + }, + { + "epoch": 0.96, + "grad_norm": 0.3796386468131196, + "learning_rate": 1.0142476060475137e-07, + "loss": 0.2462, + "step": 16639 + }, + { + "epoch": 0.96, + "grad_norm": 0.8287190883309027, + "learning_rate": 1.0116056395261586e-07, + "loss": 0.327, + "step": 16640 + }, + { + "epoch": 0.96, + "grad_norm": 0.20625582020929892, + "learning_rate": 1.0089671009686497e-07, + "loss": 0.2011, + "step": 16641 + }, + { + "epoch": 0.96, + "grad_norm": 0.5948962808042882, + "learning_rate": 1.0063319904663471e-07, + "loss": 0.4045, + "step": 16642 + }, + { + "epoch": 0.96, + "grad_norm": 0.29778674786226755, + "learning_rate": 1.0037003081105223e-07, + "loss": 0.2352, + "step": 16643 + }, + { + "epoch": 0.96, + "grad_norm": 0.33424304060632115, + "learning_rate": 1.0010720539922914e-07, + "loss": 0.2542, + "step": 16644 + }, + { + "epoch": 0.96, + "grad_norm": 1.1934592174173186, + "learning_rate": 9.984472282026814e-08, + "loss": 0.702, + "step": 16645 + }, + { + "epoch": 0.96, + "grad_norm": 0.3274171866966553, + "learning_rate": 9.958258308325975e-08, + "loss": 0.2261, + "step": 16646 + }, + { + "epoch": 0.96, + "grad_norm": 0.34650531219482306, + "learning_rate": 9.932078619727892e-08, + "loss": 0.2777, + "step": 16647 + }, + { + "epoch": 0.96, + "grad_norm": 0.3254717650580907, + "learning_rate": 9.905933217139397e-08, + "loss": 0.2203, + "step": 16648 + }, + { + "epoch": 0.96, + "grad_norm": 0.7048051219031087, + "learning_rate": 9.879822101465874e-08, + "loss": 0.2505, + "step": 16649 + }, + { + "epoch": 0.96, + "grad_norm": 0.3501335317692874, + "learning_rate": 9.853745273611604e-08, + "loss": 0.275, + "step": 16650 + }, + { + "epoch": 0.96, + "grad_norm": 0.34539266905883126, + "learning_rate": 9.827702734479528e-08, + "loss": 0.3104, + "step": 16651 + }, + { + "epoch": 0.96, + "grad_norm": 2.0335167832865215, + "learning_rate": 9.801694484971369e-08, + "loss": 0.2343, + "step": 16652 + }, + { + "epoch": 0.96, + "grad_norm": 0.30600946235260734, + "learning_rate": 9.775720525988076e-08, + "loss": 0.2483, + "step": 16653 + }, + { + "epoch": 0.96, + "grad_norm": 0.3128501072923363, + "learning_rate": 9.749780858429036e-08, + "loss": 0.2964, + "step": 16654 + }, + { + "epoch": 0.96, + "grad_norm": 0.2661670102671037, + "learning_rate": 9.723875483192536e-08, + "loss": 0.1817, + "step": 16655 + }, + { + "epoch": 0.96, + "grad_norm": 0.2967207607658545, + "learning_rate": 9.69800440117552e-08, + "loss": 0.2061, + "step": 16656 + }, + { + "epoch": 0.96, + "grad_norm": 1.3288708256421893, + "learning_rate": 9.672167613274053e-08, + "loss": 0.7043, + "step": 16657 + }, + { + "epoch": 0.96, + "grad_norm": 0.3621252628942739, + "learning_rate": 9.64636512038286e-08, + "loss": 0.2824, + "step": 16658 + }, + { + "epoch": 0.96, + "grad_norm": 0.2996790863119093, + "learning_rate": 9.620596923395454e-08, + "loss": 0.2111, + "step": 16659 + }, + { + "epoch": 0.96, + "grad_norm": 0.7797861094274411, + "learning_rate": 9.594863023204226e-08, + "loss": 0.3865, + "step": 16660 + }, + { + "epoch": 0.96, + "grad_norm": 0.24464123847918023, + "learning_rate": 9.569163420700245e-08, + "loss": 0.183, + "step": 16661 + }, + { + "epoch": 0.96, + "grad_norm": 0.3100921626514089, + "learning_rate": 9.543498116773576e-08, + "loss": 0.1861, + "step": 16662 + }, + { + "epoch": 0.96, + "grad_norm": 0.3610536933026674, + "learning_rate": 9.51786711231295e-08, + "loss": 0.2764, + "step": 16663 + }, + { + "epoch": 0.96, + "grad_norm": 0.6986584506879514, + "learning_rate": 9.492270408206106e-08, + "loss": 0.3745, + "step": 16664 + }, + { + "epoch": 0.96, + "grad_norm": 0.2895372923549709, + "learning_rate": 9.46670800533922e-08, + "loss": 0.1944, + "step": 16665 + }, + { + "epoch": 0.96, + "grad_norm": 0.3473184880519653, + "learning_rate": 9.441179904597697e-08, + "loss": 0.3016, + "step": 16666 + }, + { + "epoch": 0.96, + "grad_norm": 0.38263612931781815, + "learning_rate": 9.415686106865496e-08, + "loss": 0.2469, + "step": 16667 + }, + { + "epoch": 0.96, + "grad_norm": 0.4296040570362124, + "learning_rate": 9.390226613025466e-08, + "loss": 0.248, + "step": 16668 + }, + { + "epoch": 0.96, + "grad_norm": 0.31216699027746436, + "learning_rate": 9.364801423959235e-08, + "loss": 0.1625, + "step": 16669 + }, + { + "epoch": 0.96, + "grad_norm": 0.33552034711426854, + "learning_rate": 9.339410540547433e-08, + "loss": 0.2803, + "step": 16670 + }, + { + "epoch": 0.96, + "grad_norm": 0.3310185221024679, + "learning_rate": 9.314053963669245e-08, + "loss": 0.254, + "step": 16671 + }, + { + "epoch": 0.96, + "grad_norm": 0.7250290339843236, + "learning_rate": 9.288731694202747e-08, + "loss": 0.2567, + "step": 16672 + }, + { + "epoch": 0.96, + "grad_norm": 1.2242252500615893, + "learning_rate": 9.263443733024791e-08, + "loss": 0.7467, + "step": 16673 + }, + { + "epoch": 0.96, + "grad_norm": 0.32309648425980414, + "learning_rate": 9.238190081011345e-08, + "loss": 0.2512, + "step": 16674 + }, + { + "epoch": 0.96, + "grad_norm": 0.19987086897763234, + "learning_rate": 9.212970739036709e-08, + "loss": 0.1648, + "step": 16675 + }, + { + "epoch": 0.96, + "grad_norm": 0.5338994500229073, + "learning_rate": 9.187785707974183e-08, + "loss": 0.3529, + "step": 16676 + }, + { + "epoch": 0.96, + "grad_norm": 0.32266698545901074, + "learning_rate": 9.162634988696184e-08, + "loss": 0.2438, + "step": 16677 + }, + { + "epoch": 0.96, + "grad_norm": 0.5167703108082058, + "learning_rate": 9.137518582073345e-08, + "loss": 0.3471, + "step": 16678 + }, + { + "epoch": 0.96, + "grad_norm": 0.4186625502370127, + "learning_rate": 9.112436488975751e-08, + "loss": 0.2912, + "step": 16679 + }, + { + "epoch": 0.96, + "grad_norm": 0.3821134635943973, + "learning_rate": 9.087388710271927e-08, + "loss": 0.2757, + "step": 16680 + }, + { + "epoch": 0.96, + "grad_norm": 0.25700002896802115, + "learning_rate": 9.06237524682918e-08, + "loss": 0.1689, + "step": 16681 + }, + { + "epoch": 0.96, + "grad_norm": 0.3561604016730922, + "learning_rate": 9.037396099513707e-08, + "loss": 0.2703, + "step": 16682 + }, + { + "epoch": 0.96, + "grad_norm": 0.39261394030257535, + "learning_rate": 9.012451269190592e-08, + "loss": 0.2591, + "step": 16683 + }, + { + "epoch": 0.96, + "grad_norm": 0.6363852734792065, + "learning_rate": 8.987540756723811e-08, + "loss": 0.3443, + "step": 16684 + }, + { + "epoch": 0.96, + "grad_norm": 1.3103970018735047, + "learning_rate": 8.962664562975676e-08, + "loss": 0.3247, + "step": 16685 + }, + { + "epoch": 0.96, + "grad_norm": 0.3028875377252202, + "learning_rate": 8.93782268880794e-08, + "loss": 0.2357, + "step": 16686 + }, + { + "epoch": 0.96, + "grad_norm": 0.2752372436372155, + "learning_rate": 8.913015135080805e-08, + "loss": 0.2525, + "step": 16687 + }, + { + "epoch": 0.96, + "grad_norm": 0.36823697113929454, + "learning_rate": 8.888241902653361e-08, + "loss": 0.1942, + "step": 16688 + }, + { + "epoch": 0.96, + "grad_norm": 0.4023075320401017, + "learning_rate": 8.863502992383477e-08, + "loss": 0.2551, + "step": 16689 + }, + { + "epoch": 0.96, + "grad_norm": 0.2965441139243796, + "learning_rate": 8.838798405127802e-08, + "loss": 0.2671, + "step": 16690 + }, + { + "epoch": 0.96, + "grad_norm": 1.325159274198456, + "learning_rate": 8.814128141741984e-08, + "loss": 0.6663, + "step": 16691 + }, + { + "epoch": 0.96, + "grad_norm": 0.2863213468271409, + "learning_rate": 8.78949220308023e-08, + "loss": 0.2085, + "step": 16692 + }, + { + "epoch": 0.96, + "grad_norm": 0.7635493682524934, + "learning_rate": 8.764890589995745e-08, + "loss": 0.4148, + "step": 16693 + }, + { + "epoch": 0.96, + "grad_norm": 0.2738614036932938, + "learning_rate": 8.740323303340514e-08, + "loss": 0.2462, + "step": 16694 + }, + { + "epoch": 0.96, + "grad_norm": 0.26102284400214015, + "learning_rate": 8.715790343965192e-08, + "loss": 0.1567, + "step": 16695 + }, + { + "epoch": 0.96, + "grad_norm": 1.418513087327854, + "learning_rate": 8.691291712719541e-08, + "loss": 0.4353, + "step": 16696 + }, + { + "epoch": 0.96, + "grad_norm": 1.1335322604001175, + "learning_rate": 8.666827410451772e-08, + "loss": 0.7633, + "step": 16697 + }, + { + "epoch": 0.96, + "grad_norm": 0.24218728721153193, + "learning_rate": 8.642397438008987e-08, + "loss": 0.2166, + "step": 16698 + }, + { + "epoch": 0.96, + "grad_norm": 0.44698841899876174, + "learning_rate": 8.618001796237507e-08, + "loss": 0.3161, + "step": 16699 + }, + { + "epoch": 0.96, + "grad_norm": 0.8319313930088971, + "learning_rate": 8.593640485981991e-08, + "loss": 0.194, + "step": 16700 + }, + { + "epoch": 0.96, + "grad_norm": 0.3124725689478206, + "learning_rate": 8.569313508086097e-08, + "loss": 0.0728, + "step": 16701 + }, + { + "epoch": 0.96, + "grad_norm": 0.2663921373594587, + "learning_rate": 8.545020863392262e-08, + "loss": 0.2698, + "step": 16702 + }, + { + "epoch": 0.96, + "grad_norm": 1.0964726846573836, + "learning_rate": 8.520762552741591e-08, + "loss": 0.5473, + "step": 16703 + }, + { + "epoch": 0.96, + "grad_norm": 0.5929084586700467, + "learning_rate": 8.496538576974412e-08, + "loss": 0.3214, + "step": 16704 + }, + { + "epoch": 0.96, + "grad_norm": 0.3420457220417289, + "learning_rate": 8.472348936929387e-08, + "loss": 0.2463, + "step": 16705 + }, + { + "epoch": 0.96, + "grad_norm": 0.33912810289817497, + "learning_rate": 8.448193633444291e-08, + "loss": 0.3064, + "step": 16706 + }, + { + "epoch": 0.96, + "grad_norm": 0.2234517371912271, + "learning_rate": 8.424072667355565e-08, + "loss": 0.1544, + "step": 16707 + }, + { + "epoch": 0.96, + "grad_norm": 0.44621753014878657, + "learning_rate": 8.399986039498653e-08, + "loss": 0.2155, + "step": 16708 + }, + { + "epoch": 0.96, + "grad_norm": 0.8496786623009368, + "learning_rate": 8.375933750707554e-08, + "loss": 0.4511, + "step": 16709 + }, + { + "epoch": 0.96, + "grad_norm": 0.2951581272073628, + "learning_rate": 8.35191580181527e-08, + "loss": 0.2546, + "step": 16710 + }, + { + "epoch": 0.96, + "grad_norm": 0.3656795630471628, + "learning_rate": 8.327932193653355e-08, + "loss": 0.2506, + "step": 16711 + }, + { + "epoch": 0.96, + "grad_norm": 0.5578558576279988, + "learning_rate": 8.30398292705259e-08, + "loss": 0.2439, + "step": 16712 + }, + { + "epoch": 0.96, + "grad_norm": 0.27556519383523753, + "learning_rate": 8.280068002842312e-08, + "loss": 0.2176, + "step": 16713 + }, + { + "epoch": 0.96, + "grad_norm": 0.2691015371715588, + "learning_rate": 8.256187421850636e-08, + "loss": 0.2203, + "step": 16714 + }, + { + "epoch": 0.96, + "grad_norm": 0.6831366435714303, + "learning_rate": 8.232341184904458e-08, + "loss": 0.3823, + "step": 16715 + }, + { + "epoch": 0.96, + "grad_norm": 0.39566843622236897, + "learning_rate": 8.20852929282967e-08, + "loss": 0.2655, + "step": 16716 + }, + { + "epoch": 0.96, + "grad_norm": 0.5305390025366589, + "learning_rate": 8.184751746450947e-08, + "loss": 0.3602, + "step": 16717 + }, + { + "epoch": 0.96, + "grad_norm": 0.3804937384566211, + "learning_rate": 8.16100854659152e-08, + "loss": 0.243, + "step": 16718 + }, + { + "epoch": 0.96, + "grad_norm": 0.24366733203716592, + "learning_rate": 8.13729969407373e-08, + "loss": 0.1461, + "step": 16719 + }, + { + "epoch": 0.96, + "grad_norm": 0.3626781267030705, + "learning_rate": 8.113625189718588e-08, + "loss": 0.2923, + "step": 16720 + }, + { + "epoch": 0.96, + "grad_norm": 0.3382050876755194, + "learning_rate": 8.089985034346104e-08, + "loss": 0.2521, + "step": 16721 + }, + { + "epoch": 0.96, + "grad_norm": 0.6097586807838828, + "learning_rate": 8.066379228774624e-08, + "loss": 0.3351, + "step": 16722 + }, + { + "epoch": 0.96, + "grad_norm": 0.34159479554399735, + "learning_rate": 8.042807773821826e-08, + "loss": 0.285, + "step": 16723 + }, + { + "epoch": 0.96, + "grad_norm": 1.4091610780001418, + "learning_rate": 8.019270670303946e-08, + "loss": 0.2663, + "step": 16724 + }, + { + "epoch": 0.96, + "grad_norm": 0.2633520677569486, + "learning_rate": 7.995767919036002e-08, + "loss": 0.1632, + "step": 16725 + }, + { + "epoch": 0.96, + "grad_norm": 0.25363763611530493, + "learning_rate": 7.972299520832005e-08, + "loss": 0.2677, + "step": 16726 + }, + { + "epoch": 0.96, + "grad_norm": 1.0371752079589232, + "learning_rate": 7.948865476504641e-08, + "loss": 0.2723, + "step": 16727 + }, + { + "epoch": 0.96, + "grad_norm": 0.5501126198181006, + "learning_rate": 7.925465786865372e-08, + "loss": 0.3554, + "step": 16728 + }, + { + "epoch": 0.96, + "grad_norm": 0.3783143100194757, + "learning_rate": 7.902100452724548e-08, + "loss": 0.3148, + "step": 16729 + }, + { + "epoch": 0.96, + "grad_norm": 0.35404309098921694, + "learning_rate": 7.878769474891413e-08, + "loss": 0.2972, + "step": 16730 + }, + { + "epoch": 0.96, + "grad_norm": 0.19637934444938035, + "learning_rate": 7.855472854173763e-08, + "loss": 0.0779, + "step": 16731 + }, + { + "epoch": 0.96, + "grad_norm": 0.3869329443831731, + "learning_rate": 7.832210591378398e-08, + "loss": 0.2814, + "step": 16732 + }, + { + "epoch": 0.96, + "grad_norm": 0.42170939268755797, + "learning_rate": 7.808982687311006e-08, + "loss": 0.3055, + "step": 16733 + }, + { + "epoch": 0.96, + "grad_norm": 0.2777667725765214, + "learning_rate": 7.785789142775834e-08, + "loss": 0.2124, + "step": 16734 + }, + { + "epoch": 0.96, + "grad_norm": 0.5345760328975254, + "learning_rate": 7.762629958576129e-08, + "loss": 0.3851, + "step": 16735 + }, + { + "epoch": 0.96, + "grad_norm": 1.316711270975337, + "learning_rate": 7.739505135513803e-08, + "loss": 0.541, + "step": 16736 + }, + { + "epoch": 0.96, + "grad_norm": 0.2516530707494128, + "learning_rate": 7.716414674389771e-08, + "loss": 0.148, + "step": 16737 + }, + { + "epoch": 0.96, + "grad_norm": 0.23169324602476324, + "learning_rate": 7.693358576003617e-08, + "loss": 0.214, + "step": 16738 + }, + { + "epoch": 0.96, + "grad_norm": 0.5344331125889435, + "learning_rate": 7.670336841153925e-08, + "loss": 0.3644, + "step": 16739 + }, + { + "epoch": 0.96, + "grad_norm": 0.4752035059563894, + "learning_rate": 7.64734947063761e-08, + "loss": 0.2514, + "step": 16740 + }, + { + "epoch": 0.96, + "grad_norm": 0.36429165610906206, + "learning_rate": 7.624396465251038e-08, + "loss": 0.2732, + "step": 16741 + }, + { + "epoch": 0.96, + "grad_norm": 0.3708596666694886, + "learning_rate": 7.601477825788905e-08, + "loss": 0.2794, + "step": 16742 + }, + { + "epoch": 0.96, + "grad_norm": 0.3845077104005944, + "learning_rate": 7.57859355304491e-08, + "loss": 0.1939, + "step": 16743 + }, + { + "epoch": 0.96, + "grad_norm": 0.3137435856663455, + "learning_rate": 7.55574364781153e-08, + "loss": 0.2363, + "step": 16744 + }, + { + "epoch": 0.96, + "grad_norm": 0.33829037017062646, + "learning_rate": 7.532928110880133e-08, + "loss": 0.2301, + "step": 16745 + }, + { + "epoch": 0.96, + "grad_norm": 0.40890395308043054, + "learning_rate": 7.510146943040641e-08, + "loss": 0.3042, + "step": 16746 + }, + { + "epoch": 0.96, + "grad_norm": 0.3051775333200117, + "learning_rate": 7.487400145082203e-08, + "loss": 0.2107, + "step": 16747 + }, + { + "epoch": 0.96, + "grad_norm": 1.3662893373185263, + "learning_rate": 7.464687717792407e-08, + "loss": 0.4789, + "step": 16748 + }, + { + "epoch": 0.96, + "grad_norm": 0.33082857403563304, + "learning_rate": 7.442009661957738e-08, + "loss": 0.2837, + "step": 16749 + }, + { + "epoch": 0.96, + "grad_norm": 0.33747596972844696, + "learning_rate": 7.41936597836368e-08, + "loss": 0.2082, + "step": 16750 + }, + { + "epoch": 0.96, + "grad_norm": 0.3298350815450901, + "learning_rate": 7.396756667794158e-08, + "loss": 0.2119, + "step": 16751 + }, + { + "epoch": 0.96, + "grad_norm": 0.7593553931238317, + "learning_rate": 7.374181731032326e-08, + "loss": 0.5444, + "step": 16752 + }, + { + "epoch": 0.96, + "grad_norm": 0.3203570525125074, + "learning_rate": 7.35164116885989e-08, + "loss": 0.2598, + "step": 16753 + }, + { + "epoch": 0.96, + "grad_norm": 0.33154457148997163, + "learning_rate": 7.329134982057562e-08, + "loss": 0.2206, + "step": 16754 + }, + { + "epoch": 0.96, + "grad_norm": 0.7366957717865998, + "learning_rate": 7.306663171404494e-08, + "loss": 0.3779, + "step": 16755 + }, + { + "epoch": 0.96, + "grad_norm": 0.30293447315905003, + "learning_rate": 7.284225737678952e-08, + "loss": 0.2432, + "step": 16756 + }, + { + "epoch": 0.96, + "grad_norm": 0.25522354037367, + "learning_rate": 7.261822681657982e-08, + "loss": 0.1827, + "step": 16757 + }, + { + "epoch": 0.96, + "grad_norm": 1.388424240051691, + "learning_rate": 7.239454004117519e-08, + "loss": 0.8719, + "step": 16758 + }, + { + "epoch": 0.96, + "grad_norm": 0.31357229248114443, + "learning_rate": 7.217119705831943e-08, + "loss": 0.2698, + "step": 16759 + }, + { + "epoch": 0.96, + "grad_norm": 0.8776215112811757, + "learning_rate": 7.19481978757497e-08, + "loss": 0.2509, + "step": 16760 + }, + { + "epoch": 0.96, + "grad_norm": 0.3567502082271883, + "learning_rate": 7.172554250118535e-08, + "loss": 0.3052, + "step": 16761 + }, + { + "epoch": 0.96, + "grad_norm": 0.3012814796438905, + "learning_rate": 7.150323094233912e-08, + "loss": 0.2416, + "step": 16762 + }, + { + "epoch": 0.96, + "grad_norm": 0.44014096773481015, + "learning_rate": 7.128126320690931e-08, + "loss": 0.0926, + "step": 16763 + }, + { + "epoch": 0.96, + "grad_norm": 0.7301742900613, + "learning_rate": 7.105963930258308e-08, + "loss": 0.4903, + "step": 16764 + }, + { + "epoch": 0.96, + "grad_norm": 0.22739964947646552, + "learning_rate": 7.083835923703319e-08, + "loss": 0.2198, + "step": 16765 + }, + { + "epoch": 0.96, + "grad_norm": 0.4968594490313015, + "learning_rate": 7.061742301792462e-08, + "loss": 0.3535, + "step": 16766 + }, + { + "epoch": 0.96, + "grad_norm": 0.49300324716515354, + "learning_rate": 7.039683065290792e-08, + "loss": 0.2496, + "step": 16767 + }, + { + "epoch": 0.96, + "grad_norm": 0.3440836293024113, + "learning_rate": 7.017658214962142e-08, + "loss": 0.2844, + "step": 16768 + }, + { + "epoch": 0.96, + "grad_norm": 0.7243014548865391, + "learning_rate": 6.995667751569346e-08, + "loss": 0.3251, + "step": 16769 + }, + { + "epoch": 0.96, + "grad_norm": 0.3702799611297618, + "learning_rate": 6.973711675873795e-08, + "loss": 0.2672, + "step": 16770 + }, + { + "epoch": 0.96, + "grad_norm": 0.37034677737138777, + "learning_rate": 6.951789988635992e-08, + "loss": 0.2754, + "step": 16771 + }, + { + "epoch": 0.96, + "grad_norm": 0.33444108241821474, + "learning_rate": 6.929902690614998e-08, + "loss": 0.2079, + "step": 16772 + }, + { + "epoch": 0.96, + "grad_norm": 0.3614151509687135, + "learning_rate": 6.90804978256876e-08, + "loss": 0.2578, + "step": 16773 + }, + { + "epoch": 0.96, + "grad_norm": 0.34113505165220437, + "learning_rate": 6.886231265254007e-08, + "loss": 0.2359, + "step": 16774 + }, + { + "epoch": 0.96, + "grad_norm": 1.2018390028919796, + "learning_rate": 6.864447139426356e-08, + "loss": 0.5018, + "step": 16775 + }, + { + "epoch": 0.96, + "grad_norm": 0.3092635740058215, + "learning_rate": 6.842697405840204e-08, + "loss": 0.1559, + "step": 16776 + }, + { + "epoch": 0.96, + "grad_norm": 0.2583079412979954, + "learning_rate": 6.820982065248837e-08, + "loss": 0.252, + "step": 16777 + }, + { + "epoch": 0.96, + "grad_norm": 0.3110707555011408, + "learning_rate": 6.79930111840399e-08, + "loss": 0.2555, + "step": 16778 + }, + { + "epoch": 0.96, + "grad_norm": 0.9741701424650024, + "learning_rate": 6.777654566056724e-08, + "loss": 0.3839, + "step": 16779 + }, + { + "epoch": 0.96, + "grad_norm": 0.3323721668501855, + "learning_rate": 6.756042408956554e-08, + "loss": 0.1876, + "step": 16780 + }, + { + "epoch": 0.96, + "grad_norm": 0.5357257838410326, + "learning_rate": 6.73446464785199e-08, + "loss": 0.3502, + "step": 16781 + }, + { + "epoch": 0.96, + "grad_norm": 0.44370665817522104, + "learning_rate": 6.712921283490103e-08, + "loss": 0.3634, + "step": 16782 + }, + { + "epoch": 0.96, + "grad_norm": 0.28311071631336937, + "learning_rate": 6.691412316617075e-08, + "loss": 0.1912, + "step": 16783 + }, + { + "epoch": 0.96, + "grad_norm": 0.5741547571259257, + "learning_rate": 6.66993774797775e-08, + "loss": 0.3838, + "step": 16784 + }, + { + "epoch": 0.96, + "grad_norm": 0.2227072118964059, + "learning_rate": 6.64849757831576e-08, + "loss": 0.205, + "step": 16785 + }, + { + "epoch": 0.96, + "grad_norm": 0.2907201308397229, + "learning_rate": 6.627091808373509e-08, + "loss": 0.1812, + "step": 16786 + }, + { + "epoch": 0.96, + "grad_norm": 1.299735719613476, + "learning_rate": 6.605720438892515e-08, + "loss": 0.7159, + "step": 16787 + }, + { + "epoch": 0.96, + "grad_norm": 0.4745022016836694, + "learning_rate": 6.584383470612631e-08, + "loss": 0.3508, + "step": 16788 + }, + { + "epoch": 0.96, + "grad_norm": 0.2982418095480476, + "learning_rate": 6.563080904272712e-08, + "loss": 0.1854, + "step": 16789 + }, + { + "epoch": 0.96, + "grad_norm": 0.3813855226753747, + "learning_rate": 6.54181274061072e-08, + "loss": 0.3044, + "step": 16790 + }, + { + "epoch": 0.96, + "grad_norm": 0.32301668216479656, + "learning_rate": 6.520578980362957e-08, + "loss": 0.1707, + "step": 16791 + }, + { + "epoch": 0.96, + "grad_norm": 0.41005198828464845, + "learning_rate": 6.499379624264834e-08, + "loss": 0.2734, + "step": 16792 + }, + { + "epoch": 0.96, + "grad_norm": 0.277449872251276, + "learning_rate": 6.478214673050542e-08, + "loss": 0.2324, + "step": 16793 + }, + { + "epoch": 0.96, + "grad_norm": 1.2249416877156012, + "learning_rate": 6.45708412745294e-08, + "loss": 0.7376, + "step": 16794 + }, + { + "epoch": 0.96, + "grad_norm": 0.3198672617571951, + "learning_rate": 6.435987988203662e-08, + "loss": 0.241, + "step": 16795 + }, + { + "epoch": 0.97, + "grad_norm": 0.6394970517054889, + "learning_rate": 6.414926256033461e-08, + "loss": 0.2757, + "step": 16796 + }, + { + "epoch": 0.97, + "grad_norm": 0.22601691033016935, + "learning_rate": 6.393898931671749e-08, + "loss": 0.2039, + "step": 16797 + }, + { + "epoch": 0.97, + "grad_norm": 0.33681906182926935, + "learning_rate": 6.372906015846502e-08, + "loss": 0.2788, + "step": 16798 + }, + { + "epoch": 0.97, + "grad_norm": 1.303947428546528, + "learning_rate": 6.351947509284695e-08, + "loss": 0.2012, + "step": 16799 + }, + { + "epoch": 0.97, + "grad_norm": 0.44996544450292303, + "learning_rate": 6.331023412712411e-08, + "loss": 0.3338, + "step": 16800 + }, + { + "epoch": 0.97, + "grad_norm": 0.2537708506596748, + "learning_rate": 6.310133726853962e-08, + "loss": 0.23, + "step": 16801 + }, + { + "epoch": 0.97, + "grad_norm": 0.8024519107979677, + "learning_rate": 6.289278452432768e-08, + "loss": 0.3105, + "step": 16802 + }, + { + "epoch": 0.97, + "grad_norm": 0.3602472486183106, + "learning_rate": 6.268457590171251e-08, + "loss": 0.1993, + "step": 16803 + }, + { + "epoch": 0.97, + "grad_norm": 0.5559272095999094, + "learning_rate": 6.247671140790279e-08, + "loss": 0.2386, + "step": 16804 + }, + { + "epoch": 0.97, + "grad_norm": 0.2690640064334697, + "learning_rate": 6.226919105009721e-08, + "loss": 0.269, + "step": 16805 + }, + { + "epoch": 0.97, + "grad_norm": 0.5439058823170019, + "learning_rate": 6.206201483548224e-08, + "loss": 0.3134, + "step": 16806 + }, + { + "epoch": 0.97, + "grad_norm": 0.4256086949463779, + "learning_rate": 6.185518277123215e-08, + "loss": 0.2782, + "step": 16807 + }, + { + "epoch": 0.97, + "grad_norm": 0.5052015564502045, + "learning_rate": 6.164869486451008e-08, + "loss": 0.3196, + "step": 16808 + }, + { + "epoch": 0.97, + "grad_norm": 0.20772748215582018, + "learning_rate": 6.144255112246589e-08, + "loss": 0.1723, + "step": 16809 + }, + { + "epoch": 0.97, + "grad_norm": 0.40896993870319204, + "learning_rate": 6.12367515522394e-08, + "loss": 0.3089, + "step": 16810 + }, + { + "epoch": 0.97, + "grad_norm": 0.5635204637734937, + "learning_rate": 6.103129616095605e-08, + "loss": 0.3505, + "step": 16811 + }, + { + "epoch": 0.97, + "grad_norm": 0.4005554484110214, + "learning_rate": 6.082618495573234e-08, + "loss": 0.2544, + "step": 16812 + }, + { + "epoch": 0.97, + "grad_norm": 0.33308458781620476, + "learning_rate": 6.062141794366927e-08, + "loss": 0.2608, + "step": 16813 + }, + { + "epoch": 0.97, + "grad_norm": 0.66920581948784, + "learning_rate": 6.041699513186005e-08, + "loss": 0.4106, + "step": 16814 + }, + { + "epoch": 0.97, + "grad_norm": 0.32580508384371565, + "learning_rate": 6.021291652738348e-08, + "loss": 0.09, + "step": 16815 + }, + { + "epoch": 0.97, + "grad_norm": 0.3156954468518769, + "learning_rate": 6.0009182137305e-08, + "loss": 0.241, + "step": 16816 + }, + { + "epoch": 0.97, + "grad_norm": 0.35874233754083895, + "learning_rate": 5.98057919686823e-08, + "loss": 0.3018, + "step": 16817 + }, + { + "epoch": 0.97, + "grad_norm": 0.5415758946024039, + "learning_rate": 5.9602746028556425e-08, + "loss": 0.3908, + "step": 16818 + }, + { + "epoch": 0.97, + "grad_norm": 0.32112130011980533, + "learning_rate": 5.9400044323960625e-08, + "loss": 0.2137, + "step": 16819 + }, + { + "epoch": 0.97, + "grad_norm": 1.3192494544105202, + "learning_rate": 5.919768686191263e-08, + "loss": 0.6982, + "step": 16820 + }, + { + "epoch": 0.97, + "grad_norm": 0.27660356577726986, + "learning_rate": 5.8995673649422383e-08, + "loss": 0.2368, + "step": 16821 + }, + { + "epoch": 0.97, + "grad_norm": 0.31310707140945643, + "learning_rate": 5.879400469348429e-08, + "loss": 0.0847, + "step": 16822 + }, + { + "epoch": 0.97, + "grad_norm": 0.38135073290776306, + "learning_rate": 5.859268000108276e-08, + "loss": 0.3013, + "step": 16823 + }, + { + "epoch": 0.97, + "grad_norm": 0.3411340438737624, + "learning_rate": 5.8391699579188885e-08, + "loss": 0.2999, + "step": 16824 + }, + { + "epoch": 0.97, + "grad_norm": 0.3468521220123775, + "learning_rate": 5.819106343476266e-08, + "loss": 0.1525, + "step": 16825 + }, + { + "epoch": 0.97, + "grad_norm": 0.38139159913608844, + "learning_rate": 5.799077157475297e-08, + "loss": 0.3019, + "step": 16826 + }, + { + "epoch": 0.97, + "grad_norm": 0.35172703587555404, + "learning_rate": 5.779082400609426e-08, + "loss": 0.1948, + "step": 16827 + }, + { + "epoch": 0.97, + "grad_norm": 0.3715864195679574, + "learning_rate": 5.7591220735712105e-08, + "loss": 0.1732, + "step": 16828 + }, + { + "epoch": 0.97, + "grad_norm": 0.2882289558296544, + "learning_rate": 5.7391961770519865e-08, + "loss": 0.3003, + "step": 16829 + }, + { + "epoch": 0.97, + "grad_norm": 0.8134034249660722, + "learning_rate": 5.7193047117415356e-08, + "loss": 0.4185, + "step": 16830 + }, + { + "epoch": 0.97, + "grad_norm": 0.5514544419384692, + "learning_rate": 5.699447678328751e-08, + "loss": 0.373, + "step": 16831 + }, + { + "epoch": 0.97, + "grad_norm": 0.29257468097948075, + "learning_rate": 5.6796250775014164e-08, + "loss": 0.2282, + "step": 16832 + }, + { + "epoch": 0.97, + "grad_norm": 0.5000178068678629, + "learning_rate": 5.6598369099458705e-08, + "loss": 0.2971, + "step": 16833 + }, + { + "epoch": 0.97, + "grad_norm": 0.22716384655094343, + "learning_rate": 5.640083176347455e-08, + "loss": 0.1603, + "step": 16834 + }, + { + "epoch": 0.97, + "grad_norm": 0.3518311840836283, + "learning_rate": 5.620363877390178e-08, + "loss": 0.2388, + "step": 16835 + }, + { + "epoch": 0.97, + "grad_norm": 0.45494879435889785, + "learning_rate": 5.600679013756938e-08, + "loss": 0.3139, + "step": 16836 + }, + { + "epoch": 0.97, + "grad_norm": 0.31631780442761426, + "learning_rate": 5.581028586129411e-08, + "loss": 0.2677, + "step": 16837 + }, + { + "epoch": 0.97, + "grad_norm": 0.5723125886588851, + "learning_rate": 5.561412595188165e-08, + "loss": 0.2285, + "step": 16838 + }, + { + "epoch": 0.97, + "grad_norm": 0.9015022205745714, + "learning_rate": 5.541831041612322e-08, + "loss": 0.4787, + "step": 16839 + }, + { + "epoch": 0.97, + "grad_norm": 0.37034387207268293, + "learning_rate": 5.5222839260802294e-08, + "loss": 0.2729, + "step": 16840 + }, + { + "epoch": 0.97, + "grad_norm": 0.30520368304807893, + "learning_rate": 5.502771249268568e-08, + "loss": 0.2811, + "step": 16841 + }, + { + "epoch": 0.97, + "grad_norm": 0.2898764190409881, + "learning_rate": 5.483293011853241e-08, + "loss": 0.145, + "step": 16842 + }, + { + "epoch": 0.97, + "grad_norm": 0.756039605785083, + "learning_rate": 5.4638492145087096e-08, + "loss": 0.3867, + "step": 16843 + }, + { + "epoch": 0.97, + "grad_norm": 0.3807809778880975, + "learning_rate": 5.4444398579083235e-08, + "loss": 0.2543, + "step": 16844 + }, + { + "epoch": 0.97, + "grad_norm": 0.37486671787998804, + "learning_rate": 5.425064942724212e-08, + "loss": 0.2385, + "step": 16845 + }, + { + "epoch": 0.97, + "grad_norm": 0.596115630845532, + "learning_rate": 5.4057244696272826e-08, + "loss": 0.3478, + "step": 16846 + }, + { + "epoch": 0.97, + "grad_norm": 0.36027385165231446, + "learning_rate": 5.386418439287444e-08, + "loss": 0.2733, + "step": 16847 + }, + { + "epoch": 0.97, + "grad_norm": 0.26856822778458994, + "learning_rate": 5.3671468523731617e-08, + "loss": 0.1785, + "step": 16848 + }, + { + "epoch": 0.97, + "grad_norm": 0.37989515893189935, + "learning_rate": 5.3479097095516795e-08, + "loss": 0.3023, + "step": 16849 + }, + { + "epoch": 0.97, + "grad_norm": 0.2918242952616663, + "learning_rate": 5.328707011489465e-08, + "loss": 0.2222, + "step": 16850 + }, + { + "epoch": 0.97, + "grad_norm": 0.7682960082736956, + "learning_rate": 5.3095387588512074e-08, + "loss": 0.2587, + "step": 16851 + }, + { + "epoch": 0.97, + "grad_norm": 0.36395943813381715, + "learning_rate": 5.2904049523009315e-08, + "loss": 0.2663, + "step": 16852 + }, + { + "epoch": 0.97, + "grad_norm": 0.38743575817296055, + "learning_rate": 5.271305592501108e-08, + "loss": 0.3176, + "step": 16853 + }, + { + "epoch": 0.97, + "grad_norm": 0.417320125421086, + "learning_rate": 5.252240680113319e-08, + "loss": 0.2276, + "step": 16854 + }, + { + "epoch": 0.97, + "grad_norm": 0.22973767123986893, + "learning_rate": 5.233210215797591e-08, + "loss": 0.104, + "step": 16855 + }, + { + "epoch": 0.97, + "grad_norm": 0.3841946691455176, + "learning_rate": 5.2142142002129524e-08, + "loss": 0.3001, + "step": 16856 + }, + { + "epoch": 0.97, + "grad_norm": 0.34093585258614106, + "learning_rate": 5.195252634017434e-08, + "loss": 0.2781, + "step": 16857 + }, + { + "epoch": 0.97, + "grad_norm": 0.5523198990332471, + "learning_rate": 5.1763255178673974e-08, + "loss": 0.2075, + "step": 16858 + }, + { + "epoch": 0.97, + "grad_norm": 0.4296501087072229, + "learning_rate": 5.1574328524184316e-08, + "loss": 0.2917, + "step": 16859 + }, + { + "epoch": 0.97, + "grad_norm": 0.47266839506660946, + "learning_rate": 5.1385746383249e-08, + "loss": 0.32, + "step": 16860 + }, + { + "epoch": 0.97, + "grad_norm": 0.21272369188801774, + "learning_rate": 5.1197508762397265e-08, + "loss": 0.1139, + "step": 16861 + }, + { + "epoch": 0.97, + "grad_norm": 0.2604253939433585, + "learning_rate": 5.1009615668147217e-08, + "loss": 0.2097, + "step": 16862 + }, + { + "epoch": 0.97, + "grad_norm": 0.610360799421692, + "learning_rate": 5.082206710700699e-08, + "loss": 0.3483, + "step": 16863 + }, + { + "epoch": 0.97, + "grad_norm": 0.295175344003875, + "learning_rate": 5.063486308547028e-08, + "loss": 0.2115, + "step": 16864 + }, + { + "epoch": 0.97, + "grad_norm": 0.33680233097645323, + "learning_rate": 5.044800361002078e-08, + "loss": 0.2811, + "step": 16865 + }, + { + "epoch": 0.97, + "grad_norm": 1.372302629109178, + "learning_rate": 5.0261488687128876e-08, + "loss": 0.62, + "step": 16866 + }, + { + "epoch": 0.97, + "grad_norm": 0.24720787872927993, + "learning_rate": 5.007531832325385e-08, + "loss": 0.1813, + "step": 16867 + }, + { + "epoch": 0.97, + "grad_norm": 0.2469605731267985, + "learning_rate": 4.9889492524842766e-08, + "loss": 0.2309, + "step": 16868 + }, + { + "epoch": 0.97, + "grad_norm": 0.5687789725130987, + "learning_rate": 4.970401129833047e-08, + "loss": 0.3839, + "step": 16869 + }, + { + "epoch": 0.97, + "grad_norm": 0.7360556561940557, + "learning_rate": 4.9518874650139604e-08, + "loss": 0.3072, + "step": 16870 + }, + { + "epoch": 0.97, + "grad_norm": 0.36413256453148496, + "learning_rate": 4.933408258668393e-08, + "loss": 0.2074, + "step": 16871 + }, + { + "epoch": 0.97, + "grad_norm": 0.33474022926915215, + "learning_rate": 4.914963511436055e-08, + "loss": 0.2959, + "step": 16872 + }, + { + "epoch": 0.97, + "grad_norm": 0.3531737978180413, + "learning_rate": 4.896553223955658e-08, + "loss": 0.2193, + "step": 16873 + }, + { + "epoch": 0.97, + "grad_norm": 0.3508589055917248, + "learning_rate": 4.878177396864914e-08, + "loss": 0.2221, + "step": 16874 + }, + { + "epoch": 0.97, + "grad_norm": 0.42703325762760735, + "learning_rate": 4.859836030800091e-08, + "loss": 0.261, + "step": 16875 + }, + { + "epoch": 0.97, + "grad_norm": 0.29443074994484736, + "learning_rate": 4.8415291263962383e-08, + "loss": 0.2574, + "step": 16876 + }, + { + "epoch": 0.97, + "grad_norm": 0.37489421083434077, + "learning_rate": 4.823256684287625e-08, + "loss": 0.2146, + "step": 16877 + }, + { + "epoch": 0.97, + "grad_norm": 1.1846274794014118, + "learning_rate": 4.805018705106745e-08, + "loss": 0.6589, + "step": 16878 + }, + { + "epoch": 0.97, + "grad_norm": 0.9644830863353985, + "learning_rate": 4.7868151894852054e-08, + "loss": 0.3864, + "step": 16879 + }, + { + "epoch": 0.97, + "grad_norm": 0.24407539568102385, + "learning_rate": 4.768646138053501e-08, + "loss": 0.2382, + "step": 16880 + }, + { + "epoch": 0.97, + "grad_norm": 0.2693149381967699, + "learning_rate": 4.750511551440906e-08, + "loss": 0.189, + "step": 16881 + }, + { + "epoch": 0.97, + "grad_norm": 1.318880587886877, + "learning_rate": 4.732411430275141e-08, + "loss": 0.455, + "step": 16882 + }, + { + "epoch": 0.97, + "grad_norm": 0.32987152777098505, + "learning_rate": 4.71434577518326e-08, + "loss": 0.2373, + "step": 16883 + }, + { + "epoch": 0.97, + "grad_norm": 0.33653567396866907, + "learning_rate": 4.696314586790762e-08, + "loss": 0.2356, + "step": 16884 + }, + { + "epoch": 0.97, + "grad_norm": 0.6980998648832283, + "learning_rate": 4.6783178657221486e-08, + "loss": 0.4677, + "step": 16885 + }, + { + "epoch": 0.97, + "grad_norm": 0.32680210012665406, + "learning_rate": 4.6603556126004756e-08, + "loss": 0.2597, + "step": 16886 + }, + { + "epoch": 0.97, + "grad_norm": 0.23594603800560673, + "learning_rate": 4.642427828047913e-08, + "loss": 0.1121, + "step": 16887 + }, + { + "epoch": 0.97, + "grad_norm": 0.32282382268927773, + "learning_rate": 4.624534512685297e-08, + "loss": 0.2613, + "step": 16888 + }, + { + "epoch": 0.97, + "grad_norm": 0.3497080476951133, + "learning_rate": 4.6066756671322434e-08, + "loss": 0.2655, + "step": 16889 + }, + { + "epoch": 0.97, + "grad_norm": 0.9983668851762743, + "learning_rate": 4.588851292007257e-08, + "loss": 0.5257, + "step": 16890 + }, + { + "epoch": 0.97, + "grad_norm": 0.40613672708852766, + "learning_rate": 4.5710613879275115e-08, + "loss": 0.2886, + "step": 16891 + }, + { + "epoch": 0.97, + "grad_norm": 0.3492455486029241, + "learning_rate": 4.5533059555090684e-08, + "loss": 0.2554, + "step": 16892 + }, + { + "epoch": 0.97, + "grad_norm": 0.4983795945378701, + "learning_rate": 4.535584995366882e-08, + "loss": 0.3268, + "step": 16893 + }, + { + "epoch": 0.97, + "grad_norm": 0.2967937991637615, + "learning_rate": 4.517898508114571e-08, + "loss": 0.0822, + "step": 16894 + }, + { + "epoch": 0.97, + "grad_norm": 0.3401773758082715, + "learning_rate": 4.500246494364535e-08, + "loss": 0.2568, + "step": 16895 + }, + { + "epoch": 0.97, + "grad_norm": 0.33076498271613985, + "learning_rate": 4.482628954728285e-08, + "loss": 0.3311, + "step": 16896 + }, + { + "epoch": 0.97, + "grad_norm": 0.6425786521864342, + "learning_rate": 4.465045889815778e-08, + "loss": 0.3003, + "step": 16897 + }, + { + "epoch": 0.97, + "grad_norm": 0.2972202986795993, + "learning_rate": 4.447497300235859e-08, + "loss": 0.2615, + "step": 16898 + }, + { + "epoch": 0.97, + "grad_norm": 0.3100186032119291, + "learning_rate": 4.4299831865962653e-08, + "loss": 0.1789, + "step": 16899 + }, + { + "epoch": 0.97, + "grad_norm": 0.3280509719318347, + "learning_rate": 4.412503549503622e-08, + "loss": 0.2357, + "step": 16900 + }, + { + "epoch": 0.97, + "grad_norm": 0.33145619269969046, + "learning_rate": 4.3950583895631116e-08, + "loss": 0.2536, + "step": 16901 + }, + { + "epoch": 0.97, + "grad_norm": 0.7463227424917745, + "learning_rate": 4.377647707379029e-08, + "loss": 0.5095, + "step": 16902 + }, + { + "epoch": 0.97, + "grad_norm": 0.31520197157597846, + "learning_rate": 4.360271503554114e-08, + "loss": 0.2509, + "step": 16903 + }, + { + "epoch": 0.97, + "grad_norm": 0.33216346525271806, + "learning_rate": 4.342929778690108e-08, + "loss": 0.2673, + "step": 16904 + }, + { + "epoch": 0.97, + "grad_norm": 1.4729941097580517, + "learning_rate": 4.325622533387752e-08, + "loss": 0.5392, + "step": 16905 + }, + { + "epoch": 0.97, + "grad_norm": 0.3713416552428508, + "learning_rate": 4.308349768246234e-08, + "loss": 0.1466, + "step": 16906 + }, + { + "epoch": 0.97, + "grad_norm": 0.28593857290381, + "learning_rate": 4.291111483863741e-08, + "loss": 0.1936, + "step": 16907 + }, + { + "epoch": 0.97, + "grad_norm": 0.34487575236955365, + "learning_rate": 4.273907680837241e-08, + "loss": 0.2875, + "step": 16908 + }, + { + "epoch": 0.97, + "grad_norm": 0.5930762811522095, + "learning_rate": 4.2567383597624804e-08, + "loss": 0.394, + "step": 16909 + }, + { + "epoch": 0.97, + "grad_norm": 0.2870976878198615, + "learning_rate": 4.239603521234092e-08, + "loss": 0.188, + "step": 16910 + }, + { + "epoch": 0.97, + "grad_norm": 1.5038426691479727, + "learning_rate": 4.2225031658453816e-08, + "loss": 0.6532, + "step": 16911 + }, + { + "epoch": 0.97, + "grad_norm": 0.22444525476756616, + "learning_rate": 4.205437294188541e-08, + "loss": 0.2015, + "step": 16912 + }, + { + "epoch": 0.97, + "grad_norm": 0.2935048318220947, + "learning_rate": 4.1884059068546534e-08, + "loss": 0.1828, + "step": 16913 + }, + { + "epoch": 0.97, + "grad_norm": 0.5488045078725402, + "learning_rate": 4.171409004433358e-08, + "loss": 0.3798, + "step": 16914 + }, + { + "epoch": 0.97, + "grad_norm": 0.887381775060886, + "learning_rate": 4.154446587513406e-08, + "loss": 0.5585, + "step": 16915 + }, + { + "epoch": 0.97, + "grad_norm": 0.2382508451881509, + "learning_rate": 4.137518656682216e-08, + "loss": 0.2414, + "step": 16916 + }, + { + "epoch": 0.97, + "grad_norm": 0.6288484829732698, + "learning_rate": 4.120625212525875e-08, + "loss": 0.2158, + "step": 16917 + }, + { + "epoch": 0.97, + "grad_norm": 0.2514810105291575, + "learning_rate": 4.10376625562936e-08, + "loss": 0.1322, + "step": 16918 + }, + { + "epoch": 0.97, + "grad_norm": 0.4334570249701634, + "learning_rate": 4.086941786576759e-08, + "loss": 0.2788, + "step": 16919 + }, + { + "epoch": 0.97, + "grad_norm": 0.2745662783710371, + "learning_rate": 4.070151805950384e-08, + "loss": 0.2448, + "step": 16920 + }, + { + "epoch": 0.97, + "grad_norm": 0.8311620037180578, + "learning_rate": 4.053396314331881e-08, + "loss": 0.5275, + "step": 16921 + }, + { + "epoch": 0.97, + "grad_norm": 0.3124622685734896, + "learning_rate": 4.0366753123014526e-08, + "loss": 0.2508, + "step": 16922 + }, + { + "epoch": 0.97, + "grad_norm": 1.9206862580118762, + "learning_rate": 4.0199888004381907e-08, + "loss": 0.2277, + "step": 16923 + }, + { + "epoch": 0.97, + "grad_norm": 0.22491161238978494, + "learning_rate": 4.003336779319855e-08, + "loss": 0.2061, + "step": 16924 + }, + { + "epoch": 0.97, + "grad_norm": 0.3165803242891882, + "learning_rate": 3.9867192495230965e-08, + "loss": 0.2662, + "step": 16925 + }, + { + "epoch": 0.97, + "grad_norm": 0.6313708675545323, + "learning_rate": 3.970136211623343e-08, + "loss": 0.2733, + "step": 16926 + }, + { + "epoch": 0.97, + "grad_norm": 0.3125624883391193, + "learning_rate": 3.9535876661951356e-08, + "loss": 0.3, + "step": 16927 + }, + { + "epoch": 0.97, + "grad_norm": 0.5088215981290167, + "learning_rate": 3.937073613811237e-08, + "loss": 0.2573, + "step": 16928 + }, + { + "epoch": 0.97, + "grad_norm": 0.422677352010954, + "learning_rate": 3.920594055043636e-08, + "loss": 0.2661, + "step": 16929 + }, + { + "epoch": 0.97, + "grad_norm": 0.32130140929879897, + "learning_rate": 3.9041489904629857e-08, + "loss": 0.1425, + "step": 16930 + }, + { + "epoch": 0.97, + "grad_norm": 0.4239268555816805, + "learning_rate": 3.8877384206389426e-08, + "loss": 0.3111, + "step": 16931 + }, + { + "epoch": 0.97, + "grad_norm": 0.29273897539680954, + "learning_rate": 3.8713623461396066e-08, + "loss": 0.2897, + "step": 16932 + }, + { + "epoch": 0.97, + "grad_norm": 1.7208118477900014, + "learning_rate": 3.855020767532191e-08, + "loss": 0.3823, + "step": 16933 + }, + { + "epoch": 0.97, + "grad_norm": 0.39365836561310663, + "learning_rate": 3.8387136853825776e-08, + "loss": 0.2538, + "step": 16934 + }, + { + "epoch": 0.97, + "grad_norm": 0.3864142535331856, + "learning_rate": 3.822441100255425e-08, + "loss": 0.2928, + "step": 16935 + }, + { + "epoch": 0.97, + "grad_norm": 0.4594334507085826, + "learning_rate": 3.806203012714394e-08, + "loss": 0.2365, + "step": 16936 + }, + { + "epoch": 0.97, + "grad_norm": 0.36622375232538923, + "learning_rate": 3.7899994233216996e-08, + "loss": 0.2491, + "step": 16937 + }, + { + "epoch": 0.97, + "grad_norm": 0.2574689612287729, + "learning_rate": 3.77383033263834e-08, + "loss": 0.2046, + "step": 16938 + }, + { + "epoch": 0.97, + "grad_norm": 0.32583886214228214, + "learning_rate": 3.757695741224532e-08, + "loss": 0.2641, + "step": 16939 + }, + { + "epoch": 0.97, + "grad_norm": 0.30582437989842254, + "learning_rate": 3.7415956496388295e-08, + "loss": 0.2625, + "step": 16940 + }, + { + "epoch": 0.97, + "grad_norm": 1.272482518357978, + "learning_rate": 3.7255300584388976e-08, + "loss": 0.6096, + "step": 16941 + }, + { + "epoch": 0.97, + "grad_norm": 0.6476407086373608, + "learning_rate": 3.709498968180958e-08, + "loss": 0.371, + "step": 16942 + }, + { + "epoch": 0.97, + "grad_norm": 0.24076540832073126, + "learning_rate": 3.693502379420233e-08, + "loss": 0.1992, + "step": 16943 + }, + { + "epoch": 0.97, + "grad_norm": 0.47951685652698056, + "learning_rate": 3.677540292710724e-08, + "loss": 0.3247, + "step": 16944 + }, + { + "epoch": 0.97, + "grad_norm": 0.434123742526139, + "learning_rate": 3.6616127086051e-08, + "loss": 0.3129, + "step": 16945 + }, + { + "epoch": 0.97, + "grad_norm": 0.1810864281589373, + "learning_rate": 3.64571962765492e-08, + "loss": 0.0715, + "step": 16946 + }, + { + "epoch": 0.97, + "grad_norm": 0.32436565764474184, + "learning_rate": 3.629861050410743e-08, + "loss": 0.2812, + "step": 16947 + }, + { + "epoch": 0.97, + "grad_norm": 0.4181731715703347, + "learning_rate": 3.6140369774215755e-08, + "loss": 0.3126, + "step": 16948 + }, + { + "epoch": 0.97, + "grad_norm": 0.5866495465077649, + "learning_rate": 3.5982474092355334e-08, + "loss": 0.1783, + "step": 16949 + }, + { + "epoch": 0.97, + "grad_norm": 0.28246328115634156, + "learning_rate": 3.58249234639918e-08, + "loss": 0.2231, + "step": 16950 + }, + { + "epoch": 0.97, + "grad_norm": 0.3389229225390919, + "learning_rate": 3.566771789458412e-08, + "loss": 0.3261, + "step": 16951 + }, + { + "epoch": 0.97, + "grad_norm": 0.21439535606761287, + "learning_rate": 3.55108573895746e-08, + "loss": 0.1327, + "step": 16952 + }, + { + "epoch": 0.97, + "grad_norm": 0.5498107837423801, + "learning_rate": 3.535434195439558e-08, + "loss": 0.3353, + "step": 16953 + }, + { + "epoch": 0.97, + "grad_norm": 0.6198576731338329, + "learning_rate": 3.5198171594467145e-08, + "loss": 0.3519, + "step": 16954 + }, + { + "epoch": 0.97, + "grad_norm": 0.38369011378607276, + "learning_rate": 3.504234631519721e-08, + "loss": 0.2421, + "step": 16955 + }, + { + "epoch": 0.97, + "grad_norm": 0.2903915426775922, + "learning_rate": 3.4886866121982555e-08, + "loss": 0.2236, + "step": 16956 + }, + { + "epoch": 0.97, + "grad_norm": 1.078379182005405, + "learning_rate": 3.473173102020666e-08, + "loss": 0.7698, + "step": 16957 + }, + { + "epoch": 0.97, + "grad_norm": 0.2756133154465786, + "learning_rate": 3.4576941015243003e-08, + "loss": 0.1562, + "step": 16958 + }, + { + "epoch": 0.97, + "grad_norm": 0.27127846778563536, + "learning_rate": 3.4422496112451745e-08, + "loss": 0.2316, + "step": 16959 + }, + { + "epoch": 0.97, + "grad_norm": 0.40386928409495443, + "learning_rate": 3.426839631718082e-08, + "loss": 0.3061, + "step": 16960 + }, + { + "epoch": 0.97, + "grad_norm": 0.4168805787576852, + "learning_rate": 3.411464163476597e-08, + "loss": 0.2353, + "step": 16961 + }, + { + "epoch": 0.97, + "grad_norm": 0.47659370916054206, + "learning_rate": 3.3961232070532927e-08, + "loss": 0.2327, + "step": 16962 + }, + { + "epoch": 0.97, + "grad_norm": 0.34162007685684154, + "learning_rate": 3.380816762979411e-08, + "loss": 0.3236, + "step": 16963 + }, + { + "epoch": 0.97, + "grad_norm": 0.2637327032439912, + "learning_rate": 3.3655448317849725e-08, + "loss": 0.1839, + "step": 16964 + }, + { + "epoch": 0.97, + "grad_norm": 0.4139189954002851, + "learning_rate": 3.350307413998888e-08, + "loss": 0.2518, + "step": 16965 + }, + { + "epoch": 0.97, + "grad_norm": 0.6670530591028055, + "learning_rate": 3.335104510148734e-08, + "loss": 0.3724, + "step": 16966 + }, + { + "epoch": 0.97, + "grad_norm": 0.31514853852025554, + "learning_rate": 3.3199361207610916e-08, + "loss": 0.2825, + "step": 16967 + }, + { + "epoch": 0.97, + "grad_norm": 0.39693340246090936, + "learning_rate": 3.304802246361205e-08, + "loss": 0.2747, + "step": 16968 + }, + { + "epoch": 0.97, + "grad_norm": 0.937981789113555, + "learning_rate": 3.2897028874731006e-08, + "loss": 0.5085, + "step": 16969 + }, + { + "epoch": 0.98, + "grad_norm": 0.35797398099669914, + "learning_rate": 3.274638044619805e-08, + "loss": 0.2483, + "step": 16970 + }, + { + "epoch": 0.98, + "grad_norm": 0.24761901524686186, + "learning_rate": 3.2596077183228993e-08, + "loss": 0.2691, + "step": 16971 + }, + { + "epoch": 0.98, + "grad_norm": 0.4298641626817817, + "learning_rate": 3.244611909102857e-08, + "loss": 0.1868, + "step": 16972 + }, + { + "epoch": 0.98, + "grad_norm": 0.6401200224266734, + "learning_rate": 3.229650617479152e-08, + "loss": 0.3064, + "step": 16973 + }, + { + "epoch": 0.98, + "grad_norm": 0.3937630678490877, + "learning_rate": 3.2147238439697026e-08, + "loss": 0.3115, + "step": 16974 + }, + { + "epoch": 0.98, + "grad_norm": 0.31684257741207245, + "learning_rate": 3.19983158909154e-08, + "loss": 0.248, + "step": 16975 + }, + { + "epoch": 0.98, + "grad_norm": 0.47739044032650685, + "learning_rate": 3.1849738533603625e-08, + "loss": 0.273, + "step": 16976 + }, + { + "epoch": 0.98, + "grad_norm": 0.4075103274369792, + "learning_rate": 3.1701506372906476e-08, + "loss": 0.2934, + "step": 16977 + }, + { + "epoch": 0.98, + "grad_norm": 0.3212448915171606, + "learning_rate": 3.155361941395763e-08, + "loss": 0.1268, + "step": 16978 + }, + { + "epoch": 0.98, + "grad_norm": 0.32874130733070184, + "learning_rate": 3.140607766187853e-08, + "loss": 0.251, + "step": 16979 + }, + { + "epoch": 0.98, + "grad_norm": 0.39753759795242666, + "learning_rate": 3.125888112177733e-08, + "loss": 0.3094, + "step": 16980 + }, + { + "epoch": 0.98, + "grad_norm": 0.8441004038029325, + "learning_rate": 3.1112029798753274e-08, + "loss": 0.5548, + "step": 16981 + }, + { + "epoch": 0.98, + "grad_norm": 0.5623939560608282, + "learning_rate": 3.096552369789119e-08, + "loss": 0.2477, + "step": 16982 + }, + { + "epoch": 0.98, + "grad_norm": 0.24903041506543536, + "learning_rate": 3.081936282426368e-08, + "loss": 0.2619, + "step": 16983 + }, + { + "epoch": 0.98, + "grad_norm": 0.29800042452218295, + "learning_rate": 3.067354718293336e-08, + "loss": 0.1459, + "step": 16984 + }, + { + "epoch": 0.98, + "grad_norm": 0.8593406624322196, + "learning_rate": 3.0528076778949536e-08, + "loss": 0.1092, + "step": 16985 + }, + { + "epoch": 0.98, + "grad_norm": 0.34340164329049183, + "learning_rate": 3.038295161734928e-08, + "loss": 0.2913, + "step": 16986 + }, + { + "epoch": 0.98, + "grad_norm": 0.32860953048223024, + "learning_rate": 3.023817170315857e-08, + "loss": 0.2968, + "step": 16987 + }, + { + "epoch": 0.98, + "grad_norm": 0.6455934770032342, + "learning_rate": 3.0093737041392293e-08, + "loss": 0.2445, + "step": 16988 + }, + { + "epoch": 0.98, + "grad_norm": 0.4043626940495485, + "learning_rate": 2.994964763704977e-08, + "loss": 0.2695, + "step": 16989 + }, + { + "epoch": 0.98, + "grad_norm": 0.22943688435845902, + "learning_rate": 2.980590349512258e-08, + "loss": 0.16, + "step": 16990 + }, + { + "epoch": 0.98, + "grad_norm": 0.3037975887387826, + "learning_rate": 2.966250462058895e-08, + "loss": 0.2288, + "step": 16991 + }, + { + "epoch": 0.98, + "grad_norm": 0.3512971318276826, + "learning_rate": 2.9519451018413804e-08, + "loss": 0.2918, + "step": 16992 + }, + { + "epoch": 0.98, + "grad_norm": 0.9975814522598434, + "learning_rate": 2.9376742693550954e-08, + "loss": 0.4212, + "step": 16993 + }, + { + "epoch": 0.98, + "grad_norm": 0.7888704691744978, + "learning_rate": 2.9234379650943113e-08, + "loss": 0.3391, + "step": 16994 + }, + { + "epoch": 0.98, + "grad_norm": 0.2782399609130223, + "learning_rate": 2.9092361895519673e-08, + "loss": 0.2218, + "step": 16995 + }, + { + "epoch": 0.98, + "grad_norm": 0.22388047250656798, + "learning_rate": 2.895068943219892e-08, + "loss": 0.1858, + "step": 16996 + }, + { + "epoch": 0.98, + "grad_norm": 0.8291268218917041, + "learning_rate": 2.880936226588693e-08, + "loss": 0.3896, + "step": 16997 + }, + { + "epoch": 0.98, + "grad_norm": 0.3064483112237091, + "learning_rate": 2.866838040147868e-08, + "loss": 0.21, + "step": 16998 + }, + { + "epoch": 0.98, + "grad_norm": 0.3857237505293713, + "learning_rate": 2.8527743843854704e-08, + "loss": 0.309, + "step": 16999 + }, + { + "epoch": 0.98, + "grad_norm": 0.8772816244367653, + "learning_rate": 2.8387452597886666e-08, + "loss": 0.4426, + "step": 17000 + }, + { + "epoch": 0.98, + "grad_norm": 0.3463263519606244, + "learning_rate": 2.824750666843179e-08, + "loss": 0.1876, + "step": 17001 + }, + { + "epoch": 0.98, + "grad_norm": 0.26138233646250353, + "learning_rate": 2.810790606033731e-08, + "loss": 0.203, + "step": 17002 + }, + { + "epoch": 0.98, + "grad_norm": 0.5924213734480587, + "learning_rate": 2.7968650778438245e-08, + "loss": 0.2967, + "step": 17003 + }, + { + "epoch": 0.98, + "grad_norm": 0.3005423451194237, + "learning_rate": 2.7829740827555185e-08, + "loss": 0.2472, + "step": 17004 + }, + { + "epoch": 0.98, + "grad_norm": 0.6290851599106121, + "learning_rate": 2.769117621249873e-08, + "loss": 0.3087, + "step": 17005 + }, + { + "epoch": 0.98, + "grad_norm": 0.5028037464387212, + "learning_rate": 2.7552956938068364e-08, + "loss": 0.366, + "step": 17006 + }, + { + "epoch": 0.98, + "grad_norm": 0.26322989882195386, + "learning_rate": 2.741508300905138e-08, + "loss": 0.2482, + "step": 17007 + }, + { + "epoch": 0.98, + "grad_norm": 1.7885337997270865, + "learning_rate": 2.727755443021951e-08, + "loss": 0.1453, + "step": 17008 + }, + { + "epoch": 0.98, + "grad_norm": 0.3236513169203668, + "learning_rate": 2.7140371206337834e-08, + "loss": 0.2128, + "step": 17009 + }, + { + "epoch": 0.98, + "grad_norm": 0.410045538047261, + "learning_rate": 2.7003533342156995e-08, + "loss": 0.2497, + "step": 17010 + }, + { + "epoch": 0.98, + "grad_norm": 0.3294855926484948, + "learning_rate": 2.6867040842414316e-08, + "loss": 0.2396, + "step": 17011 + }, + { + "epoch": 0.98, + "grad_norm": 0.7913529950119497, + "learning_rate": 2.6730893711837124e-08, + "loss": 0.4242, + "step": 17012 + }, + { + "epoch": 0.98, + "grad_norm": 0.34653414142176686, + "learning_rate": 2.6595091955139428e-08, + "loss": 0.2621, + "step": 17013 + }, + { + "epoch": 0.98, + "grad_norm": 0.27650331142096424, + "learning_rate": 2.6459635577026355e-08, + "loss": 0.1645, + "step": 17014 + }, + { + "epoch": 0.98, + "grad_norm": 0.34663882196846774, + "learning_rate": 2.6324524582186374e-08, + "loss": 0.2303, + "step": 17015 + }, + { + "epoch": 0.98, + "grad_norm": 0.34330890321049773, + "learning_rate": 2.6189758975299074e-08, + "loss": 0.2632, + "step": 17016 + }, + { + "epoch": 0.98, + "grad_norm": 0.6417232033799707, + "learning_rate": 2.6055338761031835e-08, + "loss": 0.4148, + "step": 17017 + }, + { + "epoch": 0.98, + "grad_norm": 0.3119913260890965, + "learning_rate": 2.592126394403982e-08, + "loss": 0.2295, + "step": 17018 + }, + { + "epoch": 0.98, + "grad_norm": 0.37198388859885884, + "learning_rate": 2.5787534528964875e-08, + "loss": 0.2445, + "step": 17019 + }, + { + "epoch": 0.98, + "grad_norm": 1.3027242854603744, + "learning_rate": 2.5654150520438848e-08, + "loss": 0.3769, + "step": 17020 + }, + { + "epoch": 0.98, + "grad_norm": 0.3261165338744473, + "learning_rate": 2.5521111923080266e-08, + "loss": 0.1333, + "step": 17021 + }, + { + "epoch": 0.98, + "grad_norm": 0.3275988014909243, + "learning_rate": 2.5388418741497668e-08, + "loss": 0.2532, + "step": 17022 + }, + { + "epoch": 0.98, + "grad_norm": 0.3388604127745746, + "learning_rate": 2.5256070980284042e-08, + "loss": 0.2924, + "step": 17023 + }, + { + "epoch": 0.98, + "grad_norm": 1.4462487671630624, + "learning_rate": 2.5124068644024613e-08, + "loss": 0.2823, + "step": 17024 + }, + { + "epoch": 0.98, + "grad_norm": 0.3366263902770327, + "learning_rate": 2.4992411737289057e-08, + "loss": 0.2503, + "step": 17025 + }, + { + "epoch": 0.98, + "grad_norm": 1.4836462318839685, + "learning_rate": 2.4861100264638172e-08, + "loss": 0.6723, + "step": 17026 + }, + { + "epoch": 0.98, + "grad_norm": 0.27144366896564376, + "learning_rate": 2.473013423061832e-08, + "loss": 0.221, + "step": 17027 + }, + { + "epoch": 0.98, + "grad_norm": 0.31117856512925224, + "learning_rate": 2.459951363976476e-08, + "loss": 0.2513, + "step": 17028 + }, + { + "epoch": 0.98, + "grad_norm": 0.38810367971759135, + "learning_rate": 2.4469238496600546e-08, + "loss": 0.2505, + "step": 17029 + }, + { + "epoch": 0.98, + "grad_norm": 0.33623722779449305, + "learning_rate": 2.433930880563762e-08, + "loss": 0.2981, + "step": 17030 + }, + { + "epoch": 0.98, + "grad_norm": 0.2907484340689656, + "learning_rate": 2.4209724571376826e-08, + "loss": 0.1823, + "step": 17031 + }, + { + "epoch": 0.98, + "grad_norm": 1.158270296591638, + "learning_rate": 2.4080485798302355e-08, + "loss": 0.4486, + "step": 17032 + }, + { + "epoch": 0.98, + "grad_norm": 0.5911414484895722, + "learning_rate": 2.395159249089285e-08, + "loss": 0.3196, + "step": 17033 + }, + { + "epoch": 0.98, + "grad_norm": 0.2877702121234691, + "learning_rate": 2.3823044653610295e-08, + "loss": 0.1904, + "step": 17034 + }, + { + "epoch": 0.98, + "grad_norm": 0.2822114281047928, + "learning_rate": 2.3694842290907792e-08, + "loss": 0.2648, + "step": 17035 + }, + { + "epoch": 0.98, + "grad_norm": 0.4802254996038075, + "learning_rate": 2.356698540722291e-08, + "loss": 0.2575, + "step": 17036 + }, + { + "epoch": 0.98, + "grad_norm": 0.2828728976084193, + "learning_rate": 2.343947400698432e-08, + "loss": 0.1845, + "step": 17037 + }, + { + "epoch": 0.98, + "grad_norm": 0.3516289049644702, + "learning_rate": 2.3312308094607382e-08, + "loss": 0.2959, + "step": 17038 + }, + { + "epoch": 0.98, + "grad_norm": 0.7073007426790366, + "learning_rate": 2.3185487674497463e-08, + "loss": 0.3777, + "step": 17039 + }, + { + "epoch": 0.98, + "grad_norm": 0.3036204352872146, + "learning_rate": 2.3059012751044386e-08, + "loss": 0.2045, + "step": 17040 + }, + { + "epoch": 0.98, + "grad_norm": 1.0686989334600154, + "learning_rate": 2.2932883328629087e-08, + "loss": 0.7031, + "step": 17041 + }, + { + "epoch": 0.98, + "grad_norm": 0.26671675957112734, + "learning_rate": 2.280709941161807e-08, + "loss": 0.2414, + "step": 17042 + }, + { + "epoch": 0.98, + "grad_norm": 0.3138331144274908, + "learning_rate": 2.268166100436897e-08, + "loss": 0.2286, + "step": 17043 + }, + { + "epoch": 0.98, + "grad_norm": 0.5218556651137621, + "learning_rate": 2.2556568111223866e-08, + "loss": 0.2401, + "step": 17044 + }, + { + "epoch": 0.98, + "grad_norm": 0.7268865897870525, + "learning_rate": 2.2431820736517062e-08, + "loss": 0.3442, + "step": 17045 + }, + { + "epoch": 0.98, + "grad_norm": 0.34937516121279033, + "learning_rate": 2.2307418884566225e-08, + "loss": 0.2662, + "step": 17046 + }, + { + "epoch": 0.98, + "grad_norm": 0.32484541126394684, + "learning_rate": 2.2183362559681232e-08, + "loss": 0.2435, + "step": 17047 + }, + { + "epoch": 0.98, + "grad_norm": 0.2538310862070762, + "learning_rate": 2.205965176615643e-08, + "loss": 0.1857, + "step": 17048 + }, + { + "epoch": 0.98, + "grad_norm": 0.36626610756991873, + "learning_rate": 2.1936286508278393e-08, + "loss": 0.234, + "step": 17049 + }, + { + "epoch": 0.98, + "grad_norm": 0.31560292290674424, + "learning_rate": 2.1813266790315922e-08, + "loss": 0.2447, + "step": 17050 + }, + { + "epoch": 0.98, + "grad_norm": 0.8116602561966607, + "learning_rate": 2.169059261653228e-08, + "loss": 0.3647, + "step": 17051 + }, + { + "epoch": 0.98, + "grad_norm": 0.646895185331955, + "learning_rate": 2.1568263991174065e-08, + "loss": 0.2433, + "step": 17052 + }, + { + "epoch": 0.98, + "grad_norm": 0.3426135098172122, + "learning_rate": 2.144628091847678e-08, + "loss": 0.2603, + "step": 17053 + }, + { + "epoch": 0.98, + "grad_norm": 0.2182673378928602, + "learning_rate": 2.1324643402667045e-08, + "loss": 0.2074, + "step": 17054 + }, + { + "epoch": 0.98, + "grad_norm": 0.4041287114922914, + "learning_rate": 2.1203351447954824e-08, + "loss": 0.2777, + "step": 17055 + }, + { + "epoch": 0.98, + "grad_norm": 0.4575676863248586, + "learning_rate": 2.10824050585412e-08, + "loss": 0.2954, + "step": 17056 + }, + { + "epoch": 0.98, + "grad_norm": 1.0410511963285844, + "learning_rate": 2.0961804238616156e-08, + "loss": 0.2411, + "step": 17057 + }, + { + "epoch": 0.98, + "grad_norm": 0.29720838275806993, + "learning_rate": 2.0841548992354132e-08, + "loss": 0.2345, + "step": 17058 + }, + { + "epoch": 0.98, + "grad_norm": 0.37419418678071553, + "learning_rate": 2.0721639323919573e-08, + "loss": 0.3246, + "step": 17059 + }, + { + "epoch": 0.98, + "grad_norm": 0.16552952889999026, + "learning_rate": 2.0602075237465825e-08, + "loss": 0.0844, + "step": 17060 + }, + { + "epoch": 0.98, + "grad_norm": 0.41038045330779327, + "learning_rate": 2.0482856737132906e-08, + "loss": 0.2749, + "step": 17061 + }, + { + "epoch": 0.98, + "grad_norm": 0.29247558336905394, + "learning_rate": 2.0363983827049737e-08, + "loss": 0.2884, + "step": 17062 + }, + { + "epoch": 0.98, + "grad_norm": 1.2705941693360938, + "learning_rate": 2.0245456511333028e-08, + "loss": 0.2699, + "step": 17063 + }, + { + "epoch": 0.98, + "grad_norm": 0.3362945356885263, + "learning_rate": 2.012727479408616e-08, + "loss": 0.2474, + "step": 17064 + }, + { + "epoch": 0.98, + "grad_norm": 1.231997007937773, + "learning_rate": 2.0009438679403636e-08, + "loss": 0.6624, + "step": 17065 + }, + { + "epoch": 0.98, + "grad_norm": 0.329125988206052, + "learning_rate": 1.9891948171364417e-08, + "loss": 0.2482, + "step": 17066 + }, + { + "epoch": 0.98, + "grad_norm": 0.4091946800044302, + "learning_rate": 1.9774803274038578e-08, + "loss": 0.2754, + "step": 17067 + }, + { + "epoch": 0.98, + "grad_norm": 0.35837796725133636, + "learning_rate": 1.9658003991480656e-08, + "loss": 0.258, + "step": 17068 + }, + { + "epoch": 0.98, + "grad_norm": 0.5680477821646825, + "learning_rate": 1.9541550327738524e-08, + "loss": 0.2052, + "step": 17069 + }, + { + "epoch": 0.98, + "grad_norm": 0.2690535902574051, + "learning_rate": 1.942544228684229e-08, + "loss": 0.2, + "step": 17070 + }, + { + "epoch": 0.98, + "grad_norm": 0.3929295717834575, + "learning_rate": 1.930967987281429e-08, + "loss": 0.3381, + "step": 17071 + }, + { + "epoch": 0.98, + "grad_norm": 0.8100108431977137, + "learning_rate": 1.9194263089662435e-08, + "loss": 0.4147, + "step": 17072 + }, + { + "epoch": 0.98, + "grad_norm": 0.4949306445932574, + "learning_rate": 1.9079191941384635e-08, + "loss": 0.2105, + "step": 17073 + }, + { + "epoch": 0.98, + "grad_norm": 0.2799353141961879, + "learning_rate": 1.8964466431964367e-08, + "loss": 0.2766, + "step": 17074 + }, + { + "epoch": 0.98, + "grad_norm": 0.25292916616971384, + "learning_rate": 1.8850086565376236e-08, + "loss": 0.1642, + "step": 17075 + }, + { + "epoch": 0.98, + "grad_norm": 0.43814973227571347, + "learning_rate": 1.8736052345580403e-08, + "loss": 0.188, + "step": 17076 + }, + { + "epoch": 0.98, + "grad_norm": 0.5327994711284099, + "learning_rate": 1.862236377652593e-08, + "loss": 0.3839, + "step": 17077 + }, + { + "epoch": 0.98, + "grad_norm": 0.35915994320120836, + "learning_rate": 1.8509020862149673e-08, + "loss": 0.3232, + "step": 17078 + }, + { + "epoch": 0.98, + "grad_norm": 0.4653115137961894, + "learning_rate": 1.8396023606376268e-08, + "loss": 0.3291, + "step": 17079 + }, + { + "epoch": 0.98, + "grad_norm": 0.29836713476307714, + "learning_rate": 1.828337201311925e-08, + "loss": 0.1782, + "step": 17080 + }, + { + "epoch": 0.98, + "grad_norm": 0.5164581462048005, + "learning_rate": 1.817106608628105e-08, + "loss": 0.2172, + "step": 17081 + }, + { + "epoch": 0.98, + "grad_norm": 0.28972253741358867, + "learning_rate": 1.805910582974857e-08, + "loss": 0.2673, + "step": 17082 + }, + { + "epoch": 0.98, + "grad_norm": 0.3477015924607169, + "learning_rate": 1.7947491247399808e-08, + "loss": 0.2427, + "step": 17083 + }, + { + "epoch": 0.98, + "grad_norm": 0.577825325414684, + "learning_rate": 1.783622234310056e-08, + "loss": 0.3721, + "step": 17084 + }, + { + "epoch": 0.98, + "grad_norm": 0.41418340683900945, + "learning_rate": 1.772529912070442e-08, + "loss": 0.2571, + "step": 17085 + }, + { + "epoch": 0.98, + "grad_norm": 0.2782321577020592, + "learning_rate": 1.7614721584051643e-08, + "loss": 0.2388, + "step": 17086 + }, + { + "epoch": 0.98, + "grad_norm": 0.3467169897838902, + "learning_rate": 1.7504489736971385e-08, + "loss": 0.1657, + "step": 17087 + }, + { + "epoch": 0.98, + "grad_norm": 0.7927895004364096, + "learning_rate": 1.73946035832806e-08, + "loss": 0.5113, + "step": 17088 + }, + { + "epoch": 0.98, + "grad_norm": 0.29960578060265763, + "learning_rate": 1.728506312678624e-08, + "loss": 0.2258, + "step": 17089 + }, + { + "epoch": 0.98, + "grad_norm": 0.32245881570389173, + "learning_rate": 1.7175868371281936e-08, + "loss": 0.3108, + "step": 17090 + }, + { + "epoch": 0.98, + "grad_norm": 0.951002243046634, + "learning_rate": 1.7067019320546886e-08, + "loss": 0.3718, + "step": 17091 + }, + { + "epoch": 0.98, + "grad_norm": 0.38421464500733016, + "learning_rate": 1.6958515978351407e-08, + "loss": 0.2816, + "step": 17092 + }, + { + "epoch": 0.98, + "grad_norm": 0.15888882951589792, + "learning_rate": 1.6850358348453612e-08, + "loss": 0.0663, + "step": 17093 + }, + { + "epoch": 0.98, + "grad_norm": 0.3003963983604446, + "learning_rate": 1.6742546434598272e-08, + "loss": 0.2845, + "step": 17094 + }, + { + "epoch": 0.98, + "grad_norm": 0.36253705809280756, + "learning_rate": 1.6635080240520186e-08, + "loss": 0.2953, + "step": 17095 + }, + { + "epoch": 0.98, + "grad_norm": 0.6824644163387067, + "learning_rate": 1.6527959769939705e-08, + "loss": 0.2952, + "step": 17096 + }, + { + "epoch": 0.98, + "grad_norm": 0.6366164928559752, + "learning_rate": 1.6421185026566088e-08, + "loss": 0.2909, + "step": 17097 + }, + { + "epoch": 0.98, + "grad_norm": 0.24641835079060603, + "learning_rate": 1.631475601409749e-08, + "loss": 0.2552, + "step": 17098 + }, + { + "epoch": 0.98, + "grad_norm": 0.31674059694920764, + "learning_rate": 1.6208672736219843e-08, + "loss": 0.0891, + "step": 17099 + }, + { + "epoch": 0.98, + "grad_norm": 0.5181426442377051, + "learning_rate": 1.6102935196606883e-08, + "loss": 0.3263, + "step": 17100 + }, + { + "epoch": 0.98, + "grad_norm": 0.3799271362409939, + "learning_rate": 1.5997543398919013e-08, + "loss": 0.2932, + "step": 17101 + }, + { + "epoch": 0.98, + "grad_norm": 0.32179637117374854, + "learning_rate": 1.5892497346807754e-08, + "loss": 0.271, + "step": 17102 + }, + { + "epoch": 0.98, + "grad_norm": 0.7109955981462678, + "learning_rate": 1.57877970439102e-08, + "loss": 0.3308, + "step": 17103 + }, + { + "epoch": 0.98, + "grad_norm": 0.3955832021279465, + "learning_rate": 1.568344249385123e-08, + "loss": 0.2954, + "step": 17104 + }, + { + "epoch": 0.98, + "grad_norm": 0.28272887435926286, + "learning_rate": 1.5579433700245727e-08, + "loss": 0.1893, + "step": 17105 + }, + { + "epoch": 0.98, + "grad_norm": 0.3781045982315596, + "learning_rate": 1.5475770666694145e-08, + "loss": 0.2298, + "step": 17106 + }, + { + "epoch": 0.98, + "grad_norm": 0.3939611312355522, + "learning_rate": 1.5372453396788057e-08, + "loss": 0.284, + "step": 17107 + }, + { + "epoch": 0.98, + "grad_norm": 0.7118015447019852, + "learning_rate": 1.52694818941046e-08, + "loss": 0.439, + "step": 17108 + }, + { + "epoch": 0.98, + "grad_norm": 0.4439561205053077, + "learning_rate": 1.516685616220981e-08, + "loss": 0.1962, + "step": 17109 + }, + { + "epoch": 0.98, + "grad_norm": 0.28543838310868797, + "learning_rate": 1.506457620465751e-08, + "loss": 0.2636, + "step": 17110 + }, + { + "epoch": 0.98, + "grad_norm": 0.32304533937000707, + "learning_rate": 1.4962642024989316e-08, + "loss": 0.1582, + "step": 17111 + }, + { + "epoch": 0.98, + "grad_norm": 0.556431431252915, + "learning_rate": 1.4861053626734623e-08, + "loss": 0.2584, + "step": 17112 + }, + { + "epoch": 0.98, + "grad_norm": 0.3452856886959734, + "learning_rate": 1.475981101341284e-08, + "loss": 0.2698, + "step": 17113 + }, + { + "epoch": 0.98, + "grad_norm": 0.34444279352265655, + "learning_rate": 1.4658914188530049e-08, + "loss": 0.3137, + "step": 17114 + }, + { + "epoch": 0.98, + "grad_norm": 1.0069469226681833, + "learning_rate": 1.4558363155579013e-08, + "loss": 0.1646, + "step": 17115 + }, + { + "epoch": 0.98, + "grad_norm": 0.3590631076036183, + "learning_rate": 1.4458157918042503e-08, + "loss": 0.2807, + "step": 17116 + }, + { + "epoch": 0.98, + "grad_norm": 0.3076290801115484, + "learning_rate": 1.4358298479391076e-08, + "loss": 0.2447, + "step": 17117 + }, + { + "epoch": 0.98, + "grad_norm": 0.4345708272045855, + "learning_rate": 1.4258784843081963e-08, + "loss": 0.3126, + "step": 17118 + }, + { + "epoch": 0.98, + "grad_norm": 0.35796512209441345, + "learning_rate": 1.415961701256241e-08, + "loss": 0.2191, + "step": 17119 + }, + { + "epoch": 0.98, + "grad_norm": 0.43834411557560704, + "learning_rate": 1.4060794991265226e-08, + "loss": 0.3003, + "step": 17120 + }, + { + "epoch": 0.98, + "grad_norm": 0.3023620756441359, + "learning_rate": 1.3962318782613226e-08, + "loss": 0.267, + "step": 17121 + }, + { + "epoch": 0.98, + "grad_norm": 0.33571420647735895, + "learning_rate": 1.386418839001702e-08, + "loss": 0.1946, + "step": 17122 + }, + { + "epoch": 0.98, + "grad_norm": 0.7962831409761743, + "learning_rate": 1.3766403816873886e-08, + "loss": 0.3344, + "step": 17123 + }, + { + "epoch": 0.98, + "grad_norm": 0.40328605944188584, + "learning_rate": 1.3668965066571115e-08, + "loss": 0.2527, + "step": 17124 + }, + { + "epoch": 0.98, + "grad_norm": 0.27251997461785155, + "learning_rate": 1.3571872142483789e-08, + "loss": 0.1927, + "step": 17125 + }, + { + "epoch": 0.98, + "grad_norm": 0.27747022869745464, + "learning_rate": 1.3475125047971438e-08, + "loss": 0.2665, + "step": 17126 + }, + { + "epoch": 0.98, + "grad_norm": 1.2894017763114165, + "learning_rate": 1.3378723786386938e-08, + "loss": 0.6075, + "step": 17127 + }, + { + "epoch": 0.98, + "grad_norm": 0.3370384947172715, + "learning_rate": 1.3282668361067619e-08, + "loss": 0.185, + "step": 17128 + }, + { + "epoch": 0.98, + "grad_norm": 0.33942967898729376, + "learning_rate": 1.3186958775339709e-08, + "loss": 0.3023, + "step": 17129 + }, + { + "epoch": 0.98, + "grad_norm": 0.6884605520324051, + "learning_rate": 1.3091595032518333e-08, + "loss": 0.3898, + "step": 17130 + }, + { + "epoch": 0.98, + "grad_norm": 0.32177530158067874, + "learning_rate": 1.2996577135906408e-08, + "loss": 0.2731, + "step": 17131 + }, + { + "epoch": 0.98, + "grad_norm": 0.22547779418670175, + "learning_rate": 1.290190508879241e-08, + "loss": 0.1392, + "step": 17132 + }, + { + "epoch": 0.98, + "grad_norm": 0.26112428619009537, + "learning_rate": 1.280757889445594e-08, + "loss": 0.2366, + "step": 17133 + }, + { + "epoch": 0.98, + "grad_norm": 0.35491189542978396, + "learning_rate": 1.2713598556164386e-08, + "loss": 0.2418, + "step": 17134 + }, + { + "epoch": 0.98, + "grad_norm": 0.6705417107991852, + "learning_rate": 1.2619964077170699e-08, + "loss": 0.2883, + "step": 17135 + }, + { + "epoch": 0.98, + "grad_norm": 0.7273092453633878, + "learning_rate": 1.252667546071784e-08, + "loss": 0.3569, + "step": 17136 + }, + { + "epoch": 0.98, + "grad_norm": 0.3310531820505807, + "learning_rate": 1.2433732710037671e-08, + "loss": 0.2706, + "step": 17137 + }, + { + "epoch": 0.98, + "grad_norm": 0.23516781376709966, + "learning_rate": 1.2341135828347617e-08, + "loss": 0.2121, + "step": 17138 + }, + { + "epoch": 0.98, + "grad_norm": 0.5182882686249745, + "learning_rate": 1.2248884818854001e-08, + "loss": 0.2102, + "step": 17139 + }, + { + "epoch": 0.98, + "grad_norm": 0.347744576905726, + "learning_rate": 1.2156979684753157e-08, + "loss": 0.2669, + "step": 17140 + }, + { + "epoch": 0.98, + "grad_norm": 0.30419991686902886, + "learning_rate": 1.2065420429225872e-08, + "loss": 0.2622, + "step": 17141 + }, + { + "epoch": 0.98, + "grad_norm": 1.6698959296242852, + "learning_rate": 1.1974207055444054e-08, + "loss": 0.6203, + "step": 17142 + }, + { + "epoch": 0.98, + "grad_norm": 0.3200165060424897, + "learning_rate": 1.1883339566565178e-08, + "loss": 0.2531, + "step": 17143 + }, + { + "epoch": 0.99, + "grad_norm": 0.411575213284914, + "learning_rate": 1.1792817965736725e-08, + "loss": 0.2983, + "step": 17144 + }, + { + "epoch": 0.99, + "grad_norm": 0.2582650364241364, + "learning_rate": 1.1702642256093965e-08, + "loss": 0.1813, + "step": 17145 + }, + { + "epoch": 0.99, + "grad_norm": 0.3650810325649037, + "learning_rate": 1.1612812440758847e-08, + "loss": 0.2605, + "step": 17146 + }, + { + "epoch": 0.99, + "grad_norm": 0.5358995044698758, + "learning_rate": 1.1523328522843324e-08, + "loss": 0.3528, + "step": 17147 + }, + { + "epoch": 0.99, + "grad_norm": 0.9446703893782842, + "learning_rate": 1.1434190505443809e-08, + "loss": 0.384, + "step": 17148 + }, + { + "epoch": 0.99, + "grad_norm": 0.2741899067962737, + "learning_rate": 1.1345398391650053e-08, + "loss": 0.2475, + "step": 17149 + }, + { + "epoch": 0.99, + "grad_norm": 0.5004404118735434, + "learning_rate": 1.1256952184535153e-08, + "loss": 0.3734, + "step": 17150 + }, + { + "epoch": 0.99, + "grad_norm": 0.363040297891496, + "learning_rate": 1.1168851887163323e-08, + "loss": 0.1338, + "step": 17151 + }, + { + "epoch": 0.99, + "grad_norm": 0.3385171036240212, + "learning_rate": 1.1081097502584349e-08, + "loss": 0.2484, + "step": 17152 + }, + { + "epoch": 0.99, + "grad_norm": 0.4682014674094445, + "learning_rate": 1.0993689033836907e-08, + "loss": 0.3082, + "step": 17153 + }, + { + "epoch": 0.99, + "grad_norm": 0.541802237828164, + "learning_rate": 1.090662648394858e-08, + "loss": 0.2646, + "step": 17154 + }, + { + "epoch": 0.99, + "grad_norm": 0.4216574372674074, + "learning_rate": 1.0819909855933618e-08, + "loss": 0.2951, + "step": 17155 + }, + { + "epoch": 0.99, + "grad_norm": 0.5286671846393574, + "learning_rate": 1.073353915279629e-08, + "loss": 0.3723, + "step": 17156 + }, + { + "epoch": 0.99, + "grad_norm": 0.26883762854101706, + "learning_rate": 1.0647514377527535e-08, + "loss": 0.2437, + "step": 17157 + }, + { + "epoch": 0.99, + "grad_norm": 0.2838856173997619, + "learning_rate": 1.0561835533104969e-08, + "loss": 0.1802, + "step": 17158 + }, + { + "epoch": 0.99, + "grad_norm": 0.48001083283485046, + "learning_rate": 1.0476502622496221e-08, + "loss": 0.2635, + "step": 17159 + }, + { + "epoch": 0.99, + "grad_norm": 1.2070202127543408, + "learning_rate": 1.0391515648657813e-08, + "loss": 0.6532, + "step": 17160 + }, + { + "epoch": 0.99, + "grad_norm": 0.24790652896927554, + "learning_rate": 1.0306874614530727e-08, + "loss": 0.2034, + "step": 17161 + }, + { + "epoch": 0.99, + "grad_norm": 0.5456230843925484, + "learning_rate": 1.022257952304706e-08, + "loss": 0.3501, + "step": 17162 + }, + { + "epoch": 0.99, + "grad_norm": 0.872555453274828, + "learning_rate": 1.0138630377125591e-08, + "loss": 0.3599, + "step": 17163 + }, + { + "epoch": 0.99, + "grad_norm": 0.30090643775201775, + "learning_rate": 1.0055027179675104e-08, + "loss": 0.211, + "step": 17164 + }, + { + "epoch": 0.99, + "grad_norm": 0.20651735036815919, + "learning_rate": 9.971769933587728e-09, + "loss": 0.1962, + "step": 17165 + }, + { + "epoch": 0.99, + "grad_norm": 1.3557084430328847, + "learning_rate": 9.888858641750044e-09, + "loss": 0.5679, + "step": 17166 + }, + { + "epoch": 0.99, + "grad_norm": 0.4083475818641006, + "learning_rate": 9.806293307030868e-09, + "loss": 0.2982, + "step": 17167 + }, + { + "epoch": 0.99, + "grad_norm": 0.5826905889124464, + "learning_rate": 9.724073932289024e-09, + "loss": 0.2523, + "step": 17168 + }, + { + "epoch": 0.99, + "grad_norm": 0.3194676400384813, + "learning_rate": 9.642200520374457e-09, + "loss": 0.299, + "step": 17169 + }, + { + "epoch": 0.99, + "grad_norm": 0.29693589850133373, + "learning_rate": 9.560673074120452e-09, + "loss": 0.2682, + "step": 17170 + }, + { + "epoch": 0.99, + "grad_norm": 0.16895972237669396, + "learning_rate": 9.47949159635031e-09, + "loss": 0.0703, + "step": 17171 + }, + { + "epoch": 0.99, + "grad_norm": 0.46335545917394955, + "learning_rate": 9.398656089876224e-09, + "loss": 0.2993, + "step": 17172 + }, + { + "epoch": 0.99, + "grad_norm": 0.29457119427898515, + "learning_rate": 9.318166557497066e-09, + "loss": 0.2394, + "step": 17173 + }, + { + "epoch": 0.99, + "grad_norm": 0.47416681421393464, + "learning_rate": 9.238023001999496e-09, + "loss": 0.2596, + "step": 17174 + }, + { + "epoch": 0.99, + "grad_norm": 0.6675035464853056, + "learning_rate": 9.158225426160183e-09, + "loss": 0.3474, + "step": 17175 + }, + { + "epoch": 0.99, + "grad_norm": 0.5988924080851362, + "learning_rate": 9.078773832742471e-09, + "loss": 0.2719, + "step": 17176 + }, + { + "epoch": 0.99, + "grad_norm": 0.18081883458352935, + "learning_rate": 8.999668224496383e-09, + "loss": 0.1884, + "step": 17177 + }, + { + "epoch": 0.99, + "grad_norm": 1.1298631304084927, + "learning_rate": 8.92090860416195e-09, + "loss": 0.555, + "step": 17178 + }, + { + "epoch": 0.99, + "grad_norm": 0.40152654309722646, + "learning_rate": 8.842494974466986e-09, + "loss": 0.2712, + "step": 17179 + }, + { + "epoch": 0.99, + "grad_norm": 0.49817004543825866, + "learning_rate": 8.764427338127102e-09, + "loss": 0.3426, + "step": 17180 + }, + { + "epoch": 0.99, + "grad_norm": 0.31938785931852637, + "learning_rate": 8.686705697845688e-09, + "loss": 0.2383, + "step": 17181 + }, + { + "epoch": 0.99, + "grad_norm": 0.3866142623720773, + "learning_rate": 8.609330056313926e-09, + "loss": 0.2452, + "step": 17182 + }, + { + "epoch": 0.99, + "grad_norm": 0.2502692400459461, + "learning_rate": 8.532300416210781e-09, + "loss": 0.1758, + "step": 17183 + }, + { + "epoch": 0.99, + "grad_norm": 0.5478422286740277, + "learning_rate": 8.455616780205234e-09, + "loss": 0.243, + "step": 17184 + }, + { + "epoch": 0.99, + "grad_norm": 0.3054428534977997, + "learning_rate": 8.379279150951824e-09, + "loss": 0.259, + "step": 17185 + }, + { + "epoch": 0.99, + "grad_norm": 0.563806009862002, + "learning_rate": 8.303287531093996e-09, + "loss": 0.3525, + "step": 17186 + }, + { + "epoch": 0.99, + "grad_norm": 0.6691683684864551, + "learning_rate": 8.227641923264085e-09, + "loss": 0.3025, + "step": 17187 + }, + { + "epoch": 0.99, + "grad_norm": 0.3912003511948697, + "learning_rate": 8.152342330081109e-09, + "loss": 0.247, + "step": 17188 + }, + { + "epoch": 0.99, + "grad_norm": 0.29904732930023953, + "learning_rate": 8.077388754151872e-09, + "loss": 0.2665, + "step": 17189 + }, + { + "epoch": 0.99, + "grad_norm": 0.28293767680946147, + "learning_rate": 8.002781198074294e-09, + "loss": 0.1153, + "step": 17190 + }, + { + "epoch": 0.99, + "grad_norm": 0.39065053153705637, + "learning_rate": 7.928519664430756e-09, + "loss": 0.2872, + "step": 17191 + }, + { + "epoch": 0.99, + "grad_norm": 0.4524577827612803, + "learning_rate": 7.854604155791423e-09, + "loss": 0.3092, + "step": 17192 + }, + { + "epoch": 0.99, + "grad_norm": 0.3234766790264359, + "learning_rate": 7.78103467471869e-09, + "loss": 0.301, + "step": 17193 + }, + { + "epoch": 0.99, + "grad_norm": 0.5498364437172288, + "learning_rate": 7.70781122375941e-09, + "loss": 0.0939, + "step": 17194 + }, + { + "epoch": 0.99, + "grad_norm": 0.290298275004318, + "learning_rate": 7.634933805448219e-09, + "loss": 0.2089, + "step": 17195 + }, + { + "epoch": 0.99, + "grad_norm": 0.2698869203749294, + "learning_rate": 7.562402422309767e-09, + "loss": 0.2444, + "step": 17196 + }, + { + "epoch": 0.99, + "grad_norm": 0.5583643820899699, + "learning_rate": 7.490217076855377e-09, + "loss": 0.1874, + "step": 17197 + }, + { + "epoch": 0.99, + "grad_norm": 0.37483103695911446, + "learning_rate": 7.418377771585273e-09, + "loss": 0.3031, + "step": 17198 + }, + { + "epoch": 0.99, + "grad_norm": 1.2057443310019202, + "learning_rate": 7.346884508987462e-09, + "loss": 0.5939, + "step": 17199 + }, + { + "epoch": 0.99, + "grad_norm": 0.40814755288192517, + "learning_rate": 7.275737291536633e-09, + "loss": 0.1775, + "step": 17200 + }, + { + "epoch": 0.99, + "grad_norm": 0.23334911812197162, + "learning_rate": 7.204936121697481e-09, + "loss": 0.2408, + "step": 17201 + }, + { + "epoch": 0.99, + "grad_norm": 0.452400964531313, + "learning_rate": 7.134481001922488e-09, + "loss": 0.253, + "step": 17202 + }, + { + "epoch": 0.99, + "grad_norm": 0.3952783924255758, + "learning_rate": 7.064371934649706e-09, + "loss": 0.2208, + "step": 17203 + }, + { + "epoch": 0.99, + "grad_norm": 0.5326550442015726, + "learning_rate": 6.9946089223082995e-09, + "loss": 0.3627, + "step": 17204 + }, + { + "epoch": 0.99, + "grad_norm": 0.34586123136046604, + "learning_rate": 6.925191967314115e-09, + "loss": 0.3045, + "step": 17205 + }, + { + "epoch": 0.99, + "grad_norm": 0.7623488186505916, + "learning_rate": 6.856121072070787e-09, + "loss": 0.3254, + "step": 17206 + }, + { + "epoch": 0.99, + "grad_norm": 0.4536690860312234, + "learning_rate": 6.787396238969735e-09, + "loss": 0.2152, + "step": 17207 + }, + { + "epoch": 0.99, + "grad_norm": 0.23043597811227537, + "learning_rate": 6.719017470392386e-09, + "loss": 0.2123, + "step": 17208 + }, + { + "epoch": 0.99, + "grad_norm": 0.39678585504428626, + "learning_rate": 6.650984768704627e-09, + "loss": 0.2842, + "step": 17209 + }, + { + "epoch": 0.99, + "grad_norm": 0.484976822562212, + "learning_rate": 6.583298136264571e-09, + "loss": 0.2722, + "step": 17210 + }, + { + "epoch": 0.99, + "grad_norm": 1.1659165788472297, + "learning_rate": 6.515957575413678e-09, + "loss": 0.5502, + "step": 17211 + }, + { + "epoch": 0.99, + "grad_norm": 0.4491331575497026, + "learning_rate": 6.448963088486748e-09, + "loss": 0.3109, + "step": 17212 + }, + { + "epoch": 0.99, + "grad_norm": 0.27854955150557587, + "learning_rate": 6.382314677803037e-09, + "loss": 0.2077, + "step": 17213 + }, + { + "epoch": 0.99, + "grad_norm": 0.41626719286042163, + "learning_rate": 6.316012345668476e-09, + "loss": 0.2705, + "step": 17214 + }, + { + "epoch": 0.99, + "grad_norm": 0.7680084509491535, + "learning_rate": 6.2500560943812295e-09, + "loss": 0.3548, + "step": 17215 + }, + { + "epoch": 0.99, + "grad_norm": 0.25815630989197186, + "learning_rate": 6.184445926225024e-09, + "loss": 0.2255, + "step": 17216 + }, + { + "epoch": 0.99, + "grad_norm": 0.3151753336653738, + "learning_rate": 6.119181843471378e-09, + "loss": 0.2446, + "step": 17217 + }, + { + "epoch": 0.99, + "grad_norm": 0.9113337178352028, + "learning_rate": 6.0542638483818136e-09, + "loss": 0.3112, + "step": 17218 + }, + { + "epoch": 0.99, + "grad_norm": 0.3686064816471827, + "learning_rate": 5.989691943202314e-09, + "loss": 0.2553, + "step": 17219 + }, + { + "epoch": 0.99, + "grad_norm": 0.4338394855360507, + "learning_rate": 5.925466130169977e-09, + "loss": 0.2657, + "step": 17220 + }, + { + "epoch": 0.99, + "grad_norm": 0.2913724177552018, + "learning_rate": 5.86158641150969e-09, + "loss": 0.2284, + "step": 17221 + }, + { + "epoch": 0.99, + "grad_norm": 0.3690652954085403, + "learning_rate": 5.798052789431907e-09, + "loss": 0.3086, + "step": 17222 + }, + { + "epoch": 0.99, + "grad_norm": 0.44592350230275896, + "learning_rate": 5.734865266138201e-09, + "loss": 0.1507, + "step": 17223 + }, + { + "epoch": 0.99, + "grad_norm": 0.29599152713750027, + "learning_rate": 5.67202384381682e-09, + "loss": 0.2542, + "step": 17224 + }, + { + "epoch": 0.99, + "grad_norm": 0.38827856045711373, + "learning_rate": 5.609528524642694e-09, + "loss": 0.3015, + "step": 17225 + }, + { + "epoch": 0.99, + "grad_norm": 0.5828565839256031, + "learning_rate": 5.547379310781864e-09, + "loss": 0.2791, + "step": 17226 + }, + { + "epoch": 0.99, + "grad_norm": 0.7721637320865353, + "learning_rate": 5.485576204383725e-09, + "loss": 0.4538, + "step": 17227 + }, + { + "epoch": 0.99, + "grad_norm": 0.3448076249812229, + "learning_rate": 5.424119207592115e-09, + "loss": 0.2812, + "step": 17228 + }, + { + "epoch": 0.99, + "grad_norm": 0.21500697134765617, + "learning_rate": 5.3630083225331145e-09, + "loss": 0.1745, + "step": 17229 + }, + { + "epoch": 0.99, + "grad_norm": 1.028394499800441, + "learning_rate": 5.302243551322806e-09, + "loss": 0.4091, + "step": 17230 + }, + { + "epoch": 0.99, + "grad_norm": 0.3682420380289012, + "learning_rate": 5.2418248960661725e-09, + "loss": 0.2909, + "step": 17231 + }, + { + "epoch": 0.99, + "grad_norm": 0.3277072529659946, + "learning_rate": 5.181752358854874e-09, + "loss": 0.3007, + "step": 17232 + }, + { + "epoch": 0.99, + "grad_norm": 0.4392349799183583, + "learning_rate": 5.1220259417705806e-09, + "loss": 0.1281, + "step": 17233 + }, + { + "epoch": 0.99, + "grad_norm": 0.3499626554649006, + "learning_rate": 5.0626456468805265e-09, + "loss": 0.2961, + "step": 17234 + }, + { + "epoch": 0.99, + "grad_norm": 0.28294895281739385, + "learning_rate": 5.003611476240844e-09, + "loss": 0.1918, + "step": 17235 + }, + { + "epoch": 0.99, + "grad_norm": 0.30877597909823423, + "learning_rate": 4.944923431896564e-09, + "loss": 0.218, + "step": 17236 + }, + { + "epoch": 0.99, + "grad_norm": 0.35228097598873376, + "learning_rate": 4.886581515880506e-09, + "loss": 0.2528, + "step": 17237 + }, + { + "epoch": 0.99, + "grad_norm": 0.8101994452225071, + "learning_rate": 4.828585730211055e-09, + "loss": 0.3758, + "step": 17238 + }, + { + "epoch": 0.99, + "grad_norm": 0.8340505753102018, + "learning_rate": 4.770936076898825e-09, + "loss": 0.2225, + "step": 17239 + }, + { + "epoch": 0.99, + "grad_norm": 0.29161964930278367, + "learning_rate": 4.713632557938885e-09, + "loss": 0.2682, + "step": 17240 + }, + { + "epoch": 0.99, + "grad_norm": 0.3629011642961109, + "learning_rate": 4.6566751753163166e-09, + "loss": 0.2429, + "step": 17241 + }, + { + "epoch": 0.99, + "grad_norm": 0.455365093725818, + "learning_rate": 4.600063931002874e-09, + "loss": 0.2451, + "step": 17242 + }, + { + "epoch": 0.99, + "grad_norm": 0.336152325585062, + "learning_rate": 4.543798826959211e-09, + "loss": 0.2254, + "step": 17243 + }, + { + "epoch": 0.99, + "grad_norm": 0.3599515795554048, + "learning_rate": 4.4878798651337705e-09, + "loss": 0.2767, + "step": 17244 + }, + { + "epoch": 0.99, + "grad_norm": 0.9079207839441679, + "learning_rate": 4.4323070474638906e-09, + "loss": 0.4227, + "step": 17245 + }, + { + "epoch": 0.99, + "grad_norm": 0.3120814844657969, + "learning_rate": 4.377080375873588e-09, + "loss": 0.2139, + "step": 17246 + }, + { + "epoch": 0.99, + "grad_norm": 0.2816458631063591, + "learning_rate": 4.322199852274667e-09, + "loss": 0.2005, + "step": 17247 + }, + { + "epoch": 0.99, + "grad_norm": 0.34555457410158374, + "learning_rate": 4.267665478567829e-09, + "loss": 0.2776, + "step": 17248 + }, + { + "epoch": 0.99, + "grad_norm": 0.3508270629408411, + "learning_rate": 4.213477256642673e-09, + "loss": 0.22, + "step": 17249 + }, + { + "epoch": 0.99, + "grad_norm": 0.8936437076660343, + "learning_rate": 4.159635188375477e-09, + "loss": 0.3986, + "step": 17250 + }, + { + "epoch": 0.99, + "grad_norm": 1.2272218348029869, + "learning_rate": 4.106139275629195e-09, + "loss": 0.6651, + "step": 17251 + }, + { + "epoch": 0.99, + "grad_norm": 0.22314666519834417, + "learning_rate": 4.0529895202579e-09, + "loss": 0.2085, + "step": 17252 + }, + { + "epoch": 0.99, + "grad_norm": 1.4471815041901033, + "learning_rate": 4.00018592410123e-09, + "loss": 0.6434, + "step": 17253 + }, + { + "epoch": 0.99, + "grad_norm": 0.3906800807951317, + "learning_rate": 3.947728488988833e-09, + "loss": 0.2578, + "step": 17254 + }, + { + "epoch": 0.99, + "grad_norm": 0.2559969138747373, + "learning_rate": 3.895617216735925e-09, + "loss": 0.199, + "step": 17255 + }, + { + "epoch": 0.99, + "grad_norm": 0.3663610961053545, + "learning_rate": 3.843852109148838e-09, + "loss": 0.2337, + "step": 17256 + }, + { + "epoch": 0.99, + "grad_norm": 1.1749116983977523, + "learning_rate": 3.792433168019471e-09, + "loss": 0.7044, + "step": 17257 + }, + { + "epoch": 0.99, + "grad_norm": 0.32863069988556126, + "learning_rate": 3.741360395127513e-09, + "loss": 0.2506, + "step": 17258 + }, + { + "epoch": 0.99, + "grad_norm": 0.7208685948356803, + "learning_rate": 3.6906337922426593e-09, + "loss": 0.2665, + "step": 17259 + }, + { + "epoch": 0.99, + "grad_norm": 0.3628388138696765, + "learning_rate": 3.640253361121282e-09, + "loss": 0.309, + "step": 17260 + }, + { + "epoch": 0.99, + "grad_norm": 0.36290377979163024, + "learning_rate": 3.590219103508652e-09, + "loss": 0.2359, + "step": 17261 + }, + { + "epoch": 0.99, + "grad_norm": 0.3023237550567913, + "learning_rate": 3.540531021135607e-09, + "loss": 0.1374, + "step": 17262 + }, + { + "epoch": 0.99, + "grad_norm": 0.3912028186810661, + "learning_rate": 3.491189115725213e-09, + "loss": 0.3087, + "step": 17263 + }, + { + "epoch": 0.99, + "grad_norm": 0.3489042396908763, + "learning_rate": 3.4421933889849936e-09, + "loss": 0.2758, + "step": 17264 + }, + { + "epoch": 0.99, + "grad_norm": 0.3906870321219293, + "learning_rate": 3.3935438426113687e-09, + "loss": 0.262, + "step": 17265 + }, + { + "epoch": 0.99, + "grad_norm": 0.9233473327277593, + "learning_rate": 3.3452404782896577e-09, + "loss": 0.4265, + "step": 17266 + }, + { + "epoch": 0.99, + "grad_norm": 0.2371891988216668, + "learning_rate": 3.2972832976918557e-09, + "loss": 0.1978, + "step": 17267 + }, + { + "epoch": 0.99, + "grad_norm": 0.26164169308525725, + "learning_rate": 3.2496723024799672e-09, + "loss": 0.2508, + "step": 17268 + }, + { + "epoch": 0.99, + "grad_norm": 1.0699215886321223, + "learning_rate": 3.2024074943015626e-09, + "loss": 0.4488, + "step": 17269 + }, + { + "epoch": 0.99, + "grad_norm": 0.3099846689515529, + "learning_rate": 3.1554888747942213e-09, + "loss": 0.2469, + "step": 17270 + }, + { + "epoch": 0.99, + "grad_norm": 0.6191979829959496, + "learning_rate": 3.10891644558331e-09, + "loss": 0.3591, + "step": 17271 + }, + { + "epoch": 0.99, + "grad_norm": 0.34374932351710824, + "learning_rate": 3.0626902082797615e-09, + "loss": 0.2373, + "step": 17272 + }, + { + "epoch": 0.99, + "grad_norm": 0.34319801121283044, + "learning_rate": 3.0168101644845183e-09, + "loss": 0.2517, + "step": 17273 + }, + { + "epoch": 0.99, + "grad_norm": 0.38067607576018514, + "learning_rate": 2.9712763157885293e-09, + "loss": 0.1608, + "step": 17274 + }, + { + "epoch": 0.99, + "grad_norm": 0.3449780354209424, + "learning_rate": 2.9260886637672014e-09, + "loss": 0.2681, + "step": 17275 + }, + { + "epoch": 0.99, + "grad_norm": 0.30939155012806313, + "learning_rate": 2.881247209984839e-09, + "loss": 0.2485, + "step": 17276 + }, + { + "epoch": 0.99, + "grad_norm": 0.6205897551993651, + "learning_rate": 2.8367519559957537e-09, + "loss": 0.3882, + "step": 17277 + }, + { + "epoch": 0.99, + "grad_norm": 1.1477515882330112, + "learning_rate": 2.792602903339825e-09, + "loss": 0.2529, + "step": 17278 + }, + { + "epoch": 0.99, + "grad_norm": 0.38265407718892425, + "learning_rate": 2.7488000535458303e-09, + "loss": 0.2418, + "step": 17279 + }, + { + "epoch": 0.99, + "grad_norm": 0.19879501472534586, + "learning_rate": 2.7053434081314447e-09, + "loss": 0.2037, + "step": 17280 + }, + { + "epoch": 0.99, + "grad_norm": 0.9142794037777149, + "learning_rate": 2.6622329686010196e-09, + "loss": 0.4647, + "step": 17281 + }, + { + "epoch": 0.99, + "grad_norm": 0.28941532117602325, + "learning_rate": 2.619468736446695e-09, + "loss": 0.1886, + "step": 17282 + }, + { + "epoch": 0.99, + "grad_norm": 0.6324264222405012, + "learning_rate": 2.5770507131517297e-09, + "loss": 0.4013, + "step": 17283 + }, + { + "epoch": 0.99, + "grad_norm": 0.328680654110541, + "learning_rate": 2.5349789001827274e-09, + "loss": 0.2716, + "step": 17284 + }, + { + "epoch": 0.99, + "grad_norm": 0.2842382251427557, + "learning_rate": 2.4932532989974113e-09, + "loss": 0.1838, + "step": 17285 + }, + { + "epoch": 0.99, + "grad_norm": 0.32087338368593543, + "learning_rate": 2.4518739110412913e-09, + "loss": 0.1578, + "step": 17286 + }, + { + "epoch": 0.99, + "grad_norm": 0.46121708923217086, + "learning_rate": 2.410840737746556e-09, + "loss": 0.3881, + "step": 17287 + }, + { + "epoch": 0.99, + "grad_norm": 0.25180801069509645, + "learning_rate": 2.37015378053429e-09, + "loss": 0.2123, + "step": 17288 + }, + { + "epoch": 0.99, + "grad_norm": 0.6193436792258021, + "learning_rate": 2.329813040814477e-09, + "loss": 0.3735, + "step": 17289 + }, + { + "epoch": 0.99, + "grad_norm": 1.283115078097724, + "learning_rate": 2.289818519982667e-09, + "loss": 0.4848, + "step": 17290 + }, + { + "epoch": 0.99, + "grad_norm": 0.2806743140172014, + "learning_rate": 2.2501702194244192e-09, + "loss": 0.1837, + "step": 17291 + }, + { + "epoch": 0.99, + "grad_norm": 0.28516428759808665, + "learning_rate": 2.2108681405141885e-09, + "loss": 0.2424, + "step": 17292 + }, + { + "epoch": 0.99, + "grad_norm": 0.38473609798441183, + "learning_rate": 2.1719122846097783e-09, + "loss": 0.2753, + "step": 17293 + }, + { + "epoch": 0.99, + "grad_norm": 0.31851548707689087, + "learning_rate": 2.1333026530634403e-09, + "loss": 0.238, + "step": 17294 + }, + { + "epoch": 0.99, + "grad_norm": 1.181637073888693, + "learning_rate": 2.0950392472107726e-09, + "loss": 0.3125, + "step": 17295 + }, + { + "epoch": 0.99, + "grad_norm": 0.3795736909128083, + "learning_rate": 2.0571220683762717e-09, + "loss": 0.2708, + "step": 17296 + }, + { + "epoch": 0.99, + "grad_norm": 0.6764029907443159, + "learning_rate": 2.019551117874441e-09, + "loss": 0.3151, + "step": 17297 + }, + { + "epoch": 0.99, + "grad_norm": 0.200451458051388, + "learning_rate": 1.9823263970042416e-09, + "loss": 0.1499, + "step": 17298 + }, + { + "epoch": 0.99, + "grad_norm": 0.34620613205315875, + "learning_rate": 1.9454479070579735e-09, + "loss": 0.2849, + "step": 17299 + }, + { + "epoch": 0.99, + "grad_norm": 0.5328903980935618, + "learning_rate": 1.9089156493101722e-09, + "loss": 0.3001, + "step": 17300 + }, + { + "epoch": 0.99, + "grad_norm": 0.342820967041687, + "learning_rate": 1.8727296250264924e-09, + "loss": 0.2557, + "step": 17301 + }, + { + "epoch": 0.99, + "grad_norm": 1.1405798965958176, + "learning_rate": 1.8368898354603759e-09, + "loss": 0.4979, + "step": 17302 + }, + { + "epoch": 0.99, + "grad_norm": 0.3883494449017486, + "learning_rate": 1.8013962818530516e-09, + "loss": 0.273, + "step": 17303 + }, + { + "epoch": 0.99, + "grad_norm": 0.21653975646538426, + "learning_rate": 1.7662489654324267e-09, + "loss": 0.206, + "step": 17304 + }, + { + "epoch": 0.99, + "grad_norm": 0.7855936171737142, + "learning_rate": 1.7314478874175255e-09, + "loss": 0.406, + "step": 17305 + }, + { + "epoch": 0.99, + "grad_norm": 0.43869514092740386, + "learning_rate": 1.6969930490129406e-09, + "loss": 0.2855, + "step": 17306 + }, + { + "epoch": 0.99, + "grad_norm": 0.2802090575204732, + "learning_rate": 1.662884451411051e-09, + "loss": 0.2516, + "step": 17307 + }, + { + "epoch": 0.99, + "grad_norm": 0.5347954446359646, + "learning_rate": 1.6291220957942443e-09, + "loss": 0.23, + "step": 17308 + }, + { + "epoch": 0.99, + "grad_norm": 0.43073846683388817, + "learning_rate": 1.5957059833293653e-09, + "loss": 0.2637, + "step": 17309 + }, + { + "epoch": 0.99, + "grad_norm": 0.5787939035509605, + "learning_rate": 1.5626361151765967e-09, + "loss": 0.3245, + "step": 17310 + }, + { + "epoch": 0.99, + "grad_norm": 0.30066704859583493, + "learning_rate": 1.5299124924794684e-09, + "loss": 0.2534, + "step": 17311 + }, + { + "epoch": 0.99, + "grad_norm": 0.3953423827246001, + "learning_rate": 1.497535116371518e-09, + "loss": 0.2949, + "step": 17312 + }, + { + "epoch": 0.99, + "grad_norm": 0.5339570681109478, + "learning_rate": 1.4655039879740706e-09, + "loss": 0.3517, + "step": 17313 + }, + { + "epoch": 0.99, + "grad_norm": 0.2465958574137639, + "learning_rate": 1.4338191083962394e-09, + "loss": 0.0919, + "step": 17314 + }, + { + "epoch": 0.99, + "grad_norm": 0.3375883743148337, + "learning_rate": 1.4024804787349244e-09, + "loss": 0.2508, + "step": 17315 + }, + { + "epoch": 0.99, + "grad_norm": 0.34858518129319876, + "learning_rate": 1.371488100075924e-09, + "loss": 0.2834, + "step": 17316 + }, + { + "epoch": 0.99, + "grad_norm": 0.5732830362171144, + "learning_rate": 1.3408419734928235e-09, + "loss": 0.2936, + "step": 17317 + }, + { + "epoch": 1.0, + "grad_norm": 0.5490046678633443, + "learning_rate": 1.3105421000458861e-09, + "loss": 0.3244, + "step": 17318 + }, + { + "epoch": 1.0, + "grad_norm": 0.25973593846651, + "learning_rate": 1.280588480785383e-09, + "loss": 0.2386, + "step": 17319 + }, + { + "epoch": 1.0, + "grad_norm": 0.40068654789339847, + "learning_rate": 1.2509811167482622e-09, + "loss": 0.2308, + "step": 17320 + }, + { + "epoch": 1.0, + "grad_norm": 0.27862506869394177, + "learning_rate": 1.2217200089592596e-09, + "loss": 0.0688, + "step": 17321 + }, + { + "epoch": 1.0, + "grad_norm": 0.3646002178721917, + "learning_rate": 1.192805158432009e-09, + "loss": 0.2923, + "step": 17322 + }, + { + "epoch": 1.0, + "grad_norm": 0.4507387100999731, + "learning_rate": 1.164236566167931e-09, + "loss": 0.3267, + "step": 17323 + }, + { + "epoch": 1.0, + "grad_norm": 0.2687264725141766, + "learning_rate": 1.1360142331562351e-09, + "loss": 0.2057, + "step": 17324 + }, + { + "epoch": 1.0, + "grad_norm": 0.35005883485614064, + "learning_rate": 1.1081381603750275e-09, + "loss": 0.2811, + "step": 17325 + }, + { + "epoch": 1.0, + "grad_norm": 0.47338451847770335, + "learning_rate": 1.0806083487890917e-09, + "loss": 0.2107, + "step": 17326 + }, + { + "epoch": 1.0, + "grad_norm": 0.25788576820285475, + "learning_rate": 1.0534247993509994e-09, + "loss": 0.2088, + "step": 17327 + }, + { + "epoch": 1.0, + "grad_norm": 0.5201529560961416, + "learning_rate": 1.0265875130033298e-09, + "loss": 0.3249, + "step": 17328 + }, + { + "epoch": 1.0, + "grad_norm": 0.7291318527937328, + "learning_rate": 1.0000964906753396e-09, + "loss": 0.3989, + "step": 17329 + }, + { + "epoch": 1.0, + "grad_norm": 0.6160151881883443, + "learning_rate": 9.739517332829628e-10, + "loss": 0.3685, + "step": 17330 + }, + { + "epoch": 1.0, + "grad_norm": 0.2861802045422825, + "learning_rate": 9.481532417332518e-10, + "loss": 0.2201, + "step": 17331 + }, + { + "epoch": 1.0, + "grad_norm": 0.33069292196442907, + "learning_rate": 9.227010169188256e-10, + "loss": 0.1932, + "step": 17332 + }, + { + "epoch": 1.0, + "grad_norm": 0.5335904809825592, + "learning_rate": 8.975950597212014e-10, + "loss": 0.318, + "step": 17333 + }, + { + "epoch": 1.0, + "grad_norm": 0.34095909495518917, + "learning_rate": 8.728353710107939e-10, + "loss": 0.217, + "step": 17334 + }, + { + "epoch": 1.0, + "grad_norm": 0.3489415469794577, + "learning_rate": 8.484219516435854e-10, + "loss": 0.3089, + "step": 17335 + }, + { + "epoch": 1.0, + "grad_norm": 0.77950741238855, + "learning_rate": 8.243548024655656e-10, + "loss": 0.4937, + "step": 17336 + }, + { + "epoch": 1.0, + "grad_norm": 0.3601096759079263, + "learning_rate": 8.006339243094019e-10, + "loss": 0.2173, + "step": 17337 + }, + { + "epoch": 1.0, + "grad_norm": 0.295337610488898, + "learning_rate": 7.772593179977694e-10, + "loss": 0.1954, + "step": 17338 + }, + { + "epoch": 1.0, + "grad_norm": 0.30542651027800183, + "learning_rate": 7.542309843400209e-10, + "loss": 0.2761, + "step": 17339 + }, + { + "epoch": 1.0, + "grad_norm": 0.31871131623653226, + "learning_rate": 7.315489241332963e-10, + "loss": 0.2258, + "step": 17340 + }, + { + "epoch": 1.0, + "grad_norm": 1.4402612913254227, + "learning_rate": 7.092131381625233e-10, + "loss": 0.71, + "step": 17341 + }, + { + "epoch": 1.0, + "grad_norm": 0.8675495422620723, + "learning_rate": 6.872236272026378e-10, + "loss": 0.4088, + "step": 17342 + }, + { + "epoch": 1.0, + "grad_norm": 0.2562096236525775, + "learning_rate": 6.655803920130322e-10, + "loss": 0.2502, + "step": 17343 + }, + { + "epoch": 1.0, + "grad_norm": 0.3887523928164326, + "learning_rate": 6.442834333453274e-10, + "loss": 0.1314, + "step": 17344 + }, + { + "epoch": 1.0, + "grad_norm": 0.49486610459972546, + "learning_rate": 6.233327519356014e-10, + "loss": 0.3164, + "step": 17345 + }, + { + "epoch": 1.0, + "grad_norm": 0.2790524317225934, + "learning_rate": 6.0272834850994e-10, + "loss": 0.2222, + "step": 17346 + }, + { + "epoch": 1.0, + "grad_norm": 0.3245395596263069, + "learning_rate": 5.824702237822167e-10, + "loss": 0.2424, + "step": 17347 + }, + { + "epoch": 1.0, + "grad_norm": 0.550353076174964, + "learning_rate": 5.62558378452982e-10, + "loss": 0.3918, + "step": 17348 + }, + { + "epoch": 1.0, + "grad_norm": 0.3946960811855935, + "learning_rate": 5.429928132127948e-10, + "loss": 0.2718, + "step": 17349 + }, + { + "epoch": 1.0, + "grad_norm": 0.577554881905842, + "learning_rate": 5.23773528737781e-10, + "loss": 0.289, + "step": 17350 + }, + { + "epoch": 1.0, + "grad_norm": 0.26363339789068374, + "learning_rate": 5.049005256951845e-10, + "loss": 0.2336, + "step": 17351 + }, + { + "epoch": 1.0, + "grad_norm": 0.2628831020493132, + "learning_rate": 4.863738047378164e-10, + "loss": 0.2008, + "step": 17352 + }, + { + "epoch": 1.0, + "grad_norm": 1.316507968789768, + "learning_rate": 4.681933665084959e-10, + "loss": 0.3222, + "step": 17353 + }, + { + "epoch": 1.0, + "grad_norm": 1.1554124886092956, + "learning_rate": 4.5035921163449905e-10, + "loss": 0.4016, + "step": 17354 + }, + { + "epoch": 1.0, + "grad_norm": 0.24058819285647187, + "learning_rate": 4.3287134073422e-10, + "loss": 0.2455, + "step": 17355 + }, + { + "epoch": 1.0, + "grad_norm": 0.6174960471662688, + "learning_rate": 4.1572975441384055e-10, + "loss": 0.4182, + "step": 17356 + }, + { + "epoch": 1.0, + "grad_norm": 0.3324218340581116, + "learning_rate": 3.9893445326733003e-10, + "loss": 0.1355, + "step": 17357 + }, + { + "epoch": 1.0, + "grad_norm": 0.26942764406218134, + "learning_rate": 3.824854378753351e-10, + "loss": 0.2026, + "step": 17358 + }, + { + "epoch": 1.0, + "grad_norm": 0.35914740702980735, + "learning_rate": 3.6638270880851034e-10, + "loss": 0.2896, + "step": 17359 + }, + { + "epoch": 1.0, + "grad_norm": 0.5455096931025434, + "learning_rate": 3.5062626662307753e-10, + "loss": 0.2895, + "step": 17360 + }, + { + "epoch": 1.0, + "grad_norm": 0.3443424930766011, + "learning_rate": 3.352161118652664e-10, + "loss": 0.2766, + "step": 17361 + }, + { + "epoch": 1.0, + "grad_norm": 0.6169748491571364, + "learning_rate": 3.2015224506909414e-10, + "loss": 0.3999, + "step": 17362 + }, + { + "epoch": 1.0, + "grad_norm": 0.22643565427717527, + "learning_rate": 3.0543466675636567e-10, + "loss": 0.1819, + "step": 17363 + }, + { + "epoch": 1.0, + "grad_norm": 0.33718256571038757, + "learning_rate": 2.9106337743667336e-10, + "loss": 0.2753, + "step": 17364 + }, + { + "epoch": 1.0, + "grad_norm": 0.5574590750528694, + "learning_rate": 2.7703837760739706e-10, + "loss": 0.2991, + "step": 17365 + }, + { + "epoch": 1.0, + "grad_norm": 0.36791725805618314, + "learning_rate": 2.6335966775370423e-10, + "loss": 0.2042, + "step": 17366 + }, + { + "epoch": 1.0, + "grad_norm": 0.2976665692719068, + "learning_rate": 2.500272483496602e-10, + "loss": 0.2583, + "step": 17367 + }, + { + "epoch": 1.0, + "grad_norm": 0.563196407125086, + "learning_rate": 2.370411198582279e-10, + "loss": 0.3444, + "step": 17368 + }, + { + "epoch": 1.0, + "grad_norm": 0.4542464555776609, + "learning_rate": 2.2440128272682716e-10, + "loss": 0.2276, + "step": 17369 + }, + { + "epoch": 1.0, + "grad_norm": 0.2292789630209967, + "learning_rate": 2.1210773739510637e-10, + "loss": 0.1631, + "step": 17370 + }, + { + "epoch": 1.0, + "grad_norm": 0.3201790858158781, + "learning_rate": 2.0016048428828095e-10, + "loss": 0.2865, + "step": 17371 + }, + { + "epoch": 1.0, + "grad_norm": 1.1917003624322648, + "learning_rate": 1.8855952381935384e-10, + "loss": 0.6258, + "step": 17372 + }, + { + "epoch": 1.0, + "grad_norm": 0.29913451282453496, + "learning_rate": 1.7730485639133598e-10, + "loss": 0.2036, + "step": 17373 + }, + { + "epoch": 1.0, + "grad_norm": 0.4936270409112451, + "learning_rate": 1.6639648239280547e-10, + "loss": 0.3253, + "step": 17374 + }, + { + "epoch": 1.0, + "grad_norm": 0.5255633349642953, + "learning_rate": 1.5583440220234835e-10, + "loss": 0.3583, + "step": 17375 + }, + { + "epoch": 1.0, + "grad_norm": 0.21257105141332128, + "learning_rate": 1.4561861618411778e-10, + "loss": 0.1564, + "step": 17376 + }, + { + "epoch": 1.0, + "grad_norm": 0.4397563294608192, + "learning_rate": 1.357491246944953e-10, + "loss": 0.3039, + "step": 17377 + }, + { + "epoch": 1.0, + "grad_norm": 0.43366888050053, + "learning_rate": 1.2622592807320922e-10, + "loss": 0.3299, + "step": 17378 + }, + { + "epoch": 1.0, + "grad_norm": 0.25224273533898794, + "learning_rate": 1.1704902665110596e-10, + "loss": 0.2109, + "step": 17379 + }, + { + "epoch": 1.0, + "grad_norm": 0.876683409290362, + "learning_rate": 1.082184207445991e-10, + "loss": 0.5038, + "step": 17380 + }, + { + "epoch": 1.0, + "grad_norm": 0.5709334603042204, + "learning_rate": 9.97341106612204e-11, + "loss": 0.2467, + "step": 17381 + }, + { + "epoch": 1.0, + "grad_norm": 0.3252570517790537, + "learning_rate": 9.159609669406876e-11, + "loss": 0.2698, + "step": 17382 + }, + { + "epoch": 1.0, + "grad_norm": 0.2536394980429637, + "learning_rate": 8.380437912514083e-11, + "loss": 0.2018, + "step": 17383 + }, + { + "epoch": 1.0, + "grad_norm": 0.5485757717623354, + "learning_rate": 7.635895822311057e-11, + "loss": 0.3742, + "step": 17384 + }, + { + "epoch": 1.0, + "grad_norm": 0.3684524592933877, + "learning_rate": 6.925983424777016e-11, + "loss": 0.2536, + "step": 17385 + }, + { + "epoch": 1.0, + "grad_norm": 0.3760098681531618, + "learning_rate": 6.250700744336869e-11, + "loss": 0.2418, + "step": 17386 + }, + { + "epoch": 1.0, + "grad_norm": 0.44201853065463803, + "learning_rate": 5.610047804527341e-11, + "loss": 0.2955, + "step": 17387 + }, + { + "epoch": 1.0, + "grad_norm": 0.3492915963630672, + "learning_rate": 5.00402462733085e-11, + "loss": 0.2622, + "step": 17388 + }, + { + "epoch": 1.0, + "grad_norm": 0.40702154184454287, + "learning_rate": 4.4326312338416333e-11, + "loss": 0.1927, + "step": 17389 + }, + { + "epoch": 1.0, + "grad_norm": 0.4248298337535507, + "learning_rate": 3.895867643932683e-11, + "loss": 0.3135, + "step": 17390 + }, + { + "epoch": 1.0, + "grad_norm": 0.26443247178119667, + "learning_rate": 3.3937338760337e-11, + "loss": 0.2393, + "step": 17391 + }, + { + "epoch": 1.0, + "grad_norm": 0.47992056831554575, + "learning_rate": 2.92622994768621e-11, + "loss": 0.1771, + "step": 17392 + }, + { + "epoch": 1.0, + "grad_norm": 1.7800715125968598, + "learning_rate": 2.4933558749884456e-11, + "loss": 0.4894, + "step": 17393 + }, + { + "epoch": 1.0, + "grad_norm": 0.2614907187670138, + "learning_rate": 2.0951116729284182e-11, + "loss": 0.2234, + "step": 17394 + }, + { + "epoch": 1.0, + "grad_norm": 0.33747834456865705, + "learning_rate": 1.731497355272893e-11, + "loss": 0.319, + "step": 17395 + }, + { + "epoch": 1.0, + "grad_norm": 0.5460667339487827, + "learning_rate": 1.4025129346784127e-11, + "loss": 0.3004, + "step": 17396 + }, + { + "epoch": 1.0, + "grad_norm": 0.32489280345231525, + "learning_rate": 1.1081584224692521e-11, + "loss": 0.2555, + "step": 17397 + }, + { + "epoch": 1.0, + "grad_norm": 0.5915103642932011, + "learning_rate": 8.484338289704852e-12, + "loss": 0.3031, + "step": 17398 + }, + { + "epoch": 1.0, + "grad_norm": 0.37556640988554935, + "learning_rate": 6.233391630638963e-12, + "loss": 0.2357, + "step": 17399 + }, + { + "epoch": 1.0, + "grad_norm": 0.29506593921851954, + "learning_rate": 4.328744325210466e-12, + "loss": 0.2447, + "step": 17400 + }, + { + "epoch": 1.0, + "grad_norm": 0.7211757381438109, + "learning_rate": 2.770396440032741e-12, + "loss": 0.4654, + "step": 17401 + }, + { + "epoch": 1.0, + "grad_norm": 0.316378855890181, + "learning_rate": 1.5583480295067177e-12, + "loss": 0.278, + "step": 17402 + }, + { + "epoch": 1.0, + "grad_norm": 0.38606123702259176, + "learning_rate": 6.925991336004245e-13, + "loss": 0.2463, + "step": 17403 + }, + { + "epoch": 1.0, + "grad_norm": 0.26007754178125586, + "learning_rate": 1.7314978451032915e-13, + "loss": 0.1748, + "step": 17404 + }, + { + "epoch": 1.0, + "grad_norm": 1.5025672310396203, + "learning_rate": 0.0, + "loss": 0.1774, + "step": 17405 + }, + { + "epoch": 1.0, + "step": 17405, + "total_flos": 0.0, + "train_loss": 0.3205628498764781, + "train_runtime": 138539.6755, + "train_samples_per_second": 60.365, + "train_steps_per_second": 0.126 + } + ], + "logging_steps": 1.0, + "max_steps": 17405, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 300, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}